diff --git a/.codiumai.toml b/.codiumai.toml new file mode 100644 index 00000000..828463d1 --- /dev/null +++ b/.codiumai.toml @@ -0,0 +1,40 @@ +#.codiumai.toml +[tests] + +## Testing framework to use - this can affect the content of the generated tests +## as well as the test run command. +## Possible values are: +## Python: Pytest, Unittest +framework = "Pytest" + +## A hint to the test generator about whether to use mocks or not. Possible values are true or false. +# use_mocks = false + +## How many tests should be generated by default. Fewer tests is faster. +## Does not apply at the moment to extend-suite tests. +num_desired_tests = 20 + +## A multiline string, delimited with triple-quotes (""") serving as an extra instruction +## that the AI model will take into consideration. +## This will appear as "General instructions" in the +## configuration section in the tests panel. +# plan_instructions = """ +# Each line should have a comment explaining it. +# Each comment should start with the comment number (1., 2. etc.) +# """ + +## A multiline string, delimited with triple-quotes (""") serving as an example test that represents +## what you would like the generated tests to look like in terms of style, setup, etc. +# example_test = """ +# describe("something", () => { +# it("says 'bar'", () => { +# // given +# +# // when +# const res = something.say(); +# +# // Then +# expect(res).to.equal("bar"); +# }); +# }); +# """ diff --git a/.github/ISSUE_TEMPLATE/sweep-template.yml b/.github/ISSUE_TEMPLATE/sweep-template.yml index 44116f53..d46b4ff1 100644 --- a/.github/ISSUE_TEMPLATE/sweep-template.yml +++ b/.github/ISSUE_TEMPLATE/sweep-template.yml @@ -12,4 +12,4 @@ body: Unit Tests: Write unit tests for . Test each function in the file. Make sure to test edge cases. Bugs: The bug might be in . Here are the logs: ... Features: the new endpoint should use the ... class from because it contains ... logic. - Refactors: We are migrating this function to ... version because ... \ No newline at end of file + Refactors: We are migrating this function to ... version because ... diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 411d0909..cdabdee9 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,7 +5,7 @@ version: 2 updates: - # Enable version updates for npm + # Enable version updates for pip - package-ecosystem: 'pip' directory: '/' # Check the npm registry for updates once a week (Monday) @@ -15,4 +15,4 @@ updates: - package-ecosystem: 'github-actions' directory: '/' schedule: - interval: 'daily' \ No newline at end of file + interval: 'daily' diff --git a/.github/workflows/deploy-github-pages.yml b/.github/workflows/deploy-github-pages.yml index 9aae6032..59055185 100644 --- a/.github/workflows/deploy-github-pages.yml +++ b/.github/workflows/deploy-github-pages.yml @@ -14,8 +14,8 @@ jobs: name: Deploy to GitHub Pages runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: actions/checkout@v4.1.1 + - uses: actions/setup-python@v5.0.0 with: python-version: '3.7' @@ -38,4 +38,4 @@ jobs: # The GH actions bot is used by default if you didn't specify the two fields. # You can swap them out with your own user credentials. 
user_name: github-actions[bot] - user_email: 41898282+github-actions[bot]@users.noreply.github.com \ No newline at end of file + user_email: 41898282+github-actions[bot]@users.noreply.github.com diff --git a/.github/workflows/first-interaction.yml b/.github/workflows/first-interaction.yml index e32f91a1..422476bc 100644 --- a/.github/workflows/first-interaction.yml +++ b/.github/workflows/first-interaction.yml @@ -21,7 +21,7 @@ jobs: If this is a bug report, please include relevant logs to help us debug the problem. pr-message: | Hello! Thank you for your contribution. - + If you are fixing a bug, please reference the issue number in the description. If you are implementing a feature request, please check with the maintainers that the feature will be accepted first. diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 62851151..15dcc20a 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -13,7 +13,7 @@ on: types: [published] workflow_dispatch: # This line allows manual triggering - + #push: # branches: # - master diff --git a/.github/workflows/stale_bot.yaml b/.github/workflows/stale_bot.yaml index 93fad607..6cd72785 100644 --- a/.github/workflows/stale_bot.yaml +++ b/.github/workflows/stale_bot.yaml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Mark and close stale PRs - uses: actions/stale@v5 + uses: actions/stale@v9 with: stale-pr-message: "This PR is stale because it has been 60 days with no activity. This PR will be automatically closed within 7 days if there is no further activity." close-pr-message: "This PR was closed because it has been stalled for some time with no activity." diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6d8b84df..d1b1926a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,17 +25,20 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools wheel - pip install pytest pytest-cov coverage cached-property - python setup.py clean build install - - name: Run test - run: pytest - - - name: Generate coverage report + pip install pytest pytest-cov pytest-sugar coverage cached-property + - name: Install Whoosh + run: | + pip install -e . 
+ - name: Run tests run: | - pip install pytest - pip install pytest-cov - pytest --cov=./ --cov-report=xml - + pytest --cov=./ --cov-report=xml --cov-report=html + - name: Upload HTML coverage report + uses: actions/upload-artifact@v4 + with: + name: "HTML Coverage ${{ matrix.python-version }}" + path: "htmlcov" + retention-days: 7 + - name: Upload Coverage to Codecov uses: codecov/codecov-action@v4.0.1 with: diff --git a/.gitignore b/.gitignore index 5f8f4347..4aaf9f97 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,10 @@ eggs/ *.egg-info/ /test.py /.vscode/settings.json +/.coverage +/whoosh-reloaded.code-workspace +/.vscode/launch.json +*.coverage.DESKTOP-* +/coverage.xml +/lcov.info +/.codiumai.local.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..80ce8afd --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,24 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-ast + - id: check-builtin-literals + - id: check-merge-conflict + - id: check-toml + - id: check-yaml + - id: detect-private-key + - id: end-of-file-fixer + - id: mixed-line-ending + - id: trailing-whitespace + + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: v0.2.1 + hooks: + - id: ruff + - id: ruff-format + + - repo: https://github.com/ikamensh/flynt/ + rev: '1.0.1' + hooks: + - id: flynt diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 274eb141..5b8d7726 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -26,4 +26,4 @@ sphinx: python: install: # - requirements: requirements.txt - - requirements: docs/requirements.txt \ No newline at end of file + - requirements: docs/requirements.txt diff --git a/.sonarcloud.properties b/.sonarcloud.properties new file mode 100644 index 00000000..85985f0b --- /dev/null +++ b/.sonarcloud.properties @@ -0,0 +1 @@ +sonar.python.version=3.8, 3.9, 3.10, 3.11, 3.12 diff --git a/README.md b/README.md index f92e63f3..6703235e 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,26 @@ [![CodeFactor](https://www.codefactor.io/repository/github/sygil-dev/whoosh-reloaded/badge/main)](https://www.codefactor.io/repository/github/sygil-dev/whoosh-reloaded/overview/main) +[![codecov](https://codecov.io/gh/Sygil-Dev/whoosh-reloaded/graph/badge.svg?token=O3Z2DFB8UA)](https://codecov.io/gh/Sygil-Dev/whoosh-reloaded) [![Documentation Status](https://readthedocs.org/projects/whoosh-reloaded/badge/?version=latest)](https://whoosh-reloaded.readthedocs.io/en/latest/?badge=latest) -[![codecov](https://codecov.io/gh/Sygil-Dev/whoosh-reloaded/branch/master/graph/badge.svg)](https://codecov.io/gh/Sygil-Dev/whoosh-reloaded) [![PyPI version](https://badge.fury.io/py/Whoosh-Reloaded.svg)](https://badge.fury.io/py/Whoosh-Reloaded) [![Downloads](https://pepy.tech/badge/whoosh-reloaded)](https://pepy.tech/project/whoosh-reloaded) [![License](https://img.shields.io/pypi/l/Whoosh-Reloaded)](https://pypi.org/project/Whoosh-Reloaded/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/Whoosh-Reloaded)](https://pypi.org/project/Whoosh-Reloaded/) [![PyPI - Wheel](https://img.shields.io/pypi/wheel/Whoosh-Reloaded)](https://pypi.org/project/Whoosh-Reloaded/) [![PyPI - Format](https://img.shields.io/pypi/format/Whoosh-Reloaded)](https://pypi.org/project/Whoosh-Reloaded/) [![PyPI - Status](https://img.shields.io/pypi/status/Whoosh-Reloaded)](https://pypi.org/project/Whoosh-Reloaded/) +[![Lines of 
Code](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=ncloc)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) +[![Code Smells](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=code_smells)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) +[![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=sqale_rating)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) +[![Security Rating](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=security_rating)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) +[![Bugs](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=bugs)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) +[![Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=vulnerabilities)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) +[![Duplicated Lines (%)](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=duplicated_lines_density)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) +[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=reliability_rating)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) +[![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) +[![Technical Debt](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=sqale_index)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded) + + -------------------------------------- -> **Notice:** This repository (**whoosh-reloaded**) is a fork and continuation of the Whoosh project. The original Whoosh project is no longer maintained. +> **Notice:** This repository (**whoosh-reloaded**) is a fork and continuation of the Whoosh project. ->This fork **is actively maintained** by the Sygil-Dev Organization. +> This fork **is actively maintained** by the Sygil-Dev Organization. -------------------------------------- @@ -23,25 +35,25 @@ works can be extended or replaced to meet your needs exactly. Some of Whoosh's features include: * Pythonic API. -* Pure-Python. No compilation or binary packages needed, no mysterious crashes. +* Pure-Python. No compilation or binary packages are needed, no mysterious crashes. * Fielded indexing and search. * Fast indexing and retrieval -- faster than any other pure-Python, scoring, full-text search solution I know of. * Pluggable scoring algorithm (including BM25F), text analysis, storage, posting format, etc. * Powerful query language. -* Pure Python spell-checker (as far as I know, the only one). +* Pure Python spell-checker (as far as I know, the only one). Whoosh might be useful in the following circumstances: * Anywhere a pure-Python solution is desirable to avoid having to build/compile native libraries (or force users to build/compile them). 
-* As a research platform (at least for programmers that find Python easier to - read and work with than Java ;) +* As a research platform (at least for programmers who find Python easier to + read and work with than Java ;) * When an easy-to-use Pythonic interface is more important to you than raw - speed. + speed. -Whoosh was created by Matt Chaput and is maintained currently by the Sygil-Dev Organization. It was originally created for use in the online help system of Side Effects Software's 3D animation software Houdini. Side Effects Software Inc. graciously agreed to open-source the code. +Whoosh was created by Matt Chaput and is currently maintained by the Sygil-Dev Organization. It was created for use in the online help system of Side Effects Software's 3D animation software Houdini. Side Effects Software Inc. graciously agreed to open-source the code. This software is licensed under the terms of the simplified BSD (A.K.A. "two clause" or "FreeBSD") license. See LICENSE.txt for information. @@ -52,15 +64,10 @@ Installing Whoosh If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install`` or ``pip`` to download and install Whoosh automatically:: - # install the old version from Pypi - $ easy_install Whoosh - - or - + # install the old version from PyPI $ pip install Whoosh - - - # Install the development version from Github. + + # Install the development version from GitHub. $ pip install git+https://github.com/Sygil-Dev/whoosh-reloaded.git Getting the source. @@ -70,14 +77,23 @@ You can check out the latest version of the source code on GitHub using git: $ git clone https://github.com/Sygil-Dev/whoosh-reloaded.git +Contributing +============ +We use pre-commit to format the code and run some checks before committing to avoid common mistakes. To install it, run the following commands: + +```bash +$ pip install pre-commit +$ pre-commit install +``` + Learning more ============= * Online Documentation: - - *   [Github Pages](https://sygil-dev.github.io/whoosh-reloaded/) - - *   [Read the Docs](https://whoosh-reloaded.readthedocs.io/en/latest/) + + *   [GitHub Pages](https://sygil-dev.github.io/whoosh-reloaded/) + + *   [Read the Docs](https://whoosh-reloaded.readthedocs.io/en/latest/) * Read the old online documentation at https://docs.red-dove.com/whoosh-reloaded/ (Search works properly).
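The Contributing section added above wires in pre-commit hooks that run automatically on each commit. As a minimal usage sketch (assuming pre-commit has been installed as shown, and using the `ruff` hook id declared in the new .pre-commit-config.yaml), the same checks can also be run on demand:

```bash
# Run every configured hook against the whole repository, not just staged files
$ pre-commit run --all-files

# Re-run a single hook by its id, e.g. the ruff lint hook from .pre-commit-config.yaml
$ pre-commit run ruff --all-files
```

Running `--all-files` once right after `pre-commit install` is a quick way to confirm the whole tree passes before pushing.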
@@ -90,4 +106,8 @@ Maintainers =========== * [Sygil-Dev Organization](https://github.com/Sygil-Dev) -* [ZeroCool940711](https://github.com/ZeroCool940711) \ No newline at end of file +* [ZeroCool940711](https://github.com/ZeroCool940711) + +Discord Server +============== +- [Sygil-Dev - Resources](https://discord.gg/H5mftKP5S9) diff --git a/benchmark/dictionary.py b/benchmark/dictionary.py index ef9912a1..d5855f4a 100644 --- a/benchmark/dictionary.py +++ b/benchmark/dictionary.py @@ -1,4 +1,5 @@ -import os.path, gzip +import gzip +import os from whoosh import analysis, fields from whoosh.support.bench import Bench, Spec @@ -28,7 +29,7 @@ def documents(self): def whoosh_schema(self): ana = analysis.StemmingAnalyzer() - # ana = analysis.StandardAnalyzer() + schema = fields.Schema( head=fields.ID(stored=True), body=fields.TEXT(analyzer=ana, stored=True) ) diff --git a/benchmark/enron.py b/benchmark/enron.py index a82c2f26..175c0ef6 100644 --- a/benchmark/enron.py +++ b/benchmark/enron.py @@ -1,5 +1,5 @@ -from __future__ import division -import os.path, tarfile +import os.path +import tarfile from email import message_from_string from marshal import dump, load from zlib import compress, decompress @@ -10,11 +10,10 @@ pass from whoosh import analysis, fields -from whoosh.compat import urlretrieve, next +from whoosh.compat import next, urlretrieve from whoosh.support.bench import Bench, Spec from whoosh.util import now - # Benchmark class @@ -45,10 +44,10 @@ class Enron(Spec): # the messages in an easier-to-digest format def download_archive(self, archive): - print("Downloading Enron email archive to %r..." % archive) + print(f"Downloading Enron email archive to {archive}...") t = now() urlretrieve(self.enron_archive_url, archive) - print("Downloaded in ", now() - t, "seconds") + print(f"Downloaded in {now() - t} seconds") @staticmethod def get_texts(archive): @@ -84,10 +83,10 @@ def get_messages(archive, headers=True): yield d def cache_messages(self, archive, cache): - print("Caching messages in %s..." 
% cache) + print(f"Caching messages in {cache}...") if not os.path.exists(archive): - raise Exception("Archive file %r does not exist" % archive) + raise FileNotFoundError(f"Archive file {archive} does not exist") t = now() f = open(cache, "wb") @@ -98,7 +97,7 @@ def cache_messages(self, archive, cache): if not c % 1000: print(c) f.close() - print("Cached messages in ", now() - t, "seconds") + print(f"Cached messages in {now() - t} seconds") def setup(self): archive = os.path.abspath( @@ -118,7 +117,7 @@ def setup(self): def documents(self): if not os.path.exists(self.cache_filename): - raise Exception("Message cache does not exist, use --setup") + raise FileNotFoundError("Message cache does not exist, use --setup") f = open(self.cache_filename, "rb") try: @@ -176,7 +175,7 @@ def process_document_whoosh(self, d): d["filepos"] = self.filepos if self.options.storebody: mf = self.main_field - d["_stored_%s" % mf] = compress(d[mf], 9) + d[f"_stored_{mf}"] = compress(d[mf], 9) def process_result_whoosh(self, d): mf = self.main_field diff --git a/benchmark/marc21.py b/benchmark/marc21.py index c3c189aa..07fde36f 100644 --- a/benchmark/marc21.py +++ b/benchmark/marc21.py @@ -1,11 +1,11 @@ -from __future__ import with_statement, print_function -import fnmatch, logging, os.path, re +import fnmatch +import logging +import os.path +import re from whoosh import analysis, fields, index, qparser, query, scoring -from whoosh.compat import range from whoosh.util import now - log = logging.getLogger(__name__) @@ -27,7 +27,7 @@ def read_file(dbfile, tags=None): if not first5: return if len(first5) < 5: - raise Exception + raise ValueError("Invalid length") length = int(first5) chunk = dbfile.read(length - 5) yield parse_record(first5 + chunk, tags), pos @@ -63,7 +63,7 @@ def parse_record(data, tags=None): start = dirstart + i * DIRECTORY_ENTRY_LEN end = start + DIRECTORY_ENTRY_LEN tag = data[start : start + 3] - if tags and not tag in tags: + if tags and tag not in tags: continue entry = data[start:end] @@ -135,7 +135,7 @@ def uniform_title(d): subjectfields = ( - "600 610 611 630 648 650 651 653 654 655 656 657 658 662 " "690 691 696 697 698 699" + "600 610 611 630 648 650 651 653 654 655 656 657 658 662 690 691 696 697 698 699" ).split() @@ -191,7 +191,7 @@ def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True, glob="*. 
mfields.update("100 110 111".split()) # Author mfields.add("245") # Title - print("Indexing with %d processor(s) and %d MB per processor" % (procs, limitmb)) + print(f"Indexing with {procs} processor(s) and {limitmb} MB per processor") c = 0 t = now() ix = index.create_in(ixdir, schema) @@ -241,7 +241,7 @@ def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True): r = s.search(q, limit=limit, optimize=optimize) for hit in r: print_record(hit.rank, basedir, hit["file"], hit["pos"]) - print("Found %d records in %0.06f seconds" % (len(r), r.runtime)) + print(f"Found {len(r)} records in {r.runtime:0.06f} seconds") else: t = now() for i, docnum in enumerate(s.docs_for_query(q)): @@ -302,7 +302,7 @@ def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True): "-M", "--merge-segments", dest="multisegment", - help="If indexing with multiproc, merge the segments after" " indexing", + help="If indexing with multiproc, merge the segments after indexing", action="store_false", default=True, ) diff --git a/benchmark/reuters.py b/benchmark/reuters.py index 0aaa3276..dde05363 100644 --- a/benchmark/reuters.py +++ b/benchmark/reuters.py @@ -1,4 +1,5 @@ -import gzip, os.path +import gzip +import os.path from whoosh import analysis, fields, index, qparser, query from whoosh.support.bench import Bench, Spec diff --git a/docs/Makefile b/docs/Makefile index 09b24957..5c1c3530 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -174,4 +174,4 @@ xml: pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." \ No newline at end of file + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/make.bat b/docs/make.bat index 6b6fea17..502a76a6 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -239,4 +239,4 @@ if "%1" == "pseudoxml" ( goto end ) -:end \ No newline at end of file +:end diff --git a/docs/requirements.txt b/docs/requirements.txt index f063a0ad..0ccde19f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,3 @@ sphinx sphinx_rtd_theme -sphinx-jsonschema \ No newline at end of file +sphinx-jsonschema diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst index 27297f61..ebbb72a9 100644 --- a/docs/source/analysis.rst +++ b/docs/source/analysis.rst @@ -324,6 +324,3 @@ change it. ;) Nothing requires that an Analyzer be implemented by calling a tokenizer and filters. Tokenizers and filters are simply a convenient way to structure the code. You're free to write an analyzer any way you want, as long as it implements ``__call__``. - - - diff --git a/docs/source/api/analysis.rst b/docs/source/api/analysis.rst index bbb1b978..81805618 100644 --- a/docs/source/api/analysis.rst +++ b/docs/source/api/analysis.rst @@ -59,4 +59,3 @@ Token classes and functions .. autoclass:: Token .. autofunction:: unstopped - diff --git a/docs/source/api/codec/base.rst b/docs/source/api/codec/base.rst index 28f707c4..a1326a57 100644 --- a/docs/source/api/codec/base.rst +++ b/docs/source/api/codec/base.rst @@ -28,5 +28,3 @@ Classes .. autoclass:: Segment :members: - - diff --git a/docs/source/api/collectors.rst b/docs/source/api/collectors.rst index b27b8c1f..6a9a8c63 100644 --- a/docs/source/api/collectors.rst +++ b/docs/source/api/collectors.rst @@ -40,8 +40,3 @@ Wrappers .. autoclass:: TimeLimitCollector .. 
autoclass:: TermsCollector - - - - - diff --git a/docs/source/api/columns.rst b/docs/source/api/columns.rst index 26fa7916..69b7bd9a 100644 --- a/docs/source/api/columns.rst +++ b/docs/source/api/columns.rst @@ -46,4 +46,3 @@ Experimental columns ==================== .. autoclass:: ClampedNumericColumn - diff --git a/docs/source/api/formats.rst b/docs/source/api/formats.rst index 9cd9dd19..9f184db9 100644 --- a/docs/source/api/formats.rst +++ b/docs/source/api/formats.rst @@ -20,5 +20,3 @@ Formats .. autoclass:: Characters .. autoclass:: PositionBoosts .. autoclass:: CharacterBoosts - - diff --git a/docs/source/api/lang/wordnet.rst b/docs/source/api/lang/wordnet.rst index 8adcdb0b..d1422525 100644 --- a/docs/source/api/lang/wordnet.rst +++ b/docs/source/api/lang/wordnet.rst @@ -17,4 +17,3 @@ Low-level functions .. autofunction:: parse_file .. autofunction:: synonyms .. autofunction:: make_index - diff --git a/docs/source/api/qparser.rst b/docs/source/api/qparser.rst index d3c5ecda..a0b10cc2 100644 --- a/docs/source/api/qparser.rst +++ b/docs/source/api/qparser.rst @@ -88,10 +88,3 @@ Operators .. autoclass:: PrefixOperator .. autoclass:: PostfixOperator .. autoclass:: InfixOperator - - - - - - - diff --git a/docs/source/api/reading.rst b/docs/source/api/reading.rst index e0fd2a12..b923ac05 100644 --- a/docs/source/api/reading.rst +++ b/docs/source/api/reading.rst @@ -19,4 +19,3 @@ Exceptions ========== .. autoexception:: TermNotFound - diff --git a/docs/source/api/scoring.rst b/docs/source/api/scoring.rst index 73ea1e76..46fa6ab1 100644 --- a/docs/source/api/scoring.rst +++ b/docs/source/api/scoring.rst @@ -36,7 +36,3 @@ Scoring utility classes .. autoclass:: MultiWeighting .. autoclass:: ReverseWeighting - - - - diff --git a/docs/source/api/searching.rst b/docs/source/api/searching.rst index 8acfe492..c717ff00 100644 --- a/docs/source/api/searching.rst +++ b/docs/source/api/searching.rst @@ -30,4 +30,3 @@ Exceptions .. autoexception:: NoTermsException .. autoexception:: TimeLimit - diff --git a/docs/source/api/sorting.rst b/docs/source/api/sorting.rst index faf78d0f..d7f4955a 100644 --- a/docs/source/api/sorting.rst +++ b/docs/source/api/sorting.rst @@ -44,5 +44,3 @@ FacetType objects .. autoclass:: UnorderedList .. autoclass:: Count .. autoclass:: Best - - diff --git a/docs/source/api/spelling.rst b/docs/source/api/spelling.rst index 79d5961e..34db9dc5 100644 --- a/docs/source/api/spelling.rst +++ b/docs/source/api/spelling.rst @@ -27,5 +27,3 @@ QueryCorrector objects .. autoclass:: SimpleQueryCorrector .. autoclass:: Correction - - diff --git a/docs/source/api/support/charset.rst b/docs/source/api/support/charset.rst index b0a687e9..fabd03ac 100644 --- a/docs/source/api/support/charset.rst +++ b/docs/source/api/support/charset.rst @@ -10,4 +10,3 @@ Taken from http://speeple.com/unicode-maps.txt .. autofunction:: charset_table_to_dict - diff --git a/docs/source/api/support/levenshtein.rst b/docs/source/api/support/levenshtein.rst index cb64027e..e36870bb 100644 --- a/docs/source/api/support/levenshtein.rst +++ b/docs/source/api/support/levenshtein.rst @@ -7,4 +7,3 @@ .. autofunction:: relative .. autofunction:: distance - diff --git a/docs/source/api/util.rst b/docs/source/api/util.rst index 9359f742..8380a413 100644 --- a/docs/source/api/util.rst +++ b/docs/source/api/util.rst @@ -4,4 +4,3 @@ .. 
automodule:: whoosh.util :members: - diff --git a/docs/source/api/writing.rst b/docs/source/api/writing.rst index 0bebc86f..5361cc02 100644 --- a/docs/source/api/writing.rst +++ b/docs/source/api/writing.rst @@ -26,5 +26,3 @@ Exceptions ========== .. autoexception:: IndexingError - - diff --git a/docs/source/batch.rst b/docs/source/batch.rst index 5caf256e..b8a741f0 100644 --- a/docs/source/batch.rst +++ b/docs/source/batch.rst @@ -106,9 +106,3 @@ So, while ``multisegment=True`` is much faster than a normal writer, you should only use it for large batch indexing jobs (or perhaps only for indexing from scratch). It should not be the only method you use for indexing, because otherwise the number of segments will tend to increase forever! - - - - - - diff --git a/docs/source/conf.py b/docs/source/conf.py index 77011a03..a8a3f8ae 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,6 @@ -import sys, os, os.path +import os +import os.path +import sys sys.path.append(os.path.abspath("../../src")) import whoosh diff --git a/docs/source/dates.rst b/docs/source/dates.rst index ab1aadd6..ac5cd2bf 100644 --- a/docs/source/dates.rst +++ b/docs/source/dates.rst @@ -196,7 +196,3 @@ Limitations * ``DATETIME`` fields do not currently support open-ended ranges. You can simulate an open ended range by using an endpoint far in the past or future. - - - - diff --git a/docs/source/facets.rst b/docs/source/facets.rst index b8c16936..4717a250 100644 --- a/docs/source/facets.rst +++ b/docs/source/facets.rst @@ -767,5 +767,3 @@ Expert: writing your own facet ============================== TBD. - - diff --git a/docs/source/fieldcaches.rst b/docs/source/fieldcaches.rst index 49091dc7..2e399ed5 100644 --- a/docs/source/fieldcaches.rst +++ b/docs/source/fieldcaches.rst @@ -44,9 +44,3 @@ Then you can pass an instance of your policy object to the ``set_caching_policy` method:: searcher.set_caching_policy(MyPolicy()) - - - - - - diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst index e9dd52d7..c62516b4 100644 --- a/docs/source/glossary.rst +++ b/docs/source/glossary.rst @@ -62,4 +62,3 @@ Glossary Term vector A *forward index* for a certain field in a certain document. You can specify in the Schema that a given field should store term vectors. - diff --git a/docs/source/highlight.rst b/docs/source/highlight.rst index 79c76ae9..bc266c8c 100644 --- a/docs/source/highlight.rst +++ b/docs/source/highlight.rst @@ -405,15 +405,3 @@ an analyzer:: ``order`` An ordering function that determines the order of the "top" fragments in the output text. - - - - - - - - - - - - diff --git a/docs/source/index.rst b/docs/source/index.rst index 236372f7..ca3f0062 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -47,4 +47,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` - diff --git a/docs/source/keywords.rst b/docs/source/keywords.rst index fe0e91f2..ed1440ee 100644 --- a/docs/source/keywords.rst +++ b/docs/source/keywords.rst @@ -91,4 +91,3 @@ Expansion models The ``ExpansionModel`` subclasses in the :mod:` whoosh.classify` module implement different weighting functions for key words. These models are translated into Python from original Java implementations in Terrier. - diff --git a/docs/source/nested.rst b/docs/source/nested.rst index da43d282..465b8af7 100644 --- a/docs/source/nested.rst +++ b/docs/source/nested.rst @@ -235,4 +235,3 @@ additional searches for each found document. 
Future versions of Whoosh may include "join" queries to make this process more efficient (or at least more automatic). - diff --git a/docs/source/ngrams.rst b/docs/source/ngrams.rst index 558f4e34..56bfe22f 100644 --- a/docs/source/ngrams.rst +++ b/docs/source/ngrams.rst @@ -46,6 +46,3 @@ whitespace and punctuation, while ``NGRAMWORDS`` extracts words from the text using a tokenizer, then runs each word through the N-gram filter. TBD. - - - diff --git a/docs/source/parsing.rst b/docs/source/parsing.rst index c4acc746..8eec5aec 100644 --- a/docs/source/parsing.rst +++ b/docs/source/parsing.rst @@ -185,7 +185,7 @@ replace the default English tokens with your own regular expressions. The :class:` whoosh.qparser.OperatorsPlugin` implements the ability to use AND, OR, NOT, ANDNOT, and ANDMAYBE clauses in queries. You can instantiate a new -``OperatorsPlugin`` and use the ``And``, ``Or``, ``Not``, ``AndNot``, and +``OperatorsPlugin`` and use the ``And``, ``Or``, ``Not``, ``AndNot``, and ``AndMaybe`` keyword arguments to change the token patterns:: # Use Spanish equivalents instead of AND and OR @@ -430,8 +430,3 @@ use the ``clean`` keyword argument:: Operators earlier in the list bind more closely than operators later in the list. - - - - - diff --git a/docs/source/query.rst b/docs/source/query.rst index f56b26b6..c7aec022 100644 --- a/docs/source/query.rst +++ b/docs/source/query.rst @@ -7,4 +7,3 @@ The classes in the :mod:` whoosh.query` module implement *queries* you can run a TBD. See :doc:`searching` for how to search the index using query objects. - diff --git a/docs/source/querylang.rst b/docs/source/querylang.rst index 085363da..7c436c12 100644 --- a/docs/source/querylang.rst +++ b/docs/source/querylang.rst @@ -186,6 +186,3 @@ in single quotes:: path:'MacHD:My Documents' 'term with spaces' title:'function()' - - - diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index b169fb7c..b96361fc 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -14,7 +14,7 @@ A quick introduction :: >>> from whoosh.index import create_in - >>> from whoosh.fields import * + >>> from whoosh.fields import Schema, TEXT, ID >>> schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT) >>> ix = create_in("indexdir", schema) >>> writer = ix.writer() @@ -194,7 +194,7 @@ For example, this query would match documents that contain both "apple" and # Construct query objects directly - from whoosh.query import * + from whoosh.query import And, Term myquery = And([Term("content", u"apple"), Term("content", "bear")]) To parse a query string, you can use the default query parser in the ``qparser`` @@ -241,4 +241,3 @@ Whoosh includes extra features for dealing with search results, such as * Paginating the results (e.g. "Showing results 1-20, page 1 of 4"). See :doc:`searching` for more information. - diff --git a/docs/source/recipes.rst b/docs/source/recipes.rst index 19150153..29c9571b 100644 --- a/docs/source/recipes.rst +++ b/docs/source/recipes.rst @@ -226,4 +226,3 @@ Is term X in document Y? # ...or the slower but easier way wordset = set(searcher.vector(500, "content").all_ids()) return "wobble" in wordset - diff --git a/docs/source/releases/1_0.rst b/docs/source/releases/1_0.rst index 7312123b..08887f53 100644 --- a/docs/source/releases/1_0.rst +++ b/docs/source/releases/1_0.rst @@ -479,4 +479,3 @@ Misc previous versions. * Unit tests should no longer leave directories and files behind. 
- diff --git a/docs/source/releases/2_0.rst b/docs/source/releases/2_0.rst index 053966fe..20569387 100644 --- a/docs/source/releases/2_0.rst +++ b/docs/source/releases/2_0.rst @@ -328,6 +328,3 @@ Compatibility now yield :class:` whoosh.reading.TermInfo` objects. * The arguments to :class:` whoosh.query.FuzzyTerm` changed. - - - diff --git a/docs/source/releases/index.rst b/docs/source/releases/index.rst index cf63ae83..def33734 100644 --- a/docs/source/releases/index.rst +++ b/docs/source/releases/index.rst @@ -8,4 +8,3 @@ Release notes 2_0 1_0 0_3 - diff --git a/docs/source/schema.rst b/docs/source/schema.rst index 043facb5..58da2fc7 100644 --- a/docs/source/schema.rst +++ b/docs/source/schema.rst @@ -371,7 +371,3 @@ If you set ``FieldType.vector`` to a ``Format`` object, the indexing code will u ``Format`` object to store information about the terms in each document. Currently by default Whoosh does not make use of term vectors at all, but they are available to expert users who want to implement their own field types. - - - - diff --git a/docs/source/searching.rst b/docs/source/searching.rst index 603244a4..ac640424 100644 --- a/docs/source/searching.rst +++ b/docs/source/searching.rst @@ -392,9 +392,3 @@ The ``Results`` object supports the following methods: Any result documents that also appear in 'results' are moved to the top of the list of result documents. Then any other documents in 'results' are added on to the list of result documents. - - - - - - diff --git a/docs/source/stemming.rst b/docs/source/stemming.rst index e88c66b7..0d30b569 100644 --- a/docs/source/stemming.rst +++ b/docs/source/stemming.rst @@ -201,17 +201,3 @@ required by ``CharsetTokenizer`` and ``CharsetFilter``:: (The Sphinx charset table format is described at http://www.sphinxsearch.com/docs/current.html#conf-charset-table ) - - - - - - - - - - - - - - diff --git a/docs/source/tech/filedb.rst b/docs/source/tech/filedb.rst index 0fe22be7..3d96b504 100644 --- a/docs/source/tech/filedb.rst +++ b/docs/source/tech/filedb.rst @@ -26,4 +26,3 @@ The index directory will contain a set of files for each segment. A segment is l .fvz contains term vectors (forward indexes) for each document. This file is only created if at least one field in the schema stores term vectors. The size will vary based on the number of documents, field length, the formats used for each vector (e.g. storing term positions takes more space than storing frequency only), etc. - diff --git a/docs/source/threads.rst b/docs/source/threads.rst index 0b45a643..54ecba8a 100644 --- a/docs/source/threads.rst +++ b/docs/source/threads.rst @@ -69,6 +69,3 @@ returns it.) Calling ``Searcher.refresh()`` is more efficient that closing the searcher and opening a new one, since it will re-use any underlying readers and caches that haven't changed. 
- - - diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..3da7ee29 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,89 @@ +[tool.ruff] +target-version = "py38" + +[tool.ruff.lint] +select = [ + "AIR", # Airflow + "ASYNC", # flake8-async + "BLE", # flake8-blind-except + "C90", # McCabe cyclomatic complexity + "DJ", # flake8-django + "EXE", # flake8-executable + "F", # Pyflakes + "FA", # flake8-future-annotations + "G", # flake8-logging-format + "I", # isort + "ICN", # flake8-import-conventions + "INT", # flake8-gettext + "LOG", # flake8-logging + "NPY", # NumPy-specific rules + "PLC", # Pylint conventions + "PLE", # Pylint errors + "PLR091", # Pylint Refactor just for max-args, max-branches, etc. + "PYI", # flake8-pyi + "Q", # flake8-quotes + "SLOT", # flake8-slots + "TCH", # flake8-type-checking + "TID", # flake8-tidy-imports + "TRIO", # flake8-trio + "UP", # pyupgrade + "W", # pycodestyle + "YTT", # flake8-2020 + # "A", # flake8-builtins + # "ANN", # flake8-annotations + # "ARG", # flake8-unused-arguments + # "B", # flake8-bugbear + # "C4", # flake8-comprehensions + # "COM", # flake8-commas + # "CPY", # flake8-copyright + # "D", # pydocstyle + # "DTZ", # flake8-datetimez + # "E", # pycodestyle + # "EM", # flake8-errmsg + # "ERA", # eradicate + # "FBT", # flake8-boolean-trap + # "FIX", # flake8-fixme + # "FLY", # flynt + # "FURB", # refurb + # "INP", # flake8-no-pep420 + # "ISC", # flake8-implicit-str-concat + # "N", # pep8-naming + # "PD", # pandas-vet + # "PERF", # Perflint + # "PGH", # pygrep-hooks + # "PIE", # flake8-pie + # "PL", # Pylint + # "PT", # flake8-pytest-style + # "PTH", # flake8-use-pathlib + # "RET", # flake8-return + # "RSE", # flake8-raise + # "RUF", # Ruff-specific rules + # "S", # flake8-bandit + # "SIM", # flake8-simplify + # "SLF", # flake8-self + # "T10", # flake8-debugger + # "T20", # flake8-print + # "TD", # flake8-todos + # "TRY", # tryceratops +] +ignore = [ + "EXE001", + "F401", + "F811", + "F841", + "UP031", +] + +[tool.ruff.lint.mccabe] +max-complexity = 45 # Default is 10 + +[tool.ruff.lint.per-file-ignores] +"src/whoosh/compat.py" = ["F821"] +"src/whoosh/filedb/filestore.py" = ["UP024"] +"src/whoosh/util/__init__.py" = ["F821"] + +[tool.ruff.lint.pylint] +max-args = 22 # Default is 5 +max-branches = 79 # Default is 12 +max-returns = 16 # Default is 6 +max-statements = 256 # Default is 50 diff --git a/requirements-dev.txt b/requirements-dev.txt index 8046a275..403fbade 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ pytest pythomata versioneer --e . \ No newline at end of file +-e . diff --git a/requirements.txt b/requirements.txt index 945c9b46..9c558e35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -. \ No newline at end of file +. 
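The new pyproject.toml above centralizes ruff's lint and format settings. As a minimal sketch (assuming a local ruff install compatible with the v0.2.1 pin in .pre-commit-config.yaml), the same checks can be run by hand; both commands discover pyproject.toml automatically, so no extra --config flag is needed:

```bash
# Lint using the rule selection in [tool.ruff.lint]; --fix applies safe autofixes
$ ruff check --fix .

# Format the tree, matching the ruff-format pre-commit hook
$ ruff format .
```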
diff --git a/scripts/make_checkpoint.py b/scripts/make_checkpoint.py index 0547c880..d690826b 100644 --- a/scripts/make_checkpoint.py +++ b/scripts/make_checkpoint.py @@ -3,13 +3,14 @@ # Make a "checkpoint" index, capturing the index format created by a certain # version of Whoosh -from __future__ import print_function, with_statement -import os.path, random, sys -from datetime import datetime -from whoosh import fields, index -from whoosh.compat import u, range +import os.path +import random +import sys +from datetime import datetime, timezone +from whoosh import fields, index +from whoosh.compat import u if len(sys.argv) < 2: print("USAGE: make_checkpoint.py ") @@ -43,9 +44,14 @@ with ix.writer() as w: for num in range(100): frac += 0.15 - path = u("%s/%s" % (segnum, num)) + path = u(f"{segnum}/{num}") title = " ".join(random.choice(words) for _ in range(100)) - dt = datetime(year=2000 + counter, month=(counter % 12) + 1, day=15) + dt = datetime( + year=2000 + counter, + month=(counter % 12) + 1, + day=15, + tzinfo=timezone.utc, + ) w.add_document( path=path, diff --git a/scripts/pylint.ini b/scripts/pylint.ini index cda16306..630e452e 100644 --- a/scripts/pylint.ini +++ b/scripts/pylint.ini @@ -1,11 +1,11 @@ # lint Python modules using external checkers. -# +# # This is the main checker controling the other ones and the reports # generation. It is itself both a raw checker and an astng checker in order # to: # * handle message activation / deactivation at the module level # * handle some basic but necessary stats'data (number of classes, methods...) -# +# [MASTER] # Specify a configuration file. @@ -92,7 +92,7 @@ comment=no # * undefined variables # * redefinition of variable from builtins or from an outer scope # * use of variable before assigment -# +# [VARIABLES] # Tells wether we should check for unused import in __init__ files. @@ -107,7 +107,7 @@ additional-builtins= # try to find bugs in the code using type inference -# +# [TYPECHECK] # Tells wether missing members accessed in mixin class should be ignored. A @@ -132,7 +132,7 @@ acquired-members=REQUEST,acl_users,aq_parent # * dangerous default values as arguments # * redefinition of function / method / class # * uses of the global statement -# +# [BASIC] # Required attributes for module, separated by a comma @@ -183,7 +183,7 @@ bad-functions=apply,input # checks for sign of poor/misdesign: # * number of methods, attributes, local variables... # * size, complexity of functions, methods -# +# [DESIGN] # Maximum number of arguments for function / method @@ -219,7 +219,7 @@ max-public-methods=20 # * relative / wildcard imports # * cyclic imports # * uses of deprecated modules -# +# [IMPORTS] # Deprecated modules which should not be used, separated by a comma @@ -245,7 +245,7 @@ int-import-graph= # * attributes not defined in the __init__ method # * supported interfaces implementation # * unreachable code -# +# [CLASSES] # List of interface methods to ignore, separated by a comma. This is used for @@ -259,7 +259,7 @@ defining-attr-methods=__init__,__new__,setUp # checks for similarities and duplicated code. This computation may be # memory / CPU intensive, so you should disable it if you experiments some # problems. -# +# [SIMILARITIES] # Minimum lines number of a similarity. 
@@ -275,7 +275,7 @@ ignore-docstrings=yes # checks for: # * warning notes in the code like FIXME, XXX # * PEP 263: source code with non ascii character but no encoding declaration -# +# [MISCELLANEOUS] # List of note tags to take in consideration, separated by a comma. @@ -287,7 +287,7 @@ notes=FIXME,XXX,TODO # * strict indentation # * line length # * use of <> instead of != -# +# [FORMAT] # Maximum number of characters on a single line. diff --git a/scripts/read_checkpoint.py b/scripts/read_checkpoint.py index 1385637d..c6947f0c 100644 --- a/scripts/read_checkpoint.py +++ b/scripts/read_checkpoint.py @@ -2,13 +2,12 @@ # Read a "checkpoint" index, to check backwards compatibility -from __future__ import print_function, with_statement + import sys from whoosh import index, query from whoosh.compat import u - if len(sys.argv) < 2: print("USAGE: read_checkpoint.py ") sys.exit(1) diff --git a/setup.cfg b/setup.cfg index b5fe96c0..649f2149 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [wheel] -universal = 1 +universal = True [build_sphinx] build-dir = docs/build @@ -12,27 +12,19 @@ upload-dir = docs/build/html formats = zip,gztar [aliases] -push = sdist bdist_wheel upload +push = sdist bdist_wheel twine upload pushdocs = build_sphinx upload_sphinx [tool:pytest] ; --tb= traceback print mode (long/short/line/native/no) -addopts = -rs --tb=native +addopts = -rs --tb=short norecursedirs = .hg .tox _build tmp* env* benchmark stress -minversion = 2.0 +minversion = 3.0 python_files = test_*.py -pep8ignore = - *.py E121 E122 E123 E124 E125 E126 E127 E128 # continuation line indentation - *.py E401 # imports on separate lines - *.py W391 # blank line at end of file - - test_*.py E501 # Ignore long lines in tests - - upload.py ALL # 3rd party (and not in the repo): rietveld upload tool - docs/source/conf.py ALL # sphinx stuff, automatically generated, don't check this - src/whoosh/lang/*.py ALL # 3rd party / crashing py.test with non-ascii stuff - src/whoosh/lang/snowball/*.py ALL # 3rd party - src/whoosh/support/relativedelta.py ALL # 3rd party - src/whoosh/support/charset.py ALL # non-ascii py.test crash - src/whoosh/support/unicode.py ALL # non-ascii py.test crash + +[tool.coverage.run] +source = ["src/whoosh"] + +[tool.pytest.ini_options] +addopts = "--cov --cov-report=lcov:lcov.info --cov-report=term" diff --git a/setup.py b/setup.py index ff6511b5..e676f287 100644 --- a/setup.py +++ b/setup.py @@ -38,18 +38,20 @@ def run_tests(self): author_email="matt@whoosh.ca", maintainer="Sygil-Dev", description="Fast, pure-Python full text indexing, search, and spell checking library.", - long_description=open("README.md", "r").read(), + long_description=open("README.md").read(), long_description_content_type="text/markdown", license="Two-clause BSD license", keywords="index search text spell", url="https://github.com/Sygil-Dev/whoosh-reloaded", zip_safe=True, install_requires=[ - "cached-property", + "cached-property==1.5.2", + "loguru==0.7.2", ], tests_require=[ - "pytest", - "nose", + "pytest==8.0.0", + "nose==1.3.7", + "pre-commit==3.6.0", ], cmdclass={"test": PyTest}, classifiers=[ diff --git a/src/whoosh/analysis/__init__.py b/src/whoosh/analysis/__init__.py index 66293bc1..0c116bf6 100644 --- a/src/whoosh/analysis/__init__.py +++ b/src/whoosh/analysis/__init__.py @@ -60,10 +60,60 @@ a filter first or a tokenizer after the first item). 
""" -from whoosh.analysis.acore import * -from whoosh.analysis.tokenizers import * -from whoosh.analysis.filters import * -from whoosh.analysis.morph import * -from whoosh.analysis.intraword import * -from whoosh.analysis.ngrams import * -from whoosh.analysis.analyzers import * +from whoosh.analysis.acore import ( + Composable, + CompositionError, + Token, + entoken, + unstopped, +) +from whoosh.analysis.analyzers import ( + Analyzer, + FancyAnalyzer, + IDAnalyzer, + KeywordAnalyzer, + LanguageAnalyzer, + RegexAnalyzer, + SimpleAnalyzer, + StandardAnalyzer, + StemmingAnalyzer, +) +from whoosh.analysis.filters import ( + STOP_WORDS, + CharsetFilter, + Composable, + DelimitedAttributeFilter, + Filter, + LoggingFilter, + LowercaseFilter, + MultiFilter, + PassFilter, + ReverseTextFilter, + StopFilter, + StripFilter, + SubstitutionFilter, + TeeFilter, + url_pattern, +) +from whoosh.analysis.intraword import ( + BiWordFilter, + CompoundWordFilter, + IntraWordFilter, + ShingleFilter, +) +from whoosh.analysis.morph import DoubleMetaphoneFilter, PyStemmerFilter, StemFilter +from whoosh.analysis.ngrams import ( + NgramAnalyzer, + NgramFilter, + NgramTokenizer, + NgramWordAnalyzer, +) +from whoosh.analysis.tokenizers import ( + CharsetTokenizer, + CommaSeparatedTokenizer, + IDTokenizer, + PathTokenizer, + RegexTokenizer, + SpaceSeparatedTokenizer, + Tokenizer, +) diff --git a/src/whoosh/analysis/acore.py b/src/whoosh/analysis/acore.py index adb53b7c..74bf926a 100644 --- a/src/whoosh/analysis/acore.py +++ b/src/whoosh/analysis/acore.py @@ -27,23 +27,24 @@ from whoosh.compat import iteritems - # Exceptions + class CompositionError(Exception): pass # Utility functions + def unstopped(tokenstream): - """Removes tokens from a token stream where token.stopped = True. - """ + """Removes tokens from a token stream where token.stopped = True.""" return (t for t in tokenstream if not t.stopped) -def entoken(textstream, positions=False, chars=False, start_pos=0, - start_char=0, **kwargs): +def entoken( + textstream, positions=False, chars=False, start_pos=0, start_char=0, **kwargs +): """Takes a sequence of unicode strings and yields a series of Token objects (actually the same Token object over and over, for performance reasons), with the attributes filled in with reasonable values (for example, if @@ -72,7 +73,8 @@ def entoken(textstream, positions=False, chars=False, start_pos=0, # Token object -class Token(object): + +class Token: """ Represents a "token" (usually a word) extracted from the source text being indexed. @@ -101,8 +103,9 @@ def RemoveDuplicatesFilter(self, stream): ...or, call token.copy() to get a copy of the token object. """ - def __init__(self, positions=False, chars=False, removestops=True, mode='', - **kwargs): + def __init__( + self, positions=False, chars=False, removestops=True, mode="", **kwargs + ): """ :param positions: Whether tokens should have the token position in the 'pos' attribute. 
@@ -123,9 +126,10 @@ def __init__(self, positions=False, chars=False, removestops=True, mode='', self.__dict__.update(kwargs) def __repr__(self): - parms = ", ".join("%s=%r" % (name, value) - for name, value in iteritems(self.__dict__)) - return "%s(%s)" % (self.__class__.__name__, parms) + parms = ", ".join( + f"{name}={value!r}" for name, value in iteritems(self.__dict__) + ) + return f"{self.__class__.__name__}({parms})" def copy(self): # This is faster than using the copy module @@ -134,23 +138,24 @@ def copy(self): # Composition support -class Composable(object): + +class Composable: is_morph = False def __or__(self, other): from whoosh.analysis.analyzers import CompositeAnalyzer if not isinstance(other, Composable): - raise TypeError("%r is not composable with %r" % (self, other)) + raise TypeError(f"{self!r} is not composable with {other!r}") return CompositeAnalyzer(self, other) def __repr__(self): attrs = "" if self.__dict__: - attrs = ", ".join("%s=%r" % (key, value) - for key, value - in iteritems(self.__dict__)) - return self.__class__.__name__ + "(%s)" % attrs + attrs = ", ".join( + f"{key}={value!r}" for key, value in iteritems(self.__dict__) + ) + return self.__class__.__name__ + f"({attrs})" def has_morph(self): return self.is_morph diff --git a/src/whoosh/analysis/analyzers.py b/src/whoosh/analysis/analyzers.py index ba1bd42b..236733d9 100644 --- a/src/whoosh/analysis/analyzers.py +++ b/src/whoosh/analysis/analyzers.py @@ -26,19 +26,19 @@ # policies, either expressed or implied, of Matt Chaput. from whoosh.analysis.acore import Composable, CompositionError -from whoosh.analysis.tokenizers import Tokenizer -from whoosh.analysis.filters import LowercaseFilter -from whoosh.analysis.filters import StopFilter, STOP_WORDS -from whoosh.analysis.morph import StemFilter +from whoosh.analysis.filters import STOP_WORDS, LowercaseFilter, StopFilter from whoosh.analysis.intraword import IntraWordFilter -from whoosh.analysis.tokenizers import default_pattern -from whoosh.analysis.tokenizers import CommaSeparatedTokenizer -from whoosh.analysis.tokenizers import IDTokenizer -from whoosh.analysis.tokenizers import RegexTokenizer -from whoosh.analysis.tokenizers import SpaceSeparatedTokenizer +from whoosh.analysis.morph import StemFilter +from whoosh.analysis.tokenizers import ( + CommaSeparatedTokenizer, + IDTokenizer, + RegexTokenizer, + SpaceSeparatedTokenizer, + Tokenizer, + default_pattern, +) from whoosh.lang.porter import stem - # Analyzers @@ -46,7 +46,7 @@ class Analyzer(Composable): """Abstract base class for analyzers.""" def __repr__(self): - return "%s()" % self.__class__.__name__ + return f"{self.__class__.__name__}()" def __eq__(self, other): return ( @@ -59,6 +59,7 @@ def __call__(self, value, **kwargs): raise NotImplementedError def clean(self): + # This method is intentionally left empty. 
pass @@ -78,12 +79,11 @@ def __init__(self, *composables): for item in self.items[1:]: if isinstance(item, Tokenizer): raise CompositionError( - "Only one tokenizer allowed at the start" - " of the analyzer: %r" % self.items + f"Only one tokenizer allowed at the start of the analyzer: {self.items}" ) def __repr__(self): - return "%s(%s)" % ( + return "{}({})".format( self.__class__.__name__, ", ".join(repr(item) for item in self.items), ) @@ -239,7 +239,6 @@ def FancyAnalyzer( expression=r"\s+", stoplist=STOP_WORDS, minsize=2, - maxsize=None, gaps=True, splitwords=True, splitnums=True, diff --git a/src/whoosh/analysis/filters.py b/src/whoosh/analysis/filters.py index 110e6ba3..ee64bf9b 100644 --- a/src/whoosh/analysis/filters.py +++ b/src/whoosh/analysis/filters.py @@ -1,5 +1,3 @@ -# coding=utf-8 - # Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,11 +27,10 @@ from itertools import chain -from whoosh.compat import next from whoosh.analysis.acore import Composable +from whoosh.compat import next from whoosh.util.text import rcompile - # Default list of stop words (words so common it's usually wasteful to index # them). This list is used by the StopFilter class, which allows you to supply # an optional list to override this one. @@ -114,7 +111,7 @@ def __eq__(self, other): ) def __ne__(self, other): return not self == other def __call__(self, tokens): raise NotImplementedError @@ -181,8 +178,8 @@ def __eq__(self, other): def __call__(self, tokens): # Only selects on the first token t = next(tokens) - filter = self.filters.get(t.mode, self.default_filter) - return filter(chain([t], tokens)) + selected_filter = self.filters.get(t.mode, self.default_filter) + return selected_filter(chain([t], tokens)) class TeeFilter(Filter): @@ -212,7 +209,7 @@ class TeeFilter(Filter): def __init__(self, *filters): if len(filters) < 2: - raise Exception("TeeFilter requires two or more filters") + raise ValueError("TeeFilter requires two or more filters") self.filters = filters def __eq__(self, other): diff --git a/src/whoosh/analysis/intraword.py b/src/whoosh/analysis/intraword.py index 85355f11..06b99c25 100644 --- a/src/whoosh/analysis/intraword.py +++ b/src/whoosh/analysis/intraword.py @@ -28,9 +28,8 @@ import re from collections import deque -from whoosh.compat import u, text_type -from whoosh.compat import range from whoosh.analysis.filters import Filter +from whoosh.compat import text_type, u class CompoundWordFilter(Filter): diff --git a/src/whoosh/analysis/morph.py b/src/whoosh/analysis/morph.py index 50d5a631..7b1944c1 100644 --- a/src/whoosh/analysis/morph.py +++ b/src/whoosh/analysis/morph.py @@ -92,7 +92,7 @@ def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): def __getstate__(self): # Can't pickle a dynamic function, so we have to remove the _stem # attribute from the state - return dict([(k, self.__dict__[k]) for k in self.__dict__ if k != "_stem"]) + return {k: self.__dict__[k] for k in self.__dict__ if k != "_stem"} def __setstate__(self, state): # Check for old instances of StemFilter class, which didn't have a @@ -197,7 +197,7 @@ def _get_stemmer_fn(self): def __getstate__(self): # Can't pickle a dynamic function, so we have to remove the _stem # attribute from the state - return dict([(k, self.__dict__[k]) for k in self.__dict__ if k != "_stem"]) + return {k: self.__dict__[k] for k in self.__dict__ if k != "_stem"} def __setstate__(self, state): # Check for old
instances of StemFilter class, which didn't have a diff --git a/src/whoosh/analysis/ngrams.py b/src/whoosh/analysis/ngrams.py index 8ada2b58..4281c1fa 100644 --- a/src/whoosh/analysis/ngrams.py +++ b/src/whoosh/analysis/ngrams.py @@ -25,12 +25,10 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh.compat import text_type -from whoosh.compat import range from whoosh.analysis.acore import Token from whoosh.analysis.filters import Filter, LowercaseFilter -from whoosh.analysis.tokenizers import Tokenizer, RegexTokenizer - +from whoosh.analysis.tokenizers import RegexTokenizer, Tokenizer +from whoosh.compat import text_type # Tokenizer @@ -79,9 +77,9 @@ def __call__( start_pos=0, start_char=0, mode="", - **kwargs + **kwargs, ): - assert isinstance(value, text_type), "%r is not unicode" % value + assert isinstance(value, text_type), f"{value!r} is not unicode" inlen = len(value) t = Token(positions, chars, removestops=removestops, mode=mode) diff --git a/src/whoosh/analysis/tokenizers.py b/src/whoosh/analysis/tokenizers.py index b0340fcb..b76e7df5 100644 --- a/src/whoosh/analysis/tokenizers.py +++ b/src/whoosh/analysis/tokenizers.py @@ -25,11 +25,10 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh.compat import u, text_type from whoosh.analysis.acore import Composable, Token +from whoosh.compat import text_type, u from whoosh.util.text import rcompile - default_pattern = rcompile(r"[\w\*]+(\.?[\w\*]+)*") @@ -62,9 +61,9 @@ def __call__( start_pos=0, start_char=0, mode="", - **kwargs + **kwargs, ): - assert isinstance(value, text_type), "%r is not unicode" % value + assert isinstance(value, text_type), f"{value!r} is not unicode" t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) t.text = value t.boost = 1.0 @@ -117,7 +116,7 @@ def __call__( start_char=0, tokenize=True, mode="", - **kwargs + **kwargs, ): """ :param value: The unicode string to tokenize. @@ -132,7 +131,7 @@ def __call__( :param tokenize: if True, the text should be tokenized. """ - assert isinstance(value, text_type), "%s is not unicode" % repr(value) + assert isinstance(value, text_type), f"{repr(value)} is not unicode" t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) if not tokenize: @@ -250,7 +249,7 @@ def __call__( start_char=0, tokenize=True, mode="", - **kwargs + **kwargs, ): """ :param value: The unicode string to tokenize. @@ -265,7 +264,7 @@ def __call__( :param tokenize: if True, the text should be tokenized. 
""" - assert isinstance(value, text_type), "%r is not unicode" % value + assert isinstance(value, text_type), f"{value!r} is not unicode" t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) if not tokenize: @@ -353,7 +352,7 @@ def __init__(self, expression="[^/]+"): self.expr = rcompile(expression) def __call__(self, value, positions=False, start_pos=0, **kwargs): - assert isinstance(value, text_type), "%r is not unicode" % value + assert isinstance(value, text_type), f"{value!r} is not unicode" token = Token(positions, **kwargs) pos = start_pos for match in self.expr.finditer(value): diff --git a/src/whoosh/automata/fsa.py b/src/whoosh/automata/fsa.py index 84639a27..f27c64cd 100644 --- a/src/whoosh/automata/fsa.py +++ b/src/whoosh/automata/fsa.py @@ -1,12 +1,9 @@ -from __future__ import print_function - import itertools import operator import sys from bisect import bisect_left -from whoosh.compat import iteritems, next, text_type, unichr, range - +from whoosh.compat import iteritems, next, text_type, unichr unull = unichr(0) @@ -14,12 +11,12 @@ # Marker constants -class Marker(object): +class Marker: def __init__(self, name): self.name = name def __repr__(self): - return "<%s>" % self.name + return f"<{self.name}>" EPSILON = Marker("EPSILON") @@ -29,7 +26,7 @@ def __repr__(self): # Base class -class FSA(object): +class FSA: def __init__(self, initial): self.initial = initial self.transitions = {} @@ -68,8 +65,7 @@ def generate_all(self, state=None, sofar=""): yield sofar for label in sorted(self.get_labels(state)): newstate = self.next_state(state, label) - for string in self.generate_all(newstate, sofar + label): - yield string + yield from self.generate_all(newstate, sofar + label) def start(self): return self.initial @@ -126,10 +122,10 @@ def dump(self, stream=sys.stdout): xs = self.transitions[src] for label in xs: dests = xs[label] - end = "||" if self.is_final(dests) else "" + _ = "||" if self.is_final(dests) else "" def start(self): - return frozenset(self._expand(set([self.initial]))) + return frozenset(self._expand({self.initial})) def add_transition(self, src, label, dest): self.transitions.setdefault(src, {}).setdefault(label, set()).add(dest) @@ -241,7 +237,7 @@ def dump(self, stream=sys.stdout): xs = self.transitions[src] for label in sorted(xs): dest = xs[label] - end = "||" if self.is_final(dest) else "" + _ = "||" if self.is_final(dest) else "" def start(self): return self.initial @@ -391,7 +387,7 @@ def minimize(self): assert new_initial is not None # Apply mapping to existing transitions - new_finals = set(mapping[s] for s in final_states) + new_finals = {mapping[s] for s in final_states} for state, d in iteritems(new_trans): trans = transitions[state] for label, dest in iteritems(trans): @@ -457,7 +453,7 @@ def u_to_utf8(dfa, base=0): if label is EPSILON: continue elif label is ANY: - raise Exception + raise ValueError else: assert isinstance(label, text_type) label8 = label.encode("utf8") @@ -636,14 +632,14 @@ def optional_nfa(n): # Daciuk Mihov DFA construction algorithm -class DMNode(object): +class DMNode: def __init__(self, n): self.n = n self.arcs = {} self.final = False def __repr__(self): - return "<%s, %r>" % (self.n, self.tuple()) + return f"<{self.n}, {self.tuple()!r}>" def __hash__(self): return hash(self.tuple()) @@ -663,9 +659,9 @@ def strings_dfa(strings): for string in strings: if string <= last: - raise Exception("Strings must be in order") + raise ValueError("Strings must be in order") if not string: - raise Exception("Can't add 
empty string") + raise ValueError("Can't add empty string") # Find the common prefix with the previous string i = 0 diff --git a/src/whoosh/automata/fst.py b/src/whoosh/automata/fst.py index fed1ce7e..2566de35 100644 --- a/src/whoosh/automata/fst.py +++ b/src/whoosh/automata/fst.py @@ -39,18 +39,33 @@ """ -import sys, copy +import copy +import sys from array import array from hashlib import sha1 # type: ignore @UnresolvedImport -from whoosh.compat import b, u, BytesIO -from whoosh.compat import range, iteritems, iterkeys, izip, array_tobytes -from whoosh.compat import bytes_type, text_type +from whoosh.compat import ( + BytesIO, + array_tobytes, + b, + bytes_type, + iteritems, + iterkeys, + izip, + range, + text_type, + u, +) from whoosh.filedb.structfile import StructFile -from whoosh.system import _INT_SIZE -from whoosh.system import pack_byte, pack_int, pack_uint, pack_long -from whoosh.system import emptybytes -from whoosh.util.text import utf8encode, utf8decode +from whoosh.system import ( + _INT_SIZE, + emptybytes, + pack_byte, + pack_int, + pack_long, + pack_uint, +) +from whoosh.util.text import utf8decode, utf8encode from whoosh.util.varints import varint @@ -73,7 +88,7 @@ class InactiveCursor(Exception): # FST Value types -class Values(object): +class Values: """Base for classes the describe how to encode and decode FST values.""" @staticmethod @@ -292,7 +307,7 @@ def write(dbfile, v): def read(self, dbfile): typecode = u(dbfile.read(1)) length = dbfile.read_int() - return dbfile.read_array(self.typecode, length) + return dbfile.read_array(typecode, length) def skip(self, dbfile): length = dbfile.read_int() @@ -349,7 +364,7 @@ def to_bytes(v): # Node-like interface wrappers -class Node(object): +class Node: """A slow but easier-to-use wrapper for FSA/DAWGs. Translates the low-level arc-based interface of GraphReader into Node objects with methods to follow edges. @@ -376,10 +391,10 @@ def _load(self): if self.address is None: d = {} else: - d = dict( - (arc.label, Node(owner, arc.target, arc.accept)) + d = { + arc.label: Node(owner, arc.target, arc.accept) for arc in self.owner.iter_arcs(self.address) - ) + } self._edges = d def keys(self): @@ -402,8 +417,7 @@ def flatten(self, sofar=emptybytes): yield sofar for key in sorted(self): node = self.edge(key) - for result in node.flatten(sofar + key): - yield result + yield from node.flatten(sofar + key) def flatten_strings(self): return (utf8decode(k)[0] for k in self.flatten()) @@ -421,7 +435,7 @@ def __init__(self, a, b): self.b = b def __repr__(self): - return "<%s %r %r>" % (self.__class__.__name__, self.a, self.b) + return f"<{self.__class__.__name__} {self.a!r} {self.b!r}>" def __contains__(self, key): return key in self.a or key in self.b @@ -461,7 +475,7 @@ def edge(self, key): # Cursor -class BaseCursor(object): +class BaseCursor: """Base class for a cursor-type object for navigating an FST/word graph, represented by a :class:`GraphReader` object. @@ -510,8 +524,7 @@ def peek_key(self): key in the graph. 
""" - for label in self.prefix(): - yield label + yield from self.prefix() c = self.copy() while not c.stopped(): c.follow() @@ -699,8 +712,7 @@ def peek_key(self): if not self.stack: raise InactiveCursor - for label in self.prefix(): - yield label + yield from self.prefix() arc = copy.copy(self.stack[-1]) graph = self.graph while not arc.accept and arc.target is not None: @@ -806,7 +818,7 @@ def _pop_to_prefix(self, key): return i -class UncompiledNode(object): +class UncompiledNode: # Represents an "in-memory" node used by the GraphWriter before it is # written to disk. @@ -824,7 +836,7 @@ def clear(self): self.inputcount = 0 def __repr__(self): - return "<%r>" % ([(a.label, a.value) for a in self.arcs],) + return f"<{[(a.label, a.value) for a in self.arcs]!r}>" def digest(self): if self._digest is None: @@ -855,7 +867,7 @@ def add_arc(self, label, target): def replace_last(self, label, target, accept, acceptval=None): arc = self.arcs[-1] - assert arc.label == label, "%r != %r" % (arc.label, label) + assert arc.label == label, f"{arc.label!r} != {label!r}" arc.target = target arc.accept = accept arc.acceptval = acceptval @@ -867,7 +879,7 @@ def delete_last(self, label, target): def set_last_value(self, label, value): arc = self.arcs[-1] - assert arc.label == label, "%r->%r" % (arc.label, label) + assert arc.label == label, f"{arc.label!r}->{label!r}" arc.value = value def prepend_value(self, prefix): @@ -878,7 +890,7 @@ def prepend_value(self, prefix): self.value = add(prefix, self.value) -class Arc(object): +class Arc: """ Represents a directed arc between two nodes in an FSA/FST graph. @@ -918,11 +930,11 @@ def __init__( self.endpos = endpos def __repr__(self): - return "<%r-%s %s%s>" % ( + return "<{!r}-{} {}{}>".format( self.label, self.target, "." if self.accept else "", - (" %r" % self.value) if self.value else "", + f" {self.value!r}" if self.value else "", ) def __eq__(self, other): @@ -953,7 +965,7 @@ def copy(self): # Graph writer -class GraphWriter(object): +class GraphWriter: """Writes an FSA/FST graph to disk. Call ``insert(key)`` to insert keys into the graph. 
You must @@ -1052,7 +1064,7 @@ def insert(self, key, value=None): """ if not self._infield: - raise Exception("Inserted %r before starting a field" % key) + raise Exception(f"Inserted {key!r} before starting a field") self._inserted = True key = to_labels(key) # Python 3 sucks @@ -1060,9 +1072,9 @@ def insert(self, key, value=None): lastkey = self.lastkey nodes = self.nodes if len(key) < 1: - raise KeyError("Can't store a null key %r" % (key,)) + raise KeyError(f"Can't store a null key {key!r}") if lastkey and lastkey > key: - raise KeyError("Keys out of order %r..%r" % (lastkey, key)) + raise KeyError(f"Keys out of order {lastkey!r}..{key!r}") # Find the common prefix shared by this key and the previous one prefixlen = 0 @@ -1085,7 +1097,7 @@ def insert(self, key, value=None): if vtype: if value is not None and not vtype.is_valid(value): - raise ValueError("%r is not valid for %s" % (value, vtype)) + raise ValueError(f"{value!r} is not valid for {vtype}") # Push value commonalities through the tree common = None @@ -1109,7 +1121,7 @@ def insert(self, key, value=None): else: nodes[prefixlen].set_last_value(key[prefixlen], value) elif value: - raise Exception("Value %r but no value type" % value) + raise Exception(f"Value {value!r} but no value type") self.lastkey = key @@ -1232,7 +1244,7 @@ def _write_node(self, uncnode): # Graph reader -class BaseGraphReader(object): +class BaseGraphReader: def cursor(self, rootname=None): return Cursor(self, self.root(rootname)) @@ -1265,7 +1277,7 @@ def list_arcs(self, address): return list(arc.copy() for arc in self.iter_arcs(address)) def arc_dict(self, address): - return dict((arc.label, arc.copy()) for arc in self.iter_arcs(address)) + return {arc.label: arc.copy() for arc in self.iter_arcs(address)} def find_path(self, path, arc=None, address=None): path = to_labels(path) @@ -1447,7 +1459,7 @@ def to_labels(key): # I hate the Python 3 bytes object so friggin much if keytype is tuple or keytype is list: if not all(isinstance(e, bytes_type) for e in key): - raise TypeError("%r contains a non-bytestring" % key) + raise TypeError(f"{key!r} contains a non-bytestring") if keytype is list: key = tuple(key) elif isinstance(key, bytes_type): @@ -1455,7 +1467,7 @@ def to_labels(key): elif isinstance(key, text_type): key = tuple(utf8encode(key[i : i + 1])[0] for i in range(len(key))) else: - raise TypeError("Don't know how to convert %r" % key) + raise TypeError(f"Don't know how to convert {key!r}") return key @@ -1561,6 +1573,6 @@ def dump_graph(graph, address=None, tab=0, out=None): else: out.write(" " * 6) out.write(" " * tab) - out.write("%r %r %s %r\n" % (arc.label, arc.target, arc.accept, arc.value)) + out.write(f"{arc.label!r} {arc.target!r} {arc.accept} {arc.value!r}\n") if arc.target is not None: dump_graph(graph, arc.target, tab + 1, out=out) diff --git a/src/whoosh/automata/glob.py b/src/whoosh/automata/glob.py index 32573afa..c41074c7 100644 --- a/src/whoosh/automata/glob.py +++ b/src/whoosh/automata/glob.py @@ -27,7 +27,6 @@ from whoosh.automata.fsa import ANY, EPSILON, NFA - # Constants for glob _LIT = 0 _STAR = 1 diff --git a/src/whoosh/automata/lev.py b/src/whoosh/automata/lev.py index 8d71fae4..08317edd 100644 --- a/src/whoosh/automata/lev.py +++ b/src/whoosh/automata/lev.py @@ -1,6 +1,3 @@ -from __future__ import print_function - -from whoosh.compat import range from whoosh.automata.fsa import ANY, EPSILON, NFA diff --git a/src/whoosh/automata/nfa.py b/src/whoosh/automata/nfa.py deleted file mode 100644 index 5853a1e4..00000000 --- 
a/src/whoosh/automata/nfa.py +++ /dev/null @@ -1,389 +0,0 @@ -# Copyright 2012 Matt Chaput. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR -# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, -# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# The views and conclusions contained in the software and documentation are -# those of the authors and should not be interpreted as representing official -# policies, either expressed or implied, of Matt Chaput. - -from whoosh.automata.fst import Arc - - -class Instruction(object): - def __repr__(self): - return "%s()" % (self.__class__.__name__,) - - -class Char(Instruction): - """ - Matches a literal character. - """ - - def __init__(self, c): - self.c = c - - def __repr__(self): - return "Char(%r)" % self.c - - -class Lit(Instruction): - """ - Matches a literal string. - """ - - def __init__(self, c): - self.c = c - - def __repr__(self): - return "Lit(%r)" % self.c - - -class Any(Instruction): - """ - Matches any character. - """ - - -class Match(Instruction): - """ - Stop this thread: the string matched. - """ - - def __repr__(self): - return "Match()" - - -class Jmp(Instruction): - """ - Jump to a specified instruction. - """ - - def __init__(self, x): - self.x = x - - def __repr__(self): - return "Jmp(%s)" % self.x - - -class Split(Instruction): - """ - Split execution: continue at two separate specified instructions. 
- """ - - def __init__(self, x, y): - self.x = x - self.y = y - - def __repr__(self): - return "Split(%s, %s)" % (self.x, self.y) - - -class Label(Instruction): - """ - Placeholder to act as a target for JMP instructions - """ - - def __hash__(self): - return id(self) - - def __repr__(self): - return "L(%s)" % hex(id(self)) - - -def concat(e1, e2): - return e1 + e2 - - -def alt(e1, e2): - L1, L2, L3 = Label(), Label(), Label() - return [L1] + e1 + [Jmp(L3), L2] + e2 + [L3] - - -def zero_or_one(e): - L1, L2 = Label(), Label() - return [Split(L1, L2), L1] + e + [L2] - - -def zero_or_more(e): - L1, L2, L3 = Label(), Label(), Label() - return [L1, Split(L2, L3), L2] + e + [Jmp(L1), L3] - - -def one_or_more(e): - L1, L2 = Label(), Label() - return [L1] + e + [Split(L1, L2), L2] - - -def fixup(program): - refs = {} - i = 0 - while i < len(program): - op = program[i] - if isinstance(op, Label): - refs[op] = i - program.pop(i) - else: - i += 1 - - if refs: - for op in program: - if isinstance(op, (Jmp, Split)): - op.x = refs[op.x] - if isinstance(op, Split): - op.y = refs[op.y] - - return program + [Match] - - -class ThreadList(object): - def __init__(self, program, max=1000): - self.program = program - self.max = max - self.threads = [] - - def __nonzero__(self): - return bool(self.threads) - - def current(self): - return self.threads.pop() - - def add(self, thread): - op = self.program[thread.pc] - optype = type(op) - if optype is Jmp: - self.add(thread.at(op.x)) - elif optype is Split: - self.add(thread.copy_at(op.x)) - self.add(thread.at(op.y)) - else: - self.threads.append(thread) - - -class Thread(object): - def __init__(self, pc, address, sofar="", accept=False): - self.pc = pc - self.address = address - self.sofar = sofar - self.accept = accept - - def at(self, pc): - self.pc = pc - return self - - def copy_at(self, pc): - return Thread(pc, self.address, self.sofar, self.accept) - - def __repr__(self): - d = self.__dict__ - return "Thread(%s)" % ",".join("%s=%r" % (k, v) for k, v in d.items()) - - -def advance(thread, arc, c): - thread.pc += 1 - thread.address = arc.target - thread.sofar += c - thread.accept = arc.accept - - -def run(graph, program, address): - threads = ThreadList(program) - threads.add(Thread(0, address)) - arc = Arc() - while threads: - thread = threads.current() - address = thread.address - op = program[thread.pc] - optype = type(op) - - if optype is Char: - if address: - arc = graph.find_arc(address, op.c, arc) - if arc: - advance(thread, arc) - threads.add(thread) - elif optype is Lit: - if address: - c = op.c - arc = graph.find_path(c, arc, address) - if arc: - advance(thread, arc, c) - threads.add(thread) - elif optype is Any: - if address: - sofar = thread.sofar - pc = thread.pc + 1 - for arc in graph.iter_arcs(address, arc): - t = Thread(pc, arc.target, sofar + arc.label, arc.accept) - threads.add(t) - elif op is Match: - if thread.accept: - yield thread.sofar - else: - raise Exception("Don't know what to do with %r" % op) - - -LO = 0 -HI = 1 - - -def regex_limit(graph, mode, program, address): - low = mode == LO - output = [] - threads = ThreadList(program) - threads.add(Thread(0, address)) - arc = Arc() - while threads: - thread = threads.current() - address = thread.address - op = program[thread.pc] - optype = type(op) - - if optype is Char: - if address: - arc = graph.find_arc(address, op.c, arc) - if arc: - if low and arc.accept: - return thread.sofar + thread.label - advance(thread, arc) - threads.add(thread) - elif optype is Lit: - if address: - labels = 
op.c - for label in labels: - arc = graph.find_arc(address, label) - if arc is None: - return thread.sofar - elif thread.accept: - return thread.sofar - elif optype is Any: - if address: - if low: - arc = graph.arc_at(address, arc) - else: - for arc in graph.iter_arcs(address): - pass - advance(thread, arc, arc.label) - threads.add(thread) - elif thread.accept: - return thread.sofar - elif op is Match: - return thread.sofar - else: - raise Exception("Don't know what to do with %r" % op) - - -# if __name__ == "__main__": -# from whoosh import index, query -# from whoosh.filedb.filestore import RamStorage -# from whoosh.automata import fst -# from whoosh.util.testing import timing -# -# st = RamStorage() -# gw = fst.GraphWriter(st.create_file("test")) -# gw.start_field("test") -# for key in ["aaaa", "aaab", "aabb", "abbb", "babb", "bbab", "bbba"]: -# gw.insert(key) -# gw.close() -# gr = fst.GraphReader(st.open_file("test")) -# -# program = one_or_more([Lit("a")]) -# print program -# program = fixup(program) -# print program -# print list(run(gr, program, gr.root("test"))) -# -# ix = index.open_dir("e:/dev/src/houdini/help/index") -# r = ix.reader() -# gr = r._get_graph() -# -# # program = fixup([Any(), Any(), Any(), Any(), Any()]) -# # program = fixup(concat(zero_or_more([Any()]), [Char("/")])) -# # with timing(): -# # x = list(run(gr, program, gr.root("path"))) -# # print len(x) -# -# q = query.Regex("path", "^.[abc].*/$") -# with timing(): -# y = list(q._btexts(r)) -# print len(y) -# print y[0], y[-1] -# -# pr = [Any()] + alt([Lit("c")], alt([Lit("b")], [Lit("a")])) + zero_or_more([Any()]) + [Lit("/")] -# program = fixup(pr) -# # with timing(): -# # x = list(run(gr, program, gr.root("path"))) -# # print len(x), x -# -# with timing(): -# print ("lo=", regex_limit(gr, LO, program, gr.root("path"))) -# print ("hi=", regex_limit(gr, HI, program, gr.root("path"))) -# -# -# -# #int -# #backtrackingvm(Inst *prog, char *input) -# #{ -# # enum { MAXTHREAD = 1000 }; -# # Thread ready[MAXTHREAD]; -# # int nready; -# # Inst *pc; -# # char *sp; -# # -# # /* queue initial thread */ -# # ready[0] = thread(prog, input); -# # nready = 1; -# # -# # /* run threads in stack order */ -# # while(nready > 0){ -# # --nready; /* pop state for next thread to run */ -# # pc = ready[nready].pc; -# # sp = ready[nready].sp; -# # for(;;){ -# # switch(pc->opcode){ -# # case Char: -# # if(*sp != pc->c) -# # goto Dead; -# # pc++; -# # sp++; -# # continue; -# # case Match: -# # return 1; -# # case Jmp: -# # pc = pc->x; -# # continue; -# # case Split: -# # if(nready >= MAXTHREAD){ -# # fprintf(stderr, "regexp overflow"); -# # return -1; -# # } -# # /* queue new thread */ -# # ready[nready++] = thread(pc->y, sp); -# # pc = pc->x; /* continue current thread */ -# # continue; -# # } -# # } -# # Dead:; -# # } -# # return 0; -# #} -# -# diff --git a/src/whoosh/automata/reg.py b/src/whoosh/automata/reg.py index f70f68b4..54a5ecf6 100644 --- a/src/whoosh/automata/reg.py +++ b/src/whoosh/automata/reg.py @@ -27,7 +27,6 @@ from whoosh.automata.fsa import ANY, EPSILON, NFA - # Operator precedence CHOICE = ("|",) ops = () @@ -38,7 +37,7 @@ def parse(pattern): ops = [] -class RegexBuilder(object): +class RegexBuilder: def __init__(self): self.statenum = 1 diff --git a/src/whoosh/classify.py b/src/whoosh/classify.py index 37898c77..4678ac0b 100644 --- a/src/whoosh/classify.py +++ b/src/whoosh/classify.py @@ -29,18 +29,17 @@ documents. 
""" -from __future__ import division + import random from collections import defaultdict from math import log -from whoosh.compat import range, iteritems - +from whoosh.compat import iteritems # Expansion models -class ExpansionModel(object): +class ExpansionModel: def __init__(self, doc_count, field_length): self.N = doc_count self.collection_total = field_length @@ -99,7 +98,7 @@ def score(self, weight_in_top, weight_in_collection, top_total): ) -class Expander(object): +class Expander: """Uses an ExpansionModel to expand the set of query terms based on the top N result documents. """ diff --git a/src/whoosh/codec/base.py b/src/whoosh/codec/base.py index d2cfeebe..23fbb594 100644 --- a/src/whoosh/codec/base.py +++ b/src/whoosh/codec/base.py @@ -33,12 +33,11 @@ from whoosh import columns from whoosh.automata import lev -from whoosh.compat import abstractmethod, izip, unichr, range +from whoosh.compat import abstractmethod, izip, unichr from whoosh.filedb.compound import CompoundStorage from whoosh.system import emptybytes from whoosh.util import random_name - # Exceptions @@ -49,7 +48,7 @@ class OutOfOrderError(Exception): # Base classes -class Codec(object): +class Codec: length_stats = True # Per document value writer @@ -77,6 +76,7 @@ def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): # Index readers def automata(self, storage, segment): + _ = storage, segment # Unused arguments return Automata() @abstractmethod @@ -128,7 +128,7 @@ def new_segment(self, storage, indexname): # Writer classes -class PerDocumentWriter(object): +class PerDocumentWriter: @abstractmethod def start_doc(self, docnum): raise NotImplementedError @@ -157,13 +157,15 @@ def readitems(): self.add_vector_items(fieldname, fieldobj, readitems()) def finish_doc(self): + # This method is intentionally left empty. pass def close(self): + # This method is intentionally left empty. pass -class FieldWriter(object): +class FieldWriter: def add_postings(self, schema, lengths, items): # This method translates a generator of (fieldname, btext, docnum, w, v) # postings into calls to start_field(), start_term(), add(), @@ -192,10 +194,10 @@ def add_postings(self, schema, lengths, items): # Check for out-of-order postings. This is convoluted because Python # 3 removed the ability to compare a string to None if lastfn is not None and fieldname < lastfn: - raise OutOfOrderError("Field %r .. %r" % (lastfn, fieldname)) + raise OutOfOrderError(f"Field {lastfn!r} .. {fieldname!r}") if fieldname == lastfn and lasttext and btext < lasttext: raise OutOfOrderError( - "Term %s:%r .. %s:%r" % (lastfn, lasttext, fieldname, btext) + f"Term {lastfn}:{lasttext!r} .. {fieldname}:{btext!r}" ) # If the fieldname of this posting is different from the last one, @@ -261,6 +263,7 @@ def finish_term(self): raise NotImplementedError def finish_field(self): + # This method is intentionally left empty. pass def close(self): @@ -270,7 +273,7 @@ def close(self): # Postings -class PostingsWriter(object): +class PostingsWriter: @abstractmethod def start_postings(self, format_, terminfo): raise NotImplementedError @@ -280,6 +283,7 @@ def add_posting(self, id_, weight, vbytes, length=None): raise NotImplementedError def finish_postings(self): + # This method is intentionally left empty. 
pass @abstractmethod @@ -292,7 +296,7 @@ def written(self): # Reader classes -class FieldCursor(object): +class FieldCursor: def first(self): raise NotImplementedError @@ -306,7 +310,7 @@ def term(self): raise NotImplementedError -class TermsReader(object): +class TermsReader: @abstractmethod def __contains__(self, term): raise NotImplementedError @@ -352,10 +356,11 @@ def indexed_field_names(self): raise NotImplementedError def close(self): + # This method is intentionally left empty. pass -class Automata(object): +class Automata: @staticmethod def levenshtein_dfa(uterm, maxdist, prefix=0): return lev.levenshtein_automaton(uterm, maxdist, prefix).to_dfa() @@ -387,8 +392,9 @@ def terms_within(self, fieldcur, uterm, maxdist, prefix=0): # Per-doc value reader -class PerDocumentReader(object): +class PerDocumentReader: def close(self): + # This method is intentionally left empty. pass @abstractmethod @@ -433,6 +439,7 @@ def supports_columns(self): return False def has_column(self, fieldname): + _ = fieldname # Unused argument return False def list_columns(self): @@ -445,6 +452,7 @@ def column_reader(self, fieldname, column): # Bitmaps def field_docs(self, fieldname): + _ = fieldname # Unused argument return None # Lengths @@ -468,6 +476,7 @@ def max_field_length(self, fieldname): # Vectors def has_vector(self, docnum, fieldname): + _ = docnum, fieldname # Unused arguments return False # Don't need to override this if has_vector() always returns False @@ -488,7 +497,7 @@ def all_stored_fields(self): # Segment base class -class Segment(object): +class Segment: """Do not instantiate this object directly. It is used by the Index object to hold information about a segment. A list of objects of this class are pickled as part of the TOC file. @@ -518,7 +527,7 @@ def _random_id(cls, size=16): return random_name(size=size) def __repr__(self): - return "<%s %s>" % (self.__class__.__name__, self.segment_id()) + return f"<{self.__class__.__name__} {self.segment_id()}>" def __eq__(self, other): return isinstance(other, type(self)) and self.segment_id() == other.segment_id() @@ -537,7 +546,7 @@ def segment_id(self): # Old segment class return self.name else: - return "%s_%s" % (self.index_name(), self.segid) + return f"{self.index_name()}_{self.segid}" def is_compound(self): if not hasattr(self, "compound"): @@ -547,10 +556,10 @@ def is_compound(self): # File convenience methods def make_filename(self, ext): - return "%s%s" % (self.segment_id(), ext) + return f"{self.segment_id()}{ext}" def list_files(self, storage): - prefix = "%s." % self.segment_id() + prefix = f"{self.segment_id()}." return [name for name in storage.list() if name.startswith(prefix)] def create_file(self, storage, ext, **kwargs): @@ -776,7 +785,7 @@ def has_column(self, fieldname): def column_reader(self, fieldname, column): if not self.has_column(fieldname): - raise ValueError("No column %r" % (fieldname,)) + raise ValueError(f"No column {fieldname!r}") default = column.default_value() colreaders = [] diff --git a/src/whoosh/codec/memory.py b/src/whoosh/codec/memory.py index 312befc0..de2f5cf5 100644 --- a/src/whoosh/codec/memory.py +++ b/src/whoosh/codec/memory.py @@ -25,11 +25,10 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-from __future__ import with_statement + from bisect import bisect_left from threading import Lock -from whoosh.compat import range from whoosh.codec import base from whoosh.matching import ListMatcher from whoosh.reading import SegmentReader, TermInfo, TermNotFound @@ -37,7 +36,7 @@ class MemWriter(SegmentWriter): - def commit(self): + def commit(self, mergetype=None, optimize=False, merge=True): self._finalize_segment() @@ -83,7 +82,7 @@ def _has_column(self, fieldname): return fieldname in self._colwriters def _create_column(self, fieldname, column): - colfile = self._storage.create_file("%s.c" % fieldname) + colfile = self._storage.create_file(f"{fieldname}.c") self._colwriters[fieldname] = (colfile, column.writer(colfile)) def _get_column(self, fieldname): @@ -145,11 +144,11 @@ def supports_columns(self): return True def has_column(self, fieldname): - filename = "%s.c" % fieldname + filename = f"{fieldname}.c" return self._storage.file_exists(filename) def column_reader(self, fieldname, column): - filename = "%s.c" % fieldname + filename = f"{fieldname}.c" colfile = self._storage.open_file(filename) length = self._storage.file_length(filename) return column.reader(colfile, 0, length, self._segment.doc_count_all()) @@ -189,6 +188,7 @@ def stored_fields(self, docnum): return self._segment._stored[docnum] def close(self): + # This method is intentionally left empty. pass @@ -202,7 +202,7 @@ def __init__(self, storage, segment): def start_field(self, fieldname, fieldobj): if self._fieldname is not None: - raise Exception("Called start_field in a field") + raise ValueError("Called start_field in a field") with self._segment._lock: invindex = self._segment._invindex @@ -214,7 +216,7 @@ def start_field(self, fieldname, fieldobj): def start_term(self, btext): if self._btext is not None: - raise Exception("Called start_term in a term") + raise ValueError("Called start_term in a term") fieldname = self._fieldname fielddict = self._segment._invindex[fieldname] @@ -236,7 +238,7 @@ def add(self, docnum, weight, vbytes, length): def finish_term(self): if self._btext is None: - raise Exception("Called finish_term outside a term") + raise ValueError("Called finish_term outside a term") self._postings = None self._btext = None @@ -244,7 +246,7 @@ def finish_term(self): def finish_field(self): if self._fieldname is None: - raise Exception("Called finish_field outside a field") + raise ValueError("Called finish_field outside a field") self._fieldname = None self._fieldobj = None @@ -268,7 +270,7 @@ def terms(self): def terms_from(self, fieldname, prefix): if fieldname not in self._invindex: - raise TermNotFound("Unknown field %r" % (fieldname,)) + raise TermNotFound(f"Unknown field {fieldname!r}") terms = sorted(self._invindex[fieldname]) if not terms: return @@ -288,6 +290,7 @@ def indexed_field_names(self): return self._invindex.keys() def close(self): + # This method is intentionally left empty.
pass @@ -317,7 +320,7 @@ def doc_count_all(self): def delete_document(self, docnum, delete=True): if not delete: - raise Exception("MemoryCodec can't undelete") + raise ValueError("MemoryCodec can't undelete") with self._lock: del self._stored[docnum] del self._lengths[docnum] diff --git a/src/whoosh/codec/plaintext.py b/src/whoosh/codec/plaintext.py index 6c547368..95b54b5d 100644 --- a/src/whoosh/codec/plaintext.py +++ b/src/whoosh/codec/plaintext.py @@ -27,9 +27,18 @@ from ast import literal_eval -from whoosh.compat import b, bytes_type, text_type, integer_types, PY3 -from whoosh.compat import iteritems, dumps, loads, range from whoosh.codec import base +from whoosh.compat import ( + PY3, + b, + bytes_type, + dumps, + integer_types, + iteritems, + loads, + range, + text_type, +) from whoosh.matching import ListMatcher from whoosh.reading import TermInfo, TermNotFound @@ -45,7 +54,7 @@ class memoryview: # Mixin classes for producing and consuming the simple text format -class LineWriter(object): +class LineWriter: def _print_line(self, indent, command, **kwargs): self._dbfile.write(b(" ") * indent) self._dbfile.write(command.encode("latin1")) @@ -54,11 +63,11 @@ def _print_line(self, indent, command, **kwargs): v = bytes(v) if v is not None and not isinstance(v, _reprable): raise TypeError(type(v)) - self._dbfile.write(("\t%s=%r" % (k, v)).encode("latin1")) + self._dbfile.write(f"\t{k}={v!r}".encode("latin1")) self._dbfile.write(b("\n")) -class LineReader(object): +class LineReader: def __init__(self, dbfile): self._dbfile = dbfile @@ -115,7 +124,7 @@ def _find_root(self, command): self._reset() c = self._find_line(0, command) if c is None: - raise Exception("No root section %r" % (command,)) + raise ValueError(f"No root section {command!r}") # Codec class @@ -163,6 +172,7 @@ def add_vector_items(self, fieldname, fieldobj, items): self._print_line(3, "VPOST", t=text, w=weight, v=vbytes) def finish_doc(self): + # This method is intentionally left empty. pass def close(self): @@ -212,8 +222,7 @@ def _iter_docs(self): def _iter_docfields(self, fieldname): for _ in self._iter_docs(): - for c in self._find_lines(2, "DOCFIELD", fn=fieldname): - yield c + yield from self._find_lines(2, "DOCFIELD", fn=fieldname) def _iter_lengths(self, fieldname): return (c.get("len", 0) for c in self._iter_docfields(fieldname)) @@ -232,14 +241,13 @@ def doc_field_length(self, docnum, fieldname, default=0): def _column_values(self, fieldname):
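+ # expects one COLVAL line per document, emitted in docnum order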
for i, docnum in enumerate(self._iter_docs()): if i != docnum: - raise Exception( - "Missing column value for field %r doc %d?" % (fieldname, i) - ) + raise ValueError(f"Missing column value for field {fieldname!r} doc {i}") c = self._find_line(2, "COLVAL", fn=fieldname) if c is None: - raise Exception( - "Missing column value for field %r doc %d?" % (fieldname, docnum) - ) + raise ValueError( + f"Missing column value for field {fieldname!r} doc {docnum}" + ) yield c.get("v") @@ -262,16 +269,15 @@ def max_field_length(self, fieldname): return max(self._iter_lengths(fieldname)) def has_vector(self, docnum, fieldname): - if self._find_doc(docnum): - if self._find_line(2, "VECTOR"): - return True + if self._find_doc(docnum) and self._find_line(2, "VECTOR"): + return True return False def vector(self, docnum, fieldname, format_): if not self._find_doc(docnum): - raise Exception + raise ValueError("Document not found.") if not self._find_line(2, "VECTOR"): - raise Exception + raise ValueError("Vector not found.") ids = [] weights = [] @@ -303,7 +309,7 @@ def _read_stored_fields(self): def stored_fields(self, docnum): if not self._find_doc(docnum): - raise Exception + raise ValueError("Document not found.") return self._read_stored_fields() def iter_docs(self): @@ -369,7 +375,7 @@ def __init__(self, storage, segment): def _find_field(self, fieldname): self._find_root("TERMS") if self._find_line(1, "TERMFIELD", fn=fieldname) is None: - raise TermNotFound("No field %r" % fieldname) + raise TermNotFound(f"No field {fieldname!r}") def _iter_fields(self): self._find_root() diff --git a/src/whoosh/codec/whoosh2.py b/src/whoosh/codec/whoosh2.py index ffcf3f20..c146aff2 100644 --- a/src/whoosh/codec/whoosh2.py +++ b/src/whoosh/codec/whoosh2.py @@ -25,7 +25,8 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -import struct, sys +import struct +import sys from array import array from binascii import crc32 from collections import defaultdict @@ -38,28 +39,48 @@ except ImportError: zlib = None -from whoosh.compat import b, PY3 -from whoosh.compat import loads, dumps -from whoosh.compat import range, iteritems -from whoosh.compat import bytes_type, text_type, string_type, integer_types -from whoosh.compat import array_frombytes, array_tobytes +from whoosh.automata.fst import GraphReader, GraphWriter from whoosh.codec import base +from whoosh.compat import ( + PY3, + array_frombytes, + array_tobytes, + b, + bytes_type, + dumps, + integer_types, + iteritems, + loads, + range, + string_type, + text_type, +) from whoosh.filedb.filestore import Storage -from whoosh.matching import ListMatcher, ReadTooFar, LeafMatcher +from whoosh.matching import LeafMatcher, ListMatcher, ReadTooFar from whoosh.reading import NoGraphError, TermInfo, TermNotFound -from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE, IS_LITTLE -from whoosh.system import emptybytes -from whoosh.system import pack_byte -from whoosh.system import pack_ushort, unpack_ushort, pack_long, unpack_long - -from whoosh.automata.fst import GraphWriter, GraphReader -from whoosh.util.numeric import byte_to_length, length_to_byte -from whoosh.util.numeric import to_sortable, from_sortable, NaN +from whoosh.system import ( + _FLOAT_SIZE, + _INT_SIZE, + _LONG_SIZE, + IS_LITTLE, + emptybytes, + pack_byte, + pack_long, + pack_ushort, + unpack_long, + unpack_ushort, +) +from whoosh.util.numeric import ( + NaN, + byte_to_length, + from_sortable, + length_to_byte, + to_sortable, +) from whoosh.util.numlists import GrowableArray -from whoosh.util.text import utf8encode, utf8decode +from whoosh.util.text import utf8decode, utf8encode from whoosh.util.times import datetime_to_long, long_to_datetime - # Old hash file implementations _4GB = 4 * 1024 * 1024 * 1024 @@ -68,16 +89,16 @@ def cdb_hash(key): h = 5381 for c in key: - h = (h
+ (h << 5)) & 0xffffffff ^ ord(c) + h = (h + (h << 5)) & 0xFFFFFFFF ^ ord(c) return h def md5_hash(key): - return int(md5(key).hexdigest(), 16) & 0xffffffff + return int(md5(key).hexdigest(), 16) & 0xFFFFFFFF def crc_hash(key): - return crc32(key) & 0xffffffff + return crc32(key) & 0xFFFFFFFF hash_functions = (hash, cdb_hash, md5_hash, crc_hash) @@ -100,7 +121,8 @@ def crc_hash(key): # Table classes -class HashWriter(object): + +class HashWriter: def __init__(self, dbfile, hashtype=2): self.dbfile = dbfile self.hashtype = hashtype @@ -195,7 +217,7 @@ def close(self): self.dbfile.close() -class HashReader(object): +class HashReader: def __init__(self, dbfile, startoffset=0): self.dbfile = dbfile self.startoffset = startoffset @@ -205,7 +227,7 @@ def __init__(self, dbfile, startoffset=0): # Check magic tag magic = dbfile.read(4) if magic != b("HASH"): - raise Exception("Unknown file header %r" % magic) + raise ValueError(f"Unknown file header {magic!r}") self.hashtype = dbfile.read_byte() # Hash function type self.hash_func = hash_functions[self.hashtype] @@ -234,7 +256,7 @@ def _read_extras(self): def close(self): if self.is_closed: - raise Exception("Tried to close %r twice" % self) + raise ValueError(f"Tried to close {self!r} twice") self.dbfile.close() self.is_closed = True @@ -314,7 +336,7 @@ def _key_at(self, pos): def ranges_for_key(self, key): read = self.read if not isinstance(key, bytes_type): - raise TypeError("Key %r should be bytes" % key) + raise TypeError(f"Key {key!r} should be bytes") keyhash = self.hash_func(key) hpos, hslots = self._hashtable_info(keyhash) if not hslots: @@ -351,8 +373,7 @@ def __init__(self, dbfile): def add(self, key, value): if key <= self.lastkey: - raise ValueError("Keys must increase: %r..%r" - % (self.lastkey, key)) + raise ValueError(f"Keys must increase: {self.lastkey!r}..{key!r}") self.index.append(self.dbfile.tell()) HashWriter.add(self, key, value) self.lastkey = key @@ -392,7 +413,7 @@ def __init__(self, dbfile): elif indextype == "q": self._ixpos = dbfile.get_long else: - raise Exception("Unknown index type %r" % indextype) + raise ValueError(f"Unknown index type {indextype!r}") def _closest_key(self, key): key_at = self._key_at @@ -402,7 +423,7 @@ lo = 0 hi = self.indexlen if not isinstance(key, bytes_type): - raise TypeError("Key %r should be bytes" % key) + raise TypeError(f"Key {key!r} should be bytes") while lo < hi: mid = (lo + hi) // 2 midkey = key_at(ixpos(indexbase + mid * ixsize)) if midkey < key: lo = mid + 1 else: hi = mid - #i = max(0, mid - 1) + # i = max(0, mid - 1) if lo == self.indexlen: return None return ixpos(indexbase + lo * ixsize) @@ -422,13 +443,12 @@ def closest_key(self, key): return self._key_at(pos) def _ranges_from(self, key): - #read = self.read + # read = self.read pos = self._closest_key(key) if pos is None: return - for x in self._ranges(pos=pos): - yield x + yield from self._ranges(pos=pos) def items_from(self, key): read = self.read @@ -443,6 +463,7 @@ def keys_from(self, key): # Standard codec top-level object + class W2Codec(base.Codec): TERMS_EXT = ".trm" # Term index POSTS_EXT = ".pst" # Term postings DAWG_EXT = ".dag" # Spelling graph file LENGTHS_EXT = ".dcl" # Field lengths file VECTOR_EXT = ".vec" # Vector index VPOSTS_EXT = ".vps" # Vector postings STORED_EXT = ".sto" # Stored fields file - def __init__(self, blocklimit=128, compression=3, loadlengths=False, - inlinelimit=1): + def __init__(self, blocklimit=128, compression=3, loadlengths=False, inlinelimit=1): self.blocklimit = blocklimit self.compression =
compression self.loadlengths = loadlengths @@ -461,14 +481,19 @@ def __init__(self, blocklimit=128, compression=3, loadlengths=False, # Per-document value writer def per_document_writer(self, storage, segment): - return W2PerDocWriter(storage, segment, blocklimit=self.blocklimit, - compression=self.compression) + return W2PerDocWriter( + storage, segment, blocklimit=self.blocklimit, compression=self.compression + ) # Inverted index writer def field_writer(self, storage, segment): - return W2FieldWriter(storage, segment, blocklimit=self.blocklimit, - compression=self.compression, - inlinelimit=self.inlinelimit) + return W2FieldWriter( + storage, + segment, + blocklimit=self.blocklimit, + compression=self.compression, + inlinelimit=self.inlinelimit, + ) # Readers @@ -483,7 +508,7 @@ def per_document_reader(self, storage, segment): def graph_reader(self, storage, segment): try: dawgfile = segment.open_file(storage, self.DAWG_EXT) - except: + except Exception: raise NoGraphError return GraphReader(dawgfile) @@ -495,6 +520,7 @@ def new_segment(self, storage, indexname): # Per-document value writer + class W2PerDocWriter(base.PerDocumentWriter): def __init__(self, storage, segment, blocklimit=128, compression=3): if not isinstance(blocklimit, int): @@ -519,8 +545,7 @@ def __init__(self, storage, segment, blocklimit=128, compression=3): def _make_vector_files(self): vifile = self.segment.create_file(self.storage, W2Codec.VECTOR_EXT) self.vindex = VectorWriter(vifile) - self.vpostfile = self.segment.create_file(self.storage, - W2Codec.VPOSTS_EXT) + self.vpostfile = self.segment.create_file(self.storage, W2Codec.VPOSTS_EXT) def start_doc(self, docnum): self.docnum = docnum @@ -594,9 +619,9 @@ def close(self): # Inverted index writer + class W2FieldWriter(base.FieldWriter): - def __init__(self, storage, segment, blocklimit=128, compression=3, - inlinelimit=1): + def __init__(self, storage, segment, blocklimit=128, compression=3, inlinelimit=1): assert isinstance(storage, Storage) assert isinstance(segment, base.Segment) assert isinstance(blocklimit, int) @@ -669,11 +694,12 @@ def start_field(self, fieldname, fieldobj): def start_term(self, text): if self.block is not None: - raise Exception("Called start_term in a block") + raise ValueError("Called start_term in a block") self.text = text self.terminfo = FileTermInfo() if self.spelling:
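+ # assumes the term bytes are UTF-8-encoded, as produced by text fields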
- self.dawg.insert(text.decode("utf-8")) # TODO: how to decode bytes? + self.dawg.insert(text.decode("utf-8")) self._start_blocklist() def add(self, docnum, weight, valuestring, length): @@ -689,7 +716,7 @@ def add_spell_word(self, fieldname, text): def finish_term(self): block = self.block if block is None: - raise Exception("Called finish_term when not in a block") + raise ValueError("Called finish_term when not in a block") terminfo = self.terminfo if self.blockcount < 1 and block and len(block) < self.inlinelimit: @@ -720,7 +747,7 @@ def finish_term(self): def finish_field(self): if not self._infield: - raise Exception("Called finish_field before start_field") + raise ValueError("Called finish_field before start_field") self._infield = False if self._dawgfield: @@ -737,9 +764,11 @@ def close(self): # Matcher + class W2LeafMatcher(LeafMatcher): - def __init__(self, postfile, startoffset, fmt, scorer=None, term=None, - stringids=False): + def __init__( + self, postfile, startoffset, fmt, scorer=None, term=None, stringids=False + ): self.postfile = postfile self.startoffset = startoffset self.format = fmt @@ -783,8 +812,7 @@ def all_ids(self): block = self._read_block(nextoffset) nextoffset = block.nextoffset ids = block.read_ids() - for id in ids: - yield id + yield from ids def next(self): if self.i == self.block.count - 1: @@ -841,8 +869,9 @@ def block_max_wol(self): def _read_block(self, offset): pf = self.postfile pf.seek(offset) - return self.blockclass.from_file(pf, self.format.posting_size, - stringids=self.stringids) + return self.blockclass.from_file( + pf, self.format.posting_size, stringids=self.stringids + ) def _consume_block(self): self.block.read_ids() self.i = 0 @@ -850,8 +879,8 @@ def _consume_block(self): def _next_block(self, consume=True): - if not (self.currentblock < self.blockcount): - raise Exception("No next block") + if self.currentblock >= self.blockcount: + raise ValueError("No next block") self.currentblock += 1 if self.currentblock == self.blockcount: @@ -883,6 +912,7 @@ def _skip_to_block(self, targetfn): # Writers + class TermIndexWriter(HashWriter): def __init__(self, dbfile): HashWriter.__init__(self, dbfile) @@ -941,6 +971,7 @@ def valuecoder(self, offset): # Readers + class PostingIndexBase(HashReader): def __init__(self, dbfile, postfile): HashReader.__init__(self, dbfile) self.postfile = postfile @@ -965,7 +996,7 @@ def _closest_key(self, key): lo = 0 hi = self.length if not isinstance(key, bytes_type): - raise TypeError("Key %r should be bytes" % key) + raise TypeError(f"Key {key!r} should be bytes") while lo < hi: mid = (lo + hi) // 2 midkey = key_at(dbfile.get_long(indexbase + mid * _LONG_SIZE)) if midkey < key: lo = mid + 1 else: hi = mid - #i = max(0, mid - 1) + # i = max(0, mid - 1) if lo == self.length: return None return dbfile.get_long(indexbase + lo * _LONG_SIZE) @@ -985,13 +1016,12 @@ def closest_key(self, key): return self._key_at(pos) def _ranges_from(self, key): - #read = self.read + # read = self.read pos = self._closest_key(key) if pos is None: return - for x in self._ranges(pos=pos): - yield x + yield from self._ranges(pos=pos) def __getitem__(self, key): k = self.keycoder(key) @@ -1068,7 +1098,7 @@ def matcher(self, fieldname, text, format_, scorer=None): try: terminfo = self[term] except KeyError: - raise TermNotFound("No term %s:%r" % (fieldname, text)) + raise TermNotFound(f"No term {fieldname}:{text!r}") p = terminfo.postings if isinstance(p, integer_types): @@ -1077,8 +1107,7 @@ else: # terminfo.postings is an inlined tuple
of (ids, weights, values) docids, weights, values = p - pr = ListMatcher(docids, weights, values, format_, scorer=scorer, - term=term) + pr = ListMatcher(docids, weights, values, format_, scorer=scorer, term=term) return pr def keycoder(self, key): @@ -1192,7 +1221,7 @@ def has_vector(self, docnum, fieldname): if self._vectors is None: try: self._prep_vectors() - except (NameError, IOError): + except (NameError, OSError): return False return (docnum, fieldname) in self._vectors @@ -1209,7 +1238,8 @@ def stored_fields(self, docnum): # Single-byte field lengths implementations + class ByteLengthsBase: magic = b("~LN1") def __init__(self): @@ -1229,7 +1259,7 @@ def _read_header(self, dbfile, doccount): fieldcount = dbfile.read_ushort() # Number of fields # Read per-field info for i in range(fieldcount): - fieldname = dbfile.read_string().decode('utf-8') + fieldname = dbfile.read_string().decode("utf-8") self.totals[fieldname] = dbfile.read_long() self.minlens[fieldname] = byte_to_length(dbfile.read_byte()) self.maxlens[fieldname] = byte_to_length(dbfile.read_byte()) @@ -1276,7 +1306,7 @@ def to_file(self, dbfile, doccount): # Write per-field info for fieldname in fieldnames: - dbfile.write_string(fieldname.encode('utf-8')) # Fieldname + dbfile.write_string(fieldname.encode("utf-8")) # Fieldname dbfile.write_long(self.field_length(fieldname)) dbfile.write_byte(length_to_byte(self.min_field_length(fieldname))) dbfile.write_byte(length_to_byte(self.max_field_length(fieldname))) @@ -1402,7 +1432,7 @@ def close(self): unpack_stored_pointer = _stored_pointer_struct.unpack -class StoredFieldWriter(object): +class StoredFieldWriter: def __init__(self, dbfile): self.dbfile = dbfile self.length = 0 @@ -1451,7 +1481,7 @@ def close(self): f.close() -class StoredFieldReader(object): +class StoredFieldReader: def __init__(self, dbfile): self.dbfile = dbfile @@ -1489,22 +1519,23 @@ def __iter__(self): dbfile.seek(self.basepos) for length in lengths: vlist = loads(dbfile.read(length) + b(".")) - vdict = dict((names[i], vlist[i]) for i in range(len(vlist)) - if vlist[i] is not None) + vdict = { + names[i]: vlist[i] for i in range(len(vlist)) if vlist[i] is not None + } yield vdict def __getitem__(self, num): if num > self.length - 1: - raise IndexError("Tried to get document %s, file has %s" - % (num, self.length)) + raise IndexError(f"Tried to get document {num}, file has {self.length}") dbfile = self.dbfile start = self.directory_offset + num * stored_pointer_size dbfile.seek(start) ptr = dbfile.read(stored_pointer_size) if len(ptr) != stored_pointer_size: - raise Exception("Error reading %r @%s %s < %s" - % (dbfile, start, len(ptr), stored_pointer_size)) + raise ValueError( + f"Error reading {dbfile!r} @{start} {len(ptr)} < {stored_pointer_size}" + ) position, length = unpack_stored_pointer(ptr) dbfile.seek(position) vlist = loads(dbfile.read(length) + b(".")) @@ -1513,13 +1544,13 @@ def __getitem__(self, num): # Recreate a dictionary by putting the field names and values back # together by position. We can't just use dict(zip(...)) because we # want to filter out the None values.
- vdict = dict((names[i], vlist[i]) for i in range(len(vlist)) - if vlist[i] is not None) + vdict = {names[i]: vlist[i] for i in range(len(vlist)) if vlist[i] is not None} return vdict # Segment object + class W2Segment(base.Segment): def __init__(self, indexname, doccount=0, segid=None, deleted=None): """ @@ -1581,11 +1612,21 @@ def deleted_docs(self): # Posting blocks -class W2Block(object): + +class W2Block: magic = b("Blk3") - infokeys = ("count", "maxid", "maxweight", "minlength", "maxlength", - "idcode", "compression", "idslen", "weightslen") + infokeys = ( + "count", + "maxid", + "maxweight", + "minlength", + "maxlength", + "idcode", + "compression", + "idslen", + "weightslen", + ) def __init__(self, postingsize, stringids=False): self.postingsize = postingsize @@ -1646,14 +1687,23 @@ def to_file(self, postfile, compression=3): wtstring = minimize_weights(self.weights, compression) vstring = minimize_values(self.postingsize, self.values, compression) - info = (len(ids), ids[-1], self.maxweight, - length_to_byte(self.minlength), length_to_byte(self.maxlength), - idcode, compression, len(idstring), len(wtstring)) + info = ( + len(ids), + ids[-1], + self.maxweight, + length_to_byte(self.minlength), + length_to_byte(self.maxlength), + idcode, + compression, + len(idstring), + len(wtstring), + ) infostring = dumps(info, -1) # Offset to next block - postfile.write_uint(len(infostring) + len(idstring) + len(wtstring) - + len(vstring)) + postfile.write_uint( + len(infostring) + len(idstring) + len(wtstring) + len(vstring) + ) # Block contents postfile.write(infostring) postfile.write(idstring) @@ -1681,8 +1731,7 @@ def read_ids(self): offset = self.dataoffset self.postfile.seek(offset) idstring = self.postfile.read(self.idslen) - ids = deminimize_ids(self.idcode, self.count, idstring, - self.compression) + ids = deminimize_ids(self.idcode, self.count, idstring, self.compression) self.ids = ids return ids @@ -1693,8 +1742,7 @@ def read_weights(self): offset = self.dataoffset + self.idslen self.postfile.seek(offset) wtstring = self.postfile.read(self.weightslen) - weights = deminimize_weights(self.count, wtstring, - self.compression) + weights = deminimize_weights(self.count, wtstring, self.compression) self.weights = weights return weights @@ -1706,15 +1754,16 @@ def read_values(self): offset = self.dataoffset + self.idslen + self.weightslen self.postfile.seek(offset) vstring = self.postfile.read(self.nextoffset - offset) - values = deminimize_values(postingsize, self.count, vstring, - self.compression) + values = deminimize_values( + postingsize, self.count, vstring, self.compression + ) self.values = values return values # File TermInfo -NO_ID = 0xffffffff +NO_ID = 0xFFFFFFFF class FileTermInfo(TermInfo): @@ -1756,8 +1805,9 @@ def to_string(self): xid = NO_ID if self._maxid is None else self._maxid # Pack the term info into bytes - st = self.struct.pack(self._weight, self._df, ml, xl, self._maxweight, - 0, mid, xid) + st = self.struct.pack( + self._weight, self._df, ml, xl, self._maxweight, 0, mid, xid + ) if isinstance(self.postings, tuple): # Postings are inlined - dump them using the pickle protocol @@ -1788,11 +1838,11 @@ def from_string(cls, s): if hbyte < 2: st = cls.struct # Weight, Doc freq, min len, max len, max w, unused, min ID, max ID - w, df, ml, xl, xw, _, mid, xid = st.unpack(s[1:st.size + 1]) + w, df, ml, xl, xw, _, mid, xid = st.unpack(s[1 : st.size + 1]) mid = None if mid == NO_ID else mid xid = None if xid == NO_ID else xid # Postings - pstr = s[st.size + 1:] + pstr = 
s[st.size + 1 :] if hbyte == 0: p = unpack_long(pstr)[0] else: @@ -1844,11 +1894,12 @@ def read_max_weight(cls, dbfile, datapos): # Utility functions + def minimize_ids(arry, stringids, compression=0): amax = arry[-1] if stringids: - typecode = '' + typecode = "" string = dumps(arry) else: typecode = arry.typecode @@ -1870,7 +1921,7 @@ def minimize_ids(arry, stringids, compression=0): def deminimize_ids(typecode, count, string, compression=0): if compression: string = zlib.decompress(string) - if typecode == '': + if typecode == "": return loads(string) else: arry = array(typecode) @@ -1908,9 +1959,9 @@ def minimize_values(postingsize, values, compression=0): if postingsize < 0: string = dumps(values, -1)[2:] elif postingsize == 0: - string = b('') + string = b("") else: - string = b('').join(values) + string = b("").join(values) if string and compression: string = zlib.compress(string, compression) return string @@ -1925,8 +1976,7 @@ def deminimize_values(postingsize, count, string, compression=0): elif postingsize == 0: return [None] * count else: - return [string[i:i + postingsize] for i - in range(0, len(string), postingsize)] + return [string[i : i + postingsize] for i in range(0, len(string), postingsize)] # Legacy field types @@ -1936,14 +1986,29 @@ def deminimize_values(postingsize, count, string, compression=0): class OLD_NUMERIC(NUMERIC): - NUMERIC_DEFAULTS = {"b": 2 ** 7 - 1, "B": 2 ** 8 - 1, "h": 2 ** 15 - 1, - "H": 2 ** 16 - 1, "i": 2 ** 31 - 1, "I": 2 ** 32 - 1, - "q": 2 ** 63 - 1, "Q": 2 ** 64 - 1, "f": NaN, - "d": NaN, - } - - def __init__(self, type=int, stored=False, unique=False, field_boost=1.0, - decimal_places=0, shift_step=4, signed=True): + NUMERIC_DEFAULTS = { + "b": 2**7 - 1, + "B": 2**8 - 1, + "h": 2**15 - 1, + "H": 2**16 - 1, + "i": 2**31 - 1, + "I": 2**32 - 1, + "q": 2**63 - 1, + "Q": 2**64 - 1, + "f": NaN, + "d": NaN, + } + + def __init__( + self, + type=int, + stored=False, + unique=False, + field_boost=1.0, + decimal_places=0, + shift_step=4, + signed=True, + ): from whoosh import analysis, formats self.type = type @@ -1961,11 +2026,12 @@ def __init__(self, type=int, stored=False, unique=False, field_boost=1.0, self._from_text = self._text_to_float self.sortable_typecode = "f" elif self.type is Decimal: - raise TypeError("To store Decimal instances, set type to int or " - "float and use the decimal_places argument") + raise TypeError( + "To store Decimal instances, set type to int or " + "float and use the decimal_places argument" + ) else: - raise TypeError("%s field type can't store %r" % (self.__class__, - self.type)) + raise TypeError(f"{self.__class__} field type can't store {self.type!r}") self.stored = stored self.unique = unique @@ -1986,7 +2052,7 @@ def prepare_number(self, x): return x if self.decimal_places: x = Decimal(x) - x *= 10 ** self.decimal_places + x *= 10**self.decimal_places x = self.type(x) return x @@ -2033,14 +2099,14 @@ def parse_query(self, fieldname, qstring, boost=1.0): try: text = self.to_text(qstring) - except Exception: + except (ValueError, TypeError): e = sys.exc_info()[1] return query.error_query(e) return query.Term(fieldname, text, boost=boost) - def parse_range(self, fieldname, start, end, startexcl, endexcl, - boost=1.0): + def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0):
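+ # round-trips each endpoint through to_text()/from_text() to normalize it; conversion errors surface as QueryParserError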
self.from_text(self.to_text(end)) - except Exception: + except ValueError: e = sys.exc_info()[1] raise QueryParserError(e) - return query.NumericRange(fieldname, start, end, startexcl, endexcl, - boost=boost) + return query.NumericRange( + fieldname, start, end, startexcl, endexcl, boost=boost + ) def sortable_terms(self, ixreader, fieldname): for btext in ixreader.lexicon(fieldname): @@ -2066,11 +2132,13 @@ class OLD_DATETIME(OLD_NUMERIC): def __init__(self, stored=False, unique=False): - OLD_NUMERIC.__init__(self, type=long_type, stored=stored, - unique=unique, shift_step=8) + OLD_NUMERIC.__init__( + self, type=long_type, stored=stored, unique=unique, shift_step=8 + ) def to_text(self, x, shift=0): from datetime import datetime + from whoosh.util.times import floor try: @@ -2082,8 +2150,8 @@ def to_text(self, x, shift=0): x = datetime_to_long(x) elif not isinstance(x, integer_types): raise TypeError() - except Exception: - raise ValueError("DATETIME.to_text can't convert from %r" % (x,)) + except (TypeError, ValueError): + raise ValueError(f"DATETIME.to_text can't convert from {x!r}") x = OLD_NUMERIC.to_text(self, x, shift=shift) return x @@ -2114,10 +2182,9 @@ def _parse_datestring(self, qstring): if len(qstring) == 20: microsecond = int(qstring[14:]) - at = fix(adatetime(year, month, day, hour, minute, second, - microsecond)) + at = fix(adatetime(year, month, day, hour, minute, second, microsecond)) if is_void(at): - raise Exception("%r is not a parseable date" % qstring) + raise Exception(f"{qstring!r} is not a parseable date") return at def parse_query(self, fieldname, qstring, boost=1.0): @@ -2137,8 +2204,7 @@ def parse_query(self, fieldname, qstring, boost=1.0): else: return query.Term(fieldname, self.to_text(at), boost=boost) - def parse_range(self, fieldname, start, end, startexcl, endexcl, - boost=1.0): + def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): from whoosh import query if start is None and end is None: @@ -2157,6 +2223,7 @@ def parse_range(self, fieldname, start, end, startexcl, endexcl, # Functions for converting numbers to and from text + def int_to_text(x, shift=0, signed=True): x = to_sortable(int, 32, signed, x) return sortable_int_to_text(x, shift) @@ -2192,13 +2259,13 @@ def text_to_float(text, signed=True): # Functions for converting sortable representations to and from text.
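# A minimal sketch of the "sortable text" idea behind int_to_text and the
# sortable_*_to_text helpers in this section: a fixed-width, zero-padded
# encoding makes lexicographic order agree with numeric order, so number
# terms can be range-scanned. The commented-out "%08x" lines nearby show
# the old hex scheme; to_base85 is just a denser alphabet for the same
# trick. hex_sortable below is an illustrative stand-in, not a Whoosh
# function.
def hex_sortable(x, shift=0):
    if shift:
        x >>= shift
    # chr(shift) records how far the value was right-shifted
    return chr(shift) + "%08x" % x

assert hex_sortable(3) < hex_sortable(10) < hex_sortable(255)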
-from whoosh.support.base85 import to_base85, from_base85 +from whoosh.support.base85 import from_base85, to_base85 def sortable_int_to_text(x, shift=0): if shift: x >>= shift - #text = chr(shift) + u"%08x" % x + # text = chr(shift) + u"%08x" % x text = chr(shift) + to_base85(x, False) return text @@ -2206,19 +2273,19 @@ def sortable_int_to_text(x, shift=0): def sortable_long_to_text(x, shift=0): if shift: x >>= shift - #text = chr(shift) + u"%016x" % x - #assert len(text) == 17 + # text = chr(shift) + u"%016x" % x + # assert len(text) == 17 text = chr(shift) + to_base85(x, True) return text def text_to_sortable_int(text): - #assert len(text) == 9 - #return int(text[1:], 16) + # assert len(text) == 9 + # return int(text[1:], 16) return from_base85(text[1:]) def text_to_sortable_long(text): - #assert len(text) == 17 - #return long(text[1:], 16) + # assert len(text) == 17 + # return long(text[1:], 16) return from_base85(text[1:]) diff --git a/src/whoosh/codec/whoosh3.py b/src/whoosh/codec/whoosh3.py index ca846af3..16107445 100644 --- a/src/whoosh/codec/whoosh3.py +++ b/src/whoosh/codec/whoosh3.py @@ -34,18 +34,35 @@ from collections import defaultdict from whoosh import columns, formats -from whoosh.compat import b, bytes_type, string_type, integer_types -from whoosh.compat import dumps, loads, iteritems, range from whoosh.codec import base +from whoosh.compat import ( + b, + bytes_type, + dumps, + integer_types, + iteritems, + loads, + range, + string_type, +) from whoosh.filedb import compound, filetables -from whoosh.matching import ListMatcher, ReadTooFar, LeafMatcher +from whoosh.matching import LeafMatcher, ListMatcher, ReadTooFar from whoosh.reading import TermInfo, TermNotFound -from whoosh.system import emptybytes -from whoosh.system import _SHORT_SIZE, _INT_SIZE, _LONG_SIZE, _FLOAT_SIZE -from whoosh.system import pack_ushort, unpack_ushort -from whoosh.system import pack_int, unpack_int, pack_long, unpack_long -from whoosh.util.numlists import delta_encode, delta_decode -from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.system import ( + _FLOAT_SIZE, + _INT_SIZE, + _LONG_SIZE, + _SHORT_SIZE, + emptybytes, + pack_int, + pack_long, + pack_ushort, + unpack_int, + unpack_long, + unpack_ushort, +) +from whoosh.util.numeric import byte_to_length, length_to_byte +from whoosh.util.numlists import delta_decode, delta_encode try: import zlib @@ -155,11 +172,11 @@ def new_segment(self, storage, indexname): def _vecfield(fieldname): - return "_%s_vec" % fieldname + return f"_{fieldname}_vec" def _lenfield(fieldname): - return "_%s_len" % fieldname + return f"_{fieldname}_len" # Per-doc information writer @@ -171,7 +188,7 @@ def __init__(self, codec, storage, segment): self._storage = storage self._segment = segment - tempst = storage.temp_storage("%s.tmp" % segment.indexname) + tempst = storage.temp_storage(f"{segment.indexname}.tmp") self._cols = compound.CompoundWriter(tempst) self._colwriters = {} self._create_column("_stored", STORED_COLUMN) @@ -196,7 +213,7 @@ def _has_column(self, fieldname): def _create_column(self, fieldname, column): writers = self._colwriters if fieldname in writers: - raise Exception("Already added column %r" % fieldname) + raise Exception(f"Already added column {fieldname!r}") f = self._cols.create_file(fieldname) writers[fieldname] = column.writer(f) @@ -215,7 +232,7 @@ def start_doc(self, docnum): raise Exception("Called start_doc when already in a doc") if docnum != self._doccount: raise Exception( - "Called start_doc(%r) was expecting 
%r" % (docnum, self._doccount) + f"Called start_doc({docnum!r}) was expecting {self._doccount!r}" ) self._docnum = docnum @@ -498,7 +515,7 @@ def vector(self, docnum, fieldname, format_): self._prep_vectors() offset, length = self._vector_extent(docnum, fieldname) if not offset: - raise Exception("Field %r has no vector in docnum %s" % (fieldname, docnum)) + raise Exception(f"Field {fieldname!r} has no vector in docnum {docnum}") m = W3LeafMatcher(self._vpostfile, offset, length, format_, byteids=True) return m @@ -583,7 +600,7 @@ def __init__(self, codec, dbfile, length, postfile): self._fieldunmap[num] = fieldname def _keycoder(self, fieldname, tbytes): - assert isinstance(tbytes, bytes_type), "tbytes=%r" % tbytes + assert isinstance(tbytes, bytes_type), f"tbytes={tbytes!r}" fnum = self._fieldmap.get(fieldname, 65535) return pack_ushort(fnum) + tbytes @@ -639,7 +656,7 @@ def term_info(self, fieldname, tbytes): try: return W3TermInfo.from_bytes(self._tindex[key]) except KeyError: - raise TermNotFound("No term %s:%r" % (fieldname, tbytes)) + raise TermNotFound(f"No term {fieldname}:{tbytes!r}") def frequency(self, fieldname, tbytes): datapos = self._range_for_key(fieldname, tbytes)[0] @@ -713,11 +730,11 @@ def add_posting(self, id_, weight, vbytes, length=None): # Check types if self._byteids: - assert isinstance(id_, string_type), "id_=%r" % id_ + assert isinstance(id_, string_type), f"id_={id_!r}" else: - assert isinstance(id_, integer_types), "id_=%r" % id_ - assert isinstance(weight, (int, float)), "weight=%r" % weight - assert isinstance(vbytes, bytes_type), "vbytes=%r" % vbytes + assert isinstance(id_, integer_types), f"id_={id_!r}" + assert isinstance(weight, (int, float)), f"weight={weight!r}" + assert isinstance(vbytes, bytes_type), f"vbytes={vbytes!r}" assert length is None or isinstance(length, integer_types) self._ids.append(id_) @@ -930,7 +947,7 @@ def _read_header(self): postfile.seek(self._startoffset) magic = postfile.read(4) if magic != WHOOSH3_HEADER_MAGIC: - raise Exception("Block tag error %r" % magic) + raise Exception(f"Block tag error {magic!r}") # Remember the base offset (start of postings, after the header) self._baseoffset = postfile.tell() diff --git a/src/whoosh/collectors.py b/src/whoosh/collectors.py index 50b71c1e..4d1d1ba1 100644 --- a/src/whoosh/collectors.py +++ b/src/whoosh/collectors.py @@ -83,11 +83,10 @@ def collect(self, sub_docnum): from heapq import heapify, heappush, heapreplace from whoosh import sorting -from whoosh.compat import abstractmethod, iteritems, itervalues, range +from whoosh.compat import abstractmethod, iteritems, itervalues from whoosh.searching import Results, TimeLimit from whoosh.util import now - # Functions @@ -101,7 +100,7 @@ def ilen(iterator): # Base class -class Collector(object): +class Collector: """Base class for collectors.""" def prepare(self, top_searcher, q, context): diff --git a/src/whoosh/columns.py b/src/whoosh/columns.py index a711ef72..e51392c8 100644 --- a/src/whoosh/columns.py +++ b/src/whoosh/columns.py @@ -46,8 +46,9 @@ and ``reader()`` to return a ``ColumnReader`` object. 
""" -from __future__ import division, with_statement -import struct, warnings + +import struct +import warnings from array import array from bisect import bisect_right @@ -56,21 +57,18 @@ except ImportError: zlib = None -from whoosh.compat import b, bytes_type, BytesIO -from whoosh.compat import array_tobytes, range -from whoosh.compat import dumps, loads +from whoosh.compat import BytesIO, array_tobytes, b, bytes_type, dumps, loads from whoosh.filedb.structfile import StructFile from whoosh.idsets import BitSet, OnDiskBitSet from whoosh.system import emptybytes from whoosh.util.numeric import typecode_max, typecode_min from whoosh.util.numlists import GrowableArray -from whoosh.util.varints import varint, read_varint - +from whoosh.util.varints import read_varint, varint # Base classes -class Column(object): +class Column: """Represents a "column" of rows mapping docnums to document values. The interface requires that you store the start offset of the column, the @@ -105,6 +103,7 @@ def reader(self, dbfile, basepos, length, doccount): def default_value(self, reverse=False): """Returns the default value for this column type.""" + _ = reverse # unused variable return self._default @@ -116,7 +115,7 @@ def stores_lists(self): return False -class ColumnWriter(object): +class ColumnWriter: def __init__(self, dbfile): self._dbfile = dbfile self._count = 0 @@ -132,10 +131,11 @@ def add(self, docnum, value): raise NotImplementedError def finish(self, docnum): + # This method is intentionally left empty. pass -class ColumnReader(object): +class ColumnReader: def __init__(self, dbfile, basepos, length, doccount): self._dbfile = dbfile self._basepos = basepos @@ -241,7 +241,7 @@ def finish(self, doccount): # ...but if we wrote offsets, make the last byte "X" so we know if write_offsets: dbfile.write(offsets.typecode.encode("ascii")) - dbfile.write("X".encode("ascii")) + dbfile.write(b"X") class Reader(ColumnReader): def __init__(self, dbfile, basepos, length, doccount): @@ -487,7 +487,7 @@ def add(self, docnum, v): else: if ref > 65535: warnings.warn( - "RefBytesColumn dropped unique value %r" % v, UserWarning + f"RefBytesColumn dropped unique value {v!r}", UserWarning ) ref = 0 dbfile.write_ushort(ref) @@ -906,12 +906,17 @@ def __repr__(self): return "" def _find_block(self, docnum): - # TODO: use binary search instead of linear - for i, b in enumerate(self._blocks): - if docnum < b[0]: - return None - elif docnum <= b[1]: - return i + # Use binary search instead of linear search + left = 0 + right = len(self._blocks) - 1 + while left <= right: + mid = (left + right) // 2 + if docnum < self._blocks[mid][0]: + right = mid - 1 + elif docnum <= self._blocks[mid][1]: + return mid + else: + left = mid + 1 return None def _get_block(self, blocknum): @@ -1065,8 +1070,7 @@ def __getitem__(self, docnum): def __iter__(self): for r in self._readers: - for v in r: - yield v + yield from r class TranslatingColumnReader(ColumnReader): diff --git a/src/whoosh/compat.py b/src/whoosh/compat.py index c2b602eb..9bd790c7 100644 --- a/src/whoosh/compat.py +++ b/src/whoosh/compat.py @@ -76,7 +76,7 @@ def b(s): import io BytesIO = io.BytesIO - callable = lambda o: isinstance(o, collections.Callable) + callable = lambda o: isinstance(o, collections.abc.Callable) exec_ = eval("exec") integer_types = (int,) iteritems = lambda o: o.items() @@ -197,8 +197,7 @@ def sentinel(counter=([fillvalue] * (len(args) - 1)).pop): fillers = repeat(fillvalue) iters = [chain(it, sentinel(), fillers) for it in args] try: - for tup in 
izip(*iters): - yield tup + yield from izip(*iters) except IndexError: pass diff --git a/src/whoosh/externalsort.py b/src/whoosh/externalsort.py index 2dc299a7..46fd39b8 100644 --- a/src/whoosh/externalsort.py +++ b/src/whoosh/externalsort.py @@ -29,14 +29,13 @@ This module implements a general external merge sort for Python objects. """ -from __future__ import with_statement -import os, tempfile +import os +import tempfile from heapq import heapify, heappop, heapreplace from whoosh.compat import dump, load - ## Python 3.2 had a bug that make marshal.load unusable # if (hasattr(platform, "python_implementation") # and platform.python_implementation() == "CPython" @@ -84,7 +83,7 @@ def imerge(iterables): return -class SortingPool(object): +class SortingPool: """This object implements a general K-way external merge sort for Python objects. @@ -113,7 +112,7 @@ def __init__(self, maxsize=1000000, tempdir=None, prefix="", suffix=".run"): self.tempdir = tempdir if maxsize < 1: - raise ValueError("maxsize=%s must be >= 1" % maxsize) + raise ValueError(f"maxsize={maxsize} must be >= 1") self.maxsize = maxsize self.prefix = prefix self.suffix = suffix @@ -148,8 +147,7 @@ def _read_run(self, path): def _merge_runs(self, paths): iters = [self._read_run(path) for path in paths] - for item in imerge(iters): - yield item + yield from imerge(iters) def add(self, item): """Adds `item` to the pool to be sorted.""" @@ -186,9 +184,9 @@ def reduce_to(self, target, k): # Reduce the number of runs to "target" by merging "k" runs at a time if k < 2: - raise ValueError("k=%s must be > 2" % k) + raise ValueError(f"k={k} must be >= 2") if target < 1: - raise ValueError("target=%s must be >= 1" % target) + raise ValueError(f"target={target} must be >= 1") runs = self.runs while len(runs) > target: newpath, f = self._new_run() @@ -207,7 +205,7 @@ def items(self, maxfiles=128): """ if maxfiles < 2: - raise ValueError("maxfiles=%s must be >= 2" % maxfiles) + raise ValueError(f"maxfiles={maxfiles} must be >= 2") if not self.runs: # We never wrote a run to disk, so just sort the queue in memory diff --git a/src/whoosh/fields.py b/src/whoosh/fields.py index 2ffa9ea7..ef585192 100644 --- a/src/whoosh/fields.py +++ b/src/whoosh/fields.py @@ -29,22 +29,21 @@ Contains functions and classes related to fields. """ -import datetime, fnmatch, re, struct, sys +import datetime +import fnmatch +import re +import struct +import sys from array import array from decimal import Decimal from whoosh import analysis, columns, formats -from whoosh.compat import with_metaclass -from whoosh.compat import itervalues -from whoosh.compat import bytes_type, string_type, text_type -from whoosh.system import emptybytes -from whoosh.system import pack_byte -from whoosh.util.numeric import to_sortable, from_sortable -from whoosh.util.numeric import typecode_max, NaN -from whoosh.util.text import utf8encode, utf8decode +from whoosh.compat import bytes_type, itervalues, string_type, text_type, with_metaclass +from whoosh.system import emptybytes, pack_byte +from whoosh.util.numeric import NaN, from_sortable, to_sortable, typecode_max +from whoosh.util.text import utf8decode, utf8encode from whoosh.util.times import datetime_to_long, long_to_datetime - # Exceptions @@ -59,7 +58,7 @@ class UnknownFieldError(Exception): # Field Types -class FieldType(object): +class FieldType: """ Represents a field configuration.
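# The SortingPool above performs a K-way external merge: each run file holds
# one sorted batch, and imerge() heap-merges the run iterators. A compact
# sketch of that shape, using in-memory lists as stand-ins for run files
# (stdlib heapq.merge does the same heap-based merging as imerge):
import heapq

runs = [[1, 4, 9], [2, 3, 10], [5, 6, 7]]  # pretend each list is a sorted run
merged = list(heapq.merge(*runs))
assert merged == sorted(sum(runs, []))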
@@ -134,7 +133,7 @@ def __init__( self.vector = None def __repr__(self): - return "%s(format=%r, scorable=%s, stored=%s, unique=%s)" % ( + return "{}(format={!r}, scorable={}, stored={}, unique={})".format( self.__class__.__name__, self.format, self.scorable, @@ -173,7 +172,7 @@ def index(self, value, **kwargs): % (self.__class__.__name__, self) ) if not isinstance(value, (text_type, list, tuple)): - raise ValueError("%r is not unicode or sequence" % value) + raise ValueError(f"{value!r} is not unicode or sequence") assert isinstance(self.format, formats.Format) if "mode" not in kwargs: @@ -192,7 +191,7 @@ def tokenize(self, value, **kwargs): """ if not self.analyzer: - raise Exception("%s field has no analyzer" % self.__class__) + raise Exception(f"{self.__class__} field has no analyzer") return self.analyzer(value, **kwargs) def process_text(self, qstring, mode="", **kwargs): @@ -205,7 +204,7 @@ def process_text(self, qstring, mode="", **kwargs): """ if not self.format: - raise Exception("%s field has no format" % self) + raise Exception(f"{self} field has no format") return (t.text for t in self.tokenize(qstring, mode=mode, **kwargs)) # Conversion @@ -604,7 +603,7 @@ def __init__( "decimal_places argument" ) elif numtype not in (int, float): - raise TypeError("Can't use %r as a type, use int or float" % numtype) + raise TypeError(f"Can't use {numtype!r} as a type, use int or float") # Sanity check if numtype is float and decimal_places: raise Exception( @@ -619,7 +618,7 @@ def __init__( bits = 64 # Floats are converted to 64 bit ints else: if bits not in intsizes: - raise Exception("Invalid bits %r, use 8, 16, 32, or 64" % bits) + raise Exception(f"Invalid bits {bits!r}, use 8, 16, 32, or 64") # Type code for the *sortable* representation self.sortable_typecode = intcodes[intsizes.index(bits)] self._struct = struct.Struct(">" + str(self.sortable_typecode)) @@ -644,7 +643,7 @@ def __init__( default = NaN elif not self.is_valid(default): raise Exception( - "The default %r is not a valid number for this " "field" % default + f"The default {default!r} is not a valid number for this field" ) self.default = default @@ -690,8 +689,7 @@ def index(self, num, **kwargs): # If the user gave us a list of numbers, recurse on the list if isinstance(num, (list, tuple)): for n in num: - for item in self.index(n): - yield item + yield from self.index(n) return # word, freq, weight, valuestring @@ -717,7 +715,7 @@ def prepare_number(self, x): try: x = self.numtype(x) except OverflowError: - raise ValueError("Value %r overflowed number type %r" % (x, self.numtype)) + raise ValueError(f"Value {x!r} overflowed number type {self.numtype!r}") if x < self.min_value or x > self.max_value: raise ValueError( @@ -782,7 +780,7 @@ def parse_query(self, fieldname, qstring, boost=1.0): return query.Every(fieldname, boost=boost) if not self.is_valid(qstring): - raise QueryParserError("%r is not a valid number" % qstring) + raise QueryParserError(f"{qstring!r} is not a valid number") token = self.to_bytes(qstring) return query.Term(fieldname, token, boost=boost) @@ -793,11 +791,11 @@ def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): if start is not None: if not self.is_valid(start): - raise QueryParserError("Range start %r is not a valid number" % start) + raise QueryParserError(f"Range start {start!r} is not a valid number") start = self.prepare_number(start) if end is not None: if not self.is_valid(end): - raise QueryParserError("Range end %r is not a valid number" % end) + raise 
QueryParserError(f"Range end {end!r} is not a valid number") end = self.prepare_number(end) return query.NumericRange( fieldname, start, end, startexcl, endexcl, boost=boost @@ -838,7 +836,7 @@ def __init__(self, stored=False, unique=False, sortable=False): :param unique: Whether the value of this field is unique per-document. """ - super(DATETIME, self).__init__( + super().__init__( int, 64, stored=stored, unique=unique, shift_step=8, sortable=sortable ) @@ -856,11 +854,11 @@ def prepare_datetime(self, x): elif isinstance(x, bytes_type): return x else: - raise Exception("%r is not a datetime" % (x,)) + raise Exception(f"{x!r} is not a datetime") def to_column_value(self, x): if isinstance(x, bytes_type): - raise Exception("%r is not a datetime" % (x,)) + raise Exception(f"{x!r} is not a datetime") if isinstance(x, (list, tuple)): x = x[0] return self.prepare_datetime(x) @@ -900,7 +898,7 @@ def _parse_datestring(self, qstring): at = fix(adatetime(year, month, day, hour, minute, second, microsecond)) if is_void(at): - raise Exception("%r is not a parseable date" % qstring) + raise Exception(f"{qstring!r} is not a parseable date") return at def parse_query(self, fieldname, qstring, boost=1.0): @@ -1031,7 +1029,7 @@ def __init__(self, columnobj=None): if columnobj is None: columnobj = columns.VarBytesColumn() if not isinstance(columnobj, columns.Column): - raise TypeError("%r is not a column object" % (columnobj,)) + raise TypeError(f"{columnobj!r} is not a column object") self.column_type = columnobj def to_bytes(self, v): @@ -1357,7 +1355,7 @@ def subfields(self): class MetaSchema(type): def __new__(cls, name, bases, attrs): - super_new = super(MetaSchema, cls).__new__ + super_new = super().__new__ if not any(b for b in bases if isinstance(b, MetaSchema)): # If this isn't a subclass of MetaSchema, don't do anything special return super_new(cls, name, bases, attrs) @@ -1381,7 +1379,7 @@ def schema(self): return Schema(**self._clsfields) -class Schema(object): +class Schema: """ Represents the collection of fields in an index. Maps field names to FieldType objects which define the behavior of each field. 
@@ -1429,7 +1427,7 @@ def __ne__(self, other): return not (self.__eq__(other)) def __repr__(self): - return "<%s: %r>" % (self.__class__.__name__, self.names()) + return f"<{self.__class__.__name__}: {self.names()!r}>" def __iter__(self): """ @@ -1452,7 +1450,7 @@ def __getitem__(self, name): if expr.match(name): return fieldtype - raise KeyError("No field named %r" % (name,)) + raise KeyError(f"No field named {name!r}") def __len__(self): """ @@ -1534,11 +1532,11 @@ def add(self, name, fieldtype, glob=False): except: e = sys.exc_info()[1] raise FieldConfigurationError( - "Error: %s instantiating field " "%r: %r" % (e, name, fieldtype) + f"Error: {e} instantiating field {name!r}: {fieldtype!r}" ) if not isinstance(fieldtype, FieldType): - raise FieldConfigurationError("%r is not a FieldType object" % fieldtype) + raise FieldConfigurationError(f"{fieldtype!r} is not a FieldType object") self._subfields[name] = sublist = [] for prefix, subfield in fieldtype.subfields(): @@ -1551,7 +1549,7 @@ def add(self, name, fieldtype, glob=False): elif " " in fname: raise FieldConfigurationError("Names cannot contain spaces") elif fname in self._fields or (glob and fname in self._dyn_fields): - raise FieldConfigurationError("%r already in schema" % fname) + raise FieldConfigurationError(f"{fname!r} already in schema") # Add the field if glob: @@ -1576,7 +1574,7 @@ def remove(self, fieldname): del self._dyn_fields[fieldname] else: - raise KeyError("No field named %r" % fieldname) + raise KeyError(f"No field named {fieldname!r}") def indexable_fields(self, fieldname): if fieldname in self._subfields: @@ -1653,7 +1651,7 @@ def ensure_schema(schema): if isinstance(schema, type) and issubclass(schema, Schema): schema = schema.schema() if not isinstance(schema, Schema): - raise FieldConfigurationError("%r is not a Schema" % schema) + raise FieldConfigurationError(f"{schema!r} is not a Schema") return schema @@ -1664,7 +1662,7 @@ def merge_fielddict(d1, d2): field1 = d1.get(name) field2 = d2.get(name) if field1 and field2 and field1 != field2: - raise Exception("Inconsistent field %r: %r != %r" % (name, field1, field2)) + raise Exception(f"Inconsistent field {name!r}: {field1!r} != {field2!r}") out[name] = field1 or field2 return out diff --git a/src/whoosh/filedb/compound.py b/src/whoosh/filedb/compound.py index 1e257774..7f1e2793 100644 --- a/src/whoosh/filedb/compound.py +++ b/src/whoosh/filedb/compound.py @@ -28,8 +28,8 @@ import errno import os import sys -from threading import Lock from shutil import copyfileobj +from threading import Lock try: import mmap @@ -37,8 +37,8 @@ mmap = None from whoosh.compat import BytesIO, memoryview_ -from whoosh.filedb.structfile import BufferFile, StructFile from whoosh.filedb.filestore import FileStorage, StorageError +from whoosh.filedb.structfile import BufferFile, StructFile from whoosh.system import emptybytes from whoosh.util import random_name @@ -73,7 +73,7 @@ def __init__(self, dbfile, use_mmap=True, basepos=0): try: fileno = self._file.fileno() self._source = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ) - except (mmap.error, OSError): + except OSError: e = sys.exc_info()[1] # If we got an error because there wasn't enough memory to # open the map, ignore it and fall through, we'll just use the @@ -88,7 +88,7 @@ def __init__(self, dbfile, use_mmap=True, basepos=0): self._file = None def __repr__(self): - return "<%s (%s)>" % (self.__class__.__name__, self._name) + return f"<{self.__class__.__name__} ({self._name})>" def close(self): if self.is_closed: @@ -107,7 
+107,7 @@ def range(self, name): try: fileinfo = self._dir[name] except KeyError: - raise NameError("Unknown file %r" % (name,)) + raise NameError(f"Unknown file {name!r}") return fileinfo["offset"], fileinfo["length"] def open_file(self, name, *args, **kwargs): @@ -185,7 +185,7 @@ def write_dir(dbfile, basepos, directory, options=None): dbfile.close() -class SubFile(object): +class SubFile: def __init__(self, parentfile, offset, length, name=None): self._file = parentfile self._offset = offset @@ -247,11 +247,11 @@ def tell(self): return self._pos -class CompoundWriter(object): +class CompoundWriter: def __init__(self, tempstorage, buffersize=32 * 1024): assert isinstance(buffersize, int) self._tempstorage = tempstorage - self._tempname = "%s.ctmp" % random_name() + self._tempname = f"{random_name()}.ctmp" self._temp = tempstorage.create_file(self._tempname, mode="w+b") self._buffersize = buffersize self._streams = {} @@ -298,7 +298,7 @@ def save_as_files(self, storage, name_fn): f.write(block) f.close() - class SubStream(object): + class SubStream: def __init__(self, dbfile, buffersize): self._dbfile = dbfile self._buffersize = buffersize diff --git a/src/whoosh/filedb/fileindex.py b/src/whoosh/filedb/fileindex.py index 7148ee7e..9b3e9985 100644 --- a/src/whoosh/filedb/fileindex.py +++ b/src/whoosh/filedb/fileindex.py @@ -15,12 +15,12 @@ # =============================================================================== import os +import pickle import re from bisect import bisect_right from threading import Lock from time import time -import pickle from whoosh import __version__ from whoosh.fields import Schema from whoosh.index import ( @@ -28,9 +28,9 @@ EmptyIndexError, Index, IndexVersionError, + LockError, OutOfDateError, ) -from whoosh.index import LockError from whoosh.support.bitvector import BitVector from whoosh.system import _FLOAT_SIZE, _INT_SIZE @@ -42,7 +42,7 @@ # well as Index for convenience, so they're broken out here. -class SegmentDeletionMixin(object): +class SegmentDeletionMixin: """Mix-in for classes that support deleting documents from self.segments.""" def delete_document(self, docnum, delete=True): @@ -72,7 +72,7 @@ def __init__(self, storage, schema, create=False, indexname=_DEF_INDEX_NAME): self.indexname = indexname if schema is not None and not isinstance(schema, Schema): - raise ValueError("%r is not a Schema object" % schema) + raise ValueError(f"{schema!r} is not a Schema object") self.generation = self.latest_generation() @@ -86,7 +86,7 @@ def __init__(self, storage, schema, create=False, indexname=_DEF_INDEX_NAME): self.segments = SegmentSet() # Clear existing files - prefix = "_%s_" % self.indexname + prefix = f"_{self.indexname}_" for filename in self.storage: if filename.startswith(prefix): storage.delete_file(filename) @@ -96,7 +96,7 @@ def __init__(self, storage, schema, create=False, indexname=_DEF_INDEX_NAME): self._read(schema) else: raise EmptyIndexError( - "No index named %r in storage %r" % (indexname, storage) + f"No index named {indexname!r} in storage {storage!r}" ) # Open a reader for this index. This is used by the @@ -107,7 +107,7 @@ def __init__(self, storage, schema, create=False, indexname=_DEF_INDEX_NAME): self.segment_num_lock = None def __repr__(self): - return "%s(%r, %r)" % (self.__class__.__name__, self.storage, self.indexname) + return f"{self.__class__.__name__}({self.storage!r}, {self.indexname!r})" def _acquire_readlocks(self): self._readlocks = [ @@ -151,7 +151,7 @@ def _write(self): # Use a temporary file for atomic write. 
tocfilename = self._toc_filename() - tempfilename = "%s.%s" % (tocfilename, time()) + tempfilename = f"{tocfilename}.{time()}" stream = self.storage.create_file(tempfilename) stream.write_varint(_INT_SIZE) @@ -185,7 +185,7 @@ def _read(self, schema): version = stream.read_int() if version != _INDEX_VERSION: - raise IndexVersionError("Can't read format %s" % version, version) + raise IndexVersionError(f"Can't read format {version}", version) self.version = version self.release = ( stream.read_varint(), @@ -215,7 +215,7 @@ def _next_segment_name(self): if self.segment_num_lock.acquire(): try: self.segment_counter += 1 - return "_%s_%s" % (self.indexname, self.segment_counter) + return f"_{self.indexname}_{self.segment_counter}" finally: self.segment_num_lock.release() else: @@ -224,7 +224,7 @@ def _next_segment_name(self): def _toc_filename(self): # Returns the computed filename of the TOC for this index name and # generation. - return "_%s_%s.toc" % (self.indexname, self.generation) + return f"_{self.indexname}_{self.generation}.toc" def last_modified(self): return self.storage.file_modified(self._toc_filename()) @@ -272,7 +272,7 @@ def _clean_files(self): # probably be deleted eventually by a later call to clean_files. storage = self.storage - current_segment_names = set(s.name for s in self.segments) + current_segment_names = {s.name for s in self.segments} tocpattern = _toc_pattern(self.indexname) segpattern = _segment_pattern(self.indexname) @@ -317,7 +317,7 @@ def writer(self, **kwargs): # SegmentSet object -class SegmentSet(object): +class SegmentSet: """This class is never instantiated by the user. It is used by the Index object to keep track of the segments in the index. """ @@ -450,7 +450,7 @@ def reader(self, storage, schema): return MultiReader(readers, schema) -class Segment(object): +class Segment: """Do not instantiate this object directly. It is used by the Index object to hold information about a segment. A list of objects of this class are pickled as part of the TOC file. @@ -496,12 +496,12 @@ def __init__( self._filenames = set() for attr, ext in self.EXTENSIONS.iteritems(): - fname = "%s.%s" % (self.name, ext) + fname = f"{self.name}.{ext}" setattr(self, attr + "_filename", fname) self._filenames.add(fname) def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self.name) + return f"{self.__class__.__name__}({self.name!r})" def copy(self): if self.deleted: @@ -575,13 +575,13 @@ def delete_document(self, docnum, delete=True): self.deleted = set() elif docnum in self.deleted: raise KeyError( - "Document %s in segment %r is already deleted" % (docnum, self.name) + f"Document {docnum} in segment {self.name!r} is already deleted" ) self.deleted.add(docnum) else: if self.deleted is None or docnum not in self.deleted: - raise KeyError("Document %s is not deleted" % docnum) + raise KeyError(f"Document {docnum} is not deleted") self.deleted.clear(docnum) @@ -601,7 +601,7 @@ def _toc_pattern(indexname): name is the name of the index. """ - return re.compile("_%s_([0-9]+).toc" % indexname) + return re.compile(f"_{indexname}_([0-9]+).toc") def _segment_pattern(indexname): @@ -609,4 +609,4 @@ def _segment_pattern(indexname): name is the name of the index. 
""" - return re.compile("(_%s_[0-9]+).(%s)" % (indexname, Segment.EXTENSIONS.values())) + return re.compile(f"(_{indexname}_[0-9]+).({Segment.EXTENSIONS.values()})") diff --git a/src/whoosh/filedb/filepostings.py b/src/whoosh/filedb/filepostings.py index 3daf19bf..abbae1b5 100644 --- a/src/whoosh/filedb/filepostings.py +++ b/src/whoosh/filedb/filepostings.py @@ -17,15 +17,15 @@ import types from array import array from struct import Struct -from whoosh.support import unicode -from whoosh.writing import PostingWriter from whoosh.matching import Matcher, ReadTooFar -from whoosh.system import _INT_SIZE, _FLOAT_SIZE -from whoosh.util import utf8encode, utf8decode, length_to_byte, byte_to_length +from whoosh.support import unicode +from whoosh.system import _FLOAT_SIZE, _INT_SIZE +from whoosh.util import byte_to_length, length_to_byte, utf8decode, utf8encode +from whoosh.writing import PostingWriter -class BlockInfo(object): +class BlockInfo: __slots__ = ( "nextoffset", "postcount", @@ -318,8 +318,7 @@ def all_ids(self): blockinfo = self._read_blockinfo(nextoffset) nextoffset = blockinfo.nextoffset ids, __ = self._read_ids(blockinfo.dataoffset, blockinfo.postcount) - for id in ids: - yield id + yield from ids def next(self): if self.i == self.blockinfo.postcount - 1: diff --git a/src/whoosh/filedb/filereading.py b/src/whoosh/filedb/filereading.py index fe105d44..8b3ea6e4 100644 --- a/src/whoosh/filedb/filereading.py +++ b/src/whoosh/filedb/filereading.py @@ -14,23 +14,22 @@ # limitations under the License. # =============================================================================== -from threading import Lock from marshal import loads +from threading import Lock from whoosh.fields import FieldConfigurationError +from whoosh.filedb import misc from whoosh.filedb.filepostings import FilePostingReader from whoosh.filedb.filetables import ( - FileTableReader, FileListReader, - StructHashReader, + FileTableReader, LengthReader, + StructHashReader, ) -from whoosh.filedb import misc # from whoosh.postings import Exclude from whoosh.reading import IndexReader, TermNotFound -from whoosh.util import protected, byte_to_length - +from whoosh.util import byte_to_length, protected # Reader class @@ -67,7 +66,7 @@ def decode_storedfields(value): # Field length file scorables = schema.scorable_fields() if scorables: - self.indices = dict((fieldnum, i) for i, fieldnum in enumerate(scorables)) + self.indices = {fieldnum: i for i, fieldnum in enumerate(scorables)} lengthcount = segment.doc_count_all() * len(self.indices) flf = storage.open_file(segment.fieldlengths_filename) self.fieldlengths = flf.read_array("B", lengthcount) @@ -105,7 +104,7 @@ def _open_postfile(self): ) def __repr__(self): - return "%s(%s)" % (self.__class__.__name__, self.segment) + return f"{self.__class__.__name__}({self.segment})" @protected def __contains__(self, term): @@ -169,7 +168,7 @@ def _term_info(self, fieldnum, text): try: return self.termsindex[(fieldnum, text)] except KeyError: - raise TermNotFound("%s:%r" % (fieldnum, text)) + raise TermNotFound(f"{fieldnum}:{text!r}") def doc_frequency(self, fieldid, text): try: @@ -219,7 +218,7 @@ def postings(self, fieldid, text, exclude_docs=frozenset()): try: offset = self.termsindex[(fieldnum, text)][1] except KeyError: - raise TermNotFound("%s:%r" % (fieldid, text)) + raise TermNotFound(f"{fieldid}:{text!r}") if self.segment.deleted and exclude_docs: exclude_docs = self.segment.deleted | exclude_docs @@ -237,13 +236,11 @@ def vector(self, docnum, fieldid): fieldnum = 
schema.to_number(fieldid) vformat = schema[fieldnum].vector if not vformat: - raise Exception("No vectors are stored for field %r" % fieldid) + raise Exception(f"No vectors are stored for field {fieldid!r}") self._open_vectors() offset = self.vectorindex.get((docnum, fieldnum)) if offset is None: - raise Exception( - "No vector found for document %s field %r" % (docnum, fieldid) - ) + raise Exception(f"No vector found for document {docnum} field {fieldid!r}") return FilePostingReader(self.vpostfile, offset, vformat, stringids=True) diff --git a/src/whoosh/filedb/filestore.py b/src/whoosh/filedb/filestore.py index f21eca2a..d142e898 100644 --- a/src/whoosh/filedb/filestore.py +++ b/src/whoosh/filedb/filestore.py @@ -25,8 +25,11 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import with_statement -import errno, os, sys, tempfile + +import errno +import os +import sys +import tempfile from threading import Lock from whoosh.compat import BytesIO, memoryview_ @@ -35,7 +38,6 @@ from whoosh.util import random_name from whoosh.util.filelock import FileLock - # Exceptions @@ -50,7 +52,7 @@ class ReadOnlyError(StorageError): # Base class -class Storage(object): +class Storage: """Abstract base class for storage objects. A storage object is a virtual flat filesystem, allowing the creation and @@ -406,7 +408,7 @@ def __init__(self, path, supports_mmap=True, readonly=False, debug=False): self.locks = {} def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self.folder) + return f"{self.__class__.__name__}({self.folder!r})" def create(self): """Creates this storage object's directory path using ``os.makedirs`` if @@ -446,7 +448,7 @@ def create(self): # Raise an exception if the given path is not a directory if not os.path.isdir(dirpath): - e = IOError("%r is not a directory" % dirpath) + e = IOError(f"{dirpath!r} is not a directory") e.errno = errno.ENOTDIR raise e @@ -463,7 +465,7 @@ def destroy(self): try: # Try to remove the directory os.rmdir(self.folder) - except IOError: + except OSError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: pass @@ -527,7 +529,7 @@ def clean(self, ignore=False): def list(self): try: files = os.listdir(self.folder) - except IOError: + except OSError: files = [] return files @@ -553,7 +555,7 @@ def rename_file(self, oldname, newname, safe=False): if os.path.exists(self._fpath(newname)): if safe: - raise NameError("File %r exists" % newname) + raise NameError(f"File {newname!r} exists") else: os.remove(self._fpath(newname)) os.rename(self._fpath(oldname), self._fpath(newname)) @@ -562,7 +564,7 @@ def lock(self, name): return FileLock(self._fpath(name)) def temp_storage(self, name=None): - name = name or "%s.tmp" % random_name() + name = name or f"{random_name()}.tmp" path = os.path.join(self.folder, name) tempstore = FileStorage(path) return tempstore.create() @@ -611,7 +613,7 @@ def rename_file(self, name, newname, safe=False): if name not in self.files: raise NameError(name) if safe and newname in self.files: - raise NameError("File %r exists" % newname) + raise NameError(f"File {newname!r} exists") content = self.files[name] del self.files[name] @@ -637,7 +639,7 @@ def lock(self, name): def temp_storage(self, name=None): tdir = tempfile.gettempdir() - name = name or "%s.tmp" % random_name() + name = name or f"{random_name()}.tmp" path = os.path.join(tdir, name) tempstore = FileStorage(path) return tempstore.create() diff --git 
a/src/whoosh/filedb/filetables.py b/src/whoosh/filedb/filetables.py index 0e877468..46045012 100644 --- a/src/whoosh/filedb/filetables.py +++ b/src/whoosh/filedb/filetables.py @@ -30,15 +30,15 @@ D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html). """ -import os, struct, sys +import os +import struct +import sys from binascii import crc32 from hashlib import md5 # type: ignore @UnresolvedImport from whoosh.compat import b, bytes_type -from whoosh.compat import range -from whoosh.util.numlists import GrowableArray from whoosh.system import _INT_SIZE, emptybytes - +from whoosh.util.numlists import GrowableArray # Exceptions @@ -85,7 +85,7 @@ def crc_hash(key): # Basic hash file -class HashWriter(object): +class HashWriter: """Implements a fast on-disk key-value store. This hash uses a two-level hashing scheme, where a key is hashed, the low eight bits of the hash value are used to index into one of 256 hash tables. This is basically the CDB @@ -219,7 +219,7 @@ def close(self): return endpos -class HashReader(object): +class HashReader: """Reader for the fast on-disk key-value files created by :class:`HashWriter`. """ @@ -248,7 +248,7 @@ def __init__(self, dbfile, length=None, magic=b("HSH3"), startoffset=0): # Check format tag filemagic = dbfile.read(4) if filemagic != magic: - raise FileFormatError("Unknown file header %r" % filemagic) + raise FileFormatError(f"Unknown file header {filemagic!r}") # Read hash type self.hashtype = dbfile.read_byte() self.hashfn = _hash_functions[self.hashtype] @@ -299,7 +299,7 @@ def _read_extras(self): def close(self): if self.is_closed: - raise Exception("Tried to close %r twice" % self) + raise Exception(f"Tried to close {self!r} twice") self.dbfile.close() self.is_closed = True @@ -390,7 +390,7 @@ def ranges_for_key(self, key): """ if not isinstance(key, bytes_type): - raise TypeError("Key %r should be bytes" % key) + raise TypeError(f"Key {key!r} should be bytes") dbfile = self.dbfile # Hash the key @@ -457,7 +457,7 @@ def __init__(self, dbfile): def add(self, key, value): if key <= self.lastkey: - raise ValueError("Keys must increase: %r..%r" % (self.lastkey, key)) + raise ValueError(f"Keys must increase: {self.lastkey!r}..{key!r}") self.index.append(self.dbfile.tell()) HashWriter.add(self, key, value) self.lastkey = key @@ -496,8 +496,7 @@ def ranges_from(self, key): if pos is None: return - for item in self._ranges(pos=pos): - yield item + yield from self._ranges(pos=pos) def keys_from(self, key): """Yields an ordered series of keys equal to or greater than the given @@ -540,13 +539,13 @@ def _read_extras(self): elif indextype == "q": self._get_pos = dbfile.get_long else: - raise Exception("Unknown index type %r" % indextype) + raise Exception(f"Unknown index type {indextype!r}") def closest_key_pos(self, key): # Given a key, return the position of that key OR the next highest key # if the given key does not exist if not isinstance(key, bytes_type): - raise TypeError("Key %r should be bytes" % key) + raise TypeError(f"Key {key!r} should be bytes") indexbase = self.indexbase indexsize = self.indexsize @@ -596,7 +595,7 @@ def start_field(self, fieldname): def add(self, key, value): if key <= self.lastkey: - raise ValueError("Keys must increase: %r..%r" % (self.lastkey, key)) + raise ValueError(f"Keys must increase: {self.lastkey!r}..{key!r}") self.poses.append(self.dbfile.tell() - self.fieldstart) HashWriter.add(self, key, value) self.lastkey = key @@ -676,7 +675,7 @@ def closest_term_pos(self, fieldname, key): # Given a key, return the position of 
that key OR the next highest key # if the given key does not exist if not isinstance(key, bytes_type): - raise TypeError("Key %r should be bytes" % key) + raise TypeError(f"Key {key!r} should be bytes") dbfile = self.dbfile key_at = self.key_at @@ -693,7 +692,7 @@ def closest_term_pos(self, fieldname, key): elif ixtype == "q": get_pos = dbfile.get_long else: - raise Exception("Unknown index type %r" % ixtype) + raise Exception(f"Unknown index type {ixtype!r}") # Do a binary search of the positions in the index array lo = 0 @@ -724,8 +723,7 @@ def term_ranges_from(self, fieldname, btext): return startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] - for item in self._ranges(pos, ixpos): - yield item + yield from self._ranges(pos, ixpos) def terms_from(self, fieldname, btext): dbfile = self.dbfile diff --git a/src/whoosh/filedb/filewriting.py b/src/whoosh/filedb/filewriting.py index e204044a..36060a68 100644 --- a/src/whoosh/filedb/filewriting.py +++ b/src/whoosh/filedb/filewriting.py @@ -16,25 +16,24 @@ from collections import defaultdict from marshal import dumps -from build.lib.whoosh.support import unicode from whoosh.fields import UnknownFieldError -from whoosh.filedb.fileindex import SegmentDeletionMixin, Segment, SegmentSet +from whoosh.filedb import misc +from whoosh.filedb.fileindex import Segment, SegmentDeletionMixin, SegmentSet from whoosh.filedb.filepostings import FilePostingWriter from whoosh.filedb.filetables import ( FileListWriter, FileTableWriter, - StructHashWriter, LengthWriter, + StructHashWriter, ) -from whoosh.filedb import misc -from whoosh.filedb.pools import TempfilePool, MultiPool +from whoosh.filedb.pools import MultiPool, TempfilePool from whoosh.index import LockError -from whoosh.util.filelock import try_for +from whoosh.support import unicode from whoosh.util import fib +from whoosh.util.filelock import try_for from whoosh.writing import IndexWriter - # Merge policies # A merge policy is a callable that takes the Index object, the SegmentWriter @@ -44,6 +43,7 @@ def NO_MERGE(ix, writer, segments): """This policy does not merge any existing segments.""" + _ = ix, writer return segments @@ -86,7 +86,7 @@ def __init__( blocklimit=128, timeout=0.0, delay=0.1, - **poolargs + **poolargs, ): self.lock = ix.storage.lock(ix.indexname + "_LOCK") if not try_for(self.lock.acquire, timeout=timeout, delay=delay): @@ -213,7 +213,7 @@ def add_document(self, **fields): # Check if the caller gave us a bogus field for name in fieldnames: if name not in schema: - raise UnknownFieldError("There is no field named %r" % name) + raise UnknownFieldError(f"There is no field named {name!r}") storedvalues = {} @@ -258,7 +258,7 @@ def _add_vector(self, fieldnum, vlist): offset = vpostwriter.start(vformat) for text, valuestring in vlist: - assert isinstance(text, unicode), "%r is not unicode" % text + assert isinstance(text, unicode), f"{text!r} is not unicode" vpostwriter.write(text, valuestring) vpostwriter.finish() diff --git a/src/whoosh/filedb/gae.py b/src/whoosh/filedb/gae.py index 2e13d846..7ffca5ea 100644 --- a/src/whoosh/filedb/gae.py +++ b/src/whoosh/filedb/gae.py @@ -23,9 +23,9 @@ from google.appengine.ext import db # type: ignore @UnresolvedImport from whoosh.compat import BytesIO -from whoosh.index import TOC, FileIndex, _DEF_INDEX_NAME from whoosh.filedb.filestore import ReadOnlyError, Storage from whoosh.filedb.structfile import StructFile +from whoosh.index import _DEF_INDEX_NAME, TOC, FileIndex class DatastoreFile(db.Model): @@ -37,7 +37,7 @@ class 
DatastoreFile(db.Model): mtime = db.IntegerProperty(default=0) def __init__(self, *args, **kwargs): - super(DatastoreFile, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.data = BytesIO() @classmethod @@ -78,7 +78,7 @@ def getvalue(self): return self.data.getvalue() -class MemcacheLock(object): +class MemcacheLock: def __init__(self, name): self.name = name diff --git a/src/whoosh/filedb/pools.py b/src/whoosh/filedb/pools.py index 3ca83768..fee6f3d0 100644 --- a/src/whoosh/filedb/pools.py +++ b/src/whoosh/filedb/pools.py @@ -198,7 +198,7 @@ def write_postings(schema, termtable, postwriter, postiter): ) -class LengthSpool(object): +class LengthSpool: def __init__(self, filename): self.filename = filename self.file = None @@ -224,7 +224,7 @@ def readback(self): f.close() -class PoolBase(object): +class PoolBase: def __init__(self, dir): self._dir = dir self._fieldlength_totals = defaultdict(int) diff --git a/src/whoosh/filedb/structfile.py b/src/whoosh/filedb/structfile.py index ffe4649f..9db58ffd 100644 --- a/src/whoosh/filedb/structfile.py +++ b/src/whoosh/filedb/structfile.py @@ -29,24 +29,39 @@ from copy import copy from struct import calcsize -from whoosh.compat import BytesIO, bytes_type +from whoosh.compat import BytesIO, array_frombytes, array_tobytes, bytes_type from whoosh.compat import dump as dump_pickle from whoosh.compat import load as load_pickle -from whoosh.compat import array_frombytes, array_tobytes -from whoosh.system import _INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE -from whoosh.system import IS_LITTLE -from whoosh.system import pack_byte, unpack_byte, pack_sbyte, unpack_sbyte -from whoosh.system import pack_ushort, unpack_ushort -from whoosh.system import pack_ushort_le, unpack_ushort_le -from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint -from whoosh.system import pack_uint_le, unpack_uint_le -from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong -from whoosh.system import pack_float, unpack_float -from whoosh.util.varints import varint, read_varint -from whoosh.util.varints import signed_varint, decode_signed_varint - - -_SIZEMAP = dict((typecode, calcsize(typecode)) for typecode in "bBiIhHqQf") +from whoosh.system import ( + _FLOAT_SIZE, + _INT_SIZE, + _LONG_SIZE, + _SHORT_SIZE, + IS_LITTLE, + pack_byte, + pack_float, + pack_int, + pack_long, + pack_sbyte, + pack_uint, + pack_uint_le, + pack_ulong, + pack_ushort, + pack_ushort_le, + unpack_byte, + unpack_float, + unpack_int, + unpack_long, + unpack_sbyte, + unpack_uint, + unpack_uint_le, + unpack_ulong, + unpack_ushort, + unpack_ushort_le, +) +from whoosh.util.varints import decode_signed_varint, read_varint, signed_varint, varint + +_SIZEMAP = {typecode: calcsize(typecode) for typecode in "bBiIhHqQf"} _ORDERMAP = {"little": "<", "big": ">"} _types = (("sbyte", "b"), ("ushort", "H"), ("int", "i"), ("long", "q"), ("float", "f")) @@ -55,7 +70,7 @@ # Main function -class StructFile(object): +class StructFile: """Returns a "structured file" object that wraps the given file object and provides numerous additional methods for writing structured data, such as "write_varint" and "write_long". 
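# Sketch of the varint shape that write_varint/read_varint deal in: seven
# payload bits per byte, with the high bit set on every byte except the
# last (LEB128-style). Illustrative only; the authoritative byte layout
# lives in whoosh.util.varints.
def varint_bytes(n):
    out = bytearray()
    while n > 0x7F:
        out.append((n & 0x7F) | 0x80)  # low 7 bits, continuation bit set
        n >>= 7
    out.append(n)  # final byte, continuation bit clear
    return bytes(out)

assert varint_bytes(0) == b"\x00"
assert varint_bytes(127) == b"\x7f"
assert len(varint_bytes(128)) == 2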
@@ -72,7 +87,7 @@ def __init__(self, fileobj, name=None, onclose=None): self.fileno = fileobj.fileno def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self._name) + return f"{self.__class__.__name__}({self._name!r})" def __str__(self): return self._name diff --git a/src/whoosh/formats.py b/src/whoosh/formats.py index 43d4aadc..ef36f195 100644 --- a/src/whoosh/formats.py +++ b/src/whoosh/formats.py @@ -33,17 +33,22 @@ from collections import defaultdict -from whoosh.analysis import unstopped, entoken -from whoosh.compat import iteritems, dumps, loads, b -from whoosh.system import emptybytes -from whoosh.system import _INT_SIZE, _FLOAT_SIZE -from whoosh.system import pack_uint, unpack_uint, pack_float, unpack_float - +from whoosh.analysis import entoken, unstopped +from whoosh.compat import b, dumps, iteritems, loads +from whoosh.system import ( + _FLOAT_SIZE, + _INT_SIZE, + emptybytes, + pack_float, + pack_uint, + unpack_float, + unpack_uint, +) # Format base class -class Format(object): +class Format: """Abstract base class representing a storage format for a field or vector. Format objects are responsible for writing and reading the low-level representation of a field. It controls what kind/level of information to @@ -71,7 +76,7 @@ def __eq__(self, other): ) def __repr__(self): - return "%s(boost=%s)" % (self.__class__.__name__, self.field_boost) + return f"{self.__class__.__name__}(boost={self.field_boost})" def fixed_value_size(self): if self.posting_size < 0: @@ -147,7 +152,7 @@ def __init__(self, field_boost=1.0, **options): def word_values(self, value, analyzer, **kwargs): fb = self.field_boost - wordset = set(t.text for t in tokens(value, analyzer, kwargs)) + wordset = {t.text for t in tokens(value, analyzer, kwargs)} return ((w, 1, fb, emptybytes) for w in wordset) def encode(self, value): diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py index 3b526413..562c68c1 100644 --- a/src/whoosh/highlight.py +++ b/src/whoosh/highlight.py @@ -48,7 +48,6 @@ See :doc:`/highlight` for more information. """ -from __future__ import division from collections import deque from heapq import nlargest @@ -80,7 +79,7 @@ def mkfrag(text, tokens, startchar=None, endchar=None, charsbefore=0, charsafter return Fragment(text, tokens, startchar, endchar) -class Fragment(object): +class Fragment: """Represents a fragment (extract) from a hit document. This object is mainly used to keep track of the start and end points of the fragment and the "matched" character ranges inside; it does not contain the text of the @@ -194,7 +193,7 @@ def set_matched_filter_phrases(tokens, text, terms, phrases): """ Implementation note: Because the Token object follows a Singleton pattern, we can only read each one once. Because phrase matching requires rescanning, - we require a rendered token list (the text parameter) instead. The function must + we require a rendered token list (the text parameter) instead. The function must still yield Token objects at the end, so the text list is used as a way to build a list of Token indices (the matches set). The yield loop at the end uses this to properly set .matched on the yielded Token objects. 
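# Simplified model of the phrase scan described in the implementation note
# above: for each occurrence of the first phrase word, look for each
# following word within `slop` positions of the previous match. The real
# filter scans the slop window reversed to prefer the closest occurrence;
# this sketch takes the first one and is illustrative only.
def phrase_positions(words, phrase, slop=1):
    hits = []
    for i, w in enumerate(words):
        if w != phrase[0]:
            continue
        pos, ok = i, True
        for target in phrase[1:]:
            window = words[pos + 1 : pos + 1 + slop]
            if target not in window:
                ok = False
                break
            pos += 1 + window.index(target)
        if ok:
            hits.append(i)
    return hits

assert phrase_positions("the quick brown fox".split(), ["quick", "fox"], slop=2) == [1]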
@@ -241,9 +240,7 @@ def set_matched_filter_phrases(tokens, text, terms, phrases): """ text_sub = text[ current_word_index + 1 : current_word_index + 1 + slop - ][ - ::-1 - ] # Substring to scan (reversed) + ][::-1]  # Substring to scan (reversed) len_sub = len(text_sub) next_word_index = ( len_sub - text_sub.index(word) - 1 @@ -276,7 +273,7 @@ def set_matched_filter_phrases(tokens, text, terms, phrases): # Fragmenters -class Fragmenter(object): +class Fragmenter: def must_retokenize(self): """Returns True if this fragmenter requires retokenized text. @@ -607,7 +604,7 @@ def fragment_matches(self, text, tokens): # Fragment scorers -class FragmentScorer(object): +class FragmentScorer: pass @@ -664,7 +661,7 @@ def get_text(original, token, replace): return original[token.startchar : token.endchar] -class Formatter(object): +class Formatter: """Base class for formatters. For highlighters that return strings, it is usually only necessary to @@ -864,8 +861,13 @@ def __init__(self, qname="strong", between="..."): self.qname = qname self.between = between - from genshi.core import START, END, TEXT # type: ignore @UnresolvedImport - from genshi.core import Attrs, Stream # type: ignore @UnresolvedImport + from genshi.core import (  # type: ignore @UnresolvedImport + END, + START, + TEXT, + Attrs, + Stream, + ) self.START, self.END, self.TEXT = START, END, TEXT self.Attrs, self.Stream = Attrs, Stream @@ -956,7 +958,7 @@ def highlight( return formatter(text, fragments) -class Highlighter(object): +class Highlighter: def __init__( self, fragmenter=None, @@ -1052,7 +1054,7 @@ def highlight_hit( if text is None: if fieldname not in hitobj: - raise KeyError("Field %r is not stored." % fieldname) + raise KeyError(f"Field {fieldname!r} is not stored.") text = hitobj[fieldname] # Get the terms searched for/matched in this field diff --git a/src/whoosh/idsets.py b/src/whoosh/idsets.py index d17aff10..c2ce3885 100644 --- a/src/whoosh/idsets.py +++ b/src/whoosh/idsets.py @@ -6,10 +6,9 @@ from array import array from bisect import bisect_left, bisect_right -from whoosh.compat import izip, izip_longest, next, range +from whoosh.compat import izip, izip_longest, next from whoosh.util.numeric import bytes_for_bits - # Number of '1' bits in each byte (0-255) _1SPERBYTE = array( "B", @@ -274,7 +273,7 @@ ) -class DocIdSet(object): +class DocIdSet: """Base class for a set of positive integers, implementing a subset of the built-in ``set`` type's interface with extra docid-related methods.
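# The BitSet classes in this module implement DocIdSet over a byte array,
# one bit per document number (the _1SPERBYTE table above is a per-byte
# popcount used for fast counting). A minimal sketch of that representation:
from array import array

class TinyBitSet:
    def __init__(self, size):
        # one byte covers eight docnums; bytes(n) yields n zero bytes
        self.bits = array("B", bytes((size + 7) // 8))

    def add(self, n):
        self.bits[n >> 3] |= 1 << (n & 7)

    def __contains__(self, n):
        return bool(self.bits[n >> 3] & (1 << (n & 7)))

s = TinyBitSet(100)
s.add(42)
assert 42 in s and 41 not in s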
@@ -561,7 +560,7 @@ def __init__(self, source=None, size=0): add(num) def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, list(self)) + return f"{self.__class__.__name__}({list(self)!r})" def byte_count(self): return len(self.bits) @@ -709,7 +708,7 @@ def size(self): return len(self.data) * self.data.itemsize def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self.data) + return f"{self.__class__.__name__}({self.data!r})" def __len__(self): return len(self.data) @@ -766,10 +765,10 @@ def difference_update(self, other): self.data = array(self.typecode, (num for num in self if num not in other)) def intersection(self, other): - return SortedIntSet((num for num in self if num in other)) + return SortedIntSet(num for num in self if num in other) def difference(self, other): - return SortedIntSet((num for num in self if num not in other)) + return SortedIntSet(num for num in self if num not in other) def first(self): return self.data[0] diff --git a/src/whoosh/index.py b/src/whoosh/index.py index b2acf559..eac6c603 100644 --- a/src/whoosh/index.py +++ b/src/whoosh/index.py @@ -29,18 +29,17 @@ an index. """ -from __future__ import division import os.path import re import sys -from time import time, sleep +from time import sleep, time from whoosh import __version__ from whoosh.compat import pickle, string_type from whoosh.fields import ensure_schema from whoosh.legacy import toc_loaders -from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE +from whoosh.system import _FLOAT_SIZE, _INT_SIZE, _LONG_SIZE _DEF_INDEX_NAME = "MAIN" _CURRENT_TOC_VERSION = -111 @@ -217,7 +216,7 @@ def version(storage, indexname=None): # Index base class -class Index(object): +class Index: """Represents an indexed collection of documents.""" def close(self): @@ -377,7 +376,7 @@ def clean_files(storage, indexname, gen, segments): # open, they may not be deleted immediately (i.e. on Windows) but will # probably be deleted eventually by a later call to clean_files. 
- current_segment_names = set(s.segment_id() for s in segments) + current_segment_names = {s.segment_id() for s in segments} tocpattern = TOC._pattern(indexname) segpattern = TOC._segment_pattern(indexname) @@ -408,9 +407,9 @@ def __init__(self, storage, schema=None, indexname=_DEF_INDEX_NAME): from whoosh.filedb.filestore import Storage if not isinstance(storage, Storage): - raise ValueError("%r is not a Storage object" % storage) + raise ValueError(f"{storage!r} is not a Storage object") if not isinstance(indexname, string_type): - raise ValueError("indexname %r is not a string" % indexname) + raise ValueError(f"indexname {indexname!r} is not a string") if schema: schema = ensure_schema(schema) @@ -428,7 +427,7 @@ def create(cls, storage, schema, indexname=_DEF_INDEX_NAME): return cls(storage, schema, indexname) def __repr__(self): - return "%s(%r, %r)" % (self.__class__.__name__, self.storage, self.indexname) + return f"{self.__class__.__name__}({self.storage!r}, {self.indexname!r})" def close(self): pass @@ -498,7 +497,7 @@ def version(self): def _reader(cls, storage, schema, segments, generation, reuse=None): # Returns a reader for the given segments, possibly reusing already # opened readers - from whoosh.reading import SegmentReader, MultiReader, EmptyReader + from whoosh.reading import EmptyReader, MultiReader, SegmentReader if reuse: # Merge segments with reuse segments @@ -516,9 +515,7 @@ def _reader(cls, storage, schema, segments, generation, reuse=None): if reuse: # Put all atomic readers in a dictionary readers = [r for r, _ in reuse.leaf_readers()] - reusable = dict( - (r.segment(), r) for r in readers if r.segment() is not None - ) + reusable = {r.segment(): r for r in readers if r.segment() is not None} # Make a function to open readers, which reuses reusable readers. # It removes any readers it reuses from the "reusable" dictionary, @@ -561,7 +558,7 @@ def reader(self, reuse=None): info.generation, reuse=reuse, ) - except IOError: + except OSError: # Presume that we got a "file not found error" because a writer # deleted one of the files just as we were trying to open it, # and so retry a few times before actually raising the @@ -576,7 +573,7 @@ def reader(self, reuse=None): # TOC class -class TOC(object): +class TOC: """Object representing the state of the index after a commit. Essentially a container for the index's schema and the list of segment objects. 
""" @@ -597,15 +594,15 @@ def __init__( @classmethod def _filename(cls, indexname, gen): - return "_%s_%s.toc" % (indexname, gen) + return f"_{indexname}_{gen}.toc" @classmethod def _pattern(cls, indexname): - return re.compile("^_%s_([0-9]+).toc$" % indexname) + return re.compile(f"^_{indexname}_([0-9]+).toc$") @classmethod def _segment_pattern(cls, indexname): - return re.compile("(%s_[0-9a-z]+)[.][A-Za-z0-9_.]+" % indexname) + return re.compile(f"({indexname}_[0-9a-z]+)[.][A-Za-z0-9_.]+") @classmethod def _latest_generation(cls, storage, indexname): @@ -623,7 +620,7 @@ def create(cls, storage, schema, indexname=_DEF_INDEX_NAME): schema = ensure_schema(schema) # Clear existing files - prefix = "_%s_" % indexname + prefix = f"_{indexname}_" for filename in storage: if filename.startswith(prefix): storage.delete_file(filename) @@ -638,7 +635,7 @@ def read(cls, storage, indexname, gen=None, schema=None): gen = cls._latest_generation(storage, indexname) if gen < 0: raise EmptyIndexError( - "Index %r does not exist in %r" % (indexname, storage) + f"Index {indexname!r} does not exist in {storage!r}" ) # Read the content of this index from the .toc file. @@ -657,7 +654,7 @@ def check_size(name, target): check_size("long", _LONG_SIZE) check_size("float", _FLOAT_SIZE) - if not stream.read_int() == -12345: + if stream.read_int() != -12345: raise IndexError("Number misread: byte order problem") version = stream.read_int() @@ -668,7 +665,7 @@ def check_size(name, target): loader = toc_loaders[version] schema, segments = loader(stream, gen, schema, version) else: - raise IndexVersionError("Can't read format %s" % version, version) + raise IndexVersionError(f"Can't read format {version}", version) else: # If the user supplied a schema object with the constructor, don't # load the pickled schema from the saved index. @@ -694,7 +691,7 @@ def write(self, storage, indexname): # Use a temporary file for atomic write. tocfilename = self._filename(indexname, self.generation) - tempfilename = "%s.%s" % (tocfilename, time()) + tempfilename = f"{tocfilename}.{time()}" stream = storage.create_file(tempfilename) stream.write_varint(_INT_SIZE) @@ -715,10 +712,10 @@ def write(self, storage, indexname): pickle.dumps(field) except pickle.PicklingError: e = sys.exc_info()[1] - raise pickle.PicklingError("%s %s=%r" % (e, fieldname, field)) + raise pickle.PicklingError(f"{e} {fieldname}={field!r}") except TypeError: e = sys.exc_info()[1] - raise TypeError("%s %s=%r" % (e, fieldname, field)) + raise TypeError(f"{e} {fieldname}={field!r}") # Otherwise, re-raise the original exception raise diff --git a/src/whoosh/lang/__init__.py b/src/whoosh/lang/__init__.py index 54a7e04d..1d4924cd 100644 --- a/src/whoosh/lang/__init__.py +++ b/src/whoosh/lang/__init__.py @@ -1,5 +1,3 @@ -# coding=utf-8 - # Copyright 2012 Matt Chaput. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -30,6 +28,7 @@ # Exceptions + class NoStemmer(Exception): pass @@ -40,50 +39,76 @@ class NoStopWords(Exception): # Data and functions for language names -languages = ("ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt", - "ro", "ru", "es", "sv", "tr") +languages = ( + "ar", + "da", + "nl", + "en", + "fi", + "fr", + "de", + "hu", + "it", + "no", + "pt", + "ro", + "ru", + "es", + "sv", + "tr", +) aliases = { - # By ISO 639-1 three letter codes - "ara": "ar", - "dan": "da", "nld": "nl", "eng": "en", "fin": "fi", "fra": "fr", - "deu": "de", "hun": "hu", "ita": "it", "nor": "no", "por": "pt", - "ron": "ro", "rus": "ru", "spa": "es", "swe": "sv", "tur": "tr", - - # By name in English - "arabic": "ar", - "danish": "da", - "dutch": "nl", - "english": "en", - "finnish": "fi", - "french": "fr", - "german": "de", - "hungarian": "hu", - "italian": "it", - "norwegian": "no", - "portuguese": "pt", - "romanian": "ro", - "russian": "ru", - "spanish": "es", - "swedish": "sv", - "turkish": "tr", - - # By name in own language - "العربية": "ar", - "dansk": "da", - "nederlands": "nl", - "suomi": "fi", - "français": "fr", - "deutsch": "de", - "magyar": "hu", - "italiano": "it", - "norsk": "no", - "português": "pt", - "русский язык": "ru", - "español": "es", - "svenska": "sv", - "türkçe": "tr", - } + # By ISO 639-1 three letter codes + "ara": "ar", + "dan": "da", + "nld": "nl", + "eng": "en", + "fin": "fi", + "fra": "fr", + "deu": "de", + "hun": "hu", + "ita": "it", + "nor": "no", + "por": "pt", + "ron": "ro", + "rus": "ru", + "spa": "es", + "swe": "sv", + "tur": "tr", + # By name in English + "arabic": "ar", + "danish": "da", + "dutch": "nl", + "english": "en", + "finnish": "fi", + "french": "fr", + "german": "de", + "hungarian": "hu", + "italian": "it", + "norwegian": "no", + "portuguese": "pt", + "romanian": "ro", + "russian": "ru", + "spanish": "es", + "swedish": "sv", + "turkish": "tr", + # By name in own language + "العربية": "ar", + "dansk": "da", + "nederlands": "nl", + "suomi": "fi", + "français": "fr", + "deutsch": "de", + "magyar": "hu", + "italiano": "it", + "norsk": "no", + "português": "pt", + "русский язык": "ru", + "español": "es", + "svenska": "sv", + "türkçe": "tr", +} def two_letter_code(name): @@ -96,6 +121,7 @@ def two_letter_code(name): # Getter functions + def has_stemmer(lang): try: return bool(stemmer_for_language(lang)) @@ -115,19 +141,22 @@ def stemmer_for_language(lang): # Original porter stemming algorithm is several times faster than the # more correct porter2 algorithm in snowball package from .porter import stem as porter_stem + return porter_stem tlc = two_letter_code(lang) if tlc == "ar": from .isri import ISRIStemmer + return ISRIStemmer().stem from .snowball import classes as snowball_classes + if tlc in snowball_classes: return snowball_classes[tlc]().stem - raise NoStemmer("No stemmer available for %r" % lang) + raise NoStemmer(f"No stemmer available for {lang!r}") def stopwords_for_language(lang): @@ -137,4 +166,4 @@ def stopwords_for_language(lang): if tlc in stoplists: return stoplists[tlc] - raise NoStopWords("No stop-word list available for %r" % lang) + raise NoStopWords(f"No stop-word list available for {lang!r}") diff --git a/src/whoosh/lang/dmetaphone.py b/src/whoosh/lang/dmetaphone.py index b7bb23ce..ac5a78a4 100644 --- a/src/whoosh/lang/dmetaphone.py +++ b/src/whoosh/lang/dmetaphone.py @@ -1,5 +1,3 @@ -# coding= utf-8 - # This script implements the Double Metaphone algorythm (c) 
1998, 1999 by # Lawrence Philips. It was translated to Python from the C source written by # Kevin Atkinson (http://aspell.net/metaphone/) By Andrew Collins - January 12, @@ -15,7 +13,13 @@ silent_starts = re.compile("GN|KN|PN|WR|PS") -def double_metaphone(text): +def double_metaphone(text): # noqa: C901, PLR0912, PLR0915 + """ + This function is too complex (125) -- ruff rule C901 + This function has too many branches (181) -- ruff rule PLR0912 + This function has too many statements (318) -- ruff rule PLR0915 + Future edits to this function should reduce, not increase its complexity. + """ text = text.upper() slavo_germanic = bool(slavo_germ_exp.search(text)) diff --git a/src/whoosh/lang/isri.py b/src/whoosh/lang/isri.py index dafffd8b..97d32328 100644 --- a/src/whoosh/lang/isri.py +++ b/src/whoosh/lang/isri.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Natural Language Toolkit: The ISRI Arabic Stemmer # @@ -34,12 +33,12 @@ root. """ -from __future__ import unicode_literals + import re -class ISRIStemmer(object): - ''' +class ISRIStemmer: + """ ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary. Information Science Research Institute. University of Nevada, Las Vegas, USA. @@ -51,93 +50,136 @@ class ISRIStemmer(object): The ISRI Stemmer requires that all tokens have Unicode string types. If you use Python IDLE on Arabic Windows you have to decode text first using Arabic '1256' coding. - ''' + """ def __init__(self): - self.stm = 'defult none' - - self.p3 = ['\u0643\u0627\u0644', '\u0628\u0627\u0644', - '\u0648\u0644\u0644', '\u0648\u0627\u0644'] # length three prefixes - self.p2 = ['\u0627\u0644', '\u0644\u0644'] # length two prefixes - self.p1 = ['\u0644', '\u0628', '\u0641', '\u0633', '\u0648', - '\u064a', '\u062a', '\u0646', '\u0627'] # length one prefixes - - self.s3 = ['\u062a\u0645\u0644', '\u0647\u0645\u0644', - '\u062a\u0627\u0646', '\u062a\u064a\u0646', - '\u0643\u0645\u0644'] # length three suffixes - self.s2 = ['\u0648\u0646', '\u0627\u062a', '\u0627\u0646', - '\u064a\u0646', '\u062a\u0646', '\u0643\u0645', - '\u0647\u0646', '\u0646\u0627', '\u064a\u0627', - '\u0647\u0627', '\u062a\u0645', '\u0643\u0646', - '\u0646\u064a', '\u0648\u0627', '\u0645\u0627', - '\u0647\u0645'] # length two suffixes - self.s1 = ['\u0629', '\u0647', '\u064a', '\u0643', '\u062a', - '\u0627', '\u0646'] # length one suffixes - - self.pr4 = {0: ['\u0645'], 1:['\u0627'], - 2: ['\u0627', '\u0648', '\u064A'], 3:['\u0629']} # groups of length four patterns - self.pr53 = {0: ['\u0627', '\u062a'], - 1: ['\u0627', '\u064a', '\u0648'], - 2: ['\u0627', '\u062a', '\u0645'], - 3: ['\u0645', '\u064a', '\u062a'], - 4: ['\u0645', '\u062a'], - 5: ['\u0627', '\u0648'], - 6: ['\u0627', '\u0645']} # Groups of length five patterns and length three roots - - self.re_short_vowels = re.compile('[\u064B-\u0652]') - self.re_hamza = re.compile('[\u0621\u0624\u0626]') - self.re_intial_hamza = re.compile('^[\u0622\u0623\u0625]') - - self.stop_words = ['\u064a\u0643\u0648\u0646', - '\u0648\u0644\u064a\u0633', - '\u0648\u0643\u0627\u0646', - '\u0643\u0630\u0644\u0643', - '\u0627\u0644\u062a\u064a', - '\u0648\u0628\u064a\u0646', - '\u0639\u0644\u064a\u0647\u0627', - '\u0645\u0633\u0627\u0621', - '\u0627\u0644\u0630\u064a', - '\u0648\u0643\u0627\u0646\u062a', - '\u0648\u0644\u0643\u0646', - '\u0648\u0627\u0644\u062a\u064a', - '\u062a\u0643\u0648\u0646', - '\u0627\u0644\u064a\u0648\u0645', - '\u0627\u0644\u0644\u0630\u064a\u0646', - '\u0639\u0644\u064a\u0647', - '\u0643\u0627\u0646\u062a', - 
'\u0644\u0630\u0644\u0643', - '\u0623\u0645\u0627\u0645', - '\u0647\u0646\u0627\u0643', - '\u0645\u0646\u0647\u0627', - '\u0645\u0627\u0632\u0627\u0644', - '\u0644\u0627\u0632\u0627\u0644', - '\u0644\u0627\u064a\u0632\u0627\u0644', - '\u0645\u0627\u064a\u0632\u0627\u0644', - '\u0627\u0635\u0628\u062d', - '\u0623\u0635\u0628\u062d', - '\u0623\u0645\u0633\u0649', - '\u0627\u0645\u0633\u0649', - '\u0623\u0636\u062d\u0649', - '\u0627\u0636\u062d\u0649', - '\u0645\u0627\u0628\u0631\u062d', - '\u0645\u0627\u0641\u062a\u0626', - '\u0645\u0627\u0627\u0646\u0641\u0643', - '\u0644\u0627\u0633\u064a\u0645\u0627', - '\u0648\u0644\u0627\u064a\u0632\u0627\u0644', - '\u0627\u0644\u062d\u0627\u0644\u064a', - '\u0627\u0644\u064a\u0647\u0627', - '\u0627\u0644\u0630\u064a\u0646', - '\u0641\u0627\u0646\u0647', - '\u0648\u0627\u0644\u0630\u064a', - '\u0648\u0647\u0630\u0627', - '\u0644\u0647\u0630\u0627', - '\u0641\u0643\u0627\u0646', - '\u0633\u062a\u0643\u0648\u0646', - '\u0627\u0644\u064a\u0647', - '\u064a\u0645\u0643\u0646', - '\u0628\u0647\u0630\u0627', - '\u0627\u0644\u0630\u0649'] - + self.stm = "defult none" + + self.p3 = [ + "\u0643\u0627\u0644", + "\u0628\u0627\u0644", + "\u0648\u0644\u0644", + "\u0648\u0627\u0644", + ] # length three prefixes + self.p2 = ["\u0627\u0644", "\u0644\u0644"] # length two prefixes + self.p1 = [ + "\u0644", + "\u0628", + "\u0641", + "\u0633", + "\u0648", + "\u064a", + "\u062a", + "\u0646", + "\u0627", + ] # length one prefixes + + self.s3 = [ + "\u062a\u0645\u0644", + "\u0647\u0645\u0644", + "\u062a\u0627\u0646", + "\u062a\u064a\u0646", + "\u0643\u0645\u0644", + ] # length three suffixes + self.s2 = [ + "\u0648\u0646", + "\u0627\u062a", + "\u0627\u0646", + "\u064a\u0646", + "\u062a\u0646", + "\u0643\u0645", + "\u0647\u0646", + "\u0646\u0627", + "\u064a\u0627", + "\u0647\u0627", + "\u062a\u0645", + "\u0643\u0646", + "\u0646\u064a", + "\u0648\u0627", + "\u0645\u0627", + "\u0647\u0645", + ] # length two suffixes + self.s1 = [ + "\u0629", + "\u0647", + "\u064a", + "\u0643", + "\u062a", + "\u0627", + "\u0646", + ] # length one suffixes + + self.pr4 = { + 0: ["\u0645"], + 1: ["\u0627"], + 2: ["\u0627", "\u0648", "\u064A"], + 3: ["\u0629"], + } # groups of length four patterns + self.pr53 = { + 0: ["\u0627", "\u062a"], + 1: ["\u0627", "\u064a", "\u0648"], + 2: ["\u0627", "\u062a", "\u0645"], + 3: ["\u0645", "\u064a", "\u062a"], + 4: ["\u0645", "\u062a"], + 5: ["\u0627", "\u0648"], + 6: ["\u0627", "\u0645"], + } # Groups of length five patterns and length three roots + + self.re_short_vowels = re.compile("[\u064B-\u0652]") + self.re_hamza = re.compile("[\u0621\u0624\u0626]") + self.re_intial_hamza = re.compile("^[\u0622\u0623\u0625]") + + self.stop_words = [ + "\u064a\u0643\u0648\u0646", + "\u0648\u0644\u064a\u0633", + "\u0648\u0643\u0627\u0646", + "\u0643\u0630\u0644\u0643", + "\u0627\u0644\u062a\u064a", + "\u0648\u0628\u064a\u0646", + "\u0639\u0644\u064a\u0647\u0627", + "\u0645\u0633\u0627\u0621", + "\u0627\u0644\u0630\u064a", + "\u0648\u0643\u0627\u0646\u062a", + "\u0648\u0644\u0643\u0646", + "\u0648\u0627\u0644\u062a\u064a", + "\u062a\u0643\u0648\u0646", + "\u0627\u0644\u064a\u0648\u0645", + "\u0627\u0644\u0644\u0630\u064a\u0646", + "\u0639\u0644\u064a\u0647", + "\u0643\u0627\u0646\u062a", + "\u0644\u0630\u0644\u0643", + "\u0623\u0645\u0627\u0645", + "\u0647\u0646\u0627\u0643", + "\u0645\u0646\u0647\u0627", + "\u0645\u0627\u0632\u0627\u0644", + "\u0644\u0627\u0632\u0627\u0644", + "\u0644\u0627\u064a\u0632\u0627\u0644", + "\u0645\u0627\u064a\u0632\u0627\u0644", + 
"\u0627\u0635\u0628\u062d", + "\u0623\u0635\u0628\u062d", + "\u0623\u0645\u0633\u0649", + "\u0627\u0645\u0633\u0649", + "\u0623\u0636\u062d\u0649", + "\u0627\u0636\u062d\u0649", + "\u0645\u0627\u0628\u0631\u062d", + "\u0645\u0627\u0641\u062a\u0626", + "\u0645\u0627\u0627\u0646\u0641\u0643", + "\u0644\u0627\u0633\u064a\u0645\u0627", + "\u0648\u0644\u0627\u064a\u0632\u0627\u0644", + "\u0627\u0644\u062d\u0627\u0644\u064a", + "\u0627\u0644\u064a\u0647\u0627", + "\u0627\u0644\u0630\u064a\u0646", + "\u0641\u0627\u0646\u0647", + "\u0648\u0627\u0644\u0630\u064a", + "\u0648\u0647\u0630\u0627", + "\u0644\u0647\u0630\u0627", + "\u0641\u0643\u0627\u0646", + "\u0633\u062a\u0643\u0648\u0646", + "\u0627\u0644\u064a\u0647", + "\u064a\u0645\u0643\u0646", + "\u0628\u0647\u0630\u0627", + "\u0627\u0644\u0630\u0649", + ] def stem(self, token): """ @@ -145,26 +187,28 @@ def stem(self, token): """ self.stm = token - self.norm(1) # remove diacritics which representing Arabic short vowels - if self.stm in self.stop_words: return self.stm # exclude stop words from being processed - self.pre32() # remove length three and length two prefixes in this order - self.suf32() # remove length three and length two suffixes in this order - self.waw() # remove connective ‘و’ if it precedes a word beginning with ‘و’ - self.norm(2) # normalize initial hamza to bare alif - if len(self.stm) <= 3: return self.stm # return stem if less than or equal to three - - if len(self.stm) == 4: # length 4 word + self.norm(1) # remove diacritics which representing Arabic short vowels + if self.stm in self.stop_words: + return self.stm # exclude stop words from being processed + self.pre32() # remove length three and length two prefixes in this order + self.suf32() # remove length three and length two suffixes in this order + self.waw() # remove connective ‘و’ if it precedes a word beginning with ‘و’ + self.norm(2) # normalize initial hamza to bare alif + if len(self.stm) <= 3: + return self.stm # return stem if less than or equal to three + + if len(self.stm) == 4: # length 4 word self.pro_w4() return self.stm - elif len(self.stm) == 5: # length 5 word + elif len(self.stm) == 5: # length 5 word self.pro_w53() self.end_w5() return self.stm - elif len(self.stm) == 6: # length 6 word + elif len(self.stm) == 6: # length 6 word self.pro_w6() self.end_w6() return self.stm - elif len(self.stm) == 7: # length 7 word + elif len(self.stm) == 7: # length 7 word self.suf1() if len(self.stm) == 7: self.pre1() @@ -172,7 +216,7 @@ def stem(self, token): self.pro_w6() self.end_w6() return self.stm - return self.stm # if word length >7 , then no stemming + return self.stm # if word length >7 , then no stemming def norm(self, num): """ @@ -184,14 +228,14 @@ def norm(self, num): self.k = num if self.k == 1: - self.stm = self.re_short_vowels.sub('', self.stm) + self.stm = self.re_short_vowels.sub("", self.stm) return self.stm elif self.k == 2: - self.stm = self.re_intial_hamza.sub('\u0627', self.stm) + self.stm = self.re_intial_hamza.sub("\u0627", self.stm) return self.stm elif self.k == 3: - self.stm = self.re_short_vowels.sub('', self.stm) - self.stm = self.re_intial_hamza.sub('\u0627', self.stm) + self.stm = self.re_short_vowels.sub("", self.stm) + self.stm = self.re_intial_hamza.sub("\u0627", self.stm) return self.stm def pre32(self): @@ -220,95 +264,108 @@ def suf32(self): self.stm = self.stm[:-2] return self.stm - def waw(self): - """remove connective ‘و’ if it precedes a word beginning with ‘و’ """ - if (len(self.stm) >= 4) & (self.stm[:2] == 
'\u0648\u0648'): + """remove connective ‘و’ if it precedes a word beginning with ‘و’""" + if (len(self.stm) >= 4) & (self.stm[:2] == "\u0648\u0648"): self.stm = self.stm[1:] return self.stm def pro_w4(self): """process length four patterns and extract length three roots""" - if self.stm[0] in self.pr4[0]: # مفعل + if self.stm[0] in self.pr4[0]: # مفعل self.stm = self.stm[1:] return self.stm - elif self.stm[1] in self.pr4[1]: # فاعل + elif self.stm[1] in self.pr4[1]: # فاعل self.stm = self.stm[0] + self.stm[2:] return self.stm - elif self.stm[2] in self.pr4[2]: # فعال - فعول - فعيل + elif self.stm[2] in self.pr4[2]: # فعال - فعول - فعيل self.stm = self.stm[:2] + self.stm[3] return self.stm - elif self.stm[3] in self.pr4[3]: # فعلة + elif self.stm[3] in self.pr4[3]: # فعلة self.stm = self.stm[:-1] return self.stm else: - self.suf1() # do - normalize short sufix + self.suf1() # do - normalize short sufix if len(self.stm) == 4: - self.pre1() # do - normalize short prefix + self.pre1() # do - normalize short prefix return self.stm def pro_w53(self): """process length five patterns and extract length three roots""" - if ((self.stm[2] in self.pr53[0]) & (self.stm[0] == '\u0627')): # افتعل - افاعل + if (self.stm[2] in self.pr53[0]) & ( + self.stm[0] == "\u0627" + ): # افتعل - افاعل self.stm = self.stm[1] + self.stm[3:] return self.stm - elif ((self.stm[3] in self.pr53[1]) & (self.stm[0] == '\u0645')): # مفعول - مفعال - مفعيل + elif (self.stm[3] in self.pr53[1]) & ( + self.stm[0] == "\u0645" + ): # مفعول - مفعال - مفعيل self.stm = self.stm[1:3] + self.stm[4] return self.stm - elif ((self.stm[0] in self.pr53[2]) & (self.stm[4] == '\u0629')): # مفعلة - تفعلة - افعلة + elif (self.stm[0] in self.pr53[2]) & ( + self.stm[4] == "\u0629" + ): # مفعلة - تفعلة - افعلة self.stm = self.stm[1:4] return self.stm - elif ((self.stm[0] in self.pr53[3]) & (self.stm[2] == '\u062a')): # مفتعل - يفتعل - تفتعل + elif (self.stm[0] in self.pr53[3]) & ( + self.stm[2] == "\u062a" + ): # مفتعل - يفتعل - تفتعل self.stm = self.stm[1] + self.stm[3:] return self.stm - elif ((self.stm[0] in self.pr53[4]) & (self.stm[2] == '\u0627')): #مفاعل - تفاعل + elif (self.stm[0] in self.pr53[4]) & ( + self.stm[2] == "\u0627" + ): # مفاعل - تفاعل self.stm = self.stm[1] + self.stm[3:] return self.stm - elif ((self.stm[2] in self.pr53[5]) & (self.stm[4] == '\u0629')): # فعولة - فعالة + elif (self.stm[2] in self.pr53[5]) & ( + self.stm[4] == "\u0629" + ): # فعولة - فعالة self.stm = self.stm[:2] + self.stm[3] return self.stm - elif ((self.stm[0] in self.pr53[6]) & (self.stm[1] == '\u0646')): # انفعل - منفعل + elif (self.stm[0] in self.pr53[6]) & ( + self.stm[1] == "\u0646" + ): # انفعل - منفعل self.stm = self.stm[2:] return self.stm - elif ((self.stm[3] == '\u0627') & (self.stm[0] == '\u0627')): # افعال + elif (self.stm[3] == "\u0627") & (self.stm[0] == "\u0627"): # افعال self.stm = self.stm[1:3] + self.stm[4] return self.stm - elif ((self.stm[4] == '\u0646') & (self.stm[3] == '\u0627')): # فعلان + elif (self.stm[4] == "\u0646") & (self.stm[3] == "\u0627"): # فعلان self.stm = self.stm[:3] return self.stm - elif ((self.stm[3] == '\u064a') & (self.stm[0] == '\u062a')): # تفعيل + elif (self.stm[3] == "\u064a") & (self.stm[0] == "\u062a"): # تفعيل self.stm = self.stm[1:3] + self.stm[4] return self.stm - elif ((self.stm[3] == '\u0648') & (self.stm[1] == '\u0627')): # فاعول + elif (self.stm[3] == "\u0648") & (self.stm[1] == "\u0627"): # فاعول self.stm = self.stm[0] + self.stm[2] + self.stm[4] return self.stm - elif ((self.stm[2] == '\u0627') & 
(self.stm[1] == '\u0648')): # فواعل + elif (self.stm[2] == "\u0627") & (self.stm[1] == "\u0648"): # فواعل self.stm = self.stm[0] + self.stm[3:] return self.stm - elif ((self.stm[3] == '\u0626') & (self.stm[2] == '\u0627')): # فعائل + elif (self.stm[3] == "\u0626") & (self.stm[2] == "\u0627"): # فعائل self.stm = self.stm[:2] + self.stm[4] return self.stm - elif ((self.stm[4] == '\u0629') & (self.stm[1] == '\u0627')): # فاعلة + elif (self.stm[4] == "\u0629") & (self.stm[1] == "\u0627"): # فاعلة self.stm = self.stm[0] + self.stm[2:4] return self.stm - elif ((self.stm[4] == '\u064a') & (self.stm[2] == '\u0627')): # فعالي + elif (self.stm[4] == "\u064a") & (self.stm[2] == "\u0627"): # فعالي self.stm = self.stm[:2] + self.stm[3] return self.stm else: - self.suf1() # do - normalize short sufix + self.suf1() # do - normalize short sufix if len(self.stm) == 5: - self.pre1() # do - normalize short prefix + self.pre1() # do - normalize short prefix return self.stm def pro_w54(self): """process length five patterns and extract length four roots""" - if (self.stm[0] in self.pr53[2]): #تفعلل - افعلل - مفعلل + if self.stm[0] in self.pr53[2]: # تفعلل - افعلل - مفعلل self.stm = self.stm[1:] return self.stm - elif (self.stm[4] == '\u0629'): # فعللة + elif self.stm[4] == "\u0629": # فعللة self.stm = self.stm[:4] return self.stm - elif (self.stm[2] == '\u0627'): # فعالل + elif self.stm[2] == "\u0627": # فعالل self.stm = self.stm[:2] + self.stm[3:] return self.stm @@ -325,33 +382,51 @@ def end_w5(self): def pro_w6(self): """process length six patterns and extract length three roots""" - if ((self.stm.startswith('\u0627\u0633\u062a')) or (self.stm.startswith('\u0645\u0633\u062a'))): # مستفعل - استفعل + if (self.stm.startswith("\u0627\u0633\u062a")) or ( + self.stm.startswith("\u0645\u0633\u062a") + ): # مستفعل - استفعل self.stm = self.stm[3:] return self.stm - elif (self.stm[0] == '\u0645' and self.stm[3] == '\u0627' and self.stm[5] == '\u0629'): # مفعالة + elif ( + self.stm[0] == "\u0645" + and self.stm[3] == "\u0627" + and self.stm[5] == "\u0629" + ): # مفعالة self.stm = self.stm[1:3] + self.stm[4] return self.stm - elif (self.stm[0] == '\u0627' and self.stm[2] == '\u062a' and self.stm[4] == '\u0627'): # افتعال + elif ( + self.stm[0] == "\u0627" + and self.stm[2] == "\u062a" + and self.stm[4] == "\u0627" + ): # افتعال self.stm = self.stm[1] + self.stm[3] + self.stm[5] return self.stm - elif (self.stm[0] == '\u0627' and self.stm[3] == '\u0648' and self.stm[2] == self.stm[4]): # افعوعل + elif ( + self.stm[0] == "\u0627" + and self.stm[3] == "\u0648" + and self.stm[2] == self.stm[4] + ): # افعوعل self.stm = self.stm[1] + self.stm[4:] return self.stm - elif (self.stm[0] == '\u062a' and self.stm[2] == '\u0627' and self.stm[4] == '\u064a'): # تفاعيل new pattern + elif ( + self.stm[0] == "\u062a" + and self.stm[2] == "\u0627" + and self.stm[4] == "\u064a" + ): # تفاعيل new pattern self.stm = self.stm[1] + self.stm[3] + self.stm[5] return self.stm else: - self.suf1() # do - normalize short sufix + self.suf1() # do - normalize short sufix if len(self.stm) == 6: - self.pre1() # do - normalize short prefix + self.pre1() # do - normalize short prefix return self.stm def pro_w64(self): """process length six patterns and extract length four roots""" - if (self.stm[0] and self.stm[4]) == '\u0627': # افعلال + if (self.stm[0] and self.stm[4]) == "\u0627": # افعلال self.stm = self.stm[1:4] + self.stm[5] return self.stm - elif (self.stm.startswith('\u0645\u062a')): # متفعلل + elif self.stm.startswith("\u0645\u062a"): # متفعلل 
self.stm = self.stm[2:] return self.stm @@ -363,7 +438,7 @@ def end_w6(self): self.pro_w53() self.end_w5() return self.stm - elif len (self.stm) == 6: + elif len(self.stm) == 6: self.pro_w64() return self.stm diff --git a/src/whoosh/lang/lovins.py b/src/whoosh/lang/lovins.py index 1e5a933a..e3b114ba 100644 --- a/src/whoosh/lang/lovins.py +++ b/src/whoosh/lang/lovins.py @@ -6,9 +6,9 @@ from collections import defaultdict - # Conditions + def A(base): # A No restrictions on stem return True @@ -159,14 +159,22 @@ def a(base): # a Remove ending only after d, f, ph, th, l, er, or, es or t c = base[-1] l2 = base[-2:] - return (c == "d" or c == "f" or l2 == "ph" or l2 == "th" or c == "l" - or l2 == "er" or l2 == "or" or l2 == "es" or c == "t") + return ( + c == "d" + or c == "f" + or l2 == "ph" + or l2 == "th" + or c == "l" + or l2 == "er" + or l2 == "or" + or l2 == "es" + or c == "t" + ) def b(base): # b Minimum stem length = 3 and do not remove ending after met or ryst - return len(base) > 2 and not (base.endswith("met") - or base.endswith("ryst")) + return len(base) > 2 and not (base.endswith("met") or base.endswith("ryst")) def c(base): @@ -178,16 +186,12 @@ def c(base): m = [None] * 12 -m[11] = dict(( - ("alistically", B), - ("arizability", A), - ("izationally", B))) -m[10] = dict(( - ("antialness", A), - ("arisations", A), - ("arizations", A), - ("entialness", A))) -m[9] = dict(( +m[11] = dict((("alistically", B), ("arizability", A), ("izationally", B))) +m[10] = dict( + (("antialness", A), ("arisations", A), ("arizations", A), ("entialness", A)) +) +m[9] = dict( + ( ("allically", C), ("antaneous", A), ("antiality", A), @@ -204,8 +208,11 @@ def c(base): ("istically", A), ("itousness", A), ("izability", A), - ("izational", A))) -m[8] = dict(( + ("izational", A), + ) +) +m[8] = dict( + ( ("ableness", A), ("arizable", A), ("entation", A), @@ -218,8 +225,11 @@ def c(base): ("ionalize", A), ("iousness", A), ("izations", A), - ("lessness", A))) -m[7] = dict(( + ("lessness", A), + ) +) +m[7] = dict( + ( ("ability", A), ("aically", A), ("alistic", B), @@ -259,8 +269,11 @@ def c(base): ("ization", F), ("izement", A), ("oidally", A), - ("ousness", A))) -m[6] = dict(( + ("ousness", A), + ) +) +m[6] = dict( + ( ("aceous", A), ("acious", B), ("action", G), @@ -299,8 +312,11 @@ def c(base): ("izable", E), ("lessly", A), ("nesses", A), - ("oidism", A))) -m[5] = dict(( + ("oidism", A), + ) +) +m[5] = dict( + ( ("acies", A), ("acity", A), ("aging", B), @@ -367,8 +383,11 @@ def c(base): ("oidal", A), ("oides", A), ("otide", A), - ("ously", A))) -m[4] = dict(( + ("ously", A), + ) +) +m[4] = dict( + ( ("able", A), ("ably", A), ("ages", B), @@ -416,8 +435,11 @@ def c(base): ("ward", A), ("wise", A), ("ying", B), - ("yish", A))) -m[3] = dict(( + ("yish", A), + ) +) +m[3] = dict( + ( ("acy", A), ("age", B), ("aic", A), @@ -456,8 +478,11 @@ def c(base): ("ize", F), ("oid", A), ("one", R), - ("ous", A))) -m[2] = dict(( + ("ous", A), + ) +) +m[2] = dict( + ( ("ae", A), ("al", b), ("ar", X), @@ -475,14 +500,10 @@ def c(base): ("us", V), ("yl", R), ("s'", A), - ("'s", A))) -m[1] = dict(( - ("a", A), - ("e", A), - ("i", A), - ("o", A), - ("s", W), - ("y", B))) + ("'s", A), + ) +) +m[1] = dict((("a", A), ("e", A), ("i", A), ("o", A), ("s", W), ("y", B))) def remove_ending(word): @@ -490,52 +511,54 @@ def remove_ending(word): el = 11 while el > 0: if length - el > 1: - ending = word[length - el:] + ending = word[length - el :] cond = m[el].get(ending) if cond: - base = word[:length - el] + base = word[: length - el] if 
cond(base): return base el -= 1 return word -_endings = (("iev", "ief"), - ("uct", "uc"), - ("iev", "ief"), - ("uct", "uc"), - ("umpt", "um"), - ("rpt", "rb"), - ("urs", "ur"), - ("istr", "ister"), - ("metr", "meter"), - ("olv", "olut"), - ("ul", "l", "aoi"), - ("bex", "bic"), - ("dex", "dic"), - ("pex", "pic"), - ("tex", "tic"), - ("ax", "ac"), - ("ex", "ec"), - ("ix", "ic"), - ("lux", "luc"), - ("uad", "uas"), - ("vad", "vas"), - ("cid", "cis"), - ("lid", "lis"), - ("erid", "eris"), - ("pand", "pans"), - ("end", "ens", "s"), - ("ond", "ons"), - ("lud", "lus"), - ("rud", "rus"), - ("her", "hes", "pt"), - ("mit", "mis"), - ("ent", "ens", "m"), - ("ert", "ers"), - ("et", "es", "n"), - ("yt", "ys"), - ("yz", "ys")) +_endings = ( + ("iev", "ief"), + ("uct", "uc"), + ("iev", "ief"), + ("uct", "uc"), + ("umpt", "um"), + ("rpt", "rb"), + ("urs", "ur"), + ("istr", "ister"), + ("metr", "meter"), + ("olv", "olut"), + ("ul", "l", "aoi"), + ("bex", "bic"), + ("dex", "dic"), + ("pex", "pic"), + ("tex", "tic"), + ("ax", "ac"), + ("ex", "ec"), + ("ix", "ic"), + ("lux", "luc"), + ("uad", "uas"), + ("vad", "vas"), + ("cid", "cis"), + ("lid", "lis"), + ("erid", "eris"), + ("pand", "pans"), + ("end", "ens", "s"), + ("ond", "ons"), + ("lud", "lus"), + ("rud", "rus"), + ("her", "hes", "pt"), + ("mit", "mis"), + ("ent", "ens", "m"), + ("ert", "ers"), + ("et", "es", "n"), + ("yt", "ys"), + ("yz", "ys"), +) # Hash the ending rules by the last letter of the target ending @@ -559,12 +582,11 @@ def fix_ending(word): if c in exceptafter: return word - return word[:0 - len(target)] + newend + return word[: 0 - len(target)] + newend return word def stem(word): - """Returns the stemmed version of the argument string. - """ + """Returns the stemmed version of the argument string.""" return fix_ending(remove_ending(word)) diff --git a/src/whoosh/lang/morph_en.py b/src/whoosh/lang/morph_en.py index 359baa24..14dd4c97 100644 --- a/src/whoosh/lang/morph_en.py +++ b/src/whoosh/lang/morph_en.py @@ -8,7 +8,7 @@ class of Sun's `Minion search engine `_. import re -from whoosh.compat import range, iteritems +from whoosh.compat import iteritems # Rule exceptions @@ -611,7 +611,7 @@ class of Sun's `Minion search engine `_. # Words ending in S # (e.g., happiness, business) ( - r"[%s].*[%s](iness)" % (vowels, cons), + rf"[{vowels}].*[{cons}](iness)", "y,ies,ier,iers,iest,ied,ying,yings,ily,inesses,iment,iments,iless,iful", ), # (e.g., baseless, shoeless) @@ -621,7 +621,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., gutless, hatless, spotless) ( - r"[%s][%s][bdgklmnprt]?(less)" % (cons, vowels), + rf"[{cons}][{vowels}][bdgklmnprt]?(less)", ",s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,ful", ), # (e.g., thoughtless, worthless) @@ -661,12 +661,12 @@ class of Sun's `Minion search engine `_. ), # (e.g., tokenizes) // adds British variations ( - r"[%s].*[%s](izes)" % (vowels, cons), + rf"[{vowels}].*[{cons}](izes)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ise,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tokenises) // British variant // ~expertise ( - r"[%s].*[%s](ises)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ises)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ise,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., aches, arches) @@ -688,15 +688,15 @@ class of Sun's `Minion search engine `_. 
# (e.g., judgments, abridgments) (r"[%s].*dg(ments)" % vowels, "ment,*ments"), # (e.g., merriments, embodiments) -iment in turn will generate y and *y (redo y) - (r"[%s].*[%s]iment(s)" % (vowels, cons), ",*"), + (rf"[{vowels}].*[{cons}]iment(s)", ",*"), # (e.g., atonements, entrapments) (r"[%s].*ment(s)" % vowels, ",*"), # (e.g., viewers, meters, traders, transfers) (r"[%s].*er(s)" % vowels, ",*"), # (e.g., unflags) polysyllables - (r"[%s].*[%s][%s][bdglmnprt](s)" % (vowels, cons, vowels), ",*"), + (rf"[{vowels}].*[{cons}][{vowels}][bdglmnprt](s)", ",*"), # (e.g., frogs) monosyllables - (r"[%s][%s][bdglmnprt](s)" % (vowels, cons), ",*"), + (rf"[{vowels}][{cons}][bdglmnprt](s)", ",*"), # (e.g., killings, muggings) (r"[%s].*ing(s)" % vowels, ",*"), # (e.g., hulls, tolls) @@ -704,16 +704,16 @@ class of Sun's `Minion search engine `_. # e.g., boas, polkas, spas) don't generate latin endings (r"a(s)", ",er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., beads, toads) - (r"[%s].*[%s].*(s)" % (vowels, cons), ",*"), + (rf"[{vowels}].*[{cons}].*(s)", ",*"), # (e.g., boas, zoos) ( - r"[%s].*[%s](s)" % (cons, vowels), + rf"[{cons}].*[{vowels}](s)", ",er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful", ), # (e.g., ss, sss, ssss) no vowel (vowel case is already handled above) (r"ss()", ""), # (e.g., cds, lcds, m-16s) no vowel (can be a plural noun, but not verb) - (r"[%s].*[%s1234567890](s)" % (cons, cons), ""), + (rf"[{cons}].*[{cons}1234567890](s)", ""), # Words ending in E # (e.g., apple, so it doesn't include apply) (r"appl(e)", "es,er,ers,est,ed,ing,ings,ely,eness,enesses,ement,ements,eless,eful"), @@ -724,7 +724,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., able, abominable, fungible, table, enable, idle, subtle) ( - r"[%s].*[%s]l(e)" % (vowels, cons), + rf"[{vowels}].*[{cons}]l(e)", "es,er,ers,est,ed,ing,ings,y,ely,eness,enesses,ement,ements,eless,eful", ), # (e.g., bookie, magpie, vie) @@ -746,17 +746,17 @@ class of Sun's `Minion search engine `_. ), # (e.g., tokenize) // adds British variations ( - r"[%s].*[%s](ize)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ize)", "izes,izer,izers,ized,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tokenise) // British variant // ~expertise ( - r"[%s].*[%s](ise)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ise)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tree, agree, rage, horse, hoarse) ( - r"[%s].*[%s](e)" % (vowels, cons), + rf"[{vowels}].*[{cons}](e)", "es,er,ers,est,ed,ing,ings,eing,eings,ely,eness,enesses,ement,ements,eless,eful", ), # Words ending in -ED @@ -774,7 +774,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., controlled, fulfilled, rebelled) ( - r"[%s].*[%s].*l(led)" % (vowels, cons), + rf"[{vowels}].*[{cons}].*l(led)", ",s,er,ers,est,ing,ings,ly,ness,nesses,ment,ments,less,ful,&,&s,&er,&ers,&est,&ing,&ings,&y,&ness,&nesses,&ment,&ments,&ful", ), # (e.g., pulled, filled, fulled) @@ -794,12 +794,12 @@ class of Sun's `Minion search engine `_. 
), # (e.g., tokenize) // adds British variations ( - r"[%s].*[%s](ized)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ized)", "izes,izer,izers,ize,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tokenise) // British variant // ~expertise ( - r"[%s].*[%s](ized)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ized)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ise,ising,isings,isation,isations", ), # (e.g., spoiled, tooled, tracked, roasted, atoned, abridged) @@ -819,7 +819,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., acidifier, saltier) ( - r"[%s].*[%s](ier)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ier)", "y,ie,ies,iest,ied,ying,yings,ily,yly,iness,yness,inesses,ynesses,yment,yments,yless,yful,iment,iments,iless,iful,iers,iered,iering,ierings,ierly,ierness,iernesses,ierment,ierments,ierless,ierful,ierer,ierers,ierest", ), # (e.g., puller, filler, fuller) @@ -834,17 +834,17 @@ class of Sun's `Minion search engine `_. ), # (e.g., bigger, trekker, hitter) ( - r"[%s][%s](?P[bdgkmnprt])((?P=er1)er)" % (cons, vowels), + rf"[{cons}][{vowels}](?P[bdgkmnprt])((?P=er1)er)", "s,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful,&ers,&ered,&ering,&erings,&erly,&erness,&ernesses,&erments,&erless,&erful", ), # (e.g., tokenize) // adds British variations ( - r"[%s].*[%s](izer)" % (vowels, cons), + rf"[{vowels}].*[{cons}](izer)", "izes,ize,izers,ized,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tokenise) // British variant // ~expertise ( - r"[%s].*[%s](iser)" % (vowels, cons), + rf"[{vowels}].*[{cons}](iser)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,ise,isers,ised,ising,isings,isation,isations", ), # (e.g., actioner, atoner, icer, trader, accruer, churchgoer, prefer) @@ -870,7 +870,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., biggest) ( - r"[%s][%s](?P[bdglmnprst])((?P=est1)est)" % (cons, vowels), + rf"[{cons}][{vowels}](?P[bdglmnprst])((?P=est1)est)", ",s,&er,&ers,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful", ), # (e.g., basest, archest, rashest) @@ -892,7 +892,7 @@ class of Sun's `Minion search engine `_. (r"est", "s,er,ers,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # Words ending in -FUL # (e.g., beautiful, plentiful) - (r"[%s].*[%s](iful)" % (vowels, cons), "ifully,ifulness,*y"), + (rf"[{vowels}].*[{cons}](iful)", "ifully,ifulness,*y"), # (e.g., hopeful, sorrowful) (r"[%s].*(ful)" % vowels, "fully,fulness,,*"), # Words ending in -ICAL @@ -917,7 +917,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., hugging, trekking) ( - r"[%s][%s](?P[bdgklmnprt])((?P=ing1)ing)" % (cons, vowels), + rf"[{cons}][{vowels}](?P[bdgklmnprt])((?P=ing1)ing)", ",s,&er,&ers,&est,&ed,&ings,ly,ness,nesses,ment,ments,less,ful", ), # (e.g., freeing, agreeing) @@ -937,22 +937,22 @@ class of Sun's `Minion search engine `_. 
), # (e.g., editing, crediting, expediting, siting, exciting) ( - r"[%s].*[%s][eio]t(ing)" % (vowels, cons), + rf"[{vowels}].*[{cons}][eio]t(ing)", ",*,*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful", ), # (e.g., robing, siding, doling, translating, flaking) ( - r"[%s][%s][bdgklmt](ing)" % (cons, vowels), + rf"[{cons}][{vowels}][bdgklmt](ing)", "*e,ings,inger,ingers,ingest,inged,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful", ), # (e.g., tokenize) // adds British variations ( - r"[%s].*[%s](izing)" % (vowels, cons), + rf"[{vowels}].*[{cons}](izing)", "izes,izer,izers,ized,ize,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations", ), # (e.g., tokenise) // British variant // ~expertise ( - r"[%s].*[%s](ising)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ising)", "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ised,ise,isings,isation,isations", ), # (e.g., icing, aging, achieving, amazing, housing) @@ -972,7 +972,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., farming, harping, interesting, bedspring, redwing) ( - r"[%s].*[%s][bdfjkmnpqrtwxz](ing)" % (vowels, cons), + rf"[{vowels}].*[{cons}][bdfjkmnpqrtwxz](ing)", ",*,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful", ), # (e.g., spoiling, reviling, autoing, egging, hanging, hingeing) @@ -998,9 +998,9 @@ class of Sun's `Minion search engine `_. # (e.g., judgment, abridgment) (r"[%s].*dg(ment)" % vowels, "*e"), # (e.g., merriment, embodiment) - (r"[%s].*[%s](iment)" % (vowels, cons), "*y"), + (rf"[{vowels}].*[{cons}](iment)", "*y"), # (e.g., atonement, entrapment) - (r"[%s].*[%s](ment)" % (vowels, cons), ",*"), + (rf"[{vowels}].*[{cons}](ment)", ",*"), # Words ending in -O # (e.g., taboo, rodeo) ( @@ -1026,7 +1026,7 @@ class of Sun's `Minion search engine `_. ), # (e.g., happily, dizzily) ( - r"[%s].*[%s](ily)" % (vowels, cons), + rf"[{vowels}].*[{cons}](ily)", "y,ies,ier,iers,iest,ied,ying,yings,yness,iness,ynesses,inesses,iment,iments,iless,iful", ), # (e.g., peaceful+ly) @@ -1064,24 +1064,24 @@ class of Sun's `Minion search engine `_. ), # (e.g., unflag, open, besot) ( - r"[%s].*[%s][%s][bdglmnprt]()" % (vowels, cons, vowels), + rf"[{vowels}].*[{cons}][{vowels}][bdglmnprt]()", "s,er,ers,est,ed,ing,ings,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful", ), # (e.g., bed, cop) ( - r"[%s][%s][bdglmnprt]()" % (cons, vowels), + rf"[{cons}][{vowels}][bdglmnprt]()", "s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful", ), # (e.g., schemata, automata) ( - r"[%s].*[%s][%s]ma(ta)" % (vowels, cons, vowels), + rf"[{vowels}].*[{cons}][{vowels}]ma(ta)", ",s,tas,tum,tums,ton,tons,tic,tical", ), # (e.g., chordata, data, errata, sonata, toccata) (r"[%s].*t(a)" % vowels, "as,ae,um,ums,on,ons,ic,ical"), # (e.g., polka, spa, schema, ova, polyhedra) ( - r"[%s].*[%s](a)" % (vowels, cons), + rf"[{vowels}].*[{cons}](a)", "as,aed,aing,ae,ata,um,ums,on,ons,al,atic,atical", ), # (e.g., full) @@ -1102,9 +1102,7 @@ class of Sun's `Minion search engine `_. 
for p in range(0, len(rules) // _partition_size + 1): start = p * _partition_size end = (p + 1) * _partition_size - pattern = "|".join( - "(?P<_g%s>%s)$" % (i, r[0]) for i, r in enumerate(rules[start:end]) - ) + pattern = "|".join(f"(?P<_g{i}>{r[0]})$" for i, r in enumerate(rules[start:end])) _partitions.append(re.compile(pattern)) @@ -1137,7 +1135,7 @@ def variations(word): ending = groups[-1] root = word[: 0 - len(ending)] if ending else word - out = set((word,)) + out = {word} results = rules[i * _partition_size + num][1] for result in results.split(","): if result.startswith("&"): diff --git a/src/whoosh/lang/paicehusk.py b/src/whoosh/lang/paicehusk.py index 6aee9066..1540318f 100644 --- a/src/whoosh/lang/paicehusk.py +++ b/src/whoosh/lang/paicehusk.py @@ -18,17 +18,19 @@ from collections import defaultdict -class PaiceHuskStemmer(object): - """Implements the Paice-Husk stemming algorithm. - """ +class PaiceHuskStemmer: + """Implements the Paice-Husk stemming algorithm.""" - rule_expr = re.compile(r""" + rule_expr = re.compile( + r""" ^(?P\w+) (?P[*]?) (?P\d+) (?P\w*) (?P[.>]) - """, re.UNICODE | re.VERBOSE) + """, + re.UNICODE | re.VERBOSE, + ) stem_expr = re.compile(r"^\w+", re.UNICODE) @@ -60,26 +62,33 @@ def read_rules(self, ruletable): rules[lastchar].append((ending, intact, num, append, cont)) else: - raise Exception("Bad rule: %r" % line) + raise Exception(f"Bad rule: {line!r}") def first_vowel(self, word): - vp = min([p for p in [word.find(v) for v in "aeiou"] - if p > -1]) + vp = min([p for p in [word.find(v) for v in "aeiou"] if p > -1]) yp = word.find("y") if yp > 0 and yp < vp: return yp return vp def strip_prefix(self, word): - for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega", - "nano", "pico", "pseudo"): + for prefix in ( + "kilo", + "micro", + "milli", + "intra", + "ultra", + "mega", + "nano", + "pico", + "pseudo", + ): if word.startswith(prefix): - return word[len(prefix):] + return word[len(prefix) :] return word def stem(self, word): - """Returns a stemmed version of the argument string. - """ + """Returns a stemmed version of the argument string.""" rules = self.rules match = self.stem_expr.match(word) @@ -102,21 +111,21 @@ def stem(self, word): continue newlen = len(stem) - num + len(append) - if ((pfv == 0 and newlen < 2) - or (pfv > 0 and newlen < 3)): + if (pfv == 0 and newlen < 2) or (pfv > 0 and newlen < 3): # If word starts with vowel, minimum stem length is 2. # If word starts with consonant, minimum stem length is # 3. - continue + continue is_intact = False - stem = stem[:0 - num] + append + stem = stem[: 0 - num] + append continuing = cont break return stem + # The default rules for the Paice-Husk stemming algorithm defaultrules = """ diff --git a/src/whoosh/lang/phonetic.py b/src/whoosh/lang/phonetic.py index 4a760ec7..fcec3c7c 100644 --- a/src/whoosh/lang/phonetic.py +++ b/src/whoosh/lang/phonetic.py @@ -1,5 +1,3 @@ -# encoding: utf-8 - """ This module contains quasi-phonetic encoders for words in different languages. 
""" diff --git a/src/whoosh/lang/porter.py b/src/whoosh/lang/porter.py index 65d169a9..15d06a57 100644 --- a/src/whoosh/lang/porter.py +++ b/src/whoosh/lang/porter.py @@ -12,38 +12,38 @@ # Suffix replacement lists _step2list = { - "ational": "ate", - "tional": "tion", - "enci": "ence", - "anci": "ance", - "izer": "ize", - "bli": "ble", - "alli": "al", - "entli": "ent", - "eli": "e", - "ousli": "ous", - "ization": "ize", - "ation": "ate", - "ator": "ate", - "alism": "al", - "iveness": "ive", - "fulness": "ful", - "ousness": "ous", - "aliti": "al", - "iviti": "ive", - "biliti": "ble", - "logi": "log", - } + "ational": "ate", + "tional": "tion", + "enci": "ence", + "anci": "ance", + "izer": "ize", + "bli": "ble", + "alli": "al", + "entli": "ent", + "eli": "e", + "ousli": "ous", + "ization": "ize", + "ation": "ate", + "ator": "ate", + "alism": "al", + "iveness": "ive", + "fulness": "ful", + "ousness": "ous", + "aliti": "al", + "iviti": "ive", + "biliti": "ble", + "logi": "log", +} _step3list = { - "icate": "ic", - "ative": "", - "alize": "al", - "iciti": "ic", - "ical": "ic", - "ful": "", - "ness": "", - } + "icate": "ic", + "ative": "", + "alize": "al", + "iciti": "ic", + "ical": "ic", + "ful": "", + "ness": "", +} _cons = "[^aeiou]" @@ -54,9 +54,13 @@ # m > 0 _mgr0 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq) # m == 0 -_meq1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + "(" + _vowel_seq + ")?$") +_meq1 = re.compile( + "^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + "(" + _vowel_seq + ")?$" +) # m > 1 -_mgr1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + _vowel_seq + _cons_seq) +_mgr1 = re.compile( + "^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + _vowel_seq + _cons_seq +) # vowel in stem _s_v = re.compile("^(" + _cons_seq + ")?" + _vowel) # ??? @@ -67,15 +71,20 @@ _ed_ing = re.compile("^(.*)(ed|ing)$") _at_bl_iz = re.compile("(at|bl|iz)$") _step1b = re.compile("([^aeiouylsz])\\1$") -_step2 = re.compile("^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$") +_step2 = re.compile( + "^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$" +) _step3 = re.compile("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$") -_step4_1 = re.compile("^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$") +_step4_1 = re.compile( + "^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$" +) _step4_2 = re.compile("^(.+?)(s|t)(ion)$") _step5 = re.compile("^(.+?)e$") # Stemming function + def stem(w): """Uses the Porter stemming algorithm to remove suffixes from English words. 
diff --git a/src/whoosh/lang/porter2.py b/src/whoosh/lang/porter2.py index 4d669752..896b7393 100644 --- a/src/whoosh/lang/porter2.py +++ b/src/whoosh/lang/porter2.py @@ -20,9 +20,9 @@ def get_r1(word): # exceptional forms - if word.startswith('gener') or word.startswith('arsen'): + if word.startswith("gener") or word.startswith("arsen"): return 5 - if word.startswith('commun'): + if word.startswith("commun"): return 6 # normal form @@ -62,9 +62,9 @@ def remove_initial_apostrophe(word): def capitalize_consonant_ys(word): - if word.startswith('y'): - word = 'Y' + word[1:] - return ccy_exp.sub(r'\g<1>Y', word) + if word.startswith("y"): + word = "Y" + word[1:] + return ccy_exp.sub(r"\g<1>Y", word) def step_0(word): @@ -78,16 +78,16 @@ def step_0(word): def step_1a(word): - if word.endswith('sses'): - return word[:-4] + 'ss' - if word.endswith('ied') or word.endswith('ies'): + if word.endswith("sses"): + return word[:-4] + "ss" + if word.endswith("ied") or word.endswith("ies"): if len(word) > 4: - return word[:-3] + 'i' + return word[:-3] + "i" else: - return word[:-3] + 'ie' - if word.endswith('us') or word.endswith('ss'): + return word[:-3] + "ie" + if word.endswith("us") or word.endswith("ss"): return word - if word.endswith('s'): + if word.endswith("s"): preceding = word[:-1] if s1a_exp.search(preceding): return preceding @@ -95,7 +95,7 @@ def step_1a(word): return word -doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt') +doubles = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") def ends_with_double(word): @@ -106,31 +106,31 @@ def ends_with_double(word): def step_1b_helper(word): - if word.endswith('at') or word.endswith('bl') or word.endswith('iz'): - return word + 'e' + if word.endswith("at") or word.endswith("bl") or word.endswith("iz"): + return word + "e" if ends_with_double(word): return word[:-1] if is_short_word(word): - return word + 'e' + return word + "e" return word -s1b_suffixes = ('ed', 'edly', 'ing', 'ingly') +s1b_suffixes = ("ed", "edly", "ing", "ingly") def step_1b(word, r1): - if word.endswith('eedly'): + if word.endswith("eedly"): if len(word) - 5 >= r1: return word[:-3] return word - if word.endswith('eed'): + if word.endswith("eed"): if len(word) - 3 >= r1: return word[:-1] return word for suffix in s1b_suffixes: if word.endswith(suffix): - preceding = word[:-len(suffix)] + preceding = word[: -len(suffix)] if s1b_exp.search(preceding): return step_1b_helper(preceding) return word @@ -139,49 +139,51 @@ def step_1b(word, r1): def step_1c(word): - if word.endswith('y') or word.endswith('Y') and len(word) > 1: - if word[-2] not in 'aeiouy': + if word.endswith("y") or word.endswith("Y") and len(word) > 1: + if word[-2] not in "aeiouy": if len(word) > 2: - return word[:-1] + 'i' + return word[:-1] + "i" return word def step_2_helper(word, r1, end, repl, prev): - if word.endswith(end): - if len(word) - len(end) >= r1: - if prev == []: - return word[:-len(end)] + repl - for p in prev: - if word[:-len(end)].endswith(p): - return word[:-len(end)] + repl - return word - return None - - -s2_triples = (('ization', 'ize', []), - ('ational', 'ate', []), - ('fulness', 'ful', []), - ('ousness', 'ous', []), - ('iveness', 'ive', []), - ('tional', 'tion', []), - ('biliti', 'ble', []), - ('lessli', 'less', []), - ('entli', 'ent', []), - ('ation', 'ate', []), - ('alism', 'al', []), - ('aliti', 'al', []), - ('ousli', 'ous', []), - ('iviti', 'ive', []), - ('fulli', 'ful', []), - ('enci', 'ence', []), - ('anci', 'ance', []), - ('abli', 'able', []), - ('izer', 'ize', []), 
- ('ator', 'ate', []), - ('alli', 'al', []), - ('bli', 'ble', []), - ('ogi', 'og', ['l']), - ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'])) + if word.endswith(end): + if len(word) - len(end) >= r1: + if prev == []: + return word[: -len(end)] + repl + for p in prev: + if word[: -len(end)].endswith(p): + return word[: -len(end)] + repl + return word + return None + + +s2_triples = ( + ("ization", "ize", []), + ("ational", "ate", []), + ("fulness", "ful", []), + ("ousness", "ous", []), + ("iveness", "ive", []), + ("tional", "tion", []), + ("biliti", "ble", []), + ("lessli", "less", []), + ("entli", "ent", []), + ("ation", "ate", []), + ("alism", "al", []), + ("aliti", "al", []), + ("ousli", "ous", []), + ("iviti", "ive", []), + ("fulli", "ful", []), + ("enci", "ence", []), + ("anci", "ance", []), + ("abli", "able", []), + ("izer", "ize", []), + ("ator", "ate", []), + ("alli", "al", []), + ("bli", "ble", []), + ("ogi", "og", ["l"]), + ("li", "", ["c", "d", "e", "g", "h", "k", "m", "n", "r", "t"]), +) def step_2(word, r1): @@ -196,23 +198,25 @@ def step_3_helper(word, r1, r2, end, repl, r2_necessary): if word.endswith(end): if len(word) - len(end) >= r1: if not r2_necessary: - return word[:-len(end)] + repl + return word[: -len(end)] + repl else: if len(word) - len(end) >= r2: - return word[:-len(end)] + repl + return word[: -len(end)] + repl return word return None -s3_triples = (('ational', 'ate', False), - ('tional', 'tion', False), - ('alize', 'al', False), - ('icate', 'ic', False), - ('iciti', 'ic', False), - ('ative', '', True), - ('ical', 'ic', False), - ('ness', '', False), - ('ful', '', False)) +s3_triples = ( + ("ational", "ate", False), + ("tional", "tion", False), + ("alize", "al", False), + ("icate", "ic", False), + ("iciti", "ic", False), + ("ative", "", True), + ("ical", "ic", False), + ("ness", "", False), + ("ful", "", False), +) def step_3(word, r1, r2): @@ -223,18 +227,35 @@ def step_3(word, r1, r2): return word -s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement', - 'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize') +s4_delete_list = ( + "al", + "ance", + "ence", + "er", + "ic", + "able", + "ible", + "ant", + "ement", + "ment", + "ent", + "ism", + "ate", + "iti", + "ous", + "ive", + "ize", +) def step_4(word, r2): for end in s4_delete_list: if word.endswith(end): if len(word) - len(end) >= r2: - return word[:-len(end)] + return word[: -len(end)] return word - if word.endswith('sion') or word.endswith('tion'): + if word.endswith("sion") or word.endswith("tion"): if len(word) - 3 >= r2: return word[:-3] @@ -242,12 +263,12 @@ def step_4(word, r2): def step_5(word, r1, r2): - if word.endswith('l'): - if len(word) - 1 >= r2 and word[-2] == 'l': + if word.endswith("l"): + if len(word) - 1 >= r2 and word[-2] == "l": return word[:-1] return word - if word.endswith('e'): + if word.endswith("e"): if len(word) - 1 >= r2: return word[:-1] if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]): @@ -257,30 +278,42 @@ def step_5(word, r1, r2): def normalize_ys(word): - return word.replace('Y', 'y') - - -exceptional_forms = {'skis': 'ski', - 'skies': 'sky', - 'dying': 'die', - 'lying': 'lie', - 'tying': 'tie', - 'idly': 'idl', - 'gently': 'gentl', - 'ugly': 'ugli', - 'early': 'earli', - 'only': 'onli', - 'singly': 'singl', - 'sky': 'sky', - 'news': 'news', - 'howe': 'howe', - 'atlas': 'atlas', - 'cosmos': 'cosmos', - 'bias': 'bias', - 'andes': 'andes'} - -exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 
'canning', 'herring', - 'earring', 'proceed', 'exceed', 'succeed']) + return word.replace("Y", "y") + + +exceptional_forms = { + "skis": "ski", + "skies": "sky", + "dying": "die", + "lying": "lie", + "tying": "tie", + "idly": "idl", + "gently": "gentl", + "ugly": "ugli", + "early": "earli", + "only": "onli", + "singly": "singl", + "sky": "sky", + "news": "news", + "howe": "howe", + "atlas": "atlas", + "cosmos": "cosmos", + "bias": "bias", + "andes": "andes", +} + +exceptional_early_exit_post_1a = frozenset( + [ + "inning", + "outing", + "canning", + "herring", + "earring", + "proceed", + "exceed", + "succeed", + ] +) def stem(word): diff --git a/src/whoosh/lang/snowball/__init__.py b/src/whoosh/lang/snowball/__init__.py index d450288c..4b99cfbc 100644 --- a/src/whoosh/lang/snowball/__init__.py +++ b/src/whoosh/lang/snowball/__init__.py @@ -54,21 +54,21 @@ from .spanish import SpanishStemmer from .swedish import SwedishStemmer - # Map two-letter codes to stemming classes -classes = {"da": DanishStemmer, - "nl": DutchStemmer, - "en": EnglishStemmer, - "fi": FinnishStemmer, - "fr": FrenchStemmer, - "de": GermanStemmer, - "hu": HungarianStemmer, - "it": ItalianStemmer, - "no": NorwegianStemmer, - "pt": PortugueseStemmer, - "ro": RomanianStemmer, - "ru": RussianStemmer, - "es": SpanishStemmer, - "sv": SwedishStemmer, - } +classes = { + "da": DanishStemmer, + "nl": DutchStemmer, + "en": EnglishStemmer, + "fi": FinnishStemmer, + "fr": FrenchStemmer, + "de": GermanStemmer, + "hu": HungarianStemmer, + "it": ItalianStemmer, + "no": NorwegianStemmer, + "pt": PortugueseStemmer, + "ro": RomanianStemmer, + "ru": RussianStemmer, + "es": SpanishStemmer, + "sv": SwedishStemmer, +} diff --git a/src/whoosh/lang/snowball/bases.py b/src/whoosh/lang/snowball/bases.py index 0602385d..864d8a07 100644 --- a/src/whoosh/lang/snowball/bases.py +++ b/src/whoosh/lang/snowball/bases.py @@ -1,7 +1,7 @@ # Base classes -class _ScandinavianStemmer(object): +class _ScandinavianStemmer: """ This subclass encapsulates a method for defining the string region R1. @@ -33,10 +33,10 @@ def _r1_scandinavian(self, word, vowels): r1 = "" for i in range(1, len(word)): if word[i] not in vowels and word[i - 1] in vowels: - if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0: + if len(word[: i + 1]) < 3 and len(word[: i + 1]) > 0: r1 = word[3:] - elif len(word[:i + 1]) >= 3: - r1 = word[i + 1:] + elif len(word[: i + 1]) >= 3: + r1 = word[i + 1 :] else: return word break @@ -44,7 +44,7 @@ def _r1_scandinavian(self, word, vowels): return r1 -class _StandardStemmer(object): +class _StandardStemmer: """ This subclass encapsulates two methods for defining the standard versions of the string regions R1, R2, and RV. 
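Since the next hunk reflows _r1r2_standard, a standalone sketch of the R1/R2 definition it implements may help: R1 is the region after the first non-vowel that follows a vowel, and R2 applies the same rule again inside R1 (hypothetical helper name and vowel set, illustrative only):

    def r1_r2(word, vowels="aeiouy"):
        # R1: substring after the first non-vowel that follows a vowel
        r1 = r2 = ""
        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                r1 = word[i + 1:]
                break
        # R2: same rule, applied within R1
        for i in range(1, len(r1)):
            if r1[i] not in vowels and r1[i - 1] in vowels:
                r2 = r1[i + 1:]
                break
        return r1, r2

    print(r1_r2("beautiful"))  # ("iful", "ul") -- the classic example from the Snowball docs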
@@ -82,12 +82,12 @@ def _r1r2_standard(self, word, vowels): r2 = "" for i in range(1, len(word)): if word[i] not in vowels and word[i - 1] in vowels: - r1 = word[i + 1:] + r1 = word[i + 1 :] break for i in range(1, len(r1)): if r1[i] not in vowels and r1[i - 1] in vowels: - r2 = r1[i + 1:] + r2 = r1[i + 1 :] break return (r1, r2) @@ -119,13 +119,13 @@ def _rv_standard(self, word, vowels): if word[1] not in vowels: for i in range(2, len(word)): if word[i] in vowels: - rv = word[i + 1:] + rv = word[i + 1 :] break elif word[:2] in vowels: for i in range(2, len(word)): if word[i] not in vowels: - rv = word[i + 1:] + rv = word[i + 1 :] break else: rv = word[3:] diff --git a/src/whoosh/lang/snowball/danish.py b/src/whoosh/lang/snowball/danish.py index 8c4f4878..9a4351af 100644 --- a/src/whoosh/lang/snowball/danish.py +++ b/src/whoosh/lang/snowball/danish.py @@ -1,7 +1,7 @@ -from .bases import _ScandinavianStemmer - from whoosh.compat import u +from .bases import _ScandinavianStemmer + class DanishStemmer(_ScandinavianStemmer): """ diff --git a/src/whoosh/lang/snowball/dutch.py b/src/whoosh/lang/snowball/dutch.py index 0d683649..8f73195a 100644 --- a/src/whoosh/lang/snowball/dutch.py +++ b/src/whoosh/lang/snowball/dutch.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class DutchStemmer(_StandardStemmer): """ diff --git a/src/whoosh/lang/snowball/english.py b/src/whoosh/lang/snowball/english.py index a2567dab..aae50791 100644 --- a/src/whoosh/lang/snowball/english.py +++ b/src/whoosh/lang/snowball/english.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class EnglishStemmer(_StandardStemmer): """ @@ -142,7 +142,7 @@ class EnglishStemmer(_StandardStemmer): "succeeding": "succeed", } - def stem(self, word): + def stem(self, word): # noqa: C901, PLR0912 """ Stem an English word and return the stemmed form. @@ -151,6 +151,10 @@ def stem(self, word): :return: The stemmed form. :rtype: unicode + This method is too complex (91) -- ruff rule C901 + This method has too many branches (117) -- ruff rule PLR0912 + This method has too many statements (254) -- ruff rule PLR0915 + Future edits to this method should reduce, not increase its complexity. """ word = word.lower() diff --git a/src/whoosh/lang/snowball/finnish.py b/src/whoosh/lang/snowball/finnish.py index 63f5a752..6119db65 100644 --- a/src/whoosh/lang/snowball/finnish.py +++ b/src/whoosh/lang/snowball/finnish.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class FinnishStemmer(_StandardStemmer): """ @@ -128,7 +128,7 @@ class FinnishStemmer(_StandardStemmer): u("ej\xE4"), ) - def stem(self, word): + def stem(self, word): # noqa: C901 """ Stem a Finnish word and return the stemmed form. @@ -137,6 +137,10 @@ def stem(self, word): :return: The stemmed form. :rtype: unicode + This method is too complex (51) -- ruff rule C901 + This method has too many branches (58) -- ruff rule PLR0912 + This method has too many statements (148) -- ruff rule PLR0915 + Future edits to this method should reduce, not increase its complexity. 
""" word = word.lower() diff --git a/src/whoosh/lang/snowball/french.py b/src/whoosh/lang/snowball/french.py index f204adf3..c7ddd402 100644 --- a/src/whoosh/lang/snowball/french.py +++ b/src/whoosh/lang/snowball/french.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class FrenchStemmer(_StandardStemmer): @@ -148,7 +148,7 @@ class FrenchStemmer(_StandardStemmer): ) __step4_suffixes = (u("i\xE8re"), u("I\xE8re"), "ion", "ier", "Ier", "e", u("\xEB")) - def stem(self, word): + def stem(self, word): # noqa: C901 """ Stem a French word and return the stemmed form. @@ -157,6 +157,10 @@ def stem(self, word): :return: The stemmed form. :rtype: unicode + This method is too complex (74) -- ruff rule C901 + This method has too many branches (79) -- ruff rule PLR0912 + This method has too many statements (160) -- ruff rule PLR0915 + Future edits to this method should reduce, not increase its complexity. """ word = word.lower() diff --git a/src/whoosh/lang/snowball/german.py b/src/whoosh/lang/snowball/german.py index 1c5f94f3..263b4972 100644 --- a/src/whoosh/lang/snowball/german.py +++ b/src/whoosh/lang/snowball/german.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class GermanStemmer(_StandardStemmer): diff --git a/src/whoosh/lang/snowball/hungarian.py b/src/whoosh/lang/snowball/hungarian.py index 05597c5d..b3050721 100644 --- a/src/whoosh/lang/snowball/hungarian.py +++ b/src/whoosh/lang/snowball/hungarian.py @@ -1,7 +1,7 @@ from whoosh.compat import u -class HungarianStemmer(object): +class HungarianStemmer: """ The Hungarian Snowball stemmer. diff --git a/src/whoosh/lang/snowball/italian.py b/src/whoosh/lang/snowball/italian.py index daadac9a..2165a8d5 100644 --- a/src/whoosh/lang/snowball/italian.py +++ b/src/whoosh/lang/snowball/italian.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class ItalianStemmer(_StandardStemmer): diff --git a/src/whoosh/lang/snowball/norwegian.py b/src/whoosh/lang/snowball/norwegian.py index 4bc0f7b0..c011ca94 100644 --- a/src/whoosh/lang/snowball/norwegian.py +++ b/src/whoosh/lang/snowball/norwegian.py @@ -1,7 +1,7 @@ -from .bases import _ScandinavianStemmer - from whoosh.compat import u +from .bases import _ScandinavianStemmer + class NorwegianStemmer(_ScandinavianStemmer): diff --git a/src/whoosh/lang/snowball/portugese.py b/src/whoosh/lang/snowball/portugese.py index 54dcb5aa..bed4e943 100644 --- a/src/whoosh/lang/snowball/portugese.py +++ b/src/whoosh/lang/snowball/portugese.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class PortugueseStemmer(_StandardStemmer): diff --git a/src/whoosh/lang/snowball/romanian.py b/src/whoosh/lang/snowball/romanian.py index 89a96de6..c33b0d90 100644 --- a/src/whoosh/lang/snowball/romanian.py +++ b/src/whoosh/lang/snowball/romanian.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class RomanianStemmer(_StandardStemmer): diff --git a/src/whoosh/lang/snowball/russian.py b/src/whoosh/lang/snowball/russian.py index dc4a825e..76e0ccb7 100644 --- a/src/whoosh/lang/snowball/russian.py +++ b/src/whoosh/lang/snowball/russian.py @@ -1,7 +1,7 @@ from whoosh.compat import u -class RussianStemmer(object): +class RussianStemmer: """ The Russian Snowball stemmer. 
diff --git a/src/whoosh/lang/snowball/spanish.py b/src/whoosh/lang/snowball/spanish.py index ccb21871..f1e50ed2 100644 --- a/src/whoosh/lang/snowball/spanish.py +++ b/src/whoosh/lang/snowball/spanish.py @@ -1,7 +1,7 @@ -from .bases import _StandardStemmer - from whoosh.compat import u +from .bases import _StandardStemmer + class SpanishStemmer(_StandardStemmer): diff --git a/src/whoosh/lang/snowball/swedish.py b/src/whoosh/lang/snowball/swedish.py index 9303e3f7..cb46fbfd 100644 --- a/src/whoosh/lang/snowball/swedish.py +++ b/src/whoosh/lang/snowball/swedish.py @@ -1,7 +1,7 @@ -from .bases import _ScandinavianStemmer - from whoosh.compat import u +from .bases import _ScandinavianStemmer + class SwedishStemmer(_ScandinavianStemmer): diff --git a/src/whoosh/lang/stopwords.py b/src/whoosh/lang/stopwords.py index 8fc1703d..1bb67370 100644 --- a/src/whoosh/lang/stopwords.py +++ b/src/whoosh/lang/stopwords.py @@ -1,7 +1,3 @@ -# coding=utf-8 - -from __future__ import unicode_literals - # Stopwords Corpus # # This module contains lists of stop words for several languages. These @@ -15,11 +11,11 @@ # ===== # This module was generated from the original files using the following script -#import os.path -#import textwrap +# import os.path +# import textwrap # -#names = os.listdir("stopwords") -#for name in names: +# names = os.listdir("stopwords") +# for name in names: # f = open("stopwords/" + name) # wordls = [line.strip() for line in f] # words = " ".join(wordls) @@ -30,16 +26,18 @@ stoplists = { - "da": frozenset(""" + "da": frozenset( + """ og i jeg det at en den til er som på de med han af for ikke der var mig sig men et har om vi min havde ham hun nu over da fra du ud sin dem os op man hans hvor eller hvad skal selv her alle vil blev kunne ind når være dog noget ville jo deres efter ned skulle denne end dette mit også under have dig anden hende mine alt meget sit sine vor mod disse hvis din nogle hos blive mange ad bliver hendes været thi jer sådan - """.split()), - - "nl": frozenset(""" + """.split() + ), + "nl": frozenset( + """ de en van ik te dat die in een hij het niet zijn is was op aan met als voor had er maar om hem dan zou of wat mijn men dit zo door over ze zich bij ook tot je mij uit der daar haar naar heb hoe heeft hebben deze u @@ -47,9 +45,10 @@ doen toen moet ben zonder kan hun dus alles onder ja eens hier wie werd altijd doch wordt wezen kunnen ons zelf tegen na reeds wil kon niets uw iemand geweest andere - """.split()), - - "en": frozenset(""" + """.split() + ), + "en": frozenset( + """ i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are @@ -59,9 +58,10 @@ out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now - """.split()), - - "fi": frozenset(""" + """.split() + ), + "fi": frozenset( + """ olla olen olet on olemme olette ovat ole oli olisi olisit olisin olisimme olisitte olisivat olit olin olimme olitte olivat ollut olleet en et ei emme ette eivät minä minun minut minua minussa minusta minuun @@ -85,9 +85,10 @@ joita joissa joista joihin joilla joilta joille joina joiksi että ja jos koska kuin mutta niin sekä sillä tai vaan vai vaikka kanssa mukaan noin poikki yli kun niin nyt itse - """.split()), - - "fr": frozenset(""" + """.split() + ), + "fr": frozenset( + """ au aux avec 
ce ces dans de des du elle en et eux il je la le leur lui ma mais me même mes moi mon ne nos notre nous on ou par pas pour qu que qui sa se ses son sur ta te tes toi ton tu un une vos votre vous c d j l @@ -100,9 +101,10 @@ auront aurais aurait aurions auriez auraient avais avait avions aviez avaient eut eûmes eûtes eurent aie aies ait ayons ayez aient eusse eusses eût eussions eussiez eussent - """.split()), - - "de": frozenset(""" + """.split() + ), + "de": frozenset( + """ aber alle allem allen aller alles als also am an ander andere anderem anderen anderer anderes anderm andern anderr anders auch auf aus bei bin bis bist da damit dann der den des dem die das daß derselbe derselben @@ -122,9 +124,10 @@ unter viel vom von vor während war waren warst was weg weil weiter welche welchem welchen welcher welches wenn werde werden wie wieder will wir wird wirst wo wollen wollte würde würden zu zum zur zwar zwischen - """.split()), - - "hu": frozenset(""" + """.split() + ), + "hu": frozenset( + """ a ahogy ahol aki akik akkor alatt által általában amely amelyek amelyekben amelyeket amelyet amelynek ami amit amolyan amíg amikor át abban ahhoz annak arra arról az azok azon azt azzal azért aztán @@ -143,9 +146,10 @@ több úgy ugyanis új újabb újra után utána utolsó vagy vagyis valaki valami valamint való vagyok van vannak volt voltam voltak voltunk vissza vele viszont volna - """.split()), - - "it": frozenset(""" + """.split() + ), + "it": frozenset( + """ ad al allo ai agli all agl alla alle con col coi da dal dallo dai dagli dall dagl dalla dalle di del dello dei degli dell degl della delle in nel nello nei negli nell negl nella nelle su sul sullo sui sugli sull @@ -170,9 +174,10 @@ staresti starebbe staremmo stareste starebbero stavo stavi stava stavamo stavate stavano stetti stesti stette stemmo steste stettero stessi stesse stessimo stessero stando - """.split()), - - "no": frozenset(""" + """.split() + ), + "no": frozenset( + """ og i jeg det at en et den til er som på de med han av ikke ikkje der så var meg seg men ett har om vi min mitt ha hadde hun nå over da ved fra du ut sin dem oss opp man kan hans hvor eller hva skal selv sjøl @@ -185,9 +190,10 @@ hennes hoss hossen ikkje ingi inkje korleis korso kva kvar kvarhelst kven kvi kvifor me medan mi mine mykje no nokon noka nokor noko nokre si sia sidan so somt somme um upp vere vore verte vort varte vart - """.split()), - - "pt": frozenset(""" + """.split() + ), + "pt": frozenset( + """ de a o que e do da em um para com não uma os no se na por mais as dos como mas ao ele das à seu sua ou quando muito nos já eu também só pelo pela até isso ela entre depois sem mesmo aos seus quem nas me esse @@ -207,9 +213,10 @@ tivemos tiveram tivera tivéramos tenha tenhamos tenham tivesse tivéssemos tivessem tiver tivermos tiverem terei terá teremos terão teria teríamos teriam - """.split()), - - "ru": frozenset(""" + """.split() + ), + "ru": frozenset( + """ и в во не что он на я с со как а то все она так его но да ты к у же вы за бы по только ее мне было вот от меня еще нет о из ему @@ -228,9 +235,10 @@ впрочем хорошо свою этой перед иногда лучше чуть том нельзя такой им более всегда конечно всю между - """.split()), - - "es": frozenset(""" + """.split() + ), + "es": frozenset( + """ de la que el en y a los del se las por un para con no una su al lo como más pero sus le ya o este sí porque esta entre cuando muy sin sobre también me hasta hay donde quien desde todo nos durante todos uno les @@ -263,9 +271,10 @@ tuvieron tuviera tuvieras tuviéramos 
tuvierais tuvieran tuviese tuvieses tuviésemos tuvieseis tuviesen teniendo tenido tenida tenidos tenidas tened - """.split()), - - "sv": frozenset(""" + """.split() + ), + "sv": frozenset( + """ och det att i en jag hon som han på den med var sig för så till är men ett om hade de av icke mig du henne då sin nu har inte hans honom skulle hennes där min man ej vid kunde något från ut när efter upp @@ -274,12 +283,14 @@ mitt ni bli blev oss din dessa några deras blir mina samma vilken er sådan vår blivit dess inom mellan sådant varför varje vilka ditt vem vilket sitta sådana vart dina vars vårt våra ert era vilkas - """.split()), - - "tr": frozenset(""" + """.split() + ), + "tr": frozenset( + """ acaba ama aslında az bazı belki biri birkaç birşey biz bu çok çünkü da daha de defa diye eğer en gibi hem hep hepsi her hiç için ile ise kez ki kim mı mu mü nasıl ne neden nerde nerede nereye niçin niye o sanki şey siz şu tüm ve veya ya yani - """.split()), + """.split() + ), } diff --git a/src/whoosh/lang/wordnet.py b/src/whoosh/lang/wordnet.py index 843da196..69fbffb3 100644 --- a/src/whoosh/lang/wordnet.py +++ b/src/whoosh/lang/wordnet.py @@ -35,7 +35,7 @@ from collections import defaultdict from whoosh.compat import iterkeys, text_type -from whoosh.fields import Schema, ID, STORED +from whoosh.fields import ID, STORED, Schema def parse_file(f): @@ -97,7 +97,7 @@ def synonyms(word2nums, num2words, word): return sorted(syns) -class Thesaurus(object): +class Thesaurus: """Represents the WordNet synonym database, either loaded into memory from the wn_s.pl Prolog file, or stored on disk in a Whoosh index. diff --git a/src/whoosh/matching/__init__.py b/src/whoosh/matching/__init__.py index 3f826b98..e640bd61 100644 --- a/src/whoosh/matching/__init__.py +++ b/src/whoosh/matching/__init__.py @@ -25,7 +25,38 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh.matching.mcore import * -from whoosh.matching.binary import * -from whoosh.matching.wrappers import * -from whoosh.matching.combo import * +from whoosh.matching.binary import ( + AdditiveBiMatcher, + AndMaybeMatcher, + AndNotMatcher, + BiMatcher, + DisjunctionMaxMatcher, + IntersectionMatcher, + UnionMatcher, +) +from whoosh.matching.combo import ( + ArrayUnionMatcher, + CombinationMatcher, + PreloadedUnionMatcher, +) +from whoosh.matching.mcore import ( + ConstantScoreMatcher, + LeafMatcher, + ListMatcher, + Matcher, + NoQualityAvailable, + NullMatcher, + NullMatcherClass, + ReadTooFar, +) +from whoosh.matching.wrappers import ( + ConstantScoreWrapperMatcher, + CoordMatcher, + ExcludeMatcher, + FilterMatcher, + InverseMatcher, + MultiMatcher, + RequireMatcher, + SingleTermMatcher, + WrappingMatcher, +) diff --git a/src/whoosh/matching/binary.py b/src/whoosh/matching/binary.py index 7ff1183d..43eee663 100644 --- a/src/whoosh/matching/binary.py +++ b/src/whoosh/matching/binary.py @@ -34,7 +34,7 @@ class BiMatcher(mcore.Matcher): """ def __init__(self, a, b): - super(BiMatcher, self).__init__() + super().__init__() self.a = a self.b = b @@ -43,7 +43,7 @@ def reset(self): self.b.reset() def __repr__(self): - return "%s(%r, %r)" % (self.__class__.__name__, self.a, self.b) + return f"{self.__class__.__name__}({self.a!r}, {self.b!r})" def children(self): return [self.a, self.b] @@ -307,7 +307,7 @@ class DisjunctionMaxMatcher(UnionMatcher): # inheritance. 
def __init__(self, a, b, tiebreak=0.0): - super(DisjunctionMaxMatcher, self).__init__(a, b) + super().__init__(a, b) self.tiebreak = tiebreak def copy(self): @@ -406,7 +406,7 @@ class IntersectionMatcher(AdditiveBiMatcher): """Matches the intersection (AND) of the postings in the two sub-matchers.""" def __init__(self, a, b): - super(IntersectionMatcher, self).__init__(a, b) + super().__init__(a, b) self._find_first() def reset(self): @@ -562,7 +562,7 @@ class AndNotMatcher(BiMatcher): """ def __init__(self, a, b): - super(AndNotMatcher, self).__init__(a, b) + super().__init__(a, b) self._find_first() def reset(self): diff --git a/src/whoosh/matching/combo.py b/src/whoosh/matching/combo.py index e642feec..aa673373 100644 --- a/src/whoosh/matching/combo.py +++ b/src/whoosh/matching/combo.py @@ -25,10 +25,9 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division + from array import array -from whoosh.compat import range from whoosh.matching import mcore diff --git a/src/whoosh/matching/mcore.py b/src/whoosh/matching/mcore.py index a901cc2f..6e8112b3 100644 --- a/src/whoosh/matching/mcore.py +++ b/src/whoosh/matching/mcore.py @@ -51,9 +51,7 @@ from itertools import repeat -from whoosh.compat import izip -from whoosh.compat import abstractmethod - +from whoosh.compat import abstractmethod, izip # Exceptions @@ -74,7 +72,7 @@ class NoQualityAvailable(Exception): # Classes -class Matcher(object): +class Matcher: """Base class for all matchers.""" @abstractmethod @@ -110,8 +108,7 @@ def term_matchers(self): yield self else: for cm in self.children(): - for m in cm.term_matchers(): - yield m + yield from cm.term_matchers() def matching_terms(self, id=None): """Returns an iterator of ``("fieldname", "termtext")`` tuples for the @@ -259,13 +256,13 @@ def supports(self, astype): for example 'frequency' or 'characters'. 
""" - raise NotImplementedError("supports not implemented in %s" % self.__class__) + raise NotImplementedError(f"supports not implemented in {self.__class__}") @abstractmethod def value_as(self, astype): """Returns the value(s) of the current posting as the given type.""" - raise NotImplementedError("value_as not implemented in %s" % self.__class__) + raise NotImplementedError(f"value_as not implemented in {self.__class__}") def spans(self): """Returns a list of :class:`~whoosh.query.spans.Span` objects for the @@ -441,7 +438,7 @@ def __init__( self._terminfo = terminfo def __repr__(self): - return "<%s>" % self.__class__.__name__ + return f"<{self.__class__.__name__}>" def is_active(self): return self._i < len(self._ids) @@ -586,7 +583,7 @@ class LeafMatcher(Matcher): # self.format -- Format object for the posting values def __repr__(self): - return "%s(%r, %s)" % (self.__class__.__name__, self.term(), self.is_active()) + return f"{self.__class__.__name__}({self.term()!r}, {self.is_active()})" def term(self): return self._term @@ -614,7 +611,7 @@ def spans(self): elif self.supports("positions"): return [Span(pos) for pos in self.value_as("positions")] else: - raise Exception("Field does not support positions (%r)" % self.term()) + raise Exception(f"Field does not support positions ({self.term()!r})") def supports_block_quality(self): return self.scorer and self.scorer.supports_block_quality() diff --git a/src/whoosh/matching/wrappers.py b/src/whoosh/matching/wrappers.py index 0532bde4..ee7ab92e 100644 --- a/src/whoosh/matching/wrappers.py +++ b/src/whoosh/matching/wrappers.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division from whoosh.matching import mcore @@ -38,7 +37,7 @@ def __init__(self, child, boost=1.0): self.boost = boost def __repr__(self): - return "%s(%r, boost=%s)" % (self.__class__.__name__, self.child, self.boost) + return f"{self.__class__.__name__}({self.child!r}, boost={self.boost})" def copy(self): kwargs = {} @@ -130,7 +129,7 @@ def __init__(self, matchers, idoffsets, scorer=None, current=0): self._next_matcher() def __repr__(self): - return "%s(%r, %r, current=%s)" % ( + return "{}({!r}, {!r}, current={})".format( self.__class__.__name__, self.matchers, self.offsets, @@ -268,14 +267,14 @@ def __init__(self, child, ids, exclude=False, boost=1.0): the wrapped matcher that are **not in** the set are used. """ - super(FilterMatcher, self).__init__(child) + super().__init__(child) self._ids = ids self._exclude = exclude self.boost = boost self._find_next() def __repr__(self): - return "%s(%r, %r, %r, boost=%s)" % ( + return "{}({!r}, {!r}, {!r}, boost={})".format( self.__class__.__name__, self.child, self._ids, @@ -339,7 +338,7 @@ class InverseMatcher(WrappingMatcher): """ def __init__(self, child, limit, missing=None, weight=1.0, id=0): - super(InverseMatcher, self).__init__(child) + super().__init__(child) self.limit = limit self._weight = weight self.missing = missing or (lambda id: False) diff --git a/src/whoosh/multiproc.py b/src/whoosh/multiproc.py index bf792680..36264e2e 100644 --- a/src/whoosh/multiproc.py +++ b/src/whoosh/multiproc.py @@ -25,14 +25,14 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-from __future__ import with_statement + from multiprocessing import Process, Queue, cpu_count -from whoosh.compat import queue, range, pickle from whoosh.codec import base -from whoosh.writing import SegmentWriter +from whoosh.compat import pickle, queue from whoosh.externalsort import imerge from whoosh.util import random_name +from whoosh.writing import SegmentWriter def finish_subsegment(writer, k=64): @@ -204,7 +204,7 @@ def _enqueue(self): dump = pickle.dump length = len(docbuffer) - filename = "%s.doclist" % random_name() + filename = f"{random_name()}.doclist" with self.temp_storage().create_file(filename).raw_file() as f: for item in docbuffer: dump(item, f, 2) @@ -227,7 +227,9 @@ def start_group(self): def end_group(self): if not self._grouping: - raise Exception("Unbalanced end_group") + raise ValueError( + "Unbalanced end_group" + ) # Replaced generic Exception with specific ValueError self._grouping -= 1 def add_document(self, **fields): @@ -311,6 +313,7 @@ def _commit(self, mergetype, optimize, merge): self._finish() def _merge_subsegments(self, results, mergetype): + _ = mergetype # Unused variable schema = self.schema schemanames = set(schema.names()) storage = self.storage diff --git a/src/whoosh/qparser/__init__.py b/src/whoosh/qparser/__init__.py index a61f9052..d5ce2ab3 100644 --- a/src/whoosh/qparser/__init__.py +++ b/src/whoosh/qparser/__init__.py @@ -25,6 +25,59 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh.qparser.default import * -from whoosh.qparser.plugins import * -from whoosh.qparser.syntax import * +from whoosh.qparser.default import ( + DisMaxParser, + MultifieldParser, + QueryParser, + SimpleParser, +) +from whoosh.qparser.plugins import ( + BoostPlugin, + CopyFieldPlugin, + EveryPlugin, + FieldAliasPlugin, + FieldsPlugin, + FunctionPlugin, + FuzzyTermPlugin, + GroupPlugin, + GtLtPlugin, + MultifieldPlugin, + OperatorsPlugin, + PhrasePlugin, + Plugin, + PlusMinusPlugin, + PrefixPlugin, + PseudoFieldPlugin, + RangePlugin, + RegexPlugin, + RegexTagger, + SequencePlugin, + SingleQuotePlugin, + TaggingPlugin, + WhitespacePlugin, + WildcardPlugin, +) +from whoosh.qparser.syntax import ( + AndGroup, + AndMaybeGroup, + AndNotGroup, + BinaryGroup, + DisMaxGroup, + ErrorNode, + FieldnameNode, + GroupNode, + InfixOperator, + MarkerNode, + NotGroup, + Operator, + OrderedGroup, + OrGroup, + PostfixOperator, + PrefixOperator, + RequireGroup, + SyntaxNode, + TextNode, + Whitespace, + WordNode, + Wrapper, +) diff --git a/src/whoosh/qparser/common.py b/src/whoosh/qparser/common.py index 4d721634..9195241b 100644 --- a/src/whoosh/qparser/common.py +++ b/src/whoosh/qparser/common.py @@ -35,13 +35,12 @@ class QueryParserError(Exception): def __init__(self, cause, msg=None): - super(QueryParserError, self).__init__(str(cause)) + super().__init__(str(cause)) self.cause = cause def get_single_text(field, text, **kwargs): - """Returns the first token from an analyzer's output. 
- """ + """Returns the first token from an analyzer's output.""" for t in field.process_text(text, mode="query", **kwargs): return t @@ -53,11 +52,10 @@ def attach(q, stxnode): q.startchar = stxnode.startchar q.endchar = stxnode.endchar except AttributeError: - raise AttributeError("Can't set attribute on %s" - % q.__class__.__name__) + raise AttributeError(f"Can't set attribute on {q.__class__.__name__}") return q def print_debug(level, msg, out=sys.stderr): if level: - out.write("%s%s\n" % (" " * (level - 1), msg)) + out.write(f"{' ' * (level - 1)}{msg}\n") diff --git a/src/whoosh/qparser/dateparse.py b/src/whoosh/qparser/dateparse.py index b1ff47cf..35552c17 100644 --- a/src/whoosh/qparser/dateparse.py +++ b/src/whoosh/qparser/dateparse.py @@ -27,16 +27,21 @@ import re import sys -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone -from whoosh.compat import string_type, iteritems +from whoosh.compat import iteritems, string_type from whoosh.qparser import plugins, syntax from whoosh.qparser.taggers import Tagger from whoosh.support.relativedelta import relativedelta from whoosh.util.text import rcompile -from whoosh.util.times import adatetime, timespan -from whoosh.util.times import fill_in, is_void, relative_days -from whoosh.util.times import TimeError +from whoosh.util.times import ( + TimeError, + adatetime, + fill_in, + is_void, + relative_days, + timespan, +) class DateParseError(Exception): @@ -54,7 +59,7 @@ def print_debug(level, msg, *args): # Parser element objects -class Props(object): +class Props: """A dumb little object that just puts copies a dictionary into attibutes so I can use dot syntax instead of square bracket string item lookup and save a little bit of typing. Used by :class:`Regex`. @@ -70,7 +75,7 @@ def get(self, key, default=None): return self.__dict__.get(key, default) -class ParserBase(object): +class ParserBase: """Base class for date parser elements.""" def to_parser(self, e): @@ -84,7 +89,7 @@ def parse(self, text, dt, pos=0, debug=-9999): def date_from(self, text, dt=None, pos=0, debug=-9999): if dt is None: - dt = datetime.now() + dt = datetime.now(tz=timezone.utc) d, pos = self.parse(text, dt, pos, debug + 1) return d @@ -105,7 +110,7 @@ def __init__(self, elements, name=None): self.name = name def __repr__(self): - return "%s<%s>%r" % (self.__class__.__name__, self.name or "", self.elements) + return f"{self.__class__.__name__}<{self.name or ''}>{self.elements!r}" class Sequence(MultiBase): @@ -122,7 +127,7 @@ def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", name=None, progressive=Fals sequence matches like ``a[b[c]]``. """ - super(Sequence, self).__init__(elements, name) + super().__init__(elements, name) self.sep_pattern = sep if sep: self.sep_expr = rcompile(sep, re.IGNORECASE) @@ -205,7 +210,7 @@ def __init__( :param name: a name for this element (for debugging purposes only). """ - super(Combo, self).__init__(elements, sep=sep, name=name) + super().__init__(elements, sep=sep, name=name) self.fn = fn self.min = min self.max = max @@ -260,7 +265,7 @@ def dates_to_timespan(self, dates): elif len(dates) == 2: return timespan(dates[0], dates[1]) else: - raise DateParseError("Don't know what to do with %r" % (dates,)) + raise DateParseError(f"Don't know what to do with {dates!r}") class Choice(MultiBase): @@ -311,7 +316,7 @@ def __init__( :param name: a name for this element (for debugging purposes only). 
""" - super(Bag, self).__init__(elements, name) + super().__init__(elements, name) self.sep_expr = rcompile(sep, re.IGNORECASE) self.onceper = onceper self.requireall = requireall @@ -380,7 +385,7 @@ def __init__(self, element): self.element = self.to_parser(element) def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self.element) + return f"{self.__class__.__name__}({self.element!r})" def parse(self, text, dt, pos=0, debug=-9999): try: @@ -403,7 +408,7 @@ def __init__(self, element): self.element = element def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self.element) + return f"{self.__class__.__name__}({self.element!r})" def parse(self, text, dt, pos=0, debug=-9999): try: @@ -440,7 +445,7 @@ def __init__(self, pattern, fn=None, modify=None): self.modify = modify def __repr__(self): - return "<%r>" % (self.pattern,) + return f"<{self.pattern!r}>" def parse(self, text, dt, pos=0, debug=-9999): m = self.expr.match(text, pos) @@ -490,7 +495,7 @@ def __init__(self, *patterns): self.exprs = [rcompile(pat, re.IGNORECASE) for pat in self.patterns] self.pattern = ( - "(?P" + "|".join("(%s)" % pat for pat in self.patterns) + ")" + "(?P" + "|".join(f"({pat})" for pat in self.patterns) + ")" ) self.expr = rcompile(self.pattern, re.IGNORECASE) @@ -505,15 +510,15 @@ def modify_props(self, p): class PlusMinus(Regex): def __init__(self, years, months, weeks, days, hours, minutes, seconds): - rel_years = "((?P[0-9]+) *(%s))?" % years - rel_months = "((?P[0-9]+) *(%s))?" % months - rel_weeks = "((?P[0-9]+) *(%s))?" % weeks - rel_days = "((?P[0-9]+) *(%s))?" % days - rel_hours = "((?P[0-9]+) *(%s))?" % hours - rel_mins = "((?P[0-9]+) *(%s))?" % minutes - rel_secs = "((?P[0-9]+) *(%s))?" % seconds - - self.pattern = "(?P[+-]) *%s *%s *%s *%s *%s *%s *%s(?=(\\W|$))" % ( + rel_years = f"((?P[0-9]+) *({years}))?" + rel_months = f"((?P[0-9]+) *({months}))?" + rel_weeks = f"((?P[0-9]+) *({weeks}))?" + rel_days = f"((?P[0-9]+) *({days}))?" + rel_hours = f"((?P[0-9]+) *({hours}))?" + rel_mins = f"((?P[0-9]+) *({minutes}))?" + rel_secs = f"((?P[0-9]+) *({seconds}))?" 
+ + self.pattern = "(?P[+-]) *{} *{} *{} *{} *{} *{} *{}(?=(\\W|$))".format( rel_years, rel_months, rel_weeks, @@ -548,11 +553,7 @@ def __init__(self, next, last, daynames): self.last_pattern = last self._dayname_exprs = tuple(rcompile(pat, re.IGNORECASE) for pat in daynames) dn_pattern = "|".join(daynames) - self.pattern = "(?P%s|%s) +(?P%s)(?=(\\W|$))" % ( - next, - last, - dn_pattern, - ) + self.pattern = f"(?P{next}|{last}) +(?P{dn_pattern})(?=(\\W|$))" self.expr = rcompile(self.pattern, re.IGNORECASE) def props_to_date(self, p, dt): @@ -600,7 +601,7 @@ def props_to_date(self, p, dt): # Top-level parser classes -class DateParser(object): +class DateParser: """Base class for locale-specific parser classes.""" day = Regex( @@ -660,7 +661,7 @@ def parse(self, text, dt, pos=0, debug=-9999): def date_from(self, text, basedate=None, pos=0, debug=-9999, toend=True): if basedate is None: - basedate = datetime.utcnow() + basedate = datetime.now(tz=timezone.utc) parser = self.get_parser() if toend: @@ -937,7 +938,7 @@ def query(self, parser): elif isinstance(self.dt, timespan): return query.DateRange(fieldname, dt.start, dt.end, boost=self.boost) else: - raise Exception("Unknown time object: %r" % dt) + raise Exception(f"Unknown time object: {dt!r}") class DateRangeNode(syntax.SyntaxNode): @@ -951,7 +952,7 @@ def __init__(self, fieldname, start, end, boost=1.0): self.boost = 1.0 def r(self): - return "%r-%r" % (self.start, self.end) + return f"{self.start!r}-{self.end!r}" def query(self, parser): from whoosh import query diff --git a/src/whoosh/qparser/default.py b/src/whoosh/qparser/default.py index 0369c319..2b028e53 100644 --- a/src/whoosh/qparser/default.py +++ b/src/whoosh/qparser/default.py @@ -30,13 +30,12 @@ from whoosh import query from whoosh.compat import text_type from whoosh.qparser import syntax -from whoosh.qparser.common import print_debug, QueryParserError - +from whoosh.qparser.common import QueryParserError, print_debug # Query parser object -class QueryParser(object): +class QueryParser: """A hand-written query parser built on modular plug-ins. The default configuration implements a powerful fielded query language similar to Lucene's. 
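Before the QueryParser internals below, a minimal sketch of the public entry point these hunks touch (standard Whoosh usage; the schema and query string are illustrative):

from whoosh.fields import ID, TEXT, Schema
from whoosh.qparser import QueryParser

schema = Schema(path=ID(stored=True), content=TEXT)
parser = QueryParser("content", schema=schema)

# parse() drives the tag()/filterize() pipeline reworked in the next hunks;
# passing debug=True exercises the print_debug() calls converted to f-strings.
q = parser.parse("render shade OR path:index")
print(q)  # an And/Or tree of Term queries

Nothing in the hunks below changes this behavior; they only modernize the debug output formatting.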
@@ -202,7 +201,7 @@ def multitoken_query(self, spec, texts, fieldname, termclass, boost): elif spec == "or": qclass = query.Or else: - raise QueryParserError("Unknown multitoken_query value %r" % spec) + raise QueryParserError(f"Unknown multitoken_query value {spec!r}") return qclass([termclass(fieldname, t, boost=boost) for t in texts]) def term_query( @@ -278,7 +277,7 @@ def tag(self, text, pos=0, debug=False): # Priorized list of taggers provided by the parser's plugins taggers = self.taggers() if debug: - print_debug(debug, "Taggers: %r" % taggers) + print_debug(debug, f"Taggers: {taggers!r}") # Define a function that will make a WordNode from the "interstitial" # text between matches @@ -302,11 +301,11 @@ def inter(startchar, endchar): if prev < pos: tween = inter(prev, pos) if debug: - print_debug(debug, "Tween: %r" % tween) + print_debug(debug, f"Tween: {tween!r}") stack.append(tween) if debug: - print_debug(debug, "Tagger: %r at %s: %r" % (tagger, pos, node)) + print_debug(debug, f"Tagger: {tagger!r} at {pos}: {node!r}") stack.append(node) prev = pos = node.endchar break @@ -322,7 +321,7 @@ def inter(startchar, endchar): # Wrap the list of nodes in a group node group = self.group(stack) if debug: - print_debug(debug, "Tagged group: %r" % group) + print_debug(debug, f"Tagged group: {group!r}") return group def filterize(self, nodes, debug=False): @@ -332,15 +331,15 @@ def filterize(self, nodes, debug=False): # Call each filter in the priorized list of plugin filters if debug: - print_debug(debug, "Pre-filtered group: %r" % nodes) + print_debug(debug, f"Pre-filtered group: {nodes!r}") for f in self.filters(): if debug: - print_debug(debug, "..Applying: %r" % f) + print_debug(debug, f"..Applying: {f!r}") nodes = f(self, nodes) if debug: - print_debug(debug, "..Result: %r" % nodes) + print_debug(debug, f"..Result: {nodes!r}") if nodes is None: - raise Exception("Filter %r did not return anything" % f) + raise Exception(f"Filter {f!r} did not return anything") return nodes def process(self, text, pos=0, debug=False): @@ -371,18 +370,18 @@ def parse(self, text, normalize=True, debug=False): nodes = self.process(text, debug=debug) if debug: - print_debug(debug, "Syntax tree: %r" % nodes) + print_debug(debug, f"Syntax tree: {nodes!r}") q = nodes.query(self) if not q: q = query.NullQuery if debug: - print_debug(debug, "Pre-normalized query: %r" % q) + print_debug(debug, f"Pre-normalized query: {q!r}") if normalize: q = q.normalize() if debug: - print_debug(debug, "Normalized query: %r" % q) + print_debug(debug, f"Normalized query: {q!r}") return q def parse_(self, text, normalize=True): diff --git a/src/whoosh/qparser/plugins.py b/src/whoosh/qparser/plugins.py index a2d2f6bb..15f32170 100644 --- a/src/whoosh/qparser/plugins.py +++ b/src/whoosh/qparser/plugins.py @@ -28,15 +28,14 @@ import copy from whoosh import query -from whoosh.compat import u -from whoosh.compat import iteritems, range +from whoosh.compat import iteritems, u from whoosh.qparser import syntax from whoosh.qparser.common import attach -from whoosh.qparser.taggers import RegexTagger, FnTagger +from whoosh.qparser.taggers import FnTagger, RegexTagger from whoosh.util.text import rcompile -class Plugin(object): +class Plugin: """Base class for parser plugins.""" def taggers(self, parser): @@ -81,7 +80,7 @@ def filters(self, parser): def create(self, parser, match): # Groupdict keys can be unicode sometimes apparently? Convert them to # str for use as keyword arguments. This should be Py3-safe. 
-        kwargs = dict((str(k), v) for k, v in iteritems(match.groupdict()))
+        kwargs = {str(k): v for k, v in iteritems(match.groupdict())}
         return self.nodetype(**kwargs)
@@ -138,7 +137,7 @@ class PrefixNode(syntax.TextNode):
         qclass = query.Prefix

         def r(self):
-            return "%r*" % self.text
+            return f"{self.text!r}*"

     expr = "(?P<text>[^ \t\r\n*]+)[*](?= |$|\\))"
     nodetype = PrefixNode
@@ -149,7 +148,7 @@ class WildcardPlugin(TaggingPlugin):
     # \u061F = Arabic question mark
     # \u1367 = Ethiopic question mark
     qmarks = u("?\u055E\u061F\u1367")
-    expr = "(?P<text>[*%s])" % qmarks
+    expr = f"(?P<text>[*{qmarks}])"

     def filters(self, parser):
         # Run early, but definitely before multifield plugin
@@ -193,7 +192,7 @@ class WildcardNode(syntax.TextNode):
         qclass = query.Wildcard

         def r(self):
-            return "Wild %r" % self.text
+            return f"Wild {self.text!r}"

     nodetype = WildcardNode
@@ -212,7 +211,7 @@ class RegexNode(syntax.TextNode):
         qclass = query.Regex

         def r(self):
-            return "Regex %r" % self.text
+            return f"Regex {self.text!r}"

     expr = 'r"(?P<text>[^"]*)"'
     nodetype = RegexNode
@@ -233,7 +232,7 @@ def __init__(self, original, boost):
             self.boost = boost

         def r(self):
-            return "^ %s" % self.boost
+            return f"^ {self.boost}"

     def create(self, parser, match):
         # Override create so we can grab group 0
@@ -592,7 +591,7 @@ def __init__(self, name, fn, args, kwargs):
             self.boost = None

         def __repr__(self):
-            return "#%s<%r>(%r)" % (self.name, self.args, self.nodes)
+            return f"#{self.name}<{self.args!r}>({self.nodes!r})"

         def query(self, parser):
             qs = [n.query(parser) for n in self.nodes]
@@ -691,7 +690,7 @@ def __init__(self, text, textstartchar, slop=1):
             self.slop = slop

         def r(self):
-            return "%s %r~%s" % (self.__class__.__name__, self.text, self.slop)
+            return f"{self.__class__.__name__} {self.text!r}~{self.slop}"

         def apply(self, fn):
             return self.__class__(
@@ -931,11 +930,7 @@ def __init__(
         self.memo = memo

     def __repr__(self):
-        return "<%s %r (%s)>" % (
-            self.__class__.__name__,
-            self.expr.pattern,
-            self.memo,
-        )
+        return f"<{self.__class__.__name__} {self.expr.pattern!r} ({self.memo})>"

     def create(self, parser, match):
         return self.optype(match.group(0), self.grouptype, self.leftassoc)
@@ -1111,7 +1106,7 @@ def __init__(self, rel):
             self.rel = rel

         def __repr__(self):
-            return "(%s)" % self.rel
+            return f"({self.rel})"

     expr = r"(?P<rel>(<=|>=|<|>|=<|=>))"
     nodetype = GtLtNode
@@ -1287,7 +1282,7 @@ def __init__(self, map, group=syntax.OrGroup, mirror=False):
         self.group = group
         if mirror:
             # Add in reversed mappings
-            map.update(dict((v, k) for k, v in iteritems(map)))
+            map.update({v: k for k, v in iteritems(map)})

     def filters(self, parser):
         # Run after the fieldname filter (100) but before multifield (110)
diff --git a/src/whoosh/qparser/syntax.py b/src/whoosh/qparser/syntax.py
index a9297350..99a1c6a6 100644
--- a/src/whoosh/qparser/syntax.py
+++ b/src/whoosh/qparser/syntax.py
@@ -25,13 +25,14 @@
 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.

-import sys, weakref
+import sys
+import weakref

 from whoosh import query
-from whoosh.qparser.common import get_single_text, QueryParserError, attach
+from whoosh.qparser.common import QueryParserError, attach, get_single_text


-class SyntaxNode(object):
+class SyntaxNode:
     """Base class for nodes that make up the abstract syntax tree (AST) of a
     parsed user query string.
The AST is an intermediate step, generated from the query string, then converted into a :class:`whoosh.query.Query` @@ -59,10 +60,10 @@ class SyntaxNode(object): def __repr__(self): r = "<" if self.has_fieldname: - r += "%r:" % self.fieldname + r += f"{self.fieldname!r}:" r += self.r() if self.has_boost and self.boost != 1.0: - r += " ^%s" % self.boost + r += f" ^{self.boost}" r += ">" return r @@ -72,7 +73,7 @@ def r(self): fieldname and boost where appropriate. """ - return "%s %r" % (self.__class__.__name__, self.__dict__) + return f"{self.__class__.__name__} {self.__dict__!r}" def apply(self, fn): return self @@ -178,7 +179,7 @@ def __init__(self, fieldname, original): self.original = original def __repr__(self): - return "<%r:>" % self.fieldname + return f"<{self.fieldname!r}:>" class GroupNode(SyntaxNode): @@ -208,10 +209,7 @@ def __init__(self, nodes=None, boost=1.0, **kwargs): self.kwargs = kwargs def r(self): - return "%s %s" % ( - self.__class__.__name__, - ", ".join(repr(n) for n in self.nodes), - ) + return f"{self.__class__.__name__} {', '.join(repr(n) for n in self.nodes)}" @property def startchar(self): @@ -230,7 +228,7 @@ def apply(self, fn): self.type, [fn(node) for node in self.nodes], boost=self.boost, - **self.kwargs + **self.kwargs, ) def query(self, parser): @@ -383,7 +381,7 @@ def __init__(self, message, node=None): self.node = node def r(self): - return "ERR %r %r" % (self.node, self.message) + return f"ERR {self.node!r} {self.message!r}" @property def startchar(self): @@ -415,7 +413,7 @@ class ScaledOrGroup(OrGroup): def __init__(self, nodes=None, **kwargs): if "scale" in kwargs: del kwargs["scale"] - super(ScaledOrGroup, self).__init__(nodes=nodes, scale=scale, **kwargs) + super().__init__(nodes=nodes, scale=scale, **kwargs) return ScaledOrGroup @@ -462,7 +460,7 @@ def __init__(self, start, end, startexcl, endexcl): def r(self): b1 = "{" if self.startexcl else "[" b2 = "}" if self.endexcl else "]" - return "%s%r %r%s" % (b1, self.start, self.end, b2) + return f"{b1}{self.start!r} {self.end!r}{b2}" def query(self, parser): fieldname = self.fieldname or parser.fieldname @@ -529,7 +527,7 @@ def __init__(self, text): self.boost = 1.0 def r(self): - return "%s %r" % (self.__class__.__name__, self.text) + return f"{self.__class__.__name__} {self.text!r}" def is_text(self): return True @@ -584,7 +582,7 @@ def __init__(self, text, grouptype, leftassoc=True): self.leftassoc = leftassoc def r(self): - return "OP %r" % self.text + return f"OP {self.text!r}" def replace_self(self, parser, group, position): """Called with the parser, a group, and the position at which the diff --git a/src/whoosh/qparser/taggers.py b/src/whoosh/qparser/taggers.py index 6c492d3d..46900203 100644 --- a/src/whoosh/qparser/taggers.py +++ b/src/whoosh/qparser/taggers.py @@ -27,11 +27,10 @@ from whoosh.util.text import rcompile - # Tagger objects -class Tagger(object): +class Tagger: """Base class for taggers, objects which match syntax in the query string and translate it into a :class:`whoosh.qparser.syntax.SyntaxNode` object. 
""" @@ -88,7 +87,7 @@ def __init__(self, expr, fn, memo=""): self.memo = memo def __repr__(self): - return "<%s %r (%s)>" % (self.__class__.__name__, self.expr, self.memo) + return f"<{self.__class__.__name__} {self.expr!r} ({self.memo})>" def create(self, parser, match): return self.fn(**match.groupdict()) diff --git a/src/whoosh/query/__init__.py b/src/whoosh/query/__init__.py index 97e34a40..5129f0fd 100644 --- a/src/whoosh/query/__init__.py +++ b/src/whoosh/query/__init__.py @@ -25,12 +25,62 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from whoosh.query.qcore import * -from whoosh.query.terms import * -from whoosh.query.compound import * -from whoosh.query.positional import * -from whoosh.query.ranges import * -from whoosh.query.wrappers import * -from whoosh.query.nested import * -from whoosh.query.qcolumns import * -from whoosh.query.spans import * + +from whoosh.query.compound import ( + And, + AndMaybe, + AndNot, + BinaryQuery, + BooleanQuery, + CompoundQuery, + DefaultOr, + DisjunctionMax, + Or, + Otherwise, + PreloadedOr, + Require, + SplitOr, +) +from whoosh.query.nested import NestedChildren, NestedParent +from whoosh.query.positional import Ordered, Phrase, Sequence +from whoosh.query.qcolumns import ColumnMatcher, ColumnQuery +from whoosh.query.qcore import ( + Every, + Highest, + Lowest, + NullQuery, + Query, + QueryError, + _NullQuery, + error_query, + token_lists, +) +from whoosh.query.ranges import DateRange, NumericRange, RangeMixin, TermRange +from whoosh.query.spans import ( + Span, + SpanBefore, + SpanBiMatcher, + SpanBiQuery, + SpanCondition, + SpanContains, + SpanFirst, + SpanNear, + SpanNear2, + SpanNot, + SpanOr, + SpanQuery, + SpanWrappingMatcher, + WrappingSpan, +) +from whoosh.query.terms import ( + ExpandingTerm, + FuzzyTerm, + MultiTerm, + PatternQuery, + Prefix, + Regex, + Term, + Variations, + Wildcard, +) +from whoosh.query.wrappers import ConstantScoreQuery, Not, WeightingQuery, WrappingQuery diff --git a/src/whoosh/query/compound.py b/src/whoosh/query/compound.py index a7c02798..fc2787aa 100644 --- a/src/whoosh/query/compound.py +++ b/src/whoosh/query/compound.py @@ -25,7 +25,6 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-from __future__ import division from whoosh import matching from whoosh.compat import text_type, u @@ -41,14 +40,14 @@ class CompoundQuery(qcore.Query): def __init__(self, subqueries, boost=1.0): for subq in subqueries: if not isinstance(subq, qcore.Query): - raise qcore.QueryError("%r is not a query" % subq) + raise qcore.QueryError(f"{subq!r} is not a query") self.subqueries = subqueries self.boost = boost def __repr__(self): - r = "%s(%r" % (self.__class__.__name__, self.subqueries) + r = f"{self.__class__.__name__}({self.subqueries!r}" if hasattr(self, "boost") and self.boost != 1: - r += ", boost=%s" % self.boost + r += f", boost={self.boost}" r += ")" return r @@ -118,7 +117,7 @@ def estimate_min_size(self, ixreader): return 0 def normalize(self): - from whoosh.query import Every, TermRange, NumericRange + from whoosh.query import Every, NumericRange, TermRange # Normalize subqueries and merge nested instances of this class subqueries = [] @@ -361,7 +360,7 @@ def _matcher(self, subs, searcher, context): # Implementation that pre-loads docnums and scores into an array cls = PreloadedOr else: - raise ValueError("Unknown matcher_type %r" % self.matcher_type) + raise ValueError(f"Unknown matcher_type {self.matcher_type!r}") return cls( subs, boost=self.boost, minmatch=self.minmatch, scale=self.scale diff --git a/src/whoosh/query/nested.py b/src/whoosh/query/nested.py index c0a9bb29..45b78344 100644 --- a/src/whoosh/query/nested.py +++ b/src/whoosh/query/nested.py @@ -26,7 +26,6 @@ # policies, either expressed or implied, of Matt Chaput. from whoosh import matching -from whoosh.compat import range from whoosh.query import qcore from whoosh.query.wrappers import WrappingQuery @@ -128,8 +127,7 @@ def deletion_docs(self, searcher): docnum = m.id() parentdoc = bits.before(docnum + 1) nextparent = bits.after(docnum) or maxdoc - for i in range(parentdoc, nextparent): - yield i + yield from range(parentdoc, nextparent) m.skip_to(nextparent) class NestedParentMatcher(matching.Matcher): @@ -296,11 +294,7 @@ def __init__( self._find_next_children() def __repr__(self): - return "%s(%r, %r)" % ( - self.__class__.__name__, - self.parent_comb, - self.child, - ) + return f"{self.__class__.__name__}({self.parent_comb!r}, {self.child!r})" def reset(self): self.child.reset() diff --git a/src/whoosh/query/positional.py b/src/whoosh/query/positional.py index 66596fb9..bb6b381a 100644 --- a/src/whoosh/query/positional.py +++ b/src/whoosh/query/positional.py @@ -25,13 +25,13 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-from __future__ import division + import copy from whoosh import matching from whoosh.analysis import Token from whoosh.compat import u -from whoosh.query import qcore, terms, compound +from whoosh.query import compound, qcore, terms class Sequence(compound.CompoundQuery): @@ -165,7 +165,7 @@ def __eq__(self, other): ) def __repr__(self): - return "%s(%r, %r, slop=%s, boost=%f)" % ( + return "{}({!r}, {!r}, slop={}, boost={:f})".format( self.__class__.__name__, self.fieldname, self.words, @@ -244,7 +244,7 @@ def estimate_min_size(self, ixreader): return self._and_query().estimate_min_size(ixreader) def matcher(self, searcher, context=None): - from whoosh.query import Term, SpanNear2 + from whoosh.query import SpanNear2, Term fieldname = self.fieldname if fieldname not in searcher.schema: @@ -253,7 +253,7 @@ def matcher(self, searcher, context=None): field = searcher.schema[fieldname] if not field.format or not field.format.supports("positions"): raise qcore.QueryError( - "Phrase search: %r field has no positions" % self.fieldname + f"Phrase search: {self.fieldname!r} field has no positions" ) terms = [] diff --git a/src/whoosh/query/qcolumns.py b/src/whoosh/query/qcolumns.py index 6aeab5cd..58175529 100644 --- a/src/whoosh/query/qcolumns.py +++ b/src/whoosh/query/qcolumns.py @@ -26,7 +26,7 @@ # policies, either expressed or implied, of Matt Chaput. from whoosh.matching import ConstantScoreMatcher, NullMatcher, ReadTooFar -from whoosh.query import Query +from whoosh.query.qcore import Query class ColumnQuery(Query): diff --git a/src/whoosh/query/qcore.py b/src/whoosh/query/qcore.py index 61305036..a827a9bc 100644 --- a/src/whoosh/query/qcore.py +++ b/src/whoosh/query/qcore.py @@ -25,15 +25,13 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division + import copy from array import array from whoosh import matching -from whoosh.compat import u +from whoosh.compat import methodcaller, u from whoosh.reading import TermNotFound -from whoosh.compat import methodcaller - # Exceptions @@ -83,7 +81,7 @@ def token_lists(q, phrases=True): # Utility classes -class Lowest(object): +class Lowest: """A value that is always compares lower than any other object except itself. """ @@ -112,7 +110,7 @@ def __ge__(self, other): return self.__eq__(other) or self.__gt__(other) -class Highest(object): +class Highest: """A value that is always compares higher than any other object except itself. """ @@ -148,7 +146,7 @@ def __ge__(self, other): # Base classes -class Query(object): +class Query: """Abstract base class for all queries. 
Note that this base class implements __or__, __and__, and __sub__ to allow @@ -410,8 +408,7 @@ def leaves(self): yield self else: for q in self.children(): - for qq in q.leaves(): - yield qq + yield from q.leaves() def iter_all_terms(self, phrases=True): """Returns an iterator of (fieldname, text) pairs for all terms in @@ -436,8 +433,7 @@ def iter_all_terms(self, phrases=True): for q in self.leaves(): if q.has_terms(): - for t in q.terms(phrases=phrases): - yield t + yield from q.terms(phrases=phrases) def all_tokens(self, boost=1.0): """Returns an iterator of :class:`analysis.Token` objects corresponding @@ -449,13 +445,11 @@ def all_tokens(self, boost=1.0): """ if self.is_leaf(): - for token in self.tokens(boost): - yield token + yield from self.tokens(boost) else: boost *= self.boost if hasattr(self, "boost") else 1.0 for child in self.children(): - for token in child.all_tokens(boost): - yield token + yield from child.all_tokens(boost) def tokens(self, boost=1.0, exreader=None): """Yields zero or more :class:`analysis.Token` objects corresponding to @@ -496,7 +490,7 @@ def requires(self): # Subclasses should implement the _add_required_to(qset) method - return set([self]) + return {self} def field(self): """Returns the field this query matches in, or None if this query does @@ -611,7 +605,7 @@ def __call__(self): return self def __repr__(self): - return "<%s>" % (self.__class__.__name__) + return f"<{self.__class__.__name__}>" def __eq__(self, other): return isinstance(other, _NullQuery) @@ -705,11 +699,7 @@ def __init__(self, fieldname=None, boost=1.0): self.boost = boost def __repr__(self): - return "%s(%r, boost=%s)" % ( - self.__class__.__name__, - self.fieldname, - self.boost, - ) + return f"{self.__class__.__name__}({self.fieldname!r}, boost={self.boost})" def __eq__(self, other): return ( diff --git a/src/whoosh/query/ranges.py b/src/whoosh/query/ranges.py index 75ff8a82..cd96e063 100644 --- a/src/whoosh/query/ranges.py +++ b/src/whoosh/query/ranges.py @@ -25,18 +25,17 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division from whoosh.compat import b, u -from whoosh.query import qcore, terms, compound, wrappers +from whoosh.query import compound, qcore, terms, wrappers from whoosh.util.times import datetime_to_long -class RangeMixin(object): +class RangeMixin: # Contains methods shared by TermRange and NumericRange def __repr__(self): - return "%s(%r, %r, %r, %s, %s, boost=%s, constantscore=%s)" % ( + return "{}({!r}, {!r}, {!r}, {}, {}, boost={}, constantscore={})".format( self.__class__.__name__, self.fieldname, self.start, @@ -255,12 +254,45 @@ def _btexts(self, ixreader): class NumericRange(RangeMixin, qcore.Query): - """A range query for NUMERIC fields. Takes advantage of tiered indexing + """ + A range query for NUMERIC fields. Takes advantage of tiered indexing to speed up large ranges by matching at a high resolution at the edges of the range and a low resolution in the middle. - >>> # Match numbers from 10 to 5925 in the "number" field. - >>> nr = NumericRange("number", 10, 5925) + Example Usage: + # Match numbers from 10 to 5925 in the "number" field. + nr = NumericRange("number", 10, 5925) + + Methods: + __init__(self, fieldname, start, end, startexcl=False, endexcl=False, boost=1.0, constantscore=True): + Initializes a NumericRange object with the specified parameters. 
+ + simplify(self, ixreader): + Simplifies the range query by compiling it and calling the simplify method on the compiled query. + + estimate_size(self, ixreader): + Estimates the size of the range query by compiling it and calling the estimate_size method on the compiled query. + + estimate_min_size(self, ixreader): + Estimates the minimum size of the range query by compiling it and calling the estimate_min_size method on the compiled query. + + docs(self, searcher): + Retrieves the documents that match the range query by compiling it and calling the docs method on the compiled query. + + _compile_query(self, ixreader): + Compiles the range query by preparing the start and end values, generating subqueries for different resolutions, and combining them into a single query. + + matcher(self, searcher, context=None): + Retrieves the matcher for the range query by compiling it and calling the matcher method on the compiled query. + + Fields: + fieldname: The name of the field to search. + start: Match terms equal to or greater than this number. This should be a number type, not a string. + end: Match terms equal to or less than this number. This should be a number type, not a string. + startexcl: If True, the range start is exclusive. If False, the range start is inclusive. + endexcl: If True, the range end is exclusive. If False, the range end is inclusive. + boost: Boost factor that should be applied to the raw score of results matched by this query. + constantscore: If True, the compiled query returns a constant score (the value of the boost keyword argument) instead of actually scoring the matched terms. This gives a nice speed boost and won't affect the results in most cases since numeric ranges will almost always be used as a filter. """ def __init__( @@ -300,6 +332,13 @@ def __init__( self.boost = boost self.constantscore = constantscore + # NumericRange should raise an error if the start and end parameters are not numeric. + # Some of the old tests fail if this is enabled. We need to confirm if this is a bug or not. 
+ # if not isinstance(self.start, (int, float)): + # raise ValueError("NumericRange: start parameter must be numeric") + # if not isinstance(self.end, (int, float)): + # raise ValueError("NumericRange: end parameter must be numeric") + def simplify(self, ixreader): return self._compile_query(ixreader).simplify(ixreader) @@ -319,7 +358,7 @@ def _compile_query(self, ixreader): field = ixreader.schema[self.fieldname] if not isinstance(field, NUMERIC): - raise Exception("NumericRange: field %r is not numeric" % self.fieldname) + raise ValueError(f"NumericRange: field {self.fieldname} is not numeric") start = self.start if start is not None: @@ -393,7 +432,7 @@ def __init__( start = datetime_to_long(start) if end: end = datetime_to_long(end) - super(DateRange, self).__init__( + super().__init__( fieldname, start, end, @@ -404,7 +443,7 @@ def __init__( ) def __repr__(self): - return "%s(%r, %r, %r, %s, %s, boost=%s)" % ( + return "{}({!r}, {!r}, {!r}, {}, {}, boost={})".format( self.__class__.__name__, self.fieldname, self.startdate, diff --git a/src/whoosh/query/spans.py b/src/whoosh/query/spans.py index 13a8321a..8d13add5 100644 --- a/src/whoosh/query/spans.py +++ b/src/whoosh/query/spans.py @@ -43,15 +43,16 @@ """ -from whoosh.matching import mcore, wrappers, binary -from whoosh.query import Query, And, AndMaybe, Or, Term +from whoosh.matching import binary, mcore, wrappers +from whoosh.query.compound import And, AndMaybe, Or +from whoosh.query.qcore import Query +from whoosh.query.terms import Term from whoosh.util import make_binary_tree - # Span class -class Span(object): +class Span: __slots__ = ("start", "end", "startchar", "endchar", "boost") def __init__(self, start, end=None, startchar=None, endchar=None, boost=1.0): @@ -199,7 +200,7 @@ class SpanWrappingMatcher(wrappers.WrappingMatcher): """ def __init__(self, child): - super(SpanWrappingMatcher, self).__init__(child) + super().__init__(child) self._spans = None if self.is_active(): self._find_next() @@ -279,7 +280,7 @@ def _subm(self, s, context=None): return self.q.matcher(s, context) def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self.q) + return f"{self.__class__.__name__}({self.q!r})" def __eq__(self, other): return other and self.__class__ is other.__class__ and self.q == other.q @@ -339,7 +340,7 @@ def matcher(self, searcher, context=None): class SpanFirstMatcher(SpanWrappingMatcher): def __init__(self, child, limit=0): self.limit = limit - super(SpanFirst.SpanFirstMatcher, self).__init__(child) + super().__init__(child) def copy(self): return self.__class__(self.child.copy(), limit=self.limit) @@ -480,7 +481,7 @@ def __init__(self, a, b, slop=1, ordered=True, mindist=1): self.ordered = ordered self.mindist = mindist isect = binary.IntersectionMatcher(a, b) - super(SpanNear.SpanNearMatcher, self).__init__(isect) + super().__init__(isect) def copy(self): return self.__class__( @@ -633,7 +634,7 @@ def __init__(self, ms, slop=1, ordered=True, mindist=1): self.ordered = ordered self.mindist = mindist isect = make_binary_tree(binary.IntersectionMatcher, ms) - super(SpanNear2.SpanNear2Matcher, self).__init__(isect) + super().__init__(isect) def copy(self): return self.__class__( @@ -727,7 +728,7 @@ def __init__(self, a, b): self.a = a self.b = b um = binary.UnionMatcher(a, b) - super(SpanOr.SpanOrMatcher, self).__init__(um) + super().__init__(um) def _get_spans(self): a_active = self.a.is_active() @@ -798,7 +799,7 @@ def __init__(self, a, b): self.a = a self.b = b amm = binary.AndMaybeMatcher(a, b) - 
super(SpanNot._Matcher, self).__init__(amm) + super().__init__(amm) def _get_spans(self): if self.a.id() == self.b.id(): @@ -847,7 +848,7 @@ def __init__(self, a, b): self.a = a self.b = b im = binary.IntersectionMatcher(a, b) - super(SpanContains._Matcher, self).__init__(im) + super().__init__(im) def _get_spans(self): spans = [] @@ -893,7 +894,7 @@ def __init__(self, a, b): self.a = a self.b = b im = binary.IntersectionMatcher(a, b) - super(SpanBefore._Matcher, self).__init__(im) + super().__init__(im) def _get_spans(self): bminstart = min(bspan.start for bspan in self.b.spans()) @@ -923,7 +924,7 @@ class _Matcher(SpanBiMatcher): def __init__(self, a, b): self.a = a im = binary.IntersectionMatcher(a, b) - super(SpanCondition._Matcher, self).__init__(im) + super().__init__(im) def _get_spans(self): return self.a.spans() diff --git a/src/whoosh/query/terms.py b/src/whoosh/query/terms.py index 6922a2b7..cffd471b 100644 --- a/src/whoosh/query/terms.py +++ b/src/whoosh/query/terms.py @@ -25,7 +25,7 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division + import copy import fnmatch import re @@ -61,9 +61,9 @@ def __eq__(self, other): ) def __repr__(self): - r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text) + r = f"{self.__class__.__name__}({self.fieldname!r}, {self.text!r}" if self.boost != 1.0: - r += ", boost=%s" % self.boost + r += f", boost={self.boost}" r += ")" return r @@ -266,9 +266,9 @@ def __eq__(self, other): ) def __repr__(self): - r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text) + r = f"{self.__class__.__name__}({self.fieldname!r}, {self.text!r}" if self.boost != 1: - r += ", boost=%s" % self.boost + r += f", boost={self.boost}" r += ")" return r @@ -318,7 +318,7 @@ class Prefix(PatternQuery): """ def __unicode__(self): - return "%s:%s*" % (self.fieldname, self.text) + return f"{self.fieldname}:{self.text}*" __str__ = __unicode__ @@ -345,7 +345,7 @@ class Wildcard(PatternQuery): SPECIAL_CHARS = frozenset("*?[") def __unicode__(self): - return "%s:%s" % (self.fieldname, self.text) + return f"{self.fieldname}:{self.text}" __str__ = __unicode__ @@ -391,7 +391,7 @@ class Regex(PatternQuery): SPECIAL_CHARS = frozenset("{}()[].?*+^$\\") def __unicode__(self): - return '%s:r"%s"' % (self.fieldname, self.text) + return f'{self.fieldname}:r"{self.text}"' __str__ = __unicode__ @@ -537,9 +537,9 @@ def __init__(self, fieldname, text, boost=1.0): self.boost = boost def __repr__(self): - r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text) + r = f"{self.__class__.__name__}({self.fieldname!r}, {self.text!r}" if self.boost != 1: - r += ", boost=%s" % self.boost + r += f", boost={self.boost}" r += ")" return r diff --git a/src/whoosh/query/wrappers.py b/src/whoosh/query/wrappers.py index 5f07a80d..a3fc62eb 100644 --- a/src/whoosh/query/wrappers.py +++ b/src/whoosh/query/wrappers.py @@ -25,7 +25,7 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
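
The `super(Outer.Inner, self)` calls rewritten in spans.py above had to spell out the nested class path (`SpanNot._Matcher`, `SpanFirst.SpanFirstMatcher`, and so on), which must be kept in sync with every rename. The zero-argument form is equivalent at runtime and drops that coupling; a minimal illustration with invented names:

```python
class Base:
    def __init__(self, payload):
        self.payload = payload


class Outer:
    class _Inner(Base):
        def __init__(self, payload):
            # Python 2 style had to name the full nested path:
            #     super(Outer._Inner, self).__init__(payload)
            # which silently goes stale if Outer or _Inner is renamed.
            super().__init__(payload)


print(Outer._Inner("spans").payload)  # -> spans
```
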
-from __future__ import division + from array import array from whoosh import matching @@ -38,7 +38,7 @@ def __init__(self, child): self.child = child def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self.child) + return f"{self.__class__.__name__}({self.child!r})" def __hash__(self): return hash(self.__class__.__name__) ^ hash(self.child) @@ -102,7 +102,7 @@ def __eq__(self, other): return other and self.__class__ is other.__class__ and self.query == other.query def __repr__(self): - return "%s(%s)" % (self.__class__.__name__, repr(self.query)) + return f"{self.__class__.__name__}({repr(self.query)})" def __unicode__(self): return u("NOT ") + text_type(self.query) diff --git a/src/whoosh/reading.py b/src/whoosh/reading.py index f9f0a13e..484fff6c 100644 --- a/src/whoosh/reading.py +++ b/src/whoosh/reading.py @@ -28,21 +28,19 @@ """This module contains classes that allow reading from an index. """ -from math import log from bisect import bisect_right -from heapq import heapify, heapreplace, heappop, nlargest +from heapq import heapify, heappop, heapreplace, nlargest +from math import log from cached_property import cached_property from whoosh import columns -from whoosh.compat import abstractmethod -from whoosh.compat import zip_, next, iteritems +from whoosh.compat import abstractmethod, iteritems, next, zip_ from whoosh.filedb.filestore import OverlayStorage from whoosh.matching import MultiMatcher from whoosh.support.levenshtein import distance from whoosh.system import emptybytes - # Exceptions @@ -61,7 +59,7 @@ class TermNotFound(Exception): # Term Info base class -class TermInfo(object): +class TermInfo: """Represents a set of statistics about a term. This object is returned by :meth:`IndexReader.term_info`. These statistics may be useful for optimizations and scoring algorithms. @@ -145,7 +143,7 @@ def max_id(self): # Reader base class -class IndexReader(object): +class IndexReader: """Do not instantiate this object directly. 
Instead use Index.reader().""" def __enter__(self): @@ -664,7 +662,7 @@ def generation(self): return self._gen def __repr__(self): - return "%s(%r, %r)" % (self.__class__.__name__, self._storage, self._segment) + return f"{self.__class__.__name__}({self._storage!r}, {self._segment!r})" def __contains__(self, term): if self.is_closed: @@ -745,9 +743,9 @@ def _test_field(self, fieldname): if self.is_closed: raise ReaderClosed if fieldname not in self.schema: - raise TermNotFound("No field %r" % fieldname) + raise TermNotFound(f"No field {fieldname!r}") if self.schema[fieldname].format is None: - raise TermNotFound("Field %r is not indexed" % fieldname) + raise TermNotFound(f"Field {fieldname!r} is not indexed") def indexed_field_names(self): return self._terms.indexed_field_names() @@ -778,7 +776,7 @@ def term_info(self, fieldname, text): try: return self._terms.term_info(fieldname, text) except KeyError: - raise TermNotFound("%s:%r" % (fieldname, text)) + raise TermNotFound(f"{fieldname}:{text!r}") def expand_prefix(self, fieldname, prefix): self._test_field(fieldname) @@ -834,7 +832,7 @@ def postings(self, fieldname, text, scorer=None): if self.is_closed: raise ReaderClosed if fieldname not in self.schema: - raise TermNotFound("No field %r" % fieldname) + raise TermNotFound(f"No field {fieldname!r}") text = self._text_to_bytes(fieldname, text) format_ = self.schema[fieldname].format matcher = self._terms.matcher(fieldname, text, format_, scorer=scorer) @@ -847,10 +845,10 @@ def vector(self, docnum, fieldname, format_=None): if self.is_closed: raise ReaderClosed if fieldname not in self.schema: - raise TermNotFound("No field %r" % fieldname) + raise TermNotFound(f"No field {fieldname!r}") vformat = format_ or self.schema[fieldname].vector if not vformat: - raise Exception("No vectors are stored for field %r" % fieldname) + raise Exception(f"No vectors are stored for field {fieldname!r}") return self._perdoc.vector(docnum, fieldname, vformat) def cursor(self, fieldname): @@ -884,7 +882,7 @@ def column_reader(self, fieldname, column=None, reverse=False, translate=True): fieldobj = self.schema[fieldname] column = column or fieldobj.column_type if not column: - raise Exception("No column for field %r in %r" % (fieldname, self)) + raise Exception(f"No column for field {fieldname!r} in {self!r}") if self._perdoc.has_column(fieldname): creader = self._perdoc.column_reader(fieldname, column) @@ -954,7 +952,7 @@ def is_deleted(self, docnum): return False def stored_fields(self, docnum): - raise KeyError("No document number %s" % docnum) + raise KeyError(f"No document number {docnum}") def all_stored_fields(self): return iter([]) @@ -984,13 +982,13 @@ def doc_field_length(self, docnum, fieldname, default=0): return default def postings(self, fieldname, text, scorer=None): - raise TermNotFound("%s:%r" % (fieldname, text)) + raise TermNotFound(f"{fieldname}:{text!r}") def has_vector(self, docnum, fieldname): return False def vector(self, docnum, fieldname, format_=None): - raise KeyError("No document number %s" % docnum) + raise KeyError(f"No document number {docnum}") def most_frequent_terms(self, fieldname, number=5, prefix=""): return iter([]) @@ -1224,8 +1222,7 @@ def column_reader(self, fieldname, column=None, reverse=False, translate=True): def all_stored_fields(self): for reader in self.readers: - for result in reader.all_stored_fields(): - yield result + yield from reader.all_stored_fields() def doc_count_all(self): return sum(dr.doc_count_all() for dr in self.readers) @@ -1281,7 +1278,7 @@ def 
combine_terminfos(tis): return TermInfo(w, df, ml, xl, xw, mid, xid) -class MultiCursor(object): +class MultiCursor: def __init__(self, cursors): self._cursors = [c for c in cursors if c.is_valid()] self._low = [] diff --git a/src/whoosh/scoring.py b/src/whoosh/scoring.py index 40de76e5..ad515b43 100644 --- a/src/whoosh/scoring.py +++ b/src/whoosh/scoring.py @@ -29,16 +29,15 @@ This module contains classes for scoring (and sorting) search results. """ -from __future__ import division + from math import log, pi from whoosh.compat import iteritems - # Base classes -class WeightingModel(object): +class WeightingModel: """Abstract base class for scoring models. A WeightingModel object provides a method, ``scorer``, which returns an instance of :class:`whoosh.scoring.Scorer`. @@ -85,7 +84,7 @@ def final(self, searcher, docnum, score): return score -class BaseScorer(object): +class BaseScorer: """Base class for "scorer" implementations. A scorer provides a method for scoring a document, and sometimes methods for rating the "quality" of a document and a matcher's current "block", to implement quality-based diff --git a/src/whoosh/searching.py b/src/whoosh/searching.py index af5e3b8e..87af340d 100644 --- a/src/whoosh/searching.py +++ b/src/whoosh/searching.py @@ -29,14 +29,13 @@ """ -from __future__ import division import copy import weakref from math import ceil from whoosh import classify, highlight, query, scoring -from whoosh.compat import iteritems, itervalues, iterkeys, range -from whoosh.idsets import DocIdSet, BitSet +from whoosh.compat import iteritems, iterkeys, itervalues +from whoosh.idsets import BitSet, DocIdSet from whoosh.reading import TermNotFound @@ -62,7 +61,7 @@ class TimeLimit(Exception): # Context class -class SearchContext(object): +class SearchContext: """A container for information about the current search that may be used by the collector or the query objects to change how they operate. """ @@ -87,7 +86,7 @@ def __init__(self, needs_current=False, weighting=None, top_query=None, limit=0) self.limit = limit def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self.__dict__) + return f"{self.__class__.__name__}({self.__dict__!r})" def set(self, **kwargs): ctx = copy.copy(self) @@ -98,7 +97,7 @@ def set(self, **kwargs): # Searcher class -class Searcher(object): +class Searcher: """Wraps an :class:`~whoosh.reading.IndexReader` object and provides methods for searching the index. 
""" @@ -242,7 +241,9 @@ def up_to_date(self): """ if not self._ix: - raise Exception("No reference to index") + raise ValueError( + "No reference to index" + ) # Replace generic exception with ValueError return self._ix.latest_generation() == self.ixreader.generation() def refresh(self): @@ -259,7 +260,7 @@ def refresh(self): """ if not self._ix: - raise Exception("No reference to index") + raise ValueError("No reference to index") if self._ix.latest_generation() == self.reader().generation(): return self @@ -474,7 +475,7 @@ def _filter_to_comb(self, obj): elif isinstance(obj, query.Query): c = self._query_to_comb(obj) else: - raise Exception("Don't know what to do with filter object %r" % obj) + raise ValueError(f"Don't know what to do with filter object {obj}") return c @@ -612,7 +613,7 @@ def more_like( [query.Term(fieldname, word, boost=weight) for word, weight in kts] ) - return self.search(q, limit=top, filter=filter, mask=set([docnum])) + return self.search(q, limit=top, filter=filter, mask={docnum}) def search_page(self, query, pagenum, pagelen=10, **kwargs): """This method is Like the :meth:`Searcher.search` method, but returns @@ -977,7 +978,7 @@ def correct_query( return sqc.correct_query(q, qstring) -class Results(object): +class Results: """This object is returned by a Searcher. This object represents the results of a search query. You can mostly use it as if it was a list of dictionaries, where each dictionary is the stored fields of the document at @@ -1018,11 +1019,7 @@ def __init__( self._char_cache = {} def __repr__(self): - return "" % ( - len(self.top_n), - self.q, - self.runtime, - ) + return f"" def __len__(self): """Returns the total number of documents that matched the query. Note @@ -1052,7 +1049,7 @@ def __getitem__(self, n): else: if n >= len(self.top_n): raise IndexError( - "results[%r]: Results only has %s hits" % (n, len(self.top_n)) + f"results[{n!r}]: Results only has {len(self.top_n)} hits" ) return Hit(self, self.top_n[n][1], n, self.top_n[n][0]) @@ -1145,7 +1142,7 @@ def groups(self, name=None): # for Python 3 name = list(self._facetmaps.keys())[0] elif name not in self._facetmaps: - raise KeyError("%r not in facet names %r" % (name, self.facet_names())) + raise KeyError(f"{name!r} not in facet names {self.facet_names()!r}") return self._facetmaps[name].as_dict() def has_exact_length(self): @@ -1400,7 +1397,7 @@ def upgrade_and_extend(self, results): self.top_n = arein + notin + other -class Hit(object): +class Hit: """Represents a single search result ("hit") in a Results object. 
This object acts like a dictionary of the matching document's stored @@ -1561,7 +1558,7 @@ def more_like_this( ) def __repr__(self): - return "<%s %r>" % (self.__class__.__name__, self.fields()) + return f"<{self.__class__.__name__} {self.fields()!r}>" def __eq__(self, other): if isinstance(other, Hit): @@ -1609,23 +1606,11 @@ def iterkeys(self): def itervalues(self): return itervalues(self.fields()) - def get(self, key, default=None): - return self.fields().get(key, default) - - def __setitem__(self, key, value): - raise NotImplementedError("You cannot modify a search result") - - def __delitem__(self, key, value): - raise NotImplementedError("You cannot modify a search result") - - def clear(self): - raise NotImplementedError("You cannot modify a search result") - - def update(self, dict=None, **kwargs): + def __delitem__(self, key): raise NotImplementedError("You cannot modify a search result") -class ResultsPage(object): +class ResultsPage: """Represents a single page out of a longer list of results, as returned by :func:`whoosh.searching.Searcher.search_page`. Supports a subset of the interface of the :class:`~whoosh.searching.Results` object, namely getting diff --git a/src/whoosh/sorting.py b/src/whoosh/sorting.py index b3cfc9a6..cd36fb99 100644 --- a/src/whoosh/sorting.py +++ b/src/whoosh/sorting.py @@ -28,14 +28,12 @@ from array import array from collections import defaultdict -from whoosh.compat import string_type -from whoosh.compat import iteritems, izip, range - +from whoosh.compat import iteritems, izip, string_type # Faceting objects -class FacetType(object): +class FacetType: """Base class for "facets", aspects that can be sorted/faceted.""" maptype = None @@ -65,7 +63,7 @@ def default_name(self): return "facet" -class Categorizer(object): +class Categorizer: """Base class for categorizer objects which compute a key value for a document based on certain criteria, for use in sorting/faceting. 
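
The facet and categorizer classes in this file are driven through the ``sortedby`` and ``groupedby`` keywords of `Searcher.search()`. A small self-contained sketch of the sorting path; the schema and field names are invented for illustration:

```python
# Illustrative sketch: sort search results by a sortable NUMERIC column.
from whoosh.fields import NUMERIC, TEXT, Schema
from whoosh.filedb.filestore import RamStorage
from whoosh.query import Every
from whoosh.sorting import FieldFacet

schema = Schema(title=TEXT(stored=True), price=NUMERIC(sortable=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(title="cheap", price=5)
    w.add_document(title="dear", price=50)

with ix.searcher() as s:
    # FieldFacet builds a Categorizer over the "price" column at search time
    results = s.search(Every(), sortedby=FieldFacet("price", reverse=True))
    print([hit["title"] for hit in results])  # ['dear', 'cheap']
```
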
@@ -224,7 +222,7 @@ def __init__(self, global_searcher, fieldname, reverse=False):
        self._creader = None

    def __repr__(self):
-        return "%s(%r, %r, reverse=%r)" % (
+        return "{}({!r}, {!r}, reverse={!r})".format(
            self.__class__.__name__,
            self._fieldobj,
            self._fieldname,
@@ -501,7 +499,7 @@ def _range_name(self, startval, endval):

    def _queries(self):
        if not self.gap:
-            raise Exception("No gap secified (%r)" % self.gap)
+            raise Exception(f"No gap specified ({self.gap!r})")
        if isinstance(self.gap, (list, tuple)):
            gaps = self.gap
            gapindex = 0
@@ -780,7 +778,7 @@ def __init__(self, items=None, maptype=None):
        self.maptype = maptype

    def __repr__(self):
-        return "%s(%r, %r)" % (self.__class__.__name__, self.facets, self.maptype)
+        return f"{self.__class__.__name__}({self.facets!r}, {self.maptype!r})"

    @classmethod
    def from_sortedby(cls, sortedby):
@@ -800,7 +798,7 @@ def _add(self, item):
        elif isinstance(item, string_type):
            self.add_field(item)
        else:
-            raise Exception("Don't know what to do with facet %r" % (item,))
+            raise Exception(f"Don't know what to do with facet {item!r}")

    def add_field(self, fieldname, reverse=False):
        self.facets.append(FieldFacet(fieldname, reverse=reverse))
@@ -819,7 +817,7 @@ def add_score(self):
    def add_facet(self, facet):
        if not isinstance(facet, FacetType):
            raise TypeError(
-                "%r is not a facet object, perhaps you meant " "add_field()" % (facet,)
+                f"{facet!r} is not a facet object, perhaps you meant add_field()"
            )
        self.facets.append(facet)
        return self
@@ -857,7 +855,7 @@ def key_to_name(self, key):
        )


-class Facets(object):
+class Facets:
    """Maps facet names to :class:`FacetType` objects, for creating
    multiple groupings of documents.

@@ -892,7 +890,7 @@ def from_groupedby(cls, groupedby):
            for item in groupedby:
                facets.add_facets(cls.from_groupedby(item))
        else:
-            raise Exception("Don't know what to do with groupedby=%r" % groupedby)
+            raise Exception(f"Don't know what to do with groupedby={groupedby!r}")

        return facets

@@ -931,7 +929,7 @@ def add_facet(self, name, facet):
        """Adds a :class:`FacetType` object under the given ``name``."""

        if not isinstance(facet, FacetType):
-            raise Exception("%r:%r is not a facet" % (name, facet))
+            raise Exception(f"{name!r}:{facet!r} is not a facet")
        self.facets[name] = facet
        return self

@@ -941,7 +939,7 @@ def add_facets(self, facets, replace=True):
        """

        if not isinstance(facets, (dict, Facets)):
-            raise Exception("%r is not a Facets object or dict" % facets)
+            raise Exception(f"{facets!r} is not a Facets object or dict")
        for name, facet in facets.items():
            if replace or name not in self.facets:
                self.facets[name] = facet
@@ -951,7 +949,7 @@

# Objects for holding facet groups


-class FacetMap(object):
+class FacetMap:
    """Base class for objects holding the results of grouping search results
    by a Facet.  Use an object's ``as_dict()`` method to access the results.
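
The `FacetMap` subclasses below (``OrderedList``, ``UnorderedList``, ``Count``, ``Best``) are selected through a facet's ``maptype`` argument and surfaced by ``Results.groups()``. A sketch along the same lines as above, again with invented names:

```python
# Illustrative sketch: group results by a field and count hits per group.
from whoosh import sorting
from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import RamStorage
from whoosh.query import Every

schema = Schema(title=TEXT(stored=True), tag=ID(stored=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(title="a", tag="red")
    w.add_document(title="b", tag="blue")
    w.add_document(title="c", tag="red")

with ix.searcher() as s:
    # maptype=Count stores per-group totals instead of per-group doc ids
    facet = sorting.FieldFacet("tag", maptype=sorting.Count)
    results = s.search(Every(), groupedby=facet)
    print(results.groups("tag"))  # e.g. {'red': 2, 'blue': 1}
```
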
@@ -999,7 +997,7 @@ def __init__(self): self.dict = defaultdict(list) def __repr__(self): - return "<%s %r>" % (self.__class__.__name__, self.dict) + return f"<{self.__class__.__name__} {self.dict!r}>" def add(self, groupname, docid, sortkey): self.dict[groupname].append((sortkey, docid)) @@ -1025,7 +1023,7 @@ def __init__(self): self.dict = defaultdict(list) def __repr__(self): - return "<%s %r>" % (self.__class__.__name__, self.dict) + return f"<{self.__class__.__name__} {self.dict!r}>" def add(self, groupname, docid, sortkey): self.dict[groupname].append(docid) @@ -1045,7 +1043,7 @@ def __init__(self): self.dict = defaultdict(int) def __repr__(self): - return "<%s %r>" % (self.__class__.__name__, self.dict) + return f"<{self.__class__.__name__} {self.dict!r}>" def add(self, groupname, docid, sortkey): self.dict[groupname] += 1 @@ -1067,7 +1065,7 @@ def __init__(self): self.bestkeys = {} def __repr__(self): - return "<%s %r>" % (self.__class__.__name__, self.bestids) + return f"<{self.__class__.__name__} {self.bestids!r}>" def add(self, groupname, docid, sortkey): if groupname not in self.bestids or sortkey < self.bestkeys[groupname]: @@ -1112,7 +1110,7 @@ def add_sortable(writer, fieldname, facet, column=None): if fieldname in schema: field = schema[fieldname] if field.column_type: - raise Exception("%r field is already sortable" % fieldname) + raise Exception(f"{fieldname!r} field is already sortable") if column: if fieldname not in schema: @@ -1124,7 +1122,7 @@ def add_sortable(writer, fieldname, facet, column=None): if fieldname in schema: column = field.default_column() else: - raise Exception("Field %r does not exist" % fieldname) + raise Exception(f"Field {fieldname!r} does not exist") searcher = writer.searcher() catter = facet.categorizer(searcher) @@ -1133,7 +1131,7 @@ def add_sortable(writer, fieldname, facet, column=None): reader = subsearcher.reader() if reader.has_column(fieldname): - raise Exception("%r field already has a column" % fieldname) + raise Exception(f"{fieldname!r} field already has a column") codec = reader.codec() segment = reader.segment() diff --git a/src/whoosh/spelling.py b/src/whoosh/spelling.py index cbc1eca2..e929b4f1 100644 --- a/src/whoosh/spelling.py +++ b/src/whoosh/spelling.py @@ -33,13 +33,12 @@ from heapq import heappush, heapreplace from whoosh import highlight -from whoosh.compat import iteritems, range - +from whoosh.compat import iteritems # Corrector objects -class Corrector(object): +class Corrector: """ Base class for spelling correction objects. Concrete sub-classes should implement the ``_suggestions`` method. @@ -125,8 +124,8 @@ def __init__(self, wordlist): self.wordlist = wordlist def _suggestions(self, text, maxdist, prefix): - from whoosh.automata.lev import levenshtein_automaton from whoosh.automata.fsa import find_all_matches + from whoosh.automata.lev import levenshtein_automaton seen = set() for mxd in range(1, maxdist + 1): @@ -137,7 +136,7 @@ def _suggestions(self, text, maxdist, prefix): seen.add(sug) yield (0 - mxd), sug - class Skipper(object): + class Skipper: def __init__(self, data): self.data = data self.i = 0 @@ -177,7 +176,7 @@ def _suggestions(self, text, maxdist, prefix): # Query correction -class Correction(object): +class Correction: """ Represents the corrected version of a user query string. 
Has the following attributes: @@ -218,7 +217,7 @@ def __init__(self, q, qstring, corr_q, tokens): self.string = "" def __repr__(self): - return "%s(%r, %r)" % (self.__class__.__name__, self.query, self.string) + return f"{self.__class__.__name__}({self.query!r}, {self.string!r})" def format_string(self, formatter): """ @@ -242,7 +241,7 @@ def format_string(self, formatter): # QueryCorrector objects -class QueryCorrector(object): +class QueryCorrector: """ Base class for objects that correct words in a user query. """ diff --git a/src/whoosh/support/base85.py b/src/whoosh/support/base85.py index 66e7915c..adb9e74c 100644 --- a/src/whoosh/support/base85.py +++ b/src/whoosh/support/base85.py @@ -9,9 +9,6 @@ import struct -from whoosh.compat import range - - # Instead of using the character set from the ascii85 algorithm, I put the # characters in order so that the encoded text sorts properly (my life would be # a lot easier if they had just done that from the start) diff --git a/src/whoosh/support/bench.py b/src/whoosh/support/bench.py index 3ec9d6f0..d4495745 100644 --- a/src/whoosh/support/bench.py +++ b/src/whoosh/support/bench.py @@ -25,13 +25,13 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import division + import os.path from optparse import OptionParser from shutil import rmtree from whoosh import index, qparser, query, scoring -from whoosh.util import now, find_object +from whoosh.util import find_object, now try: import xappy # type: ignore @@ -57,7 +57,7 @@ def __init__(self, d): pass -class Module(object): +class Module: def __init__(self, bench, options, args): self.bench = bench self.options = options @@ -76,7 +76,7 @@ def finish(self, **kwargs): pass def _process_result(self, d): - attrname = "process_result_%s" % self.options.lib + attrname = f"process_result_{self.options.lib}" if hasattr(self.bench.spec, attrname): method = getattr(self.bench.spec, attrname) self._process_result = method @@ -102,7 +102,7 @@ def results(self, r): yield self._process_result(hit) -class Spec(object): +class Spec: headline_field = "title" main_field = "body" @@ -134,7 +134,7 @@ def print_results(self, ls): class WhooshModule(Module): def indexer(self, create=True): schema = self.bench.spec.whoosh_schema() - path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname) + path = os.path.join(self.options.dir, f"{self.options.indexname}_whoosh") if not os.path.exists(path): os.mkdir(path) @@ -169,7 +169,7 @@ def finish(self, merge=True, optimize=False): self.writer.commit(merge=merge, optimize=optimize) def searcher(self): - path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname) + path = os.path.join(self.options.dir, f"{self.options.indexname}_whoosh") ix = index.open_dir(path) self.srch = ix.searcher(weighting=scoring.PL2()) self.parser = qparser.QueryParser(self.bench.spec.main_field, schema=ix.schema) @@ -194,7 +194,7 @@ def findterms(self, terms): class XappyModule(Module): def indexer(self, **kwargs): - path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname) + path = os.path.join(self.options.dir, f"{self.options.indexname}_xappy") conn = self.bench.spec.xappy_connection(path) return conn @@ -213,7 +213,7 @@ def finish(self, conn): conn.flush() def searcher(self): - path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname) + path = os.path.join(self.options.dir, f"{self.options.indexname}_xappy") return 
xappy.SearchConnection(path) def query(self, conn): @@ -237,7 +237,7 @@ def results(self, r): class XapianModule(Module): def indexer(self, **kwargs): - path = os.path.join(self.options.dir, "%s_xapian" % self.options.indexname) + path = os.path.join(self.options.dir, f"{self.options.indexname}_xapian") self.database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN) self.ixer = xapian.TermGenerator() @@ -255,7 +255,7 @@ def finish(self, **kwargs): self.database.flush() def searcher(self): - path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname) + path = os.path.join(self.options.dir, f"{self.options.indexname}_xappy") self.db = xapian.Database(path) self.enq = xapian.Enquire(self.db) self.qp = xapian.QueryParser() @@ -320,12 +320,14 @@ def findterms(self, terms): class ZcatalogModule(Module): def indexer(self, **kwargs): - from ZODB.FileStorage import FileStorage # type: ignore # type: ignore @UnresolvedImport - from ZODB.DB import DB # type: ignore # type: ignore @UnresolvedImport - from zcatalog import catalog # type: ignore # type: ignore @UnresolvedImport import transaction # type: ignore # type: ignore @UnresolvedImport + from zcatalog import catalog # type: ignore # type: ignore @UnresolvedImport + from ZODB.DB import DB # type: ignore # type: ignore @UnresolvedImport + from ZODB.FileStorage import ( + FileStorage, # type: ignore # type: ignore @UnresolvedImport + ) - dir = os.path.join(self.options.dir, "%s_zcatalog" % self.options.indexname) + dir = os.path.join(self.options.dir, f"{self.options.indexname}_zcatalog") if os.path.exists(dir): rmtree(dir) os.mkdir(dir) @@ -360,11 +362,13 @@ def finish(self, **kwargs): del self.zcatalog_count def searcher(self): - from ZODB.FileStorage import FileStorage # type: ignore # type: ignore @UnresolvedImport from ZODB.DB import DB # type: ignore # type: ignore @UnresolvedImport + from ZODB.FileStorage import ( + FileStorage, # type: ignore # type: ignore @UnresolvedImport + ) path = os.path.join( - self.options.dir, "%s_zcatalog" % self.options.indexname, "index" + self.options.dir, f"{self.options.indexname}_zcatalog", "index" ) storage = FileStorage(path) db = DB(storage) @@ -393,9 +397,10 @@ def results(self, r): class NucularModule(Module): def indexer(self, create=True): import shutil + from nucular import Nucular # type: ignore # type: ignore @UnresolvedImport - dir = os.path.join(self.options.dir, "%s_nucular" % self.options.indexname) + dir = os.path.join(self.options.dir, f"{self.options.indexname}_nucular") if create: if os.path.exists(dir): shutil.rmtree(dir) @@ -426,7 +431,7 @@ def finish(self, **kwargs): def searcher(self): from nucular import Nucular # type: ignore # type: ignore @UnresolvedImport - dir = os.path.join(self.options.dir, "%s_nucular" % self.options.indexname) + dir = os.path.join(self.options.dir, f"{self.options.indexname}_nucular") self.archive = Nucular.Nucular(dir) def query(self): @@ -442,7 +447,7 @@ def findterms(self, terms): yield q.resultDictionaries() -class Bench(object): +class Bench: libs = { "whoosh": WhooshModule, "xappy": XappyModule, @@ -453,7 +458,7 @@ class Bench(object): } def index(self, lib): - print("Indexing with %s..." 
% lib) + print(f"Indexing with {lib}...") options = self.options every = None if options.every is None else int(options.every) @@ -499,7 +504,7 @@ def index(self, lib): "Total time to index %d documents: %0.3f secs (%0.3f minutes)" % (count, totaltime, totaltime / 60.0) ) - print("Indexed %0.3f docs/s" % (count / totaltime)) + print(f"Indexed {count / totaltime:0.3f} docs/s") def search(self, lib): lib.searcher() @@ -519,7 +524,7 @@ def search_file(self, lib): terms = [line.strip() for line in f] f.close() - print("Searching %d terms with %s" % (len(terms), lib)) + print(f"Searching {len(terms)} terms with {lib}") lib.searcher() starttime = now() for r in lib.findterms(terms): @@ -566,7 +571,7 @@ def _parser(self, name): dest="indexname", metavar="PREFIX", help="Index name prefix.", - default="%s_index" % name, + default=f"{name}_index", ) p.add_option( "-U", @@ -723,7 +728,7 @@ def run(self, specclass): self.args = args if options.lib not in self.libs: - raise Exception("Unknown library: %r" % options.lib) + raise Exception(f"Unknown library: {options.lib!r}") lib = self.libs[options.lib](self, options, args) self.spec = specclass(options, args) diff --git a/src/whoosh/support/bitstream.py b/src/whoosh/support/bitstream.py index 682afbb8..50984639 100644 --- a/src/whoosh/support/bitstream.py +++ b/src/whoosh/support/bitstream.py @@ -8,11 +8,10 @@ from whoosh.system import _LONG_SIZE - _bitsperlong = _LONG_SIZE * 8 -class BitStreamReader(object): +class BitStreamReader: def __init__(self, source): self._totalbits = len(source) * _bitsperlong self._position = 0 diff --git a/src/whoosh/support/bitvector.py b/src/whoosh/support/bitvector.py index 7790735b..d7ef507d 100644 --- a/src/whoosh/support/bitvector.py +++ b/src/whoosh/support/bitvector.py @@ -6,206 +6,453 @@ from array import array #: Table of the number of '1' bits in each byte (0-255) -BYTE_COUNTS = array('B', [ - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8]) - - -class BitVector(object): +BYTE_COUNTS = array( + "B", + [ + 0, + 1, + 1, + 2, + 1, + 2, + 2, + 3, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 1, + 2, + 2, + 3, + 2, + 3, + 3, + 4, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 
5, + 5, + 6, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 2, + 3, + 3, + 4, + 3, + 4, + 4, + 5, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 3, + 4, + 4, + 5, + 4, + 5, + 5, + 6, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 4, + 5, + 5, + 6, + 5, + 6, + 6, + 7, + 5, + 6, + 6, + 7, + 6, + 7, + 7, + 8, + ], +) + + +class BitVector: """ Implements a memory-efficient array of bits. - + >>> bv = BitVector(10) >>> bv >>> bv[5] = True >>> bv - + You can initialize the BitVector using an iterable of integers representing bit positions to turn on. - + >>> bv2 = BitVector(10, [2, 4, 7]) >>> bv2 >>> bv[2] True - + BitVector supports bit-wise logic operations & (and), | (or), and ^ (xor) between itself and another BitVector of equal size, or itself and a collection of integers (usually a set() or frozenset()). - + >>> bv | bv2 - + Note that ``BitVector.__len__()`` returns the number of "on" bits, not the size of the bit array. This is to make BitVector interchangeable with a set()/frozenset() of integers. To get the size, use BitVector.size. """ - + def __init__(self, size, source=None, bits=None): self.size = size - + if bits: self.bits = bits else: self.bits = array("B", ([0x00] * ((size >> 3) + 1))) - + if source: set = self.set for num in source: set(num) - + self.bcount = None - + def __eq__(self, other): if isinstance(other, BitVector): return self.bits == other.bits return False - + def __repr__(self): - return "" % self.__str__() - + return f"" + def __len__(self): # This returns the count of "on" bits instead of the size to # make BitVector exchangeable with a set() object. 
return self.count() - + def __contains__(self, index): return self[index] - + def __iter__(self): get = self.__getitem__ for i in range(0, self.size): if get(i): yield i - + def __str__(self): get = self.__getitem__ - return "".join("1" if get(i) else "0" - for i in range(0, self.size)) - + return "".join("1" if get(i) else "0" for i in range(0, self.size)) + def __nonzero__(self): return self.count() > 0 - + def __getitem__(self, index): return self.bits[index >> 3] & (1 << (index & 7)) != 0 - + def __setitem__(self, index, value): if value: self.set(index) else: self.clear(index) - + def _logic(self, op, bitv): if self.size != bitv.size: raise ValueError("Can't combine bitvectors of different sizes") res = BitVector(size=self.size) lpb = map(op, self.bits, bitv.bits) - res.bits = array('B', lpb) + res.bits = array("B", lpb) return res - + def union(self, other): return self.__or__(other) - + def intersection(self, other): return self.__and__(other) - + def __and__(self, other): if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__and__, other) - + def __or__(self, other): if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__or__, other) - + def __ror__(self, other): return self.__or__(other) - + def __rand__(self, other): return self.__and__(other) - + def __xor__(self, other): if not isinstance(other, BitVector): other = BitVector(self.size, source=other) return self._logic(operator.__xor__, other) - + def __invert__(self): - return BitVector(self.size, source=(x for x in range(self.size) if x not in self)) - + return BitVector( + self.size, source=(x for x in range(self.size) if x not in self) + ) + def count(self): """Returns the number of "on" bits in the bit array.""" - + if self.bcount is None: self.bcount = sum(BYTE_COUNTS[b & 0xFF] for b in self.bits) return self.bcount - + def set(self, index): """Turns the bit at the given position on.""" - + if index >= self.size: - raise IndexError("Position %s greater than the size of the vector" % repr(index)) + raise IndexError( + f"Position {repr(index)} greater than the size of the vector" + ) self.bits[index >> 3] |= 1 << (index & 7) self.bcount = None - + def clear(self, index): """Turns the bit at the given position off.""" - + self.bits[index >> 3] &= ~(1 << (index & 7)) self.bcount = None - + def set_from(self, iterable): """Takes an iterable of integers representing positions, and turns on the bits at those positions. """ - + set = self.set for index in iterable: set(index) - + def copy(self): """Returns a copy of this BitArray.""" - + return BitVector(self.size, bits=self.bits) -class BitSet(object): +class BitSet: """A set-like object for holding positive integers. It is dynamically backed by either a set or BitVector depending on how many numbers are in the set. - + Provides ``add``, ``remove``, ``union``, ``intersection``, ``__contains__``, ``__len__``, ``__iter__``, ``__and__``, ``__or__``, and ``__nonzero__`` methods. 
""" - + def __init__(self, size, source=None): self.size = size - + self._back = () self._switch(size > 256) - + if source: add = self.add for num in source: add(num) - + def _switch(self, toset): if toset: self._back = set(self._back) @@ -215,7 +462,7 @@ def _switch(self, toset): self._back = BitVector() self.add = self._back.set self.remove = self._vec_remove - + self.__contains__ = self._back.__contains__ self.__len__ = self._back.__len__ self.__iter__ = self._back.__iter__ @@ -226,13 +473,13 @@ def as_set(self): def union(self, other): return self.__or__(other) - + def intersection(self, other): return self.__and__(other) def __and__(self, other): self._back = self._back.intersection(other) - + def __or__(self, other): self._back = self._back.union(other) @@ -240,14 +487,8 @@ def _set_add(self, num): self._back.add(num) if len(self._back) * 4 > self.size // 8 + 32: self._switch(False) - + def _vec_remove(self, num): self._back.clear(num) if len(self._back) * 4 < self.size // 8 - 32: self._switch(True) - - - - - - diff --git a/src/whoosh/support/charset.py b/src/whoosh/support/charset.py index 2aef38a7..4acee88f 100644 --- a/src/whoosh/support/charset.py +++ b/src/whoosh/support/charset.py @@ -1,14 +1,12 @@ -# coding=utf-8 - """This module contains tools for working with Sphinx charset table files. These files are useful for doing case and accent folding. See :class:`whoosh.analysis.CharsetTokenizer` and :class:`whoosh.analysis.CharsetFilter`. """ -from collections import defaultdict import re +from collections import defaultdict -from whoosh.compat import izip, u, iteritems, unichr, range +from whoosh.compat import iteritems, izip, range, u, unichr # This is a straightforward accent-folding charset taken from Carlos Bueno's # article "Accent Folding for Auto-Complete", for use with CharsetFilter. @@ -732,7 +730,7 @@ # The unicode.translate() method actually requires a dictionary mapping # character *numbers* to characters, for some reason. -accent_map = dict((ord(k), v) for k, v in iteritems(accent_map)) +accent_map = {ord(k): v for k, v in iteritems(accent_map)} # This Sphinx charset table taken from http://speeple.com/unicode-maps.txt @@ -1297,7 +1295,7 @@ def charspec_to_int(string): elif len(string) == 1: return ord(string) else: - raise Exception("Can't convert charspec: %r" % string) + raise Exception(f"Can't convert charspec: {string!r}") def charset_table_to_dict(tablestring): @@ -1378,5 +1376,5 @@ def charset_table_to_dict(tablestring): pass continue - raise Exception("Don't know what to do with %r" % item) + raise Exception(f"Don't know what to do with {item!r}") return dict(map) diff --git a/src/whoosh/support/levenshtein.py b/src/whoosh/support/levenshtein.py index 2ee222fb..6f15e6d4 100644 --- a/src/whoosh/support/levenshtein.py +++ b/src/whoosh/support/levenshtein.py @@ -2,8 +2,6 @@ Contains functions implementing edit distance algorithms. """ -from whoosh.compat import range - def levenshtein(seq1, seq2, limit=None): """Returns the Levenshtein edit distance between two strings.""" diff --git a/src/whoosh/support/pyparsing.py b/src/whoosh/support/pyparsing.py index 6d25fd81..8db43458 100644 --- a/src/whoosh/support/pyparsing.py +++ b/src/whoosh/support/pyparsing.py @@ -21,10 +21,9 @@ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# -#from __future__ import generators +# from __future__ import generators -__doc__ = \ -""" +__doc__ = """ pyparsing module - Classes and methods to define and execute parsing grammars The pyparsing module is an alternative approach to creating and executing simple grammars, @@ -62,35 +61,120 @@ class names, and the use of '+', '|' and '^' operators. __versionTime__ = "17 February 2009 19:45" __author__ = "Paul McGuire " -import string -from weakref import ref as wkref import copy -import sys -import warnings import re import sre_constants +import string +import sys +import warnings +from weakref import ref as wkref from whoosh.support import unicode -#~ sys.stderr.write( "testing pyparsing module, version %s, %s\n" % (__version__,__versionTime__ ) ) + +# ~ sys.stderr.write( "testing pyparsing module, version %s, %s\n" % (__version__,__versionTime__ ) ) __all__ = [ -'And', 'CaselessKeyword', 'CaselessLiteral', 'CharsNotIn', 'Combine', 'Dict', 'Each', 'Empty', -'FollowedBy', 'Forward', 'GoToColumn', 'Group', 'Keyword', 'LineEnd', 'LineStart', 'Literal', -'MatchFirst', 'NoMatch', 'NotAny', 'OneOrMore', 'OnlyOnce', 'Optional', 'Or', -'ParseBaseException', 'ParseElementEnhance', 'ParseException', 'ParseExpression', 'ParseFatalException', -'ParseResults', 'ParseSyntaxException', 'ParserElement', 'QuotedString', 'RecursiveGrammarException', -'Regex', 'SkipTo', 'StringEnd', 'StringStart', 'Suppress', 'Token', 'TokenConverter', 'Upcase', -'White', 'Word', 'WordEnd', 'WordStart', 'ZeroOrMore', -'alphanums', 'alphas', 'alphas8bit', 'anyCloseTag', 'anyOpenTag', 'cStyleComment', 'col', -'commaSeparatedList', 'commonHTMLEntity', 'countedArray', 'cppStyleComment', 'dblQuotedString', -'dblSlashComment', 'delimitedList', 'dictOf', 'downcaseTokens', 'empty', 'getTokensEndLoc', 'hexnums', -'htmlComment', 'javaStyleComment', 'keepOriginalText', 'line', 'lineEnd', 'lineStart', 'lineno', -'makeHTMLTags', 'makeXMLTags', 'matchOnlyAtCol', 'matchPreviousExpr', 'matchPreviousLiteral', -'nestedExpr', 'nullDebugAction', 'nums', 'oneOf', 'opAssoc', 'operatorPrecedence', 'printables', -'punc8bit', 'pythonStyleComment', 'quotedString', 'removeQuotes', 'replaceHTMLEntity', -'replaceWith', 'restOfLine', 'sglQuotedString', 'srange', 'stringEnd', -'stringStart', 'traceParseAction', 'unicodeString', 'upcaseTokens', 'withAttribute', -'indentedBlock', 'originalTextFor', + "And", + "CaselessKeyword", + "CaselessLiteral", + "CharsNotIn", + "Combine", + "Dict", + "Each", + "Empty", + "FollowedBy", + "Forward", + "GoToColumn", + "Group", + "Keyword", + "LineEnd", + "LineStart", + "Literal", + "MatchFirst", + "NoMatch", + "NotAny", + "OneOrMore", + "OnlyOnce", + "Optional", + "Or", + "ParseBaseException", + "ParseElementEnhance", + "ParseException", + "ParseExpression", + "ParseFatalException", + "ParseResults", + "ParseSyntaxException", + "ParserElement", + "QuotedString", + "RecursiveGrammarException", + "Regex", + "SkipTo", + "StringEnd", + "StringStart", + "Suppress", + "Token", + "TokenConverter", + "Upcase", + "White", + "Word", + "WordEnd", + "WordStart", + "ZeroOrMore", + "alphanums", + "alphas", + "alphas8bit", + "anyCloseTag", + "anyOpenTag", + "cStyleComment", + "col", + "commaSeparatedList", + "commonHTMLEntity", + "countedArray", + "cppStyleComment", + "dblQuotedString", + "dblSlashComment", + "delimitedList", + "dictOf", + "downcaseTokens", + "empty", + "getTokensEndLoc", + "hexnums", + "htmlComment", + "javaStyleComment", + "keepOriginalText", + "line", + "lineEnd", + "lineStart", + "lineno", + "makeHTMLTags", + 
"makeXMLTags", + "matchOnlyAtCol", + "matchPreviousExpr", + "matchPreviousLiteral", + "nestedExpr", + "nullDebugAction", + "nums", + "oneOf", + "opAssoc", + "operatorPrecedence", + "printables", + "punc8bit", + "pythonStyleComment", + "quotedString", + "removeQuotes", + "replaceHTMLEntity", + "replaceWith", + "restOfLine", + "sglQuotedString", + "srange", + "stringEnd", + "stringStart", + "traceParseAction", + "unicodeString", + "upcaseTokens", + "withAttribute", + "indentedBlock", + "originalTextFor", ] @@ -107,12 +191,13 @@ class names, and the use of '+', '|' and '^' operators. _MAX_INT = sys.maxint if not _PY3K: + def _ustr(obj): """Drop-in replacement for str(obj) that tries to be Unicode friendly. It first tries - str(obj). If that fails with a UnicodeEncodeError, then it tries unicode(obj). It - then < returns the unicode object | encodes it with the default encoding | ... >. + str(obj). If that fails with a UnicodeEncodeError, then it tries unicode(obj). It + then < returns the unicode object | encodes it with the default encoding | ... >. """ - if isinstance(obj,unicode): + if isinstance(obj, unicode): return obj try: @@ -129,48 +214,54 @@ def _ustr(obj): return unicode(obj) # Else encode it... but how? There are many choices... :) # Replace unprintables with escape codes? - #return unicode(obj).encode(sys.getdefaultencoding(), 'backslashreplace_errors') + # return unicode(obj).encode(sys.getdefaultencoding(), 'backslashreplace_errors') # Replace unprintables with question marks? - #return unicode(obj).encode(sys.getdefaultencoding(), 'replace') + # return unicode(obj).encode(sys.getdefaultencoding(), 'replace') # ... else: _ustr = str unichr = chr if not _PY3K: + def _str2dict(strg): - return dict( [(c,0) for c in strg] ) + return {c: 0 for c in strg} else: _str2dict = set + def _xml_escape(data): """Escape &, <, >, ", ', etc. 
in a string of data.""" # ampersand must be replaced first - from_symbols = '&><"\'' - to_symbols = ['&'+s+';' for s in "amp gt lt quot apos".split()] - for from_,to_ in zip(from_symbols, to_symbols): + from_symbols = "&><\"'" + to_symbols = ["&" + s + ";" for s in "amp gt lt quot apos".split()] + for from_, to_ in zip(from_symbols, to_symbols): data = data.replace(from_, to_) return data -class _Constants(object): + +class _Constants: pass + if not _PY3K: - alphas = string.lowercase + string.uppercase + alphas = string.lowercase + string.uppercase else: - alphas = string.ascii_lowercase + string.ascii_uppercase -nums = string.digits -hexnums = nums + "ABCDEFabcdef" -alphanums = alphas + nums + alphas = string.ascii_lowercase + string.ascii_uppercase +nums = string.digits +hexnums = nums + "ABCDEFabcdef" +alphanums = alphas + nums _bslash = chr(92) -printables = "".join( [ c for c in string.printable if c not in string.whitespace ] ) +printables = "".join([c for c in string.printable if c not in string.whitespace]) + class ParseBaseException(Exception): """base exception class for all parsing runtime exceptions""" + # Performance tuning: we construct a *lot* of these, so keep this # constructor as small and fast as possible - def __init__( self, pstr, loc=0, msg=None, elem=None ): + def __init__(self, pstr, loc=0, msg=None, elem=None): self.loc = loc if msg is None: self.msg = pstr @@ -180,101 +271,134 @@ def __init__( self, pstr, loc=0, msg=None, elem=None ): self.pstr = pstr self.parserElement = elem - def __getattr__( self, aname ): + def __getattr__(self, aname): """supported attributes by name are: - - lineno - returns the line number of the exception text - - col - returns the column number of the exception text - - line - returns the line containing the exception text + - lineno - returns the line number of the exception text + - col - returns the column number of the exception text + - line - returns the line containing the exception text """ - if( aname == "lineno" ): - return lineno( self.loc, self.pstr ) - elif( aname in ("col", "column") ): - return col( self.loc, self.pstr ) - elif( aname == "line" ): - return line( self.loc, self.pstr ) + if aname == "lineno": + return lineno(self.loc, self.pstr) + elif aname in ("col", "column"): + return col(self.loc, self.pstr) + elif aname == "line": + return line(self.loc, self.pstr) else: raise AttributeError(aname) - def __str__( self ): - return "%s (at char %d), (line:%d, col:%d)" % \ - ( self.msg, self.loc, self.lineno, self.column ) - def __repr__( self ): + def __str__(self): + return "%s (at char %d), (line:%d, col:%d)" % ( + self.msg, + self.loc, + self.lineno, + self.column, + ) + + def __repr__(self): return _ustr(self) - def markInputline( self, markerString = ">!<" ): + + def markInputline(self, markerString=">!<"): """Extracts the exception line from the input string, and marks - the location of the exception with a special symbol. + the location of the exception with a special symbol. 
""" line_str = self.line line_column = self.column - 1 if markerString: - line_str = "".join( [line_str[:line_column], - markerString, line_str[line_column:]]) + line_str = "".join( + [line_str[:line_column], markerString, line_str[line_column:]] + ) return line_str.strip() + def __dir__(self): - return "loc msg pstr parserElement lineno col line " \ - "markInputLine __str__ __repr__".split() + return ( + "loc msg pstr parserElement lineno col line " + "markInputLine __str__ __repr__".split() + ) + class ParseException(ParseBaseException): """exception thrown when parse expressions don't match class; - supported attributes by name are: - - lineno - returns the line number of the exception text - - col - returns the column number of the exception text - - line - returns the line containing the exception text + supported attributes by name are: + - lineno - returns the line number of the exception text + - col - returns the column number of the exception text + - line - returns the line containing the exception text """ + pass + class ParseFatalException(ParseBaseException): """user-throwable exception thrown when inconsistent parse content - is found; stops all parsing immediately""" + is found; stops all parsing immediately""" + pass + class ParseSyntaxException(ParseFatalException): """just like ParseFatalException, but thrown internally when an - ErrorStop indicates that parsing is to stop immediately because - an unbacktrackable syntax error has been found""" + ErrorStop indicates that parsing is to stop immediately because + an unbacktrackable syntax error has been found""" + def __init__(self, pe): - super(ParseSyntaxException, self).__init__( - pe.pstr, pe.loc, pe.msg, pe.parserElement) - -#~ class ReparseException(ParseBaseException): - #~ """Experimental class - parse actions can raise this exception to cause - #~ pyparsing to reparse the input string: - #~ - with a modified input string, and/or - #~ - with a modified start location - #~ Set the values of the ReparseException in the constructor, and raise the - #~ exception in a parse action to cause pyparsing to use the new string/location. - #~ Setting the values as None causes no change to be made. - #~ """ - #~ def __init_( self, newstring, restartLoc ): - #~ self.newParseText = newstring - #~ self.reparseLoc = restartLoc + super().__init__(pe.pstr, pe.loc, pe.msg, pe.parserElement) + + +# ~ class ReparseException(ParseBaseException): +# ~ """Experimental class - parse actions can raise this exception to cause +# ~ pyparsing to reparse the input string: +# ~ - with a modified input string, and/or +# ~ - with a modified start location +# ~ Set the values of the ReparseException in the constructor, and raise the +# ~ exception in a parse action to cause pyparsing to use the new string/location. +# ~ Setting the values as None causes no change to be made. 
+# ~ """ +# ~ def __init_( self, newstring, restartLoc ): +# ~ self.newParseText = newstring +# ~ self.reparseLoc = restartLoc + class RecursiveGrammarException(Exception): """exception thrown by validate() if the grammar could be improperly recursive""" - def __init__( self, parseElementList ): + + def __init__(self, parseElementList): self.parseElementTrace = parseElementList - def __str__( self ): - return "RecursiveGrammarException: %s" % self.parseElementTrace + def __str__(self): + return f"RecursiveGrammarException: {self.parseElementTrace}" + -class _ParseResultsWithOffset(object): - def __init__(self,p1,p2): - self.tup = (p1,p2) - def __getitem__(self,i): +class _ParseResultsWithOffset: + def __init__(self, p1, p2): + self.tup = (p1, p2) + + def __getitem__(self, i): return self.tup[i] + def __repr__(self): return repr(self.tup) - def setOffset(self,i): - self.tup = (self.tup[0],i) -class ParseResults(object): + def setOffset(self, i): + self.tup = (self.tup[0], i) + + +class ParseResults: """Structured parse results, to provide multiple means of access to the parsed data: - - as a list (len(results)) - - by list index (results[0], results[1], etc.) - - by attribute (results.) - """ - __slots__ = ( "__toklist", "__tokdict", "__doinit", "__name", "__parent", "__accumNames", "__weakref__" ) - def __new__(cls, toklist, name=None, asList=True, modal=True ): + - as a list (len(results)) + - by list index (results[0], results[1], etc.) + - by attribute (results.) + """ + + __slots__ = ( + "__toklist", + "__tokdict", + "__doinit", + "__name", + "__parent", + "__accumNames", + "__weakref__", + ) + + def __new__(cls, toklist, name=None, asList=True, modal=True): if isinstance(toklist, cls): return toklist retobj = object.__new__(cls) @@ -283,7 +407,7 @@ def __new__(cls, toklist, name=None, asList=True, modal=True ): # Performance tuning: we construct a *lot* of these, so keep this # constructor as small and fast as possible - def __init__( self, toklist, name=None, asList=True, modal=True ): + def __init__(self, toklist, name=None, asList=True, modal=True): if self.__doinit: self.__doinit = False self.__name = None @@ -293,61 +417,67 @@ def __init__( self, toklist, name=None, asList=True, modal=True ): self.__toklist = toklist[:] else: self.__toklist = [toklist] - self.__tokdict = dict() + self.__tokdict = {} if name: if not modal: self.__accumNames[name] = 0 - if isinstance(name,int): - name = _ustr(name) # will always return a str, but use _ustr for consistency + if isinstance(name, int): + name = _ustr( + name + ) # will always return a str, but use _ustr for consistency self.__name = name - if not toklist in (None,'',[]): - if isinstance(toklist,basestring): - toklist = [ toklist ] + if not toklist in (None, "", []): + if isinstance(toklist, basestring): + toklist = [toklist] if asList: - if isinstance(toklist,ParseResults): - self[name] = _ParseResultsWithOffset(toklist.copy(),0) + if isinstance(toklist, ParseResults): + self[name] = _ParseResultsWithOffset(toklist.copy(), 0) else: - self[name] = _ParseResultsWithOffset(ParseResults(toklist[0]),0) + self[name] = _ParseResultsWithOffset( + ParseResults(toklist[0]), 0 + ) self[name].__name = name else: try: self[name] = toklist[0] - except (KeyError,TypeError,IndexError): + except (KeyError, TypeError, IndexError): self[name] = toklist - def __getitem__( self, i ): - if isinstance( i, (int,slice) ): + def __getitem__(self, i): + if isinstance(i, (int, slice)): return self.__toklist[i] else: if i not in self.__accumNames: return 
self.__tokdict[i][-1][0] else: - return ParseResults([ v[0] for v in self.__tokdict[i] ]) + return ParseResults([v[0] for v in self.__tokdict[i]]) - def __setitem__( self, k, v ): - if isinstance(v,_ParseResultsWithOffset): - self.__tokdict[k] = self.__tokdict.get(k,list()) + [v] + def __setitem__(self, k, v): + if isinstance(v, _ParseResultsWithOffset): + self.__tokdict[k] = self.__tokdict.get(k, list()) + [v] sub = v[0] - elif isinstance(k,int): + elif isinstance(k, int): self.__toklist[k] = v sub = v else: - self.__tokdict[k] = self.__tokdict.get(k,list()) + [_ParseResultsWithOffset(v,0)] + self.__tokdict[k] = self.__tokdict.get(k, list()) + [ + _ParseResultsWithOffset(v, 0) + ] sub = v - if isinstance(sub,ParseResults): + if isinstance(sub, ParseResults): sub.__parent = wkref(self) - def __delitem__( self, i ): - if isinstance(i,(int,slice)): - mylen = len( self.__toklist ) + def __delitem__(self, i): + if isinstance(i, (int, slice)): + mylen = len(self.__toklist) del self.__toklist[i] # convert int to slice if isinstance(i, int): if i < 0: i += mylen - i = slice(i, i+1) + i = slice(i, i + 1) # get removed indices removed = list(range(*i.indices(mylen))) removed.reverse() @@ -356,91 +486,107 @@ def __delitem__( self, i ): occurrences = self.__tokdict[name] for j in removed: for k, (value, position) in enumerate(occurrences): - occurrences[k] = _ParseResultsWithOffset(value, position - (position > j)) + occurrences[k] = _ParseResultsWithOffset( + value, position - (position > j) + ) else: del self.__tokdict[i] - def __contains__( self, k ): + def __contains__(self, k): return k in self.__tokdict - def __len__( self ): return len( self.__toklist ) - def __bool__(self): return len( self.__toklist ) > 0 + def __len__(self): + return len(self.__toklist) + + def __bool__(self): + return len(self.__toklist) > 0 + __nonzero__ = __bool__ - def __iter__( self ): return iter( self.__toklist ) - def __reversed__( self ): return iter( reversed(self.__toklist) ) - def keys( self ): + + def __iter__(self): + return iter(self.__toklist) + + def __reversed__(self): + return iter(reversed(self.__toklist)) + + def keys(self): """Returns all named result keys.""" return self.__tokdict.keys() - def pop( self, index=-1 ): + def pop(self, index=-1): """Removes and returns item at specified index (default=last). 
- Will work with either numeric indices or dict-key indicies.""" + Will work with either numeric indices or dict-key indicies.""" ret = self[index] del self[index] return ret def get(self, key, defaultValue=None): """Returns named result matching the given key, or if there is no - such name, then returns the given defaultValue or None if no - defaultValue is specified.""" + such name, then returns the given defaultValue or None if no + defaultValue is specified.""" if key in self: return self[key] else: return defaultValue - def insert( self, index, insStr ): + def insert(self, index, insStr): self.__toklist.insert(index, insStr) # fixup indices in token dictionary for name in self.__tokdict: occurrences = self.__tokdict[name] for k, (value, position) in enumerate(occurrences): - occurrences[k] = _ParseResultsWithOffset(value, position + (position > index)) + occurrences[k] = _ParseResultsWithOffset( + value, position + (position > index) + ) - def items( self ): + def items(self): """Returns all named result keys and values as a list of tuples.""" - return [(k,self[k]) for k in self.__tokdict] + return [(k, self[k]) for k in self.__tokdict] - def values( self ): + def values(self): """Returns all named result values.""" - return [ v[-1][0] for v in self.__tokdict.values() ] + return [v[-1][0] for v in self.__tokdict.values()] - def __getattr__( self, name ): + def __getattr__(self, name): if name not in self.__slots__: if name in self.__tokdict: if name not in self.__accumNames: return self.__tokdict[name][-1][0] else: - return ParseResults([ v[0] for v in self.__tokdict[name] ]) + return ParseResults([v[0] for v in self.__tokdict[name]]) else: return "" return None - def __add__( self, other ): + def __add__(self, other): ret = self.copy() ret += other return ret - def __iadd__( self, other ): + def __iadd__(self, other): if other.__tokdict: offset = len(self.__toklist) - addoffset = ( lambda a: (a<0 and offset) or (a+offset) ) + addoffset = lambda a: (a < 0 and offset) or (a + offset) otheritems = other.__tokdict.items() - otherdictitems = [(k, _ParseResultsWithOffset(v[0],addoffset(v[1])) ) - for (k,vlist) in otheritems for v in vlist] - for k,v in otherdictitems: + otherdictitems = [ + (k, _ParseResultsWithOffset(v[0], addoffset(v[1]))) + for (k, vlist) in otheritems + for v in vlist + ] + for k, v in otherdictitems: self[k] = v - if isinstance(v[0],ParseResults): + if isinstance(v[0], ParseResults): v[0].__parent = wkref(self) - + self.__toklist += other.__toklist - self.__accumNames.update( other.__accumNames ) + self.__accumNames.update(other.__accumNames) del other return self - def __repr__( self ): - return "(%s, %s)" % ( repr( self.__toklist ), repr( self.__tokdict ) ) + def __repr__(self): + return f"({repr(self.__toklist)}, {repr(self.__tokdict)})" - def __str__( self ): + def __str__(self): out = "[" sep = "" for i in self.__toklist: @@ -452,46 +598,45 @@ def __str__( self ): out += "]" return out - def _asStringList( self, sep='' ): + def _asStringList(self, sep=""): out = [] for item in self.__toklist: if out and sep: out.append(sep) - if isinstance( item, ParseResults ): + if isinstance(item, ParseResults): out += item._asStringList() else: - out.append( _ustr(item) ) + out.append(_ustr(item)) return out - def asList( self ): + def asList(self): """Returns the parse results as a nested list of matching tokens, all converted to strings.""" out = [] for res in self.__toklist: - if isinstance(res,ParseResults): - out.append( res.asList() ) + if isinstance(res, ParseResults): 
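+                # recurse so nested ParseResults become nested plain lists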
+ out.append(res.asList()) else: - out.append( res ) + out.append(res) return out - def asDict( self ): + def asDict(self): """Returns the named parse results as dictionary.""" - return dict( self.items() ) + return dict(self.items()) - def copy( self ): + def copy(self): """Returns a new copy of a ParseResults object.""" - ret = ParseResults( self.__toklist ) + ret = ParseResults(self.__toklist) ret.__tokdict = self.__tokdict.copy() ret.__parent = self.__parent - ret.__accumNames.update( self.__accumNames ) + ret.__accumNames.update(self.__accumNames) ret.__name = self.__name return ret - def asXML( self, doctag=None, namedItemsOnly=False, indent="", formatted=True ): + def asXML(self, doctag=None, namedItemsOnly=False, indent="", formatted=True): """Returns the parse results as XML. Tags are created for tokens and lists that have defined results names.""" nl = "\n" out = [] - namedItems = dict( [ (v[1],k) for (k,vlist) in self.__tokdict.items() - for v in vlist ] ) + namedItems = {v[1]: k for (k, vlist) in self.__tokdict.items() for v in vlist} nextLevelIndent = indent + " " # collapse out indents if formatting is not desired @@ -513,21 +658,29 @@ def asXML( self, doctag=None, namedItemsOnly=False, indent="", formatted=True ): else: selfTag = "ITEM" - out += [ nl, indent, "<", selfTag, ">" ] + out += [nl, indent, "<", selfTag, ">"] worklist = self.__toklist - for i,res in enumerate(worklist): - if isinstance(res,ParseResults): + for i, res in enumerate(worklist): + if isinstance(res, ParseResults): if i in namedItems: - out += [ res.asXML(namedItems[i], - namedItemsOnly and doctag is None, - nextLevelIndent, - formatted)] + out += [ + res.asXML( + namedItems[i], + namedItemsOnly and doctag is None, + nextLevelIndent, + formatted, + ) + ] else: - out += [ res.asXML(None, - namedItemsOnly and doctag is None, - nextLevelIndent, - formatted)] + out += [ + res.asXML( + None, + namedItemsOnly and doctag is None, + nextLevelIndent, + formatted, + ) + ] else: # individual token, see if there is a name for it resTag = None @@ -539,16 +692,24 @@ def asXML( self, doctag=None, namedItemsOnly=False, indent="", formatted=True ): else: resTag = "ITEM" xmlBodyText = _xml_escape(_ustr(res)) - out += [ nl, nextLevelIndent, "<", resTag, ">", - xmlBodyText, - "" ] - - out += [ nl, indent, "" ] + out += [ + nl, + nextLevelIndent, + "<", + resTag, + ">", + xmlBodyText, + "", + ] + + out += [nl, indent, ""] return "".join(out) - def __lookup(self,sub): - for k,vlist in self.__tokdict.items(): - for v,loc in vlist: + def __lookup(self, sub): + for k, vlist in self.__tokdict.items(): + for v, loc in vlist: if sub is v: return k return None @@ -563,51 +724,54 @@ def getName(self): return par.__lookup(self) else: return None - elif (len(self) == 1 and - len(self.__tokdict) == 1 and - self.__tokdict.values()[0][0][1] in (0,-1)): + elif ( + len(self) == 1 + and len(self.__tokdict) == 1 + and self.__tokdict.values()[0][0][1] in (0, -1) + ): return self.__tokdict.keys()[0] else: return None - def dump(self,indent='',depth=0): + def dump(self, indent="", depth=0): """Diagnostic method for listing out the contents of a ParseResults. 
-           Accepts an optional indent argument so that this string can be embedded
-           in a nested display of other data."""
+        Accepts an optional indent argument so that this string can be embedded
+        in a nested display of other data."""
         out = []
-        out.append( indent+_ustr(self.asList()) )
+        out.append(indent + _ustr(self.asList()))
         keys = self.items()
         keys.sort()
-        for k,v in keys:
+        for k, v in keys:
             if out:
-                out.append('\n')
-            out.append( "%s%s- %s: " % (indent,(' '*depth), k) )
-            if isinstance(v,ParseResults):
+                out.append("\n")
+            out.append(f"{indent}{' ' * depth}- {k}: ")
+            if isinstance(v, ParseResults):
                 if v.keys():
-                    #~ out.append('\n')
-                    out.append( v.dump(indent,depth+1) )
-                    #~ out.append('\n')
+                    # ~ out.append('\n')
+                    out.append(v.dump(indent, depth + 1))
+                    # ~ out.append('\n')
                 else:
                     out.append(_ustr(v))
             else:
                 out.append(_ustr(v))
-        #~ out.append('\n')
+        # ~ out.append('\n')
         return "".join(out)

     # add support for pickle protocol
     def __getstate__(self):
-        return ( self.__toklist,
-                 ( self.__tokdict.copy(),
-                   self.__parent is not None and self.__parent() or None,
-                   self.__accumNames,
-                   self.__name ) )
-
-    def __setstate__(self,state):
+        return (
+            self.__toklist,
+            (
+                self.__tokdict.copy(),
+                self.__parent is not None and self.__parent() or None,
+                self.__accumNames,
+                self.__name,
+            ),
+        )
+
+    def __setstate__(self, state):
         self.__toklist = state[0]
-        self.__tokdict, \
-        par, \
-        inAccumNames, \
-        self.__name = state[1]
+        self.__tokdict, par, inAccumNames, self.__name = state[1]
         self.__accumNames = {}
         self.__accumNames.update(inAccumNames)
         if par is not None:
@@ -616,144 +780,163 @@ def __setstate__(self,state):
             self.__parent = None

     def __dir__(self):
-        return dir(super(ParseResults,self)) + self.keys()
+        return dir(super()) + self.keys()

-def col (loc,strg):
+
+def col(loc, strg):
     """Returns current column within a string, counting newlines as line separators.
-   The first column is number 1.
+    The first column is number 1.

-   Note: the default parsing behavior is to expand tabs in the input string
-   before starting the parsing process.  See L{I{ParserElement.parseString}} for more information
-   on parsing strings containing <TAB>s, and suggested methods to maintain a
-   consistent view of the parsed string, the parse location, and line and column
-   positions within the parsed string.
-   """
-   return (loc<len(strg) and strg[loc] == '\n') and 1 or loc - strg.rfind("\n", 0, loc)
+    Note: the default parsing behavior is to expand tabs in the input string
+    before starting the parsing process.  See L{I{ParserElement.parseString}} for more information
+    on parsing strings containing <TAB>s, and suggested methods to maintain a
+    consistent view of the parsed string, the parse location, and line and column
+    positions within the parsed string.
+    """
+    return (
+        (loc < len(strg) and strg[loc] == "\n") and 1 or loc - strg.rfind("\n", 0, loc)
+    )
+

-def lineno(loc,strg):
+def lineno(loc, strg):
     """Returns current line number within a string, counting newlines as line separators.
-   The first line is number 1.
-
-   Note: the default parsing behavior is to expand tabs in the input string
-   before starting the parsing process.  See L{I{ParserElement.parseString}} for more information
-   on parsing strings containing <TAB>s, and suggested methods to maintain a
-   consistent view of the parsed string, the parse location, and line and column
-   positions within the parsed string.
-   """
-   return strg.count("\n",0,loc) + 1
+    The first line is number 1.
+
+    Note: the default parsing behavior is to expand tabs in the input string
+    before starting the parsing process.  See L{I{ParserElement.parseString}} for more information
+    on parsing strings containing <TAB>s, and suggested methods to maintain a
+    consistent view of the parsed string, the parse location, and line and column
+    positions within the parsed string.
+    """
+    return strg.count("\n", 0, loc) + 1
+

-def line( loc, strg ):
-    """Returns the line of text containing loc within a string, counting newlines as line separators.
-    """
+
+def line(loc, strg):
+    """Returns the line of text containing loc within a string, counting newlines as line separators."""
     lastCR = strg.rfind("\n", 0, loc)
     nextCR = strg.find("\n", loc)
     if nextCR > 0:
-        return strg[lastCR+1:nextCR]
+        return strg[lastCR + 1 : nextCR]
     else:
-        return strg[lastCR+1:]
+        return strg[lastCR + 1 :]
+

-def _defaultStartDebugAction( instring, loc, expr ):
-    print ("Match " + _ustr(expr) + " at loc " + _ustr(loc) + "(%d,%d)" % ( lineno(loc,instring), col(loc,instring) ))
+
+def _defaultStartDebugAction(instring, loc, expr):
+    print(
+        "Match "
+        + _ustr(expr)
+        + " at loc "
+        + _ustr(loc)
+        + "(%d,%d)" % (lineno(loc, instring), col(loc, instring))
+    )
+

-def _defaultSuccessDebugAction( instring, startloc, endloc, expr, toks ):
-    print ("Matched " + _ustr(expr) + " -> " + str(toks.asList()))
+def _defaultSuccessDebugAction(instring, startloc, endloc, expr, toks):
+    print("Matched " + _ustr(expr) + " -> " + str(toks.asList()))
+

-def _defaultExceptionDebugAction( instring, loc, expr, exc ):
-    print ("Exception raised:" + _ustr(exc))
+def _defaultExceptionDebugAction(instring, loc, expr, exc):
+    print("Exception raised:" + _ustr(exc))
+

 def nullDebugAction(*args):
     """'Do-nothing' debug action, to suppress debugging output during parsing."""
     pass

-class ParserElement(object):
+
+class ParserElement:
     """Abstract base level parser element class."""
+
     DEFAULT_WHITE_CHARS = " \n\t\r"

-    def setDefaultWhitespaceChars( chars ):
-        """Overrides the default whitespace chars
-        """
+    def setDefaultWhitespaceChars(chars):
+        """Overrides the default whitespace chars"""
         ParserElement.DEFAULT_WHITE_CHARS = chars
+
     setDefaultWhitespaceChars = staticmethod(setDefaultWhitespaceChars)

-    def __init__( self, savelist=False ):
-        self.parseAction = list()
+    def __init__(self, savelist=False):
+        self.parseAction = []
         self.failAction = None
-        #~ self.name = ""  # don't define self.name, let subclasses try/except upcall
+        # ~ self.name = ""  # don't define self.name, let subclasses try/except upcall
         self.strRepr = None
         self.resultsName = None
         self.saveAsList = savelist
         self.skipWhitespace = True
         self.whiteChars = ParserElement.DEFAULT_WHITE_CHARS
         self.copyDefaultWhiteChars = True
-        self.mayReturnEmpty = False # used when checking for left-recursion
+        self.mayReturnEmpty = False  # used when checking for left-recursion
         self.keepTabs = False
-        self.ignoreExprs = list()
+        self.ignoreExprs = []
         self.debug = False
         self.streamlined = False
-        self.mayIndexError = True # used to optimize exception handling for subclasses that don't advance parse index
+        self.mayIndexError = True  # used to optimize exception handling for subclasses that don't advance parse index
         self.errmsg = ""
-        self.modalResults = True # used to mark results names as modal (report only last) or cumulative (list all)
-        self.debugActions = ( None, None, None ) #custom debug actions
+        self.modalResults = True  # used to mark results names as modal (report only last) or cumulative (list all)
+        self.debugActions = (None, None, None)  # custom debug actions
         self.re = None
-        self.callPreparse = True # used to avoid redundant calls to preParse
+        self.callPreparse = True  # used to avoid redundant calls to preParse
        self.callDuringTry = False

-    def copy( self ):
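+    # Illustrative sketch (not from the original change): copy() lets one base
+    # expression carry different parse actions, e.g.:
+    #
+    #     integer = Word(nums)
+    #     as_int = integer.copy().setParseAction(lambda s, l, t: int(t[0]))
+    #     as_hex = integer.copy().setParseAction(lambda s, l, t: int(t[0], 16))
+    #
+    # Each copy keeps its own parseAction list, so the variants stay independent.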
+    def copy(self):
         """Make a copy of this ParserElement.  Useful for defining different parse actions
-           for the same parsing pattern, using copies of the original parse element."""
-        cpy = copy.copy( self )
+        for the same parsing pattern, using copies of the original parse element."""
+        cpy = copy.copy(self)
         cpy.parseAction = self.parseAction[:]
         cpy.ignoreExprs = self.ignoreExprs[:]
         if self.copyDefaultWhiteChars:
             cpy.whiteChars = ParserElement.DEFAULT_WHITE_CHARS
         return cpy

-    def setName( self, name ):
+    def setName(self, name):
         """Define name for this expression, for use in debugging."""
         self.name = name
         self.errmsg = "Expected " + self.name
-        if hasattr(self,"exception"):
+        if hasattr(self, "exception"):
             self.exception.msg = self.errmsg
         return self

-    def setResultsName( self, name, listAllMatches=False ):
+    def setResultsName(self, name, listAllMatches=False):
         """Define name for referencing matching tokens as a nested attribute
-           of the returned parse results.
-           NOTE: this returns a *copy* of the original ParserElement object;
-           this is so that the client can define a basic element, such as an
-           integer, and reference it in multiple places with different names.
+        of the returned parse results.
+        NOTE: this returns a *copy* of the original ParserElement object;
+        this is so that the client can define a basic element, such as an
+        integer, and reference it in multiple places with different names.
         """
         newself = self.copy()
         newself.resultsName = name
         newself.modalResults = not listAllMatches
         return newself

-    def setBreak(self,breakFlag = True):
+    def setBreak(self, breakFlag=True):
         """Method to invoke the Python pdb debugger when this element is
-           about to be parsed. Set breakFlag to True to enable, False to
-           disable.
+        about to be parsed. Set breakFlag to True to enable, False to
+        disable.
""" if breakFlag: _parseMethod = self._parse + def breaker(instring, loc, doActions=True, callPreParse=True): import pdb + pdb.set_trace() - return _parseMethod( instring, loc, doActions, callPreParse ) + return _parseMethod(instring, loc, doActions, callPreParse) + breaker._originalParseMethod = _parseMethod self._parse = breaker else: - if hasattr(self._parse,"_originalParseMethod"): + if hasattr(self._parse, "_originalParseMethod"): self._parse = self._parse._originalParseMethod return self - def _normalizeParseActionArgs( f ): + def _normalizeParseActionArgs(f): """Internal method used to decorate parse actions that take fewer than 3 arguments, - so that all parse actions can be called as f(s,l,t).""" + so that all parse actions can be called as f(s,l,t).""" STAR_ARGS = 4 try: restore = None - if isinstance(f,type): + if isinstance(f, type): restore = f f = f.__init__ if not _PY3K: @@ -764,10 +947,10 @@ def _normalizeParseActionArgs( f ): return f numargs = codeObj.co_argcount if not _PY3K: - if hasattr(f,"im_self"): + if hasattr(f, "im_self"): numargs -= 1 else: - if hasattr(f,"__self__"): + if hasattr(f, "__self__"): numargs -= 1 if restore: f = restore @@ -784,10 +967,10 @@ def _normalizeParseActionArgs( f ): return f numargs = call_im_func_code.co_argcount if not _PY3K: - if hasattr(f.__call__,"im_self"): + if hasattr(f.__call__, "im_self"): numargs -= 1 else: - if hasattr(f.__call__,"__self__"): + if hasattr(f.__call__, "__self__"): numargs -= 0 except AttributeError: if not _PY3K: @@ -799,103 +982,111 @@ def _normalizeParseActionArgs( f ): return f numargs = call_func_code.co_argcount if not _PY3K: - if hasattr(f.__call__,"im_self"): + if hasattr(f.__call__, "im_self"): numargs -= 1 else: - if hasattr(f.__call__,"__self__"): + if hasattr(f.__call__, "__self__"): numargs -= 1 - - #~ print ("adding function %s with %d args" % (f.func_name,numargs)) + # ~ print ("adding function %s with %d args" % (f.func_name,numargs)) if numargs == 3: return f else: if numargs > 3: - def tmp(s,l,t): - return f(f.__call__.__self__, s,l,t) + + def tmp(s, l, t): + return f(f.__call__.__self__, s, l, t) + if numargs == 2: - def tmp(s,l,t): - return f(l,t) + + def tmp(s, l, t): + return f(l, t) elif numargs == 1: - def tmp(s,l,t): + + def tmp(s, l, t): return f(t) - else: #~ numargs == 0: - def tmp(s,l,t): + else: # ~ numargs == 0: + + def tmp(s, l, t): return f() + try: tmp.__name__ = f.__name__ - except (AttributeError,TypeError): + except (AttributeError, TypeError): # no need for special handling if attribute doesnt exist pass try: tmp.__doc__ = f.__doc__ - except (AttributeError,TypeError): + except (AttributeError, TypeError): # no need for special handling if attribute doesnt exist pass try: tmp.__dict__.update(f.__dict__) - except (AttributeError,TypeError): + except (AttributeError, TypeError): # no need for special handling if attribute doesnt exist pass return tmp + _normalizeParseActionArgs = staticmethod(_normalizeParseActionArgs) - def setParseAction( self, *fns, **kwargs ): + def setParseAction(self, *fns, **kwargs): """Define action to perform when successfully matching parse element definition. 
-           Parse action fn is a callable method with 0-3 arguments, called as fn(s,loc,toks),
-           fn(loc,toks), fn(toks), or just fn(), where:
-            - s   = the original string being parsed (see note below)
-            - loc = the location of the matching substring
-            - toks = a list of the matched tokens, packaged as a ParseResults object
-           If the functions in fns modify the tokens, they can return them as the return
-           value from fn, and the modified list of tokens will replace the original.
-           Otherwise, fn does not need to return any value.
-
-           Note: the default parsing behavior is to expand tabs in the input string
-           before starting the parsing process.  See L{I{parseString}} for more information
-           on parsing strings containing <TAB>s, and suggested methods to maintain a
-           consistent view of the parsed string, the parse location, and line and column
-           positions within the parsed string.
-        """
+        Parse action fn is a callable method with 0-3 arguments, called as fn(s,loc,toks),
+        fn(loc,toks), fn(toks), or just fn(), where:
+         - s   = the original string being parsed (see note below)
+         - loc = the location of the matching substring
+         - toks = a list of the matched tokens, packaged as a ParseResults object
+        If the functions in fns modify the tokens, they can return them as the return
+        value from fn, and the modified list of tokens will replace the original.
+        Otherwise, fn does not need to return any value.
+
+        Note: the default parsing behavior is to expand tabs in the input string
+        before starting the parsing process.  See L{I{parseString}} for more information
+        on parsing strings containing <TAB>s, and suggested methods to maintain a
+        consistent view of the parsed string, the parse location, and line and column
+        positions within the parsed string.
+        """
         self.parseAction = list(map(self._normalizeParseActionArgs, list(fns)))
-        self.callDuringTry = ("callDuringTry" in kwargs and kwargs["callDuringTry"])
+        self.callDuringTry = "callDuringTry" in kwargs and kwargs["callDuringTry"]
         return self

-    def addParseAction( self, *fns, **kwargs ):
+    def addParseAction(self, *fns, **kwargs):
         """Add parse action to expression's list of parse actions. See L{I{setParseAction}}."""
         self.parseAction += list(map(self._normalizeParseActionArgs, list(fns)))
-        self.callDuringTry = self.callDuringTry or ("callDuringTry" in kwargs and kwargs["callDuringTry"])
+        self.callDuringTry = self.callDuringTry or (
+            "callDuringTry" in kwargs and kwargs["callDuringTry"]
+        )
         return self

-    def setFailAction( self, fn ):
+    def setFailAction(self, fn):
         """Define action to perform if parsing fails at this expression.
-           Fail acton fn is a callable function that takes the arguments
-           fn(s,loc,expr,err) where:
-            - s = string being parsed
-            - loc = location where expression match was attempted and failed
-            - expr = the parse expression that failed
-            - err = the exception thrown
-           The function returns no value.  It may throw ParseFatalException
-           if it is desired to stop parsing immediately."""
+        Fail action fn is a callable function that takes the arguments
+        fn(s,loc,expr,err) where:
+         - s = string being parsed
+         - loc = location where expression match was attempted and failed
+         - expr = the parse expression that failed
+         - err = the exception thrown
+        The function returns no value.
It may throw ParseFatalException + if it is desired to stop parsing immediately.""" self.failAction = fn return self - def _skipIgnorables( self, instring, loc ): + def _skipIgnorables(self, instring, loc): exprsFound = True while exprsFound: exprsFound = False for e in self.ignoreExprs: try: while 1: - loc,dummy = e._parse( instring, loc ) + loc, dummy = e._parse(instring, loc) exprsFound = True except ParseException: pass return loc - def preParse( self, instring, loc ): + def preParse(self, instring, loc): if self.ignoreExprs: - loc = self._skipIgnorables( instring, loc ) + loc = self._skipIgnorables(instring, loc) if self.skipWhitespace: wt = self.whiteChars @@ -905,188 +1096,200 @@ def preParse( self, instring, loc ): return loc - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): return loc, [] - def postParse( self, instring, loc, tokenlist ): + def postParse(self, instring, loc, tokenlist): return tokenlist - #~ @profile - def _parseNoCache( self, instring, loc, doActions=True, callPreParse=True ): - debugging = ( self.debug ) #and doActions ) + # ~ @profile + def _parseNoCache(self, instring, loc, doActions=True, callPreParse=True): + debugging = self.debug # and doActions ) if debugging or self.failAction: - #~ print ("Match",self,"at loc",loc,"(%d,%d)" % ( lineno(loc,instring), col(loc,instring) )) - if (self.debugActions[0] ): - self.debugActions[0]( instring, loc, self ) + # ~ print ("Match",self,"at loc",loc,"(%d,%d)" % ( lineno(loc,instring), col(loc,instring) )) + if self.debugActions[0]: + self.debugActions[0](instring, loc, self) if callPreParse and self.callPreparse: - preloc = self.preParse( instring, loc ) + preloc = self.preParse(instring, loc) else: preloc = loc tokensStart = loc try: try: - loc,tokens = self.parseImpl( instring, preloc, doActions ) + loc, tokens = self.parseImpl(instring, preloc, doActions) except IndexError: - raise ParseException( instring, len(instring), self.errmsg, self ) + raise ParseException(instring, len(instring), self.errmsg, self) except ParseBaseException as err: - #~ print ("Exception raised:", err) + # ~ print ("Exception raised:", err) if self.debugActions[2]: - self.debugActions[2]( instring, tokensStart, self, err ) + self.debugActions[2](instring, tokensStart, self, err) if self.failAction: - self.failAction( instring, tokensStart, self, err ) + self.failAction(instring, tokensStart, self, err) raise else: if callPreParse and self.callPreparse: - preloc = self.preParse( instring, loc ) + preloc = self.preParse(instring, loc) else: preloc = loc tokensStart = loc if self.mayIndexError or loc >= len(instring): try: - loc,tokens = self.parseImpl( instring, preloc, doActions ) + loc, tokens = self.parseImpl(instring, preloc, doActions) except IndexError: - raise ParseException( instring, len(instring), self.errmsg, self ) + raise ParseException(instring, len(instring), self.errmsg, self) else: - loc,tokens = self.parseImpl( instring, preloc, doActions ) + loc, tokens = self.parseImpl(instring, preloc, doActions) - tokens = self.postParse( instring, loc, tokens ) + tokens = self.postParse(instring, loc, tokens) - retTokens = ParseResults( tokens, self.resultsName, asList=self.saveAsList, modal=self.modalResults ) + retTokens = ParseResults( + tokens, self.resultsName, asList=self.saveAsList, modal=self.modalResults + ) if self.parseAction and (doActions or self.callDuringTry): if debugging: try: for fn in self.parseAction: - tokens = fn( instring, tokensStart, retTokens ) + tokens 
= fn(instring, tokensStart, retTokens) if tokens is not None: - retTokens = ParseResults( tokens, - self.resultsName, - asList=self.saveAsList and isinstance(tokens,(ParseResults,list)), - modal=self.modalResults ) + retTokens = ParseResults( + tokens, + self.resultsName, + asList=self.saveAsList + and isinstance(tokens, (ParseResults, list)), + modal=self.modalResults, + ) except ParseBaseException as err: # print ("Exception raised in user parse action:", err) - if (self.debugActions[2] ): - self.debugActions[2]( instring, tokensStart, self, err ) + if self.debugActions[2]: + self.debugActions[2](instring, tokensStart, self, err) raise else: for fn in self.parseAction: - tokens = fn( instring, tokensStart, retTokens ) + tokens = fn(instring, tokensStart, retTokens) if tokens is not None: - retTokens = ParseResults( tokens, - self.resultsName, - asList=self.saveAsList and isinstance(tokens,(ParseResults,list)), - modal=self.modalResults ) + retTokens = ParseResults( + tokens, + self.resultsName, + asList=self.saveAsList + and isinstance(tokens, (ParseResults, list)), + modal=self.modalResults, + ) if debugging: - #~ print ("Matched",self,"->",retTokens.asList()) - if (self.debugActions[1] ): - self.debugActions[1]( instring, tokensStart, loc, self, retTokens ) + # ~ print ("Matched",self,"->",retTokens.asList()) + if self.debugActions[1]: + self.debugActions[1](instring, tokensStart, loc, self, retTokens) return loc, retTokens - def tryParse( self, instring, loc ): + def tryParse(self, instring, loc): try: - return self._parse( instring, loc, doActions=False )[0] + return self._parse(instring, loc, doActions=False)[0] except ParseFatalException: - raise ParseException( instring, loc, self.errmsg, self) + raise ParseException(instring, loc, self.errmsg, self) # this method gets repeatedly called during backtracking with the same arguments - # we can cache these arguments and save ourselves the trouble of re-parsing the contained expression - def _parseCache( self, instring, loc, doActions=True, callPreParse=True ): - lookup = (self,instring,loc,callPreParse,doActions) + def _parseCache(self, instring, loc, doActions=True, callPreParse=True): + lookup = (self, instring, loc, callPreParse, doActions) if lookup in ParserElement._exprArgCache: - value = ParserElement._exprArgCache[ lookup ] - if isinstance(value,Exception): + value = ParserElement._exprArgCache[lookup] + if isinstance(value, Exception): raise value return value else: try: - value = self._parseNoCache( instring, loc, doActions, callPreParse ) - ParserElement._exprArgCache[ lookup ] = (value[0],value[1].copy()) + value = self._parseNoCache(instring, loc, doActions, callPreParse) + ParserElement._exprArgCache[lookup] = (value[0], value[1].copy()) return value except ParseBaseException as pe: - ParserElement._exprArgCache[ lookup ] = pe + ParserElement._exprArgCache[lookup] = pe raise _parse = _parseNoCache # argument cache for optimizing repeated calls when backtracking through recursive expressions _exprArgCache = {} + def resetCache(): ParserElement._exprArgCache.clear() + resetCache = staticmethod(resetCache) _packratEnabled = False + def enablePackrat(): """Enables "packrat" parsing, which adds memoizing to the parsing logic. - Repeated parse attempts at the same string location (which happens - often in many complex grammars) can immediately return a cached value, - instead of re-executing parsing/validating code. Memoizing is done of - both valid results and parsing exceptions. 
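+        Illustrative usage sketch (the import path is an assumption; use
+        whatever name this vendored copy is imported under)::
+            from whoosh.support.pyparsing import ParserElement
+            ParserElement.enablePackrat()   # once, immediately after import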
- - This speedup may break existing programs that use parse actions that - have side-effects. For this reason, packrat parsing is disabled when - you first import pyparsing. To activate the packrat feature, your - program must call the class method ParserElement.enablePackrat(). If - your program uses psyco to "compile as you go", you must call - enablePackrat before calling psyco.full(). If you do not do this, - Python will crash. For best results, call enablePackrat() immediately - after importing pyparsing. + Repeated parse attempts at the same string location (which happens + often in many complex grammars) can immediately return a cached value, + instead of re-executing parsing/validating code. Memoizing is done of + both valid results and parsing exceptions. + + This speedup may break existing programs that use parse actions that + have side-effects. For this reason, packrat parsing is disabled when + you first import pyparsing. To activate the packrat feature, your + program must call the class method ParserElement.enablePackrat(). If + your program uses psyco to "compile as you go", you must call + enablePackrat before calling psyco.full(). If you do not do this, + Python will crash. For best results, call enablePackrat() immediately + after importing pyparsing. """ if not ParserElement._packratEnabled: ParserElement._packratEnabled = True ParserElement._parse = ParserElement._parseCache + enablePackrat = staticmethod(enablePackrat) - def parseString( self, instring, parseAll=False ): + def parseString(self, instring, parseAll=False): """Execute the parse expression with the given string. - This is the main interface to the client code, once the complete - expression has been built. - - If you want the grammar to require that the entire input string be - successfully parsed, then set parseAll to True (equivalent to ending - the grammar with StringEnd()). - - Note: parseString implicitly calls expandtabs() on the input string, - in order to report proper column numbers in parse actions. - If the input string contains tabs and - the grammar uses parse actions that use the loc argument to index into the - string being parsed, you can ensure you have a consistent view of the input - string by: - - calling parseWithTabs on your grammar before calling parseString - (see L{I{parseWithTabs}}) - - define your parse action using the full (s,loc,toks) signature, and - reference the input string using the parse action's s argument - - explictly expand the tabs in your input string before calling - parseString + This is the main interface to the client code, once the complete + expression has been built. + + If you want the grammar to require that the entire input string be + successfully parsed, then set parseAll to True (equivalent to ending + the grammar with StringEnd()). + + Note: parseString implicitly calls expandtabs() on the input string, + in order to report proper column numbers in parse actions. 
+ If the input string contains tabs and + the grammar uses parse actions that use the loc argument to index into the + string being parsed, you can ensure you have a consistent view of the input + string by: + - calling parseWithTabs on your grammar before calling parseString + (see L{I{parseWithTabs}}) + - define your parse action using the full (s,loc,toks) signature, and + reference the input string using the parse action's s argument + - explictly expand the tabs in your input string before calling + parseString """ ParserElement.resetCache() if not self.streamlined: self.streamline() - #~ self.saveAsList = True + # ~ self.saveAsList = True for e in self.ignoreExprs: e.streamline() if not self.keepTabs: instring = instring.expandtabs() try: - loc, tokens = self._parse( instring, 0 ) + loc, tokens = self._parse(instring, 0) if parseAll: - loc = self.preParse( instring, loc ) - StringEnd()._parse( instring, loc ) + loc = self.preParse(instring, loc) + StringEnd()._parse(instring, loc) except ParseBaseException as exc: # catch and re-raise exception from here, clears out pyparsing internal stack trace raise exc else: return tokens - def scanString( self, instring, maxMatches=_MAX_INT ): + def scanString(self, instring, maxMatches=_MAX_INT): """Scan the input string for expression matches. Each match will return the - matching tokens, start location, and end location. May be called with optional - maxMatches argument, to clip scanning after 'n' matches are found. + matching tokens, start location, and end location. May be called with optional + maxMatches argument, to clip scanning after 'n' matches are found. - Note that the start and end locations are reported relative to the string - being parsed. See L{I{parseString}} for more information on parsing - strings with embedded tabs.""" + Note that the start and end locations are reported relative to the string + being parsed. See L{I{parseString}} for more information on parsing + strings with embedded tabs.""" if not self.streamlined: self.streamline() for e in self.ignoreExprs: @@ -1103,10 +1306,10 @@ def scanString( self, instring, maxMatches=_MAX_INT ): try: while loc <= instrlen and matches < maxMatches: try: - preloc = preparseFn( instring, loc ) - nextLoc,tokens = parseFn( instring, preloc, callPreParse=False ) + preloc = preparseFn(instring, loc) + nextLoc, tokens = parseFn(instring, preloc, callPreParse=False) except ParseException: - loc = preloc+1 + loc = preloc + 1 else: matches += 1 yield tokens, preloc, nextLoc @@ -1114,288 +1317,335 @@ def scanString( self, instring, maxMatches=_MAX_INT ): except ParseBaseException as pe: raise pe - def transformString( self, instring ): + def transformString(self, instring): """Extension to scanString, to modify matching text with modified tokens that may - be returned from a parse action. To use transformString, define a grammar and - attach a parse action to it that modifies the returned token list. - Invoking transformString() on a target string will then scan for matches, - and replace the matched text patterns according to the logic in the parse - action. transformString() returns the resulting transformed string.""" + be returned from a parse action. To use transformString, define a grammar and + attach a parse action to it that modifies the returned token list. + Invoking transformString() on a target string will then scan for matches, + and replace the matched text patterns according to the logic in the parse + action. 
transformString() returns the resulting transformed string.""" out = [] lastE = 0 # force preservation of s, to minimize unwanted transformation of string, and to # keep string locs straight between transformString and scanString self.keepTabs = True try: - for t,s,e in self.scanString( instring ): - out.append( instring[lastE:s] ) + for t, s, e in self.scanString(instring): + out.append(instring[lastE:s]) if t: - if isinstance(t,ParseResults): + if isinstance(t, ParseResults): out += t.asList() - elif isinstance(t,list): + elif isinstance(t, list): out += t else: out.append(t) lastE = e out.append(instring[lastE:]) - return "".join(map(_ustr,out)) + return "".join(map(_ustr, out)) except ParseBaseException as pe: raise pe - def searchString( self, instring, maxMatches=_MAX_INT ): + def searchString(self, instring, maxMatches=_MAX_INT): """Another extension to scanString, simplifying the access to the tokens found - to match the given parse expression. May be called with optional - maxMatches argument, to clip searching after 'n' matches are found. + to match the given parse expression. May be called with optional + maxMatches argument, to clip searching after 'n' matches are found. """ try: - return ParseResults([ t for t,s,e in self.scanString( instring, maxMatches ) ]) + return ParseResults( + [t for t, s, e in self.scanString(instring, maxMatches)] + ) except ParseBaseException as pe: raise pe - def __add__(self, other ): + def __add__(self, other): """Implementation of + operator - returns And""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None - return And( [ self, other ] ) + return And([self, other]) - def __radd__(self, other ): + def __radd__(self, other): """Implementation of + operator when left operand is not a ParserElement""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None return other + self def __sub__(self, other): """Implementation of - operator, returns And with error stop""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None - return And( [ self, And._ErrorStop(), other ] ) + return And([self, And._ErrorStop(), other]) - def __rsub__(self, other ): + def __rsub__(self, other): """Implementation of - operator when left operand is not a ParserElement""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( 
other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None return other - self - def __mul__(self,other): - if isinstance(other,int): - minElements, optElements = other,0 - elif isinstance(other,tuple): + def __mul__(self, other): + if isinstance(other, int): + minElements, optElements = other, 0 + elif isinstance(other, tuple): other = (other + (None, None))[:2] if other[0] is None: other = (0, other[1]) - if isinstance(other[0],int) and other[1] is None: + if isinstance(other[0], int) and other[1] is None: if other[0] == 0: return ZeroOrMore(self) if other[0] == 1: return OneOrMore(self) else: - return self*other[0] + ZeroOrMore(self) - elif isinstance(other[0],int) and isinstance(other[1],int): + return self * other[0] + ZeroOrMore(self) + elif isinstance(other[0], int) and isinstance(other[1], int): minElements, optElements = other optElements -= minElements else: - raise TypeError("cannot multiply 'ParserElement' and ('%s','%s') objects", type(other[0]),type(other[1])) + raise TypeError( + "cannot multiply 'ParserElement' and ('%s','%s') objects", + type(other[0]), + type(other[1]), + ) else: - raise TypeError("cannot multiply 'ParserElement' and '%s' objects", type(other)) + raise TypeError( + "cannot multiply 'ParserElement' and '%s' objects", type(other) + ) if minElements < 0: raise ValueError("cannot multiply ParserElement by negative value") if optElements < 0: - raise ValueError("second tuple value must be greater or equal to first tuple value") + raise ValueError( + "second tuple value must be greater or equal to first tuple value" + ) if minElements == optElements == 0: raise ValueError("cannot multiply ParserElement by 0 or (0,0)") - if (optElements): + if optElements: + def makeOptionalList(n): - if n>1: - return Optional(self + makeOptionalList(n-1)) + if n > 1: + return Optional(self + makeOptionalList(n - 1)) else: return Optional(self) + if minElements: if minElements == 1: ret = self + makeOptionalList(optElements) else: - ret = And([self]*minElements) + makeOptionalList(optElements) + ret = And([self] * minElements) + makeOptionalList(optElements) else: ret = makeOptionalList(optElements) else: if minElements == 1: ret = self else: - ret = And([self]*minElements) + ret = And([self] * minElements) return ret def __rmul__(self, other): return self.__mul__(other) - def __or__(self, other ): + def __or__(self, other): """Implementation of | operator - returns MatchFirst""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None - return MatchFirst( [ self, other ] ) + return MatchFirst([self, other]) - def __ror__(self, other ): + def __ror__(self, other): """Implementation of | operator when left operand is not a ParserElement""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - 
warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None return other | self - def __xor__(self, other ): + def __xor__(self, other): """Implementation of ^ operator - returns Or""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None - return Or( [ self, other ] ) + return Or([self, other]) - def __rxor__(self, other ): + def __rxor__(self, other): """Implementation of ^ operator when left operand is not a ParserElement""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None return other ^ self - def __and__(self, other ): + def __and__(self, other): """Implementation of & operator - returns Each""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None - return Each( [ self, other ] ) + return Each([self, other]) - def __rand__(self, other ): + def __rand__(self, other): """Implementation of & operator when left operand is not a ParserElement""" - if isinstance( other, basestring ): - other = Literal( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) + if isinstance(other, basestring): + other = Literal(other) + if not isinstance(other, ParserElement): + warnings.warn( + f"Cannot combine element of type {type(other)} with ParserElement", + SyntaxWarning, + stacklevel=2, + ) return None return other & self - def __invert__( self ): + def __invert__(self): """Implementation of ~ operator - returns NotAny""" - return NotAny( self ) + return NotAny(self) def __call__(self, name): """Shortcut for setResultsName, with listAllMatches=default:: - userdata = Word(alphas).setResultsName("name") + Word(nums+"-").setResultsName("socsecno") - could be written as:: - userdata = Word(alphas)("name") + Word(nums+"-")("socsecno") - """ + userdata = Word(alphas).setResultsName("name") + Word(nums+"-").setResultsName("socsecno") + could be written as:: + userdata = Word(alphas)("name") + Word(nums+"-")("socsecno") + """ return self.setResultsName(name) - def suppress( self ): + 
def suppress(self): """Suppresses the output of this ParserElement; useful to keep punctuation from - cluttering up returned output. + cluttering up returned output. """ - return Suppress( self ) + return Suppress(self) - def leaveWhitespace( self ): + def leaveWhitespace(self): """Disables the skipping of whitespace before matching the characters in the - ParserElement's defined pattern. This is normally only used internally by - the pyparsing module, but may be needed in some whitespace-sensitive grammars. + ParserElement's defined pattern. This is normally only used internally by + the pyparsing module, but may be needed in some whitespace-sensitive grammars. """ self.skipWhitespace = False return self - def setWhitespaceChars( self, chars ): - """Overrides the default whitespace chars - """ + def setWhitespaceChars(self, chars): + """Overrides the default whitespace chars""" self.skipWhitespace = True self.whiteChars = chars self.copyDefaultWhiteChars = False return self - def parseWithTabs( self ): + def parseWithTabs(self): """Overrides default behavior to expand s to spaces before parsing the input string. - Must be called before parseString when the input grammar contains elements that - match characters.""" + Must be called before parseString when the input grammar contains elements that + match characters.""" self.keepTabs = True return self - def ignore( self, other ): + def ignore(self, other): """Define expression to be ignored (e.g., comments) while doing pattern - matching; may be called repeatedly, to define multiple comment or other - ignorable patterns. + matching; may be called repeatedly, to define multiple comment or other + ignorable patterns. """ - if isinstance( other, Suppress ): + if isinstance(other, Suppress): if other not in self.ignoreExprs: - self.ignoreExprs.append( other ) + self.ignoreExprs.append(other) else: - self.ignoreExprs.append( Suppress( other ) ) + self.ignoreExprs.append(Suppress(other)) return self - def setDebugActions( self, startAction, successAction, exceptionAction ): + def setDebugActions(self, startAction, successAction, exceptionAction): """Enable display of debugging messages while doing pattern matching.""" - self.debugActions = (startAction or _defaultStartDebugAction, - successAction or _defaultSuccessDebugAction, - exceptionAction or _defaultExceptionDebugAction) + self.debugActions = ( + startAction or _defaultStartDebugAction, + successAction or _defaultSuccessDebugAction, + exceptionAction or _defaultExceptionDebugAction, + ) self.debug = True return self - def setDebug( self, flag=True ): + def setDebug(self, flag=True): """Enable display of debugging messages while doing pattern matching. 
- Set flag to True to enable, False to disable.""" + Set flag to True to enable, False to disable.""" if flag: - self.setDebugActions( _defaultStartDebugAction, _defaultSuccessDebugAction, _defaultExceptionDebugAction ) + self.setDebugActions( + _defaultStartDebugAction, + _defaultSuccessDebugAction, + _defaultExceptionDebugAction, + ) else: self.debug = False return self - def __str__( self ): + def __str__(self): return self.name - def __repr__( self ): + def __repr__(self): return _ustr(self) - def streamline( self ): + def streamline(self): self.streamlined = True self.strRepr = None return self - def checkRecursion( self, parseElementList ): + def checkRecursion(self, parseElementList): pass - def validate( self, validateTrace=[] ): + def validate(self, validateTrace=[]): """Check defined expressions for valid structure, check for infinite recursive definitions.""" - self.checkRecursion( [] ) + self.checkRecursion([]) - def parseFile( self, file_or_filename, parseAll=False ): + def parseFile(self, file_or_filename, parseAll=False): """Execute the parse expression on the given file or filename. - If a filename is specified (instead of a file object), - the entire file is opened, read, and closed before parsing. + If a filename is specified (instead of a file object), + the entire file is opened, read, and closed before parsing. """ try: file_contents = file_or_filename.read() @@ -1410,16 +1660,16 @@ def parseFile( self, file_or_filename, parseAll=False ): raise exc def getException(self): - return ParseException("",0,self.errmsg,self) + return ParseException("", 0, self.errmsg, self) - def __getattr__(self,aname): + def __getattr__(self, aname): if aname == "myException": - self.myException = ret = self.getException(); - return ret; + self.myException = ret = self.getException() + return ret else: raise AttributeError("no such attribute " + aname) - def __eq__(self,other): + def __eq__(self, other): if isinstance(other, ParserElement): return self is other or self.__dict__ == other.__dict__ elif isinstance(other, basestring): @@ -1429,38 +1679,40 @@ def __eq__(self,other): except ParseBaseException: return False else: - return super(ParserElement,self)==other + return super() == other - def __ne__(self,other): + def __ne__(self, other): return not (self == other) def __hash__(self): return hash(id(self)) - def __req__(self,other): + def __req__(self, other): return self == other - def __rne__(self,other): + def __rne__(self, other): return not (self == other) class Token(ParserElement): """Abstract ParserElement subclass, for defining atomic matching patterns.""" - def __init__( self ): - super(Token,self).__init__( savelist=False ) - #self.myException = ParseException("",0,"",self) + + def __init__(self): + super().__init__(savelist=False) + # self.myException = ParseException("",0,"",self) def setName(self, name): - s = super(Token,self).setName(name) + s = super().setName(name) self.errmsg = "Expected " + self.name - #s.myException.msg = self.errmsg + # s.myException.msg = self.errmsg return s class Empty(Token): """An empty token, will always match.""" - def __init__( self ): - super(Empty,self).__init__() + + def __init__(self): + super().__init__() self.name = "Empty" self.mayReturnEmpty = True self.mayIndexError = False @@ -1468,15 +1720,16 @@ def __init__( self ): class NoMatch(Token): """A token that will never match.""" - def __init__( self ): - super(NoMatch,self).__init__() + + def __init__(self): + super().__init__() self.name = "NoMatch" self.mayReturnEmpty = True 
self.mayIndexError = False self.errmsg = "Unmatchable token" - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): exc = self.myException exc.loc = loc exc.pstr = instring @@ -1485,62 +1738,74 @@ def parseImpl( self, instring, loc, doActions=True ): class Literal(Token): """Token to exactly match a specified string.""" - def __init__( self, matchString ): - super(Literal,self).__init__() + + def __init__(self, matchString): + super().__init__() self.match = matchString self.matchLen = len(matchString) try: self.firstMatchChar = matchString[0] except IndexError: - warnings.warn("null string passed to Literal; use Empty() instead", - SyntaxWarning, stacklevel=2) + warnings.warn( + "null string passed to Literal; use Empty() instead", + SyntaxWarning, + stacklevel=2, + ) self.__class__ = Empty - self.name = '"%s"' % _ustr(self.match) + self.name = f'"{_ustr(self.match)}"' self.errmsg = "Expected " + self.name self.mayReturnEmpty = False - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.mayIndexError = False # Performance tuning: this routine gets called a *lot* # if this is a single character match string and the first character matches, # short-circuit as quickly as possible, and avoid calling startswith - #~ @profile - def parseImpl( self, instring, loc, doActions=True ): - if (instring[loc] == self.firstMatchChar and - (self.matchLen==1 or instring.startswith(self.match,loc)) ): - return loc+self.matchLen, self.match - #~ raise ParseException( instring, loc, self.errmsg ) + # ~ @profile + def parseImpl(self, instring, loc, doActions=True): + if instring[loc] == self.firstMatchChar and ( + self.matchLen == 1 or instring.startswith(self.match, loc) + ): + return loc + self.matchLen, self.match + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc + + _L = Literal + class Keyword(Token): """Token to exactly match a specified string as a keyword, that is, it must be - immediately followed by a non-keyword character. Compare with Literal:: - Literal("if") will match the leading 'if' in 'ifAndOnlyIf'. - Keyword("if") will not; it will only match the leading 'if in 'if x=1', or 'if(y==2)' - Accepts two optional constructor arguments in addition to the keyword string: - identChars is a string of characters that would be valid identifier characters, - defaulting to all alphanumerics + "_" and "$"; caseless allows case-insensitive - matching, default is False. + immediately followed by a non-keyword character. Compare with Literal:: + Literal("if") will match the leading 'if' in 'ifAndOnlyIf'. + Keyword("if") will not; it will only match the leading 'if in 'if x=1', or 'if(y==2)' + Accepts two optional constructor arguments in addition to the keyword string: + identChars is a string of characters that would be valid identifier characters, + defaulting to all alphanumerics + "_" and "$"; caseless allows case-insensitive + matching, default is False. 
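+    Example (illustrative)::
+        Keyword("if").parseString("if(y==2)")     # matches the leading 'if'
+        Keyword("if").parseString("ifAndOnlyIf")  # raises ParseException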
""" - DEFAULT_KEYWORD_CHARS = alphanums+"_$" - def __init__( self, matchString, identChars=DEFAULT_KEYWORD_CHARS, caseless=False ): - super(Keyword,self).__init__() + DEFAULT_KEYWORD_CHARS = alphanums + "_$" + + def __init__(self, matchString, identChars=DEFAULT_KEYWORD_CHARS, caseless=False): + super().__init__() self.match = matchString self.matchLen = len(matchString) try: self.firstMatchChar = matchString[0] except IndexError: - warnings.warn("null string passed to Keyword; use Empty() instead", - SyntaxWarning, stacklevel=2) - self.name = '"%s"' % self.match + warnings.warn( + "null string passed to Keyword; use Empty() instead", + SyntaxWarning, + stacklevel=2, + ) + self.name = f'"{self.match}"' self.errmsg = "Expected " + self.name self.mayReturnEmpty = False - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.mayIndexError = False self.caseless = caseless if caseless: @@ -1548,85 +1813,104 @@ def __init__( self, matchString, identChars=DEFAULT_KEYWORD_CHARS, caseless=Fals identChars = identChars.upper() self.identChars = _str2dict(identChars) - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if self.caseless: - if ( (instring[ loc:loc+self.matchLen ].upper() == self.caselessmatch) and - (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen].upper() not in self.identChars) and - (loc == 0 or instring[loc-1].upper() not in self.identChars) ): - return loc+self.matchLen, self.match + if ( + (instring[loc : loc + self.matchLen].upper() == self.caselessmatch) + and ( + loc >= len(instring) - self.matchLen + or instring[loc + self.matchLen].upper() not in self.identChars + ) + and (loc == 0 or instring[loc - 1].upper() not in self.identChars) + ): + return loc + self.matchLen, self.match else: - if (instring[loc] == self.firstMatchChar and - (self.matchLen==1 or instring.startswith(self.match,loc)) and - (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen] not in self.identChars) and - (loc == 0 or instring[loc-1] not in self.identChars) ): - return loc+self.matchLen, self.match - #~ raise ParseException( instring, loc, self.errmsg ) + if ( + instring[loc] == self.firstMatchChar + and (self.matchLen == 1 or instring.startswith(self.match, loc)) + and ( + loc >= len(instring) - self.matchLen + or instring[loc + self.matchLen] not in self.identChars + ) + and (loc == 0 or instring[loc - 1] not in self.identChars) + ): + return loc + self.matchLen, self.match + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc def copy(self): - c = super(Keyword,self).copy() + c = super().copy() c.identChars = Keyword.DEFAULT_KEYWORD_CHARS return c - def setDefaultKeywordChars( chars ): - """Overrides the default Keyword chars - """ + def setDefaultKeywordChars(chars): + """Overrides the default Keyword chars""" Keyword.DEFAULT_KEYWORD_CHARS = chars + setDefaultKeywordChars = staticmethod(setDefaultKeywordChars) + class CaselessLiteral(Literal): """Token to match a specified string, ignoring case of letters. - Note: the matched results will always be in the case of the given - match string, NOT the case of the input text. + Note: the matched results will always be in the case of the given + match string, NOT the case of the input text. 
""" - def __init__( self, matchString ): - super(CaselessLiteral,self).__init__( matchString.upper() ) + + def __init__(self, matchString): + super().__init__(matchString.upper()) # Preserve the defining literal. self.returnString = matchString - self.name = "'%s'" % self.returnString + self.name = f"'{self.returnString}'" self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg - def parseImpl( self, instring, loc, doActions=True ): - if instring[ loc:loc+self.matchLen ].upper() == self.match: - return loc+self.matchLen, self.returnString - #~ raise ParseException( instring, loc, self.errmsg ) + def parseImpl(self, instring, loc, doActions=True): + if instring[loc : loc + self.matchLen].upper() == self.match: + return loc + self.matchLen, self.returnString + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc + class CaselessKeyword(Keyword): - def __init__( self, matchString, identChars=Keyword.DEFAULT_KEYWORD_CHARS ): - super(CaselessKeyword,self).__init__( matchString, identChars, caseless=True ) - - def parseImpl( self, instring, loc, doActions=True ): - if ( (instring[ loc:loc+self.matchLen ].upper() == self.caselessmatch) and - (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen].upper() not in self.identChars) ): - return loc+self.matchLen, self.match - #~ raise ParseException( instring, loc, self.errmsg ) + def __init__(self, matchString, identChars=Keyword.DEFAULT_KEYWORD_CHARS): + super().__init__(matchString, identChars, caseless=True) + + def parseImpl(self, instring, loc, doActions=True): + if (instring[loc : loc + self.matchLen].upper() == self.caselessmatch) and ( + loc >= len(instring) - self.matchLen + or instring[loc + self.matchLen].upper() not in self.identChars + ): + return loc + self.matchLen, self.match + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc + class Word(Token): """Token for matching words composed of allowed character sets. - Defined with string containing all allowed initial characters, - an optional string containing allowed body characters (if omitted, - defaults to the initial character set), and an optional minimum, - maximum, and/or exact length. The default value for min is 1 (a - minimum value < 1 is not valid); the default values for max and exact - are 0, meaning no maximum or exact length restriction. + Defined with string containing all allowed initial characters, + an optional string containing allowed body characters (if omitted, + defaults to the initial character set), and an optional minimum, + maximum, and/or exact length. The default value for min is 1 (a + minimum value < 1 is not valid); the default values for max and exact + are 0, meaning no maximum or exact length restriction. 
""" - def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False ): - super(Word,self).__init__() + + def __init__( + self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False + ): + super().__init__() self.initCharsOrig = initChars self.initChars = _str2dict(initChars) - if bodyChars : + if bodyChars: self.bodyCharsOrig = bodyChars self.bodyChars = _str2dict(bodyChars) else: @@ -1636,7 +1920,9 @@ def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword= self.maxSpecified = max > 0 if min < 1: - raise ValueError("cannot specify a minimum length < 1; use Optional(Word()) if zero-length word is permitted") + raise ValueError( + "cannot specify a minimum length < 1; use Optional(Word()) if zero-length word is permitted" + ) self.minLen = min @@ -1651,31 +1937,35 @@ def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword= self.name = _ustr(self) self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.mayIndexError = False self.asKeyword = asKeyword - if ' ' not in self.initCharsOrig+self.bodyCharsOrig and (min==1 and max==0 and exact==0): + if " " not in self.initCharsOrig + self.bodyCharsOrig and ( + min == 1 and max == 0 and exact == 0 + ): if self.bodyCharsOrig == self.initCharsOrig: - self.reString = "[%s]+" % _escapeRegexRangeChars(self.initCharsOrig) + self.reString = f"[{_escapeRegexRangeChars(self.initCharsOrig)}]+" elif len(self.bodyCharsOrig) == 1: - self.reString = "%s[%s]*" % \ - (re.escape(self.initCharsOrig), - _escapeRegexRangeChars(self.bodyCharsOrig),) + self.reString = "{}[{}]*".format( + re.escape(self.initCharsOrig), + _escapeRegexRangeChars(self.bodyCharsOrig), + ) else: - self.reString = "[%s][%s]*" % \ - (_escapeRegexRangeChars(self.initCharsOrig), - _escapeRegexRangeChars(self.bodyCharsOrig),) + self.reString = "[{}][{}]*".format( + _escapeRegexRangeChars(self.initCharsOrig), + _escapeRegexRangeChars(self.bodyCharsOrig), + ) if self.asKeyword: - self.reString = r"\b"+self.reString+r"\b" + self.reString = r"\b" + self.reString + r"\b" try: - self.re = re.compile( self.reString ) + self.re = re.compile(self.reString) except: self.re = None - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if self.re: - result = self.re.match(instring,loc) + result = self.re.match(instring, loc) if not result: exc = self.myException exc.loc = loc @@ -1683,10 +1973,10 @@ def parseImpl( self, instring, loc, doActions=True ): raise exc loc = result.end() - return loc,result.group() + return loc, result.group() - if not(instring[ loc ] in self.initChars): - #~ raise ParseException( instring, loc, self.errmsg ) + if not (instring[loc] in self.initChars): + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -1696,7 +1986,7 @@ def parseImpl( self, instring, loc, doActions=True ): instrlen = len(instring) bodychars = self.bodyChars maxloc = start + self.maxLen - maxloc = min( maxloc, instrlen ) + maxloc = min(maxloc, instrlen) while loc < maxloc and instring[loc] in bodychars: loc += 1 @@ -1706,11 +1996,13 @@ def parseImpl( self, instring, loc, doActions=True ): if self.maxSpecified and loc < instrlen and instring[loc] in bodychars: throwException = True if self.asKeyword: - if (start>0 and instring[start-1] in bodychars) or (loc 0 and instring[start - 1] in bodychars) or ( + loc < instrlen and instring[loc] in bodychars + ): 
throwException = True if throwException: - #~ raise ParseException( instring, loc, self.errmsg ) + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -1718,40 +2010,43 @@ def parseImpl( self, instring, loc, doActions=True ): return loc, instring[start:loc] - def __str__( self ): + def __str__(self): try: - return super(Word,self).__str__() + return super().__str__() except: pass - if self.strRepr is None: def charsAsStr(s): - if len(s)>4: - return s[:4]+"..." + if len(s) > 4: + return s[:4] + "..." else: return s - if ( self.initCharsOrig != self.bodyCharsOrig ): - self.strRepr = "W:(%s,%s)" % ( charsAsStr(self.initCharsOrig), charsAsStr(self.bodyCharsOrig) ) + if self.initCharsOrig != self.bodyCharsOrig: + self.strRepr = f"W:({charsAsStr(self.initCharsOrig)},{charsAsStr(self.bodyCharsOrig)})" else: - self.strRepr = "W:(%s)" % charsAsStr(self.initCharsOrig) + self.strRepr = f"W:({charsAsStr(self.initCharsOrig)})" return self.strRepr class Regex(Token): """Token for matching strings that match a given regular expression. - Defined with string specifying the regular expression in a form recognized by the inbuilt Python re module. + Defined with string specifying the regular expression in a form recognized by the inbuilt Python re module. """ - def __init__( self, pattern, flags=0): + + def __init__(self, pattern, flags=0): """The parameters pattern and flags are passed to the re.compile() function as-is. See the Python re module for an explanation of the acceptable patterns and flags.""" - super(Regex,self).__init__() + super().__init__() if len(pattern) == 0: - warnings.warn("null string passed to Regex; use Empty() instead", - SyntaxWarning, stacklevel=2) + warnings.warn( + "null string passed to Regex; use Empty() instead", + SyntaxWarning, + stacklevel=2, + ) self.pattern = pattern self.flags = flags @@ -1760,18 +2055,21 @@ def __init__( self, pattern, flags=0): self.re = re.compile(self.pattern, self.flags) self.reString = self.pattern except sre_constants.error: - warnings.warn("invalid pattern (%s) passed to Regex" % pattern, - SyntaxWarning, stacklevel=2) + warnings.warn( + f"invalid pattern ({pattern}) passed to Regex", + SyntaxWarning, + stacklevel=2, + ) raise self.name = _ustr(self) self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.mayIndexError = False self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): - result = self.re.match(instring,loc) + def parseImpl(self, instring, loc, doActions=True): + result = self.re.match(instring, loc) if not result: exc = self.myException exc.loc = loc @@ -1784,39 +2082,49 @@ def parseImpl( self, instring, loc, doActions=True ): if d: for k in d: ret[k] = d[k] - return loc,ret + return loc, ret - def __str__( self ): + def __str__(self): try: - return super(Regex,self).__str__() + return super().__str__() except: pass if self.strRepr is None: - self.strRepr = "Re:(%s)" % repr(self.pattern) + self.strRepr = f"Re:({repr(self.pattern)})" return self.strRepr class QuotedString(Token): - """Token for matching strings that are delimited by quoting characters. 
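Because Regex.parseImpl above copies groupdict() entries into the returned ParseResults, named groups become named results; a small sketch:

    from pyparsing import Regex

    date = Regex(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})")
    result = date.parseString("2011-07-04")
    print(result["year"], result["month"], result["day"])  # 2011 07 04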
- """ - def __init__( self, quoteChar, escChar=None, escQuote=None, multiline=False, unquoteResults=True, endQuoteChar=None): + """Token for matching strings that are delimited by quoting characters.""" + + def __init__( + self, + quoteChar, + escChar=None, + escQuote=None, + multiline=False, + unquoteResults=True, + endQuoteChar=None, + ): """ - Defined with the following parameters: - - quoteChar - string of one or more characters defining the quote delimiting string - - escChar - character to escape quotes, typically backslash (default=None) - - escQuote - special quote sequence to escape an embedded quote string (such as SQL's "" to escape an embedded ") (default=None) - - multiline - boolean indicating whether quotes can span multiple lines (default=False) - - unquoteResults - boolean indicating whether the matched text should be unquoted (default=True) - - endQuoteChar - string of one or more characters defining the end of the quote delimited string (default=None => same as quoteChar) + Defined with the following parameters: + - quoteChar - string of one or more characters defining the quote delimiting string + - escChar - character to escape quotes, typically backslash (default=None) + - escQuote - special quote sequence to escape an embedded quote string (such as SQL's "" to escape an embedded ") (default=None) + - multiline - boolean indicating whether quotes can span multiple lines (default=False) + - unquoteResults - boolean indicating whether the matched text should be unquoted (default=True) + - endQuoteChar - string of one or more characters defining the end of the quote delimited string (default=None => same as quoteChar) """ - super(QuotedString,self).__init__() + super().__init__() # remove white space from quote chars - wont work anyway quoteChar = quoteChar.strip() if len(quoteChar) == 0: - warnings.warn("quoteChar cannot be the empty string",SyntaxWarning,stacklevel=2) + warnings.warn( + "quoteChar cannot be the empty string", SyntaxWarning, stacklevel=2 + ) raise SyntaxError() if endQuoteChar is None: @@ -1824,7 +2132,11 @@ def __init__( self, quoteChar, escChar=None, escQuote=None, multiline=False, unq else: endQuoteChar = endQuoteChar.strip() if len(endQuoteChar) == 0: - warnings.warn("endQuoteChar cannot be the empty string",SyntaxWarning,stacklevel=2) + warnings.warn( + "endQuoteChar cannot be the empty string", + SyntaxWarning, + stacklevel=2, + ) raise SyntaxError() self.quoteChar = quoteChar @@ -1838,45 +2150,63 @@ def __init__( self, quoteChar, escChar=None, escQuote=None, multiline=False, unq if multiline: self.flags = re.MULTILINE | re.DOTALL - self.pattern = r'%s(?:[^%s%s]' % \ - ( re.escape(self.quoteChar), - _escapeRegexRangeChars(self.endQuoteChar[0]), - (escChar is not None and _escapeRegexRangeChars(escChar) or '') ) + self.pattern = r"{}(?:[^{}{}]".format( + re.escape(self.quoteChar), + _escapeRegexRangeChars(self.endQuoteChar[0]), + (escChar is not None and _escapeRegexRangeChars(escChar) or ""), + ) else: self.flags = 0 - self.pattern = r'%s(?:[^%s\n\r%s]' % \ - ( re.escape(self.quoteChar), - _escapeRegexRangeChars(self.endQuoteChar[0]), - (escChar is not None and _escapeRegexRangeChars(escChar) or '') ) + self.pattern = r"{}(?:[^{}\n\r{}]".format( + re.escape(self.quoteChar), + _escapeRegexRangeChars(self.endQuoteChar[0]), + (escChar is not None and _escapeRegexRangeChars(escChar) or ""), + ) if len(self.endQuoteChar) > 1: self.pattern += ( - '|(?:' + ')|(?:'.join(["%s[^%s]" % (re.escape(self.endQuoteChar[:i]), - 
_escapeRegexRangeChars(self.endQuoteChar[i])) - for i in range(len(self.endQuoteChar)-1,0,-1)]) + ')' + "|(?:" + + ")|(?:".join( + [ + "%s[^%s]" + % ( + re.escape(self.endQuoteChar[:i]), + _escapeRegexRangeChars(self.endQuoteChar[i]), + ) + for i in range(len(self.endQuoteChar) - 1, 0, -1) + ] ) + + ")" + ) if escQuote: - self.pattern += (r'|(?:%s)' % re.escape(escQuote)) + self.pattern += r"|(?:%s)" % re.escape(escQuote) if escChar: - self.pattern += (r'|(?:%s.)' % re.escape(escChar)) - self.escCharReplacePattern = re.escape(self.escChar)+"(.)" - self.pattern += (r')*%s' % re.escape(self.endQuoteChar)) + self.pattern += r"|(?:%s.)" % re.escape(escChar) + self.escCharReplacePattern = re.escape(self.escChar) + "(.)" + self.pattern += r")*%s" % re.escape(self.endQuoteChar) try: self.re = re.compile(self.pattern, self.flags) self.reString = self.pattern except sre_constants.error: - warnings.warn("invalid pattern (%s) passed to Regex" % self.pattern, - SyntaxWarning, stacklevel=2) + warnings.warn( + f"invalid pattern ({self.pattern}) passed to Regex", + SyntaxWarning, + stacklevel=2, + ) raise self.name = _ustr(self) self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.mayIndexError = False self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): - result = instring[loc] == self.firstQuoteChar and self.re.match(instring,loc) or None + def parseImpl(self, instring, loc, doActions=True): + result = ( + instring[loc] == self.firstQuoteChar + and self.re.match(instring, loc) + or None + ) if not result: exc = self.myException exc.loc = loc @@ -1887,14 +2217,13 @@ def parseImpl( self, instring, loc, doActions=True ): ret = result.group() if self.unquoteResults: - # strip off quotes - ret = ret[self.quoteCharLen:-self.endQuoteCharLen] + ret = ret[self.quoteCharLen : -self.endQuoteCharLen] - if isinstance(ret,basestring): + if isinstance(ret, basestring): # replace escaped characters if self.escChar: - ret = re.sub(self.escCharReplacePattern,"\g<1>",ret) + ret = re.sub(self.escCharReplacePattern, r"\g<1>", ret) # replace escaped quotes if self.escQuote: @@ -1902,32 +2231,35 @@ def parseImpl( self, instring, loc, doActions=True ): return loc, ret - def __str__( self ): + def __str__(self): try: - return super(QuotedString,self).__str__() + return super().__str__() except: pass if self.strRepr is None: - self.strRepr = "quoted string, starting with %s ending with %s" % (self.quoteChar, self.endQuoteChar) + self.strRepr = f"quoted string, starting with {self.quoteChar} ending with {self.endQuoteChar}" return self.strRepr class CharsNotIn(Token): """Token for matching words composed of characters *not* in a given set. - Defined with string containing all disallowed characters, and an optional - minimum, maximum, and/or exact length. The default value for min is 1 (a - minimum value < 1 is not valid); the default values for max and exact - are 0, meaning no maximum or exact length restriction. + Defined with string containing all disallowed characters, and an optional + minimum, maximum, and/or exact length. The default value for min is 1 (a + minimum value < 1 is not valid); the default values for max and exact + are 0, meaning no maximum or exact length restriction. 
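A minimal sketch of CharsNotIn as just described, here for naive CSV fields (delimitedList is the helper defined elsewhere in this module):

    from pyparsing import CharsNotIn, delimitedList

    field = CharsNotIn(",")                               # run of anything but commas
    print(delimitedList(field).parseString("a b,c,d e"))  # ['a b', 'c', 'd e']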
""" - def __init__( self, notChars, min=1, max=0, exact=0 ): - super(CharsNotIn,self).__init__() + + def __init__(self, notChars, min=1, max=0, exact=0): + super().__init__() self.skipWhitespace = False self.notChars = notChars if min < 1: - raise ValueError("cannot specify a minimum length < 1; use Optional(CharsNotIn()) if zero-length char group is permitted") + raise ValueError( + "cannot specify a minimum length < 1; use Optional(CharsNotIn()) if zero-length char group is permitted" + ) self.minLen = min @@ -1942,13 +2274,13 @@ def __init__( self, notChars, min=1, max=0, exact=0 ): self.name = _ustr(self) self.errmsg = "Expected " + self.name - self.mayReturnEmpty = ( self.minLen == 0 ) - #self.myException.msg = self.errmsg + self.mayReturnEmpty = self.minLen == 0 + # self.myException.msg = self.errmsg self.mayIndexError = False - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if instring[loc] in self.notChars: - #~ raise ParseException( instring, loc, self.errmsg ) + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -1957,13 +2289,12 @@ def parseImpl( self, instring, loc, doActions=True ): start = loc loc += 1 notchars = self.notChars - maxlen = min( start+self.maxLen, len(instring) ) - while loc < maxlen and \ - (instring[loc] not in notchars): + maxlen = min(start + self.maxLen, len(instring)) + while loc < maxlen and (instring[loc] not in notchars): loc += 1 if loc - start < self.minLen: - #~ raise ParseException( instring, loc, self.errmsg ) + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -1971,42 +2302,47 @@ def parseImpl( self, instring, loc, doActions=True ): return loc, instring[start:loc] - def __str__( self ): + def __str__(self): try: - return super(CharsNotIn, self).__str__() + return super().__str__() except: pass if self.strRepr is None: if len(self.notChars) > 4: - self.strRepr = "!W:(%s...)" % self.notChars[:4] + self.strRepr = f"!W:({self.notChars[:4]}...)" else: - self.strRepr = "!W:(%s)" % self.notChars + self.strRepr = f"!W:({self.notChars})" return self.strRepr + class White(Token): """Special matching class for matching whitespace. Normally, whitespace is ignored - by pyparsing grammars. This class is included when some whitespace structures - are significant. Define with a string containing the whitespace characters to be - matched; default is " \\t\\r\\n". Also takes optional min, max, and exact arguments, - as defined for the Word class.""" + by pyparsing grammars. This class is included when some whitespace structures + are significant. Define with a string containing the whitespace characters to be + matched; default is " \\t\\r\\n". 
Also takes optional min, max, and exact arguments, + as defined for the Word class.""" + whiteStrs = { - " " : "<SPC>", + " ": "<SPC>", "\t": "<TAB>", "\n": "<LF>", "\r": "<CR>", "\f": "<FF>", - } + } + def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0): - super(White,self).__init__() + super().__init__() self.matchWhite = ws - self.setWhitespaceChars( "".join([c for c in self.whiteChars if c not in self.matchWhite]) ) - #~ self.leaveWhitespace() - self.name = ("".join([White.whiteStrs[c] for c in self.matchWhite])) + self.setWhitespaceChars( + "".join([c for c in self.whiteChars if c not in self.matchWhite]) + ) + # ~ self.leaveWhitespace() + self.name = "".join([White.whiteStrs[c] for c in self.matchWhite]) self.mayReturnEmpty = True self.errmsg = "Expected " + self.name - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg self.minLen = min @@ -2019,9 +2355,9 @@ def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0): self.maxLen = exact self.minLen = exact - def parseImpl( self, instring, loc, doActions=True ): - if not(instring[ loc ] in self.matchWhite): - #~ raise ParseException( instring, loc, self.errmsg ) + def parseImpl(self, instring, loc, doActions=True): + if not (instring[loc] in self.matchWhite): + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc start = loc loc += 1 maxloc = start + self.maxLen - maxloc = min( maxloc, len(instring) ) + maxloc = min(maxloc, len(instring)) while loc < maxloc and instring[loc] in self.matchWhite: loc += 1 if loc - start < self.minLen: - #~ raise ParseException( instring, loc, self.errmsg ) + # ~ raise ParseException( instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc @@ -2044,120 +2380,136 @@ class _PositionToken(Token): - def __init__( self ): - super(_PositionToken,self).__init__() - self.name=self.__class__.__name__ + def __init__(self): + super().__init__() + self.name = self.__class__.__name__ self.mayReturnEmpty = True self.mayIndexError = False + class GoToColumn(_PositionToken): """Token to advance to a specific column of input text; useful for tabular report scraping.""" - def __init__( self, colno ): - super(GoToColumn,self).__init__() + + def __init__(self, colno): + super().__init__() self.col = colno - def preParse( self, instring, loc ): - if col(loc,instring) != self.col: + def preParse(self, instring, loc): + if col(loc, instring) != self.col: instrlen = len(instring) if self.ignoreExprs: - loc = self._skipIgnorables( instring, loc ) - while loc < instrlen and instring[loc].isspace() and col( loc, instring ) != self.col : + loc = self._skipIgnorables(instring, loc) + while ( + loc < instrlen + and instring[loc].isspace() + and col(loc, instring) != self.col + ): loc += 1 return loc - def parseImpl( self, instring, loc, doActions=True ): - thiscol = col( loc, instring ) + def parseImpl(self, instring, loc, doActions=True): + thiscol = col(loc, instring) if thiscol > self.col: - raise ParseException( instring, loc, "Text not in expected column", self ) + raise ParseException(instring, loc, "Text not in expected column", self) newloc = loc + self.col - thiscol - ret = instring[ loc: newloc ] + ret = instring[loc:newloc] return newloc, ret + class LineStart(_PositionToken): """Matches if current position is at the beginning of a line within the parse string""" - def __init__( self ): - 
super(LineStart,self).__init__() - self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") ) + + def __init__(self): + super().__init__() + self.setWhitespaceChars(ParserElement.DEFAULT_WHITE_CHARS.replace("\n", "")) self.errmsg = "Expected start of line" - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg - def preParse( self, instring, loc ): - preloc = super(LineStart,self).preParse(instring,loc) + def preParse(self, instring, loc): + preloc = super().preParse(instring, loc) if instring[preloc] == "\n": loc += 1 return loc - def parseImpl( self, instring, loc, doActions=True ): - if not( loc==0 or - (loc == self.preParse( instring, 0 )) or - (instring[loc-1] == "\n") ): #col(loc, instring) != 1: - #~ raise ParseException( instring, loc, "Expected start of line" ) + def parseImpl(self, instring, loc, doActions=True): + if not ( + loc == 0 + or (loc == self.preParse(instring, 0)) + or (instring[loc - 1] == "\n") + ): # col(loc, instring) != 1: + # ~ raise ParseException( instring, loc, "Expected start of line" ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc return loc, [] + class LineEnd(_PositionToken): """Matches if current position is at the end of a line within the parse string""" - def __init__( self ): - super(LineEnd,self).__init__() - self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") ) + + def __init__(self): + super().__init__() + self.setWhitespaceChars(ParserElement.DEFAULT_WHITE_CHARS.replace("\n", "")) self.errmsg = "Expected end of line" - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg - def parseImpl( self, instring, loc, doActions=True ): - if loc<len(instring): - if instring[loc] == "\n": - return loc+1, "\n" - else: + def parseImpl(self, instring, loc, doActions=True): + if loc < len(instring): + if instring[loc] == "\n": + return loc + 1, "\n" + else: exc = self.myException exc.loc = loc exc.pstr = instring raise exc - elif loc == len(instring): - return loc+1, [] + elif loc == len(instring): + return loc + 1, [] else: exc = self.myException exc.loc = loc exc.pstr = instring raise exc + class StringStart(_PositionToken): """Matches if current position is at the beginning of the parse string""" - def __init__( self ): - super(StringStart,self).__init__() + + def __init__(self): + super().__init__() self.errmsg = "Expected start of text" - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if loc != 0: # see if entire string up to here is just whitespace and ignoreables - if loc != self.preParse( instring, 0 ): + if loc != self.preParse(instring, 0): exc = self.myException exc.loc = loc exc.pstr = instring raise exc return loc, [] + class StringEnd(_PositionToken): """Matches if current position is at the end of the parse string""" - def __init__( self ): - super(StringEnd,self).__init__() + + def __init__(self): + super().__init__() self.errmsg = "Expected end of text" - #self.myException.msg = self.errmsg + # self.myException.msg = self.errmsg - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if loc < len(instring): exc = self.myException exc.loc = loc exc.pstr = instring raise exc - elif loc == len(instring): - return loc+1, [] + elif loc == len(instring): + return loc + 1, [] elif loc > len(instring): return loc, [] else: @@ -2166,47 +2518,55 @@ def parseImpl( self, instring, loc, doActions=True ): exc.pstr = instring raise exc + class WordStart(_PositionToken): """Matches if the current position is at the beginning of a Word, and - is not preceded by any character in a given set of wordChars - (default=printables). To emulate the \b behavior of regular expressions, - use WordStart(alphanums). WordStart will also match at the beginning of - the string being parsed, or at the beginning of a line. + is not preceded by any character in a given set of wordChars + (default=printables). To emulate the \b behavior of regular expressions, + use WordStart(alphanums). WordStart will also match at the beginning of + the string being parsed, or at the beginning of a line. """ - def __init__(self, wordChars = printables): - super(WordStart,self).__init__() + + def __init__(self, wordChars=printables): + super().__init__() self.wordChars = _str2dict(wordChars) self.errmsg = "Not at the start of a word" - def parseImpl(self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if loc != 0: - if (instring[loc-1] in self.wordChars or - instring[loc] not in self.wordChars): + if ( + instring[loc - 1] in self.wordChars + or instring[loc] not in self.wordChars + ): exc = self.myException exc.loc = loc exc.pstr = instring raise exc return loc, [] + class WordEnd(_PositionToken): """Matches if the current position is at the end of a Word, and - is not followed by any character in a given set of wordChars - (default=printables). 
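To make the \b analogy in the WordStart docstring above concrete, a sketch:

    from pyparsing import Literal, WordEnd, alphanums

    cat = Literal("cat") + WordEnd(alphanums)
    print(cat.parseString("cat food"))  # ['cat']
    print(cat.searchString("catalog"))  # [] -- 'cat' is not at a word end there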
To emulate the \b behavior of regular expressions, + use WordEnd(alphanums). WordEnd will also match at the end of + the string being parsed, or at the end of a line. """ - def __init__(self, wordChars = printables): - super(WordEnd,self).__init__() + + def __init__(self, wordChars=printables): + super().__init__() self.wordChars = _str2dict(wordChars) self.skipWhitespace = False self.errmsg = "Not at the end of a word" - def parseImpl(self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): instrlen = len(instring) - if instrlen>0 and loc<instrlen: - if (instring[loc] in self.wordChars or - instring[loc-1] not in self.wordChars): - #~ raise ParseException( instring, loc, "Expected end of word" ) + if instrlen > 0 and loc < instrlen: + if ( + instring[loc] in self.wordChars + or instring[loc - 1] not in self.wordChars + ): + # ~ raise ParseException( instring, loc, "Expected end of word" ) exc = self.myException exc.loc = loc exc.pstr = instring @@ -2216,60 +2576,61 @@ def parseImpl(self, instring, loc, doActions=True ): class ParseExpression(ParserElement): """Abstract subclass of ParserElement, for combining and post-processing parsed tokens.""" - def __init__( self, exprs, savelist = False ): - super(ParseExpression,self).__init__(savelist) - if isinstance( exprs, list ): + + def __init__(self, exprs, savelist=False): + super().__init__(savelist) + if isinstance(exprs, list): self.exprs = exprs - elif isinstance( exprs, basestring ): - self.exprs = [ Literal( exprs ) ] + elif isinstance(exprs, basestring): + self.exprs = [Literal(exprs)] else: try: - self.exprs = list( exprs ) + self.exprs = list(exprs) except TypeError: - self.exprs = [ exprs ] + self.exprs = [exprs] self.callPreparse = False - def __getitem__( self, i ): + def __getitem__(self, i): return self.exprs[i] - def append( self, other ): - self.exprs.append( other ) + def append(self, other): + self.exprs.append(other) self.strRepr = None return self - def leaveWhitespace( self ): + def leaveWhitespace(self): """Extends leaveWhitespace defined in base class, and also invokes leaveWhitespace on - all contained expressions.""" + all contained expressions.""" self.skipWhitespace = False - self.exprs = [ e.copy() for e in self.exprs ] + self.exprs = [e.copy() for e in self.exprs] for e in self.exprs: e.leaveWhitespace() return self - def ignore( self, other ): - if isinstance( other, Suppress ): + def ignore(self, other): + if isinstance(other, Suppress): if other not in self.ignoreExprs: - super( ParseExpression, self).ignore( other ) + super().ignore(other) for e in self.exprs: - e.ignore( self.ignoreExprs[-1] ) + e.ignore(self.ignoreExprs[-1]) else: - super( ParseExpression, self).ignore( other ) + super().ignore(other) for e in self.exprs: - e.ignore( self.ignoreExprs[-1] ) + e.ignore(self.ignoreExprs[-1]) return self - def __str__( self ): + def __str__(self): try: - return super(ParseExpression,self).__str__() + return super().__str__() except: pass if self.strRepr is None: - self.strRepr = "%s:(%s)" % ( self.__class__.__name__, _ustr(self.exprs) ) + self.strRepr = f"{self.__class__.__name__}:({_ustr(self.exprs)})" return self.strRepr - def streamline( self ): - super(ParseExpression,self).streamline() + def streamline(self): + super().streamline() for e in self.exprs: e.streamline() @@ -2277,65 +2638,72 @@ def streamline( self ): # collapse nested And's of the form And( And( And( a,b), c), d) to And( a,b,c,d ) # but only if there are no parse actions or resultsNames on the nested And's # (likewise for Or's and MatchFirst's) - if ( len(self.exprs) == 2 ): + if len(self.exprs) == 2: other = self.exprs[0] - if ( isinstance( other, self.__class__ ) and - 
not(other.parseAction) and - other.resultsName is None and - not other.debug ): - self.exprs = other.exprs[:] + [ self.exprs[1] ] + if ( + isinstance(other, self.__class__) + and not (other.parseAction) + and other.resultsName is None + and not other.debug + ): + self.exprs = other.exprs[:] + [self.exprs[1]] self.strRepr = None self.mayReturnEmpty |= other.mayReturnEmpty - self.mayIndexError |= other.mayIndexError + self.mayIndexError |= other.mayIndexError other = self.exprs[-1] - if ( isinstance( other, self.__class__ ) and - not(other.parseAction) and - other.resultsName is None and - not other.debug ): + if ( + isinstance(other, self.__class__) + and not (other.parseAction) + and other.resultsName is None + and not other.debug + ): self.exprs = self.exprs[:-1] + other.exprs[:] self.strRepr = None self.mayReturnEmpty |= other.mayReturnEmpty - self.mayIndexError |= other.mayIndexError + self.mayIndexError |= other.mayIndexError return self - def setResultsName( self, name, listAllMatches=False ): - ret = super(ParseExpression,self).setResultsName(name,listAllMatches) + def setResultsName(self, name, listAllMatches=False): + ret = super().setResultsName(name, listAllMatches) return ret - def validate( self, validateTrace=[] ): - tmp = validateTrace[:]+[self] + def validate(self, validateTrace=[]): + tmp = validateTrace[:] + [self] for e in self.exprs: e.validate(tmp) - self.checkRecursion( [] ) + self.checkRecursion([]) + class And(ParseExpression): """Requires all given ParseExpressions to be found in the given order. - Expressions may be separated by whitespace. - May be constructed using the '+' operator. + Expressions may be separated by whitespace. + May be constructed using the '+' operator. """ class _ErrorStop(Empty): def __init__(self, *args, **kwargs): - super(Empty,self).__init__(*args, **kwargs) + super(Empty, self).__init__(*args, **kwargs) self.leaveWhitespace() - def __init__( self, exprs, savelist = True ): - super(And,self).__init__(exprs, savelist) + def __init__(self, exprs, savelist=True): + super().__init__(exprs, savelist) self.mayReturnEmpty = True for e in self.exprs: if not e.mayReturnEmpty: self.mayReturnEmpty = False break - self.setWhitespaceChars( exprs[0].whiteChars ) + self.setWhitespaceChars(exprs[0].whiteChars) self.skipWhitespace = exprs[0].skipWhitespace self.callPreparse = True - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): # pass False as last arg to _parse for first element, since we already # pre-parsed the string as part of our And pre-parsing - loc, resultlist = self.exprs[0]._parse( instring, loc, doActions, callPreParse=False ) + loc, resultlist = self.exprs[0]._parse( + instring, loc, doActions, callPreParse=False + ) errorStop = False for e in self.exprs[1:]: if isinstance(e, And._ErrorStop): @@ -2343,68 +2711,73 @@ def parseImpl( self, instring, loc, doActions=True ): continue if errorStop: try: - loc, exprtokens = e._parse( instring, loc, doActions ) + loc, exprtokens = e._parse(instring, loc, doActions) except ParseSyntaxException: raise except ParseBaseException as pe: raise ParseSyntaxException(pe) except IndexError as ie: - raise ParseSyntaxException( ParseException(instring, len(instring), self.errmsg, self) ) + raise ParseSyntaxException( + ParseException(instring, len(instring), self.errmsg, self) + ) else: - loc, exprtokens = e._parse( instring, loc, doActions ) + loc, exprtokens = e._parse(instring, loc, doActions) if exprtokens or exprtokens.keys(): resultlist += 
exprtokens return loc, resultlist - def __iadd__(self, other ): - if isinstance( other, basestring ): - other = Literal( other ) - return self.append( other ) #And( [ self, other ] ) + def __iadd__(self, other): + if isinstance(other, basestring): + other = Literal(other) + return self.append(other) # And( [ self, other ] ) - def checkRecursion( self, parseElementList ): - subRecCheckList = parseElementList[:] + [ self ] + def checkRecursion(self, parseElementList): + subRecCheckList = parseElementList[:] + [self] for e in self.exprs: - e.checkRecursion( subRecCheckList ) + e.checkRecursion(subRecCheckList) if not e.mayReturnEmpty: break - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: - self.strRepr = "{" + " ".join( [ _ustr(e) for e in self.exprs ] ) + "}" + self.strRepr = "{" + " ".join([_ustr(e) for e in self.exprs]) + "}" return self.strRepr class Or(ParseExpression): """Requires that at least one ParseExpression is found. - If two expressions match, the expression that matches the longest string will be used. - May be constructed using the '^' operator. + If two expressions match, the expression that matches the longest string will be used. + May be constructed using the '^' operator. """ - def __init__( self, exprs, savelist = False ): - super(Or,self).__init__(exprs, savelist) + + def __init__(self, exprs, savelist=False): + super().__init__(exprs, savelist) self.mayReturnEmpty = False for e in self.exprs: if e.mayReturnEmpty: self.mayReturnEmpty = True break - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): maxExcLoc = -1 maxMatchLoc = -1 maxException = None for e in self.exprs: try: - loc2 = e.tryParse( instring, loc ) + loc2 = e.tryParse(instring, loc) except ParseException as err: if err.loc > maxExcLoc: maxException = err maxExcLoc = err.loc except IndexError: if len(instring) > maxExcLoc: - maxException = ParseException(instring,len(instring),e.errmsg,self) + maxException = ParseException( + instring, len(instring), e.errmsg, self + ) maxExcLoc = len(instring) else: if loc2 > maxMatchLoc: @@ -2415,37 +2788,40 @@ def parseImpl( self, instring, loc, doActions=True ): if maxException is not None: raise maxException else: - raise ParseException(instring, loc, "no defined alternatives to match", self) + raise ParseException( + instring, loc, "no defined alternatives to match", self + ) - return maxMatchExp._parse( instring, loc, doActions ) + return maxMatchExp._parse(instring, loc, doActions) - def __ixor__(self, other ): - if isinstance( other, basestring ): - other = Literal( other ) - return self.append( other ) #Or( [ self, other ] ) + def __ixor__(self, other): + if isinstance(other, basestring): + other = Literal(other) + return self.append(other) # Or( [ self, other ] ) - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: - self.strRepr = "{" + " ^ ".join( [ _ustr(e) for e in self.exprs ] ) + "}" + self.strRepr = "{" + " ^ ".join([_ustr(e) for e in self.exprs]) + "}" return self.strRepr - def checkRecursion( self, parseElementList ): - subRecCheckList = parseElementList[:] + [ self ] + def checkRecursion(self, parseElementList): + subRecCheckList = parseElementList[:] + [self] for e in self.exprs: - e.checkRecursion( subRecCheckList ) + e.checkRecursion(subRecCheckList) class MatchFirst(ParseExpression): """Requires that at least one 
ParseExpression is found. - If two expressions match, the first one listed is the one that will match. - May be constructed using the '|' operator. + If two expressions match, the first one listed is the one that will match. + May be constructed using the '|' operator. """ - def __init__( self, exprs, savelist = False ): - super(MatchFirst,self).__init__(exprs, savelist) + + def __init__(self, exprs, savelist=False): + super().__init__(exprs, savelist) if exprs: self.mayReturnEmpty = False for e in self.exprs: @@ -2455,12 +2831,12 @@ def __init__( self, exprs, savelist = False ): else: self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): maxExcLoc = -1 maxException = None for e in self.exprs: try: - ret = e._parse( instring, loc, doActions ) + ret = e._parse(instring, loc, doActions) return ret except ParseException as err: if err.loc > maxExcLoc: @@ -2468,7 +2844,9 @@ def parseImpl( self, instring, loc, doActions=True ): maxExcLoc = err.loc except IndexError: if len(instring) > maxExcLoc: - maxException = ParseException(instring,len(instring),e.errmsg,self) + maxException = ParseException( + instring, len(instring), e.errmsg, self + ) maxExcLoc = len(instring) # only got here if no expression matched, raise exception for match that made it the furthest @@ -2476,35 +2854,38 @@ def parseImpl( self, instring, loc, doActions=True ): if maxException is not None: raise maxException else: - raise ParseException(instring, loc, "no defined alternatives to match", self) + raise ParseException( + instring, loc, "no defined alternatives to match", self + ) - def __ior__(self, other ): - if isinstance( other, basestring ): - other = Literal( other ) - return self.append( other ) #MatchFirst( [ self, other ] ) + def __ior__(self, other): + if isinstance(other, basestring): + other = Literal(other) + return self.append(other) # MatchFirst( [ self, other ] ) - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: - self.strRepr = "{" + " | ".join( [ _ustr(e) for e in self.exprs ] ) + "}" + self.strRepr = "{" + " | ".join([_ustr(e) for e in self.exprs]) + "}" return self.strRepr - def checkRecursion( self, parseElementList ): - subRecCheckList = parseElementList[:] + [ self ] + def checkRecursion(self, parseElementList): + subRecCheckList = parseElementList[:] + [self] for e in self.exprs: - e.checkRecursion( subRecCheckList ) + e.checkRecursion(subRecCheckList) class Each(ParseExpression): """Requires all given ParseExpressions to be found, but in any order. - Expressions may be separated by whitespace. - May be constructed using the '&' operator. + Expressions may be separated by whitespace. + May be constructed using the '&' operator. 
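How the operator-built combinators above differ in practice; a sketch:

    from pyparsing import Keyword, Literal, Word, nums

    number = (Word(nums) + "." + Word(nums)) ^ Word(nums)  # Or: longest match wins
    print(number.parseString("3.14"))   # ['3', '.', '14']

    first = Literal("in") | Literal("inside")              # MatchFirst: first listed wins
    print(first.parseString("inside"))  # ['in'] -- so list longer alternatives first

    spec = (Keyword("red") | Keyword("blue")) & Word(nums)  # Each: any order
    print(spec.parseString("7 red"))    # ['7', 'red']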
""" - def __init__( self, exprs, savelist = True ): - super(Each,self).__init__(exprs, savelist) + + def __init__(self, exprs, savelist=True): + super().__init__(exprs, savelist) self.mayReturnEmpty = True for e in self.exprs: if not e.mayReturnEmpty: @@ -2513,17 +2894,25 @@ def __init__( self, exprs, savelist = True ): self.skipWhitespace = True self.initExprGroups = True - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if self.initExprGroups: - self.optionals = [ e.expr for e in self.exprs if isinstance(e,Optional) ] - self.multioptionals = [ e.expr for e in self.exprs if isinstance(e,ZeroOrMore) ] - self.multirequired = [ e.expr for e in self.exprs if isinstance(e,OneOrMore) ] - self.required = [ e for e in self.exprs if not isinstance(e,(Optional,ZeroOrMore,OneOrMore)) ] + self.optionals = [e.expr for e in self.exprs if isinstance(e, Optional)] + self.multioptionals = [ + e.expr for e in self.exprs if isinstance(e, ZeroOrMore) + ] + self.multirequired = [ + e.expr for e in self.exprs if isinstance(e, OneOrMore) + ] + self.required = [ + e + for e in self.exprs + if not isinstance(e, (Optional, ZeroOrMore, OneOrMore)) + ] self.required += self.multirequired self.initExprGroups = False tmpLoc = loc tmpReqd = self.required[:] - tmpOpt = self.optionals[:] + tmpOpt = self.optionals[:] matchOrder = [] keepMatching = True @@ -2532,7 +2921,7 @@ def parseImpl( self, instring, loc, doActions=True ): failed = [] for e in tmpExprs: try: - tmpLoc = e.tryParse( instring, tmpLoc ) + tmpLoc = e.tryParse(instring, tmpLoc) except ParseException: failed.append(e) else: @@ -2545,15 +2934,19 @@ def parseImpl( self, instring, loc, doActions=True ): keepMatching = False if tmpReqd: - missing = ", ".join( [ _ustr(e) for e in tmpReqd ] ) - raise ParseException(instring,loc,"Missing one or more required elements (%s)" % missing ) + missing = ", ".join([_ustr(e) for e in tmpReqd]) + raise ParseException( + instring, loc, f"Missing one or more required elements ({missing})" + ) # add any unmatched Optionals, in case they have default values defined - matchOrder += list(e for e in self.exprs if isinstance(e,Optional) and e.expr in tmpOpt) + matchOrder += list( + e for e in self.exprs if isinstance(e, Optional) and e.expr in tmpOpt + ) resultlist = [] for e in matchOrder: - loc,results = e._parse(instring,loc,doActions) + loc, results = e._parse(instring, loc, doActions) resultlist.append(results) finalResults = ParseResults([]) @@ -2565,94 +2958,95 @@ def parseImpl( self, instring, loc, doActions=True ): tmp += ParseResults(r[k]) dups[k] = tmp finalResults += ParseResults(r) - for k,v in dups.items(): + for k, v in dups.items(): finalResults[k] = v return loc, finalResults - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: - self.strRepr = "{" + " & ".join( [ _ustr(e) for e in self.exprs ] ) + "}" + self.strRepr = "{" + " & ".join([_ustr(e) for e in self.exprs]) + "}" return self.strRepr - def checkRecursion( self, parseElementList ): - subRecCheckList = parseElementList[:] + [ self ] + def checkRecursion(self, parseElementList): + subRecCheckList = parseElementList[:] + [self] for e in self.exprs: - e.checkRecursion( subRecCheckList ) + e.checkRecursion(subRecCheckList) class ParseElementEnhance(ParserElement): """Abstract subclass of ParserElement, for combining and post-processing parsed tokens.""" - def __init__( self, expr, savelist=False ): - 
super(ParseElementEnhance,self).__init__(savelist) - if isinstance( expr, basestring ): + + def __init__(self, expr, savelist=False): + super().__init__(savelist) + if isinstance(expr, basestring): expr = Literal(expr) self.expr = expr self.strRepr = None if expr is not None: self.mayIndexError = expr.mayIndexError self.mayReturnEmpty = expr.mayReturnEmpty - self.setWhitespaceChars( expr.whiteChars ) + self.setWhitespaceChars(expr.whiteChars) self.skipWhitespace = expr.skipWhitespace self.saveAsList = expr.saveAsList self.callPreparse = expr.callPreparse self.ignoreExprs.extend(expr.ignoreExprs) - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): if self.expr is not None: - return self.expr._parse( instring, loc, doActions, callPreParse=False ) + return self.expr._parse(instring, loc, doActions, callPreParse=False) else: - raise ParseException("",loc,self.errmsg,self) + raise ParseException("", loc, self.errmsg, self) - def leaveWhitespace( self ): + def leaveWhitespace(self): self.skipWhitespace = False self.expr = self.expr.copy() if self.expr is not None: self.expr.leaveWhitespace() return self - def ignore( self, other ): - if isinstance( other, Suppress ): + def ignore(self, other): + if isinstance(other, Suppress): if other not in self.ignoreExprs: - super( ParseElementEnhance, self).ignore( other ) + super().ignore(other) if self.expr is not None: - self.expr.ignore( self.ignoreExprs[-1] ) + self.expr.ignore(self.ignoreExprs[-1]) else: - super( ParseElementEnhance, self).ignore( other ) + super().ignore(other) if self.expr is not None: - self.expr.ignore( self.ignoreExprs[-1] ) + self.expr.ignore(self.ignoreExprs[-1]) return self - def streamline( self ): - super(ParseElementEnhance,self).streamline() + def streamline(self): + super().streamline() if self.expr is not None: self.expr.streamline() return self - def checkRecursion( self, parseElementList ): + def checkRecursion(self, parseElementList): if self in parseElementList: - raise RecursiveGrammarException( parseElementList+[self] ) - subRecCheckList = parseElementList[:] + [ self ] + raise RecursiveGrammarException(parseElementList + [self]) + subRecCheckList = parseElementList[:] + [self] if self.expr is not None: - self.expr.checkRecursion( subRecCheckList ) + self.expr.checkRecursion(subRecCheckList) - def validate( self, validateTrace=[] ): - tmp = validateTrace[:]+[self] + def validate(self, validateTrace=[]): + tmp = validateTrace[:] + [self] if self.expr is not None: self.expr.validate(tmp) - self.checkRecursion( [] ) + self.checkRecursion([]) - def __str__( self ): + def __str__(self): try: - return super(ParseElementEnhance,self).__str__() + return super().__str__() except: pass if self.strRepr is None and self.expr is not None: - self.strRepr = "%s:(%s)" % ( self.__class__.__name__, _ustr(self.expr) ) + self.strRepr = f"{self.__class__.__name__}:({_ustr(self.expr)})" return self.strRepr @@ -2661,12 +3055,13 @@ class FollowedBy(ParseElementEnhance): does *not* advance the parsing position within the input string, it only verifies that the specified parse expression matches at the current position. 
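A sketch of the non-consuming lookahead described here:

    from pyparsing import FollowedBy, Word, alphas

    label = Word(alphas) + FollowedBy(":")
    print(label.parseString("width: 100"))  # ['width'] -- the ':' is not consumed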
FollowedBy always returns a null token list.""" - def __init__( self, expr ): - super(FollowedBy,self).__init__(expr) + + def __init__(self, expr): + super().__init__(expr) self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): - self.expr.tryParse( instring, loc ) + def parseImpl(self, instring, loc, doActions=True): + self.expr.tryParse(instring, loc) return loc, [] @@ -2676,29 +3071,32 @@ class NotAny(ParseElementEnhance): verifies that the specified parse expression does *not* match at the current position. Also, NotAny does *not* skip over leading whitespace. NotAny always returns a null token list. May be constructed using the '~' operator.""" - def __init__( self, expr ): - super(NotAny,self).__init__(expr) - #~ self.leaveWhitespace() - self.skipWhitespace = False # do NOT use self.leaveWhitespace(), don't want to propagate to exprs + + def __init__(self, expr): + super().__init__(expr) + # ~ self.leaveWhitespace() + self.skipWhitespace = ( + False # do NOT use self.leaveWhitespace(), don't want to propagate to exprs + ) self.mayReturnEmpty = True - self.errmsg = "Found unwanted token, "+_ustr(self.expr) - #self.myException = ParseException("",0,self.errmsg,self) + self.errmsg = "Found unwanted token, " + _ustr(self.expr) + # self.myException = ParseException("",0,self.errmsg,self) - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): try: - self.expr.tryParse( instring, loc ) - except (ParseException,IndexError): + self.expr.tryParse(instring, loc) + except (ParseException, IndexError): pass else: - #~ raise ParseException(instring, loc, self.errmsg ) + # ~ raise ParseException(instring, loc, self.errmsg ) exc = self.myException exc.loc = loc exc.pstr = instring raise exc return loc, [] - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: @@ -2709,30 +3107,31 @@ def __str__( self ): class ZeroOrMore(ParseElementEnhance): """Optional repetition of zero or more of the given expression.""" - def __init__( self, expr ): - super(ZeroOrMore,self).__init__(expr) + + def __init__(self, expr): + super().__init__(expr) self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): tokens = [] try: - loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) - hasIgnoreExprs = ( len(self.ignoreExprs) > 0 ) + loc, tokens = self.expr._parse(instring, loc, doActions, callPreParse=False) + hasIgnoreExprs = len(self.ignoreExprs) > 0 while 1: if hasIgnoreExprs: - preloc = self._skipIgnorables( instring, loc ) + preloc = self._skipIgnorables(instring, loc) else: preloc = loc - loc, tmptokens = self.expr._parse( instring, preloc, doActions ) + loc, tmptokens = self.expr._parse(instring, preloc, doActions) if tmptokens or tmptokens.keys(): tokens += tmptokens - except (ParseException,IndexError): + except (ParseException, IndexError): pass return loc, tokens - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: @@ -2740,34 +3139,35 @@ def __str__( self ): return self.strRepr - def setResultsName( self, name, listAllMatches=False ): - ret = super(ZeroOrMore,self).setResultsName(name,listAllMatches) + def setResultsName(self, name, listAllMatches=False): + ret = super().setResultsName(name, listAllMatches) ret.saveAsList = True return ret class 
OneOrMore(ParseElementEnhance): """Repetition of one or more of the given expression.""" - def parseImpl( self, instring, loc, doActions=True ): + + def parseImpl(self, instring, loc, doActions=True): # must be at least one - loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) + loc, tokens = self.expr._parse(instring, loc, doActions, callPreParse=False) try: - hasIgnoreExprs = ( len(self.ignoreExprs) > 0 ) + hasIgnoreExprs = len(self.ignoreExprs) > 0 while 1: if hasIgnoreExprs: - preloc = self._skipIgnorables( instring, loc ) + preloc = self._skipIgnorables(instring, loc) else: preloc = loc - loc, tmptokens = self.expr._parse( instring, preloc, doActions ) + loc, tmptokens = self.expr._parse(instring, preloc, doActions) if tmptokens or tmptokens.keys(): tokens += tmptokens - except (ParseException,IndexError): + except (ParseException, IndexError): pass return loc, tokens - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: @@ -2775,45 +3175,52 @@ def __str__( self ): return self.strRepr - def setResultsName( self, name, listAllMatches=False ): - ret = super(OneOrMore,self).setResultsName(name,listAllMatches) + def setResultsName(self, name, listAllMatches=False): + ret = super().setResultsName(name, listAllMatches) ret.saveAsList = True return ret -class _NullToken(object): + +class _NullToken: def __bool__(self): return False + __nonzero__ = __bool__ + def __str__(self): return "" + _optionalNotMatched = _NullToken() + + class Optional(ParseElementEnhance): """Optional matching of the given expression. - A default return string can also be specified, if the optional expression - is not found. + A default return string can also be specified, if the optional expression + is not found. """ - def __init__( self, exprs, default=_optionalNotMatched ): - super(Optional,self).__init__( exprs, savelist=False ) + + def __init__(self, exprs, default=_optionalNotMatched): + super().__init__(exprs, savelist=False) self.defaultValue = default self.mayReturnEmpty = True - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): try: - loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) - except (ParseException,IndexError): + loc, tokens = self.expr._parse(instring, loc, doActions, callPreParse=False) + except (ParseException, IndexError): if self.defaultValue is not _optionalNotMatched: if self.expr.resultsName: - tokens = ParseResults([ self.defaultValue ]) + tokens = ParseResults([self.defaultValue]) tokens[self.expr.resultsName] = self.defaultValue else: - tokens = [ self.defaultValue ] + tokens = [self.defaultValue] else: tokens = [] return loc, tokens - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name if self.strRepr is None: @@ -2824,13 +3231,14 @@ def __str__( self ): class SkipTo(ParseElementEnhance): """Token for skipping over all undefined text until the matched expression is found. - If include is set to true, the matched expression is also parsed (the skipped text - and matched expression are returned as a 2-element list). The ignore - argument is used to define grammars (typically quoted strings and comments) that - might contain false matches. + If include is set to true, the matched expression is also parsed (the skipped text + and matched expression are returned as a 2-element list). 
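The default-value branch of Optional.parseImpl above, in use; a sketch:

    from pyparsing import Optional, Word, nums

    port = Optional(Word(nums), default="80")
    print(port.parseString("8080"))  # ['8080']
    print(port.parseString(""))      # ['80'] -- default supplied when nothing matched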
The ignore + argument is used to define grammars (typically quoted strings and comments) that + might contain false matches. """ - def __init__( self, other, include=False, ignore=None, failOn=None ): - super( SkipTo, self ).__init__( other ) + + def __init__(self, other, include=False, ignore=None, failOn=None): + super().__init__(other) self.ignoreExpr = ignore self.mayReturnEmpty = True self.mayIndexError = False @@ -2840,10 +3248,10 @@ def __init__( self, other, include=False, ignore=None, failOn=None ): self.failOn = Literal(failOn) else: self.failOn = failOn - self.errmsg = "No match found for "+_ustr(self.expr) - #self.myException = ParseException("",0,self.errmsg,self) + self.errmsg = "No match found for " + _ustr(self.expr) + # self.myException = ParseException("",0,self.errmsg,self) - def parseImpl( self, instring, loc, doActions=True ): + def parseImpl(self, instring, loc, doActions=True): startLoc = loc instrlen = len(instring) expr = self.expr @@ -2857,28 +3265,30 @@ def parseImpl( self, instring, loc, doActions=True ): pass else: failParse = True - raise ParseException(instring, loc, "Found expression " + str(self.failOn)) + raise ParseException( + instring, loc, "Found expression " + str(self.failOn) + ) failParse = False if self.ignoreExpr is not None: while 1: try: - loc = self.ignoreExpr.tryParse(instring,loc) - print ("found ignoreExpr, advance to", loc) + loc = self.ignoreExpr.tryParse(instring, loc) + print("found ignoreExpr, advance to", loc) except ParseBaseException: break - expr._parse( instring, loc, doActions=False, callPreParse=False ) + expr._parse(instring, loc, doActions=False, callPreParse=False) skipText = instring[startLoc:loc] if self.includeMatch: - loc,mat = expr._parse(instring,loc,doActions,callPreParse=False) + loc, mat = expr._parse(instring, loc, doActions, callPreParse=False) if mat: - skipRes = ParseResults( skipText ) + skipRes = ParseResults(skipText) skipRes += mat - return loc, [ skipRes ] + return loc, [skipRes] else: - return loc, [ skipText ] + return loc, [skipText] else: - return loc, [ skipText ] - except (ParseException,IndexError): + return loc, [skipText] + except (ParseException, IndexError): if failParse: raise else: @@ -2888,57 +3298,59 @@ def parseImpl( self, instring, loc, doActions=True ): exc.pstr = instring raise exc + class Forward(ParseElementEnhance): """Forward declaration of an expression to be defined later - - used for recursive grammars, such as algebraic infix notation. - When the expression is known, it is assigned to the Forward variable using the '<<' operator. - - Note: take care when assigning to Forward not to overlook precedence of operators. - Specifically, '|' has a lower precedence than '<<', so that:: - fwdExpr << a | b | c - will actually be evaluated as:: - (fwdExpr << a) | b | c - thereby leaving b and c out as parseable alternatives. It is recommended that you - explicitly group the values inserted into the Forward:: - fwdExpr << (a | b | c) + used for recursive grammars, such as algebraic infix notation. + When the expression is known, it is assigned to the Forward variable using the '<<' operator. + + Note: take care when assigning to Forward not to overlook precedence of operators. + Specifically, '|' has a lower precedence than '<<', so that:: + fwdExpr << a | b | c + will actually be evaluated as:: + (fwdExpr << a) | b | c + thereby leaving b and c out as parseable alternatives. 
It is recommended that you + explicitly group the values inserted into the Forward:: + fwdExpr << (a | b | c) """ - def __init__( self, other=None ): - super(Forward,self).__init__( other, savelist=False ) - def __lshift__( self, other ): - if isinstance( other, basestring ): + def __init__(self, other=None): + super().__init__(other, savelist=False) + + def __lshift__(self, other): + if isinstance(other, basestring): other = Literal(other) self.expr = other self.mayReturnEmpty = other.mayReturnEmpty self.strRepr = None self.mayIndexError = self.expr.mayIndexError self.mayReturnEmpty = self.expr.mayReturnEmpty - self.setWhitespaceChars( self.expr.whiteChars ) + self.setWhitespaceChars(self.expr.whiteChars) self.skipWhitespace = self.expr.skipWhitespace self.saveAsList = self.expr.saveAsList self.ignoreExprs.extend(self.expr.ignoreExprs) return None - def leaveWhitespace( self ): + def leaveWhitespace(self): self.skipWhitespace = False return self - def streamline( self ): + def streamline(self): if not self.streamlined: self.streamlined = True if self.expr is not None: self.expr.streamline() return self - def validate( self, validateTrace=[] ): + def validate(self, validateTrace=[]): if self not in validateTrace: - tmp = validateTrace[:]+[self] + tmp = validateTrace[:] + [self] if self.expr is not None: self.expr.validate(tmp) self.checkRecursion([]) - def __str__( self ): - if hasattr(self,"name"): + def __str__(self): + if hasattr(self, "name"): return self.name self._revertClass = self.__class__ @@ -2954,40 +3366,49 @@ def __str__( self ): def copy(self): if self.expr is not None: - return super(Forward,self).copy() + return super().copy() else: ret = Forward() ret << self return ret + class _ForwardNoRecurse(Forward): - def __str__( self ): + def __str__(self): return "..." + class TokenConverter(ParseElementEnhance): """Abstract subclass of ParseExpression, for converting parsed results.""" - def __init__( self, expr, savelist=False ): - super(TokenConverter,self).__init__( expr )#, savelist ) + + def __init__(self, expr, savelist=False): + super().__init__(expr) # , savelist ) self.saveAsList = False + class Upcase(TokenConverter): """Converter to upper case all matching tokens.""" + def __init__(self, *args): - super(Upcase,self).__init__(*args) - warnings.warn("Upcase class is deprecated, use upcaseTokens parse action instead", - DeprecationWarning,stacklevel=2) + super().__init__(*args) + warnings.warn( + "Upcase class is deprecated, use upcaseTokens parse action instead", + DeprecationWarning, + stacklevel=2, + ) - def postParse( self, instring, loc, tokenlist ): - return list(map( string.upper, tokenlist )) + def postParse(self, instring, loc, tokenlist): + return list(map(string.upper, tokenlist)) class Combine(TokenConverter): """Converter to concatenate all matching tokens to a single string. - By default, the matching patterns must also be contiguous in the input string; - this can be disabled by specifying 'adjacent=False' in the constructor. + By default, the matching patterns must also be contiguous in the input string; + this can be disabled by specifying 'adjacent=False' in the constructor. 
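A sketch of the Combine adjacency rule described above, using a hypothetical real-number grammar:

    from whoosh.support.pyparsing import Combine, Word, nums

    decimal = Combine(Word(nums) + "." + Word(nums))
    print(decimal.parseString("3.14"))    # ['3.14'] - one joined token
    print(decimal.parseString("3 . 14"))  # raises ParseException: pieces must be adjacent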
""" - def __init__( self, expr, joinString="", adjacent=True ): - super(Combine,self).__init__( expr ) + + def __init__(self, expr, joinString="", adjacent=True): + super().__init__(expr) # suppress whitespace-stripping in contained parse expressions, but re-enable it on the Combine itself if adjacent: self.leaveWhitespace() @@ -2995,388 +3416,474 @@ def __init__( self, expr, joinString="", adjacent=True ): self.skipWhitespace = True self.joinString = joinString - def ignore( self, other ): + def ignore(self, other): if self.adjacent: ParserElement.ignore(self, other) else: - super( Combine, self).ignore( other ) + super().ignore(other) return self - def postParse( self, instring, loc, tokenlist ): + def postParse(self, instring, loc, tokenlist): retToks = tokenlist.copy() del retToks[:] - retToks += ParseResults([ "".join(tokenlist._asStringList(self.joinString)) ], modal=self.modalResults) + retToks += ParseResults( + ["".join(tokenlist._asStringList(self.joinString))], modal=self.modalResults + ) - if self.resultsName and len(retToks.keys())>0: - return [ retToks ] + if self.resultsName and len(retToks.keys()) > 0: + return [retToks] else: return retToks + class Group(TokenConverter): """Converter to return the matched tokens as a list - useful for returning tokens of ZeroOrMore and OneOrMore expressions.""" - def __init__( self, expr ): - super(Group,self).__init__( expr ) + + def __init__(self, expr): + super().__init__(expr) self.saveAsList = True - def postParse( self, instring, loc, tokenlist ): - return [ tokenlist ] + def postParse(self, instring, loc, tokenlist): + return [tokenlist] + class Dict(TokenConverter): """Converter to return a repetitive expression as a list, but also as a dictionary. - Each element can also be referenced using the first token in the expression as its key. - Useful for tabular report scraping when the first column can be used as a item key. + Each element can also be referenced using the first token in the expression as its key. + Useful for tabular report scraping when the first column can be used as a item key. 
""" - def __init__( self, exprs ): - super(Dict,self).__init__( exprs ) + + def __init__(self, exprs): + super().__init__(exprs) self.saveAsList = True - def postParse( self, instring, loc, tokenlist ): - for i,tok in enumerate(tokenlist): + def postParse(self, instring, loc, tokenlist): + for i, tok in enumerate(tokenlist): if len(tok) == 0: continue ikey = tok[0] - if isinstance(ikey,int): + if isinstance(ikey, int): ikey = _ustr(tok[0]).strip() - if len(tok)==1: - tokenlist[ikey] = _ParseResultsWithOffset("",i) - elif len(tok)==2 and not isinstance(tok[1],ParseResults): - tokenlist[ikey] = _ParseResultsWithOffset(tok[1],i) + if len(tok) == 1: + tokenlist[ikey] = _ParseResultsWithOffset("", i) + elif len(tok) == 2 and not isinstance(tok[1], ParseResults): + tokenlist[ikey] = _ParseResultsWithOffset(tok[1], i) else: - dictvalue = tok.copy() #ParseResults(i) + dictvalue = tok.copy() # ParseResults(i) del dictvalue[0] - if len(dictvalue)!= 1 or (isinstance(dictvalue,ParseResults) and dictvalue.keys()): - tokenlist[ikey] = _ParseResultsWithOffset(dictvalue,i) + if len(dictvalue) != 1 or ( + isinstance(dictvalue, ParseResults) and dictvalue.keys() + ): + tokenlist[ikey] = _ParseResultsWithOffset(dictvalue, i) else: - tokenlist[ikey] = _ParseResultsWithOffset(dictvalue[0],i) + tokenlist[ikey] = _ParseResultsWithOffset(dictvalue[0], i) if self.resultsName: - return [ tokenlist ] + return [tokenlist] else: return tokenlist class Suppress(TokenConverter): """Converter for ignoring the results of a parsed expression.""" - def postParse( self, instring, loc, tokenlist ): + + def postParse(self, instring, loc, tokenlist): return [] - def suppress( self ): + def suppress(self): return self -class OnlyOnce(object): +class OnlyOnce: """Wrapper for parse actions, to ensure they are only called once.""" + def __init__(self, methodCall): self.callable = ParserElement._normalizeParseActionArgs(methodCall) self.called = False - def __call__(self,s,l,t): + + def __call__(self, s, l, t): if not self.called: - results = self.callable(s,l,t) + results = self.callable(s, l, t) self.called = True return results - raise ParseException(s,l,"") + raise ParseException(s, l, "") + def reset(self): self.called = False + def traceParseAction(f): """Decorator for debugging parse actions.""" f = ParserElement._normalizeParseActionArgs(f) + def z(*paArgs): thisFunc = f.func_name - s,l,t = paArgs[-3:] - if len(paArgs)>3: - thisFunc = paArgs[0].__class__.__name__ + '.' + thisFunc - sys.stderr.write( ">>entering %s(line: '%s', %d, %s)\n" % (thisFunc,line(l,s),l,t) ) + s, l, t = paArgs[-3:] + if len(paArgs) > 3: + thisFunc = paArgs[0].__class__.__name__ + "." 
+ thisFunc
+        sys.stderr.write(
+            ">>entering %s(line: '%s', %d, %s)\n" % (thisFunc, line(l, s), l, t)
+        )
         try:
             ret = f(*paArgs)
-        except Exception as exc:
-            sys.stderr.write( "<<leaving %s (exception: %s)\n" % (thisFunc,exc) )
-            raise

    # ~ print (strs,"->", "|".join( [ _escapeRegexChars(sym) for sym in symbols] ))
     try:
-        if len(symbols)==len("".join(symbols)):
-            return Regex( "[%s]" % "".join( [ _escapeRegexRangeChars(sym) for sym in symbols] ) )
+        if len(symbols) == len("".join(symbols)):
+            return Regex(
+                f"[{''.join([_escapeRegexRangeChars(sym) for sym in symbols])}]"
+            )
         else:
-            return Regex( "|".join( [ re.escape(sym) for sym in symbols] ) )
+            return Regex("|".join([re.escape(sym) for sym in symbols]))
     except:
-        warnings.warn("Exception creating Regex for oneOf, building MatchFirst",
-                SyntaxWarning, stacklevel=2)
-
+        warnings.warn(
+            "Exception creating Regex for oneOf, building MatchFirst",
+            SyntaxWarning,
+            stacklevel=2,
+        )
     # last resort, just use MatchFirst
-    return MatchFirst( [ parseElementClass(sym) for sym in symbols ] )
+    return MatchFirst([parseElementClass(sym) for sym in symbols])
+

-def dictOf( key, value ):
+def dictOf(key, value):
     """Helper to easily and clearly define a dictionary by specifying the respective patterns
-       for the key and value.  Takes care of defining the Dict, ZeroOrMore, and Group tokens
-       in the proper order.  The key pattern can include delimiting markers or punctuation,
-       as long as they are suppressed, thereby leaving the significant key text.  The value
-       pattern can include named results, so that the Dict results can include named token
-       fields.
+    for the key and value. Takes care of defining the Dict, ZeroOrMore, and Group tokens
+    in the proper order. The key pattern can include delimiting markers or punctuation,
+    as long as they are suppressed, thereby leaving the significant key text. The value
+    pattern can include named results, so that the Dict results can include named token
+    fields.
     """
-    return Dict( ZeroOrMore( Group ( key + value ) ) )
+    return Dict(ZeroOrMore(Group(key + value)))
+

 def originalTextFor(expr, asString=True):
     """Helper to return the original, untokenized text for a given expression.  Useful to
-       restore the parsed fields of an HTML start tag into the raw tag text itself, or to
-       revert separate tokens with intervening whitespace back to the original matching
-       input text. Simpler to use than the parse action keepOriginalText, and does not
-       require the inspect module to chase up the call stack.  By default, returns a
-       string containing the original parsed text.
-
-       If the optional asString argument is passed as False, then the return value is a
-       ParseResults containing any results names that were originally matched, and a
-       single token containing the original matched text from the input string. So if
-       the expression passed to originalTextFor contains expressions with defined
-       results names, you must set asString to False if you want to preserve those
-       results name values."""
-    locMarker = Empty().setParseAction(lambda s,loc,t: loc)
+    restore the parsed fields of an HTML start tag into the raw tag text itself, or to
+    revert separate tokens with intervening whitespace back to the original matching
+    input text. Simpler to use than the parse action keepOriginalText, and does not
+    require the inspect module to chase up the call stack. By default, returns a
+    string containing the original parsed text.
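A sketch of that default asString=True behaviour, using a hypothetical tag grammar built from helpers defined later in this module:

    from whoosh.support.pyparsing import SkipTo, makeHTMLTags, originalTextFor

    b_start, b_end = makeHTMLTags("b")
    fragment = originalTextFor(b_start + SkipTo(b_end) + b_end)

    print(fragment.parseString("<b>bold text</b>"))
    # ['<b>bold text</b>'] - the raw matched slice, not the tokenized pieces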
+ + If the optional asString argument is passed as False, then the return value is a + ParseResults containing any results names that were originally matched, and a + single token containing the original matched text from the input string. So if + the expression passed to originalTextFor contains expressions with defined + results names, you must set asString to False if you want to preserve those + results name values.""" + locMarker = Empty().setParseAction(lambda s, loc, t: loc) matchExpr = locMarker("_original_start") + expr + locMarker("_original_end") if asString: - extractText = lambda s,l,t: s[t._original_start:t._original_end] + extractText = lambda s, l, t: s[t._original_start : t._original_end] else: - def extractText(s,l,t): + + def extractText(s, l, t): del t[:] - t.insert(0, s[t._original_start:t._original_end]) + t.insert(0, s[t._original_start : t._original_end]) del t["_original_start"] del t["_original_end"] + matchExpr.setParseAction(extractText) return matchExpr - + + # convenience constants for positional expressions -empty = Empty().setName("empty") -lineStart = LineStart().setName("lineStart") -lineEnd = LineEnd().setName("lineEnd") +empty = Empty().setName("empty") +lineStart = LineStart().setName("lineStart") +lineEnd = LineEnd().setName("lineEnd") stringStart = StringStart().setName("stringStart") -stringEnd = StringEnd().setName("stringEnd") - -_escapedPunc = Word( _bslash, r"\[]-*.$+^?()~ ", exact=2 ).setParseAction(lambda s,l,t:t[0][1]) -_printables_less_backslash = "".join([ c for c in printables if c not in r"\]" ]) -_escapedHexChar = Combine( Suppress(_bslash + "0x") + Word(hexnums) ).setParseAction(lambda s,l,t:unichr(int(t[0],16))) -_escapedOctChar = Combine( Suppress(_bslash) + Word("0","01234567") ).setParseAction(lambda s,l,t:unichr(int(t[0],8))) -_singleChar = _escapedPunc | _escapedHexChar | _escapedOctChar | Word(_printables_less_backslash,exact=1) +stringEnd = StringEnd().setName("stringEnd") + +_escapedPunc = Word(_bslash, r"\[]-*.$+^?()~ ", exact=2).setParseAction( + lambda s, l, t: t[0][1] +) +_printables_less_backslash = "".join([c for c in printables if c not in r"\]"]) +_escapedHexChar = Combine(Suppress(_bslash + "0x") + Word(hexnums)).setParseAction( + lambda s, l, t: unichr(int(t[0], 16)) +) +_escapedOctChar = Combine(Suppress(_bslash) + Word("0", "01234567")).setParseAction( + lambda s, l, t: unichr(int(t[0], 8)) +) +_singleChar = ( + _escapedPunc + | _escapedHexChar + | _escapedOctChar + | Word(_printables_less_backslash, exact=1) +) _charRange = Group(_singleChar + Suppress("-") + _singleChar) -_reBracketExpr = Literal("[") + Optional("^").setResultsName("negate") + Group( OneOrMore( _charRange | _singleChar ) ).setResultsName("body") + "]" +_reBracketExpr = ( + Literal("[") + + Optional("^").setResultsName("negate") + + Group(OneOrMore(_charRange | _singleChar)).setResultsName("body") + + "]" +) + +_expanded = lambda p: ( + isinstance(p, ParseResults) + and "".join([unichr(c) for c in range(ord(p[0]), ord(p[1]) + 1)]) + or p +) -_expanded = lambda p: (isinstance(p,ParseResults) and ''.join([ unichr(c) for c in range(ord(p[0]),ord(p[1])+1) ]) or p) def srange(s): r"""Helper to easily define string ranges for use in Word construction. 
Borrows - syntax from regexp '[]' string range definitions:: - srange("[0-9]") -> "0123456789" - srange("[a-z]") -> "abcdefghijklmnopqrstuvwxyz" - srange("[a-z$_]") -> "abcdefghijklmnopqrstuvwxyz$_" - The input string must be enclosed in []'s, and the returned string is the expanded - character set joined into a single string. - The values enclosed in the []'s may be:: - a single character - an escaped character with a leading backslash (such as \- or \]) - an escaped hex character with a leading '\0x' (\0x21, which is a '!' character) - an escaped octal character with a leading '\0' (\041, which is a '!' character) - a range of any of the above, separated by a dash ('a-z', etc.) - any combination of the above ('aeiouy', 'a-zA-Z0-9_$', etc.) + syntax from regexp '[]' string range definitions:: + srange("[0-9]") -> "0123456789" + srange("[a-z]") -> "abcdefghijklmnopqrstuvwxyz" + srange("[a-z$_]") -> "abcdefghijklmnopqrstuvwxyz$_" + The input string must be enclosed in []'s, and the returned string is the expanded + character set joined into a single string. + The values enclosed in the []'s may be:: + a single character + an escaped character with a leading backslash (such as \- or \]) + an escaped hex character with a leading '\0x' (\0x21, which is a '!' character) + an escaped octal character with a leading '\0' (\041, which is a '!' character) + a range of any of the above, separated by a dash ('a-z', etc.) + any combination of the above ('aeiouy', 'a-zA-Z0-9_$', etc.) """ try: return "".join([_expanded(part) for part in _reBracketExpr.parseString(s).body]) except: return "" + def matchOnlyAtCol(n): """Helper method for defining parse actions that require matching at a specific - column in the input text. + column in the input text. """ - def verifyCol(strg,locn,toks): - if col(locn,strg) != n: - raise ParseException(strg,locn,"matched token not at column %d" % n) + + def verifyCol(strg, locn, toks): + if col(locn, strg) != n: + raise ParseException(strg, locn, "matched token not at column %d" % n) + return verifyCol + def replaceWith(replStr): """Helper method for common parse actions that simply return a literal value. Especially - useful when used with transformString(). + useful when used with transformString(). """ + def _replFunc(*args): return [replStr] + return _replFunc -def removeQuotes(s,l,t): + +def removeQuotes(s, l, t): """Helper parse action for removing quotation marks from parsed quoted strings. 
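A quick check of srange against the spec above (illustrative sketch, same vendored import path):

    from whoosh.support.pyparsing import Word, srange

    hexchars = srange("[0-9a-fA-F]")
    print(hexchars)  # '0123456789abcdefABCDEF'
    print(Word(hexchars).parseString("deadBEEF"))  # ['deadBEEF']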
- To use, add this parse action to quoted string using:: - quotedString.setParseAction( removeQuotes ) + To use, add this parse action to quoted string using:: + quotedString.setParseAction( removeQuotes ) """ return t[0][1:-1] -def upcaseTokens(s,l,t): + +def upcaseTokens(s, l, t): """Helper parse action to convert tokens to upper case.""" - return [ tt.upper() for tt in map(_ustr,t) ] + return [tt.upper() for tt in map(_ustr, t)] -def downcaseTokens(s,l,t): + +def downcaseTokens(s, l, t): """Helper parse action to convert tokens to lower case.""" - return [ tt.lower() for tt in map(_ustr,t) ] + return [tt.lower() for tt in map(_ustr, t)] + -def keepOriginalText(s,startLoc,t): +def keepOriginalText(s, startLoc, t): """Helper parse action to preserve original parsed text, - overriding any nested parse actions.""" + overriding any nested parse actions.""" try: endloc = getTokensEndLoc() except ParseException: - raise ParseFatalException("incorrect usage of keepOriginalText - may only be called as a parse action") + raise ParseFatalException( + "incorrect usage of keepOriginalText - may only be called as a parse action" + ) del t[:] t += ParseResults(s[startLoc:endloc]) return t + def getTokensEndLoc(): """Method to be called from within a parse action to determine the end - location of the parsed tokens.""" + location of the parsed tokens.""" import inspect + fstack = inspect.stack() try: # search up the stack (through intervening argument normalizers) for correct calling routine @@ -3385,268 +3892,368 @@ def getTokensEndLoc(): endloc = f[0].f_locals["loc"] return endloc else: - raise ParseFatalException("incorrect usage of getTokensEndLoc - may only be called from within a parse action") + raise ParseFatalException( + "incorrect usage of getTokensEndLoc - may only be called from within a parse action" + ) finally: del fstack + def _makeTags(tagStr, xml): """Internal helper to construct opening and closing tag expressions, given a tag name""" - if isinstance(tagStr,basestring): + if isinstance(tagStr, basestring): resname = tagStr tagStr = Keyword(tagStr, caseless=not xml) else: resname = tagStr.name - tagAttrName = Word(alphas,alphanums+"_-:") - if (xml): - tagAttrValue = dblQuotedString.copy().setParseAction( removeQuotes ) - openTag = Suppress("<") + tagStr + \ - Dict(ZeroOrMore(Group( tagAttrName + Suppress("=") + tagAttrValue ))) + \ - Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">") + tagAttrName = Word(alphas, alphanums + "_-:") + if xml: + tagAttrValue = dblQuotedString.copy().setParseAction(removeQuotes) + openTag = ( + Suppress("<") + + tagStr + + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue))) + + Optional("/", default=[False]) + .setResultsName("empty") + .setParseAction(lambda s, l, t: t[0] == "/") + + Suppress(">") + ) else: - printablesLessRAbrack = "".join( [ c for c in printables if c not in ">" ] ) - tagAttrValue = quotedString.copy().setParseAction( removeQuotes ) | Word(printablesLessRAbrack) - openTag = Suppress("<") + tagStr + \ - Dict(ZeroOrMore(Group( tagAttrName.setParseAction(downcaseTokens) + \ - Optional( Suppress("=") + tagAttrValue ) ))) + \ - Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">") + printablesLessRAbrack = "".join([c for c in printables if c not in ">"]) + tagAttrValue = quotedString.copy().setParseAction(removeQuotes) | Word( + printablesLessRAbrack + ) + openTag = ( + Suppress("<") + + tagStr + + Dict( + 
ZeroOrMore(
+                Group(
+                    tagAttrName.setParseAction(downcaseTokens)
+                    + Optional(Suppress("=") + tagAttrValue)
+                )
+            )
+        )
+        + Optional("/", default=[False])
+        .setResultsName("empty")
+        .setParseAction(lambda s, l, t: t[0] == "/")
+        + Suppress(">")
+    )
     closeTag = Combine(_L("</") + tagStr + ">")

-    openTag = openTag.setResultsName("start"+"".join(resname.replace(":"," ").title().split())).setName("<%s>" % tagStr)
-    closeTag = closeTag.setResultsName("end"+"".join(resname.replace(":"," ").title().split())).setName("</%s>" % tagStr)
+    openTag = openTag.setResultsName(
+        "start" + "".join(resname.replace(":", " ").title().split())
+    ).setName(f"<{tagStr}>")
+    closeTag = closeTag.setResultsName(
+        "end" + "".join(resname.replace(":", " ").title().split())
+    ).setName(f"</{tagStr}>")
     return openTag, closeTag

+
 def makeHTMLTags(tagStr):
     """Helper to construct opening and closing tag expressions for HTML, given a tag name"""
-    return _makeTags( tagStr, False )
+    return _makeTags(tagStr, False)

+
 def makeXMLTags(tagStr):
     """Helper to construct opening and closing tag expressions for XML, given a tag name"""
-    return _makeTags( tagStr, True )
+    return _makeTags(tagStr, True)

-def withAttribute(*args,**attrDict):
+
+def withAttribute(*args, **attrDict):
     """Helper to create a validating parse action to be used with start tags created
-       with makeXMLTags or makeHTMLTags. Use withAttribute to qualify a starting tag
-       with a required attribute value, to avoid false matches on common tags such as
-       <TD> or <DIV>.
-
-       Call withAttribute with a series of attribute names and values. Specify the list
-       of filter attributes names and values as:
-        - keyword arguments, as in (class="Customer",align="right"), or
-        - a list of name-value tuples, as in ( ("ns1:class", "Customer"), ("ns2:align","right") )
-       For attribute names with a namespace prefix, you must use the second form. Attribute
-       names are matched insensitive to upper/lower case.
-
-       To verify that the attribute exists, but without specifying a value, pass
-       withAttribute.ANY_VALUE as the value.
-       """
+    with makeXMLTags or makeHTMLTags. Use withAttribute to qualify a starting tag
+    with a required attribute value, to avoid false matches on common tags such as
+    <TD> or <DIV>
. + + Call withAttribute with a series of attribute names and values. Specify the list + of filter attributes names and values as: + - keyword arguments, as in (class="Customer",align="right"), or + - a list of name-value tuples, as in ( ("ns1:class", "Customer"), ("ns2:align","right") ) + For attribute names with a namespace prefix, you must use the second form. Attribute + names are matched insensitive to upper/lower case. + + To verify that the attribute exists, but without specifying a value, pass + withAttribute.ANY_VALUE as the value. + """ if args: attrs = args[:] else: attrs = attrDict.items() - attrs = [(k,v) for k,v in attrs] - def pa(s,l,tokens): - for attrName,attrValue in attrs: + attrs = [(k, v) for k, v in attrs] + + def pa(s, l, tokens): + for attrName, attrValue in attrs: if attrName not in tokens: - raise ParseException(s,l,"no matching attribute " + attrName) + raise ParseException(s, l, "no matching attribute " + attrName) if attrValue != withAttribute.ANY_VALUE and tokens[attrName] != attrValue: - raise ParseException(s,l,"attribute '%s' has value '%s', must be '%s'" % - (attrName, tokens[attrName], attrValue)) + raise ParseException( + s, + l, + "attribute '%s' has value '%s', must be '%s'" + % (attrName, tokens[attrName], attrValue), + ) + return pa + + withAttribute.ANY_VALUE = object() opAssoc = _Constants() opAssoc.LEFT = object() opAssoc.RIGHT = object() -def operatorPrecedence( baseExpr, opList ): + +def operatorPrecedence(baseExpr, opList): """Helper method for constructing grammars of expressions made up of - operators working in a precedence hierarchy. Operators may be unary or - binary, left- or right-associative. Parse actions can also be attached - to operator expressions. - - Parameters: - - baseExpr - expression representing the most basic element for the nested - - opList - list of tuples, one for each operator precedence level in the - expression grammar; each tuple is of the form - (opExpr, numTerms, rightLeftAssoc, parseAction), where: - - opExpr is the pyparsing expression for the operator; - may also be a string, which will be converted to a Literal; - if numTerms is 3, opExpr is a tuple of two expressions, for the - two operators separating the 3 terms - - numTerms is the number of terms for this operator (must - be 1, 2, or 3) - - rightLeftAssoc is the indicator whether the operator is - right or left associative, using the pyparsing-defined - constants opAssoc.RIGHT and opAssoc.LEFT. - - parseAction is the parse action to be associated with - expressions matching this operator expression (the - parse action tuple member may be omitted) + operators working in a precedence hierarchy. Operators may be unary or + binary, left- or right-associative. Parse actions can also be attached + to operator expressions. + + Parameters: + - baseExpr - expression representing the most basic element for the nested + - opList - list of tuples, one for each operator precedence level in the + expression grammar; each tuple is of the form + (opExpr, numTerms, rightLeftAssoc, parseAction), where: + - opExpr is the pyparsing expression for the operator; + may also be a string, which will be converted to a Literal; + if numTerms is 3, opExpr is a tuple of two expressions, for the + two operators separating the 3 terms + - numTerms is the number of terms for this operator (must + be 1, 2, or 3) + - rightLeftAssoc is the indicator whether the operator is + right or left associative, using the pyparsing-defined + constants opAssoc.RIGHT and opAssoc.LEFT. 
+ - parseAction is the parse action to be associated with + expressions matching this operator expression (the + parse action tuple member may be omitted) """ ret = Forward() - lastExpr = baseExpr | ( Suppress('(') + ret + Suppress(')') ) - for i,operDef in enumerate(opList): - opExpr,arity,rightLeftAssoc,pa = (operDef + (None,))[:4] + lastExpr = baseExpr | (Suppress("(") + ret + Suppress(")")) + for i, operDef in enumerate(opList): + opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4] if arity == 3: if opExpr is None or len(opExpr) != 2: - raise ValueError("if numterms=3, opExpr must be a tuple or list of two expressions") + raise ValueError( + "if numterms=3, opExpr must be a tuple or list of two expressions" + ) opExpr1, opExpr2 = opExpr - thisExpr = Forward()#.setName("expr%d" % i) + thisExpr = Forward() # .setName("expr%d" % i) if rightLeftAssoc == opAssoc.LEFT: if arity == 1: - matchExpr = FollowedBy(lastExpr + opExpr) + Group( lastExpr + OneOrMore( opExpr ) ) + matchExpr = FollowedBy(lastExpr + opExpr) + Group( + lastExpr + OneOrMore(opExpr) + ) elif arity == 2: if opExpr is not None: - matchExpr = FollowedBy(lastExpr + opExpr + lastExpr) + Group( lastExpr + OneOrMore( opExpr + lastExpr ) ) + matchExpr = FollowedBy(lastExpr + opExpr + lastExpr) + Group( + lastExpr + OneOrMore(opExpr + lastExpr) + ) else: - matchExpr = FollowedBy(lastExpr+lastExpr) + Group( lastExpr + OneOrMore(lastExpr) ) + matchExpr = FollowedBy(lastExpr + lastExpr) + Group( + lastExpr + OneOrMore(lastExpr) + ) elif arity == 3: - matchExpr = FollowedBy(lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr) + \ - Group( lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr ) + matchExpr = FollowedBy( + lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr + ) + Group(lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr) else: - raise ValueError("operator must be unary (1), binary (2), or ternary (3)") + raise ValueError( + "operator must be unary (1), binary (2), or ternary (3)" + ) elif rightLeftAssoc == opAssoc.RIGHT: if arity == 1: # try to avoid LR with this extra test if not isinstance(opExpr, Optional): opExpr = Optional(opExpr) - matchExpr = FollowedBy(opExpr.expr + thisExpr) + Group( opExpr + thisExpr ) + matchExpr = FollowedBy(opExpr.expr + thisExpr) + Group( + opExpr + thisExpr + ) elif arity == 2: if opExpr is not None: - matchExpr = FollowedBy(lastExpr + opExpr + thisExpr) + Group( lastExpr + OneOrMore( opExpr + thisExpr ) ) + matchExpr = FollowedBy(lastExpr + opExpr + thisExpr) + Group( + lastExpr + OneOrMore(opExpr + thisExpr) + ) else: - matchExpr = FollowedBy(lastExpr + thisExpr) + Group( lastExpr + OneOrMore( thisExpr ) ) + matchExpr = FollowedBy(lastExpr + thisExpr) + Group( + lastExpr + OneOrMore(thisExpr) + ) elif arity == 3: - matchExpr = FollowedBy(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr) + \ - Group( lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr ) + matchExpr = FollowedBy( + lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr + ) + Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr) else: - raise ValueError("operator must be unary (1), binary (2), or ternary (3)") + raise ValueError( + "operator must be unary (1), binary (2), or ternary (3)" + ) else: raise ValueError("operator must indicate right or left associativity") if pa: - matchExpr.setParseAction( pa ) - thisExpr << ( matchExpr | lastExpr ) + matchExpr.setParseAction(pa) + thisExpr << (matchExpr | lastExpr) lastExpr = thisExpr ret << lastExpr return ret -dblQuotedString = 
Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*"').setName("string enclosed in double quotes") -sglQuotedString = Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*'").setName("string enclosed in single quotes") -quotedString = Regex(r'''(?:"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*")|(?:'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*')''').setName("quotedString using single or double quotes") -unicodeString = Combine(_L('u') + quotedString.copy()) + +dblQuotedString = Regex( + r'"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*"' +).setName("string enclosed in double quotes") +sglQuotedString = Regex( + r"'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*'" +).setName("string enclosed in single quotes") +quotedString = Regex( + r"""(?:"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*")|(?:'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*')""" +).setName("quotedString using single or double quotes") +unicodeString = Combine(_L("u") + quotedString.copy()) + def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString): """Helper method for defining nested lists enclosed in opening and closing - delimiters ("(" and ")" are the default). - - Parameters: - - opener - opening character for a nested list (default="("); can also be a pyparsing expression - - closer - closing character for a nested list (default=")"); can also be a pyparsing expression - - content - expression for items within the nested lists (default=None) - - ignoreExpr - expression for ignoring opening and closing delimiters (default=quotedString) - - If an expression is not provided for the content argument, the nested - expression will capture all whitespace-delimited content between delimiters - as a list of separate values. - - Use the ignoreExpr argument to define expressions that may contain - opening or closing characters that should not be treated as opening - or closing characters for nesting, such as quotedString or a comment - expression. Specify multiple expressions using an Or or MatchFirst. - The default is quotedString, but if no expressions are to be ignored, - then pass None for this argument. + delimiters ("(" and ")" are the default). + + Parameters: + - opener - opening character for a nested list (default="("); can also be a pyparsing expression + - closer - closing character for a nested list (default=")"); can also be a pyparsing expression + - content - expression for items within the nested lists (default=None) + - ignoreExpr - expression for ignoring opening and closing delimiters (default=quotedString) + + If an expression is not provided for the content argument, the nested + expression will capture all whitespace-delimited content between delimiters + as a list of separate values. + + Use the ignoreExpr argument to define expressions that may contain + opening or closing characters that should not be treated as opening + or closing characters for nesting, such as quotedString or a comment + expression. Specify multiple expressions using an Or or MatchFirst. + The default is quotedString, but if no expressions are to be ignored, + then pass None for this argument. 
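A sketch of nestedExpr with the defaults described above (same vendored import path as the earlier sketches):

    from whoosh.support.pyparsing import nestedExpr

    parens = nestedExpr()  # opener="(", closer=")", quotedString ignored by default
    result = parens.parseString('(a (b c) "(not nested)")')
    print(result.asList())
    # [['a', ['b', 'c'], '"(not nested)"']] - quoted parens do not open a new level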
""" if opener == closer: raise ValueError("opening and closing strings cannot be the same") if content is None: - if isinstance(opener,basestring) and isinstance(closer,basestring): - if len(opener) == 1 and len(closer)==1: + if isinstance(opener, basestring) and isinstance(closer, basestring): + if len(opener) == 1 and len(closer) == 1: if ignoreExpr is not None: - content = (Combine(OneOrMore(~ignoreExpr + - CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS,exact=1)) - ).setParseAction(lambda t:t[0].strip())) + content = Combine( + OneOrMore( + ~ignoreExpr + + CharsNotIn( + opener + closer + ParserElement.DEFAULT_WHITE_CHARS, + exact=1, + ) + ) + ).setParseAction(lambda t: t[0].strip()) else: - content = (empty+CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS - ).setParseAction(lambda t:t[0].strip())) + content = empty + CharsNotIn( + opener + closer + ParserElement.DEFAULT_WHITE_CHARS + ).setParseAction(lambda t: t[0].strip()) else: if ignoreExpr is not None: - content = (Combine(OneOrMore(~ignoreExpr + - ~Literal(opener) + ~Literal(closer) + - CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1)) - ).setParseAction(lambda t:t[0].strip())) + content = Combine( + OneOrMore( + ~ignoreExpr + + ~Literal(opener) + + ~Literal(closer) + + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1) + ) + ).setParseAction(lambda t: t[0].strip()) else: - content = (Combine(OneOrMore(~Literal(opener) + ~Literal(closer) + - CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1)) - ).setParseAction(lambda t:t[0].strip())) + content = Combine( + OneOrMore( + ~Literal(opener) + + ~Literal(closer) + + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1) + ) + ).setParseAction(lambda t: t[0].strip()) else: - raise ValueError("opening and closing arguments must be strings if no content expression is given") + raise ValueError( + "opening and closing arguments must be strings if no content expression is given" + ) ret = Forward() if ignoreExpr is not None: - ret << Group( Suppress(opener) + ZeroOrMore( ignoreExpr | ret | content ) + Suppress(closer) ) + ret << Group( + Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer) + ) else: - ret << Group( Suppress(opener) + ZeroOrMore( ret | content ) + Suppress(closer) ) + ret << Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer)) return ret + def indentedBlock(blockStatementExpr, indentStack, indent=True): """Helper method for defining space-delimited indentation blocks, such as - those used to define block statements in Python source code. - - Parameters: - - blockStatementExpr - expression defining syntax of statement that - is repeated within the indented block - - indentStack - list created by caller to manage indentation stack - (multiple statementWithIndentedBlock expressions within a single grammar - should share a common indentStack) - - indent - boolean indicating whether block must be indented beyond the - the current level; set to False for block of left-most statements - (default=True) - - A valid block must contain at least one blockStatement. + those used to define block statements in Python source code. 
+ + Parameters: + - blockStatementExpr - expression defining syntax of statement that + is repeated within the indented block + - indentStack - list created by caller to manage indentation stack + (multiple statementWithIndentedBlock expressions within a single grammar + should share a common indentStack) + - indent - boolean indicating whether block must be indented beyond the + the current level; set to False for block of left-most statements + (default=True) + + A valid block must contain at least one blockStatement. """ - def checkPeerIndent(s,l,t): - if l >= len(s): return - curCol = col(l,s) + + def checkPeerIndent(s, l, t): + if l >= len(s): + return + curCol = col(l, s) if curCol != indentStack[-1]: if curCol > indentStack[-1]: - raise ParseFatalException(s,l,"illegal nesting") - raise ParseException(s,l,"not a peer entry") + raise ParseFatalException(s, l, "illegal nesting") + raise ParseException(s, l, "not a peer entry") - def checkSubIndent(s,l,t): - curCol = col(l,s) + def checkSubIndent(s, l, t): + curCol = col(l, s) if curCol > indentStack[-1]: - indentStack.append( curCol ) + indentStack.append(curCol) else: - raise ParseException(s,l,"not a subentry") - - def checkUnindent(s,l,t): - if l >= len(s): return - curCol = col(l,s) - if not(indentStack and curCol < indentStack[-1] and curCol <= indentStack[-2]): - raise ParseException(s,l,"not an unindent") + raise ParseException(s, l, "not a subentry") + + def checkUnindent(s, l, t): + if l >= len(s): + return + curCol = col(l, s) + if not (indentStack and curCol < indentStack[-1] and curCol <= indentStack[-2]): + raise ParseException(s, l, "not an unindent") indentStack.pop() NL = OneOrMore(LineEnd().setWhitespaceChars("\t ").suppress()) INDENT = Empty() + Empty().setParseAction(checkSubIndent) - PEER = Empty().setParseAction(checkPeerIndent) + PEER = Empty().setParseAction(checkPeerIndent) UNDENT = Empty().setParseAction(checkUnindent) if indent: - smExpr = Group( Optional(NL) + - FollowedBy(blockStatementExpr) + - INDENT + (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) + UNDENT) + smExpr = Group( + Optional(NL) + + FollowedBy(blockStatementExpr) + + INDENT + + (OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL))) + + UNDENT + ) else: - smExpr = Group( Optional(NL) + - (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) ) + smExpr = Group( + Optional(NL) + (OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL))) + ) blockStatementExpr.ignore(_bslash + LineEnd()) return smExpr + alphas8bit = srange(r"[\0xc0-\0xd6\0xd8-\0xf6\0xf8-\0xff]") punc8bit = srange(r"[\0xa1-\0xbf\0xd7\0xf7]") -anyOpenTag,anyCloseTag = makeHTMLTags(Word(alphas,alphanums+"_:")) -commonHTMLEntity = Combine(_L("&") + oneOf("gt lt amp nbsp quot").setResultsName("entity") +";").streamline() -_htmlEntityMap = dict(zip("gt lt amp nbsp quot".split(),'><& "')) -replaceHTMLEntity = lambda t : t.entity in _htmlEntityMap and _htmlEntityMap[t.entity] or None +anyOpenTag, anyCloseTag = makeHTMLTags(Word(alphas, alphanums + "_:")) +commonHTMLEntity = Combine( + _L("&") + oneOf("gt lt amp nbsp quot").setResultsName("entity") + ";" +).streamline() +_htmlEntityMap = dict(zip("gt lt amp nbsp quot".split(), '><& "')) +replaceHTMLEntity = ( + lambda t: t.entity in _htmlEntityMap and _htmlEntityMap[t.entity] or None +) # it's easy to get these comment structures wrong - they're very common, so may as well make them available cStyleComment = Regex(r"/\*(?:[^*]*\*+)+?/").setName("C style comment") @@ -3654,56 +4261,66 @@ def 
checkUnindent(s,l,t):
 htmlComment = Regex(r"<!--[\s\S]*?-->")
 restOfLine = Regex(r".*").leaveWhitespace()
 dblSlashComment = Regex(r"\/\/(\\\n|.)*").setName("// comment")
-cppStyleComment = Regex(r"/(?:\*(?:[^*]*\*+)+?/|/[^\n]*(?:\n[^\n]*)*?(?:(?<!\\)|\Z))").setName("C++ style comment")
-            print (teststring + "->" + str(tokenlist))
-            print ("tokens = " + str(tokens))
-            print ("tokens.columns = " + str(tokens.columns))
-            print ("tokens.tables = " + str(tokens.tables))
-            print (tokens.asXML("SQL",True))
+            print(teststring + "->" + str(tokenlist))
+            print("tokens = " + str(tokens))
+            print("tokens.columns = " + str(tokens.columns))
+            print("tokens.tables = " + str(tokens.tables))
+            print(tokens.asXML("SQL", True))
         except ParseBaseException as err:
-            print (teststring + "->")
-            print (err.line)
-            print (" "*(err.column-1) + "^")
-            print (err)
+            print(teststring + "->")
+            print(err.line)
+            print(" " * (err.column - 1) + "^")
+            print(err)
         print()

-    selectToken = CaselessLiteral( "select" )
-    fromToken = CaselessLiteral( "from" )
-
-    ident = Word( alphas, alphanums + "_$" )
-    columnName = delimitedList( ident, ".", combine=True ).setParseAction( upcaseTokens )
-    columnNameList = Group( delimitedList( columnName ) )#.setName("columns")
-    tableName = delimitedList( ident, ".", combine=True ).setParseAction( upcaseTokens )
-    tableNameList = Group( delimitedList( tableName ) )#.setName("tables")
-    simpleSQL = ( selectToken + \
-                  ( '*' | columnNameList ).setResultsName( "columns" ) + \
-                  fromToken + \
-                  tableNameList.setResultsName( "tables" ) )
-
-    test( "SELECT * from XYZZY, ABC" )
-    test( "select * from SYS.XYZZY" )
-    test( "Select A from Sys.dual" )
-    test( "Select AA,BB,CC from Sys.dual" )
-    test( "Select A, B, C from Sys.dual" )
-    test( "Select A, B, C from Sys.dual" )
-    test( "Xelect A, B, C from Sys.dual" )
-    test( "Select A, B, C frox Sys.dual" )
-    test( "Select" )
-    test( "Select ^^^ frox Sys.dual" )
-    test( "Select A, B, C from Sys.dual, Table2 " )
+    selectToken = CaselessLiteral("select")
+    fromToken = CaselessLiteral("from")
+
+    ident = Word(alphas, alphanums + "_$")
+    columnName = delimitedList(ident, ".", combine=True).setParseAction(upcaseTokens)
+    columnNameList = Group(delimitedList(columnName))  # .setName("columns")
+    tableName = delimitedList(ident, ".", combine=True).setParseAction(upcaseTokens)
+    tableNameList = Group(delimitedList(tableName))  # .setName("tables")
+    simpleSQL = (
+        selectToken
+        + ("*" | columnNameList).setResultsName("columns")
+        + fromToken
+        + tableNameList.setResultsName("tables")
+    )
+
+    test("SELECT * from XYZZY, ABC")
+    test("select * from SYS.XYZZY")
+    test("Select A from Sys.dual")
+    test("Select AA,BB,CC from Sys.dual")
+    test("Select A, B, C from Sys.dual")
+    test("Select A, B, C from Sys.dual")
+    test("Xelect A, B, C from Sys.dual")
+    test("Select A, B, C frox Sys.dual")
+    test("Select")
+    test("Select ^^^ frox Sys.dual")
+    test("Select A, B, C from Sys.dual, Table2 ")
diff --git a/src/whoosh/support/relativedelta.py b/src/whoosh/support/relativedelta.py
index 23ca7ee6..5dfa8f03 100644
--- a/src/whoosh/support/relativedelta.py
+++ b/src/whoosh/support/relativedelta.py
@@ -7,13 +7,13 @@
 __author__ = "Gustavo Niemeyer <gustavo@niemeyer.net>"
 __license__ = "PSF License"

-import datetime
 import calendar
+import datetime

 __all__ = ["relativedelta", "MO", "TU", "WE", "TH", "FR", "SA", "SU"]


-class weekday(object):
+class weekday:
     __slots__ = ["weekday", "n"]

     def __init__(self, weekday, n=None):
@@ -41,82 +41,100 @@ def __repr__(self):
         else:
             return "%s(%+d)" % (s, self.n)

+
 MO, TU, WE, TH, FR, SA, SU = weekdays = tuple([weekday(x) for x in range(7)])


 class
relativedelta: """ -The relativedelta type is based on the specification of the excellent -work done by M.-A. Lemburg in his mx.DateTime extension. However, -notice that this type does *NOT* implement the same algorithm as -his work. Do *NOT* expect it to behave like mx.DateTime's counterpart. + The relativedelta type is based on the specification of the excellent + work done by M.-A. Lemburg in his mx.DateTime extension. However, + notice that this type does *NOT* implement the same algorithm as + his work. Do *NOT* expect it to behave like mx.DateTime's counterpart. -There's two different ways to build a relativedelta instance. The -first one is passing it two date/datetime classes: + There's two different ways to build a relativedelta instance. The + first one is passing it two date/datetime classes: - relativedelta(datetime1, datetime2) + relativedelta(datetime1, datetime2) -And the other way is to use the following keyword arguments: + And the other way is to use the following keyword arguments: - year, month, day, hour, minute, second, microsecond: - Absolute information. + year, month, day, hour, minute, second, microsecond: + Absolute information. - years, months, weeks, days, hours, minutes, seconds, microseconds: - Relative information, may be negative. + years, months, weeks, days, hours, minutes, seconds, microseconds: + Relative information, may be negative. - weekday: - One of the weekday instances (MO, TU, etc). These instances may - receive a parameter N, specifying the Nth weekday, which could - be positive or negative (like MO(+1) or MO(-2). Not specifying - it is the same as specifying +1. You can also use an integer, - where 0=MO. + weekday: + One of the weekday instances (MO, TU, etc). These instances may + receive a parameter N, specifying the Nth weekday, which could + be positive or negative (like MO(+1) or MO(-2). Not specifying + it is the same as specifying +1. You can also use an integer, + where 0=MO. - leapdays: - Will add given days to the date found, if year is a leap - year, and the date found is post 28 of february. + leapdays: + Will add given days to the date found, if year is a leap + year, and the date found is post 28 of february. - yearday, nlyearday: - Set the yearday or the non-leap year day (jump leap days). - These are converted to day/month/leapdays information. + yearday, nlyearday: + Set the yearday or the non-leap year day (jump leap days). + These are converted to day/month/leapdays information. -Here is the behavior of operations with relativedelta: + Here is the behavior of operations with relativedelta: -1) Calculate the absolute year, using the 'year' argument, or the - original datetime year, if the argument is not present. + 1) Calculate the absolute year, using the 'year' argument, or the + original datetime year, if the argument is not present. -2) Add the relative 'years' argument to the absolute year. + 2) Add the relative 'years' argument to the absolute year. -3) Do steps 1 and 2 for month/months. + 3) Do steps 1 and 2 for month/months. -4) Calculate the absolute day, using the 'day' argument, or the - original datetime day, if the argument is not present. Then, - subtract from the day until it fits in the year and month - found after their operations. + 4) Calculate the absolute day, using the 'day' argument, or the + original datetime day, if the argument is not present. Then, + subtract from the day until it fits in the year and month + found after their operations. -5) Add the relative 'days' argument to the absolute day. 
Notice - that the 'weeks' argument is multiplied by 7 and added to - 'days'. + 5) Add the relative 'days' argument to the absolute day. Notice + that the 'weeks' argument is multiplied by 7 and added to + 'days'. -6) Do steps 1 and 2 for hour/hours, minute/minutes, second/seconds, - microsecond/microseconds. + 6) Do steps 1 and 2 for hour/hours, minute/minutes, second/seconds, + microsecond/microseconds. -7) If the 'weekday' argument is present, calculate the weekday, - with the given (wday, nth) tuple. wday is the index of the - weekday (0-6, 0=Mon), and nth is the number of weeks to add - forward or backward, depending on its signal. Notice that if - the calculated date is already Monday, for example, using - (0, 1) or (0, -1) won't change the day. + 7) If the 'weekday' argument is present, calculate the weekday, + with the given (wday, nth) tuple. wday is the index of the + weekday (0-6, 0=Mon), and nth is the number of weeks to add + forward or backward, depending on its signal. Notice that if + the calculated date is already Monday, for example, using + (0, 1) or (0, -1) won't change the day. """ - def __init__(self, dt1=None, dt2=None, - years=0, months=0, days=0, leapdays=0, weeks=0, - hours=0, minutes=0, seconds=0, microseconds=0, - year=None, month=None, day=None, weekday=None, - yearday=None, nlyearday=None, - hour=None, minute=None, second=None, microsecond=None): + def __init__( + self, + dt1=None, + dt2=None, + years=0, + months=0, + days=0, + leapdays=0, + weeks=0, + hours=0, + minutes=0, + seconds=0, + microseconds=0, + year=None, + month=None, + day=None, + weekday=None, + yearday=None, + nlyearday=None, + hour=None, + minute=None, + second=None, + microsecond=None, + ): if dt1 and dt2: - if not isinstance(dt1, datetime.date) or \ - not isinstance(dt2, datetime.date): + if not isinstance(dt1, datetime.date) or not isinstance(dt2, datetime.date): raise TypeError("relativedelta only diffs datetime/date") if type(dt1) is not type(dt2): if not isinstance(dt1, datetime.datetime): @@ -187,8 +205,7 @@ def __init__(self, dt1=None, dt2=None, if yearday > 59: self.leapdays = -1 if yday: - ydayidx = [31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, - 366] + ydayidx = [31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 366] for idx, ydays in enumerate(ydayidx): if yday <= ydays: self.month = idx + 1 @@ -228,9 +245,16 @@ def _fix(self): div, mod = divmod(self.months * s, 12) self.months = mod * s self.years += div * s - if (self.hours or self.minutes or self.seconds or self.microseconds or - self.hour is not None or self.minute is not None or - self.second is not None or self.microsecond is not None): + if ( + self.hours + or self.minutes + or self.seconds + or self.microseconds + or self.hour is not None + or self.minute is not None + or self.second is not None + or self.microsecond is not None + ): self._has_time = 1 else: self._has_time = 0 @@ -261,8 +285,7 @@ def __radd__(self, other): elif month < 1: year -= 1 month += 12 - day = min(calendar.monthrange(year, month)[1], - self.day or other.day) + day = min(calendar.monthrange(year, month)[1], self.day or other.day) repl = {"year": year, "month": month, "day": day} for attr in ["hour", "minute", "second", "microsecond"]: value = getattr(self, attr) @@ -271,12 +294,13 @@ def __radd__(self, other): days = self.days if self.leapdays and month > 2 and calendar.isleap(year): days += self.leapdays - ret = (other.replace(**repl) - + datetime.timedelta(days=days, - hours=self.hours, - minutes=self.minutes, - seconds=self.seconds, - 
microseconds=self.microseconds))
+        ret = other.replace(**repl) + datetime.timedelta(
+            days=days,
+            hours=self.hours,
+            minutes=self.minutes,
+            seconds=self.seconds,
+            microseconds=self.microseconds,
+        )
         if self.weekday:
             weekday, nth = self.weekday.weekday, self.weekday.n or 1
             jumpdays = (abs(nth) - 1) * 7
@@ -294,99 +318,109 @@ def __rsub__(self, other):

     def __add__(self, other):
         if not isinstance(other, relativedelta):
             raise TypeError("unsupported type for add operation")
-        return relativedelta(years=other.years + self.years,
-                             months=other.months + self.months,
-                             days=other.days + self.days,
-                             hours=other.hours + self.hours,
-                             minutes=other.minutes + self.minutes,
-                             seconds=other.seconds + self.seconds,
-                             microseconds=other.microseconds + self.microseconds,
-                             leapdays=other.leapdays or self.leapdays,
-                             year=other.year or self.year,
-                             month=other.month or self.month,
-                             day=other.day or self.day,
-                             weekday=other.weekday or self.weekday,
-                             hour=other.hour or self.hour,
-                             minute=other.minute or self.minute,
-                             second=other.second or self.second,
-                             microsecond=other.second or self.microsecond)
+        return relativedelta(
+            years=other.years + self.years,
+            months=other.months + self.months,
+            days=other.days + self.days,
+            hours=other.hours + self.hours,
+            minutes=other.minutes + self.minutes,
+            seconds=other.seconds + self.seconds,
+            microseconds=other.microseconds + self.microseconds,
+            leapdays=other.leapdays or self.leapdays,
+            year=other.year or self.year,
+            month=other.month or self.month,
+            day=other.day or self.day,
+            weekday=other.weekday or self.weekday,
+            hour=other.hour or self.hour,
+            minute=other.minute or self.minute,
+            second=other.second or self.second,
+            microsecond=other.microsecond or self.microsecond,
+        )

     def __sub__(self, other):
         if not isinstance(other, relativedelta):
             raise TypeError("unsupported type for sub operation")
-        return relativedelta(years=other.years - self.years,
-                             months=other.months - self.months,
-                             days=other.days - self.days,
-                             hours=other.hours - self.hours,
-                             minutes=other.minutes - self.minutes,
-                             seconds=other.seconds - self.seconds,
-                             microseconds=other.microseconds - self.microseconds,
-                             leapdays=other.leapdays or self.leapdays,
-                             year=other.year or self.year,
-                             month=other.month or self.month,
-                             day=other.day or self.day,
-                             weekday=other.weekday or self.weekday,
-                             hour=other.hour or self.hour,
-                             minute=other.minute or self.minute,
-                             second=other.second or self.second,
-                             microsecond=other.second or self.microsecond)
+        return relativedelta(
+            years=other.years - self.years,
+            months=other.months - self.months,
+            days=other.days - self.days,
+            hours=other.hours - self.hours,
+            minutes=other.minutes - self.minutes,
+            seconds=other.seconds - self.seconds,
+            microseconds=other.microseconds - self.microseconds,
+            leapdays=other.leapdays or self.leapdays,
+            year=other.year or self.year,
+            month=other.month or self.month,
+            day=other.day or self.day,
+            weekday=other.weekday or self.weekday,
+            hour=other.hour or self.hour,
+            minute=other.minute or self.minute,
+            second=other.second or self.second,
+            microsecond=other.microsecond or self.microsecond,
+        )

     def __neg__(self):
-        return relativedelta(years= -self.years,
-                             months= -self.months,
-                             days= -self.days,
-                             hours= -self.hours,
-                             minutes= -self.minutes,
-                             seconds= -self.seconds,
-                             microseconds= -self.microseconds,
-                             leapdays=self.leapdays,
-                             year=self.year,
-                             month=self.month,
-                             day=self.day,
-                             weekday=self.weekday,
-                             hour=self.hour,
-                             minute=self.minute,
-                             second=self.second,
-                             microsecond=self.microsecond)
+        return
relativedelta( + years=-self.years, + months=-self.months, + days=-self.days, + hours=-self.hours, + minutes=-self.minutes, + seconds=-self.seconds, + microseconds=-self.microseconds, + leapdays=self.leapdays, + year=self.year, + month=self.month, + day=self.day, + weekday=self.weekday, + hour=self.hour, + minute=self.minute, + second=self.second, + microsecond=self.microsecond, + ) def __nonzero__(self): - return not (not self.years and - not self.months and - not self.days and - not self.hours and - not self.minutes and - not self.seconds and - not self.microseconds and - not self.leapdays and - self.year is None and - self.month is None and - self.day is None and - self.weekday is None and - self.hour is None and - self.minute is None and - self.second is None and - self.microsecond is None) + return not ( + not self.years + and not self.months + and not self.days + and not self.hours + and not self.minutes + and not self.seconds + and not self.microseconds + and not self.leapdays + and self.year is None + and self.month is None + and self.day is None + and self.weekday is None + and self.hour is None + and self.minute is None + and self.second is None + and self.microsecond is None + ) __bool__ = __nonzero__ def __mul__(self, other): f = float(other) - return relativedelta(years=self.years * f, - months=self.months * f, - days=self.days * f, - hours=self.hours * f, - minutes=self.minutes * f, - seconds=self.seconds * f, - microseconds=self.microseconds * f, - leapdays=self.leapdays, - year=self.year, - month=self.month, - day=self.day, - weekday=self.weekday, - hour=self.hour, - minute=self.minute, - second=self.second, - microsecond=self.microsecond) + return relativedelta( + years=self.years * f, + months=self.months * f, + days=self.days * f, + hours=self.hours * f, + minutes=self.minutes * f, + seconds=self.seconds * f, + microseconds=self.microseconds * f, + leapdays=self.leapdays, + year=self.year, + month=self.month, + day=self.day, + weekday=self.weekday, + hour=self.hour, + minute=self.minute, + second=self.second, + microsecond=self.microsecond, + ) def __eq__(self, other): if not isinstance(other, relativedelta): @@ -399,20 +433,22 @@ def __eq__(self, other): n1, n2 = self.weekday.n, other.weekday.n if n1 != n2 and not ((not n1 or n1 == 1) and (not n2 or n2 == 1)): return False - return (self.years == other.years and - self.months == other.months and - self.days == other.days and - self.hours == other.hours and - self.minutes == other.minutes and - self.seconds == other.seconds and - self.leapdays == other.leapdays and - self.year == other.year and - self.month == other.month and - self.day == other.day and - self.hour == other.hour and - self.minute == other.minute and - self.second == other.second and - self.microsecond == other.microsecond) + return ( + self.years == other.years + and self.months == other.months + and self.days == other.days + and self.hours == other.hours + and self.minutes == other.minutes + and self.seconds == other.seconds + and self.leapdays == other.leapdays + and self.year == other.year + and self.month == other.month + and self.day == other.day + and self.hour == other.hour + and self.minute == other.minute + and self.second == other.second + and self.microsecond == other.microsecond + ) def __ne__(self, other): return not self.__eq__(other) @@ -422,16 +458,33 @@ def __div__(self, other): def __repr__(self): l = [] - for attr in ["years", "months", "days", "leapdays", - "hours", "minutes", "seconds", "microseconds"]: + for attr in [ + "years", + 
"months", + "days", + "leapdays", + "hours", + "minutes", + "seconds", + "microseconds", + ]: value = getattr(self, attr) if value: l.append("%s=%+d" % (attr, value)) - for attr in ["year", "month", "day", "weekday", - "hour", "minute", "second", "microsecond"]: + for attr in [ + "year", + "month", + "day", + "weekday", + "hour", + "minute", + "second", + "microsecond", + ]: value = getattr(self, attr) if value is not None: - l.append("%s=%s" % (attr, repr(value))) - return "%s(%s)" % (self.__class__.__name__, ", ".join(l)) + l.append(f"{attr}={repr(value)}") + return f"{self.__class__.__name__}({', '.join(l)})" + # vim:ts=4:sw=4:et diff --git a/src/whoosh/support/unicode.py b/src/whoosh/support/unicode.py index 351c7130..4010744b 100644 --- a/src/whoosh/support/unicode.py +++ b/src/whoosh/support/unicode.py @@ -3,7 +3,6 @@ from whoosh.compat import text_type, u - # http://unicode.org/Public/UNIDATA/Blocks.txt _blockdata = """ # Blocks-5.1.0.txt @@ -217,7 +216,7 @@ _names = [] -class blocks(object): +class blocks: pass diff --git a/src/whoosh/system.py b/src/whoosh/system.py index 2bdce1b1..13c3da66 100644 --- a/src/whoosh/system.py +++ b/src/whoosh/system.py @@ -28,7 +28,6 @@ import sys from struct import Struct, calcsize - IS_LITTLE = sys.byteorder == "little" _INT_SIZE = calcsize("!i") @@ -76,4 +75,4 @@ if sys.version_info[0] < 3: emptybytes = "" else: - emptybytes = "".encode("latin-1") + emptybytes = b"" diff --git a/src/whoosh/util/__init__.py b/src/whoosh/util/__init__.py index cc91d3d9..baddf02e 100644 --- a/src/whoosh/util/__init__.py +++ b/src/whoosh/util/__init__.py @@ -25,14 +25,13 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import with_statement -import random, sys, time + +import random +import sys +import time from bisect import insort from functools import wraps -from whoosh.compat import range - - # These must be valid separate characters in CASE-INSENSTIVE filenames IDCHARS = "0123456789abcdefghijklmnopqrstuvwxyz" @@ -78,7 +77,7 @@ def make_binary_tree(fn, args, **kwargs): return fn( make_binary_tree(fn, args[:half], **kwargs), make_binary_tree(fn, args[half:], **kwargs), - **kwargs + **kwargs, ) diff --git a/src/whoosh/util/cache.py b/src/whoosh/util/cache.py index 00cb3f27..6e8b7a09 100644 --- a/src/whoosh/util/cache.py +++ b/src/whoosh/util/cache.py @@ -25,14 +25,13 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import with_statement + import functools from heapq import nsmallest from operator import itemgetter from whoosh.compat import iteritems - try: from collections import Counter except ImportError: diff --git a/src/whoosh/util/filelock.py b/src/whoosh/util/filelock.py index 5534d123..e8beae57 100644 --- a/src/whoosh/util/filelock.py +++ b/src/whoosh/util/filelock.py @@ -58,9 +58,8 @@ def try_for(fn, timeout=5.0, delay=0.1): return v -class LockBase(object): - """Base class for file locks. - """ +class LockBase: + """Base class for file locks.""" def __init__(self, filename): self.fd = None @@ -88,8 +87,7 @@ def release(self): class FcntlLock(LockBase): - """File lock based on UNIX-only fcntl module. 
- """ + """File lock based on UNIX-only fcntl module.""" def acquire(self, blocking=False): import fcntl # type: ignore @UnresolvedImport @@ -105,7 +103,7 @@ def acquire(self, blocking=False): fcntl.flock(self.fd, mode) self.locked = True return True - except IOError: + except OSError: e = sys.exc_info()[1] if e.errno not in (errno.EAGAIN, errno.EACCES): raise @@ -118,14 +116,14 @@ def release(self): raise Exception("Lock was not acquired") import fcntl # type: ignore @UnresolvedImport + fcntl.flock(self.fd, fcntl.LOCK_UN) os.close(self.fd) self.fd = None class MsvcrtLock(LockBase): - """File lock based on Windows-only msvcrt module. - """ + """File lock based on Windows-only msvcrt module.""" def acquire(self, blocking=False): import msvcrt # type: ignore @UnresolvedImport @@ -139,7 +137,7 @@ def acquire(self, blocking=False): try: msvcrt.locking(self.fd, mode, 1) return True - except IOError: + except OSError: e = sys.exc_info()[1] if e.errno not in (errno.EAGAIN, errno.EACCES, errno.EDEADLK): raise diff --git a/src/whoosh/util/loading.py b/src/whoosh/util/loading.py index 8d549d3a..0daf281b 100644 --- a/src/whoosh/util/loading.py +++ b/src/whoosh/util/loading.py @@ -37,17 +37,17 @@ def __init__(self, f, objmap, shortcuts=None): pickle.Unpickler.__init__(self, f) if shortcuts: - objmap = dict((k % shortcuts, v % shortcuts) for k, v in objmap.items()) + objmap = {k % shortcuts: v % shortcuts for k, v in objmap.items()} self._objmap = objmap def find_class(self, modulename, objname): - fqname = "%s.%s" % (modulename, objname) + fqname = f"{modulename}.{objname}" if fqname in self._objmap: fqname = self._objmap[fqname] try: obj = find_object(fqname) except ImportError: - raise ImportError("Couldn't find %r" % fqname) + raise ImportError(f"Couldn't find {fqname!r}") return obj @@ -62,7 +62,7 @@ def find_object(name, blacklist=None, whitelist=None): for pre in blacklist: if name.startswith(pre): raise TypeError( - "%r: can't instantiate names starting with %r" % (name, pre) + f"{name!r}: can't instantiate names starting with {pre!r}" ) if whitelist: passes = False @@ -71,11 +71,11 @@ def find_object(name, blacklist=None, whitelist=None): passes = True break if not passes: - raise TypeError("Can't instantiate %r" % name) + raise TypeError(f"Can't instantiate {name!r}") lastdot = name.rfind(".") - assert lastdot > -1, "Name %r must be fully qualified" % name + assert lastdot > -1, f"Name {name!r} must be fully qualified" modname = name[:lastdot] clsname = name[lastdot + 1 :] diff --git a/src/whoosh/util/numeric.py b/src/whoosh/util/numeric.py index 5b4670c8..af813f49 100644 --- a/src/whoosh/util/numeric.py +++ b/src/whoosh/util/numeric.py @@ -25,17 +25,31 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
-import math, struct +import math +import struct from array import array from bisect import bisect_left from struct import pack, unpack from whoosh.compat import b, long_type -from whoosh.system import pack_byte, unpack_byte, pack_ushort, unpack_ushort -from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint -from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong -from whoosh.system import pack_float, unpack_float, pack_double, unpack_double - +from whoosh.system import ( + pack_byte, + pack_double, + pack_float, + pack_int, + pack_long, + pack_uint, + pack_ulong, + pack_ushort, + unpack_byte, + unpack_double, + unpack_float, + unpack_int, + unpack_long, + unpack_uint, + unpack_ulong, + unpack_ushort, +) NaN = struct.unpack(" other.tuple() def __ge__(self, other): if not hasattr(other, "tuple"): - raise ValueError("Can't compare %r with %r" % (self, other)) + raise ValueError(f"Can't compare {self!r} with {other!r}") return self.tuple() >= other.tuple() def __le__(self, other): if not hasattr(other, "tuple"): - raise ValueError("Can't compare %r with %r" % (self, other)) + raise ValueError(f"Can't compare {self!r} with {other!r}") return self.tuple() <= other.tuple() def __ne__(self, other): if not hasattr(other, "tuple"): - raise ValueError("Can't compare %r with %r" % (self, other)) + raise ValueError(f"Can't compare {self!r} with {other!r}") return self.tuple() != other.tuple() @@ -128,7 +128,7 @@ class SimpleVersion(BaseVersion): ] _ex_bits = {"a": 0, "b": 1, "c": 2, "rc": 10, "z": 15} - _bits_ex = dict((v, k) for k, v in _ex_bits.items()) + _bits_ex = {v: k for k, v in _ex_bits.items()} __slots__ = ("major", "minor", "release", "ex", "exnum") diff --git a/src/whoosh/writing.py b/src/whoosh/writing.py index 57b7d271..54b76f92 100644 --- a/src/whoosh/writing.py +++ b/src/whoosh/writing.py @@ -25,8 +25,9 @@ # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. -from __future__ import with_statement -import threading, time + +import threading +import time from bisect import bisect_right from contextlib import contextmanager @@ -39,7 +40,6 @@ from whoosh.util.filelock import try_for from whoosh.util.text import utf8encode - # Exceptions @@ -67,6 +67,7 @@ def groupmanager(writer): def NO_MERGE(writer, segments): """This policy does not merge any existing segments.""" + _ = writer return segments @@ -124,6 +125,7 @@ def CLEAR(writer, segments): """This policy DELETES all existing segments and only writes the new segment. """ + _ = writer return [] @@ -146,7 +148,7 @@ def __init__(self, tempstore, segment, limitmb=128, **kwargs): self.fieldnames = set() def _new_run(self): - path = "%s.run" % random_name() + path = f"{random_name()}.run" f = self.tempstore.create_file(path).raw_file() return path, f @@ -157,10 +159,9 @@ def _remove_run(self, path): return self.tempstore.delete_file(path) def add(self, item): - # item = (fieldname, tbytes, docnum, weight, vbytes) - assert isinstance(item[1], bytes_type), "tbytes=%r" % item[1] + assert isinstance(item[1], bytes_type), f"tbytes={item[1]!r}" if item[4] is not None: - assert isinstance(item[4], bytes_type), "vbytes=%r" % item[4] + assert isinstance(item[4], bytes_type), f"vbytes={item[4]!r}" self.fieldnames.add(item[0]) size = ( 28 @@ -192,7 +193,7 @@ def save(self): # Writer base class -class IndexWriter(object): +class IndexWriter: """High-level object for writing to an index. 
To get a writer for a particular index, call @@ -359,7 +360,7 @@ def add_document(self, **fields): from datetime import datetime, timedelta from whoosh import index - from whoosh.fields import * + from whoosh.fields import Schema, DATETIME, NUMERIC, TEXT schema = Schema(date=DATETIME, size=NUMERIC(float), content=TEXT) myindex = index.create_in("indexdir", schema) @@ -420,7 +421,7 @@ def _doc_boost(self, fields, default=1.0): return default def _field_boost(self, fields, fieldname, default=1.0): - boostkw = "_%s_boost" % fieldname + boostkw = f"_{fieldname}_boost" if boostkw in fields: return float(fields[boostkw]) else: @@ -526,7 +527,7 @@ def __init__( docbase=0, codec=None, compound=True, - **kwargs + **kwargs, ): # Lock the index self.writelock = None @@ -552,7 +553,7 @@ def __init__( self._setup_doc_offsets() # Internals - self._tempstorage = self.storage.temp_storage("%s.tmp" % self.indexname) + self._tempstorage = self.storage.temp_storage(f"{self.indexname}.tmp") newsegment = codec.new_segment(self.storage, self.indexname) self.newsegment = newsegment self.compound = compound and newsegment.should_assemble() @@ -574,7 +575,7 @@ def __repr__(self): # Origin bitbucket issue: https://bitbucket.org/mchaput/whoosh/issues/483 # newsegment might not be set due to LockError # so use getattr to be safe - return "<%s %r>" % (self.__class__.__name__, getattr(self, "newsegment", None)) + return f"<{self.__class__.__name__} {getattr(self, 'newsegment', None)!r}>" def _check_state(self): if self.is_closed: @@ -623,13 +624,13 @@ def add_field(self, fieldname, fieldspec, **kwargs): self._check_state() if self._added: raise Exception("Can't modify schema after adding data to writer") - super(SegmentWriter, self).add_field(fieldname, fieldspec, **kwargs) + super().add_field(fieldname, fieldspec, **kwargs) def remove_field(self, fieldname): self._check_state() if self._added: raise Exception("Can't modify schema after adding data to writer") - super(SegmentWriter, self).remove_field(fieldname) + super().remove_field(fieldname) def has_deletions(self): """ @@ -642,7 +643,7 @@ def has_deletions(self): def delete_document(self, docnum, delete=True): self._check_state() if docnum >= sum(seg.doc_count_all() for seg in self.segments): - raise IndexingError("No document ID %r in this index" % docnum) + raise IndexingError(f"No document ID {docnum!r} in this index") segment, segdocnum = self._segment_and_docnum(docnum) segment.delete_document(segdocnum, delete=delete) @@ -708,7 +709,7 @@ def write_per_doc(self, fieldnames, reader): pdw.start_doc(self.docnum) # Set disjunction includes dynamic fields (can be different for each document) - for fieldname in fieldnames | set(s for s in stored if s in self.schema): + for fieldname in fieldnames | {s for s in stored if s in self.schema}: fieldobj = schema[fieldname] length = reader.doc_field_length(docnum, fieldname) pdw.add_field(fieldname, fieldobj, stored.get(fieldname), length) @@ -729,9 +730,9 @@ def write_per_doc(self, fieldnames, reader): def add_reader(self, reader): self._check_state() basedoc = self.docnum - ndxnames = set( + ndxnames = { fname for fname in reader.indexed_field_names() if fname in self.schema - ) + } fieldnames = set(self.schema.names()) | ndxnames docmap = self.write_per_doc(fieldnames, reader) @@ -742,7 +743,7 @@ def _check_fields(self, schema, fieldnames): # Check if the caller gave us a bogus field for name in fieldnames: if name not in schema: - raise UnknownFieldError("No field named %r in %s" % (name, schema)) + raise 
UnknownFieldError(f"No field named {name!r} in {schema}") def add_document(self, **fields): self._check_state() @@ -786,7 +787,7 @@ def add_document(self, **fields): spellfield = field.spelling_fieldname(fieldname) for word in field.spellable_words(value): word = utf8encode(word)[0] - # item = (fieldname, tbytes, docnum, weight, vbytes) + add_post((spellfield, word, 0, 1, vbytes)) vformat = field.vector @@ -801,7 +802,7 @@ def add_document(self, **fields): perdocwriter.add_vector_items(fieldname, field, vitems) # Allow a custom value for stored field/column - customval = fields.get("_stored_%s" % fieldname, value) + customval = fields.get(f"_stored_{fieldname}", value) # Add the stored value and length for this field to the per- # document writer @@ -812,7 +813,7 @@ def add_document(self, **fields): if column and customval is not None: cv = field.to_column_value(customval) perdocwriter.add_column_value(fieldname, column, cv) - except Exception as ex: + except ValueError as ex: perdocwriter.cancel_doc() raise ex @@ -830,7 +831,7 @@ def get_segment(self): def per_document_reader(self): if not self.perdocwriter.is_closed: - raise Exception("Per-doc writer is still open") + raise RuntimeError("Per-doc writer is still open") return self.codec.per_document_reader(self.storage, self.get_segment()) def searcher(self, **kwargs): @@ -838,10 +839,10 @@ def searcher(self, **kwargs): # We have a write lock, nothing is changing. Only cache if kwargs is emtpy # and the SegmentWriter is still open. if kwargs or self.is_closed: - return super(SegmentWriter, self).searcher(**kwargs) + return super().searcher(**kwargs) if self._searcher is None: - s = super(SegmentWriter, self).searcher() + s = super().searcher() self._searcher = s s._orig_close = s.close # called in _finish() s.close = lambda: None diff --git a/stress/test_bigfacet.py b/stress/test_bigfacet.py index a806f621..e41dbe7d 100644 --- a/stress/test_bigfacet.py +++ b/stress/test_bigfacet.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - import os.path import random import string diff --git a/stress/test_bigindex.py b/stress/test_bigindex.py index 3501b28a..80ca84a3 100644 --- a/stress/test_bigindex.py +++ b/stress/test_bigindex.py @@ -1,11 +1,9 @@ -from __future__ import with_statement - import random from whoosh import fields -from whoosh.compat import range, text_type, u -from whoosh.util.testing import TempIndex +from whoosh.compat import text_type, u from whoosh.util import now +from whoosh.util.testing import TempIndex def test_20000_single(): diff --git a/stress/test_bigsort.py b/stress/test_bigsort.py index a017e0e1..b25b9007 100644 --- a/stress/test_bigsort.py +++ b/stress/test_bigsort.py @@ -1,8 +1,10 @@ -import os.path, random, shutil -from datetime import datetime +import os.path +import random +import shutil +from datetime import datetime, timezone from whoosh import fields, index, query -from whoosh.compat import text_type, range +from whoosh.compat import text_type from whoosh.util import now @@ -22,7 +24,9 @@ def test_bigsort(): t = now() w = ix.writer(limitmb=512) for i in range(times): - dt = datetime.fromtimestamp(random.randint(15839593, 1294102139)) + dt = datetime.fromtimestamp( + random.randint(15839593, 1294102139), tz=timezone.utc + ) w.add_document(id=text_type(i), date=dt) w.commit() print("Writing took ", now() - t) diff --git a/stress/test_bigtable.py b/stress/test_bigtable.py index 33ef1ef2..0040d53d 100644 --- a/stress/test_bigtable.py +++ b/stress/test_bigtable.py @@ -1,11 +1,8 @@ -from __future__ import 
with_statement - from random import randint, shuffle from nose.tools import assert_equal # type: ignore @UnresolvedImport - -from whoosh.compat import range, iteritems -from whoosh.filedb.filetables import HashWriter, HashReader +from whoosh.compat import iteritems +from whoosh.filedb.filetables import HashReader, HashWriter from whoosh.util.testing import TempStorage @@ -16,7 +13,7 @@ def randstring(min, max): return "".join(chr(randint(1, 255)) for _ in range(randint(min, max))) count = 100000 - samp = dict((randstring(1, 50), randstring(1, 50)) for _ in range(count)) + samp = {randstring(1, 50): randstring(1, 50) for _ in range(count)} fhw = HashWriter(st.create_file("big.hsh")) fhw.add_all(iteritems(samp)) diff --git a/stress/test_hugeindex.py b/stress/test_hugeindex.py index ed37e740..d82193df 100644 --- a/stress/test_hugeindex.py +++ b/stress/test_hugeindex.py @@ -1,10 +1,7 @@ -from __future__ import with_statement import struct from nose.tools import assert_equal # type: ignore @UnresolvedImport - from whoosh import formats -from whoosh.compat import range from whoosh.filedb.filepostings import FilePostingReader, FilePostingWriter from whoosh.util.testing import TempStorage @@ -19,8 +16,8 @@ def test_huge_postfile(): assert_equal(pf.tell(), gb5 + 4) fpw = FilePostingWriter(pf) - format = formats.Frequency(None) - offset = fpw.start(format) + f = formats.Frequency(None) + offset = fpw.start(f) for i in range(10): fpw.write(i, float(i), struct.pack("!I", i), 10) posttotal = fpw.finish() @@ -28,7 +25,7 @@ def test_huge_postfile(): fpw.close() pf = st.open_file("test.pst") - pfr = FilePostingReader(pf, offset, format) + pfr = FilePostingReader(pf, offset, f) i = 0 while pfr.is_active(): assert_equal(pfr.id(), i) diff --git a/stress/test_threading.py b/stress/test_threading.py index d0c3a13d..c02b3d47 100644 --- a/stress/test_threading.py +++ b/stress/test_threading.py @@ -1,8 +1,9 @@ -from __future__ import with_statement -import random, threading, time +import random +import threading +import time from whoosh import fields, query -from whoosh.compat import range, u, text_type +from whoosh.compat import text_type, u from whoosh.util.testing import TempStorage diff --git a/stress/test_update.py b/stress/test_update.py index 06f935c3..63524989 100644 --- a/stress/test_update.py +++ b/stress/test_update.py @@ -1,10 +1,8 @@ -from __future__ import with_statement import random from nose.tools import assert_equal - from whoosh import fields, query -from whoosh.compat import range, text_type +from whoosh.compat import text_type from whoosh.util.testing import TempIndex diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 4508b4c7..394ce032 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -1,12 +1,6 @@ -# coding=utf-8 - -from __future__ import with_statement - import pytest - from whoosh import analysis, fields, qparser -from whoosh.compat import b, u, unichr -from whoosh.compat import dumps +from whoosh.compat import b, dumps, u, unichr from whoosh.filedb.filestore import RamStorage @@ -256,7 +250,7 @@ def test_shingles(): def test_unicode_blocks(): - from whoosh.support.unicode import blocks, blockname, blocknum + from whoosh.support.unicode import blockname, blocknum, blocks assert blockname(u("a")) == "Basic Latin" assert blockname(unichr(0x0B80)) == "Tamil" diff --git a/tests/test_automata.py b/tests/test_automata.py index dd122b48..d66fa021 100644 --- a/tests/test_automata.py +++ b/tests/test_automata.py @@ -2,9 +2,8 @@ import os.path from bisect import 
bisect_left -from whoosh.compat import permutations -from whoosh.compat import range from whoosh.automata import fsa, glob, lev +from whoosh.compat import permutations from whoosh.support.levenshtein import levenshtein @@ -156,15 +155,7 @@ def test_glob_range(): assert not nfa.accept("acc") -# def test_glob_negate_range(): -# nfa = glob.glob_automaton("a[!ab]a") -# assert not nfa.accept("aaa") -# assert not nfa.accept("aba") -# assert nfa.accept("aca") -# assert not nfa.accept("bcb") - - -class Skipper(object): +class Skipper: def __init__(self, data): self.data = data self.i = 0 diff --git a/tests/test_classify.py b/tests/test_classify.py index c36c0e4a..89d008d3 100644 --- a/tests/test_classify.py +++ b/tests/test_classify.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - from whoosh import analysis, classify, fields, formats, query, reading from whoosh.compat import text_type, u from whoosh.filedb.filestore import RamStorage @@ -58,9 +56,11 @@ def test_add_text(model=classify.Bo1Model): with ix.reader() as r: exp = classify.Expander(r, "content", model=model) exp.add_text(text) - assert set([t[0] for t in exp.expanded_terms(3)]) == set( - ["particles", "velocity", "field"] - ) + assert {t[0] for t in exp.expanded_terms(3)} == { + "particles", + "velocity", + "field", + } exp = classify.Expander(r, "extra", model=model) exp.add_text(text) assert exp.expanded_terms(3) == [] @@ -80,7 +80,7 @@ def test_keyterms_from_text(model=classify.Bo2Model): ix = create_index() with ix.searcher() as s: keys = list(s.key_terms_from_text("content", text, model=model)) - assert set([t[0] for t in keys]) == set(["particles", "velocity", "field"]) + assert {t[0] for t in keys} == {"particles", "velocity", "field"} keys = list(s.key_terms_from_text("extra", text, model=model)) assert keys == [] diff --git a/tests/test_codecs.py b/tests/test_codecs.py index 49eba32a..ef0b74d6 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -1,13 +1,10 @@ -from __future__ import with_statement import random from array import array import pytest - from whoosh import analysis, fields, formats, query -from whoosh.compat import u, b, text_type -from whoosh.compat import array_tobytes, range from whoosh.codec import default_codec +from whoosh.compat import array_tobytes, b, text_type, u from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempStorage @@ -19,14 +16,13 @@ def _make_codec(**kwargs): return st, codec, seg -class FakeLengths(object): +class FakeLengths: def __init__(self, **lens): self.lens = lens def doc_field_length(self, docnum, fieldname): - if fieldname in self.lens: - if docnum < len(self.lens[fieldname]): - return self.lens[fieldname][docnum] + if fieldname in self.lens and docnum < len(self.lens[fieldname]): + return self.lens[fieldname][docnum] return 1 @@ -65,9 +61,7 @@ def random_btext(): return array_tobytes(a).decode("utf-16") domain = sorted( - set( - [(random_fieldname(), random_btext().encode("utf-8")) for _ in range(1000)] - ) + {(random_fieldname(), random_btext().encode("utf-8")) for _ in range(1000)} ) st, codec, seg = _make_codec() @@ -171,19 +165,23 @@ def test_termindex(): assert ti.doc_frequency() == 1 +test123 = "Testing one two three" + + def test_docwriter_one(): field = fields.TEXT(stored=True) st, codec, seg = _make_codec() dw = codec.per_document_writer(st, seg) dw.start_doc(0) - dw.add_field("text", field, "Testing one two three", 4) + + dw.add_field("text", field, test123, 4) dw.finish_doc() dw.close() seg.set_doc_count(1) pdr = 
codec.per_document_reader(st, seg) assert pdr.doc_field_length(0, "text") == 4 - assert pdr.stored_fields(0) == {"text": "Testing one two three"} + assert pdr.stored_fields(0) == {"text": test123} def test_docwriter_two(): @@ -192,7 +190,7 @@ def test_docwriter_two(): dw = codec.per_document_writer(st, seg) dw.start_doc(0) dw.add_field("title", field, ("a", "b"), 2) - dw.add_field("text", field, "Testing one two three", 4) + dw.add_field("text", field, test123, 4) dw.finish_doc() dw.start_doc(1) dw.add_field("title", field, "The second document", 3) @@ -296,6 +294,8 @@ def test_store_zero(): def test_fieldwriter_single_term(): + import math + field = fields.TEXT() st, codec, seg = _make_codec() @@ -310,16 +310,18 @@ def test_fieldwriter_single_term(): tr = codec.terms_reader(st, seg) assert ("text", b("alfa")) in tr ti = tr.term_info("text", b("alfa")) - assert ti.weight() == 1.5 + assert math.isclose(ti.weight(), 1.5) assert ti.doc_frequency() == 1 assert ti.min_length() == 1 assert ti.max_length() == 1 - assert ti.max_weight() == 1.5 + assert math.isclose(ti.max_weight(), 1.5) assert ti.min_id() == 0 assert ti.max_id() == 0 def test_fieldwriter_two_terms(): + import math + field = fields.TEXT() st, codec, seg = _make_codec() @@ -338,21 +340,23 @@ def test_fieldwriter_two_terms(): tr = codec.terms_reader(st, seg) assert ("text", b("alfa")) in tr + ti = tr.term_info("text", b("alfa")) - assert ti.weight() == 3.0 + assert math.isclose(ti.weight(), 3.0) assert ti.doc_frequency() == 2 assert ti.min_length() == 1 assert ti.max_length() == 2 - assert ti.max_weight() == 2.0 + assert math.isclose(ti.max_weight(), 2.0) assert ti.min_id() == 0 assert ti.max_id() == 1 assert ("text", b("bravo")) in tr + ti = tr.term_info("text", b("bravo")) - assert ti.weight() == 5.0 + assert math.isclose(ti.weight(), 5.0) assert ti.doc_frequency() == 2 assert ti.min_length() == 2 assert ti.max_length() == 3 - assert ti.max_weight() == 3.0 + assert math.isclose(ti.max_weight(), 3.0) assert ti.min_id() == 0 assert ti.max_id() == 2 @@ -361,6 +365,8 @@ def test_fieldwriter_two_terms(): def test_fieldwriter_multiblock(): + import math + field = fields.TEXT() st, codec, seg = _make_codec(blocklimit=2) @@ -378,11 +384,11 @@ def test_fieldwriter_multiblock(): tr = codec.terms_reader(st, seg) ti = tr.term_info("text", b("alfa")) - assert ti.weight() == 15.0 + assert math.isclose(ti.weight(), 15.0) assert ti.doc_frequency() == 5 assert ti.min_length() == 1 assert ti.max_length() == 5 - assert ti.max_weight() == 5.0 + assert math.isclose(ti.max_weight(), 5.0) assert ti.min_id() == 0 assert ti.max_id() == 4 @@ -517,6 +523,8 @@ def test_skip(): # cur = codec.graph_reader(st, seg).cursor("text") # assert list(cur.flatten_strings()) == ["specials", "specifically"] +cde = "charlie delta echo" + def test_plaintext_codec(): pytest.importorskip("ast") @@ -540,9 +548,7 @@ def test_plaintext_codec(): w.add_document( a=u("bravo charlie delta"), b=1000, c=200, d=u("rolling timing yelling") ) - w.add_document( - a=u("charlie delta echo"), b=5.5, c=300, d=u("using opening pulling") - ) + w.add_document(a=u(cde), b=5.5, c=300, d=u("using opening pulling")) w.add_document( a=u("delta echo foxtrot"), b=True, c=-100, d=u("aching selling dipping") ) @@ -553,7 +559,7 @@ def test_plaintext_codec(): with ix.reader() as r: assert r.has_column("a") c = r.column_reader("a") - assert c[2] == u("charlie delta echo") + assert c[2] == u(cde) w = ix.writer(codec=PlainTextCodec()) w.commit(optimize=True) @@ -574,7 +580,7 @@ def test_plaintext_codec(): 
storage = ix.storage for fname in storage.list(): if fname.endswith(".dcs"): - f = storage.open_file(fname) + storage.open_file(fname) # print(f.read().decode("utf8")) assert reader.doc_field_length(0, "a") == 3 @@ -588,7 +594,7 @@ def test_plaintext_codec(): assert reader.has_column("a") c = reader.column_reader("a") - assert c[2] == u("charlie delta echo") + assert c[2] == u(cde) assert reader.has_column("c") c = reader.column_reader("c") @@ -596,7 +602,7 @@ def test_plaintext_codec(): assert s.has_vector(2, "a") v = s.vector(2, "a") - assert " ".join(v.all_ids()) == "charlie delta echo" + assert " ".join(v.all_ids()) == cde def test_memory_codec(): @@ -619,9 +625,7 @@ def test_memory_codec(): w.add_document( a=u("bravo charlie delta"), b=1000, c=200, d=u("rolling timing yelling") ) - w.add_document( - a=u("charlie delta echo"), b=5.5, c=300, d=u("using opening pulling") - ) + w.add_document(a=u(cde), b=5.5, c=300, d=u("using opening pulling")) w.add_document( a=u("delta echo foxtrot"), b=True, c=-100, d=u("aching selling dipping") ) @@ -651,7 +655,7 @@ def test_memory_codec(): assert s.has_vector(2, "a") v = s.vector(2, "a") - assert " ".join(v.all_ids()) == "charlie delta echo" + assert " ".join(v.all_ids()) == cde def test_memory_multiwrite(): @@ -678,3 +682,43 @@ def test_memory_multiwrite(): " ".join(reader.field_terms("line")) == "alfa bravo charlie delta echo foxtrot india juliet" ) + + +# can add a new field to the schema before adding documents +def test_add_new_field_to_schema(): + from whoosh.codec.memory import MemoryCodec, MemWriter + from whoosh.fields import TEXT, Schema + + codec = MemoryCodec() + schema = Schema(title=TEXT(stored=True), content=TEXT) + ix = codec.storage.create_index(schema) + writer = MemWriter(ix, _lk=False, codec=codec, docbase=0) + + new_field = TEXT(stored=True) + writer.add_field("author", new_field) + + assert "author" in writer.schema.names() + assert writer.schema["author"] == new_field + + +# can add a reader to the index +def test_add_reader_to_index(): + from whoosh.codec.memory import MemoryCodec, MemWriter + from whoosh.fields import TEXT, Schema + + # Define the schema for the index + schema = Schema(title=TEXT(stored=True), content=TEXT) + + # Create a codec and an index + codec = MemoryCodec() + ix = codec.storage.create_index(schema) + + # Create a writer for the index + writer = MemWriter(ix, _lk=False, codec=codec, docbase=0) + + # Get a reader from the writer and add it to the index + reader = writer.reader() + writer.add_reader(reader) + + # Assert that the reader was added to the index + assert writer._added == True diff --git a/tests/test_collector.py b/tests/test_collector.py index f4609e21..16260a06 100644 --- a/tests/test_collector.py +++ b/tests/test_collector.py @@ -1,7 +1,4 @@ -from __future__ import with_statement - import pytest - from whoosh import collectors, fields, query, searching from whoosh.compat import u from whoosh.filedb.filestore import RamStorage @@ -124,7 +121,7 @@ def matcher(self, searcher, context=None): c = collectors.TimeLimitCollector(c, 0.2) with pytest.raises(searching.TimeLimit): _ = s.search_with_collector(q, c) - assert time.time() - t < 0.5, "Actual time interval: {}".format(time.time() - t) + assert time.time() - t < 0.5, f"Actual time interval: {time.time() - t}" def test_reverse_collapse(): diff --git a/tests/test_columns.py b/tests/test_columns.py index c4ff967a..55a5cec7 100644 --- a/tests/test_columns.py +++ b/tests/test_columns.py @@ -1,12 +1,15 @@ -from __future__ import with_statement 
-import inspect, random, sys +import inspect +import random +import sys +import pytest from whoosh import columns, fields, query from whoosh.codec.whoosh3 import W3Codec -from whoosh.compat import b, u, BytesIO, bytes_type, text_type -from whoosh.compat import izip, range, dumps, loads +from whoosh.compat import BytesIO, b, bytes_type, dumps, izip, loads, text_type, u from whoosh.filedb import compound from whoosh.filedb.filestore import RamStorage +from whoosh.matching import ConstantScoreMatcher +from whoosh.query import ColumnMatcher, ColumnQuery from whoosh.util.testing import TempIndex, TempStorage @@ -26,7 +29,7 @@ def test_pickleability(): coltypes = [ c for _, c in inspect.getmembers(columns, inspect.isclass) - if issubclass(c, columns.Column) and not c in ignore + if issubclass(c, columns.Column) and c not in ignore ] for coltype in coltypes: @@ -35,7 +38,7 @@ def test_pickleability(): inst = coltype(*args) except TypeError: e = sys.exc_info()[1] - raise TypeError("Error instantiating %r: %s" % (coltype, e)) + raise TypeError(f"Error instantiating {coltype!r}: {e}") _ = loads(dumps(inst, -1)) @@ -54,7 +57,7 @@ def test_multistream(): st = RamStorage() msw = compound.CompoundWriter(st) - files = dict((name, msw.create_file(name)) for name in "abc") + files = {name: msw.create_file(name) for name in "abc"} for name, data in domain: files[name].write(b(data)) f = st.create_file("test") @@ -80,7 +83,7 @@ def randstring(n): value = randstring(2500) domain[name] = value - outfiles = dict((name, BytesIO(value)) for name, value in domain.items()) + outfiles = {name: BytesIO(value) for name, value in domain.items()} with TempStorage() as st: msw = compound.CompoundWriter(st, buffersize=1024) @@ -197,7 +200,7 @@ def test_roundtrip(): c = columns.VarBytesListColumn() _rt(c, [[b("garnet"), b("amethyst")], [b("pearl")]], []) - c = columns.VarBytesListColumn() + _c = columns.VarBytesListColumn() c = columns.FixedBytesListColumn(4) _rt(c, [[b("garn"), b("amet")], [b("pear")]], []) @@ -225,8 +228,9 @@ def test_column_field(): a=fields.TEXT(sortable=True), b=fields.COLUMN(columns.RefBytesColumn()) ) with TempIndex(schema, "columnfield") as ix: + cd = b("charlie delta") with ix.writer(codec=W3Codec()) as w: - w.add_document(a=u("alfa bravo"), b=b("charlie delta")) + w.add_document(a=u("alfa bravo"), b=cd) w.add_document(a=u("bravo charlie"), b=b("delta echo")) w.add_document(a=u("charlie delta"), b=b("echo foxtrot")) @@ -239,7 +243,7 @@ def test_column_field(): assert type(cra[0]) == text_type crb = r.column_reader("b") - assert crb[0] == b("charlie delta") + assert crb[0] == cd assert type(crb[0]) == bytes_type @@ -247,7 +251,7 @@ def test_column_query(): schema = fields.Schema( id=fields.STORED, a=fields.ID(sortable=True), b=fields.NUMERIC(sortable=True) ) - with TempIndex(schema, "columnquery") as ix: + with TempIndex(schema, "ColumnQuery") as ix: with ix.writer(codec=W3Codec()) as w: w.add_document(id=1, a=u("alfa"), b=10) w.add_document(id=2, a=u("bravo"), b=20) @@ -261,16 +265,16 @@ def test_column_query(): def check(q): return [s.stored_fields(docnum)["id"] for docnum in q.docs(s)] - q = query.ColumnQuery("a", u("bravo")) + q = ColumnQuery("a", u("bravo")) assert check(q) == [2] - q = query.ColumnQuery("b", 30) + q = ColumnQuery("b", 30) assert check(q) == [3] - q = query.ColumnQuery("a", lambda v: v != u("delta")) + q = ColumnQuery("a", lambda v: v != u("delta")) assert check(q) == [1, 2, 3, 5, 6] - q = query.ColumnQuery("b", lambda v: v > 30) + q = ColumnQuery("b", lambda v: v > 30) assert 
check(q) == [4, 5, 6] @@ -348,3 +352,142 @@ def test_varbytes_offsets(): assert cr.raw_column().had_stored_offsets for i in (10, 100, 1000, 3000): assert cr[i] == values[i % vlen] + + +# Initializes the 'fieldname' and 'condition' attributes with the values passed as parameters. +def test_initializes_fieldname_and_condition_attributes(): + fieldname = "test_field" + condition = lambda x: x > 0 + query = ColumnQuery(fieldname, condition) + assert query.fieldname == fieldname + assert query.condition == condition + + +# If 'condition' is a callable, sets it as the 'condition' attribute. +def test_sets_condition_attribute_if_condition_is_callable(): + fieldname = "test_field" + condition = lambda x: x > 0 + query = ColumnQuery(fieldname, condition) + assert query.condition == condition + + +# If 'condition' is not a callable, it is stored unchanged on the 'condition' attribute. +def test_stores_condition_unchanged_if_condition_is_not_callable(): + fieldname = "test_field" + condition = 10 + query = ColumnQuery(fieldname, condition) + assert query.condition == 10 + + +# If 'fieldname' is not a string, it should not raise a TypeError. +def test_does_not_raise_typeerror_if_fieldname_is_not_string(): + fieldname = 10 + condition = lambda x: x > 0 + query = ColumnQuery(fieldname, condition) + assert query.fieldname == fieldname + assert query.condition == condition + + +# If 'condition' is not a callable and not a hashable type, the ColumnQuery object should be created without raising any exception. +def test_behavior_if_condition_is_not_callable_and_not_hashable(): + fieldname = "test_field" + condition = [] + query = ColumnQuery(fieldname, condition) + assert query.fieldname == fieldname + assert query.condition == condition + + +# If 'condition' is a callable and it raises an exception when called with a document value, raises that exception. +def test_raises_exception_if_condition_callable_raises_exception(): + fieldname = "test_field" + condition = lambda x: 1 / x + with pytest.raises(ZeroDivisionError): + query = ColumnQuery(fieldname, condition) + query.condition(0) + + +# If 'condition' is a callable and it returns a non-boolean value when called with a document value, does not raise a TypeError. +def test_does_not_raise_typeerror_if_condition_callable_returns_non_boolean_value(): + fieldname = "test_field" + condition = lambda x: "True" + query = ColumnQuery(fieldname, condition) + assert isinstance(query, ColumnQuery) + + +# If 'condition' is a callable and it returns True for all document values, returns a ConstantScoreMatcher that matches all documents. +def test_returns_constantscorematcher_matching_all_documents_if_condition_callable_returns_true_for_all_values(): + from unittest.mock import MagicMock, Mock + + fieldname = "test_field" + condition = lambda x: True + query = ColumnQuery(fieldname, condition) + searcher = Mock() + creader = MagicMock() + creader.__len__.return_value = 10 + creader.__getitem__.side_effect = lambda i: i + searcher.reader.return_value.column_reader.return_value = creader + assert isinstance(query.matcher(searcher), ConstantScoreMatcher) + + +# If 'condition' is a callable and it is very slow, the matcher may take a long time to initialize. 
+def test_matcher_initialization_may_take_long_time_if_condition_callable_is_very_slow(): + import time + from unittest.mock import Mock, patch + + fieldname = "test_field" + condition = lambda x: time.sleep(10) + query = ColumnQuery(fieldname, condition) + searcher = Mock() + searcher.reader.return_value.has_column.return_value = True + with patch.object(query, "matcher") as matcher_mock: + query.matcher(searcher) + matcher_mock.assert_called_once_with(searcher) + + +# Initializes the '_i' attribute to 0. +def test_initializes_i_attribute_to_0(): + condition = lambda x: x > 0 + creader = [] # Define creader variable + matcher = ColumnMatcher(creader, condition) + assert matcher._i == 0 + + +# Initializes the 'creader' attribute with the value passed as parameter. +def test_initializes_creader_attribute(): + condition = lambda x: x > 0 + creader = [1, 2, 3, 4, 5] + matcher = ColumnMatcher(creader, condition) + assert matcher.creader == creader + + +# Initializes the 'condition' attribute with the value passed as parameter. +def test_initializes_condition_attribute(): + condition = lambda x: x > 0 + creader = [] + matcher = ColumnMatcher(creader, condition) + assert matcher.condition == condition + + +# Returns True if the '_i' attribute is less than the length of the 'creader' attribute. +def test_returns_true_if_i_attribute_is_less_than_length_of_creader_attribute(): + condition = lambda x: x > 0 + creader = [1, 2, 3, 4, 5] + matcher = ColumnMatcher(creader, condition) + assert matcher.is_active() == True + + +# Returns False if the '_i' attribute is equal to or greater than the length of the 'creader' attribute. +def test_returns_false_if_i_attribute_is_equal_to_or_greater_than_length_of_creader_attribute(): + condition = lambda x: x > 0 + creader = [1, 2, 3, 4, 5] + matcher = ColumnMatcher(creader, condition) + matcher._i = len(creader) + assert matcher.is_active() == False + + +# test if the `is_leaf` function is True +def test_is_leaf_true(): + fieldname = "test_field" + condition = lambda x: x > 0 + query = ColumnQuery(fieldname, condition) + assert query.is_leaf() == True diff --git a/tests/test_compound.py b/tests/test_compound.py index 515b966b..38de3f22 100644 --- a/tests/test_compound.py +++ b/tests/test_compound.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - from whoosh.compat import b from whoosh.filedb.compound import CompoundStorage from whoosh.filedb.filestore import RamStorage diff --git a/tests/test_dateparse.py b/tests/test_dateparse.py index 7180e3b3..9acba143 100644 --- a/tests/test_dateparse.py +++ b/tests/test_dateparse.py @@ -1,7 +1,14 @@ -from whoosh.qparser.dateparse import * +from datetime import datetime, timedelta, timezone +from whoosh.qparser.dateparse import ( + English, + adatetime, + relative_days, + relativedelta, + timespan, +) -basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) +basedate = datetime(2010, 9, 20, 15, 16, 6, 454000, tzinfo=timezone.utc) english = English() @@ -10,7 +17,7 @@ def assert_adatetime(at, **kwargs): for key in adatetime.units: val = getattr(at, key) target = kwargs.get(key) - assert val == target, "at.%s=%r not %r in %r" % (key, val, target, at) + assert val == target, f"at.{key}={val!r} not {target!r} in {at!r}" def assert_timespan(ts, sargs, eargs): @@ -24,8 +31,8 @@ def assert_unamb(ts, **kwargs): def assert_unamb_span(ts, sargs, eargs): startdt = adatetime(**sargs).floor() enddt = adatetime(**eargs).ceil() - assert ts.start == startdt, "start %s != %s" % (ts.start, startdt) - assert ts.end == enddt, "end %s != 
%s" % (ts.end, enddt) + assert ts.start == startdt, f"start {ts.start} != {startdt}" + assert ts.end == enddt, f"end {ts.end} != {enddt}" def assert_datespan(ts, startdate, enddate): diff --git a/tests/test_fields.py b/tests/test_fields.py index 4c796c9e..47acff8e 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -1,10 +1,8 @@ -from __future__ import with_statement -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone import pytest - from whoosh import fields, qparser, query -from whoosh.compat import u, b, range +from whoosh.compat import b, u from whoosh.filedb.filestore import RamStorage from whoosh.util import times from whoosh.util.testing import TempIndex @@ -342,7 +340,7 @@ def test_nontext_document(): ) ix = RamStorage().create_index(schema) - dt = datetime.now() + dt = datetime.now(tz=timezone.utc) w = ix.writer() for i in range(50): w.add_document(id=i, num=i, date=dt + timedelta(days=i), even=not (i % 2)) @@ -367,7 +365,7 @@ def test_nontext_update(): ) ix = RamStorage().create_index(schema) - dt = datetime.now() + dt = datetime.now(tz=timezone.utc) w = ix.writer() for i in range(10): w.add_document(id=i, num=i, date=dt + timedelta(days=i)) @@ -393,7 +391,8 @@ def test_datetime(): for month in range(1, 12): for day in range(1, 28): w.add_document( - id=u("%s-%s") % (month, day), date=datetime(2010, month, day, 14, 0, 0) + id=u("%s-%s") % (month, day), + date=datetime(2010, month, day, 14, 0, 0, tzinfo=timezone.utc), ) w.commit() @@ -411,8 +410,8 @@ def test_datetime(): assert len(r) == 27 q = qp.parse(u("date:[2010-05 to 2010-08]")) - startdt = datetime(2010, 5, 1, 0, 0, 0, 0) - enddt = datetime(2010, 8, 31, 23, 59, 59, 999999) + startdt = datetime(2010, 5, 1, 0, 0, 0, 0, tzinfo=timezone.utc) + enddt = datetime(2010, 8, 31, 23, 59, 59, 999999, tzinfo=timezone.utc) assert q.__class__ is query.NumericRange assert q.start == times.datetime_to_long(startdt) assert q.end == times.datetime_to_long(enddt) @@ -622,7 +621,7 @@ def test_missing_field(): def test_token_boost(): - from whoosh.analysis import RegexTokenizer, DoubleMetaphoneFilter + from whoosh.analysis import DoubleMetaphoneFilter, RegexTokenizer ana = RegexTokenizer() | DoubleMetaphoneFilter() field = fields.TEXT(analyzer=ana, phrase=False) @@ -646,8 +645,8 @@ def test_pickle_idlist(): def test_pickle_schema(): from whoosh import analysis - from whoosh.support.charset import accent_map from whoosh.compat import dumps + from whoosh.support.charset import accent_map freetext_analyzer = analysis.StemmingAnalyzer() | analysis.CharsetFilter(accent_map) @@ -677,3 +676,37 @@ def test_pickle_schema(): with ix.reader() as r: assert dumps(r.schema, 2) + + +def test_valid_date_string(): + """Can parse a valid date string and return a NumericRange query with the parsed date as the value""" + import datetime + + from whoosh.fields import DATETIME, datetime_to_long + from whoosh.query import NumericRange + + # Initialize the DATETIME field + field = DATETIME() + + # Define a valid date string + date_string = "2022-01-01" + + # Invoke the parse_query method with the valid date string + query = field.parse_query("date", date_string) + + # Define the expected start and end dates + expected_start = datetime_to_long( + datetime.datetime(2022, 1, 1, tzinfo=timezone.utc) + ) + expected_end = datetime_to_long( + datetime.datetime(2022, 1, 1, tzinfo=timezone.utc) + + datetime.timedelta(days=1) + - datetime.timedelta(microseconds=1) + ) + + # Check that the query is a NumericRange query with 
the parsed date as the value + assert isinstance(query, NumericRange), "Query is not a NumericRange" + assert query.fieldname == "date", "Fieldname is not correct" + assert query.start == expected_start, "Start date is not correct" + assert query.end == expected_end, "End date is not correct" + assert query.boost == 1.0, "Boost value is not correct" diff --git a/tests/test_flexible.py b/tests/test_flexible.py index 18ac0267..3eb6c257 100644 --- a/tests/test_flexible.py +++ b/tests/test_flexible.py @@ -1,7 +1,5 @@ -from __future__ import with_statement - from whoosh import fields -from whoosh.compat import u, b +from whoosh.compat import b, u from whoosh.util.testing import TempIndex diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index ca85897f..e47b0d9b 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -1,12 +1,7 @@ -# coding: utf-8 - -from __future__ import with_statement - import pytest # from jieba.analyse import ChineseAnalyzer - -from whoosh import analysis, highlight, fields, qparser, query +from whoosh import analysis, fields, highlight, qparser, query from whoosh.compat import u from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempIndex diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 5820e7bd..01c1fb07 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -1,16 +1,14 @@ -from __future__ import with_statement import random from collections import defaultdict -from datetime import datetime +from datetime import datetime, timezone import pytest - -from whoosh import analysis, fields, index, qparser, query, __version__ -from whoosh.compat import b, u, range, text_type, permutations +from whoosh import __version__, analysis, fields, index, qparser, query +from whoosh.compat import b, permutations, text_type, u from whoosh.filedb.filestore import RamStorage -from whoosh.writing import IndexingError -from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.util.numeric import byte_to_length, length_to_byte from whoosh.util.testing import TempIndex, TempStorage +from whoosh.writing import IndexingError def test_creation(): @@ -474,7 +472,7 @@ def test_noscorables1(): u("kilo"), u("lima"), ] - from random import choice, sample, randint + from random import choice, randint, sample times = 1000 @@ -631,15 +629,20 @@ def test_multivalue(): ) ix = RamStorage().create_index(schema) with ix.writer() as w: - w.add_document(id=1, date=datetime(2001, 1, 1), num=5) + w.add_document(id=1, date=datetime(2001, 1, 1, tzinfo=timezone.utc), num=5) w.add_document( - id=2, date=[datetime(2002, 2, 2), datetime(2003, 3, 3)], num=[1, 2, 3, 12] + id=2, + date=[ + datetime(2002, 2, 2, tzinfo=timezone.utc), + datetime(2003, 3, 3, tzinfo=timezone.utc), + ], + num=[1, 2, 3, 12], ) w.add_document(txt=u("a b c").split()) with ix.reader() as r: assert ("num", 3) in r - assert ("date", datetime(2003, 3, 3)) in r + assert ("date", datetime(2003, 3, 3, tzinfo=timezone.utc)) in r assert " ".join(r.field_terms("txt")) == "a b c" diff --git a/tests/test_matching.py b/tests/test_matching.py index 06358f1e..295d80df 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -1,12 +1,11 @@ -from __future__ import with_statement -from random import randint, choice, sample +from random import choice, randint, sample from whoosh import fields, matching, qparser, query -from whoosh.compat import b, u, range, permutations +from whoosh.compat import b, permutations, u from whoosh.filedb.filestore 
import RamStorage from whoosh.query import And, Term -from whoosh.util import make_binary_tree from whoosh.scoring import WeightScorer +from whoosh.util import make_binary_tree def _keys(searcher, docnums): @@ -501,12 +500,12 @@ def test_dismax(): def test_exclusion(): - from datetime import datetime + from datetime import datetime, timezone schema = fields.Schema(id=fields.ID(stored=True), date=fields.DATETIME) ix = RamStorage().create_index(schema) - dt1 = datetime(1950, 1, 1) - dt2 = datetime(1960, 1, 1) + dt1 = datetime(1950, 1, 1, tzinfo=timezone.utc) + dt2 = datetime(1960, 1, 1, tzinfo=timezone.utc) with ix.writer() as w: # Make 39 documents with dates != dt1 and then make a last document # with feed == dt1. diff --git a/tests/test_misc.py b/tests/test_misc.py index fe1bf151..5a4f2224 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -1,9 +1,10 @@ -from __future__ import with_statement -import os, threading, time +import os +import threading +import time from whoosh.compat import u from whoosh.util.filelock import try_for -from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.util.numeric import byte_to_length, length_to_byte from whoosh.util.testing import TempStorage @@ -16,7 +17,9 @@ def test_now(): def test_storage_creation(): - import tempfile, uuid + import tempfile + import uuid + from whoosh import fields from whoosh.filedb.filestore import FileStorage diff --git a/tests/test_mpwriter.py b/tests/test_mpwriter.py index d63506de..0703a849 100644 --- a/tests/test_mpwriter.py +++ b/tests/test_mpwriter.py @@ -1,12 +1,10 @@ -from __future__ import with_statement import random from collections import deque import pytest - from whoosh import fields, query -from whoosh.compat import u, izip, range, permutations, text_type -from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.compat import izip, permutations, text_type, u +from whoosh.util.numeric import byte_to_length, length_to_byte from whoosh.util.testing import TempIndex diff --git a/tests/test_nested.py b/tests/test_nested.py index 4da5274d..41dc704c 100644 --- a/tests/test_nested.py +++ b/tests/test_nested.py @@ -1,5 +1,3 @@ -from __future__ import with_statement - from whoosh import fields, query, sorting from whoosh.compat import u from whoosh.filedb.filestore import RamStorage diff --git a/tests/test_parse_plugins.py b/tests/test_parse_plugins.py index 706e8a11..eecfbe00 100644 --- a/tests/test_parse_plugins.py +++ b/tests/test_parse_plugins.py @@ -1,9 +1,8 @@ -from __future__ import with_statement import inspect -from datetime import datetime +from datetime import datetime, timezone from whoosh import analysis, fields, formats, qparser, query -from whoosh.compat import u, text_type, range +from whoosh.compat import text_type, u from whoosh.filedb.filestore import RamStorage from whoosh.qparser import dateparse, default, plugins, syntax from whoosh.util.times import adatetime @@ -39,7 +38,7 @@ def test_combos(): try: pis[i] = plugin(*init_args.get(plugin, ())) except TypeError: - raise TypeError("Error instantiating %s" % plugin) + raise TypeError(f"Error instantiating {plugin}") count = 0 for i, first in enumerate(pis): @@ -70,7 +69,7 @@ def test_dateparser(): def cb(arg): errs.append(arg) - basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) + basedate = datetime(2010, 9, 20, 15, 16, 6, 454000, tzinfo=timezone.utc) qp.add_plugin(dateparse.DateParserPlugin(basedate, callback=cb)) q = qp.parse(u("hello date:'last tuesday'")) @@ -119,7 +118,7 @@ def cb(arg): def 
test_date_range(): schema = fields.Schema(text=fields.TEXT, date=fields.DATETIME) qp = qparser.QueryParser("text", schema) - basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) + basedate = datetime(2010, 9, 20, 15, 16, 6, 454000, tzinfo=timezone.utc) qp.add_plugin(dateparse.DateParserPlugin(basedate)) q = qp.parse(u("date:['30 march' to 'next wednesday']")) @@ -156,7 +155,7 @@ def test_date_range(): def test_daterange_multi(): schema = fields.Schema(text=fields.TEXT, start=fields.DATETIME, end=fields.DATETIME) qp = qparser.QueryParser("text", schema) - basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) + basedate = datetime(2010, 9, 20, 15, 16, 6, 454000, tzinfo=timezone.utc) qp.add_plugin(dateparse.DateParserPlugin(basedate)) q = qp.parse("start:[2008 to] AND end:[2011 to 2011]") @@ -178,7 +177,11 @@ def test_daterange_empty_field(): writer.commit() with ix.searcher() as s: - q = query.DateRange("test", datetime.fromtimestamp(86400), datetime.today()) + q = query.DateRange( + "test", + datetime.fromtimestamp(86400, tz=timezone.utc), + datetime.now(tz=timezone.utc), + ) r = s.search(q) assert len(r) == 0 @@ -187,7 +190,7 @@ def test_free_dates(): a = analysis.StandardAnalyzer(stoplist=None) schema = fields.Schema(text=fields.TEXT(analyzer=a), date=fields.DATETIME) qp = qparser.QueryParser("text", schema) - basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) + basedate = datetime(2010, 9, 20, 15, 16, 6, 454000, tzinfo=timezone.utc) qp.add_plugin(dateparse.DateParserPlugin(basedate, free=True)) q = qp.parse(u("hello date:last tuesday")) @@ -367,7 +370,9 @@ def test_gtlt(): assert len(q) == 3 assert q[0] == query.Term("a", "hello") # As of this writing, date ranges don't support startexcl/endexcl - assert q[1] == query.DateRange("e", datetime(2001, 3, 29, 0, 0), None) + assert q[1] == query.DateRange( + "e", datetime(2001, 3, 29, 0, 0, tzinfo=timezone.utc), None + ) assert q[2] == query.Term("a", "there") q = qp.parse(u("a:> alfa c:<= bravo")) @@ -506,7 +511,7 @@ def test_fuzzy_prefix(): # Match -> fire is within 2 edits (transpose + delete) of first w.add_document(title=u("Fifth"), content=u("The fire is beautiful")) - from whoosh.qparser import QueryParser, FuzzyTermPlugin + from whoosh.qparser import FuzzyTermPlugin, QueryParser parser = QueryParser("content", ix.schema) parser.add_plugin(FuzzyTermPlugin()) diff --git a/tests/test_parsing.py b/tests/test_parsing.py index b1f9f799..66c56ee7 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -1,5 +1,5 @@ from whoosh import analysis, fields, query -from whoosh.compat import u, text_type +from whoosh.compat import text_type, u from whoosh.qparser import default, plugins @@ -642,20 +642,20 @@ def test_numeric_range(): teststart = 40 testend = 100 - q = qp.parse("[%s to *]" % teststart) + q = qp.parse(f"[{teststart} to *]") assert q == query.NullQuery - q = qp.parse("[%s to]" % teststart) + q = qp.parse(f"[{teststart} to]") assert q.__class__ == query.NumericRange assert q.start == teststart assert q.end is None - q = qp.parse("[to %s]" % testend) + q = qp.parse(f"[to {testend}]") assert q.__class__ == query.NumericRange assert q.start is None assert q.end == testend - q = qp.parse("[%s to %s]" % (teststart, testend)) + q = qp.parse(f"[{teststart} to {testend}]") assert q.__class__ == query.NumericRange assert q.start == teststart assert q.end == testend @@ -1106,10 +1106,10 @@ def test_quoted_prefix(): expr = r"(^|(?<=[ (]))(?P\w+|[*]):" qp.replace_plugin(plugins.FieldsPlugin(expr)) - q = qp.parse(u("foo 
url:http://apple.com:8080/bar* baz")) + q = qp.parse(u("foo url:https://apple.com:8080/bar* baz")) assert isinstance(q, query.And) assert q[0] == query.Term("f", "foo") - assert q[1] == query.Prefix("url", "http://apple.com:8080/bar") + assert q[1] == query.Prefix("url", "https://apple.com:8080/bar") assert q[2] == query.Term("f", "baz") assert len(q) == 3 @@ -1123,4 +1123,4 @@ def test_multitoken_with_factory(): qp = default.QueryParser("title", schema, group=og) querystring = "get my name/address" - userquery = qp.parse(querystring) + _ = qp.parse(querystring) diff --git a/tests/test_postings.py b/tests/test_postings.py index f586e1d8..6d836a19 100644 --- a/tests/test_postings.py +++ b/tests/test_postings.py @@ -1,11 +1,14 @@ -from __future__ import with_statement - from whoosh import analysis, fields -from whoosh.compat import u from whoosh.codec import default_codec -from whoosh.formats import Existence, Frequency -from whoosh.formats import Positions, PositionBoosts -from whoosh.formats import Characters, CharacterBoosts +from whoosh.compat import u +from whoosh.formats import ( + CharacterBoosts, + Characters, + Existence, + Frequency, + PositionBoosts, + Positions, +) from whoosh.util.testing import TempStorage diff --git a/tests/test_quality.py b/tests/test_quality.py index e051bd2f..845f9d1f 100644 --- a/tests/test_quality.py +++ b/tests/test_quality.py @@ -1,10 +1,9 @@ -from __future__ import with_statement import random from whoosh import fields, matching, scoring -from whoosh.compat import u, range +from whoosh.compat import u from whoosh.filedb.filestore import RamStorage -from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.util.numeric import byte_to_length, length_to_byte def _discreet(length): diff --git a/tests/test_queries.py b/tests/test_queries.py index 3e5f34e7..b9a6090e 100644 --- a/tests/test_queries.py +++ b/tests/test_queries.py @@ -1,35 +1,31 @@ -from __future__ import with_statement import copy import pytest - from whoosh import fields, qparser, query from whoosh.compat import b, u from whoosh.filedb.filestore import RamStorage from whoosh.qparser import QueryParser -from whoosh.query import And -from whoosh.query import AndMaybe -from whoosh.query import ConstantScoreQuery -from whoosh.query import DateRange -from whoosh.query import DisjunctionMax -from whoosh.query import Every -from whoosh.query import FuzzyTerm -from whoosh.query import Not -from whoosh.query import NullQuery -from whoosh.query import NumericRange -from whoosh.query import Or -from whoosh.query import Phrase -from whoosh.query import Prefix -from whoosh.query import Require -from whoosh.query import Term -from whoosh.query import TermRange -from whoosh.query import Variations -from whoosh.query import Wildcard -from whoosh.query.spans import SpanContains -from whoosh.query.spans import SpanFirst -from whoosh.query.spans import SpanNear -from whoosh.query.spans import SpanNot -from whoosh.query.spans import SpanOr +from whoosh.query import ( + And, + AndMaybe, + ConstantScoreQuery, + DateRange, + DisjunctionMax, + Every, + FuzzyTerm, + Not, + NullQuery, + NumericRange, + Or, + Phrase, + Prefix, + Require, + Term, + TermRange, + Variations, + Wildcard, +) +from whoosh.query.spans import SpanContains, SpanFirst, SpanNear, SpanNot, SpanOr from whoosh.util.testing import TempIndex @@ -96,7 +92,7 @@ def words(terms): q = query.Variations("value", "render") ts = q.existing_terms(r, expand=False) - assert ts == set([("value", b("render"))]) + assert ts == {("value", 
 
     ts = q.existing_terms(r, expand=True)
     assert words(ts) == b("render rendering renders")
@@ -394,14 +390,14 @@ def do(q1, q2):
 def test_requires():
     a = Term("f", u("a"))
     b = Term("f", u("b"))
-    assert And([a, b]).requires() == set([a, b])
+    assert And([a, b]).requires() == {a, b}
     assert Or([a, b]).requires() == set()
-    assert AndMaybe(a, b).requires() == set([a])
-    assert a.requires() == set([a])
+    assert AndMaybe(a, b).requires() == {a}
+    assert a.requires() == {a}
 
 
 def test_highlight_daterange():
-    from datetime import datetime
+    from datetime import datetime, timezone
 
     schema = fields.Schema(
         id=fields.ID(unique=True, stored=True),
@@ -416,7 +412,7 @@ def test_highlight_daterange():
         id=u("1"),
         title=u("Life Aquatic"),
         content=u("A nautic film crew sets out to kill a gigantic shark."),
-        released=datetime(2004, 12, 25),
+        released=datetime(2004, 12, 25, tzinfo=timezone.utc),
     )
     w.update_document(
         id=u("2"),
@@ -424,7 +420,7 @@
         content=u(
             "Three brothers meet in India for a life changing train " + "journey."
         ),
-        released=datetime(2007, 10, 27),
+        released=datetime(2007, 10, 27, tzinfo=timezone.utc),
     )
     w.commit()
 
@@ -437,7 +433,7 @@
         == 'for a life changing <b class="match term0">train</b> journey'
     )
 
-    r = s.search(DateRange("released", datetime(2007, 1, 1), None))
+    r = s.search(DateRange("released", datetime(2007, 1, 1, tzinfo=timezone.utc), None))
     assert len(r) == 1
     assert r[0].highlights("content") == ""
 
@@ -682,3 +678,209 @@ def test_andnot_reverse():
     assert len(names_fw) == len(names_rv) == 1
     assert names_fw == names_rv
+
+
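+# The tests below pin down NumericRange's constructor defaults: startexcl and
+# endexcl default to False, boost to 1.0, and constantscore to True. Note that
+# the constructor stores its arguments as-is without validating them, which is
+# why the "invalid" cases further down are accepted silently.
+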
+# NumericRange with valid fieldname, start, and end
+def test_valid_fieldname_start_end():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("number", 10, 5925)
+    assert nr.fieldname == "number"
+    assert nr.start == 10
+    assert nr.end == 5925
+    assert nr.startexcl is False
+    assert nr.endexcl is False
+    assert nr.boost == 1.0
+    assert nr.constantscore is True
+
+
+# NumericRange with valid fieldname, start, end, startexcl=True, and endexcl=True
+def test_valid_fieldname_start_end_startexcl_endexcl():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("number", 10, 5925, startexcl=True, endexcl=True)
+    assert nr.fieldname == "number"
+    assert nr.start == 10
+    assert nr.end == 5925
+    assert nr.startexcl is True
+    assert nr.endexcl is True
+    assert nr.boost == 1.0
+    assert nr.constantscore is True
+
+
+# NumericRange with valid fieldname, start, end, boost=2.0, and constantscore=False
+def test_valid_fieldname_start_end_boost_constantscore():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("number", 10, 5925, boost=2.0, constantscore=False)
+    assert nr.fieldname == "number"
+    assert nr.start == 10
+    assert nr.end == 5925
+    assert nr.startexcl is False
+    assert nr.endexcl is False
+    assert nr.boost == 2.0
+    assert nr.constantscore is False
+
+
+# NumericRange with valid fieldname, start=None, and end=None (a fully open range)
+def test_valid_fieldname_start_none_end_none():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("number", None, None)
+    assert nr.fieldname == "number"
+    assert nr.start is None
+    assert nr.end is None
+    assert nr.startexcl is False
+    assert nr.endexcl is False
+    assert nr.boost == 1.0
+    assert nr.constantscore is True
+
+
+# NumericRange with valid fieldname, start=0, and end=0
+def test_valid_fieldname_start_zero_end_zero():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("number", 0, 0)
+    assert nr.fieldname == "number"
+    assert nr.start == 0
+    assert nr.end == 0
+    assert nr.startexcl is False
+    assert nr.endexcl is False
+    assert nr.boost == 1.0
+    assert nr.constantscore is True
+
+
+# NumericRange with valid fieldname, start=-1, and end=1
+def test_valid_fieldname_start_minus_one_end_one():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("number", -1, 1)
+    assert nr.fieldname == "number"
+    assert nr.start == -1
+    assert nr.end == 1
+    assert nr.startexcl is False
+    assert nr.endexcl is False
+    assert nr.boost == 1.0
+    assert nr.constantscore is True
+
+
+# NumericRange with valid fieldname, start=1, and end=-1 (start greater than end)
+def test_valid_fieldname_start_greater_than_end():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("fieldname", 1, -1)
+    assert nr.fieldname == "fieldname"
+    assert nr.start == 1
+    assert nr.end == -1
+    assert nr.startexcl is False
+    assert nr.endexcl is False
+    assert nr.boost == 1.0
+    assert nr.constantscore is True
+
+
+# NumericRange with valid fieldname, start=1.5, and end=2.5
+def test_valid_fieldname_start_end_float():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("fieldname", 1.5, 2.5)
+    assert nr.fieldname == "fieldname"
+    assert nr.start == 1.5
+    assert nr.end == 2.5
+    assert nr.startexcl is False
+    assert nr.endexcl is False
+    assert nr.boost == 1.0
+    assert nr.constantscore is True
+
+
+# NumericRange with valid fieldname, start=1.5, and end=2.5, startexcl=True, and endexcl=True
+def test_valid_fieldname_start_end_excl():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("fieldname", 1.5, 2.5, startexcl=True, endexcl=True)
+    assert nr.fieldname == "fieldname"
+    assert nr.start == 1.5
+    assert nr.end == 2.5
+    assert nr.startexcl is True
+    assert nr.endexcl is True
+    assert nr.boost == 1.0
+    assert nr.constantscore is True
+
+
+# NumericRange with valid fieldname, start=1.5, end=2.5, boost=2.0, and constantscore=False
+def test_valid_fieldname_float_start_end_boost_constantscore():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("fieldname", 1.5, 2.5, boost=2.0, constantscore=False)
+    assert nr.fieldname == "fieldname"
+    assert nr.start == 1.5
+    assert nr.end == 2.5
+    assert nr.startexcl is False
+    assert nr.endexcl is False
+    assert nr.boost == 2.0
+    assert nr.constantscore is False
+
+
+# NumericRange with invalid boost: the constructor stores it without validation
+def test_invalid_boost():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("number", 10, 5925, boost="invalid")
+    assert nr.boost == "invalid"
+
+
+# NumericRange with valid start and invalid end
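+# (test not yet written; judging by test_invalid_boost above, the constructor
+# would be expected to store an invalid end value as-is)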
+
+
+# NumericRange with startexcl=True and endexcl=False
+def test_startexcl_true_endexcl_false():
+    """
+    Test NumericRange with startexcl=True and endexcl=False.
+    """
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("number", 10, 5925, startexcl=True, endexcl=False)
+
+    assert nr.fieldname == "number"
+    assert nr.start == 10
+    assert nr.end == 5925
+    assert nr.startexcl is True
+    assert nr.endexcl is False
+    assert nr.boost == 1.0
+    assert nr.constantscore is True
+
+
+# NumericRange with constantscore=False
+def test_constantscore_false():
+    """
+    Test that NumericRange does not raise an exception when constantscore is set to False.
+    """
+    from whoosh.query.ranges import NumericRange
+
+    NumericRange("number", 10, 5925, constantscore=False)
+
+
+# NumericRange with startexcl=True and endexcl=True
+def test_startexcl_true_endexcl_true():
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("number", 10, 5925, startexcl=True, endexcl=True)
+    assert nr.startexcl is True
+    assert nr.endexcl is True
+
+
+# NumericRange with a negative boost
+def test_numeric_range_with_negative_boost():
+    """
+    Test that NumericRange accepts a negative boost and stores it unchanged.
+    """
+    from whoosh.query.ranges import NumericRange
+
+    nr = NumericRange("number", 10, 5925, boost=-1.0)
+
+    assert nr.fieldname == "number"
+    assert nr.start == 10
+    assert nr.end == 5925
+    assert nr.startexcl is False
+    assert nr.endexcl is False
+    assert nr.boost == -1.0
+    assert nr.constantscore is True
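+
+
+# NOTE: the tests above only exercise NumericRange construction; parsing of
+# numeric range syntax is covered by test_numeric_range in tests/test_parsing.py.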
diff --git a/tests/test_reading.py b/tests/test_reading.py
index 0177f4b5..8311c66a 100644
--- a/tests/test_reading.py
+++ b/tests/test_reading.py
@@ -1,13 +1,12 @@
-# coding=utf-8
-from __future__ import with_statement
-import random, threading, time
+import random
+import threading
+import time
 
 import pytest
 
 from whoosh import fields, formats, reading
-
-from whoosh.compat import b, u, range
-from whoosh.reading import SegmentReader
+from whoosh.compat import b, u
 from whoosh.filedb.filestore import RamStorage
+from whoosh.reading import SegmentReader
 from whoosh.util.testing import TempIndex
 
@@ -124,20 +123,18 @@ def test_term_inspection():
     a_exp = list(r.expand_prefix("content", "a"))
     assert a_exp == [b("aa"), b("ab"), b("ax")]
 
-    assert set(r.all_terms()) == set(
-        [
-            ("content", b("aa")),
-            ("content", b("ab")),
-            ("content", b("ax")),
-            ("content", b("bb")),
-            ("content", b("cc")),
-            ("content", b("dd")),
-            ("content", b("ee")),
-            ("title", b("document")),
-            ("title", b("my")),
-            ("title", b("other")),
-        ]
-    )
+    assert set(r.all_terms()) == {
+        ("content", b("aa")),
+        ("content", b("ab")),
+        ("content", b("ax")),
+        ("content", b("bb")),
+        ("content", b("cc")),
+        ("content", b("dd")),
+        ("content", b("ee")),
+        ("title", b("document")),
+        ("title", b("my")),
+        ("title", b("other")),
+    }
 
     # (text, doc_freq, index_freq)
     cstats = _fstats(r.iter_field("content"))
@@ -495,8 +492,8 @@ def test_cursor():
 
 
 def _check_inspection_results(ix):
-    AE = "aé".encode("utf-8")
-    AU = "aú".encode("utf-8")
+    AE = "aé".encode()
+    AU = "aú".encode()
 
     with ix.reader() as r:
         cterms = " ".join(r.field_terms("content"))
@@ -506,20 +503,18 @@ def _check_inspection_results(ix):
         assert a_exp == [b("aa"), AE, AU]
 
         tset = set(r.all_terms())
-        assert tset == set(
-            [
-                ("content", b("aa")),
-                ("content", AE),
-                ("content", AU),
-                ("content", b("bb")),
-                ("content", b("cc")),
-                ("content", b("dd")),
-                ("content", b("ee")),
-                ("title", b("document")),
-                ("title", b("my")),
-                ("title", b("other")),
-            ]
-        )
+        assert tset == {
+            ("content", b("aa")),
+            ("content", AE),
+            ("content", AU),
+            ("content", b("bb")),
+            ("content", b("cc")),
+            ("content", b("dd")),
+            ("content", b("ee")),
+            ("title", b("document")),
+            ("title", b("my")),
+            ("title", b("other")),
+        }
 
         # (text, doc_freq, index_freq)
         assert _fstats(r.iter_field("content")) == [
diff --git a/tests/test_results.py b/tests/test_results.py
index b98f39c1..3494ea15 100644
--- a/tests/test_results.py
+++ b/tests/test_results.py
@@ -1,12 +1,9 @@
-from __future__ import with_statement
-
 import pytest
-
 from whoosh import analysis, fields, formats, highlight, qparser, query
 from whoosh.codec.whoosh3 import W3Codec
-from whoosh.compat import u, range, text_type, permutations
+from whoosh.compat import permutations, text_type, u
 from whoosh.filedb.filestore import RamStorage
-from whoosh.util.testing import TempStorage, TempIndex
+from whoosh.util.testing import TempIndex, TempStorage
 
 
 def test_score_retrieval():
@@ -215,8 +212,8 @@ def test_extend_filtered():
     hits = lambda result: [hit["id"] for hit in result]
 
    with ix.searcher() as s:
-        r1 = s.search(query.Term("text", u("alfa")), filter=set([1, 4]))
-        assert r1.allowed == set([1, 4])
+        r1 = s.search(query.Term("text", u("alfa")), filter={1, 4})
+        assert r1.allowed == {1, 4}
         assert len(r1.top_n) == 0
 
         r2 = s.search(query.Term("text", u("bravo")))
@@ -224,7 +221,7 @@ def test_extend_filtered():
         assert hits(r2) == [1, 2, 4]
 
         r3 = r1.copy()
-        assert r3.allowed == set([1, 4])
+        assert r3.allowed == {1, 4}
         assert len(r3.top_n) == 0
         r3.extend(r2)
         assert len(r3.top_n) == 3
diff --git a/tests/test_searching.py b/tests/test_searching.py
index b140aa56..3b4c7e56 100644
--- a/tests/test_searching.py
+++ b/tests/test_searching.py
@@ -1,15 +1,10 @@
-# encoding: utf-8
-
-from __future__ import with_statement
 import copy
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 
 import pytest
-
 from whoosh import analysis, fields, index, qparser, query, scoring
 from whoosh.codec.whoosh3 import W3Codec
-from whoosh.compat import b, u, text_type
-from whoosh.compat import range, permutations, izip_longest
+from whoosh.compat import b, izip_longest, permutations, text_type, u
 from whoosh.filedb.filestore import RamStorage
 from whoosh.util.testing import TempIndex
 
@@ -371,7 +366,7 @@ def test_open_numeric_ranges():
 
 
 def test_open_date_ranges():
-    basedate = datetime(2011, 1, 24, 6, 25, 0, 0)
+    basedate = datetime(2011, 1, 24, 6, 25, 0, 0, tzinfo=timezone.utc)
     domain = [basedate + timedelta(days=n) for n in range(-20, 20)]
 
     schema = fields.Schema(date=fields.DATETIME(stored=True))
@@ -387,13 +382,17 @@ def test_open_date_ranges():
         q = qp.parse("[2011-01-10 to]")
         r = [hit["date"] for hit in s.search(q, limit=None)]
         assert len(r) > 0
-        target = [d for d in domain if d >= datetime(2011, 1, 10, 6, 25)]
+        target = [
+            d for d in domain if d >= datetime(2011, 1, 10, 6, 25, tzinfo=timezone.utc)
+        ]
         assert r == target
 
         q = qp.parse("[to 2011-01-30]")
         r = [hit["date"] for hit in s.search(q, limit=None)]
         assert len(r) > 0
-        target = [d for d in domain if d <= datetime(2011, 1, 30, 6, 25)]
+        target = [
+            d for d in domain if d <= datetime(2011, 1, 30, 6, 25, tzinfo=timezone.utc)
+        ]
         assert r == target
 
         # With date parser
@@ -404,13 +403,17 @@
         q = qp.parse("[10 jan 2011 to]")
         r = [hit["date"] for hit in s.search(q, limit=None)]
         assert len(r) > 0
-        target = [d for d in domain if d >= datetime(2011, 1, 10, 6, 25)]
+        target = [
+            d for d in domain if d >= datetime(2011, 1, 10, 6, 25, tzinfo=timezone.utc)
+        ]
         assert r == target
 
         q = qp.parse("[to 30 jan 2011]")
         r = [hit["date"] for hit in s.search(q, limit=None)]
         assert len(r) > 0
-        target = [d for d in domain if d <= datetime(2011, 1, 30, 6, 25)]
+        target = [
+            d for d in domain if d <= datetime(2011, 1, 30, 6, 25, tzinfo=timezone.utc)
+        ]
         assert r == target
 
 
@@ -425,7 +428,7 @@ def test_negated_unlimited_ranges():
 
     domain = text_type(ascii_letters)
 
-    dt = datetime.now()
+    dt = datetime.now(tz=timezone.utc)
     for i, letter in enumerate(domain):
         w.add_document(id=letter, num=i, date=dt + timedelta(days=i))
     w.commit()
@@ -747,7 +750,7 @@ def test_short_prefix():
 
 
 def test_weighting():
-    from whoosh.scoring import Weighting, BaseScorer
+    from whoosh.scoring import BaseScorer, Weighting
 
     schema = fields.Schema(id=fields.ID(stored=True), n_comments=fields.STORED)
     st = RamStorage()
@@ -1642,7 +1645,7 @@ def test_groupedby_with_terms():
         assert len(r) == 2
         assert r.groups("organism") == {"mus": [1, 0]}
         assert r.has_matched_terms()
-        assert r.matched_terms() == set([("content", b("ipfstd1"))])
+        assert r.matched_terms() == {("content", b("ipfstd1"))}
 
 
 def test_buffered_refresh():
@@ -1703,7 +1706,7 @@ def test_terms_with_filter():
         w.add_document(text=u("hotel alfa bravo charlie"))
 
     with ix.searcher() as s:
-        workingset = set([1, 2, 3])
+        workingset = {1, 2, 3}
         q = query.Term("text", u("foxtrot"))
         r = s.search_page(q, pagenum=1, pagelen=5, terms=True, filter=workingset)
 
@@ -1868,7 +1871,7 @@ def pos_score_fn(searcher, fieldname, text, matcher):
         assert not m.supports_block_quality()
 
         r = s.search(q, limit=5)
-        ids = "".join(([hit["id"] for hit in r]))
+        ids = "".join([hit["id"] for hit in r])
         assert ids == "agmsb"
 
     q = query.Or(
diff --git a/tests/test_sorting.py b/tests/test_sorting.py
index 69881e9b..2883defd 100644
--- a/tests/test_sorting.py
+++ b/tests/test_sorting.py
@@ -1,14 +1,11 @@
-from __future__ import with_statement
-from datetime import datetime, timedelta
 import random
+from datetime import datetime, timedelta, timezone
 
-from whoosh import fields, query, sorting, columns
-from whoosh.compat import u
-from whoosh.compat import permutations, range
+from whoosh import columns, fields, query, sorting
+from whoosh.compat import permutations, u
 from whoosh.filedb.filestore import RamStorage
 from whoosh.util.testing import TempIndex
 
-
 try:
     import multiprocessing
 except ImportError:
@@ -255,7 +252,7 @@ def test_query_facet_overlap():
     ix = RamStorage().create_index(schema)
     with ix.writer() as w:
         for i, ltr in enumerate(domain):
-            v = "%s %s" % (ltr, domain[8 - i])
+            v = f"{ltr} {domain[8 - i]}"
            w.add_document(num=i, v=v)
 
     with ix.searcher() as s:
@@ -332,8 +329,8 @@ def test_date_facet():
     ix = RamStorage().create_index(schema)
     w = ix.writer()
-    d1 = datetime(2011, 7, 13)
-    d2 = datetime(1984, 3, 29)
+    d1 = datetime(2011, 7, 13)  # noqa: DTZ001
+    d2 = datetime(1984, 3, 29)  # noqa: DTZ001
     w.add_document(id=0, date=d1)
     w.add_document(id=1, date=d1)
     w.add_document(id=2)
@@ -395,24 +392,36 @@ def test_daterange_facet():
     schema = fields.Schema(id=fields.STORED, date=fields.DATETIME)
     ix = RamStorage().create_index(schema)
     w = ix.writer()
-    w.add_document(id=0, date=datetime(2001, 1, 15))
-    w.add_document(id=1, date=datetime(2001, 1, 10))
+    w.add_document(id=0, date=datetime(2001, 1, 15, tzinfo=timezone.utc))
+    w.add_document(id=1, date=datetime(2001, 1, 10, tzinfo=timezone.utc))
     w.add_document(id=2)
-    w.add_document(id=3, date=datetime(2001, 1, 3))
-    w.add_document(id=4, date=datetime(2001, 1, 8))
-    w.add_document(id=5, date=datetime(2001, 1, 6))
+    w.add_document(id=3, date=datetime(2001, 1, 3, tzinfo=timezone.utc))
+    w.add_document(id=4, date=datetime(2001, 1, 8, tzinfo=timezone.utc))
+    w.add_document(id=5, date=datetime(2001, 1, 6, tzinfo=timezone.utc))
     w.commit()
 
     with ix.searcher() as s:
         rf = sorting.DateRangeFacet(
-            "date", datetime(2001, 1, 1), datetime(2001, 1, 20), timedelta(days=5)
+            "date",
+            datetime(2001, 1, 1, tzinfo=timezone.utc),
+            datetime(2001, 1, 20, tzinfo=timezone.utc),
+            timedelta(days=5),
         )
         r = s.search(query.Every(), groupedby={"date": rf})
         dt = datetime
         assert r.groups("date") == {
-            (dt(2001, 1, 1, 0, 0), dt(2001, 1, 6, 0, 0)): [3],
-            (dt(2001, 1, 6, 0, 0), dt(2001, 1, 11, 0, 0)): [1, 4, 5],
-            (dt(2001, 1, 11, 0, 0), dt(2001, 1, 16, 0, 0)): [0],
+            (
+                dt(2001, 1, 1, 0, 0, tzinfo=timezone.utc),
+                dt(2001, 1, 6, 0, 0, tzinfo=timezone.utc),
+            ): [3],
+            (
+                dt(2001, 1, 6, 0, 0, tzinfo=timezone.utc),
+                dt(2001, 1, 11, 0, 0, tzinfo=timezone.utc),
+            ): [1, 4, 5],
+            (
+                dt(2001, 1, 11, 0, 0, tzinfo=timezone.utc),
+                dt(2001, 1, 16, 0, 0, tzinfo=timezone.utc),
+            ): [0],
             None: [2],
         }
 
@@ -424,30 +433,68 @@ def test_relative_daterange():
     schema = fields.Schema(id=fields.STORED, date=fields.DATETIME)
     ix = RamStorage().create_index(schema)
-    basedate = datetime(2001, 1, 1)
+    basedate = datetime(2001, 1, 1, tzinfo=timezone.utc)
     count = 0
     with ix.writer() as w:
-        while basedate < datetime(2001, 12, 1):
+        while basedate < datetime(2001, 12, 1, tzinfo=timezone.utc):
             w.add_document(id=count, date=basedate)
             basedate += timedelta(days=14, hours=16)
             count += 1
 
     with ix.searcher() as s:
         gap = relativedelta(months=1)
-        rf = sorting.DateRangeFacet("date", dt(2001, 1, 1), dt(2001, 12, 31), gap)
+        rf = sorting.DateRangeFacet(
+            "date",
+            dt(2001, 1, 1, tzinfo=timezone.utc),
+            dt(2001, 12, 31, tzinfo=timezone.utc),
+            gap,
+        )
         r = s.search(query.Every(), groupedby={"date": rf})
         assert r.groups("date") == {
-            (dt(2001, 1, 1), dt(2001, 2, 1)): [0, 1, 2],
-            (dt(2001, 2, 1), dt(2001, 3, 1)): [3, 4],
-            (dt(2001, 3, 1), dt(2001, 4, 1)): [5, 6],
-            (dt(2001, 4, 1), dt(2001, 5, 1)): [7, 8],
-            (dt(2001, 5, 1), dt(2001, 6, 1)): [9, 10],
-            (dt(2001, 6, 1), dt(2001, 7, 1)): [11, 12],
-            (dt(2001, 7, 1), dt(2001, 8, 1)): [13, 14],
-            (dt(2001, 8, 1), dt(2001, 9, 1)): [15, 16],
-            (dt(2001, 9, 1), dt(2001, 10, 1)): [17, 18],
-            (dt(2001, 10, 1), dt(2001, 11, 1)): [19, 20],
-            (dt(2001, 11, 1), dt(2001, 12, 1)): [21, 22],
+            (
+                dt(2001, 1, 1, tzinfo=timezone.utc),
+                dt(2001, 2, 1, tzinfo=timezone.utc),
+            ): [0, 1, 2],
+            (
+                dt(2001, 2, 1, tzinfo=timezone.utc),
+                dt(2001, 3, 1, tzinfo=timezone.utc),
+            ): [3, 4],
+            (
+                dt(2001, 3, 1, tzinfo=timezone.utc),
+                dt(2001, 4, 1, tzinfo=timezone.utc),
+            ): [5, 6],
+            (
+                dt(2001, 4, 1, tzinfo=timezone.utc),
+                dt(2001, 5, 1, tzinfo=timezone.utc),
+            ): [7, 8],
+            (
+                dt(2001, 5, 1, tzinfo=timezone.utc),
+                dt(2001, 6, 1, tzinfo=timezone.utc),
+            ): [9, 10],
+            (
+                dt(2001, 6, 1, tzinfo=timezone.utc),
+                dt(2001, 7, 1, tzinfo=timezone.utc),
+            ): [11, 12],
+            (
+                dt(2001, 7, 1, tzinfo=timezone.utc),
+                dt(2001, 8, 1, tzinfo=timezone.utc),
+            ): [13, 14],
+            (
+                dt(2001, 8, 1, tzinfo=timezone.utc),
+                dt(2001, 9, 1, tzinfo=timezone.utc),
+            ): [15, 16],
+            (
+                dt(2001, 9, 1, tzinfo=timezone.utc),
+                dt(2001, 10, 1, tzinfo=timezone.utc),
+            ): [17, 18],
+            (
+                dt(2001, 10, 1, tzinfo=timezone.utc),
+                dt(2001, 11, 1, tzinfo=timezone.utc),
+            ): [19, 20],
+            (
+                dt(2001, 11, 1, tzinfo=timezone.utc),
+                dt(2001, 12, 1, tzinfo=timezone.utc),
+            ): [21, 22],
         }
 
@@ -563,7 +610,7 @@ def test_multifacet():
     with ix.searcher() as s:
         facet = sorting.MultiFacet(["tag", "size"])
         r = s.search(query.Every(), groupedby={"tag/size": facet})
-        cats = r.groups(("tag/size"))
+        cats = r.groups("tag/size")
         assert cats == correct
 
diff --git a/tests/test_spans.py b/tests/test_spans.py
index 4587afe1..ffb2e269 100644
--- a/tests/test_spans.py
+++ b/tests/test_spans.py
@@ -1,13 +1,9 @@
-from __future__ import with_statement
-
 from whoosh import analysis, fields, formats
-from whoosh.compat import u, range, permutations
+from whoosh.compat import permutations, u
 from whoosh.filedb.filestore import RamStorage
-from whoosh.query import spans
-from whoosh.query import And, Or, Term, Phrase
+from whoosh.query import And, Or, Phrase, Term, spans
 from whoosh.util.testing import TempIndex
 
-
 domain = ("alfa", "bravo", "bravo", "charlie", "delta", "echo")
 
 _ix = None
diff --git a/tests/test_spelling.py b/tests/test_spelling.py
index 93878126..8a313eb1 100644
--- a/tests/test_spelling.py
+++ b/tests/test_spelling.py
@@ -1,4 +1,3 @@
-from __future__ import with_statement
 import gzip
 
 from whoosh import analysis, fields, highlight, query, spelling
@@ -7,7 +6,6 @@
 from whoosh.support.levenshtein import levenshtein
 from whoosh.util.testing import TempIndex
 
-
 _wordlist = sorted(
     u(
         "render animation animate shader shading zebra koala"
diff --git a/tests/test_stem.py b/tests/test_stem.py
index cbfa274b..0912d8d5 100644
--- a/tests/test_stem.py
+++ b/tests/test_stem.py
@@ -1,6 +1,6 @@
 from whoosh.lang.snowball.english import EnglishStemmer
-from whoosh.lang.snowball.french import FrenchStemmer
 from whoosh.lang.snowball.finnish import FinnishStemmer
+from whoosh.lang.snowball.french import FrenchStemmer
 from whoosh.lang.snowball.spanish import SpanishStemmer
 
diff --git a/tests/test_tables.py b/tests/test_tables.py
index d142e55f..480f3633 100644
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -1,10 +1,6 @@
-# encoding: utf-8
-
-from __future__ import with_statement
-
 import random
 
-from whoosh.compat import b, iteritems, range
+from whoosh.compat import b, iteritems
 from whoosh.filedb.filestore import RamStorage
 from whoosh.filedb.filetables import (
     HashReader,
@@ -71,7 +67,7 @@ def test_hash_contents():
         ("whiskey", "xray"),
     ]
     # Convert to bytes
-    samp = set((b(k), b(v)) for k, v in samp)
+    samp = {(b(k), b(v)) for k, v in samp}
 
     with TempStorage("hashcontents") as st:
         hw = HashWriter(st.create_file("test.hsh"))
@@ -85,8 +81,8 @@ def test_hash_contents():
         for key, value in probes:
             assert hr[key] == value
 
-        assert set(hr.keys()) == set([k for k, v in samp])
-        assert set(hr.values()) == set([v for k, v in samp])
+        assert set(hr.keys()) == {k for k, v in samp}
+        assert set(hr.values()) == {v for k, v in samp}
         assert set(hr.items()) == samp
         hr.close()
 
@@ -104,7 +100,7 @@ def randstring():
         return b(s)
 
     with TempStorage("randomhash") as st:
-        samp = dict((randstring(), randstring()) for _ in range(times))
+        samp = {randstring(): randstring() for _ in range(times)}
 
         hw = HashWriter(st.create_file("test.hsh"))
         for k, v in iteritems(samp):
@@ -123,14 +119,14 @@ def test_random_access():
     times = 1000
     with TempStorage("orderedhash") as st:
         hw = HashWriter(st.create_file("test.hsh"))
-        hw.add_all((b("%08x" % x), b(str(x))) for x in range(times))
+        hw.add_all((b(f"{x:08x}"), b(str(x))) for x in range(times))
         hw.close()
 
         keys = list(range(times))
         random.shuffle(keys)
         hr = HashReader.open(st, "test.hsh")
         for x in keys:
-            assert hr[b("%08x" % x)] == b(str(x))
+            assert hr[b(f"{x:08x}")] == b(str(x))
         hr.close()
 
diff --git a/tests/test_vectors.py b/tests/test_vectors.py
index cdd13148..339859f3 100644
--- a/tests/test_vectors.py
+++ b/tests/test_vectors.py
@@ -1,6 +1,3 @@
-# encoding: utf-8
-from __future__ import with_statement
-
 from whoosh import fields, formats
 from whoosh.compat import u
 from whoosh.filedb.filestore import RamStorage
diff --git a/tests/test_weightings.py b/tests/test_weightings.py
index 8a51440f..5a275355 100644
--- a/tests/test_weightings.py
+++ b/tests/test_weightings.py
@@ -1,10 +1,9 @@
-from __future__ import with_statement
 import inspect
-from random import choice, randint
 import sys
+from random import choice, randint
 
 from whoosh import fields, query, scoring
-from whoosh.compat import u, range, permutations
+from whoosh.compat import permutations, u
 from whoosh.filedb.filestore import RamStorage
 
 
@@ -24,9 +23,7 @@ def test_all():
     ix = storage.create_index(schema)
     w = ix.writer()
     for _ in range(100):
-        w.add_document(
-            text=u(" ").join(choice(domain) for _ in range(randint(10, 20)))
-        )
+        w.add_document(text=u(" ").join(choice(domain) for _ in range(randint(10, 20))))
     w.commit()
 
     # List ABCs that should not be tested
@@ -46,15 +43,15 @@ def test_all():
             weighting = wclass()
         except TypeError:
             e = sys.exc_info()[1]
-            raise TypeError("Error instantiating %r: %s" % (wclass, e))
+            raise TypeError(f"Error instantiating {wclass!r}: {e}")
 
         with ix.searcher(weighting=weighting) as s:
             try:
                 for word in domain:
                     s.search(query.Term("text", word))
-            except Exception:
+            except ValueError:
                 e = sys.exc_info()[1]
-                e.msg = "Error searching with %r: %s" % (wclass, e)
+                e.msg = f"Error searching with {wclass!r}: {e}"
                 raise
 
diff --git a/tests/test_writing.py b/tests/test_writing.py
index de032b6e..ff713082 100644
--- a/tests/test_writing.py
+++ b/tests/test_writing.py
@@ -1,10 +1,10 @@
-from __future__ import with_statement
-import random, time, threading
+import random
+import threading
+import time
 
 import pytest
-
 from whoosh import analysis, fields, query, writing
-from whoosh.compat import b, u, range, text_type
+from whoosh.compat import b, text_type, u
 from whoosh.filedb.filestore import RamStorage
 from whoosh.util.testing import TempIndex