Merge branch 'main' into patch-2

Sygil-Dev · Jan 4, 2024 · c7bd0df · c7bd0df
2 parents 506c16c + ed74e95
commit c7bd0df
Show file tree

Hide file tree

Showing 26 changed files with 259 additions and 77 deletions.
diff --git a/.github/ISSUE_TEMPLATE/sweep-template.yml b/.github/ISSUE_TEMPLATE/sweep-template.yml
@@ -0,0 +1,15 @@
+name: Sweep Issue
+title: 'Sweep: '
+description: For small bugs, features, refactors, and tests to be handled by Sweep, an AI-powered junior developer.
+labels: sweep
+body:
+  - type: textarea
+    id: description
+    attributes:
+      label: Details
+      description: Tell Sweep where and what to edit and provide enough context for a new developer to the codebase
+      placeholder: |
+        Unit Tests: Write unit tests for <FILE>. Test each function in the file. Make sure to test edge cases.
+        Bugs: The bug might be in <FILE>. Here are the logs: ...
+        Features: the new endpoint should use the ... class from <FILE> because it contains ... logic.
+        Refactors: We are migrating this function to ... version because ...
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -0,0 +1,37 @@
+name: Python package
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.6, 3.7, 3.8, 3.9]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pip install jieba
+        pytest
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,28 @@
+---
+name: Test
+
+on: [pull_request, push]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.8, 3.9, "3.10", 3.11, 3.12]
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip setuptools wheel
+        pip install pytest pytest-cov coverage cached-property
+          python setup.py clean build install
+    - name: Run test
+      run: pytest
+    - name: Upload coverage reports to Codecov
+      uses: codecov/codecov-action@v3
+      env:
+        CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,10 @@
 *.pyc
 __pycache__/
 .tox/
+env/
+build/
+dist/
+eggs/
+.eggs/
+*.egg
+*.egg-info/
diff --git a/.travis.yml b/.travis.yml
diff --git a/README.md b/README.md
@@ -29,10 +29,7 @@ Whoosh might be useful in the following circumstances:
 * When an easy-to-use Pythonic interface is more important to you than raw
   speed. 
 
-Whoosh was created and is maintained by Matt Chaput. It was originally created
-for use in the online help system of Side Effects Software's 3D animation
-software Houdini. Side Effects Software Inc. graciously agreed to open-source
-the code.
+Whoosh was created by Matt Chaput and is currently maintained by the Sygil-Dev Organization. It was originally created for use in the online help system of Side Effects Software's 3D animation software Houdini. Side Effects Software Inc. graciously agreed to open-source the code.
 
 This software is licensed under the terms of the simplified BSD (A.K.A. "two
 clause" or "FreeBSD") license. See LICENSE.txt for information.
@@ -43,28 +40,29 @@ Installing Whoosh
 If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install``
 or ``pip`` to download and install Whoosh automatically::
 
+    # Install the old version from PyPI
     $ easy_install Whoosh
-
+    
     or
-
+    
     $ pip install Whoosh
+
+
+    # Install the development version from GitHub.
+    $ pip install git+https://github.com/Sygil-Dev/whoosh.git
 
 Learning more
 =============
 
-* Read the online documentation at https://whoosh.readthedocs.org/en/latest/
+* Read the online documentation at https://docs.red-dove.com/whoosh/ (Search DOES work).
 
-* Join the Whoosh mailing list at http://groups.google.com/group/whoosh
+* Read the old online documentation at https://whoosh.readthedocs.org/en/latest/ (Search DOES NOT work).
 
-* File bug reports and view the Whoosh wiki at
-  http://bitbucket.org/mchaput/whoosh/
+* File bug reports and issues at https://github.com/Sygil-Dev/whoosh/issues
 
-Getting the source
+Getting the source.
 ==================
 
-Download source releases from PyPI at http://pypi.python.org/pypi/Whoosh/
-
-You can check out the latest version of the source code using Mercurial::
-
-    hg clone http://bitbucket.org/mchaput/whoosh
+You can check out the latest version of the source code on GitHub using git:
 
+    $ git clone https://github.com/Sygil-Dev/whoosh.git
diff --git a/setup.py b/setup.py
@@ -1,6 +1,8 @@
 #!python
 
-import os.path, sys
+import os.path
+import sys
+
 from setuptools import setup, find_packages
 from setuptools.command.test import test as TestCommand
 
@@ -20,7 +22,7 @@ def finalize_options(self):
         self.test_suite = True
 
     def run_tests(self):
-        #import here, cause outside the eggs aren't loaded
+        # import here, cause outside the eggs aren't loaded
         import pytest
         pytest.main(self.test_args)
 
@@ -43,19 +45,28 @@ def run_tests(self):
         url="http://bitbucket.org/mchaput/whoosh",
 
         zip_safe=True,
-        install_requires=['cached-property'],
-        tests_require=['pytest'],
+        install_requires=['cached-property', 'jieba'],
+        tests_require=['pytest', 'jieba'],
         cmdclass={'test': PyTest},
 
         classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "License :: OSI Approved :: BSD License",
-        "Natural Language :: English",
-        "Operating System :: OS Independent",
-        "Programming Language :: Python :: 2.5",
-        "Programming Language :: Python :: 3",
-        "Topic :: Software Development :: Libraries :: Python Modules",
-        "Topic :: Text Processing :: Indexing",
+            "Programming Language :: Python :: 3",
+            "Development Status :: 5 - Production/Stable",
+            "Intended Audience :: Developers",
+            "License :: OSI Approved :: BSD License",
+            "Natural Language :: English",
+            "Operating System :: OS Independent",
+            "Programming Language :: Python :: 2.7",
+            "Programming Language :: Python :: 3.4",
+            "Programming Language :: Python :: 3.5",
+            "Programming Language :: Python :: 3.6",
+            "Programming Language :: Python :: 3.7",
+            "Programming Language :: Python :: 3.8",
+            "Programming Language :: Python :: 3.9",
+            "Programming Language :: Python :: 3.10",
+            "Programming Language :: Python :: 3.11",
+            "Programming Language :: Python :: 3.12",
+            "Topic :: Software Development :: Libraries :: Python Modules",
+            "Topic :: Text Processing :: Indexing",
         ],
     )
diff --git a/src/whoosh/analysis/filters.py b/src/whoosh/analysis/filters.py
@@ -53,7 +53,7 @@
     \\S+?                  # URL body
     (?=\\s|[.]\\s|$|[.]$)  # Stop at space/end, or a dot followed by space/end
 ) | (                      # or...
-    \w+([:.]?\w+)*         # word characters, with opt. internal colons/dots
+    \\w+([:.]?\\w+)*         # word characters, with opt. internal colons/dots
 )
 """, verbose=True)
 
@@ -145,7 +145,7 @@ def __call__(self, tokens):
 
 
 class TeeFilter(Filter):
-    """Interleaves the results of two or more filters (or filter chains).
+    r"""Interleaves the results of two or more filters (or filter chains).
 
     NOTE: because it needs to create copies of each token for each sub-filter,
     this filter is quite slow.

diff --git a/src/whoosh/analysis/intraword.py b/src/whoosh/analysis/intraword.py
@@ -34,7 +34,7 @@
 
 
 class CompoundWordFilter(Filter):
-    """Given a set of words (or any object with a ``__contains__`` method),
+    r"""Given a set of words (or any object with a ``__contains__`` method),
     break any tokens in the stream that are composites of words in the word set
     into their individual parts.
 
@@ -272,7 +272,7 @@ class IntraWordFilter(Filter):
     >>> iwf_i = IntraWordFilter(mergewords=True, mergenums=True)
     >>> iwf_q = IntraWordFilter(mergewords=False, mergenums=False)
     >>> iwf = MultiFilter(index=iwf_i, query=iwf_q)
-    >>> analyzer = RegexTokenizer(r"\S+") | iwf | LowercaseFilter()
+    >>> analyzer = RegexTokenizer(r"\\S+") | iwf | LowercaseFilter()
 
     (See :class:`MultiFilter`.)
     """
@@ -282,7 +282,7 @@ class IntraWordFilter(Filter):
     __inittypes__ = dict(delims=text_type, splitwords=bool, splitnums=bool,
                          mergewords=bool, mergenums=bool)
 
-    def __init__(self, delims=u("-_'\"()!@#$%^&*[]{}<>\|;:,./?`~=+"),
+    def __init__(self, delims=u("-_'\"()!@#$%^&*[]{}<>\\|;:,./?`~=+"),
                  splitwords=True, splitnums=True,
                  mergewords=False, mergenums=False):
         """

diff --git a/src/whoosh/analysis/tokenizers.py b/src/whoosh/analysis/tokenizers.py
@@ -30,7 +30,7 @@
 from whoosh.util.text import rcompile
 
 
-default_pattern = rcompile(r"\w+(\.?\w+)*")
+default_pattern = rcompile(r"[\w\*]+(\.?[\w\*]+)*")
 
 
 # Tokenizers

diff --git a/src/whoosh/fields.py b/src/whoosh/fields.py
@@ -885,7 +885,7 @@ def parse_query(self, fieldname, qstring, boost=1.0):
         if is_ambiguous(at):
             startnum = datetime_to_long(at.floor())
             endnum = datetime_to_long(at.ceil())
-            return query.NumericRange(fieldname, startnum, endnum)
+            return query.NumericRange(fieldname, startnum, endnum, boost=boost)
         else:
             return query.Term(fieldname, at, boost=boost)
 
@@ -1224,8 +1224,12 @@ def self_parsing(self):
     def parse_query(self, fieldname, qstring, boost=1.0):
         from whoosh import query
 
-        terms = [query.Term(fieldname, g)
-                 for g in self.process_text(qstring, mode='query')]
+        terms = []
+        for g in self.process_text(qstring, mode='query'):
+            if g == "*":
+                terms.append(query.Wildcard(fieldname, g, boost=boost))
+            else:
+                terms.append(query.Term(fieldname, g, boost=boost))
         cls = query.Or if self.queryor else query.And
 
         return cls(terms, boost=boost)

diff --git a/src/whoosh/filedb/filetables.py b/src/whoosh/filedb/filetables.py
@@ -30,7 +30,7 @@
 D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html).
 """
 
-import os, struct
+import os, struct, sys
 from binascii import crc32
 from hashlib import md5  # @UnresolvedImport
 
@@ -56,7 +56,9 @@ def cdb_hash(key):
 
 
 def md5_hash(key):
-    return int(md5(key).hexdigest(), 16) & 0xffffffff
+    if sys.version_info[0] < 3 or sys.version_info[1] < 9:
+        return int(md5(key).hexdigest(), 16) & 0xffffffff
+    return int(md5(key, usedforsecurity=False).hexdigest(), 16) & 0xffffffff
 
 
 def crc_hash(key):

diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py
@@ -131,8 +131,8 @@ def __init__(self, text, matches, startchar=0, endchar=-1):
                 self.matched_terms.add(t.text)
 
     def __repr__(self):
-        return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
-                                        len(self.matches))
+        return "<Fragment %d:%d has %d matches>" % (self.startchar, self.endchar,
+                                                    len(self.matches))
 
     def __len__(self):
         return self.endchar - self.startchar
@@ -695,7 +695,12 @@ def format_fragment(self, fragment, replace=False):
         index = fragment.startchar
         text = fragment.text
 
-        for t in fragment.matches:
+        # For overlapping tokens (such as in Chinese), sort by position,
+        # then by inverse of length.
+        # Because the formatter is sequential, it will only pick the first
+        # token for a given position to highlight. This makes sure it picks
+        # the longest overlapping token.
+        for t in sorted(fragment.matches, key=lambda token: (token.startchar, -(token.endchar - token.startchar))):
             if t.startchar is None:
                 continue
             if t.startchar < index:

diff --git a/src/whoosh/lang/paicehusk.py b/src/whoosh/lang/paicehusk.py
@@ -30,7 +30,7 @@ class PaiceHuskStemmer(object):
     (?P<cont>[.>])
     """, re.UNICODE | re.VERBOSE)
 
-    stem_expr = re.compile("^\w+", re.UNICODE)
+    stem_expr = re.compile(r"^\w+", re.UNICODE)
 
     def __init__(self, ruletable):
         """

diff --git a/src/whoosh/lang/porter2.py b/src/whoosh/lang/porter2.py
@@ -64,7 +64,7 @@ def remove_initial_apostrophe(word):
 def capitalize_consonant_ys(word):
     if word.startswith('y'):
         word = 'Y' + word[1:]
-    return ccy_exp.sub('\g<1>Y', word)
+    return ccy_exp.sub(r'\g<1>Y', word)
 
 
 def step_0(word):

diff --git a/src/whoosh/matching/binary.py b/src/whoosh/matching/binary.py
@@ -395,11 +395,11 @@ def skip_to_quality(self, minquality):
         skipped = 0
         aq = a.block_quality()
         bq = b.block_quality()
-        while a.is_active() and b.is_active() and max(aq, bq) <= minquality:
-            if aq <= minquality:
+        while a.is_active() and b.is_active() and max(aq, bq) < minquality:
+            if aq < minquality:
                 skipped += a.skip_to_quality(minquality)
                 aq = a.block_quality()
-            if bq <= minquality:
+            if bq < minquality:
                 skipped += b.skip_to_quality(minquality)
                 bq = b.block_quality()
         return skipped

diff --git a/src/whoosh/qparser/default.py b/src/whoosh/qparser/default.py
@@ -92,7 +92,8 @@ def default_set(self):
 
         from whoosh.qparser import plugins
 
-        return [plugins.WhitespacePlugin(),
+        return [
+                # plugins.WhitespacePlugin(),
                 plugins.SingleQuotePlugin(),
                 plugins.FieldsPlugin(),
                 plugins.WildcardPlugin(),