Skip to content

Commit

Permalink
do not group docs / paginate search results (#20)
Browse files Browse the repository at this point in the history
  • Loading branch information
ciur authored Aug 5, 2024
1 parent 1e73500 commit 5901222
Show file tree
Hide file tree
Showing 8 changed files with 290 additions and 387 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## 0.6.0 - 2024-08-05

- Do **not** group search results by `document_id`
- Paginate search results

## 0.3.10 - 2024-04-21

- Add extra logging/debug statements
261 changes: 137 additions & 124 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
[tool.poetry]
name = "salinic"
version = "0.5.0"
version = "0.6.0"
description = "Search abstraction layer"
authors = ["Eugen Ciur <[email protected]>"]
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.11, <4.0"
pydantic = "^2.7"
pydantic = "^2.8"
requests = "^2.31.0"
xapianpy = {version = "1.4.22.post2406040406", optional = true}
glom = "^23.5.0"
Expand Down
9 changes: 2 additions & 7 deletions salinic/backends/solr/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,8 @@ class ClientRW(Base):
def search(self, sq: SearchQuery, user_id: str | None = None):
payload = {
'q': sq.query.original_query,
'group': 'true',
'group.field': 'document_id',
'rows': sq.rows,
'start': sq.start,
'group.limit': sq.group_limit,
'group.offset': sq.group_offset,
'group.sort': 'page_number asc'
'rows': sq.page_size,
'start': sq.page_size * (sq.page_number - 1),
}

if user_id:
Expand Down
156 changes: 37 additions & 119 deletions salinic/backends/solr/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from salinic.field import Field
from salinic.query import SearchQuery
from salinic.schema import Document, Folder, Page
from salinic.schema import DocumentPage, Folder, PaginatedResponse
from salinic.utils import first

logger = logging.getLogger(__name__)
Expand All @@ -21,129 +21,47 @@ def search(
self,
sq: SearchQuery,
user_id: str | None = None
) -> list[Document | Folder]:
"""Query index
Solr results are grouped by `document_id` field: this way
all folder entries will be part of group with `document_id=null`,
while all page entities will be grouped per document i.e.
pages which belong together are all in the same group.
{
"responseHeader":{
...
"grouped":{
"document_id":{
"matches":26,
"groups":[
"groupValue":null,
"doclist":{"numFound":4,"start":0,"numFoundExact":true,"docs":[
{
"id":"0b663599-32b1-4396-8dbe-ae7cd327cec6",
"lang":"en",
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
"entity_type":"folder",
"title_txt_en":"A2 updated",
"_version_":1801539995817738240},
{
"id":"768c6841-d37a-4d02-857f-ab7eaf69b27e",
"lang":"en",
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
"entity_type":"folder",
"title_txt_en":".inbox",
"_version_":1801539995692957696}]
}},
{
"groupValue":"9bc57688-302e-4e1f-840a-c747dcccb362",
"doclist":{"numFound":5,"start":0,"numFoundExact":true,"docs":[
{
"id":"a6e4916f-dea6-414b-aa38-f5b9ea375725",
"document_id":"9bc57688-302e-4e1f-840a-c747dcccb362",
"lang":"en",
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
"page_number":1,
"entity_type":"page",
"title_txt_en":"brother_004603.pdf",
"_version_":1801539996374532096},
{
"id":"72f6ca9e-af4b-4235-a56c-a62508e24efe",
"document_id":"9bc57688-302e-4e1f-840a-c747dcccb362",
"lang":"en",
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
"page_number":2,
"entity_type":"page",
"title_txt_en":"brother_004603.pdf",
"_version_":1801539996403892224},]
}},
{
"groupValue":"200b0201-cfcd-43df-b41f-f1732568a0d2",
"doclist":{"numFound":2,"start":0,"numFoundExact":true,"docs":[
{
"id":"9fa936e6-fe94-46bf-ad01-d8591cc290d4",
"document_id":"200b0201-cfcd-43df-b41f-f1732568a0d2",
"lang":"en",
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
"page_number":1,
"entity_type":"page",
"title_txt_en":"brother_004598.pdf",
"_version_":1801539995874361344},
{
"id":"c364994c-eab5-4c6a-842a-6f40537f7a2e",
"document_id":"200b0201-cfcd-43df-b41f-f1732568a0d2",
"lang":"en",
"user_id":"4cee7c39-7c34-4cc5-8543-42a8c88c9fe6",
"page_number":2,
"entity_type":"page",
"title_txt_en":"brother_004598.pdf",
"_version_":1801539995910012928}]
}},
}}]}}}
"""
) -> PaginatedResponse:
"""Query index"""
result = self.client.search(sq, user_id)
grouped = glom(result, 'grouped.document_id')
if glom(grouped, 'matches') == 0:
return []

result = []
for group in glom(grouped, 'groups'):
if glom(group, 'groupValue'):
# groupValue != null => document
document_id = glom(group, 'groupValue')
title = ''
lang = 'en'
tags = []
pages = []
for page in glom(group, 'doclist.docs'):
lang = page.get('lang', 'en')
title = page.get(f'title_txt_{lang}', None)
text = page.get(f'text_txt_{lang}', None)
tags = page.get('tags', [])
p = Page(
id=page['id'],
page_number=page['page_number'],
text=text
)
pages.append(p)
item = Document(
id=document_id,
items = glom(result, 'response.docs')
total_found = glom(result, 'response.numFound')
start = glom(result, 'response.start')
# `start` is the zero-based row offset echoed back by Solr; turn it back
# into the one-based page number used by SearchQuery.
page_number = int(start / sq.page_size) + 1
# Ceiling division, so that a total which is an exact multiple of the page
# size (e.g. 100 hits at 50 per page) yields 2 pages rather than 3.
# An empty result set is still reported as a single (empty) page.
num_pages = max(1, -(-total_found // sq.page_size))
returned_list = []

for item in items:
if document_id := item.get('document_id', None):
lang = item.get('lang', 'en')
title = item.get(f'title_txt_{lang}', lang)
tags = item.get('tags', [])
dp = DocumentPage(
id=item['id'],
page_number=item['page_number'],
document_id=document_id,
title=title,
lang=lang,
pages=pages,
tags=tags,
tags=tags
)
result.append(item)
returned_list.append(dp)
else:
for folder in glom(group, 'doclist.docs'):
lang = folder.get('lang', 'en')
title = folder.get(f'title_txt_{lang}', None)
item = Folder(
id=folder['id'],
title=title,
tags=folder.get('tags', []),
)
result.append(item)

return result
lang = item.get('lang', 'en')
title = item.get(f'title_txt_{lang}', lang)
folder = Folder(
id=item['id'],
title=title,
lang=lang,
tags=item.get('tags', []),
)
returned_list.append(folder)

return PaginatedResponse(
page_size=sq.page_size,
page_number=page_number,
num_pages=num_pages,
items=returned_list
)


class IndexRW(Base):
Expand Down
18 changes: 6 additions & 12 deletions salinic/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,26 +169,20 @@ def __repr__(self):

class SearchQuery:
query: Query
rows: int
start: int
group_limit: int
group_offset: int
page_number: int # starts with 1
page_size: int

def __init__(
self,
entity,
q: str,
rows: int = 100,
start: int = 0,
group_limit: int = 100,
group_offset: int = 0
page_size: int = 50,
page_number: int = 1
):
self.entity = entity
self.query = Query(q)
self.rows = rows
self.start = start
self.group_limit = group_limit
self.group_offset = group_offset
self.page_size = page_size
self.page_number = page_number

def __str__(self):
return f"SearchQuery(query={self.query}, entity={self.entity})"
Expand Down
31 changes: 17 additions & 14 deletions salinic/schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from uuid import UUID

from pydantic import BaseModel, ConfigDict, model_serializer

Expand Down Expand Up @@ -85,29 +84,33 @@ def needs_transform(self, field_name):
return hasattr(self, f'get_idx_value__{field_name}')


class Page(BaseModel):
id: UUID
page_number: int
text: str | None = None


class Document(BaseModel):
id: UUID
# Fields shared by every kind of search hit; DocumentPage and Folder
# both derive from this model.
class SearchResultItem(BaseModel):
id: str
title: str
lang: str
# Optional: a hit without tags gets an empty list.
tags: list[str] = []
pages: list[Page]


class DocumentPage(SearchResultItem):
    # Search hit for a single page of a document.
    page_number: int
    document_id: str  # id of the document this page belongs to
    entity_type: str = 'document'

    def __hash__(self):
        # Hash the JSON dump so the hash is derived from all field values.
        serialized = self.model_dump_json()
        return hash(serialized)


class Folder(BaseModel):
id: UUID
title: str
tags: list[str] = []
class Folder(SearchResultItem):
    # Search hit for a folder; adds nothing but the entity type marker.
    entity_type: str = 'folder'

    def __hash__(self):
        # Hash the JSON dump so the hash is derived from all field values.
        as_json = self.model_dump_json()
        return hash(as_json)


# One page of search results together with the paging metadata.
class PaginatedResponse(BaseModel):
# Number of items requested per page.
page_size: int
# Page number of this response; SearchQuery numbers pages from 1.
page_number: int
# Total number of pages available for the query.
num_pages: int
# Hits on this page: folders and individual document pages, ungrouped.
items: list[Folder | DocumentPage]

# NOTE(review): `from_attributes=True` is pydantic's switch for validating
# from arbitrary objects via attribute access — confirm it is meant to sit
# on this model.
model_config = ConfigDict(from_attributes=True)
Loading

0 comments on commit 5901222

Please sign in to comment.