From 62e9fca04fd9a14bffae587638a3fddcaf3eb925 Mon Sep 17 00:00:00 2001 From: Eugen Ciur Date: Sat, 7 Dec 2024 09:51:49 +0100 Subject: [PATCH] updates for OCRWorker (#565) --- .dockerignore | 1 + .gitignore | 1 + docker/standard/Dockerfile | 7 +- docker/standard/core.js.tmpl | 5 ++ docker/standard/entrypoint.sh | 9 +- papermerge/core/config.py | 10 ++- papermerge/core/features/document/cli/cli.py | 32 +++++++ papermerge/core/features/document/db/api.py | 14 ++- ui2/index.html | 1 + ui2/papermerge-runtime-config.js | 5 ++ .../ScheduleOCRProcessCheckbox.tsx | 85 +++++++++++++++++++ .../ScheduleOCRProcessCheckbox/index.tsx | 0 .../Commander/NodesCommander/DropFiles.tsx | 37 +++++--- ui2/src/features/nodes/uploadFile.ts | 9 +- ui2/src/hooks/runtime_config.tsx | 20 +++++ ui2/src/types/runtime_config.ts | 13 +++ 16 files changed, 230 insertions(+), 19 deletions(-) create mode 100644 docker/standard/core.js.tmpl create mode 100644 papermerge/core/features/document/cli/cli.py create mode 100644 ui2/papermerge-runtime-config.js create mode 100644 ui2/src/components/ScheduleOCRProcessCheckbox/ScheduleOCRProcessCheckbox.tsx create mode 100644 ui2/src/components/ScheduleOCRProcessCheckbox/index.tsx create mode 100644 ui2/src/hooks/runtime_config.tsx create mode 100644 ui2/src/types/runtime_config.ts diff --git a/.dockerignore b/.dockerignore index 67127e1b4..10b14d499 100644 --- a/.dockerignore +++ b/.dockerignore @@ -17,3 +17,4 @@ ui2/.vscode ui2/.pnp.cjs ui2/.pnp.loader.mjs ui2/.env.development.local +ui2/public/papermerge-runtime-config.js diff --git a/.gitignore b/.gitignore index a26e09bd5..62e679277 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,4 @@ simplest.yml ui/public/runtime/config.js .ruff_cache/ .pytest_cache/ +.enwardrc diff --git a/docker/standard/Dockerfile b/docker/standard/Dockerfile index 89931a98e..8d7e63824 100644 --- a/docker/standard/Dockerfile +++ b/docker/standard/Dockerfile @@ -14,11 +14,14 @@ ENV CORE_APP=/core_app ENV 
PAPERMERGE__DATABASE__URL=sqlite:////db/db.sqlite3 ENV PAPERMERGE__AUTH__USERNAME=admin ENV PAPERMERGE__AUTH__EMAIL=admin@example.com -ENV PAPERMERGE__OCR__DEFAULT_LANGUAGE=deu ENV PAPERMERGE__MAIN__API_PREFIX="" +ENV PAPERMERGE__OCR__LANG_CODES="deu,eng,ron" +ENV PAPERMERGE__OCR__DEFAULT_LANG_CODE="deu" +ENV PAPERMERGE__OCR__AUTOMATIC="false" RUN apk update && apk add linux-headers python3-dev \ gcc \ + curl \ libc-dev \ supervisor \ imagemagick \ @@ -27,6 +30,7 @@ RUN apk update && apk add linux-headers python3-dev \ poppler-utils RUN pip install --upgrade poetry roco==0.4.2 +RUN curl -L -o /bin/env2js https://github.com/papermerge/env2js/releases/download/0.2/env2js.amd64 COPY poetry.lock pyproject.toml README.md LICENSE ${CORE_APP}/ @@ -37,6 +41,7 @@ COPY docker/standard/entrypoint.sh /entrypoint.sh COPY docker/standard/bundles/supervisor/* /etc/papermerge/ COPY docker/standard/bundles/nginx/* /etc/nginx/ COPY docker/standard/logging.yaml /etc/papermerge/ +COPY docker/standard/core.js.tmpl /${CORE_APP}/core.js.tmpl COPY ./papermerge ${CORE_APP}/papermerge/ COPY alembic.ini ${CORE_APP}/ diff --git a/docker/standard/core.js.tmpl b/docker/standard/core.js.tmpl new file mode 100644 index 000000000..54aff6e42 --- /dev/null +++ b/docker/standard/core.js.tmpl @@ -0,0 +1,5 @@ +window.__PAPERMERGE_RUNTIME_CONFIG__ = { + ocr__lang_codes: "{{ .PAPERMERGE__OCR__LANG_CODES }}", + ocr__default_lang_code: "{{ .PAPERMERGE__OCR__DEFAULT_LANG_CODE }}", + ocr__automatic: {{ .PAPERMERGE__OCR__AUTOMATIC }} +} diff --git a/docker/standard/entrypoint.sh b/docker/standard/entrypoint.sh index 4e84aa15d..b2e199272 100644 --- a/docker/standard/entrypoint.sh +++ b/docker/standard/entrypoint.sh @@ -55,8 +55,15 @@ case $CMD in ;; server) exec_init + # TODO: replace roco with env2js roco > /usr/share/nginx/html/auth_server/papermerge-runtime-config.js - roco > /usr/share/nginx/html/ui/papermerge-runtime-config.js + /bin/env2js -f /core_app/core.js.tmpl > 
/usr/share/nginx/html/ui/papermerge-runtime-config.js + exec /usr/bin/supervisord -c /etc/papermerge/supervisord.conf + ;; + server_without_init) + # TODO: replace roco with env2js + roco > /usr/share/nginx/html/auth_server/papermerge-runtime-config.js + /bin/env2js -f /core_app/core.js.tmpl > /usr/share/nginx/html/ui/papermerge-runtime-config.js exec /usr/bin/supervisord -c /etc/papermerge/supervisord.conf ;; create_token.sh) diff --git a/papermerge/core/config.py b/papermerge/core/config.py index 1b647aa92..9ff883910 100644 --- a/papermerge/core/config.py +++ b/papermerge/core/config.py @@ -19,7 +19,15 @@ class Settings(BaseSettings): papermerge__main__cf_domain: str | None = None papermerge__database__url: str = "sqlite:////db/db.sqlite3" papermerge__redis__url: str | None = None - papermerge__ocr__default_language: str = 'deu' + papermerge__ocr__default_lang_code: str = 'deu' + # When is OCR triggered? + # `ocr__automatic` = True means that OCR will be performed without + # end user intervention i.e. via background scheduler like celery scheduler + # `ocr__automatic` = False means that OCR will be performed only + # if requested by end user. In this case the user can choose to + # schedule OCR on upload; also in this case the user can choose to + # schedule OCR later on any document. 
+ papermerge__ocr__automatic: bool = False papermerge__search__url: str | None = None diff --git a/papermerge/core/features/document/cli/cli.py b/papermerge/core/features/document/cli/cli.py new file mode 100644 index 000000000..9aafdee86 --- /dev/null +++ b/papermerge/core/features/document/cli/cli.py @@ -0,0 +1,32 @@ +import uuid +import typer + +from papermerge.core.tasks import send_task +from papermerge.core.db.engine import Session + +from papermerge.core import dbapi, constants, types + + +app = typer.Typer(help="OCR tasks") + + +@app.command() +def schedule_ocr(node_id: uuid.UUID, force: bool = False, lang: str | None = None): + """Schedules OCR for given node ID""" + with Session() as db_session: + node_type: types.CType = dbapi.get_node_type(db_session, node_id) + + if node_type == "document": + if lang is None: + lang = dbapi.get_document_lang(db_session, node_id) + send_task( + constants.WORKER_OCR_DOCUMENT, + kwargs={ + "document_id": str(node_id), + "lang": lang, + }, + route_name="ocr", + ) + else: + # get all descendants of node_id + pass diff --git a/papermerge/core/features/document/db/api.py b/papermerge/core/features/document/db/api.py index 7487bac62..97ab3c72e 100644 --- a/papermerge/core/features/document/db/api.py +++ b/papermerge/core/features/document/db/api.py @@ -27,10 +27,11 @@ ) from papermerge.core.features.document.schema import DocumentCFVRow from papermerge.core.features.document.ordered_document_cfv import OrderedDocumentCFV +from papermerge.core import config from .selectors import select_doc_cfv, select_docs_by_type - +settings = config.get_settings() logger = logging.getLogger(__name__) @@ -634,6 +635,17 @@ def upload( route_name="s3", ) + if not settings.papermerge__ocr__automatic: + if doc.ocr is True: + # user chose "schedule OCR" when uploading document + tasks.send_task( + constants.WORKER_OCR_DOCUMENT, + kwargs={ + "document_id": str(doc.id), + "lang": doc.lang, + }, + route_name="ocr", + ) return validated_model, None 
diff --git a/ui2/index.html b/ui2/index.html index 04c1169a9..1fa75fa9e 100644 --- a/ui2/index.html +++ b/ui2/index.html @@ -9,6 +9,7 @@
+ diff --git a/ui2/papermerge-runtime-config.js b/ui2/papermerge-runtime-config.js new file mode 100644 index 000000000..392e53130 --- /dev/null +++ b/ui2/papermerge-runtime-config.js @@ -0,0 +1,5 @@ +window.__PAPERMERGE_RUNTIME_CONFIG__ = { + ocr__lang_codes: "deu,eng,ron, spa, ita, fra", + ocr__default_lang_code: "eng", + ocr__automatic: false +} diff --git a/ui2/src/components/ScheduleOCRProcessCheckbox/ScheduleOCRProcessCheckbox.tsx b/ui2/src/components/ScheduleOCRProcessCheckbox/ScheduleOCRProcessCheckbox.tsx new file mode 100644 index 000000000..65c2bd025 --- /dev/null +++ b/ui2/src/components/ScheduleOCRProcessCheckbox/ScheduleOCRProcessCheckbox.tsx @@ -0,0 +1,85 @@ +import {OCR_LANG} from "@/cconstants" +import {useRuntimeConfig} from "@/hooks/runtime_config" +import {OCRCode} from "@/types/ocr" +import {Checkbox, ComboboxData, Select, Stack} from "@mantine/core" +import {useState} from "react" + +interface Args { + initialCheckboxValue: boolean + defaultLang: OCRCode + onLangChange: (newLang: OCRCode) => void + onCheckboxChange: (newValue: boolean) => void +} + +export default function ScheduleOCRProcessCheckbox({ + initialCheckboxValue, + defaultLang, + onCheckboxChange, + onLangChange +}: Args) { + const runtimeConfig = useRuntimeConfig() + const langData = langCodes2ComboboxData(runtimeConfig.ocr__lang_codes) + const [checked, setChecked] = useState(initialCheckboxValue) + const [lang, setLang] = useState(defaultLang) + + const onCheckboxChangeLocal = (e: React.ChangeEvent) => { + const newValue = e.currentTarget.checked + setChecked(newValue) + onCheckboxChange(newValue) + } + const onLangChangeLocal = (value: string | null) => { + if (value) { + setLang(value as OCRCode) + onLangChange(value as OCRCode) + } + } + + return ( + + + {checked && ( +