Merge pull request #7 from papermerge/ocr-per-page

OCR per page
papermerge · Jun 14, 2024 · 0b5c26b · 0b5c26b
2 parents 3c99a61 + c61a018
commit 0b5c26b
Show file tree

Hide file tree

Showing 10 changed files with 81 additions and 99 deletions.
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
@@ -1,11 +1,10 @@
 name: Publish to PyPi
 
 on:
-  workflow_dispatch:
-  workflow_call:
-    secrets:
-      PYPI_TOKEN:
-        required: true
+  push:
+    tags:
+      - '[0-9]+.[0-9]+.[0-9]+'
+      - '[0-9]+.[0-9]+'
 
 jobs:
   publish-to-pypi:

diff --git a/changelog.md b/changelog.md
@@ -1,5 +1,10 @@
 # Changelog
 
+
+## [0.7.0] - 2024-06-14
+
+- OCR per page: the plugin now takes a single `--uuid` (UUID of the target page) instead of a comma-separated `--uuids` list
+
 ## [0.4.9] - 2023-08-02
 
 - Fix Image.ANTIALIAS deprecation issue

diff --git a/ocrmypdf_papermerge/generate_preview.py b/ocrmypdf_papermerge/generate_preview.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import List
 
 from PIL import Image
 
@@ -11,15 +10,15 @@ def generate_preview(
     *,
     preview_width: int,
     base_dir: Path,
-    uuids: List[str]
+    uuid: str
 ) -> None:
     """
     Generates page preview as jpeg
     """
     output_file_path = get_result_file_path(
         input_file_path=input_file,
         base_dir=base_dir,
-        uuids=uuids,
+        uuid=uuid,
         output_ext="jpg"
     )
     output_file_path.parent.mkdir(parents=True, exist_ok=True)

diff --git a/ocrmypdf_papermerge/generate_svg.py b/ocrmypdf_papermerge/generate_svg.py
@@ -18,7 +18,7 @@ def generate_svg(
         input_file,
         base_dir=Path(options.sidecar_dir),
         output_ext=options.sidecar_format,
-        uuids=options.uuids.split(',')
+        uuid=options.uuid
     )
 
     base64_img, size = image_to_base64(input_file)

diff --git a/ocrmypdf_papermerge/plugin.py b/ocrmypdf_papermerge/plugin.py
@@ -31,7 +31,7 @@ def generate_hocr(input_file, output_hocr, output_text, options):
             input_file=Path(input_file),
             preview_width=options.preview_width,
             base_dir=options.sidecar_dir,
-            uuids=options.uuids.split(',')
+            uuid=options.uuid
         )
         # svg | html with embedded raster image plus
         # mapped hocr text
@@ -44,13 +44,13 @@ def generate_hocr(input_file, output_hocr, output_text, options):
         copy_hocr(
             input_file_path=Path(output_hocr),
             output_dir=options.sidecar_dir,
-            uuids=options.uuids.split(',')
+            uuid=options.uuid
         )
         # actual extracted text
         copy_txt(
             input_file_path=Path(output_text),
             output_dir=options.sidecar_dir,
-            uuids=options.uuids.split(',')
+            uuid=options.uuid
         )
 
 
@@ -79,9 +79,6 @@ def add_options(parser):
     )
     parser.add_argument(
         '-u',
-        '--uuids',
-        help="A list of uuids separated by comma. "
-        " Order of UUIDs matters. First UUID corresponds to first page ID, "
-        " second UUID corresponds to second page ID etc "
-        "Number of UUIDs should match number of pages in the document.",
+        '--uuid',
+        help="UUID of the target page"
     )
diff --git a/ocrmypdf_papermerge/utils.py b/ocrmypdf_papermerge/utils.py
@@ -2,7 +2,6 @@
 import re
 import shutil
 from pathlib import Path
-from typing import List
 
 
 def get_page_number(input_file_path: Path) -> int:
@@ -36,7 +35,7 @@ def get_result_file_path(
     input_file_path: Path,
     *,
     base_dir: Path,
-    uuids: List[str],
+    uuid: str,
     output_ext,
     makedirs=True
 ) -> Path:
@@ -46,7 +45,7 @@ def get_result_file_path(
     input:
         input_file_path: Path('/tmp/media/000001_ocr.png')
         base_dir: Path('/ocr/')
-        uuids: ['8db234f4-9579-4dd8-86c9-2564d45de1ce']
+        uuid: '8db234f4-9579-4dd8-86c9-2564d45de1ce'
         output_ext: 'jpeg'
 
     output:
@@ -57,20 +56,12 @@ def get_result_file_path(
     input:
         input_file_path: Path('/tmp/000023_ocr.png')
         base_dir: Path('/ocr')
-        uuids: [...22 more uuids...,'a5b93d53-d62b-4264-a368-8122a8c313bc']
+        uuid: 'a5b93d53-d62b-4264-a368-8122a8c313bc'
         output_ext: 'txt'
 
     output:
         Path('/ocr/a5/b9/a5b93d53-d62b-4264-a368-8122a8c313bc/page.txt')
     """
-    page_number = get_page_number(input_file_path)
-
-    if page_number > len(uuids):
-        raise ValueError(
-            f"page_number > len(uuids) i.e. {page_number} > {len(uuids)}"
-        )
-
-    uuid = uuids[page_number - 1]
     basename = os.path.basename(input_file_path)
     root, _ = os.path.splitext(basename)
 
@@ -91,26 +82,26 @@ def get_result_file_path(
 def copy_txt(
     input_file_path: Path,
     output_dir: Path,
-    uuids: List[str]
+    uuid: str
 ):
     output_file_path = get_result_file_path(
         input_file_path,
         base_dir=output_dir,
         output_ext="txt",
-        uuids=uuids
+        uuid=uuid
     )
     shutil.copy(input_file_path, output_file_path)
 
 
 def copy_hocr(
     input_file_path: Path,
     output_dir: Path,
-    uuids: List[str]
+    uuid: str
 ):
     output_file_path = get_result_file_path(
         input_file_path,
         base_dir=output_dir,
         output_ext="hocr",
-        uuids=uuids
+        uuid=uuid
     )
     shutil.copy(input_file_path, output_file_path)
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ocrmypdf-papermerge"
-version = "0.6.0"
+version = "0.7.0"
 description = "OCRmyPDF plugin to generate SVG files for Papermerge"
 authors = ["Eugen Ciur <[email protected]>"]
 maintainers = ["Eugen Ciur <[email protected]>"]

diff --git a/tests/test_generate_preview.py b/tests/test_generate_preview.py
@@ -9,35 +9,29 @@
 
 
 @pytest.mark.parametrize(
-    "test_input, uuids, expected_file_path",
+    "test_input, uuid, expected_file_path",
     [
         (
             "000001_ocr.jpg",
-            ['75d61315-a12d-4860-97d3-431f395e82f4'],
+            '75d61315-a12d-4860-97d3-431f395e82f4',
             Path("75/d6/75d61315-a12d-4860-97d3-431f395e82f4/page.jpg")
         ),
         (
             "000002.jpg",
-            [
-                '9b6b4733-09c7-4169-b72a-d8cc12de9513',
-                '532a4ec9-0405-44d8-be0c-ebb33944c427'
-            ],
+            '532a4ec9-0405-44d8-be0c-ebb33944c427',
             Path("53/2a/532a4ec9-0405-44d8-be0c-ebb33944c427/page.jpg")
         ),
         (
             "000002_ocr.jpeg",
-            [
-                'fab2bfa3-f12f-479e-9c83-da7d39f2663b',
-                '14bdc2b4-3923-44c2-b62a-a00afc2e1bc7'
-            ],
+            '14bdc2b4-3923-44c2-b62a-a00afc2e1bc7',
             Path("14/bd/14bdc2b4-3923-44c2-b62a-a00afc2e1bc7/page.jpg")
         )
     ]
 )
 def test_generate_preview(
     tmp_path,
     test_input,
-    uuids,
+    uuid,
     expected_file_path
 ):
     base_dir = tmp_path / "media_root" / "ocr"
@@ -47,7 +41,7 @@ def test_generate_preview(
         TEST_DATA_FOLDER / test_input,
         preview_width=100,
         base_dir=base_dir,
-        uuids=uuids
+        uuid=uuid
     )
 
     expected_path = Path(base_dir / expected_file_path)
@@ -70,18 +64,18 @@ def test_generate_preview_will_raise_exp_on_invalid_file_name(tmp_path: Path):
     sidecar_dir = tmp_path / "sidecar_dir"
     sidecar_dir.mkdir()
 
-    with pytest.raises(ValueError):
+    with pytest.raises(FileNotFoundError):
         generate_preview(
             input_file=TEST_DATA_FOLDER / '01_ocr.jpg',
             preview_width=100,
-            uuids=[],
+            uuid='',
             base_dir=sidecar_dir
         )
 
-    with pytest.raises(ValueError):
+    with pytest.raises(FileNotFoundError):
         generate_preview(
             input_file=TEST_DATA_FOLDER / '99.jpg',
             preview_width=100,
-            uuids=[],
+            uuid='',
             base_dir=sidecar_dir
         )