Skip to content

Commit

Permalink
Merge pull request #7 from papermerge/ocr-per-page
Browse files Browse the repository at this point in the history
OCR per page
  • Loading branch information
ciur authored Jun 14, 2024
2 parents 3c99a61 + c61a018 commit 0b5c26b
Show file tree
Hide file tree
Showing 10 changed files with 81 additions and 99 deletions.
9 changes: 4 additions & 5 deletions .github/workflows/publish-to-pypi.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
name: Publish to PyPi

on:
workflow_dispatch:
workflow_call:
secrets:
PYPI_TOKEN:
required: true
push:
tags:
- '[0-9]+.[0-9]+.[0-9]+'
- '[0-9]+.[0-9]+'

jobs:
publish-to-pypi:
Expand Down
5 changes: 5 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog


## [0.7.0] - 2024-06-14

- OCR per page (pass a single `uuid` instead of a list of `uuids`)

## [0.4.9] - 2023-08-02

- Fix Image.ANTIALIAS deprecation issue
Expand Down
5 changes: 2 additions & 3 deletions ocrmypdf_papermerge/generate_preview.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from pathlib import Path
from typing import List

from PIL import Image

Expand All @@ -11,15 +10,15 @@ def generate_preview(
*,
preview_width: int,
base_dir: Path,
uuids: List[str]
uuid: str
) -> None:
"""
Generates page preview as jpeg
"""
output_file_path = get_result_file_path(
input_file_path=input_file,
base_dir=base_dir,
uuids=uuids,
uuid=uuid,
output_ext="jpg"
)
output_file_path.parent.mkdir(parents=True, exist_ok=True)
Expand Down
2 changes: 1 addition & 1 deletion ocrmypdf_papermerge/generate_svg.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def generate_svg(
input_file,
base_dir=Path(options.sidecar_dir),
output_ext=options.sidecar_format,
uuids=options.uuids.split(',')
uuid=options.uuid
)

base64_img, size = image_to_base64(input_file)
Expand Down
13 changes: 5 additions & 8 deletions ocrmypdf_papermerge/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def generate_hocr(input_file, output_hocr, output_text, options):
input_file=Path(input_file),
preview_width=options.preview_width,
base_dir=options.sidecar_dir,
uuids=options.uuids.split(',')
uuid=options.uuid
)
# svg | html with embedded raster image plus
# mapped hocr text
Expand All @@ -44,13 +44,13 @@ def generate_hocr(input_file, output_hocr, output_text, options):
copy_hocr(
input_file_path=Path(output_hocr),
output_dir=options.sidecar_dir,
uuids=options.uuids.split(',')
uuid=options.uuid
)
# actual extracted text
copy_txt(
input_file_path=Path(output_text),
output_dir=options.sidecar_dir,
uuids=options.uuids.split(',')
uuid=options.uuid
)


Expand Down Expand Up @@ -79,9 +79,6 @@ def add_options(parser):
)
parser.add_argument(
'-u',
'--uuids',
help="A list of uuids separated by comma. "
" Order of UUIDs matters. First UUID corresponds to first page ID, "
" second UUID corresponds to second page ID etc "
"Number of UUIDs should match number of pages in the document.",
'--uuid',
help="UUID of the target page"
)
23 changes: 7 additions & 16 deletions ocrmypdf_papermerge/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import re
import shutil
from pathlib import Path
from typing import List


def get_page_number(input_file_path: Path) -> int:
Expand Down Expand Up @@ -36,7 +35,7 @@ def get_result_file_path(
input_file_path: Path,
*,
base_dir: Path,
uuids: List[str],
uuid: str,
output_ext,
makedirs=True
) -> Path:
Expand All @@ -46,7 +45,7 @@ def get_result_file_path(
input:
input_file_path: Path('/tmp/media/000001_ocr.png')
base_dir: Path('/ocr/')
uuids: ['8db234f4-9579-4dd8-86c9-2564d45de1ce']
uuid: '8db234f4-9579-4dd8-86c9-2564d45de1ce'
output_ext: 'jpeg'
output:
Expand All @@ -57,20 +56,12 @@ def get_result_file_path(
input:
input_file_path: Path('/tmp/000023_ocr.png')
base_dir: Path('/ocr')
uuids: [...22 more uuids...,'a5b93d53-d62b-4264-a368-8122a8c313bc']
uuid: 'a5b93d53-d62b-4264-a368-8122a8c313bc'
output_ext: 'txt'
output:
Path('/ocr/a5/b9/a5b93d53-d62b-4264-a368-8122a8c313bc/page.txt')
"""
page_number = get_page_number(input_file_path)

if page_number > len(uuids):
raise ValueError(
f"page_number > len(uuids) i.e. {page_number} > {len(uuids)}"
)

uuid = uuids[page_number - 1]
basename = os.path.basename(input_file_path)
root, _ = os.path.splitext(basename)

Expand All @@ -91,26 +82,26 @@ def get_result_file_path(
def copy_txt(
input_file_path: Path,
output_dir: Path,
uuids: List[str]
uuid: str
):
output_file_path = get_result_file_path(
input_file_path,
base_dir=output_dir,
output_ext="txt",
uuids=uuids
uuid=uuid
)
shutil.copy(input_file_path, output_file_path)


def copy_hocr(
input_file_path: Path,
output_dir: Path,
uuids: List[str]
uuid: str
):
output_file_path = get_result_file_path(
input_file_path,
base_dir=output_dir,
output_ext="hocr",
uuids=uuids
uuid=uuid
)
shutil.copy(input_file_path, output_file_path)
88 changes: 44 additions & 44 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ocrmypdf-papermerge"
version = "0.6.0"
version = "0.7.0"
description = "OCRmyPDF plugin to generate SVG files for Papermerge"
authors = ["Eugen Ciur <[email protected]>"]
maintainers = ["Eugen Ciur <[email protected]>"]
Expand Down
26 changes: 10 additions & 16 deletions tests/test_generate_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,35 +9,29 @@


@pytest.mark.parametrize(
"test_input, uuids, expected_file_path",
"test_input, uuid, expected_file_path",
[
(
"000001_ocr.jpg",
['75d61315-a12d-4860-97d3-431f395e82f4'],
'75d61315-a12d-4860-97d3-431f395e82f4',
Path("75/d6/75d61315-a12d-4860-97d3-431f395e82f4/page.jpg")
),
(
"000002.jpg",
[
'9b6b4733-09c7-4169-b72a-d8cc12de9513',
'532a4ec9-0405-44d8-be0c-ebb33944c427'
],
'532a4ec9-0405-44d8-be0c-ebb33944c427',
Path("53/2a/532a4ec9-0405-44d8-be0c-ebb33944c427/page.jpg")
),
(
"000002_ocr.jpeg",
[
'fab2bfa3-f12f-479e-9c83-da7d39f2663b',
'14bdc2b4-3923-44c2-b62a-a00afc2e1bc7'
],
'14bdc2b4-3923-44c2-b62a-a00afc2e1bc7',
Path("14/bd/14bdc2b4-3923-44c2-b62a-a00afc2e1bc7/page.jpg")
)
]
)
def test_generate_preview(
tmp_path,
test_input,
uuids,
uuid,
expected_file_path
):
base_dir = tmp_path / "media_root" / "ocr"
Expand All @@ -47,7 +41,7 @@ def test_generate_preview(
TEST_DATA_FOLDER / test_input,
preview_width=100,
base_dir=base_dir,
uuids=uuids
uuid=uuid
)

expected_path = Path(base_dir / expected_file_path)
Expand All @@ -70,18 +64,18 @@ def test_generate_preview_will_raise_exp_on_invalid_file_name(tmp_path: Path):
sidecar_dir = tmp_path / "sidecar_dir"
sidecar_dir.mkdir()

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
generate_preview(
input_file=TEST_DATA_FOLDER / '01_ocr.jpg',
preview_width=100,
uuids=[],
uuid='',
base_dir=sidecar_dir
)

with pytest.raises(ValueError):
with pytest.raises(FileNotFoundError):
generate_preview(
input_file=TEST_DATA_FOLDER / '99.jpg',
preview_width=100,
uuids=[],
uuid='',
base_dir=sidecar_dir
)
Loading

0 comments on commit 0b5c26b

Please sign in to comment.