From 9df9bc55e5455a0010f9215af885dd054bb7272c Mon Sep 17 00:00:00 2001 From: Maxim Date: Mon, 20 Jan 2020 17:40:03 +0000 Subject: [PATCH] Updated to work with tesseract v4. --- LICENSE | 2 +- README.md | 41 ++++++----- render-document => extract-images | 25 +++---- extract-text | 113 ++++++++++++++++++++++++++++++ ocr | 101 ++++++-------------------- 5 files changed, 170 insertions(+), 112 deletions(-) rename render-document => extract-images (86%) create mode 100755 extract-text diff --git a/LICENSE b/LICENSE index 475daa3..8ba046b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2017,2018,2019, Maxim Konakov +Copyright (c) 2017,2018,2019,2020, Maxim Konakov All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index 8c7d38e..74dfda3 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ An ever growing collection of tools to perform [OCR](https://en.wikipedia.org/wi Doing a good quality OCR in one go is hard. Usually the process includes a number of iterative steps to improve the original image quality in order to achieve reasonable recognition, followed by some manual correction of the output text. -This is not a problem when digitising a receipt for tax return, but processing a book of 500 pages makes +This is not a problem when digitising a page or two, but processing a book of 500 pages makes things a lot harder. Depending on the hardware, the OCR stage itself may take tens of minutes to complete, and the other image processing stages may take considerable time as well. The only solution seems to be splitting the process into a number of steps where each step can be run (and rerun) independently until a reasonable quality of the output @@ -20,16 +20,16 @@ In general, the process of converting a document to text includes the following The image extraction and OCR steps are relatively easy to automate, and this toolset provides two simple scripts to do just that. The image processing step depends heavily on the input image quality and may involve a number of different tools. This toolset provides a few scripts that may be useful for building an image processing pipeline. -Simple text post-processing can be done using the good old Unix `sed` command, at least for the English language, +Text post-processing can often be done using the good old Unix `sed` command, at least for the English language, other languages usually require a more advanced regular expression engine with full Unicode support (think `perl`). ### Tools -#### Image extractor (renderer) +#### Image extractor ``` -▶ ./render-document -Usage: render-document [OPTION]... FILE -Renders pages of a .pdf or .djvu document FILE to grayscale images in PGM format. +▶ ./extract-images +Usage: extract-images [OPTION]... FILE +Renders pages of a .pdf or .djvu FILE to grayscale images in PGM format. Options: -f N first page number (optional, default: 1) -l N last page number (optional, default: last page of the document) @@ -37,22 +37,29 @@ Options: -h display this help and exit ``` -The output is a set of grey-scale images, one per page, with 600dpi resolution. The input document +The output is a set of grey-scale images, one per page, each with 600dpi resolution. The input document does not have to be made of images, any valid document can be rendered by this tool. -#### OCR +#### OCR on a single image ``` -Usage: ocr [OPTION]... DIR -Extracts text from all image files in DIR. -Options: - -L LANG document language (optional, default: 'eng') - -o FILE output file name (optional, default output directed to stdout) - -e EXT input files extension (optional, default: pgm) - -h display this help and exit - -v output version information and exit +▶ ./extract-text +Usage: extract-text FILE [OPTION]... + Run OCR on the given image FILE. Recognised text is written to STDOUT. + All the given options are passed down to the "tesseract" tool. ``` -A wrapper around `tesseract` tool. +Essentially, this is a wrapper around `tesseract` tool. The main purpose of the script +is to validate command line parameters before passing them down to `tesseract`, because +the original `tesseract` error messages are rather cryptic. + +#### OCR +``` +▶ ./ocr +Usage: ocr DIR [OPTION]... + Run OCR on all .pgm files in the given directory DIR. + Recognised text is written to STDOUT. All the given + options are passed down to the "tesseract" tool. +``` #### Image geometry calculator for cropping ``` diff --git a/render-document b/extract-images similarity index 86% rename from render-document rename to extract-images index 0d46dfd..c534bd5 100755 --- a/render-document +++ b/extract-images @@ -40,10 +40,10 @@ LAST_PAGE=9999 WORK_DIR=. # display usage string and exit -usage_and_exit() { +exit_with_usage() { cat <<-EOF >&2 Usage: $(basename "$0") [OPTION]... FILE - Renders pages of a .pdf or .djvu document FILE to grayscale images in PGM format. + Renders pages of a .pdf or .djvu FILE to grayscale images in PGM format. Options: -f N first page number (optional, default: 1) -l N last page number (optional, default: last page of the document) @@ -72,14 +72,12 @@ ensure_int_arg() { # ensure correct filesystem type ensure_fs_type() { - T="$(stat -L -c %F "$2" 2>&1)" - - (( $? == 0 )) || die_ "$T" + T="$(stat -L -c %F "$2" 2>&1)" || die_ "$T" [[ "$1" == "$T" ]] || die "\"$2\" is a $T, not a $1" } # options -(( $# > 0 )) || usage_and_exit +(( $# > 0 )) || exit_with_usage while getopts "f:l:o:h" OPT do @@ -93,19 +91,19 @@ do o) ensure_fs_type "directory" "$OPTARG" WORK_DIR="$OPTARG" ;; - h) usage_and_exit + h) exit_with_usage ;; \?) exit 1 ;; esac done +shift $(( OPTIND - 1 )) # remove parsed options and args from $@ list + # check range of pages (( FIRST_PAGE <= LAST_PAGE )) || die "invalid range of pages: $FIRST_PAGE..$LAST_PAGE" # input file -shift $(( OPTIND - 1 )) # remove parsed options and args from $@ list - case $# in 1) INPUT_FILE="$1" ;; @@ -116,9 +114,7 @@ case $# in esac # check input file availability and MIME type -FILE_TYPE="$(file -ELb --mime-type "$INPUT_FILE")" - -(( $? == 0 )) || die_ "$FILE_TYPE" # because 'file' prints errors to stdout +FILE_TYPE="$(file -ELb --mime-type "$INPUT_FILE")" || die_ "$FILE_TYPE" # because 'file' prints errors to stdout # finally, do the conversion case "$FILE_TYPE" in @@ -126,8 +122,9 @@ case "$FILE_TYPE" in exec pdftoppm -gray -r 600 -f "$FIRST_PAGE" -l "$LAST_PAGE" "$INPUT_FILE" "$WORK_DIR/page" ;; 'image/vnd.djvu') - exec ddjvu -format=pgm -mode=black -scale=600 -eachpage -page="$FIRST_PAGE-$LAST_PAGE" "$INPUT_FILE" "$WORK_DIR/page-%04d.pgm" + exec ddjvu -format=pgm -mode=black -scale=600 -eachpage \ + -page="$FIRST_PAGE-$LAST_PAGE" "$INPUT_FILE" "$WORK_DIR/page-%04d.pgm" ;; - *) die "invalid input file MIME type: $FILE_TYPE" + *) die "unsupported input file MIME type: $FILE_TYPE" ;; esac diff --git a/extract-text b/extract-text new file mode 100755 index 0000000..20e856f --- /dev/null +++ b/extract-text @@ -0,0 +1,113 @@ +#!/usr/bin/env bash + +# BSD 3-Clause License + +# Copyright (c) 2017,2018,2019,2020, Maxim Konakov +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +# This script checks parameters before invoking the "tesseract" tool, because +# the error messages from the tool are rather cryptic. + +# error code +declare -ri CODE=255 # this stops xargs + +# error exit +die() { + echo >&2 "$0: file \"$FILE\": $*" + exit $CODE +} + +# error exit with message prefix cut off +die_() { + die "$(cut -d ' ' -f 2- <<< "$*")" +} + +# display usage string and exit +exit_with_usage() { + cat <<-EOF >&2 + Usage: $(basename "$0") FILE [OPTION]... + Run OCR on the given image FILE. Recognised text is written to STDOUT. + All the given options are passed down to the "tesseract" tool. + EOF + + exit $CODE +} + +# check input parameters +(( $# > 0 )) || exit_with_usage + +# input file name +FILE="$1" +shift + +# tesseract version check +TV="$(tesseract -v | head -q -n 1 | cut -s -d ' ' -f 2)" || exit $CODE + +(( ${TV%%.[0-9]*} >= 4 )) || die "supported \"tesseract\" version is 4.0.0 or later, this one is \"$TV\"" + +# find and check the parameter for tesseract -l option, because otherwise +# the error message will be really confusing. First, find '-l' option +for (( i = 1; i <= $#; i++ )); do + [[ "${!i}" == '-l' ]] && break +done + +(( i == $# )) && die "option error: missing parameter for \"-l\" option" + +# if found, then check the option parameter +if (( i < $# )); then + # get option parameter and split on '+' character + i=$(( i + 1 )) + IFS='+' read -ra LOPT <<< "${!i}" + + # list of supported languages + mapfile -s 1 -t LANGS < <(tesseract --list-langs) + + # match parameters to supported languages + for l in "${LOPT[@]}"; do + for (( i = 0; i < ${#LANGS[@]}; i++ )); do + [[ "$l" == "${LANGS[i]}" ]] && break + done + + (( i < ${#LANGS[@]} )) || die "option error: language \"$l\" is not supported" + done +fi + +# check the input file MIME type +T="$(file -ELb --mime-type "$FILE")" || die_ "$T" + +[[ "$T" =~ ^image/(.{,4}) && "${BASH_REMATCH[1]}" != 'vnd.' ]] \ +|| die "unsupported MIME type \"$T\"" + +# temp file for OCR errors +ERR="$(mktemp --tmpdir)" || exit $CODE +trap 'rm -f "$ERR"' EXIT + +# OCR +tesseract "$FILE" - -c page_separator='' "$@" 2>"$ERR" \ +|| die "tesseract error: $(grep -m 1 -iE '^Error\>' "$ERR")" diff --git a/ocr b/ocr index 91c8797..707204f 100755 --- a/ocr +++ b/ocr @@ -2,7 +2,7 @@ # BSD 3-Clause License -# Copyright (c) 2017,2018,2019, Maxim Konakov +# Copyright (c) 2017,2018,2019,2020, Maxim Konakov # All rights reserved. # Redistribution and use in source and binary forms, with or without @@ -30,92 +30,33 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# version -declare -r VERSION=0.5.0 +set -o pipefail -# helper functions -_show_usage() { +# error exit +die() { + echo >&2 "$0: $*" + exit 1 +} + +# display usage string and exit +exit_with_usage() { cat <<-EOF >&2 - Usage: $0 [OPTION]... DIR - Extracts text from all image files in DIR. - Options: - -L LANG document language (optional, default: 'eng') - -o FILE output file name (optional, default output directed to stdout) - -e EXT input files extension (optional, default: pgm) - -h display this help and exit - -v output version information and exit + Usage: $(basename "$0") DIR [OPTION]... + Run OCR on all .pgm files in the given directory DIR. + Recognised text is written to STDOUT. All the given + options are passed down to the "tesseract" tool. EOF -} -_die() { - echo "ERROR:" "$@" >&2 exit 1 } -# check tesseract version -[[ -n $(which tesseract) ]] || _die "Program 'tesseract' is not installed" - -declare -r TESS_VER=$( tesseract -v 2>&1 | head -q -n 1 | cut -s -d " " -f 2 ) - -# must have major >= 3 -[[ ${TESS_VER%%.[0-9]*} -ge 3 ]] || _die "'tesseract' version $TESS_VER is not supported" - -# options -[[ $# -gt 0 ]] || { _show_usage ; exit 1 ; } - -while getopts "L:o:e:vh" opt -do - case $opt in - L) OCR_LANG=$OPTARG ;; - o) OUT_FILE="$OPTARG" ;; - e) EXT="$OPTARG" ;; - v) echo v$VERSION ; exit 1 ;; - h) _show_usage ; exit 1 ;; - \?) exit 1 ;; - esac -done - -shift $((OPTIND-1)) # remove parsed options and args from $@ list -declare -r EXT=${EXT:-pgm} +# parameter check +(( $# > 0 )) || exit_with_usage -# input directory -case $# in - 1) declare -r INPUT_DIR=$(echo "$1" | sed 's:/*$::') # remove all trailing slashes - [[ -d "$INPUT_DIR" ]] || _die "Input is not a directory, or it does not exist: \"$INPUT_DIR\"" - [[ -n $(find "$INPUT_DIR" -maxdepth 1 -type f -name \*.$EXT) ]] || _die "No files to process in \"$INPUT_DIR\"" - ;; - 0) declare -r INPUT_DIR=. - ;; - *) _die "Cannot process more than one input directory:" "$@" - ;; -esac - -# language -declare -r OCR_LANG=${OCR_LANG:-eng} - -# cleanup input directory -find "$INPUT_DIR" -maxdepth 1 -type f \( -name \*.$EXT.err -o -name \*.$EXT.txt \) -delete +DIR="$1" +shift # OCR -find "$INPUT_DIR" -maxdepth 1 -type f -name \*.$EXT -print0 | xargs -0 -n 1 -P $(nproc) -I % \ - sh -c "tesseract \"%\" \"%\" -l $OCR_LANG 2>\"%.err\" || exit 255" - -if [[ $? -ne 0 ]] -then - find "$INPUT_DIR" -maxdepth 1 -type f -name \*.$EXT.err -exec grep -i -E '^ERROR:?\s+' "{}" \; >&2 - exit 1 -fi - -# cleanup -find "$INPUT_DIR" -maxdepth 1 -type f -name \*.$EXT.err -delete - -# output redirection -if [[ -n "$OUT_FILE" ]] # https://stackoverflow.com/a/20564208 -then - exec 1<&- - exec 1>"$OUT_FILE" || exit 1 -fi - -# output -find "$INPUT_DIR" -maxdepth 1 -type f -name \*.$EXT.txt -print0 | sort -z | xargs -0 cat - +find "$DIR" -maxdepth 1 -name '*.pgm' -print0 \ +| sort -z \ +| xargs -0 -n 1 -I '{}' "$(dirname "$0")/extract-text" '{}' "$@"