Updated to work with tesseract v4.

maxim2266 · Jan 20, 2020 · 9df9bc5 · 9df9bc5
1 parent 8abf3b6
commit 9df9bc5
Show file tree

Hide file tree

Showing 5 changed files with 170 additions and 112 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2017,2018,2019, Maxim Konakov
+Copyright (c) 2017,2018,2019,2020, Maxim Konakov
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@ An ever growing collection of tools to perform [OCR](https://en.wikipedia.org/wi
 
 Doing a good quality OCR in one go is hard. Usually the process includes a number of iterative steps to improve the original
 image quality in order to achieve reasonable recognition, followed by some manual correction of the output text.
-This is not a problem when digitising a receipt for tax return, but processing a book of 500 pages makes
+This is not a problem when digitising a page or two, but processing a book of 500 pages makes
 things a lot harder. Depending on the hardware, the OCR stage itself may take tens of minutes to complete, and the
 other image processing stages may take considerable time as well. The only solution seems to be splitting the process
 into a number of steps where each step can be run (and rerun) independently until a reasonable quality of the output
@@ -20,39 +20,46 @@ In general, the process of converting a document to text includes the following
 The image extraction and OCR steps are relatively easy to automate, and this toolset provides two simple
 scripts to do just that. The image processing step depends heavily on the input image quality and may involve
 a number of different tools. This toolset provides a few scripts that may be useful for building an image processing pipeline.
-Simple text post-processing can be done using the good old Unix `sed` command, at least for the English language,
+Text post-processing can often be done using the good old Unix `sed` command, at least for the English language;
 other languages usually require a more advanced regular expression engine with full Unicode support (think `perl`).
 
 ### Tools
-#### Image extractor (renderer)
+#### Image extractor
 
 ```
-▶ ./render-document
-Usage: render-document [OPTION]... FILE
-Renders pages of a .pdf or .djvu document FILE to grayscale images in PGM format.
+▶ ./extract-images
+Usage: extract-images [OPTION]... FILE
+Renders pages of a .pdf or .djvu FILE to grayscale images in PGM format.
 Options:
   -f N     first page number (optional, default: 1)
   -l N     last page number (optional, default: last page of the document)
   -o DIR   output directory (optional, default: .)
   -h       display this help and exit
 ```
 
-The output is a set of grey-scale images, one per page, with 600dpi resolution. The input document
+The output is a set of grayscale images, one per page, each at 600 dpi resolution. The input document
 does not have to be made of images, any valid document can be rendered by this tool.
 
-#### OCR
+#### OCR on a single image
 ```
-Usage: ocr [OPTION]... DIR
-Extracts text from all image files in DIR.
-Options:
-  -L LANG  document language (optional, default: 'eng')
-  -o FILE  output file name (optional, default output directed to stdout)
-  -e EXT   input files extension (optional, default: pgm)
-  -h       display this help and exit
-  -v       output version information and exit
+▶ ./extract-text
+Usage:	extract-text FILE [OPTION]...
+  Run OCR on the given image FILE. Recognised text is written to STDOUT.
+  All the given options are passed down to the "tesseract" tool.
 ```
 
-A wrapper around `tesseract` tool.
+Essentially, this is a wrapper around the `tesseract` tool. The main purpose of the script
+is to validate command line parameters before passing them down to `tesseract`, because
+the original `tesseract` error messages are rather cryptic.
+
+#### OCR
+```
+▶ ./ocr
+Usage:	ocr DIR [OPTION]...
+  Run OCR on all .pgm files in the given directory DIR.
+  Recognised text is written to STDOUT. All the given
+  options are passed down to the "tesseract" tool.
+```
 
 #### Image geometry calculator for cropping
 ```

diff --git a/render-document → extract-images b/render-document → extract-images
@@ -40,10 +40,10 @@ LAST_PAGE=9999
 WORK_DIR=.
 
 # display usage string and exit
-usage_and_exit() {
+exit_with_usage() {
 	cat <<-EOF >&2
 	Usage: $(basename "$0") [OPTION]... FILE
-	Renders pages of a .pdf or .djvu document FILE to grayscale images in PGM format.
+	Renders pages of a .pdf or .djvu FILE to grayscale images in PGM format.
 	Options:
 	  -f N     first page number (optional, default: 1)
 	  -l N     last page number (optional, default: last page of the document)
@@ -72,14 +72,12 @@ ensure_int_arg() {
 
 # ensure correct filesystem type
 ensure_fs_type() {
-	T="$(stat -L -c %F "$2" 2>&1)"
-
-	(( $? == 0 )) || die_ "$T"
+	T="$(stat -L -c %F "$2" 2>&1)" || die_ "$T"
 	[[ "$1" == "$T" ]] || die "\"$2\" is a $T, not a $1"
 }
 
 # options
-(( $# > 0 )) || usage_and_exit
+(( $# > 0 )) || exit_with_usage
 
 while getopts "f:l:o:h" OPT
 do
@@ -93,19 +91,19 @@ do
 		o)	ensure_fs_type "directory" "$OPTARG"
 			WORK_DIR="$OPTARG"
 			;;
-		h)	usage_and_exit
+		h)	exit_with_usage
 			;;
 		\?)	exit 1
 			;;
 	esac
 done
 
+shift $(( OPTIND - 1 )) # remove parsed options and args from $@ list
+
 # check range of pages
 (( FIRST_PAGE <= LAST_PAGE )) || die "invalid range of pages: $FIRST_PAGE..$LAST_PAGE"
 
 # input file
-shift $(( OPTIND - 1 )) # remove parsed options and args from $@ list
-
 case $# in
 	1)	INPUT_FILE="$1"
 		;;
@@ -116,18 +114,17 @@ case $# in
 esac
 
 # check input file availability and MIME type
-FILE_TYPE="$(file -ELb --mime-type "$INPUT_FILE")"
-
-(( $? == 0 )) || die_ "$FILE_TYPE" # because 'file' prints errors to stdout
+FILE_TYPE="$(file -ELb --mime-type "$INPUT_FILE")" || die_ "$FILE_TYPE" # because 'file' prints errors to stdout
 
 # finally, do the conversion
 case "$FILE_TYPE" in
 	'application/pdf')
 		exec pdftoppm -gray -r 600 -f "$FIRST_PAGE" -l "$LAST_PAGE" "$INPUT_FILE" "$WORK_DIR/page"
 		;;
 	'image/vnd.djvu')
-		exec ddjvu -format=pgm -mode=black -scale=600 -eachpage -page="$FIRST_PAGE-$LAST_PAGE" "$INPUT_FILE" "$WORK_DIR/page-%04d.pgm"
+		exec ddjvu -format=pgm -mode=black -scale=600 -eachpage \
+			-page="$FIRST_PAGE-$LAST_PAGE" "$INPUT_FILE" "$WORK_DIR/page-%04d.pgm"
 		;;
-	*)	die "invalid input file MIME type: $FILE_TYPE"
+	*)	die "unsupported input file MIME type: $FILE_TYPE"
 		;;
 esac
diff --git a/extract-text b/extract-text
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+
+# BSD 3-Clause License
+
+# Copyright (c) 2017,2018,2019,2020, Maxim Konakov
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+# This script checks parameters before invoking the "tesseract" tool, because
+# the error messages from the tool are rather cryptic.
+
+# error code
+declare -ri CODE=255	# this stops xargs
+
+# Report a fatal error on STDERR and terminate the script.
+# The message is prefixed with the script name and the current input
+# file (global $FILE); the exit status is the global $CODE.
+die() {
+	printf '%s\n' "$0: file \"$FILE\": $*" >&2
+	exit $CODE
+}
+
+# Same as die(), but the first space-delimited word of the message is
+# removed before reporting (e.g. for the output of a failed "file" call).
+die_() {
+	local trimmed
+
+	trimmed="$(cut -d ' ' -f 2- <<< "$*")"
+	die "$trimmed"
+}
+# Print the usage text to STDERR and terminate with status $CODE.
+exit_with_usage() {
+	local self
+
+	self="$(basename "$0")"
+
+	cat >&2 <<-EOF
+	Usage:	$self FILE [OPTION]...
+	  Run OCR on the given image FILE. Recognised text is written to STDOUT.
+	  All the given options are passed down to the "tesseract" tool.
+	EOF
+
+	exit $CODE
+}
+
+# at least the input file name must be given
+if (( $# == 0 )); then
+	exit_with_usage
+fi
+
+# the first parameter is the input file; all the others stay in "$@"
+FILE="$1"
+shift
+
+# tesseract version: second space-delimited field on the first line
+# of the "tesseract -v" output
+TV="$(tesseract -v | head -q -n 1 | cut -s -d ' ' -f 2)" || exit $CODE
+
+# the major version (the text before the first ".<digit>") must be 4 or above
+if ! (( ${TV%%.[0-9]*} >= 4 )); then
+	die "supported \"tesseract\" version is 4.0.0 or later, this one is \"$TV\""
+fi
+
+# Validate the parameter of the tesseract "-l" option up front, because the
+# error message from the tool itself is really confusing.
+# Step 1: locate "-l" among the remaining positional parameters; when it is
+# not present, the index ends up one past the last parameter.
+pos=1
+
+while (( pos <= $# )) && [[ "${!pos}" != '-l' ]]; do
+	pos=$(( pos + 1 ))
+done
+
+# "-l" as the very last parameter has no value after it
+if (( pos == $# )); then
+	die "option error: missing parameter for \"-l\" option"
+fi
+
+# Step 2: if "-l" was found, check every requested language
+if (( pos < $# )); then
+	# the option value follows "-l"; languages are separated by '+'
+	pos=$(( pos + 1 ))
+	IFS='+' read -ra LOPT <<< "${!pos}"
+
+	# supported languages, one per line; the first output line is skipped
+	mapfile -s 1 -t LANGS < <(tesseract --list-langs)
+
+	# each requested language must be equal to one of the supported ones
+	for wanted in "${LOPT[@]}"; do
+		found=0
+
+		for known in "${LANGS[@]}"; do
+			if [[ "$wanted" == "$known" ]]; then
+				found=1
+				break
+			fi
+		done
+
+		(( found )) || die "option error: language \"$wanted\" is not supported"
+	done
+fi
+
+# check the input file MIME type; on failure the captured output of "file"
+# is reported as the error message (with its first word cut off by die_)
+T="$(file -ELb --mime-type "$FILE")" || die_ "$T"
+
+# accept any image/* type except the image/vnd.* family (e.g. image/vnd.djvu);
+# a glob match is used because the regex bound "{,4}" without a lower limit
+# is a GNU extension, not POSIX ERE
+[[ "$T" == image/* && "$T" != image/vnd.* ]]	\
+|| die "unsupported MIME type \"$T\""
+
+# temp file for OCR errors; the EXIT trap removes it on every exit path
+ERR="$(mktemp --tmpdir)" || exit $CODE
+trap 'rm -f "$ERR"' EXIT
+
+# OCR: recognised text goes to STDOUT ("-"), diagnostics are collected in $ERR
+if ! tesseract "$FILE" - -c page_separator='' "$@" 2>"$ERR"; then
+	# prefer the first line starting with "Error"; when there is none, fall
+	# back to the first non-empty line so the reported reason is never blank
+	MSG="$(grep -m 1 -iE '^Error\>' "$ERR")" || MSG="$(grep -m 1 . "$ERR")"
+	die "tesseract error: ${MSG:-unknown failure}"
+fi
diff --git a/ocr b/ocr
@@ -2,7 +2,7 @@
 
 # BSD 3-Clause License
 
-# Copyright (c) 2017,2018,2019, Maxim Konakov
+# Copyright (c) 2017,2018,2019,2020, Maxim Konakov
 # All rights reserved.
 
 # Redistribution and use in source and binary forms, with or without
@@ -30,92 +30,33 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# version
-declare -r VERSION=0.5.0
+set -o pipefail
 
-# helper functions
-_show_usage() {
+# Report a fatal error on STDERR, prefixed with the script name,
+# and terminate with status 1.
+die() {
+	printf '%s\n' "$0: $*" >&2
+	exit 1
+}
+
+# display usage string and exit
+exit_with_usage() {
 	cat <<-EOF >&2
-	Usage: $0 [OPTION]... DIR
-	Extracts text from all image files in DIR.
-	Options:
-	  -L LANG  document language (optional, default: 'eng')
-	  -o FILE  output file name (optional, default output directed to stdout)
-	  -e EXT   input files extension (optional, default: pgm)
-	  -h       display this help and exit
-	  -v       output version information and exit
+	Usage:	$(basename "$0") DIR [OPTION]...
+	  Run OCR on all .pgm files in the given directory DIR.
+	  Recognised text is written to STDOUT. All the given
+	  options are passed down to the "tesseract" tool.
 	EOF
-}
 
-_die() {
-	echo "ERROR:" "$@" >&2
 	exit 1
 }
 
-# check tesseract version
-[[ -n $(which tesseract) ]] || _die "Program 'tesseract' is not installed"
-
-declare -r TESS_VER=$( tesseract -v 2>&1 | head -q -n 1 | cut -s -d " " -f 2 )
-
-# must have major >= 3
-[[ ${TESS_VER%%.[0-9]*} -ge 3 ]] || _die "'tesseract' version $TESS_VER is not supported"
-
-# options
-[[ $# -gt 0 ]] || { _show_usage ; exit 1 ; }
-
-while getopts "L:o:e:vh" opt
-do
-	case $opt in
-		L)	OCR_LANG=$OPTARG ;;
-		o)	OUT_FILE="$OPTARG" ;;
-		e)	EXT="$OPTARG" ;;
-		v)	echo v$VERSION ; exit 1 ;;
-		h)	_show_usage ; exit 1 ;;
-		\?)	exit 1 ;;
-	esac
-done
-
-shift $((OPTIND-1)) # remove parsed options and args from $@ list
-declare -r EXT=${EXT:-pgm}
+# parameter check
+(( $# > 0 )) || exit_with_usage
 
-# input directory
-case $# in
-	1)	declare -r INPUT_DIR=$(echo "$1" | sed 's:/*$::')	# remove all trailing slashes
-		[[ -d "$INPUT_DIR" ]] || _die "Input is not a directory, or it does not exist: \"$INPUT_DIR\""
-		[[ -n $(find "$INPUT_DIR" -maxdepth 1 -type f -name \*.$EXT) ]] || _die "No files to process in \"$INPUT_DIR\""
-		;;
-	0)	declare -r INPUT_DIR=.
-		;;
-	*)	_die "Cannot process more than one input directory:" "$@"
-		;;
-esac
-
-# language
-declare -r OCR_LANG=${OCR_LANG:-eng}
-
-# cleanup input directory
-find "$INPUT_DIR" -maxdepth 1 -type f \( -name \*.$EXT.err -o -name \*.$EXT.txt \) -delete
+DIR="$1"
+shift
 
 # OCR
-find "$INPUT_DIR" -maxdepth 1 -type f -name \*.$EXT -print0 | xargs -0 -n 1 -P $(nproc) -I % \
-	sh -c "tesseract \"%\" \"%\" -l $OCR_LANG 2>\"%.err\" || exit 255"
-
-if [[ $? -ne 0 ]]
-then
-	find "$INPUT_DIR" -maxdepth 1 -type f -name \*.$EXT.err -exec grep -i -E '^ERROR:?\s+' "{}" \; >&2
-	exit 1
-fi
-
-# cleanup
-find "$INPUT_DIR" -maxdepth 1 -type f -name \*.$EXT.err -delete
-
-# output redirection
-if [[ -n "$OUT_FILE" ]]	# https://stackoverflow.com/a/20564208
-then
-	exec 1<&-
-	exec 1>"$OUT_FILE" || exit 1
-fi
-
-# output
-find "$INPUT_DIR" -maxdepth 1 -type f -name \*.$EXT.txt -print0 | sort -z | xargs -0 cat
-
+# Run extract-text once per .pgm file, in sorted order. "-I" already makes
+# xargs run one command per input item and is mutually exclusive with "-n",
+# so "-n 1" is not given (GNU xargs ignores it and prints a warning).
+find "$DIR" -maxdepth 1 -name '*.pgm' -print0	\
+| sort -z	\
+| xargs -0 -I '{}' "$(dirname "$0")/extract-text" '{}' "$@"