gnu: tesseract-ocr: Make the default install minimally useful.

Fixes <https://issues.guix.gnu.org/47536>.

* gnu/packages/ocr.scm (tesseract-ocr)
[phases]{adjust-TESSDATA_PREFIX-macro}: New phase.
{install-minimal-tessdata}: New phase.
[native-inputs]: Add tesseract-ocr-tessdata-fast.
[native-search-paths]: New field.
[description]: Mention how to add support for more languages.
author: Maxim Cournoyer <maxim.cournoyer@gmail.com> 2022-08-11 23:58:24 -0400
committer: Maxim Cournoyer <maxim.cournoyer@gmail.com> 2022-08-12 15:43:48 -0400
commit: ff0600c5efb519925c270e9f54f43ecf096e564e (patch)
tree: 541caf39c0afb17e12d675c8fdfc24288fd0a1a4 /gnu/packages/ocr.scm
parent: a6b6b0e89e6a3aefa0241de9f171424ad99be30c (diff)
download: guix-ff0600c5efb519925c270e9f54f43ecf096e564e.tar guix-ff0600c5efb519925c270e9f54f43ecf096e564e.tar.gz
1 file changed, 30 insertions, 3 deletions
diff --git a/gnu/packages/ocr.scm b/gnu/packages/ocr.scm
index e2c9f561cc..21d257ef24 100644
--- a/gnu/packages/ocr.scm
+++ b/gnu/packages/ocr.scm
@@ -132,6 +132,15 @@ models for the Tesseract OCR Engine.")
               (substitute* "configure.ac"
                 (("AC_SUBST\\(\\[XML_CATALOG_FILES])")
                  ""))))
+          (add-after 'unpack 'adjust-TESSDATA_PREFIX-macro
+            (lambda _
+              ;; Use a deeper TESSDATA_PREFIX hierarchy so that a more
+              ;; specific search-path than '/share' can be specified.  The
+              ;; build system uses CPPFLAGS for itself, so we can't simply set
+              ;; a make flag.
+              (substitute* "Makefile.am"
+                (("-DTESSDATA_PREFIX='\"@datadir@\"'")
+                 "-DTESSDATA_PREFIX='\"@datadir@/tesseract-ocr\"'"))))
           (add-after 'build 'build-training
             (lambda* (#:key parallel-build? #:allow-other-keys)
               (define n (if parallel-build? (number->string
@@ -140,7 +149,18 @@ models for the Tesseract OCR Engine.")
               (invoke "make" "-j" n "training")))
           (add-after 'install 'install-training
             (lambda _
-              (invoke "make" "training-install"))))))
+              (invoke "make" "training-install")))
+          (add-after 'install 'install-minimal-tessdata
+            ;; tesseract-ocr cannot be used without its trained models data;
+            ;; install the English language as a minimal base which can be
+            ;; extended via TESSDATA_PREFIX.
+            (lambda* (#:key native-inputs inputs #:allow-other-keys)
+              (define eng.traineddata
+                "/share/tesseract-ocr/tessdata/eng.traineddata")
+              (install-file (search-input-file (or native-inputs inputs)
+                                               eng.traineddata)
+                            (dirname (string-append #$output
+                                                    eng.traineddata))))))))
     (native-inputs
      (list asciidoc
            autoconf
@@ -152,13 +172,18 @@ models for the Tesseract OCR Engine.")
            libtool
            libxml2                      ;for XML_CATALOG_FILES
            libxslt
-           pkg-config))
+           pkg-config
+           tesseract-ocr-tessdata-fast))
     (inputs
      (list cairo
            icu4c
            leptonica
            pango
            python-wrapper))
+    (native-search-paths (list (search-path-specification
+                                (variable "TESSDATA_PREFIX")
+                                (files (list "share/tesseract-ocr/tessdata"))
+                                (separator #f)))) ;single value
     (home-page "https://github.com/tesseract-ocr/tesseract")
     (synopsis "Optical character recognition engine")
     (description
@@ -166,7 +191,9 @@ models for the Tesseract OCR Engine.")
 high accuracy.  It supports many languages, output text formatting, hOCR
 positional information and page layout analysis.  Several image formats are
 supported through the Leptonica library.  It can also detect whether text is
-monospaced or proportional.")
+monospaced or proportional.  Support for the English language is included by
+default.  To add support for more languages, the
+@code{tesseract-ocr-tessdata-fast} package should be installed.")
     (license license:asl2.0)))
 
 (define-public gimagereader
author	Maxim Cournoyer <maxim.cournoyer@gmail.com>	2022-08-11 23:58:24 -0400
committer	Maxim Cournoyer <maxim.cournoyer@gmail.com>	2022-08-12 15:43:48 -0400
commit	ff0600c5efb519925c270e9f54f43ecf096e564e (patch)
tree	541caf39c0afb17e12d675c8fdfc24288fd0a1a4 /gnu/packages/ocr.scm
parent	a6b6b0e89e6a3aefa0241de9f171424ad99be30c (diff)
download	guix-ff0600c5efb519925c270e9f54f43ecf096e564e.tar guix-ff0600c5efb519925c270e9f54f43ecf096e564e.tar.gz