summary refs log tree commit diff
diff options
context:
space:
mode:
authorMaxim Cournoyer <maxim.cournoyer@gmail.com>2022-08-11 17:53:21 -0400
committerMaxim Cournoyer <maxim.cournoyer@gmail.com>2022-08-12 00:38:46 -0400
commitf7c027617da44ed684bbb9b35c603b790365fdbf (patch)
tree51b85b1e449a02c2c13e4f0241cbe82f7c710b43
parent887dbf4d802d1061e08eb47026f1adab2c43d791 (diff)
downloadguix-f7c027617da44ed684bbb9b35c603b790365fdbf.tar.gz
gnu: tesseract-ocr: Update to 5.2.0.
* gnu/packages/ocr.scm (tesseract-ocr): Update to 5.2.0.
[inputs, native-inputs]: Move after arguments.  Use new style inputs.
[arguments]: Use gexps.
[configure-flags]: Add --disable-static.
[phases]{fix-docbook}: Replace phase with...
{do-not-override-xml-catalog-files}: ... this new phase.
{build-training}: Move after build phase.  Enable parallel build.
{trailing-install}: Move after install phase.
[native-inputs]: Add libxml2.
-rw-r--r--gnu/packages/ocr.scm124
1 files changed, 63 insertions, 61 deletions
diff --git a/gnu/packages/ocr.scm b/gnu/packages/ocr.scm
index 11595e87e5..e28bd17668 100644
--- a/gnu/packages/ocr.scm
+++ b/gnu/packages/ocr.scm
@@ -5,6 +5,7 @@
 ;;; Copyright © 2019 Alex Vong <alexvong1995@gmail.com>
 ;;; Copyright © 2021 Andy Tai <atai@atai.org>
 ;;; Copyright © 2021, 2022 Nicolas Goaziou <mail@nicolasgoaziou.fr>
+;;; Copyright © 2022 Maxim Cournoyer <maxim.cournoyer@gmail.com>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -74,71 +75,72 @@ it produces text in 8-bit or UTF-8 formats.")
     (license license:gpl3+)))
 
 (define-public tesseract-ocr
-  ;; There are useful commits beyond the last official stable release.
-  (let ((commit "97079fa353557af6df86fd20b5d2e0dff5d8d5df")
-        (revision "1"))
-    (package
-      (name "tesseract-ocr")
-      (version (git-version "4.1.1" revision commit))
-      (source
-       (origin
-         (method git-fetch)
-         (uri (git-reference
-               (url "https://github.com/tesseract-ocr/tesseract")
-               (commit commit)))
-         (file-name (git-file-name name version))
-         (sha256
-          (base32
-           "11137a4aaay7qp64vdjd83hz1l089nzi5a0ql0qgk8gn79pyhi98"))))
-      (build-system gnu-build-system)
-      (inputs
-       `(("cairo" ,cairo)
-         ("icu" ,icu4c)
-         ("leptonica" ,leptonica)
-         ("pango" ,pango)
-         ("python-wrapper" ,python-wrapper)))
-      (native-inputs
-       `(("asciidoc" ,asciidoc)
-         ("autoconf" ,autoconf)
-         ("automake" ,automake)
-         ("docbook-xsl" ,docbook-xsl)
-         ("libarchive" ,libarchive)
-         ("libcurl" ,curl)
-         ("libtool" ,libtool)
-         ("libtiff" ,libtiff)
-         ("pkg-config" ,pkg-config)
-         ("xsltproc" ,libxslt)))
-      (arguments
-       `(#:configure-flags
-         (let ((leptonica (assoc-ref %build-inputs "leptonica")))
-           (list (string-append "LIBLEPT_HEADERSDIR=" leptonica "/include")))
-         #:tests? #f ; Tests currently result in a segfault
-         #:phases
-         (modify-phases %standard-phases
-           (add-after 'unpack 'fix-docbook
-             (lambda* (#:key inputs #:allow-other-keys)
-               ;; Don't attempt to download XSL schema.
-               (substitute* "doc/Makefile.am"
-                 (("http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl")
-                  (string-append (assoc-ref inputs "docbook-xsl")
-                                 "/xml/xsl/docbook-xsl-"
-                                 ,(package-version docbook-xsl)
-                                 "/manpages/docbook.xsl")))))
-           (add-after 'install 'build-training
-             (lambda _
-               (invoke "make" "training")))
-           (add-after 'build-training 'install-training
-             (lambda _
-               (invoke "make" "training-install"))))))
-      (home-page "https://github.com/tesseract-ocr/tesseract")
-      (synopsis "Optical character recognition engine")
-      (description
-       "Tesseract is an optical character recognition (OCR) engine with very
+  (package
+    (name "tesseract-ocr")
+    (version "5.2.0")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/tesseract-ocr/tesseract")
+             (commit version)))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32
+         "0dai539h07lqj8lyhznd3wbwdpqr78qrsczq78rsmsryqvmdbyaa"))))
+    (build-system gnu-build-system)
+    (arguments
+     (list
+      #:configure-flags
+      #~(list (string-append "LIBLEPT_HEADERSDIR="
+                             #$(this-package-input "leptonica") "/include")
+              "--disable-static")       ;avoid 6 MiB static archive
+      ;; The unit tests are disabled because they require building bundled
+      ;; third party libraries.
+      #:tests? #f
+      #:phases
+      #~(modify-phases %standard-phases
+          (add-after 'unpack 'do-not-override-xml-catalog-files
+            (lambda _
+              (substitute* "configure.ac"
+                (("AC_SUBST\\(\\[XML_CATALOG_FILES])")
+                 ""))))
+          (add-after 'build 'build-training
+            (lambda* (#:key parallel-build? #:allow-other-keys)
+              (define n (if parallel-build? (number->string
+                                             (parallel-job-count))
+                            "1"))
+              (invoke "make" "-j" n "training")))
+          (add-after 'install 'install-training
+            (lambda _
+              (invoke "make" "training-install"))))))
+    (native-inputs
+     (list asciidoc
+           autoconf
+           automake
+           curl
+           docbook-xsl
+           libarchive
+           libtiff
+           libtool
+           libxml2                      ;for XML_CATALOG_FILES
+           libxslt
+           pkg-config))
+    (inputs
+     (list cairo
+           icu4c
+           leptonica
+           pango
+           python-wrapper))
+    (home-page "https://github.com/tesseract-ocr/tesseract")
+    (synopsis "Optical character recognition engine")
+    (description
+     "Tesseract is an optical character recognition (OCR) engine with very
 high accuracy.  It supports many languages, output text formatting, hOCR
 positional information and page layout analysis.  Several image formats are
 supported through the Leptonica library.  It can also detect whether text is
 monospaced or proportional.")
-      (license license:asl2.0))))
+    (license license:asl2.0)))
 
 (define-public gimagereader
   (package