diff options
Diffstat (limited to 'gnu/packages/ocr.scm')
-rw-r--r-- | gnu/packages/ocr.scm | 164 |
1 files changed, 142 insertions, 22 deletions
diff --git a/gnu/packages/ocr.scm b/gnu/packages/ocr.scm index dc4930918a..15e1811d6b 100644 --- a/gnu/packages/ocr.scm +++ b/gnu/packages/ocr.scm @@ -3,6 +3,8 @@ ;;; Copyright © 2016, 2020 Efraim Flashner <efraim@flashner.co.il> ;;; Copyright © 2019 Tobias Geerinckx-Rice <me@tobias.gr> ;;; Copyright © 2019 Alex Vong <alexvong1995@gmail.com> +;;; Copyright © 2021 Andy Tai <atai@atai.org> +;;; Copyright © 2021 Nicolas Goaziou <mail@nicolasgoaziou.fr> ;;; ;;; This file is part of GNU Guix. ;;; @@ -24,10 +26,29 @@ #:use-module (guix packages) #:use-module (guix download) #:use-module (guix git-download) + #:use-module (guix build-system cmake) #:use-module (guix build-system gnu) #:use-module (guix build-system python) + #:use-module (gnu packages) + #:use-module (gnu packages autotools) + #:use-module (gnu packages backup) + #:use-module (gnu packages check) #:use-module (gnu packages compression) + #:use-module (gnu packages curl) + #:use-module (gnu packages djvu) + #:use-module (gnu packages docbook) + #:use-module (gnu packages documentation) + #:use-module (gnu packages enchant) + #:use-module (gnu packages gettext) + #:use-module (gnu packages glib) + #:use-module (gnu packages gtk) + #:use-module (gnu packages icu4c) + #:use-module (gnu packages pdf) + #:use-module (gnu packages pkg-config) #:use-module (gnu packages python) + #:use-module (gnu packages qt) + #:use-module (gnu packages scanner) + #:use-module (gnu packages xml) #:use-module (gnu packages image)) (define-public ocrad @@ -52,34 +73,132 @@ it produces text in 8-bit or UTF-8 formats.") (license license:gpl3+))) (define-public tesseract-ocr + ;; There are useful commits beyond the last official stable release. + (let ((commit "97079fa353557af6df86fd20b5d2e0dff5d8d5df") + (revision "1")) + (package + (name "tesseract-ocr") + (version (git-version "4.1.1" revision commit)) + (source + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/tesseract-ocr/tesseract") + (commit commit))) + (file-name (git-file-name name version)) + (sha256 + (base32 + "11137a4aaay7qp64vdjd83hz1l089nzi5a0ql0qgk8gn79pyhi98")))) + (build-system gnu-build-system) + (inputs + `(("cairo" ,cairo) + ("icu" ,icu4c) + ("leptonica" ,leptonica) + ("pango" ,pango) + ("python-wrapper" ,python-wrapper))) + (native-inputs + `(("asciidoc" ,asciidoc) + ("autoconf" ,autoconf) + ("automake" ,automake) + ("docbook-xsl" ,docbook-xsl) + ("libarchive" ,libarchive) + ("libcurl" ,curl) + ("libtool" ,libtool) + ("libtiff" ,libtiff) + ("pkg-config" ,pkg-config) + ("xsltproc" ,libxslt))) + (arguments + `(#:configure-flags + (let ((leptonica (assoc-ref %build-inputs "leptonica"))) + (list (string-append "LIBLEPT_HEADERSDIR=" leptonica "/include"))) + #:tests? #f ; Tests currently result in a segfault + #:phases + (modify-phases %standard-phases + (add-after 'unpack 'fix-docbook + (lambda* (#:key inputs #:allow-other-keys) + ;; Don't attempt to download XSL schema. + (substitute* "doc/Makefile.am" + (("http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl") + (string-append (assoc-ref inputs "docbook-xsl") + "/xml/xsl/docbook-xsl-" + ,(package-version docbook-xsl) + "/manpages/docbook.xsl"))))) + (add-after 'install 'build-training + (lambda _ + (invoke "make" "training"))) + (add-after 'build-training 'install-training + (lambda _ + (invoke "make" "training-install")))))) + (home-page "https://github.com/tesseract-ocr/tesseract") + (synopsis "Optical character recognition engine") + (description + "Tesseract is an optical character recognition (OCR) engine with very +high accuracy. It supports many languages, output text formatting, hOCR +positional information and page layout analysis. Several image formats are +supported through the Leptonica library. It can also detect whether text is +monospaced or proportional.") + (license license:asl2.0)))) + +(define-public gimagereader (package - (name "tesseract-ocr") - (version "3.04.01") + (name "gimagereader") + (version "3.3.1") (source (origin - (method git-fetch) - (uri (git-reference - (url "https://github.com/tesseract-ocr/tesseract") - (commit version))) - (file-name (git-file-name name version)) + (method url-fetch) + (uri (string-append + "https://github.com/manisandro/gImageReader/releases" + "/download/v" version "/" + "gimagereader-" version ".tar.xz")) (sha256 - (base32 "0h1x4z1h86n2gwknd0wck6gykkp99bmm02lg4a47a698g4az6ybv")))) - (build-system gnu-build-system) - (inputs - `(("leptonica" ,leptonica))) + (base32 "1pghffb55k3wq33nbn9fi0lmjbldpmvqs2msnvss8bxz1k1ck23n")))) + (build-system cmake-build-system) (arguments - '(#:configure-flags - (let ((leptonica (assoc-ref %build-inputs "leptonica"))) - (list (string-append "LIBLEPT_HEADERSDIR=" leptonica "/include"))))) - (home-page "https://github.com/tesseract-ocr/tesseract") - (synopsis "Optical character recognition engine") + `(#:tests? #f ;no test + #:configure-flags (list "-DENABLE_VERSIONCHECK=0") + #:phases + (modify-phases %standard-phases + (add-after 'unpack 'fix-build + ;; XXX: Prevent compilation error: "incomplete type ‘QUrl’ used in + ;; nested name specifier". Fixed upstream as + ;; 6209e25dab20b233e399ff36fabe4252db0f9e44. It can be removed in + ;; release 3.3.2+. + (lambda _ + (with-directory-excursion "qt/src/hocr" + (substitute* '("HOCROdtExporter.cc" "HOCRTextExporter.cc") + (("#include <QMessageBox>\n" all) + (string-append all "#include <QUrl>\n")))) + #t))))) + (native-inputs + `(("gettext" ,gettext-minimal) + ("intltool" ,intltool) + ("pkg-config" ,pkg-config))) + (inputs + `(("enchant" ,enchant) + ("djvulibre" ,djvulibre) + ("leptonica" ,leptonica) + ("podofo" ,podofo) + ("poppler-qt5" ,poppler-qt5) + ("sane-backends" ,sane-backends) + ("qtbase" ,qtbase) + ("qtspell" ,qtspell) + ("quazip" ,quazip) + ("tesseract" ,tesseract-ocr))) + (home-page "https://github.com/manisandro/gImageReader") + (synopsis "Qt front-end to tesseract-ocr") (description - "Tesseract is an optical character recognition (OCR) engine with very -high accuracy. It supports many languages, output text formatting, hOCR -positional information and page layout analysis. Several image formats are -supported through the Leptonica library. It can also detect whether text is -monospaced or proportional.") - (license license:asl2.0))) + "gImageReader is a Qt front-end to Tesseract optical character +recognition (OCR) software. + +gImageReader supports automatic page layout detection but the user can +also manually define and adjust the recognition regions. It is +possible to import images from disk, scanning devices, clipboard and +screenshots. gImageReader also supports multipage PDF documents. +Recognized text is displayed directly next to the image and basic text +editing including search/replace and removing of line breaks is +possible. Spellchecking for the output text is also supported if the +corresponding dictionaries are installed.") + (license license:gpl3+))) (define-public zinnia (let* ((commit "581faa8f6f15e4a7b21964be3a5ec36265c80e5b") @@ -151,3 +270,4 @@ that allows us to create any hand-written recognition systems with low-cost.") #t))))) (inputs `(("zinnia" ,zinnia))))) + |