summary refs log tree commit diff
path: root/gnu/packages/bioinformatics.scm
diff options
context:
space:
mode:
Diffstat (limited to 'gnu/packages/bioinformatics.scm')
-rw-r--r-- gnu/packages/bioinformatics.scm 543
1 files changed, 477 insertions, 66 deletions
diff --git a/gnu/packages/bioinformatics.scm b/gnu/packages/bioinformatics.scm
index 313c70784a..0d09a98dc5 100644
--- a/gnu/packages/bioinformatics.scm
+++ b/gnu/packages/bioinformatics.scm
@@ -3,7 +3,7 @@
 ;;; Copyright © 2015, 2016, 2017, 2018 Ben Woodcroft <donttrustben@gmail.com>
 ;;; Copyright © 2015, 2016, 2018, 2019, 2020 Pjotr Prins <pjotr.guix@thebird.nl>
 ;;; Copyright © 2015 Andreas Enge <andreas@enge.fr>
-;;; Copyright © 2016, 2020 Roel Janssen <roel@gnu.org>
+;;; Copyright © 2016, 2020, 2021 Roel Janssen <roel@gnu.org>
 ;;; Copyright © 2016, 2017, 2018, 2019, 2020, 2021 Efraim Flashner <efraim@flashner.co.il>
 ;;; Copyright © 2016, 2020 Marius Bakke <mbakke@fastmail.com>
 ;;; Copyright © 2016, 2018 Raoul Bonnal <ilpuccio.febo@gmail.com>
@@ -130,6 +130,7 @@
   #:use-module (gnu packages qt)
   #:use-module (gnu packages rdf)
   #:use-module (gnu packages readline)
+  #:use-module (gnu packages rsync)
   #:use-module (gnu packages ruby)
   #:use-module (gnu packages serialization)
   #:use-module (gnu packages shells)
@@ -144,6 +145,7 @@
   #:use-module (gnu packages tls)
   #:use-module (gnu packages vim)
   #:use-module (gnu packages web)
+  #:use-module (gnu packages wget)
   #:use-module (gnu packages xml)
   #:use-module (gnu packages xorg)
   #:use-module (srfi srfi-1)
@@ -311,7 +313,7 @@ BAM files.")
 (define-public bcftools
   (package
     (name "bcftools")
-    (version "1.11")
+    (version "1.12")
     (source (origin
               (method url-fetch)
               (uri (string-append "https://github.com/samtools/bcftools/"
@@ -319,11 +321,11 @@ BAM files.")
                                   version "/bcftools-" version ".tar.bz2"))
               (sha256
                (base32
-                "0r508mp15pqzf8r1269kb4v5naw9zsvbwd3cz8s1yj7carsf9viw"))
+                "1x94l1hy2pi3lbz0sxlbw0g6q5z5apcrhrlcwda94ns9n4r6a3ks"))
               (modules '((guix build utils)))
               (snippet '(begin
                           ;; Delete bundled htslib.
-                          (delete-file-recursively "htslib-1.11")
+                          (delete-file-recursively "htslib-1.12")
                           #t))))
     (build-system gnu-build-system)
     (arguments
@@ -437,7 +439,7 @@ computational cluster.")
 (define-public bedtools
   (package
     (name "bedtools")
-    (version "2.29.2")
+    (version "2.30.0")
     (source (origin
               (method url-fetch)
               (uri (string-append "https://github.com/arq5x/bedtools2/releases/"
@@ -445,7 +447,7 @@ computational cluster.")
                                   "bedtools-" version ".tar.gz"))
               (sha256
                (base32
-                "0m3hk6548846w83a9s5drsczvy67n2azx41kj71n03klb2gbzwg3"))))
+                "1f2hh79l7dn147c2xyfgf5wfjvlqfw32kjfnnh2n1qy6rpzx2fik"))))
     (build-system gnu-build-system)
     (arguments
      '(#:test-target "test"
@@ -457,7 +459,7 @@ computational cluster.")
     (native-inputs
      `(("python" ,python-wrapper)))
     (inputs
-     `(("samtools" ,samtools-1.9)
+     `(("samtools" ,samtools)
        ("zlib" ,zlib)))
     (home-page "https://github.com/arq5x/bedtools2")
     (synopsis "Tools for genome analysis and arithmetic")
@@ -553,6 +555,40 @@ input and output BAMs must adhere to the PacBio BAM format specification.
 Non-PacBio BAMs will cause exceptions to be thrown.")
     (license license:bsd-3)))
 
+(define-public pbgzip
+  (let ((commit "2b09f97b5f20b6d83c63a5c6b408d152e3982974")) ; pinned commit
+    (package
+      (name "pbgzip")
+      (version (git-version "0.0.0" "0" commit)) ; base version, revision "0"
+      (source (origin
+                (method git-fetch)
+                (uri (git-reference
+                      (url "https://github.com/nh13/pbgzip")
+                      (commit commit)))
+                (file-name (git-file-name name version))
+                (sha256
+                 (base32
+                  "1mlmq0v96irbz71bgw5zcc43g1x32zwnxx21a5p1f1ch4cikw1yd"))))
+      (build-system gnu-build-system)
+      (native-inputs
+       `(("autoconf" ,autoconf)        ; NOTE(review): presumably needed to
+         ("automake" ,automake)))      ; generate configure -- TODO confirm
+      (inputs
+       `(("zlib" ,zlib)))
+      (home-page "https://github.com/nh13/pbgzip")
+      (synopsis "Parallel Block GZIP")
+      (description "This package implements parallel block gzip.  For many
+formats, in particular genomics data formats, data are compressed in
+fixed-length blocks such that they can be easily indexed based on a (genomic)
+coordinate order, since typically each block is sorted according to this order.
+This allows for each block to be individually compressed (deflated), or more
+importantly, decompressed (inflated), with the latter enabling random retrieval
+of data in large files (gigabytes to terabytes).  @code{pbgzip} is not limited
+to any particular format, but certain features are tailored to genomics data
+formats when enabled.  Parallel decompression is somewhat faster, but the true
+speedup comes during compression.")
+      (license license:expat))))
+
 (define-public blasr-libcpp
   (package
     (name "blasr-libcpp")
@@ -797,13 +833,13 @@ intended to behave exactly the same as the original BWK awk.")
 (define-public python-pybedtools
   (package
     (name "python-pybedtools")
-    (version "0.8.1")
+    (version "0.8.2")
     (source (origin
               (method url-fetch)
               (uri (pypi-uri "pybedtools" version))
               (sha256
                (base32
-                "14w5i40gi25clrr7h4wa2pcpnyipya8hrqi7nq77553zc5wf0df0"))))
+                "0wc7z8g8prgdx7n5chjva2fdq03wiwhqisjjxzkjg1j5k5ha7151"))))
     (build-system python-build-system)
     (arguments
      `(#:modules ((srfi srfi-26)
@@ -814,13 +850,6 @@ intended to behave exactly the same as the original BWK awk.")
        (modify-phases %standard-phases
          (add-after 'unpack 'disable-broken-tests
            (lambda _
-             (substitute* "pybedtools/test/test_scripts.py"
-               ;; This test freezes.
-               (("def test_intron_exon_reads")
-                "def _do_not_test_intron_exon_reads")
-               ;; This test fails in the Python 2 build.
-               (("def test_venn_mpl")
-                "def _do_not_test_venn_mpl"))
              (substitute* "pybedtools/test/test_helpers.py"
                ;; Requires internet access.
                (("def test_chromsizes")
@@ -1097,6 +1126,31 @@ converted to Alignment objects, and so on.  This means that the objects
 provide a coordinated and extensible framework to do computational biology.")
       (license license:perl-license))))
 
+(define-public perl-bio-db-hts
+  (package
+    (name "perl-bio-db-hts")
+    (version "3.01")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (string-append "mirror://cpan/authors/id/A/AV/AVULLO/Bio-DB-HTS-"
+                           version ".tar.gz"))
+       (sha256
+        (base32
+         "0hjg0igfkpvh27zdkdr6pa7cqm9n6r7cwz0np74cl4wmawgvr9hj"))))
+    (build-system perl-build-system)
+    (native-inputs
+     `(("perl-module-build" ,perl-module-build)
+       ("pkg-config" ,pkg-config)))
+    (propagated-inputs                  ; both entries are propagated to users
+     `(("bioperl-minimal" ,bioperl-minimal)
+       ("htslib" ,htslib-1.9)))         ; the htslib-1.9 variant, not plain htslib
+    (home-page "https://metacpan.org/release/Bio-DB-HTS")
+    (synopsis "Perl interface to HTS library for DNA sequencing")
+    (description "This is a Perl interface to the HTS library for DNA
+sequencing.")
+    (license license:asl2.0)))
+
 (define-public python-biopython
   (package
     (name "python-biopython")
@@ -1324,7 +1378,7 @@ package provides command line tools using the Bio++ library.")
 (define-public blast+
   (package
     (name "blast+")
-    (version "2.10.1")
+    (version "2.11.0")
     (source (origin
               (method url-fetch)
               (uri (string-append
@@ -1332,7 +1386,7 @@ package provides command line tools using the Bio++ library.")
                     version "/ncbi-blast-" version "+-src.tar.gz"))
               (sha256
                (base32
-                "11kvrrl0mcwww6530r55hccpg3x3msmhr3051fwnjbq8rzg2j1qi"))
+                "0m0r9vkw631ky1za1wilsfk9k9spwqh22nkrb9a57rbwmrc1i3nq"))
               (modules '((guix build utils)))
               (snippet
                '(begin
@@ -4421,7 +4475,7 @@ performance.")
 (define-public htslib
   (package
     (name "htslib")
-    (version "1.11")
+    (version "1.12")
     (source (origin
               (method url-fetch)
               (uri (string-append
@@ -4429,7 +4483,7 @@ performance.")
                     version "/htslib-" version ".tar.bz2"))
               (sha256
                (base32
-                "1mrq4mihzx37yqhj3sfz6da6mw49niia808bzsw2gkkgmadxvyng"))))
+                "1jplnvizgr0fyyvvmkfmnsywrrpqhid3760vw15bllz98qdi9012"))))
     (build-system gnu-build-system)
     ;; Let htslib translate "gs://" and "s3://" to regular https links with
     ;; "--enable-gcs" and "--enable-s3". For these options to work, we also
@@ -5846,7 +5900,7 @@ to the user's query of interest.")
 (define-public samtools
   (package
     (name "samtools")
-    (version "1.11")
+    (version "1.12")
     (source
      (origin
        (method url-fetch)
@@ -5855,11 +5909,11 @@ to the user's query of interest.")
                        version "/samtools-" version ".tar.bz2"))
        (sha256
         (base32
-         "1dp5wknak4arnw5ghhif9mmljlfnw5bgm91wib7z0j8wdjywx0z2"))
+         "1jrdj2idpma5ja9cg0rr73b565vdbr9wyy6zig54bidicc2pg8vd"))
        (modules '((guix build utils)))
        (snippet '(begin
                    ;; Delete bundled htslib.
-                   (delete-file-recursively "htslib-1.11")
+                   (delete-file-recursively "htslib-1.12")
                    #t))))
     (build-system gnu-build-system)
     (arguments
@@ -7175,6 +7229,43 @@ clustering analysis, differential analysis, motif inference and exploration of
 single cell ATAC-seq sequencing data.")
     (license license:gpl3)))
 
+(define-public r-shinycell
+  (let ((commit
+         "aecbd56e66802f28e397f5ae1f19403aadd12163")
+        (revision "1"))
+    (package
+      (name "r-shinycell")
+      (version (git-version "2.0.0" revision commit)) ; 2.0.0 + revision + commit
+      (source
+       (origin
+         (method git-fetch)
+         (uri (git-reference
+               (url "https://github.com/SGDDNB/ShinyCell")
+               (commit commit)))
+         (file-name (git-file-name name version))
+         (sha256
+          (base32
+           "13jn2ikmvljnzayk485g1mmq5abcp9m1b8n1djdb1agmn83zaki5"))))
+      (properties `((upstream-name . "ShinyCell"))) ; upstream spelling of the name
+      (build-system r-build-system)
+      (propagated-inputs
+       `(("r-data-table" ,r-data-table)
+         ("r-ggplot2" ,r-ggplot2)
+         ("r-glue" ,r-glue)
+         ("r-gridextra" ,r-gridextra)
+         ("r-hdf5r" ,r-hdf5r)
+         ("r-matrix" ,r-matrix)
+         ("r-r-utils" ,r-r-utils)
+         ("r-rcolorbrewer" ,r-rcolorbrewer)
+         ("r-readr" ,r-readr)
+         ("r-reticulate" ,r-reticulate)))
+      (home-page "https://github.com/SGDDNB/ShinyCell")
+      (synopsis "Shiny interactive web apps for single-cell data")
+      (description
+       "This package provides Shiny apps for interactive exploration of
+single-cell data.")
+      (license license:gpl3))))
+
 (define-public r-archr
   (let ((commit "46b519ffb6f73edf132497ac31650d19ef055dc1")
         (revision "1"))
@@ -7489,6 +7580,64 @@ Perl and can be helpful if you want to filter, reformat, or trim your sequence
 data.  It also generates basic statistics for your sequences.")
     (license license:gpl3+)))
 
+(define-public shorah
+  (package
+    (name "shorah")
+    (version "1.99.2")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (string-append "https://github.com/cbg-ethz/shorah"
+                           "/releases/download/v" version
+                           "/shorah-" version ".tar.xz"))
+       (sha256
+        (base32
+         "158dir9qcqspknlnyfr9zwk41x48nrh5wcg10k2grh9cidp9daiq"))))
+    (build-system gnu-build-system)
+    (arguments
+     `(#:phases
+       (modify-phases %standard-phases
+         (add-after 'unpack 'fix-test-wrapper
+           (lambda* (#:key outputs #:allow-other-keys)
+             (let ((bin (string-append (assoc-ref outputs "out") "/bin")))
+               (substitute* "examples/run_end2end_test" ; call the installed script
+                 (("\\$\\{interpreter\\} ../\\$\\{testscript\\}")
+                  (string-append bin "/${testscript}"))))))
+         (delete 'check)                ; re-added after 'wrap-programs below
+         (add-after 'install 'wrap-programs
+           (lambda* (#:key outputs #:allow-other-keys)
+             (let* ((out (assoc-ref outputs "out"))
+                    (site (string-append
+                           out "/lib/python"
+                           ,(version-major+minor
+                             (package-version python))
+                           "/site-packages"))
+                    (pythonpath (getenv "PYTHONPATH"))
+                    (script (string-append out "/bin/shorah")))
+               (chmod script #o555)
+               (wrap-program script `("PYTHONPATH" ":" prefix (,site ,pythonpath))))))
+         (add-after 'wrap-programs 'check ; tests run against the wrapped script
+           (lambda* (#:key tests? #:allow-other-keys)
+             (when tests?
+               (invoke "make" "check")))))))
+    (inputs
+     `(("boost" ,boost)
+       ("htslib" ,htslib)
+       ("python" ,python)
+       ("python-biopython" ,python-biopython)
+       ("python-numpy" ,python-numpy)
+       ("zlib" ,zlib)))
+    (native-inputs
+     `(("pkg-config" ,pkg-config)))
+    (home-page "https://github.com/cbg-ethz/shorah")
+    (synopsis "Short reads assembly into haplotypes")
+    (description
+     "ShoRAH is a project for the analysis of next generation sequencing data.
+It is designed to analyse genetically heterogeneous samples.  Its tools
+provide error correction, haplotype reconstruction and estimation of the
+frequency of the different genetic variants present in a mixed sample.")
+    (license license:gpl3+)))
+
 (define-public ruby-bio-kseq
   (package
     (name "ruby-bio-kseq")
@@ -7633,40 +7782,6 @@ including VCF header and contents in RDF and JSON.")
     (home-page "https://github.com/vcflib/bio-vcf")
     (license license:expat)))
 
-(define-public r-summarizedexperiment
-  (package
-    (name "r-summarizedexperiment")
-    (version "1.20.0")
-    (source (origin
-              (method url-fetch)
-              (uri (bioconductor-uri "SummarizedExperiment" version))
-              (sha256
-               (base32
-                "04x6d4mcsnvz6glkmf6k2cv3fs8zk03i9rvv0ahpl793n8l411ps"))))
-    (properties
-     `((upstream-name . "SummarizedExperiment")))
-    (build-system r-build-system)
-    (propagated-inputs
-     `(("r-biobase" ,r-biobase)
-       ("r-biocgenerics" ,r-biocgenerics)
-       ("r-delayedarray" ,r-delayedarray)
-       ("r-genomeinfodb" ,r-genomeinfodb)
-       ("r-genomicranges" ,r-genomicranges)
-       ("r-iranges" ,r-iranges)
-       ("r-matrix" ,r-matrix)
-       ("r-matrixgenerics" ,r-matrixgenerics)
-       ("r-s4vectors" ,r-s4vectors)))
-    (native-inputs
-     `(("r-knitr" ,r-knitr)))
-    (home-page "https://bioconductor.org/packages/SummarizedExperiment")
-    (synopsis "Container for representing genomic ranges by sample")
-    (description
-     "The SummarizedExperiment container contains one or more assays, each
-represented by a matrix-like object of numeric or other mode.  The rows
-typically represent genomic ranges of interest and the columns represent
-samples.")
-    (license license:artistic2.0)))
-
 (define-public r-genomicalignments
   (package
     (name "r-genomicalignments")
@@ -9461,14 +9576,14 @@ Shiny-based display methods for Bioconductor objects.")
 (define-public r-annotationhub
   (package
     (name "r-annotationhub")
-    (version "2.22.0")
+    (version "2.22.1")
     (source
      (origin
        (method url-fetch)
        (uri (bioconductor-uri "AnnotationHub" version))
        (sha256
         (base32
-         "1950x654ffqx53b154kbph808zdh2xm5vmj9vzmc5nxc28fi2z5g"))))
+         "08d7m0n4jkpajsj0bvi5xd4vi1zqczl6lnrh8kqi2fbjkrvwdqp5"))))
     (properties `((upstream-name . "AnnotationHub")))
     (build-system r-build-system)
     (propagated-inputs
@@ -9558,14 +9673,14 @@ microarrays or GRanges for sequencing data.")
 (define-public r-gage
   (package
     (name "r-gage")
-    (version "2.40.1")
+    (version "2.40.2")
     (source
      (origin
        (method url-fetch)
        (uri (bioconductor-uri "gage" version))
        (sha256
         (base32
-         "1iawa03dy4bl333my69d4sk7d74cjzfg5dpcxga6q5dglan4sp8r"))))
+         "1bs0hx8sqiyl08dqn2zx31kbv5aci4xvrs71pplx2yxal3jf5178"))))
     (build-system r-build-system)
     (propagated-inputs
      `(("r-annotationdbi" ,r-annotationdbi)
@@ -9691,14 +9806,14 @@ originally made available by Holmes, Harris, and Quince, 2012, PLoS ONE 7(2):
 (define-public r-ensembldb
   (package
     (name "r-ensembldb")
-    (version "2.14.0")
+    (version "2.14.1")
     (source
      (origin
        (method url-fetch)
        (uri (bioconductor-uri "ensembldb" version))
        (sha256
         (base32
-         "04il99gcrqzakvc0bxchdp9gghkn1sp9lpiian0iz4y7r67z3wpy"))))
+         "1hxwfh19qafpdhzprvw4nr8ks3gz7f0y8gyfhk8yqmmvvnvgqv40"))))
     (build-system r-build-system)
     (propagated-inputs
      `(("r-annotationdbi" ,r-annotationdbi)
@@ -11775,7 +11890,7 @@ in an easily configurable manner.")
 (define-public pigx-bsseq
   (package
     (name "pigx-bsseq")
-    (version "0.1.2")
+    (version "0.1.3")
     (source (origin
               (method url-fetch)
               (uri (string-append "https://github.com/BIMSBbioinfo/pigx_bsseq/"
@@ -11783,7 +11898,7 @@ in an easily configurable manner.")
                                   "/pigx_bsseq-" version ".tar.gz"))
               (sha256
                (base32
-                "0mpzlay2d5cjpmrcp7knff6rg1c2mqszd638n7lw0mc0cycbp9f8"))))
+                "0blm0bl5z3ng01n7hh2ffk4rkzvf7vb3nm0crgdzrxr5cahxdxql"))))
     (build-system gnu-build-system)
     (arguments
      `(;; TODO: tests currently require 12+GB of RAM.  See
@@ -11809,8 +11924,11 @@ in an easily configurable manner.")
        ("r-annotationhub" ,r-annotationhub)
        ("r-dt" ,r-dt)
        ("r-genomation" ,r-genomation)
+       ("r-ggbio" ,r-ggbio)
        ("r-ggrepel" ,r-ggrepel)
+       ("r-matrixstats" ,r-matrixstats)
        ("r-methylkit" ,r-methylkit)
+       ("r-reshape2" ,r-reshape2)
        ("r-rtracklayer" ,r-rtracklayer)
        ("r-rmarkdown" ,r-rmarkdown)
        ("r-bookdown" ,r-bookdown)
@@ -12408,7 +12526,7 @@ in RNA-seq data.")
        ("python-igraph" ,python-igraph)
        ("python-joblib" ,python-joblib)
        ("python-legacy-api-wrap" ,python-legacy-api-wrap)
-       ("python-louvain" ,python-louvain)
+       ("python-louvain" ,python-louvain-0.6)
        ("python-matplotlib" ,python-matplotlib)
        ("python-natsort" ,python-natsort)
        ("python-networkx" ,python-networkx)
@@ -14979,6 +15097,299 @@ signaling, and more.  It continues to be evolved and expanded by an
 international community.")
     (license license:lgpl2.1+)))
 
+(define-public kraken2
+  (package
+    (name "kraken2")
+    (version "2.1.1")
+    (source (origin
+              (method git-fetch)
+              (uri (git-reference
+                    (url "https://github.com/DerrickWood/kraken2")
+                    (commit (string-append "v" version))))
+              (file-name (git-file-name name version))
+              (sha256
+               (base32
+                "0h7a7vygd7y5isbrnc6srwq6xj1rmyd33pm8mmcgfkmlxlg5vkg3"))))
+    (build-system gnu-build-system)
+    (arguments
+     `(#:tests? #false                  ; there are none
+       #:make-flags (list "-C" "src"
+                          (string-append "KRAKEN2_DIR="
+                                         (assoc-ref %outputs "out") "/bin"))
+       #:phases
+       (modify-phases %standard-phases
+         (delete 'configure)
+         (add-before 'install 'install-scripts
+           (lambda* (#:key outputs #:allow-other-keys)
+             (let* ((bin (string-append (assoc-ref outputs "out") "/bin"))
+                    (replacements `(("KRAKEN2_DIR" . ,bin)
+                                    ("VERSION" . ,,version))))
+               (mkdir-p bin)
+
+               (with-directory-excursion "scripts"
+                 (let ((scripts (find-files "." ".*")))
+                   (substitute* scripts
+                     (("#####=([^=]+)=#####" _ key) ; fill in template placeholders
+                      (or (assoc-ref replacements key)
+                          (error (format #false "unknown key: ~a~%" key)))))
+                   (substitute* "kraken2" ; absolute paths to the compressors
+                     (("compression_program = \"bzip2\"")
+                      (string-append "compression_program = \""
+                                     (which "bzip2")
+                                     "\""))
+                     (("compression_program = \"gzip\"")
+                      (string-append "compression_program = \""
+                                     (which "gzip")
+                                     "\"")))
+                   (substitute* '("download_genomic_library.sh"
+                                  "download_taxonomy.sh"
+                                  "16S_gg_installation.sh"
+                                  "16S_silva_installation.sh"
+                                  "16S_rdp_installation.sh")
+                     (("wget") (which "wget")))
+                   (substitute* '("download_taxonomy.sh"
+                                  "download_genomic_library.sh"
+                                  "rsync_from_ncbi.pl")
+                     (("rsync -")
+                      (string-append (which "rsync") " -")))
+                   (substitute* "mask_low_complexity.sh"
+                     (("which") (which "which")))
+                   (substitute* '("mask_low_complexity.sh"
+                                  "download_genomic_library.sh"
+                                  "16S_silva_installation.sh")
+                     (("sed -e ")
+                      (string-append (which "sed") " -e ")))
+                   (substitute* '("rsync_from_ncbi.pl"
+                                  "16S_rdp_installation.sh"
+                                  "16S_silva_installation.sh"
+                                  "16S_gg_installation.sh"
+                                  "download_taxonomy.sh"
+                                  "download_genomic_library.sh")
+                     (("gunzip") (which "gunzip")))
+                   (for-each (lambda (script) ; install every patched script
+                               (chmod script #o555)
+                               (install-file script bin))
+                             scripts)))))))))
+    (inputs
+     `(("gzip" ,gzip)
+       ("perl" ,perl)
+       ("rsync" ,rsync)
+       ("sed" ,sed)
+       ("wget" ,wget)
+       ("which" ,which)))
+    (home-page "https://github.com/DerrickWood/kraken2")
+    (synopsis "Taxonomic sequence classification system")
+    (description "Kraken is a taxonomic sequence classifier that assigns
+taxonomic labels to DNA sequences.  Kraken examines the k-mers within a query
+sequence and uses the information within those k-mers to query a
+database.  That database maps k-mers to the lowest common ancestor (LCA) of all
+genomes known to contain a given k-mer.")
+    (license license:expat)))
+
+(define-public lofreq
+  (package
+    (name "lofreq")
+    (version "2.1.5")
+    (source (origin
+              (method git-fetch)
+              (uri (git-reference
+                    (url "https://github.com/CSB5/lofreq")
+                    (commit (string-append "v" version))))
+              (file-name (git-file-name name version))
+              (sha256
+               (base32
+                "0qssrn3mgjak7df6iqc1rljqd3g3a5syvg0lsv4vds43s3fq23bl"))))
+    (build-system gnu-build-system)
+    (arguments
+     '(#:test-target "bug-tests"        ; target to use once tests can run
+       #:tests? #false)) ; test data are not included
+    (inputs
+     `(("htslib" ,htslib)
+       ("python" ,python-wrapper)
+       ("zlib" ,zlib)))
+    (native-inputs
+     `(("autoconf" ,autoconf)           ; NOTE(review): presumably needed to
+       ("automake" ,automake)           ; generate configure -- TODO confirm
+       ("which" ,which)))
+    (home-page "https://csb5.github.io/lofreq/")
+    (synopsis "Sensitive variant calling from sequencing data")
+    (description "LoFreq is a fast and sensitive variant-caller for inferring
+SNVs and indels from next-generation sequencing data.  It makes full use of
+base-call qualities and other sources of errors inherent in
+sequencing (e.g. mapping or base/indel alignment uncertainty), which are
+usually ignored by other methods or only used for filtering.")
+    (license license:expat)))
+
+(define-public python-pyliftover
+  (package
+    (name "python-pyliftover")
+    (version "0.4")
+    ;; The PyPI release does not include test data, so fetch from git.
+    (source (origin
+              (method git-fetch)
+              (uri (git-reference
+                    (url "https://github.com/konstantint/pyliftover")
+                    (commit version)))
+              (file-name (git-file-name name version))
+              (sha256
+               (base32
+                "1j8jp9iynv2l3jv5pr0pn0p3azlama1bqg233piglzm6bqh3m2m3"))))
+    (build-system python-build-system)
+    (arguments `(#:tests? #false)) ; the tests access the web
+    (native-inputs
+     `(("python-pytest" ,python-pytest))) ; NOTE(review): likely unused while tests are off
+    (home-page "https://github.com/konstantint/pyliftover")
+    (synopsis "Python implementation of UCSC liftOver genome coordinate conversion")
+    (description
+     "PyLiftover is a library for quick and easy conversion of genomic (point)
+coordinates between different assemblies.")
+    (license license:expat)))
+
+(define-public ensembl-vep
+  (let* ((api-version "103")
+         (api-module
+          (lambda (name hash)           ; origin for one Ensembl API repository
+            (origin (method git-fetch)
+                    (uri (git-reference
+                          (url (string-append "https://github.com/Ensembl/"
+                                              name ".git"))
+                          (commit (string-append "release/" api-version))))
+                    (file-name (string-append name "-" api-version "-checkout"))
+                    (sha256 (base32 hash))))))
+    (package
+      (name "ensembl-vep")
+      (version (string-append api-version ".1"))
+      (source
+       (origin
+         (method git-fetch)
+         (uri (git-reference
+               (url "https://github.com/Ensembl/ensembl-vep.git")
+               (commit (string-append "release/" version))))
+         (file-name (git-file-name name version))
+         (sha256
+          (base32 "1iq7p72cv9b38jz2v8a4slzy2n8y0md487943180ym9xc8qvw09c"))))
+      (build-system gnu-build-system)
+      (arguments
+       `(#:modules ((guix build gnu-build-system)
+                    (guix build utils)
+                    (ice-9 match))
+         #:phases
+         (modify-phases %standard-phases
+           (delete 'configure)
+           (delete 'build)
+           ;; Tests need to run after installation
+           (delete 'check)
+           (replace 'install
+             (lambda* (#:key inputs outputs #:allow-other-keys)
+               (let* ((modules '(("ensembl" "/")
+                                 ("ensembl-variation" "/Variation")
+                                 ("ensembl-funcgen"   "/Funcgen")
+                                 ("ensembl-io"        "/")))
+                      (scripts '(("convert_cache.pl" "vep_convert_cache.pl")
+                                 ("INSTALL.pl"       "vep_install.pl")
+                                 ("haplo"            #f) ; #f: keep the script's own name
+                                 ("variant_recoder"  #f)
+                                 ("filter_vep"       #f)
+                                 ("vep"              #f)))
+                      (out  (assoc-ref outputs "out"))
+                      (bin  (string-append out "/bin"))
+                      (perl (string-append out "/lib/perl5/site_perl")))
+                 (for-each              ; copy each API module's Bio::EnsEMBL subtree
+                  (match-lambda
+                    ((name path)
+                     (let ((dir (string-append perl "/Bio/EnsEMBL" path)))
+                       (mkdir-p dir)
+                       (copy-recursively
+                        (string-append (assoc-ref inputs (string-append "api-module-" name))
+                                       "/modules/Bio/EnsEMBL" path)
+                        dir))))
+                  modules)
+                 (copy-recursively "modules/" perl)
+                 (mkdir-p bin)
+                 (for-each              ; install and wrap the command-line scripts
+                  (match-lambda
+                    ((script new-name)
+                     (let ((location (string-append bin "/"
+                                                    (or new-name (basename script)))))
+                       (copy-file script location)
+                       (chmod location #o555)
+                       (wrap-program location
+                         `("PERL5LIB" ":" prefix (,(getenv "PERL5LIB")
+                                                  ,perl))))))
+                  scripts)
+
+                 ;; Fix path to tools
+                 (with-directory-excursion (string-append perl "/Bio/EnsEMBL")
+                   (substitute* '("Funcgen/RunnableDB/ProbeMapping/PrePipelineChecks.pm"
+                                  "VEP/BaseRunner.pm"
+                                  "VEP/Utils.pm"
+                                  "VEP/AnnotationSource/Cache/VariationTabix.pm"
+                                  "VEP/AnnotationSource/Cache/BaseSerialized.pm"
+                                  "Variation/Utils/BaseVepTabixPlugin.pm"
+                                  "Variation/Utils/VEP.pm"
+                                  "Variation/Pipeline/ReleaseDataDumps/PreRunChecks.pm")
+                     (("`which")
+                      (string-append "`"
+                                     (assoc-ref inputs "which")
+                                     "/bin/which")))))))
+           (add-after 'install 'check
+             (lambda* (#:key tests? inputs outputs #:allow-other-keys)
+               (when tests?
+                 (setenv "PERL5LIB"
+                         (string-append (getenv "PERL5LIB")
+                                        ":"
+                                        (assoc-ref outputs "out")
+                                        "/lib/perl5/site_perl"))
+                 (copy-recursively (string-append (assoc-ref inputs "source") "/t")
+                                   "/tmp/t")
+                 (for-each make-file-writable (find-files "/tmp/t"))
+                 ;; TODO: haplo needs Set/IntervalTree.pm
+                 (invoke "perl" "-e" (string-append "
+use Test::Harness; use Test::Exception;
+my $dirname = \"/tmp\";
+opendir TEST, \"$dirname\\/t\";
+my @test_files = map {\"$dirname\\/t\\/\".$_} grep {!/^\\./ && /\\.t$/} readdir TEST; closedir TEST;
+@test_files = grep {!/Haplo/} @test_files;
+runtests(@test_files);
+"))))))))
+      (inputs
+       `(("bioperl-minimal" ,bioperl-minimal)
+         ("perl-bio-db-hts" ,perl-bio-db-hts)
+         ("perl-dbi" ,perl-dbi)
+         ("perl-dbd-mysql" ,perl-dbd-mysql)
+         ("perl-libwww" ,perl-libwww)
+         ("perl-http-tiny" ,perl-http-tiny)
+         ("perl-json" ,perl-json)
+         ("which" ,which)))
+      (propagated-inputs
+       `(("kentutils" ,kentutils)))
+      (native-inputs
+       `(("unzip" ,unzip)
+         ("perl" ,perl)
+         ("api-module-ensembl"
+          ,(api-module "ensembl"
+                       "0s59rj905g72hljzfpvnx5nxwz925b917y4jp912i23f5gwxh14v"))
+         ("api-module-ensembl-variation"
+          ,(api-module "ensembl-variation"
+                       "1dvwdzzfjhzymq02b6n4p6j3a9q4jgq0g89hs7hj1apd7zhirgkq"))
+         ("api-module-ensembl-funcgen"
+          ,(api-module "ensembl-funcgen"
+                       "1x23pv38dmv0w0gby6rv3wds50qghb4v3v1mf43vk55msfxzry8n"))
+         ("api-module-ensembl-io"
+          ,(api-module "ensembl-io"
+                       "14adb2x934lzsq20035mazdkhrkcw0qzb0xhz6zps9vk4wixwaix"))
+         ("perl-test-harness" ,perl-test-harness)
+         ("perl-test-exception" ,perl-test-exception)))
+      (home-page "http://www.ensembl.org/vep")
+      (synopsis "Predict functional effects of genomic variants")
+      (description
+       "This package provides a Variant Effect Predictor, which predicts
+the functional effects of genomic variants.  It also provides
+Haplosaurus, which uses phased genotype data to predict
+whole-transcript haplotype sequences, and Variant Recoder, which
+translates between different variant encodings.")
+      (license license:asl2.0))))
+
 (define-public r-signac
   (let ((commit "e0512d348adeda4a3f23a2e8f56d1fe09840e03c")
         (revision "1"))