diff options
author | Nicolas Graves <ngraves@ngraves.fr> | 2023-03-25 16:32:18 +0100 |
---|---|---|
committer | Nicolas Goaziou <mail@nicolasgoaziou.fr> | 2023-04-08 14:04:41 +0200 |
commit | 3ede522d3dfcbd6b6f971e299e7059c5fe775ba7 (patch) | |
tree | 295274bcd1c3e0311bf5efca19360aa4f2730ec7 /gnu | |
parent | cb47d7fda43fca13f965c6cb081a9944b8d98160 (diff) | |
download | guix-3ede522d3dfcbd6b6f971e299e7059c5fe775ba7.tar.gz |
gnu: Add sentencepiece.

* gnu/packages/machine-learning.scm (sentencepiece): New variable.

Signed-off-by: Nicolas Goaziou <mail@nicolasgoaziou.fr>
Diffstat (limited to 'gnu')
-rw-r--r-- | gnu/packages/machine-learning.scm | 28 |
1 files changed, 28 insertions, 0 deletions
```diff
diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index 37d4ef78ad..072fb1ab15 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -583,6 +583,34 @@ optimizing, and searching weighted finite-state transducers (FSTs).")
        '("--enable-shared" "--enable-far" "--enable-ngram-fsts"
          "--enable-lookahead-fsts" "--with-pic" "--disable-bin")))))
 
+(define-public sentencepiece
+  (package
+    (name "sentencepiece")
+    (version "0.1.97")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/google/sentencepiece")
+             (commit (string-append "v" version))))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32 "1kzfkp2pk0vabyw3wmkh16h11chzq63mzc20ddhsag5fp6s91ajg"))))
+    (build-system cmake-build-system)
+    (arguments (list #:tests? #f))      ;no tests
+    (native-inputs (list gperftools))
+    (home-page "https://github.com/google/sentencepiece")
+    (synopsis "Unsupervised tokenizer for Neural Network-based text generation")
+    (description
+     "SentencePiece is an unsupervised text tokenizer and detokenizer mainly
+for Neural Network-based text generation systems where the vocabulary size is
+predetermined prior to the neural model training.  SentencePiece implements
+subword units---e.g., byte-pair-encoding (BPE) and unigram language
+model---with the extension of direct training from raw sentences.
+SentencePiece allows us to make a purely end-to-end system that does not
+depend on language-specific pre- or post-processing.")
+    (license license:asl2.0)))
+
 (define-public shogun
   (package
     (name "shogun")
```