summary refs log tree commit diff
diff options
context:
space:
mode:
author Peter Lo <peterloleungyau@gmail.com> 2020-06-29 13:50:37 +0800
committer Ricardo Wurmus <rekado@elephly.net> 2020-09-11 18:29:49 +0200
commit f90b4b380af1278bfc47b3e70f0892b836a2ba8c (patch)
tree 39993991fb2192620d21716626ca997f76aefa81
parent 05bda85901dbcab551311acbe64beb2a0633dc07 (diff)
download guix-f90b4b380af1278bfc47b3e70f0892b836a2ba8c.tar.gz
gnu: Add r-tokenizers.
* gnu/packages/cran.scm (r-tokenizers): New variable.

Signed-off-by: Ricardo Wurmus <rekado@elephly.net>
-rw-r--r-- gnu/packages/cran.scm 32
1 files changed, 32 insertions, 0 deletions
diff --git a/gnu/packages/cran.scm b/gnu/packages/cran.scm
index 438bc9dea9..3d64763e0e 100644
--- a/gnu/packages/cran.scm
+++ b/gnu/packages/cran.scm
@@ -23954,3 +23954,35 @@ novels, ready for text analysis.  These novels are \"Sense and Sensibility\",
 \"Pride and Prejudice\", \"Mansfield Park\", \"Emma\", \"Northanger Abbey\",
 and \"Persuasion\".")
     (license license:expat)))
+
+(define-public r-tokenizers
+  (package
+    (name "r-tokenizers")
+    (version "0.2.1")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cran-uri "tokenizers" version))
+       (sha256
+        (base32
+         "006xf1vdrmp9skhpss9ldhmk4cwqk512cjp1pxm2gxfybpf7qq98"))))
+    (properties `((upstream-name . "tokenizers")))
+    (build-system r-build-system)
+    (propagated-inputs
+     `(("r-rcpp" ,r-rcpp)
+       ("r-snowballc" ,r-snowballc)
+       ("r-stringi" ,r-stringi)))
+    (native-inputs
+     `(("r-knitr" ,r-knitr)))
+    (home-page "https://lincolnmullen.com/software/tokenizers/")
+    (synopsis "Fast, consistent tokenization of natural language text")
+    (description
+     "This is a package for converting natural language text into tokens.
+It includes tokenizers for shingled n-grams, skip n-grams, words, word stems,
+sentences, paragraphs, characters, shingled characters, lines, tweets, Penn
+Treebank, regular expressions, as well as functions for counting characters,
+words, and sentences, and a function for splitting longer texts into separate
+documents, each with the same number of words.  The tokenizers have a
+consistent interface, and the package is built on the @code{stringi} and
+@code{Rcpp} packages for fast yet correct tokenization in UTF-8 encoding.")
+    (license license:expat)))