diff options
author | Peter Lo <peterloleungyau@gmail.com> | 2020-06-29 13:50:37 +0800 |
---|---|---|
committer | Ricardo Wurmus <rekado@elephly.net> | 2020-09-11 18:29:49 +0200 |
commit | f90b4b380af1278bfc47b3e70f0892b836a2ba8c (patch) | |
tree | 39993991fb2192620d21716626ca997f76aefa81 | |
parent | 05bda85901dbcab551311acbe64beb2a0633dc07 (diff) | |
download | guix-f90b4b380af1278bfc47b3e70f0892b836a2ba8c.tar guix-f90b4b380af1278bfc47b3e70f0892b836a2ba8c.tar.gz |
gnu: Add r-tokenizers.
* gnu/packages/cran.scm (r-tokenizers): New variable.
Signed-off-by: Ricardo Wurmus <rekado@elephly.net>
-rw-r--r-- | gnu/packages/cran.scm | 32 |
1 files changed, 32 insertions, 0 deletions
```diff
diff --git a/gnu/packages/cran.scm b/gnu/packages/cran.scm
index 438bc9dea9..3d64763e0e 100644
--- a/gnu/packages/cran.scm
+++ b/gnu/packages/cran.scm
@@ -23954,3 +23954,35 @@ novels, ready for text analysis. These novels are \"Sense and Sensibility\",
 \"Pride and Prejudice\", \"Mansfield Park\", \"Emma\", \"Northanger Abbey\", and
 \"Persuasion\".")
     (license license:expat)))
+
+(define-public r-tokenizers
+  (package
+    (name "r-tokenizers")
+    (version "0.2.1")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cran-uri "tokenizers" version))
+       (sha256
+        (base32
+         "006xf1vdrmp9skhpss9ldhmk4cwqk512cjp1pxm2gxfybpf7qq98"))))
+    (properties `((upstream-name . "tokenizers")))
+    (build-system r-build-system)
+    (propagated-inputs
+     `(("r-rcpp" ,r-rcpp)
+       ("r-snowballc" ,r-snowballc)
+       ("r-stringi" ,r-stringi)))
+    (native-inputs
+     `(("r-knitr" ,r-knitr)))
+    (home-page "https://lincolnmullen.com/software/tokenizers/")
+    (synopsis "Fast, consistent tokenization of natural language text")
+    (description
+     "This is a package for converting natural language text into tokens.
+It includes tokenizers for shingled n-grams, skip n-grams, words, word stems,
+sentences, paragraphs, characters, shingled characters, lines, tweets, Penn
+Treebank, regular expressions, as well as functions for counting characters,
+words, and sentences, and a function for splitting longer texts into separate
+documents, each with the same number of words. The tokenizers have a
+consistent interface, and the package is built on the @code{stringi} and
+@code{Rcpp} packages for fast yet correct tokenization in UTF-8 encoding.")
+    (license license:expat)))
```