diff options
author | Nicolas Graves <ngraves@ngraves.fr> | 2023-03-25 16:32:18 +0100 |
---|---|---|
committer | Maxim Cournoyer <maxim.cournoyer@gmail.com> | 2023-04-12 08:29:22 -0400 |
commit | 70510eb04712e995ae904c9299fecd166f5138ff (patch) | |
tree | 0504a21d6be8e99ee7892f3c53bd4c0587003f03 | |
parent | 46c4c6cae4a67b26d9670ea566e032fe4cff09fa (diff) | |
download | guix-70510eb04712e995ae904c9299fecd166f5138ff.tar guix-70510eb04712e995ae904c9299fecd166f5138ff.tar.gz |
gnu: Add sentencepiece.
* gnu/packages/machine-learning.scm (sentencepiece): New variable.
Signed-off-by: Nicolas Goaziou <mail@nicolasgoaziou.fr>
-rw-r--r-- | gnu/packages/machine-learning.scm | 28 |
1 file changed, 28 insertions, 0 deletions
```diff
diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index 37d4ef78ad..072fb1ab15 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -583,6 +583,34 @@ optimizing, and searching weighted finite-state transducers (FSTs).")
        '("--enable-shared" "--enable-far" "--enable-ngram-fsts"
          "--enable-lookahead-fsts" "--with-pic" "--disable-bin")))))
 
+(define-public sentencepiece
+  (package
+    (name "sentencepiece")
+    (version "0.1.97")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/google/sentencepiece")
+             (commit (string-append "v" version))))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32 "1kzfkp2pk0vabyw3wmkh16h11chzq63mzc20ddhsag5fp6s91ajg"))))
+    (build-system cmake-build-system)
+    (arguments (list #:tests? #f)) ;no tests
+    (native-inputs (list gperftools))
+    (home-page "https://github.com/google/sentencepiece")
+    (synopsis "Unsupervised tokenizer for Neural Network-based text generation")
+    (description
+     "SentencePiece is an unsupervised text tokenizer and detokenizer mainly
+for Neural Network-based text generation systems where the vocabulary size is
+predetermined prior to the neural model training. SentencePiece implements
+subword units---e.g., byte-pair-encoding (BPE) and unigram language
+model---with the extension of direct training from raw sentences.
+SentencePiece allows us to make a purely end-to-end system that does not
+depend on language-specific pre- or post-processing.")
+    (license license:asl2.0)))
+
 (define-public shogun
   (package
     (name "shogun")
```