aboutsummaryrefslogtreecommitdiff
path: root/gnu/packages/bioinformatics.scm
diff options
context:
space:
mode:
Diffstat (limited to 'gnu/packages/bioinformatics.scm')
-rw-r--r--gnu/packages/bioinformatics.scm404
1 files changed, 371 insertions, 33 deletions
diff --git a/gnu/packages/bioinformatics.scm b/gnu/packages/bioinformatics.scm
index 7b3838d36f..281bd1f427 100644
--- a/gnu/packages/bioinformatics.scm
+++ b/gnu/packages/bioinformatics.scm
@@ -1,9 +1,9 @@
-
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2014, 2015, 2016 Ricardo Wurmus <rekado@elephly.net>
;;; Copyright © 2015, 2016 Ben Woodcroft <donttrustben@gmail.com>
-;;; Copyright © 2015 Pjotr Prins <pjotr.guix@thebird.nl>
+;;; Copyright © 2015, 2016 Pjotr Prins <pjotr.guix@thebird.nl>
;;; Copyright © 2015 Andreas Enge <andreas@enge.fr>
+;;; Copyright © 2016 Roel Janssen <roel@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -26,6 +26,7 @@
#:use-module (guix utils)
#:use-module (guix download)
#:use-module (guix git-download)
+ #:use-module (guix build-system ant)
#:use-module (guix build-system gnu)
#:use-module (guix build-system cmake)
#:use-module (guix build-system perl)
@@ -37,11 +38,13 @@
#:use-module (gnu packages autotools)
#:use-module (gnu packages algebra)
#:use-module (gnu packages base)
+ #:use-module (gnu packages bison)
#:use-module (gnu packages boost)
#:use-module (gnu packages compression)
#:use-module (gnu packages cpio)
#:use-module (gnu packages curl)
#:use-module (gnu packages doxygen)
+ #:use-module (gnu packages datastructures)
#:use-module (gnu packages file)
#:use-module (gnu packages gawk)
#:use-module (gnu packages gcc)
@@ -247,6 +250,47 @@ intervals from multiple files in widely-used genomic file formats such as BAM,
BED, GFF/GTF, VCF.")
(license license:gpl2)))
+(define-public bioawk
+ (package
+ (name "bioawk")
+ (version "1.0")
+ (source (origin
+ (method url-fetch)
+ (uri (string-append "https://github.com/lh3/bioawk/archive/v"
+ version ".tar.gz"))
+ (file-name (string-append name "-" version ".tar.gz"))
+ (sha256
+ (base32 "1daizxsk17ahi9n58fj8vpgwyhzrzh54bzqhanjanp88kgrz7gjw"))))
+ (build-system gnu-build-system)
+ (inputs
+ `(("zlib" ,zlib)))
+ (native-inputs
+ `(("bison" ,bison)))
+ (arguments
+ `(#:tests? #f ; There are no tests to run.
+ ;; Bison must generate files, before other targets can build.
+ #:parallel-build? #f
+ #:phases
+ (modify-phases %standard-phases
+ (delete 'configure) ; There is no configure phase.
+ (replace 'install
+ (lambda* (#:key outputs #:allow-other-keys)
+ (let* ((out (assoc-ref outputs "out"))
+ (bin (string-append out "/bin"))
+ (man (string-append out "/share/man/man1")))
+ (mkdir-p man)
+ (copy-file "awk.1" (string-append man "/bioawk.1"))
+ (install-file "bioawk" bin)))))))
+ (home-page "https://github.com/lh3/bioawk")
+ (synopsis "AWK with bioinformatics extensions")
+ (description "Bioawk is an extension to Brian Kernighan's awk, adding the
+support of several common biological data formats, including optionally gzip'ed
+BED, GFF, SAM, VCF, FASTA/Q and TAB-delimited formats with column names. It
+also adds a few built-in functions and a command line option to use TAB as the
+input/output delimiter. When the new functionality is not used, bioawk is
+intended to behave exactly the same as the original BWK awk.")
+ (license license:x11)))
+
(define-public python2-pybedtools
(package
(name "python2-pybedtools")
@@ -535,10 +579,11 @@ confidence to have in an alignment.")
(snippet
`(begin
;; Remove bundled boost, pigz, zlib, and .git directory
- ;; FIXME: also remove bundled sources for google-sparsehash,
- ;; murmurhash3, kmc once packaged.
+ ;; FIXME: also remove bundled sources for murmurhash3 and
+ ;; kmc once packaged.
(delete-file-recursively "boost")
(delete-file-recursively "pigz")
+ (delete-file-recursively "google-sparsehash")
(delete-file-recursively "zlib")
(delete-file-recursively ".git")
#t))))
@@ -588,6 +633,7 @@ confidence to have in an alignment.")
(inputs
`(("openmpi" ,openmpi)
("boost" ,boost)
+ ("sparsehash" ,sparsehash)
("pigz" ,pigz)
("zlib" ,zlib)))
(supported-systems '("x86_64-linux"))
@@ -774,6 +820,35 @@ and more accurate. BWA-MEM also has better performance than BWA-backtrack for
70-100bp Illumina reads.")
(license license:gpl3+)))
+(define-public bwa-pssm
+ (package (inherit bwa)
+ (name "bwa-pssm")
+ (version "0.5.11")
+ (source (origin
+ (method url-fetch)
+ (uri (string-append "https://github.com/pkerpedjiev/bwa-pssm/"
+ "archive/" version ".tar.gz"))
+ (file-name (string-append name "-" version ".tar.gz"))
+ (sha256
+ (base32
+ "02p7mpbs4mlxmn84g2x4ghak638vbj4lqix2ipx5g84pz9bhdavg"))))
+ (build-system gnu-build-system)
+ (inputs
+ `(("gdsl" ,gdsl)
+ ("zlib" ,zlib)
+ ("perl" ,perl)))
+ (home-page "http://bwa-pssm.binf.ku.dk/")
+ (synopsis "Burrows-Wheeler transform-based probabilistic short read mapper")
+ (description
+ "BWA-PSSM is a probabilistic short genomic sequence read aligner based on
+the use of @dfn{position specific scoring matrices} (PSSM). Like many of the
+existing aligners it is fast and sensitive. Unlike most other aligners,
+however, it is also adaptible in the sense that one can direct the alignment
+based on known biases within the data set. It is coded as a modification of
+the original BWA alignment program and shares the genome index structure as
+well as many of the command line options.")
+ (license license:gpl3+)))
+
(define-public python2-bx-python
(package
(name "python2-bx-python")
@@ -809,6 +884,91 @@ and more accurate. BWA-MEM also has better performance than BWA-backtrack for
multiple sequence alignments.")
(license license:expat)))
+(define-public python-pysam
+ (package
+ (name "python-pysam")
+ (version "0.8.4")
+ (source (origin
+ (method url-fetch)
+ (uri (pypi-uri "pysam" version))
+ (sha256
+ (base32
+ "1slx5mb94mzm5qzk52q270sab0sar95j67w1g1k452nz3s9j7krh"))))
+ (build-system python-build-system)
+ (arguments
+ `(#:tests? #f ; tests are excluded in the manifest
+ #:phases
+ (alist-cons-before
+ 'build 'set-flags
+ (lambda _
+ (setenv "LDFLAGS" "-lncurses")
+ (setenv "CFLAGS" "-D_CURSES_LIB=1"))
+ %standard-phases)))
+ (inputs
+ `(("ncurses" ,ncurses)
+ ("zlib" ,zlib)))
+ (native-inputs
+ `(("python-cython" ,python-cython)
+ ("python-setuptools" ,python-setuptools)))
+ (home-page "https://github.com/pysam-developers/pysam")
+ (synopsis "Python bindings to the SAMtools C API")
+ (description
+ "Pysam is a Python module for reading and manipulating files in the
+SAM/BAM format. Pysam is a lightweight wrapper of the SAMtools C API. It
+also includes an interface for tabix.")
+ (license license:expat)))
+
+(define-public python2-pysam
+ (package-with-python2 python-pysam))
+
+(define-public cd-hit
+ (package
+ (name "cd-hit")
+ (version "4.6.5")
+ (source (origin
+ (method url-fetch)
+ (uri (string-append "https://github.com/weizhongli/cdhit"
+ "/releases/download/V" version
+ "/cd-hit-v" version "-2016-0304.tar.gz"))
+ (sha256
+ (base32
+ "15db0hq38yyifwqx9b6l34z14jcq576dmjavhj8a426c18lvnhp3"))))
+ (build-system gnu-build-system)
+ (arguments
+ `(#:tests? #f ; there are no tests
+ #:make-flags
+ ;; Executables are copied directly to the PREFIX.
+ (list (string-append "PREFIX=" (assoc-ref %outputs "out") "/bin"))
+ #:phases
+ (modify-phases %standard-phases
+ ;; No "configure" script
+ (delete 'configure)
+ ;; Remove sources of non-determinism
+ (add-after 'unpack 'be-timeless
+ (lambda _
+ (substitute* "cdhit-utility.c++"
+ ((" \\(built on \" __DATE__ \"\\)") ""))
+ (substitute* "cdhit-common.c++"
+ (("__DATE__") "\"0\"")
+ (("\", %s, \" __TIME__ \"\\\\n\", date") ""))
+ #t))
+ ;; The "install" target does not create the target directory
+ (add-before 'install 'create-target-dir
+ (lambda* (#:key outputs #:allow-other-keys)
+ (mkdir-p (string-append (assoc-ref outputs "out") "/bin"))
+ #t)))))
+ (inputs
+ `(("perl" ,perl)))
+ (home-page "http://weizhongli-lab.org/cd-hit/")
+ (synopsis "Cluster and compare protein or nucleotide sequences")
+ (description
+ "CD-HIT is a program for clustering and comparing protein or nucleotide
+sequences. CD-HIT is designed to be fast and handle extremely large
+databases.")
+ ;; The manual says: "It can be copied under the GNU General Public License
+ ;; version 2 (GPLv2)."
+ (license license:gpl2)))
+
(define-public clipper
(package
(name "clipper")
@@ -848,6 +1008,46 @@ multiple sequence alignments.")
"CLIPper is a tool to define peaks in CLIP-seq datasets.")
(license license:gpl2)))
+(define-public codingquarry
+ (package
+ (name "codingquarry")
+ (version "2.0")
+ (source (origin
+ (method url-fetch)
+ (uri (string-append
+ "mirror://sourceforge/codingquarry/CodingQuarry_v"
+ version ".tar.gz"))
+ (sha256
+ (base32
+ "0115hkjflsnfzn36xppwf9h9avfxlavr43djqmshkkzbgjzsz60i"))))
+ (build-system gnu-build-system)
+ (arguments
+ '(#:tests? #f ; no "check" target
+ #:phases
+ (modify-phases %standard-phases
+ (delete 'configure)
+ (replace 'install
+ (lambda* (#:key outputs #:allow-other-keys)
+ (let* ((out (assoc-ref outputs "out"))
+ (bin (string-append out "/bin"))
+ (doc (string-append out "/share/doc/codingquarry")))
+ (install-file "INSTRUCTIONS.pdf" doc)
+ (copy-recursively "QuarryFiles"
+ (string-append out "/QuarryFiles"))
+ (install-file "CodingQuarry" bin)
+ (install-file "CufflinksGTF_to_CodingQuarryGFF3.py" bin)))))))
+ (inputs `(("openmpi" ,openmpi)))
+ (native-search-paths
+ (list (search-path-specification
+ (variable "QUARRY_PATH")
+ (files '("QuarryFiles")))))
+ (native-inputs `(("python" ,python-2))) ; Only Python 2 is supported
+ (synopsis "Fungal gene predictor")
+ (description "CodingQuarry is a highly accurate, self-training GHMM fungal
+gene predictor designed to work with assembled, aligned RNA-seq transcripts.")
+ (home-page "https://sourceforge.net/projects/codingquarry/")
+ (license license:gpl3+)))
+
(define-public couger
(package
(name "couger")
@@ -1129,28 +1329,70 @@ other types of unwanted sequence from high-throughput sequencing reads.")
files.")
(license license:expat)))
+(define-public python-pybigwig
+ (package
+ (name "python-pybigwig")
+ (version "0.2.5")
+ (source (origin
+ (method url-fetch)
+ (uri (pypi-uri "pyBigWig" version))
+ (sha256
+ (base32
+ "0yrpdxg3y0sny25x4w22lv1k47jzccqjmg7j4bp0hywklvp0hg7d"))
+ (modules '((guix build utils)))
+ (snippet
+ '(begin
+ ;; Delete bundled libBigWig sources
+ (delete-file-recursively "libBigWig")))))
+ (build-system python-build-system)
+ (arguments
+ `(#:phases
+ (modify-phases %standard-phases
+ (add-after 'unpack 'link-with-libBigWig
+ (lambda* (#:key inputs #:allow-other-keys)
+ (substitute* "setup.py"
+ (("libs=\\[") "libs=[\"BigWig\", "))
+ #t)))))
+ (inputs
+ `(("libbigwig" ,libbigwig)
+ ("zlib" ,zlib)
+ ("curl" ,curl)))
+ (home-page "https://github.com/dpryan79/pyBigWig")
+ (synopsis "Access bigWig files in Python using libBigWig")
+ (description
+ "This package provides Python bindings to the libBigWig library for
+accessing bigWig files.")
+ (license license:expat)))
+
+(define-public python2-pybigwig
+ (let ((pybigwig (package-with-python2 python-pybigwig)))
+ (package (inherit pybigwig)
+ (native-inputs
+ `(("python-setuptools" ,python2-setuptools))))))
+
(define-public deeptools
(package
(name "deeptools")
- (version "1.5.11")
+ (version "2.1.1")
(source (origin
(method url-fetch)
- (uri (string-append
- "https://github.com/fidelram/deepTools/archive/"
- version ".tar.gz"))
+ (uri (string-append "https://github.com/fidelram/deepTools/"
+ "archive/" version ".tar.gz"))
(file-name (string-append name "-" version ".tar.gz"))
(sha256
(base32
- "1kaagygcbvjs9sxd9cqmskd02wcfp9imvb735r087w7hwqpvz6fs"))))
+ "1nmfin0zjdby3vay3r4flvz94dr6qjhj41ax4yz3vx13j6wz8izd"))))
(build-system python-build-system)
(arguments
`(#:python ,python-2))
- (propagated-inputs
+ (inputs
`(("python-scipy" ,python2-scipy)
("python-numpy" ,python2-numpy)
+ ("python-numpydoc" ,python2-numpydoc)
("python-matplotlib" ,python2-matplotlib)
("python-bx-python" ,python2-bx-python)
- ("python-pysam" ,python2-pysam)))
+ ("python-pysam" ,python2-pysam)
+ ("python-pybigwig" ,python2-pybigwig)))
(native-inputs
`(("python-mock" ,python2-mock) ;for tests
("python-pytz" ,python2-pytz) ;for tests
@@ -1851,24 +2093,17 @@ from high-throughput sequencing assays.")
(snippet '(substitute* "build.xml"
(("failifexecutionfails=\"true\"")
"failifexecutionfails=\"false\"")))))
- (build-system gnu-build-system)
+ (build-system ant-build-system)
(arguments
- `(#:modules ((srfi srfi-1)
- (guix build gnu-build-system)
- (guix build utils))
- #:phases (alist-replace
- 'build
- (lambda _
- (setenv "JAVA_HOME" (assoc-ref %build-inputs "jdk"))
- (zero? (system* "ant" "all"
- (string-append "-Ddist="
- (assoc-ref %outputs "out")
- "/share/java/htsjdk/"))))
- (fold alist-delete %standard-phases
- '(configure install check)))))
- (native-inputs
- `(("ant" ,ant)
- ("jdk" ,icedtea "jdk")))
+ `(#:tests? #f ; test require Internet access
+ #:make-flags
+ (list (string-append "-Ddist=" (assoc-ref %outputs "out")
+ "/share/java/htsjdk/"))
+ #:build-target "all"
+ #:phases
+ (modify-phases %standard-phases
+ ;; The build phase also installs the jars
+ (delete 'install))))
(home-page "http://samtools.github.io/htsjdk/")
(synopsis "Java API for high-throughput sequencing data (HTS) formats")
(description
@@ -2405,6 +2640,44 @@ the phenotype as it models the data.")
generated using the PacBio Iso-Seq protocol.")
(license license:bsd-3))))
+(define-public pyicoteo
+ (package
+ (name "pyicoteo")
+ (version "2.0.7")
+ (source
+ (origin
+ (method url-fetch)
+ (uri (string-append "https://bitbucket.org/regulatorygenomicsupf/"
+ "pyicoteo/get/v" version ".tar.bz2"))
+ (file-name (string-append name "-" version ".tar.bz2"))
+ (sha256
+ (base32
+ "0d6087f29xp8wxwlj111c3sylli98n0l8ry58c51ixzq0zfm50wa"))))
+ (build-system python-build-system)
+ (arguments
+ `(#:python ,python-2 ; does not work with Python 3
+ #:tests? #f)) ; there are no tests
+ (inputs
+ `(("python2-matplotlib" ,python2-matplotlib)))
+ (home-page "https://bitbucket.org/regulatorygenomicsupf/pyicoteo")
+ (synopsis "Analyze high-throughput genetic sequencing data")
+ (description
+ "Pyicoteo is a suite of tools for the analysis of high-throughput genetic
+sequencing data. It works with genomic coordinates. There are currently six
+different command-line tools:
+
+@enumerate
+@item pyicoregion: for generating exploratory regions automatically;
+@item pyicoenrich: for differential enrichment between two conditions;
+@item pyicoclip: for calling CLIP-Seq peaks without a control;
+@item pyicos: for genomic coordinates manipulation;
+@item pyicoller: for peak calling on punctuated ChIP-Seq;
+@item pyicount: to count how many reads from N experiment files overlap in a
+ region file;
+@item pyicotrocol: to combine operations from pyicoteo.
+@end enumerate\n")
+ (license license:gpl3+)))
+
(define-public prodigal
(package
(name "prodigal")
@@ -3664,16 +3937,38 @@ barplots or heatmaps.")
packages.")
(license license:artistic2.0)))
+(define-public r-dnacopy
+ (package
+ (name "r-dnacopy")
+ (version "1.44.0")
+ (source (origin
+ (method url-fetch)
+ (uri (bioconductor-uri "DNAcopy" version))
+ (sha256
+ (base32
+ "1c1px4rbr36xx929hp59k7ca9k5ab66qmn8k63fk13278ncm6h66"))))
+ (properties
+ `((upstream-name . "DNAcopy")))
+ (build-system r-build-system)
+ (inputs
+ `(("gfortran" ,gfortran)))
+ (home-page "https://bioconductor.org/packages/DNAcopy")
+ (synopsis "Implementation of a circular binary segmentation algorithm")
+ (description "This package implements the circular binary segmentation (CBS)
+algorithm to segment DNA copy number data and identify genomic regions with
+abnormal copy number.")
+ (license license:gpl2+)))
+
(define-public r-s4vectors
(package
(name "r-s4vectors")
- (version "0.8.5")
+ (version "0.8.11")
(source (origin
(method url-fetch)
(uri (bioconductor-uri "S4Vectors" version))
(sha256
(base32
- "10f4jxwlwsiy7zhb3kgp6anid0d7wkvrrljl80r3nhx38yr24l5k"))))
+ "12iibcs63m9iy7f45wgjcqsna2dnqwckphk682389grshz0g4x66"))))
(properties
`((upstream-name . "S4Vectors")
(r-repository . bioconductor)))
@@ -3695,13 +3990,13 @@ S4Vectors package itself.")
(define-public r-iranges
(package
(name "r-iranges")
- (version "2.4.6")
+ (version "2.4.8")
(source (origin
(method url-fetch)
(uri (bioconductor-uri "IRanges" version))
(sha256
(base32
- "00x0266sys1fc5ipa639y84p6m6mgspk2xb099vcwmd3w4hypj9d"))))
+ "0hi5k1j5jm4xrg1l506g279qw1xkvp1gg1zgsjzpbng4vx4k4iyl"))))
(properties
`((upstream-name . "IRanges")
(r-repository . bioconductor)))
@@ -4141,7 +4436,9 @@ extracting the desired features in a convenient format.")
(version "3.2.2")
(source (origin
(method url-fetch)
- (uri (bioconductor-uri "GO.db" version))
+ (uri (string-append "http://www.bioconductor.org/packages/"
+ "release/data/annotation/src/contrib/GO.db_"
+ version ".tar.gz"))
(sha256
(base32
"00gariag9ampz82dh0xllrc26r85d7vdcwc0vca5zdy147rwxr7f"))))
@@ -4453,3 +4750,44 @@ Using a hidden Markov model, R/qtl allows to estimate genetic maps, to
identify genotyping errors, and to perform single-QTL and two-QTL,
two-dimensional genome scans.")
(license license:gpl3)))
+
+(define-public pepr
+ (package
+ (name "pepr")
+ (version "1.0.9")
+ (source (origin
+ (method url-fetch)
+ (uri (string-append "https://pypi.python.org/packages/source/P"
+ "/PePr/PePr-" version ".tar.gz"))
+ (sha256
+ (base32
+ "0qxjfdpl1b1y53nccws2d85f6k74zwmx8y8sd9rszcqhfayx6gdx"))))
+ (build-system python-build-system)
+ (arguments
+ `(#:python ,python-2 ; python2 only
+ #:tests? #f ; no tests included
+ #:phases
+ (modify-phases %standard-phases
+ ;; When setuptools is used a ".egg" archive is generated and
+ ;; installed. This makes it hard to actually run PePr. This issue
+ ;; has been reported upstream:
+ ;; https://github.com/shawnzhangyx/PePr/issues/9
+ (add-after 'unpack 'disable-egg-generation
+ (lambda _
+ (substitute* "setup.py"
+ (("from setuptools import setup")
+ "from distutils.core import setup"))
+ #t)))))
+ (propagated-inputs
+ `(("python2-numpy" ,python2-numpy)
+ ("python2-scipy" ,python2-scipy)
+ ("python2-pysam" ,python2-pysam)))
+ (home-page "https://code.google.com/p/pepr-chip-seq/")
+ (synopsis "Peak-calling and prioritization pipeline for ChIP-Seq data")
+ (description
+ "PePr is a ChIP-Seq peak calling or differential binding analysis tool
+that is primarily designed for data with biological replicates. It uses a
+negative binomial distribution to model the read counts among the samples in
+the same group, and look for consistent differences between ChIP and control
+group or two ChIP groups run under different conditions.")
+ (license license:gpl3+)))