diff options
Diffstat (limited to 'gnu/packages/bioinformatics.scm')
-rw-r--r-- | gnu/packages/bioinformatics.scm | 404 |
1 files changed, 371 insertions, 33 deletions
diff --git a/gnu/packages/bioinformatics.scm b/gnu/packages/bioinformatics.scm index 7b3838d36f..281bd1f427 100644 --- a/gnu/packages/bioinformatics.scm +++ b/gnu/packages/bioinformatics.scm @@ -1,9 +1,9 @@ - ;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2014, 2015, 2016 Ricardo Wurmus <rekado@elephly.net> ;;; Copyright © 2015, 2016 Ben Woodcroft <donttrustben@gmail.com> -;;; Copyright © 2015 Pjotr Prins <pjotr.guix@thebird.nl> +;;; Copyright © 2015, 2016 Pjotr Prins <pjotr.guix@thebird.nl> ;;; Copyright © 2015 Andreas Enge <andreas@enge.fr> +;;; Copyright © 2016 Roel Janssen <roel@gnu.org> ;;; ;;; This file is part of GNU Guix. ;;; @@ -26,6 +26,7 @@ #:use-module (guix utils) #:use-module (guix download) #:use-module (guix git-download) + #:use-module (guix build-system ant) #:use-module (guix build-system gnu) #:use-module (guix build-system cmake) #:use-module (guix build-system perl) @@ -37,11 +38,13 @@ #:use-module (gnu packages autotools) #:use-module (gnu packages algebra) #:use-module (gnu packages base) + #:use-module (gnu packages bison) #:use-module (gnu packages boost) #:use-module (gnu packages compression) #:use-module (gnu packages cpio) #:use-module (gnu packages curl) #:use-module (gnu packages doxygen) + #:use-module (gnu packages datastructures) #:use-module (gnu packages file) #:use-module (gnu packages gawk) #:use-module (gnu packages gcc) @@ -247,6 +250,47 @@ intervals from multiple files in widely-used genomic file formats such as BAM, BED, GFF/GTF, VCF.") (license license:gpl2))) +(define-public bioawk + (package + (name "bioawk") + (version "1.0") + (source (origin + (method url-fetch) + (uri (string-append "https://github.com/lh3/bioawk/archive/v" + version ".tar.gz")) + (file-name (string-append name "-" version ".tar.gz")) + (sha256 + (base32 "1daizxsk17ahi9n58fj8vpgwyhzrzh54bzqhanjanp88kgrz7gjw")))) + (build-system gnu-build-system) + (inputs + `(("zlib" ,zlib))) + (native-inputs + `(("bison" ,bison))) + (arguments + `(#:tests? #f ; There are no tests to run. + ;; Bison must generate files, before other targets can build. + #:parallel-build? #f + #:phases + (modify-phases %standard-phases + (delete 'configure) ; There is no configure phase. + (replace 'install + (lambda* (#:key outputs #:allow-other-keys) + (let* ((out (assoc-ref outputs "out")) + (bin (string-append out "/bin")) + (man (string-append out "/share/man/man1"))) + (mkdir-p man) + (copy-file "awk.1" (string-append man "/bioawk.1")) + (install-file "bioawk" bin))))))) + (home-page "https://github.com/lh3/bioawk") + (synopsis "AWK with bioinformatics extensions") + (description "Bioawk is an extension to Brian Kernighan's awk, adding the +support of several common biological data formats, including optionally gzip'ed +BED, GFF, SAM, VCF, FASTA/Q and TAB-delimited formats with column names. It +also adds a few built-in functions and a command line option to use TAB as the +input/output delimiter. When the new functionality is not used, bioawk is +intended to behave exactly the same as the original BWK awk.") + (license license:x11))) + (define-public python2-pybedtools (package (name "python2-pybedtools") @@ -535,10 +579,11 @@ confidence to have in an alignment.") (snippet `(begin ;; Remove bundled boost, pigz, zlib, and .git directory - ;; FIXME: also remove bundled sources for google-sparsehash, - ;; murmurhash3, kmc once packaged. + ;; FIXME: also remove bundled sources for murmurhash3 and + ;; kmc once packaged. (delete-file-recursively "boost") (delete-file-recursively "pigz") + (delete-file-recursively "google-sparsehash") (delete-file-recursively "zlib") (delete-file-recursively ".git") #t)))) @@ -588,6 +633,7 @@ confidence to have in an alignment.") (inputs `(("openmpi" ,openmpi) ("boost" ,boost) + ("sparsehash" ,sparsehash) ("pigz" ,pigz) ("zlib" ,zlib))) (supported-systems '("x86_64-linux")) @@ -774,6 +820,35 @@ and more accurate. BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina reads.") (license license:gpl3+))) +(define-public bwa-pssm + (package (inherit bwa) + (name "bwa-pssm") + (version "0.5.11") + (source (origin + (method url-fetch) + (uri (string-append "https://github.com/pkerpedjiev/bwa-pssm/" + "archive/" version ".tar.gz")) + (file-name (string-append name "-" version ".tar.gz")) + (sha256 + (base32 + "02p7mpbs4mlxmn84g2x4ghak638vbj4lqix2ipx5g84pz9bhdavg")))) + (build-system gnu-build-system) + (inputs + `(("gdsl" ,gdsl) + ("zlib" ,zlib) + ("perl" ,perl))) + (home-page "http://bwa-pssm.binf.ku.dk/") + (synopsis "Burrows-Wheeler transform-based probabilistic short read mapper") + (description + "BWA-PSSM is a probabilistic short genomic sequence read aligner based on +the use of @dfn{position specific scoring matrices} (PSSM). Like many of the +existing aligners it is fast and sensitive. Unlike most other aligners, +however, it is also adaptible in the sense that one can direct the alignment +based on known biases within the data set. It is coded as a modification of +the original BWA alignment program and shares the genome index structure as +well as many of the command line options.") + (license license:gpl3+))) + (define-public python2-bx-python (package (name "python2-bx-python") @@ -809,6 +884,91 @@ and more accurate. BWA-MEM also has better performance than BWA-backtrack for multiple sequence alignments.") (license license:expat))) +(define-public python-pysam + (package + (name "python-pysam") + (version "0.8.4") + (source (origin + (method url-fetch) + (uri (pypi-uri "pysam" version)) + (sha256 + (base32 + "1slx5mb94mzm5qzk52q270sab0sar95j67w1g1k452nz3s9j7krh")))) + (build-system python-build-system) + (arguments + `(#:tests? #f ; tests are excluded in the manifest + #:phases + (alist-cons-before + 'build 'set-flags + (lambda _ + (setenv "LDFLAGS" "-lncurses") + (setenv "CFLAGS" "-D_CURSES_LIB=1")) + %standard-phases))) + (inputs + `(("ncurses" ,ncurses) + ("zlib" ,zlib))) + (native-inputs + `(("python-cython" ,python-cython) + ("python-setuptools" ,python-setuptools))) + (home-page "https://github.com/pysam-developers/pysam") + (synopsis "Python bindings to the SAMtools C API") + (description + "Pysam is a Python module for reading and manipulating files in the +SAM/BAM format. Pysam is a lightweight wrapper of the SAMtools C API. It +also includes an interface for tabix.") + (license license:expat))) + +(define-public python2-pysam + (package-with-python2 python-pysam)) + +(define-public cd-hit + (package + (name "cd-hit") + (version "4.6.5") + (source (origin + (method url-fetch) + (uri (string-append "https://github.com/weizhongli/cdhit" + "/releases/download/V" version + "/cd-hit-v" version "-2016-0304.tar.gz")) + (sha256 + (base32 + "15db0hq38yyifwqx9b6l34z14jcq576dmjavhj8a426c18lvnhp3")))) + (build-system gnu-build-system) + (arguments + `(#:tests? #f ; there are no tests + #:make-flags + ;; Executables are copied directly to the PREFIX. + (list (string-append "PREFIX=" (assoc-ref %outputs "out") "/bin")) + #:phases + (modify-phases %standard-phases + ;; No "configure" script + (delete 'configure) + ;; Remove sources of non-determinism + (add-after 'unpack 'be-timeless + (lambda _ + (substitute* "cdhit-utility.c++" + ((" \\(built on \" __DATE__ \"\\)") "")) + (substitute* "cdhit-common.c++" + (("__DATE__") "\"0\"") + (("\", %s, \" __TIME__ \"\\\\n\", date") "")) + #t)) + ;; The "install" target does not create the target directory + (add-before 'install 'create-target-dir + (lambda* (#:key outputs #:allow-other-keys) + (mkdir-p (string-append (assoc-ref outputs "out") "/bin")) + #t))))) + (inputs + `(("perl" ,perl))) + (home-page "http://weizhongli-lab.org/cd-hit/") + (synopsis "Cluster and compare protein or nucleotide sequences") + (description + "CD-HIT is a program for clustering and comparing protein or nucleotide +sequences. CD-HIT is designed to be fast and handle extremely large +databases.") + ;; The manual says: "It can be copied under the GNU General Public License + ;; version 2 (GPLv2)." + (license license:gpl2))) + (define-public clipper (package (name "clipper") @@ -848,6 +1008,46 @@ multiple sequence alignments.") "CLIPper is a tool to define peaks in CLIP-seq datasets.") (license license:gpl2))) +(define-public codingquarry + (package + (name "codingquarry") + (version "2.0") + (source (origin + (method url-fetch) + (uri (string-append + "mirror://sourceforge/codingquarry/CodingQuarry_v" + version ".tar.gz")) + (sha256 + (base32 + "0115hkjflsnfzn36xppwf9h9avfxlavr43djqmshkkzbgjzsz60i")))) + (build-system gnu-build-system) + (arguments + '(#:tests? #f ; no "check" target + #:phases + (modify-phases %standard-phases + (delete 'configure) + (replace 'install + (lambda* (#:key outputs #:allow-other-keys) + (let* ((out (assoc-ref outputs "out")) + (bin (string-append out "/bin")) + (doc (string-append out "/share/doc/codingquarry"))) + (install-file "INSTRUCTIONS.pdf" doc) + (copy-recursively "QuarryFiles" + (string-append out "/QuarryFiles")) + (install-file "CodingQuarry" bin) + (install-file "CufflinksGTF_to_CodingQuarryGFF3.py" bin))))))) + (inputs `(("openmpi" ,openmpi))) + (native-search-paths + (list (search-path-specification + (variable "QUARRY_PATH") + (files '("QuarryFiles"))))) + (native-inputs `(("python" ,python-2))) ; Only Python 2 is supported + (synopsis "Fungal gene predictor") + (description "CodingQuarry is a highly accurate, self-training GHMM fungal +gene predictor designed to work with assembled, aligned RNA-seq transcripts.") + (home-page "https://sourceforge.net/projects/codingquarry/") + (license license:gpl3+))) + (define-public couger (package (name "couger") @@ -1129,28 +1329,70 @@ other types of unwanted sequence from high-throughput sequencing reads.") files.") (license license:expat))) +(define-public python-pybigwig + (package + (name "python-pybigwig") + (version "0.2.5") + (source (origin + (method url-fetch) + (uri (pypi-uri "pyBigWig" version)) + (sha256 + (base32 + "0yrpdxg3y0sny25x4w22lv1k47jzccqjmg7j4bp0hywklvp0hg7d")) + (modules '((guix build utils))) + (snippet + '(begin + ;; Delete bundled libBigWig sources + (delete-file-recursively "libBigWig"))))) + (build-system python-build-system) + (arguments + `(#:phases + (modify-phases %standard-phases + (add-after 'unpack 'link-with-libBigWig + (lambda* (#:key inputs #:allow-other-keys) + (substitute* "setup.py" + (("libs=\\[") "libs=[\"BigWig\", ")) + #t))))) + (inputs + `(("libbigwig" ,libbigwig) + ("zlib" ,zlib) + ("curl" ,curl))) + (home-page "https://github.com/dpryan79/pyBigWig") + (synopsis "Access bigWig files in Python using libBigWig") + (description + "This package provides Python bindings to the libBigWig library for +accessing bigWig files.") + (license license:expat))) + +(define-public python2-pybigwig + (let ((pybigwig (package-with-python2 python-pybigwig))) + (package (inherit pybigwig) + (native-inputs + `(("python-setuptools" ,python2-setuptools)))))) + (define-public deeptools (package (name "deeptools") - (version "1.5.11") + (version "2.1.1") (source (origin (method url-fetch) - (uri (string-append - "https://github.com/fidelram/deepTools/archive/" - version ".tar.gz")) + (uri (string-append "https://github.com/fidelram/deepTools/" + "archive/" version ".tar.gz")) (file-name (string-append name "-" version ".tar.gz")) (sha256 (base32 - "1kaagygcbvjs9sxd9cqmskd02wcfp9imvb735r087w7hwqpvz6fs")))) + "1nmfin0zjdby3vay3r4flvz94dr6qjhj41ax4yz3vx13j6wz8izd")))) (build-system python-build-system) (arguments `(#:python ,python-2)) - (propagated-inputs + (inputs `(("python-scipy" ,python2-scipy) ("python-numpy" ,python2-numpy) + ("python-numpydoc" ,python2-numpydoc) ("python-matplotlib" ,python2-matplotlib) ("python-bx-python" ,python2-bx-python) - ("python-pysam" ,python2-pysam))) + ("python-pysam" ,python2-pysam) + ("python-pybigwig" ,python2-pybigwig))) (native-inputs `(("python-mock" ,python2-mock) ;for tests ("python-pytz" ,python2-pytz) ;for tests @@ -1851,24 +2093,17 @@ from high-throughput sequencing assays.") (snippet '(substitute* "build.xml" (("failifexecutionfails=\"true\"") "failifexecutionfails=\"false\""))))) - (build-system gnu-build-system) + (build-system ant-build-system) (arguments - `(#:modules ((srfi srfi-1) - (guix build gnu-build-system) - (guix build utils)) - #:phases (alist-replace - 'build - (lambda _ - (setenv "JAVA_HOME" (assoc-ref %build-inputs "jdk")) - (zero? (system* "ant" "all" - (string-append "-Ddist=" - (assoc-ref %outputs "out") - "/share/java/htsjdk/")))) - (fold alist-delete %standard-phases - '(configure install check))))) - (native-inputs - `(("ant" ,ant) - ("jdk" ,icedtea "jdk"))) + `(#:tests? #f ; test require Internet access + #:make-flags + (list (string-append "-Ddist=" (assoc-ref %outputs "out") + "/share/java/htsjdk/")) + #:build-target "all" + #:phases + (modify-phases %standard-phases + ;; The build phase also installs the jars + (delete 'install)))) (home-page "http://samtools.github.io/htsjdk/") (synopsis "Java API for high-throughput sequencing data (HTS) formats") (description @@ -2405,6 +2640,44 @@ the phenotype as it models the data.") generated using the PacBio Iso-Seq protocol.") (license license:bsd-3)))) +(define-public pyicoteo + (package + (name "pyicoteo") + (version "2.0.7") + (source + (origin + (method url-fetch) + (uri (string-append "https://bitbucket.org/regulatorygenomicsupf/" + "pyicoteo/get/v" version ".tar.bz2")) + (file-name (string-append name "-" version ".tar.bz2")) + (sha256 + (base32 + "0d6087f29xp8wxwlj111c3sylli98n0l8ry58c51ixzq0zfm50wa")))) + (build-system python-build-system) + (arguments + `(#:python ,python-2 ; does not work with Python 3 + #:tests? #f)) ; there are no tests + (inputs + `(("python2-matplotlib" ,python2-matplotlib))) + (home-page "https://bitbucket.org/regulatorygenomicsupf/pyicoteo") + (synopsis "Analyze high-throughput genetic sequencing data") + (description + "Pyicoteo is a suite of tools for the analysis of high-throughput genetic +sequencing data. It works with genomic coordinates. There are currently six +different command-line tools: + +@enumerate +@item pyicoregion: for generating exploratory regions automatically; +@item pyicoenrich: for differential enrichment between two conditions; +@item pyicoclip: for calling CLIP-Seq peaks without a control; +@item pyicos: for genomic coordinates manipulation; +@item pyicoller: for peak calling on punctuated ChIP-Seq; +@item pyicount: to count how many reads from N experiment files overlap in a + region file; +@item pyicotrocol: to combine operations from pyicoteo. +@end enumerate\n") + (license license:gpl3+))) + (define-public prodigal (package (name "prodigal") @@ -3664,16 +3937,38 @@ barplots or heatmaps.") packages.") (license license:artistic2.0))) +(define-public r-dnacopy + (package + (name "r-dnacopy") + (version "1.44.0") + (source (origin + (method url-fetch) + (uri (bioconductor-uri "DNAcopy" version)) + (sha256 + (base32 + "1c1px4rbr36xx929hp59k7ca9k5ab66qmn8k63fk13278ncm6h66")))) + (properties + `((upstream-name . "DNAcopy"))) + (build-system r-build-system) + (inputs + `(("gfortran" ,gfortran))) + (home-page "https://bioconductor.org/packages/DNAcopy") + (synopsis "Implementation of a circular binary segmentation algorithm") + (description "This package implements the circular binary segmentation (CBS) +algorithm to segment DNA copy number data and identify genomic regions with +abnormal copy number.") + (license license:gpl2+))) + (define-public r-s4vectors (package (name "r-s4vectors") - (version "0.8.5") + (version "0.8.11") (source (origin (method url-fetch) (uri (bioconductor-uri "S4Vectors" version)) (sha256 (base32 - "10f4jxwlwsiy7zhb3kgp6anid0d7wkvrrljl80r3nhx38yr24l5k")))) + "12iibcs63m9iy7f45wgjcqsna2dnqwckphk682389grshz0g4x66")))) (properties `((upstream-name . "S4Vectors") (r-repository . bioconductor))) @@ -3695,13 +3990,13 @@ S4Vectors package itself.") (define-public r-iranges (package (name "r-iranges") - (version "2.4.6") + (version "2.4.8") (source (origin (method url-fetch) (uri (bioconductor-uri "IRanges" version)) (sha256 (base32 - "00x0266sys1fc5ipa639y84p6m6mgspk2xb099vcwmd3w4hypj9d")))) + "0hi5k1j5jm4xrg1l506g279qw1xkvp1gg1zgsjzpbng4vx4k4iyl")))) (properties `((upstream-name . "IRanges") (r-repository . bioconductor))) @@ -4141,7 +4436,9 @@ extracting the desired features in a convenient format.") (version "3.2.2") (source (origin (method url-fetch) - (uri (bioconductor-uri "GO.db" version)) + (uri (string-append "http://www.bioconductor.org/packages/" + "release/data/annotation/src/contrib/GO.db_" + version ".tar.gz")) (sha256 (base32 "00gariag9ampz82dh0xllrc26r85d7vdcwc0vca5zdy147rwxr7f")))) @@ -4453,3 +4750,44 @@ Using a hidden Markov model, R/qtl allows to estimate genetic maps, to identify genotyping errors, and to perform single-QTL and two-QTL, two-dimensional genome scans.") (license license:gpl3))) + +(define-public pepr + (package + (name "pepr") + (version "1.0.9") + (source (origin + (method url-fetch) + (uri (string-append "https://pypi.python.org/packages/source/P" + "/PePr/PePr-" version ".tar.gz")) + (sha256 + (base32 + "0qxjfdpl1b1y53nccws2d85f6k74zwmx8y8sd9rszcqhfayx6gdx")))) + (build-system python-build-system) + (arguments + `(#:python ,python-2 ; python2 only + #:tests? #f ; no tests included + #:phases + (modify-phases %standard-phases + ;; When setuptools is used a ".egg" archive is generated and + ;; installed. This makes it hard to actually run PePr. This issue + ;; has been reported upstream: + ;; https://github.com/shawnzhangyx/PePr/issues/9 + (add-after 'unpack 'disable-egg-generation + (lambda _ + (substitute* "setup.py" + (("from setuptools import setup") + "from distutils.core import setup")) + #t))))) + (propagated-inputs + `(("python2-numpy" ,python2-numpy) + ("python2-scipy" ,python2-scipy) + ("python2-pysam" ,python2-pysam))) + (home-page "https://code.google.com/p/pepr-chip-seq/") + (synopsis "Peak-calling and prioritization pipeline for ChIP-Seq data") + (description + "PePr is a ChIP-Seq peak calling or differential binding analysis tool +that is primarily designed for data with biological replicates. It uses a +negative binomial distribution to model the read counts among the samples in +the same group, and look for consistent differences between ChIP and control +group or two ChIP groups run under different conditions.") + (license license:gpl3+))) |