From 0f6b9e9828dfc269bfc4eade771efed1753e8c62 Mon Sep 17 00:00:00 2001 From: Ricardo Wurmus Date: Thu, 3 Dec 2015 16:12:09 +0100 Subject: import: cran: Parse DESCRIPTION instead of HTML. * guix/import/cran.scm (description->alist, listify, beautify-description, description->package): New procedures. (table-datum, downloads->url, nodes->text, cran-sxml->sexp): Remove proceduces. (latest-release): Use parsed DESCRIPTION instead of SXML. * tests/cran.scm: Rewrite to match importer. --- guix/import/cran.scm | 265 +++++++++++++++++++++++++-------------------------- tests/cran.scm | 189 +++++++++++++++--------------------- 2 files changed, 209 insertions(+), 245 deletions(-) diff --git a/guix/import/cran.scm b/guix/import/cran.scm index 43dc2c80b6..845ecb5832 100644 --- a/guix/import/cran.scm +++ b/guix/import/cran.scm @@ -20,26 +20,26 @@ (define-module (guix import cran) #:use-module (ice-9 match) #:use-module (ice-9 regex) + #:use-module ((ice-9 rdelim) #:select (read-string)) #:use-module (srfi srfi-1) #:use-module (srfi srfi-26) - #:use-module (sxml simple) - #:use-module (sxml match) - #:use-module (sxml xpath) #:use-module (guix http-client) #:use-module (guix hash) #:use-module (guix store) #:use-module (guix base32) #:use-module ((guix download) #:select (download-to-store)) #:use-module (guix import utils) + #:use-module ((guix build-system r) #:select (cran-uri)) #:use-module (guix upstream) #:use-module (guix packages) + #:use-module (gnu packages) #:export (cran->guix-package %cran-updater)) ;;; Commentary: ;;; ;;; Generate a package declaration template for the latest version of an R -;;; package on CRAN, using the HTML description downloaded from +;;; package on CRAN, using the DESCRIPTION file downloaded from ;;; cran.r-project.org. ;;; ;;; Code: @@ -67,6 +67,31 @@ (define string->license ((lst ...) `(list ,@(map string->license lst))) (_ #f))) + +(define (description->alist description) + "Convert a DESCRIPTION string into an alist." + (let ((lines (string-split description #\newline)) + (parse (lambda (line acc) + (if (string-null? line) acc + ;; Keys usually start with a capital letter and end with + ;; ":". There are some exceptions, unfortunately (such + ;; as "biocViews"). There are no blanks in a key. + (if (string-match "^[A-Za-z][^ :]+:( |\n|$)" line) + ;; New key/value pair + (let* ((pos (string-index line #\:)) + (key (string-take line pos)) + (value (string-drop line (+ 1 pos)))) + (cons (cons key + (string-trim-both value)) + acc)) + ;; This is a continuation of the previous pair + (match-let ((((key . value) . rest) acc)) + (cons (cons key (string-join + (list value + (string-trim-both line)))) + rest))))))) + (fold parse '() lines))) + (define (format-inputs names) "Generate a sorted list of package inputs from a list of package NAMES." (map (lambda (name) @@ -82,125 +107,94 @@ (define* (maybe-inputs package-inputs #:optional (type 'inputs)) ((package-inputs ...) `((,type (,'quasiquote ,(format-inputs package-inputs))))))) -(define (table-datum tree label) - "Extract the datum node following a LABEL in the sxml table TREE. Only the -first cell of a table row is considered a label cell." - ((node-pos 1) - ((sxpath `(xhtml:tr - (xhtml:td 1) ; only first cell can contain label - (equal? ,label) - ,(node-parent tree) ; go up to label cell - ,(node-parent tree) ; go up to matching row - (xhtml:td 2))) ; select second cell - tree))) - (define %cran-url "http://cran.r-project.org/web/packages/") (define (cran-fetch name) - "Return an sxml representation of the CRAN page for the R package NAME, -or #f on failure. NAME is case-sensitive." + "Return an alist of the contents of the DESCRIPTION file for the R package +NAME, or #f on failure. NAME is case-sensitive." ;; This API always returns the latest release of the module. - (let ((cran-url (string-append %cran-url name "/"))) - (false-if-exception - (xml->sxml (http-fetch cran-url) - #:trim-whitespace? #t - #:namespaces '((xhtml . "http://www.w3.org/1999/xhtml")) - #:default-entity-handler - (lambda (port name) - (case name - ((nbsp) " ") - ((ge) ">=") - ((gt) ">") - ((lt) "<") - (else - (format (current-warning-port) - "~a:~a:~a: undefined entitity: ~a\n" - cran-url (port-line port) (port-column port) - name) - (symbol->string name)))))))) - -(define (downloads->url downloads) - "Extract from DOWNLOADS, the downloads item of the CRAN sxml tree, the -download URL." - (string-append "mirror://cran/" - ;; Remove double dots, because we want an - ;; absolute path. - (regexp-substitute/global - #f "\\.\\./" - (string-join ((sxpath '((xhtml:a 1) @ href *text*)) - (table-datum downloads " Package source: "))) - 'pre 'post))) - -(define (nodes->text nodeset) - "Return the concatenation of the text nodes among NODESET." - (string-join ((sxpath '(// *text*)) nodeset) " ")) - -(define (cran-sxml->sexp sxml) - "Return the `package' s-expression for a CRAN package from the SXML -representation of the package page." + (let ((url (string-append %cran-url name "/DESCRIPTION"))) + (description->alist (read-string (http-fetch url))))) + +(define (listify meta field) + "Look up FIELD in the alist META. If FIELD contains a comma-separated +string, turn it into a list and strip off parenthetic expressions. Return the +empty list when the FIELD cannot be found." + (let ((value (assoc-ref meta field))) + (if (not value) + '() + ;; Strip off parentheses + (let ((items (string-split (regexp-substitute/global + #f "( *\\([^\\)]+\\)) *" + value 'pre 'post) + #\,))) + ;; When there is whitespace inside of items it is probably because + ;; this was not an actual list to begin with. + (remove (cut string-any char-set:whitespace <>) + (map string-trim-both items)))))) + +(define (beautify-description description) + "Improve the package DESCRIPTION by turning a beginning sentence fragment +into a proper sentence and by using two spaces between sentences." + (let ((cleaned (if (string-prefix? "A " description) + (string-append "This package provides a" + (substring description 1)) + description))) + ;; Use double spacing between sentences + (regexp-substitute/global #f "\\. \\b" + cleaned 'pre ". " 'post))) + +(define (description->package meta) + "Return the `package' s-expression for a CRAN package from the alist META, +which was derived from the R package's DESCRIPTION file." (define (guix-name name) (if (string-prefix? "r-" name) (string-downcase name) (string-append "r-" (string-downcase name)))) - (sxml-match-let* - (((*TOP* (xhtml:html - ,head - (xhtml:body - (xhtml:h2 ,name-and-synopsis) - (xhtml:p ,description) - ,summary - (xhtml:h4 "Downloads:") ,downloads - . ,rest))) - sxml)) - (let* ((name (match:prefix (string-match ": " name-and-synopsis))) - (synopsis (match:suffix (string-match ": " name-and-synopsis))) - (version (nodes->text (table-datum summary "Version:"))) - (license ((compose string->license nodes->text) - (table-datum summary "License:"))) - (home-page (nodes->text ((sxpath '((xhtml:a 1))) - (table-datum summary "URL:")))) - (source-url (downloads->url downloads)) - (tarball (with-store store (download-to-store store source-url))) - (sysdepends (map match:substring - (list-matches - "[^ ]+" - ;; Strip off comma and parenthetical - ;; expressions. - (regexp-substitute/global - #f "(,|\\([^\\)]+\\))" - (nodes->text (table-datum summary - "SystemRequirements:")) - 'pre 'post)))) - (imports (map guix-name - ((sxpath '(// xhtml:a *text*)) - (table-datum summary "Imports:"))))) - `(package - (name ,(guix-name name)) - (version ,version) - (source (origin - (method url-fetch) - (uri (cran-uri ,name version)) - (sha256 - (base32 - ,(bytevector->nix-base32-string (file-sha256 tarball)))))) - (build-system r-build-system) - ,@(maybe-inputs sysdepends) - ,@(maybe-inputs imports 'propagated-inputs) - (home-page ,(if (string-null? home-page) - (string-append %cran-url name) - home-page)) - (synopsis ,synopsis) - ;; Use double spacing - (description ,(regexp-substitute/global #f "\\. \\b" description - 'pre ". " 'post)) - (license ,license))))) + (let* ((name (assoc-ref meta "Package")) + (synopsis (assoc-ref meta "Title")) + (version (assoc-ref meta "Version")) + (license (string->license (assoc-ref meta "License"))) + ;; Some packages have multiple home pages. Some have none. + (home-page (match (listify meta "URL") + ((url rest ...) url) + (_ (string-append %cran-url name)))) + (source-url (match (cran-uri name version) + ((url rest ...) url) + (_ #f))) + (tarball (with-store store (download-to-store store source-url))) + (sysdepends (map string-downcase (listify meta "SystemRequirements"))) + (propagate (map guix-name (lset-union equal? + (listify meta "Imports") + (listify meta "LinkingTo") + (delete "R" + (listify meta "Depends")))))) + `(package + (name ,(guix-name name)) + (version ,version) + (source (origin + (method url-fetch) + (uri (cran-uri ,name version)) + (sha256 + (base32 + ,(bytevector->nix-base32-string (file-sha256 tarball)))))) + (properties ,`(,'quasiquote ((,'upstream-name . ,name)))) + (build-system r-build-system) + ,@(maybe-inputs sysdepends) + ,@(maybe-inputs propagate 'propagated-inputs) + (home-page ,(if (string-null? home-page) + (string-append %cran-url name) + home-page)) + (synopsis ,synopsis) + (description ,(beautify-description (assoc-ref meta "Description"))) + (license ,license)))) (define (cran->guix-package package-name) "Fetch the metadata for PACKAGE-NAME from cran.r-project.org, and return the `package' s-expression corresponding to that package, or #f on failure." (let ((module-meta (cran-fetch package-name))) - (and=> module-meta cran-sxml->sexp))) + (and=> module-meta description->package))) ;;; @@ -209,32 +203,33 @@ (define (cran->guix-package package-name) (define (latest-release package) "Return an for the latest release of PACKAGE." - (define name - (if (string-prefix? "r-" package) - (string-drop package 2) - package)) - - (define sxml - (cran-fetch name)) - - (and sxml - (sxml-match-let* - (((*TOP* (xhtml:html - ,head - (xhtml:body - (xhtml:h2 ,name-and-synopsis) - (xhtml:p ,description) - ,summary - (xhtml:h4 "Downloads:") ,downloads - . ,rest))) - sxml)) - (let ((version (nodes->text (table-datum summary "Version:"))) - (url (downloads->url downloads))) - ;; CRAN does not provide signatures. - (upstream-source - (package package) - (version version) - (urls (list url))))))) + + (define (package->cran-name package) + (match (package-source package) + ((? origin? origin) + (match (origin-uri origin) + ((url rest ...) + (let ((end (string-rindex url #\_)) + (start (string-rindex url #\/))) + ;; The URL ends on + ;; (string-append "/" name "_" version ".tar.gz") + (substring url start end))) + (_ #f))) + (_ #f))) + + (define cran-name + (package->cran-name (specification->package package))) + + (define meta + (cran-fetch cran-name)) + + (and meta + (let ((version (assoc-ref meta "Version"))) + ;; CRAN does not provide signatures. + (upstream-source + (package package) + (version version) + (urls (cran-uri cran-name version)))))) (define (cran-package? package) "Return true if PACKAGE is an R package from CRAN." diff --git a/tests/cran.scm b/tests/cran.scm index ba5699a133..0a4a2fdd8f 100644 --- a/tests/cran.scm +++ b/tests/cran.scm @@ -19,120 +19,84 @@ (define-module (test-cran) #:use-module (guix import cran) #:use-module (guix tests) + #:use-module (srfi srfi-1) #:use-module (srfi srfi-64) + #:use-module (srfi srfi-26) #:use-module (ice-9 match)) -(define sxml - '(*TOP* (xhtml:html - (xhtml:head - (xhtml:title "CRAN - Package my-example-sxml")) - (xhtml:body - (xhtml:h2 "my-example-sxml: Short description") - (xhtml:p "Long description") - (xhtml:table - (@ (summary "Package my-example-sxml summary")) - (xhtml:tr - (xhtml:td "Version:") - (xhtml:td "1.2.3")) - (xhtml:tr - (xhtml:td "Depends:") - (xhtml:td "R (>= 3.1.0)")) - (xhtml:tr - (xhtml:td "SystemRequirements:") - (xhtml:td "cairo (>= 1.2 http://www.cairographics.org/)")) - (xhtml:tr - (xhtml:td "Imports:") - (xhtml:td - (xhtml:a (@ (href "../scales/index.html")) - "scales") - " (>= 0.2.3), " - (xhtml:a (@ (href "../proto/index.html")) - "proto") - ", " - (xhtml:a (@ (href "../Rcpp/index.html")) "Rcpp") - " (>= 0.11.0)")) - (xhtml:tr - (xhtml:td "Suggests:") - (xhtml:td - (xhtml:a (@ (href "../some/index.html")) - "some") - ", " - (xhtml:a (@ (href "../suggestions/index.html")) - "suggestions"))) - (xhtml:tr - (xhtml:td "License:") - (xhtml:td - (xhtml:a (@ (href "../../licenses/MIT")) "MIT"))) - (xhtml:tr - (xhtml:td "URL:") - (xhtml:td - (xhtml:a (@ (href "http://gnu.org/s/my-example-sxml")) - "http://gnu.org/s/my-example-sxml") - ", " - (xhtml:a (@ (href "http://alternative/home/page")) - "http://alternative/home/page")))) - (xhtml:h4 "Downloads:") - (xhtml:table - (@ (summary "Package my-example-sxml downloads")) - (xhtml:tr - (xhtml:td " Reference manual: ") - (xhtml:td - (xhtml:a (@ (href "my-example-sxml.pdf")) - " my-example-sxml.pdf "))) - (xhtml:tr - (xhtml:td " Package source: ") - (xhtml:td - (xhtml:a - (@ (href "../../../src/contrib/my-example-sxml_1.2.3.tar.gz")) - " my-example-sxml_1.2.3.tar.gz ")))) - (xhtml:h4 "Reverse dependencies:") - (xhtml:table - (@ (summary "Package my-example-sxml reverse dependencies")) - (xhtml:tr - (xhtml:td "Reverse depends:") - (xhtml:td "Too many.")) - (xhtml:tr - (xhtml:td "Reverse imports:") - (xhtml:td "Likewise.")) - (xhtml:tr - (xhtml:td "Reverse suggests:") - (xhtml:td "Uncountable."))))))) +(define description " +Package: My-Example +Type: Package +Title: Example package +Version: 1.2.3 +Date: 2015-12-10 +Author: Ricardo Wurmus +Maintainer: Guix Schmeeks +URL: http://gnu.org/s/my-example +Description: This is a long description +spanning multiple lines: and it could confuse the parser that +there is a colon : on the lines. + And: this line continues the description. +biocViews: 0 +SystemRequirements: Cairo (>= 0) +Depends: A C++11 compiler. Version 4.6.* of g++ (as + currently in Rtools) is insufficient; versions 4.8.*, 4.9.* or + later will be fine. +License: GPL (>= 3) +Imports: Rcpp (>= 0.11.5), proto, Scales +LinkingTo: Rcpp, BH +NeedsCompilation: yes +Repository: CRAN +Date/Publication: 2015-07-14 14:15:16 +") -(define simple-table - '(xhtml:table - (xhtml:tr - (xhtml:td "Numbers") - (xhtml:td "123")) - (xhtml:tr - (@ (class "whatever")) - (xhtml:td (@ (class "unimportant")) "Letters") - (xhtml:td "abc")) - (xhtml:tr - (xhtml:td "Letters") - (xhtml:td "xyz")) - (xhtml:tr - (xhtml:td "Single")) - (xhtml:tr - (xhtml:td "not a value") - (xhtml:td "not a label") - (xhtml:td "also not a label")))) +(define description-alist + ((@@ (guix import cran) description->alist) description)) + +(define simple-alist + '(("Key" . "Value") + ("SimpleList" . "R, Rcpp, something, whatever") + ("BadList" . "This is not a real list, you know?") + ("List" . "R (>= 2.2), BH (for no reason), GenomicRanges"))) (test-begin "cran") -(test-equal "table-datum: return list of first table cell matching label" - '((xhtml:td "abc")) - ((@@ (guix import cran) table-datum) simple-table "Letters")) +(test-assert "description->alist: contains all valid keys" + (let ((keys '("Package" "Type" "Title" "Version" "Date" + "Author" "Maintainer" "URL" "Description" + "SystemRequirements" "Depends" "License" + "Imports" "biocViews" "LinkingTo" + "NeedsCompilation" "Repository" + "Date/Publication"))) + (lset= string=? keys (map car description-alist)))) -(test-equal "table-datum: return empty list if no match" +(test-equal "listify: return empty list if key cannot be found" '() - ((@@ (guix import cran) table-datum) simple-table "Astronauts")) + ((@@ (guix import cran) listify) simple-alist "Letters")) + +(test-equal "listify: split comma-separated value into elements" + '("R" "Rcpp" "something" "whatever") + ((@@ (guix import cran) listify) simple-alist "SimpleList")) -(test-equal "table-datum: only consider the first cell as a label cell" +(test-equal "listify: strip off parentheses" + '("R" "BH" "GenomicRanges") + ((@@ (guix import cran) listify) simple-alist "List")) + +(test-equal "listify: ignore values that are no lists" '() - ((@@ (guix import cran) table-datum) simple-table "not a label")) + ((@@ (guix import cran) listify) simple-alist "BadList")) + +(test-equal "beautify-description: use double spacing" + "This is a package. It is great. Trust me Mr. Hendrix." + ((@@ (guix import cran) beautify-description) + "This is a package. It is great. Trust me Mr. Hendrix.")) +(test-equal "beautify-description: transform fragment into sentence" + "This package provides a function to establish world peace" + ((@@ (guix import cran) beautify-description) + "A function to establish world peace")) -(test-assert "cran-sxml->sexp" +(test-assert "description->package" ;; Replace network resources with sample data. (mock ((guix build download) url-fetch (lambda* (url file-name #:key (mirrors '())) @@ -140,32 +104,37 @@ (define simple-table (lambda () (display (match url - ("mirror://cran/src/contrib/my-example-sxml_1.2.3.tar.gz" + ("mirror://cran/src/contrib/My-Example_1.2.3.tar.gz" "source") (_ (error "Unexpected URL: " url)))))))) - (match ((@@ (guix import cran) cran-sxml->sexp) sxml) + (match ((@@ (guix import cran) description->package) description-alist) (('package - ('name "r-my-example-sxml") + ('name "r-my-example") ('version "1.2.3") ('source ('origin ('method 'url-fetch) - ('uri ('cran-uri "my-example-sxml" 'version)) + ('uri ('cran-uri "My-Example" 'version)) ('sha256 ('base32 (? string? hash))))) + ('properties ('quasiquote (('upstream-name . "My-Example")))) ('build-system 'r-build-system) ('inputs ('quasiquote (("cairo" ('unquote 'cairo))))) ('propagated-inputs ('quasiquote - (("r-proto" ('unquote 'r-proto)) + (("r-bh" ('unquote 'r-bh)) + ("r-proto" ('unquote 'r-proto)) ("r-rcpp" ('unquote 'r-rcpp)) ("r-scales" ('unquote 'r-scales))))) - ('home-page "http://gnu.org/s/my-example-sxml") - ('synopsis "Short description") - ('description "Long description") - ('license 'x11))) + ('home-page "http://gnu.org/s/my-example") + ('synopsis "Example package") + ('description + "This is a long description spanning multiple lines: \ +and it could confuse the parser that there is a colon : on the \ +lines. And: this line continues the description.") + ('license 'gpl3+))) (x (begin (format #t "~s\n" x) -- cgit v1.2.3