aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristopher Baines <mail@cbaines.net>2017-06-19 08:14:43 +0100
committerLudovic Courtès <ludo@gnu.org>2017-07-25 23:24:16 +0200
commitf135b4ae8397d2c501150d3ead3e0603e770ce3f (patch)
treee28318a7a29db971d289f78d716ec73adbd78818
parent84620dd0c4f8f96cfdafb9a3ce8cce5d36a52b03 (diff)
downloadguix-f135b4ae8397d2c501150d3ead3e0603e770ce3f.tar
guix-f135b4ae8397d2c501150d3ead3e0603e770ce3f.tar.gz
git-download: Speed up 'git-predicate'.
Adjust 'git-predicate' to use data structures that perform better when used with git repositories with a large number of files. Previously when matching either a regular file or directory, 'git-predicate' would search a list with a length equal to the number of files in the repository. As a search operation happens for roughly every file in the repository, this meant that the time taken to use 'git-predicate' to traverse all the files in a repository was roughly exponential with respect to the number of files in the repository. Now, for matching regular files or symlinks, 'git-predicate' uses a vhash using the inode value as the key. This should perform roughly in constant amount of time, instead of linear with respect to the number of files in the repository. For matching directories, 'git-predicate' now uses a tree structure stored in association lists. To check if a directory is in the tree, the tree is traversed from the root. The time complexity of this depends on the shape of the tree, but it should be an improvement on searching through the list of all files. * guix/git-download.scm (files->directory-tree, directory-in-tree?): New procedures. (git-predicate): Compute DIRECTORY-TREE. Turn INODES into a vhash. Adjust body of lambda accordingly. Co-authored-by: Ludovic Courtès <ludo@gnu.org>
-rw-r--r--guix/git-download.scm95
1 files changed, 68 insertions, 27 deletions
diff --git a/guix/git-download.scm b/guix/git-download.scm
index 316835502c..5019a3e62f 100644
--- a/guix/git-download.scm
+++ b/guix/git-download.scm
@@ -1,6 +1,7 @@
;;; GNU Guix --- Functional package management for GNU
-;;; Copyright © 2014, 2015, 2016 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2014, 2015, 2016, 2017 Ludovic Courtès <ludo@gnu.org>
;;; Copyright © 2017 Mathieu Lirzin <mthl@gnu.org>
+;;; Copyright © 2017 Christopher Baines <mail@cbaines.net>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -28,6 +29,7 @@
#:use-module (ice-9 match)
#:use-module (ice-9 popen)
#:use-module (ice-9 rdelim)
+ #:use-module (ice-9 vlist)
#:use-module (srfi srfi-1)
#:export (git-reference
git-reference?
@@ -125,45 +127,84 @@ HASH-ALGO (a symbol). Use NAME as the file name, or a generic name if #f."
"Return the file-name for packages using git-download."
(string-append name "-" version "-checkout"))
+
+;;;
+;;; 'git-predicate'.
+;;;
+
+(define (files->directory-tree files)
+ "Return a tree of vhashes representing the directory listed in FILES, a list
+like '(\"a/b\" \"b/c/d\")."
+ (fold (lambda (file result)
+ (let loop ((file (string-split file #\/))
+ (result result))
+ (match file
+ ((_)
+ result)
+ ((directory children ...)
+ (match (vhash-assoc directory result)
+ (#f
+ (vhash-cons directory (loop children vlist-null)
+ result))
+ ((_ . previous)
+ ;; XXX: 'vhash-delete' is O(n).
+ (vhash-cons directory (loop children previous)
+ (vhash-delete directory result)))))
+ (()
+ result))))
+ vlist-null
+ files))
+
+(define (directory-in-tree? tree directory)
+ "Return true if DIRECTORY, a string like \"a/b\", denotes a directory listed
+in TREE."
+ (let loop ((directory (string-split directory #\/))
+ (tree tree))
+ (match directory
+ (()
+ #t)
+ ((head . tail)
+ (match (vhash-assoc head tree)
+ ((_ . sub-tree) (loop tail sub-tree))
+ (#f #f))))))
+
(define (git-predicate directory)
"Return a predicate that returns true if a file is part of the Git checkout
living at DIRECTORY. Upon Git failure, return #f instead of a predicate.
The returned predicate takes two arguments FILE and STAT where FILE is an
absolute file name and STAT is the result of 'lstat'."
- (define (parent-directory? thing directory)
- ;; Return #t if DIRECTORY is the parent of THING.
- (or (string-suffix? thing directory)
- (and (string-index thing #\/)
- (parent-directory? (dirname thing) directory))))
-
- (let* ((pipe (with-directory-excursion directory
- (open-pipe* OPEN_READ "git" "ls-files")))
- (files (let loop ((lines '()))
- (match (read-line pipe)
- ((? eof-object?)
- (reverse lines))
- (line
- (loop (cons line lines))))))
- (inodes (map (lambda (file)
- (let ((stat (lstat
- (string-append directory "/" file))))
- (cons (stat:dev stat) (stat:ino stat))))
- files))
- (status (close-pipe pipe)))
+ (let* ((pipe (with-directory-excursion directory
+ (open-pipe* OPEN_READ "git" "ls-files")))
+ (files (let loop ((lines '()))
+ (match (read-line pipe)
+ ((? eof-object?)
+ (reverse lines))
+ (line
+ (loop (cons line lines))))))
+ (directory-tree (files->directory-tree files))
+ (inodes (fold (lambda (file result)
+ (let ((stat
+ (lstat (string-append directory "/"
+ file))))
+ (vhash-consv (stat:ino stat) (stat:dev stat)
+ result)))
+ vlist-null
+ files))
+ (prefix-length (+ 1 (string-length (canonicalize-path directory))))
+ (status (close-pipe pipe)))
(and (zero? status)
(lambda (file stat)
(match (stat:type stat)
('directory
- ;; 'git ls-files' does not list directories, only regular files,
- ;; so we need this special trick.
- (any (lambda (f) (parent-directory? f file))
- files))
+ (directory-in-tree? directory-tree
+ (string-drop file prefix-length)))
((or 'regular 'symlink)
;; Comparing file names is always tricky business so we rely on
;; inode numbers instead
- (member (cons (stat:dev stat) (stat:ino stat))
- inodes))
+ (match (vhash-assv (stat:ino stat) inodes)
+ ((_ . dev) (= dev (stat:dev stat)))
+ (#f #f)))
(_
#f))))))