From bf5bf5778cb7c3a2475c6acd707abc925b1819aa Mon Sep 17 00:00:00 2001 From: Caleb Ristvedt Date: Sun, 27 May 2018 23:20:54 +0200 Subject: Add (guix store deduplication). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * guix/store/database.scm (register-path): Add #:deduplicate? and call 'deduplicate' when it's true. (counting-wrapper-port, nar-sha256): Move to... * guix/store/deduplication.scm: ... here. New file. * tests/store-deduplication.scm: New file. * Makefile.am (STORE_MODULES): Add deduplication.scm. (SCM_TESTS) [HAVE_GUILE_SQLITE3]: Add store-deduplication.scm. Co-authored-by: Ludovic Courtès --- Makefile.am | 6 +- guix/store/database.scm | 43 ++---------- guix/store/deduplication.scm | 148 ++++++++++++++++++++++++++++++++++++++++++ tests/store-deduplication.scm | 64 ++++++++++++++++++ 4 files changed, 222 insertions(+), 39 deletions(-) create mode 100644 guix/store/deduplication.scm create mode 100644 tests/store-deduplication.scm diff --git a/Makefile.am b/Makefile.am index d81fce5585..474575c9f2 100644 --- a/Makefile.am +++ b/Makefile.am @@ -259,7 +259,8 @@ endif BUILD_DAEMON_OFFLOAD # Scheme implementation of the build daemon and related functionality. 
STORE_MODULES = \ - guix/store/database.scm + guix/store/database.scm \ + guix/store/deduplication.scm if HAVE_GUILE_SQLITE3 MODULES += $(STORE_MODULES) @@ -392,7 +393,8 @@ endif if HAVE_GUILE_SQLITE3 SCM_TESTS += \ - tests/store-database.scm + tests/store-database.scm \ + tests/store-deduplication.scm endif diff --git a/guix/store/database.scm b/guix/store/database.scm index b9745dbe14..3623c0e7a0 100644 --- a/guix/store/database.scm +++ b/guix/store/database.scm @@ -21,10 +21,9 @@ (define-module (guix store database) #:use-module (sqlite3) #:use-module (guix config) #:use-module (guix serialization) + #:use-module (guix store deduplication) #:use-module (guix base16) - #:use-module (guix hash) #:use-module (guix build syscalls) - #:use-module (rnrs io ports) #:use-module (srfi srfi-11) #:use-module (srfi srfi-19) #:use-module (ice-9 match) @@ -140,39 +139,6 @@ (define* (sqlite-register #:key db-file path (references '()) ;;; High-level interface. ;;; -;; XXX: Would it be better to just make WRITE-FILE give size as well? I question -;; the general utility of this approach. -(define (counting-wrapper-port output-port) - "Some custom ports don't implement GET-POSITION at all. But if we want to -figure out how many bytes are being written, we will want to use that. So this -makes a wrapper around a port which implements GET-POSITION." - (let ((byte-count 0)) - (make-custom-binary-output-port "counting-wrapper" - (lambda (bytes offset count) - (set! byte-count - (+ byte-count count)) - (put-bytevector output-port bytes - offset count) - count) - (lambda () - byte-count) - #f - (lambda () - (close-port output-port))))) - - -(define (nar-sha256 file) - "Gives the sha256 hash of a file and the size of the file in nar form." 
- (let-values (((port get-hash) (open-sha256-port))) - (let ((wrapper (counting-wrapper-port port))) - (write-file file wrapper) - (force-output wrapper) - (force-output port) - (let ((hash (get-hash)) - (size (port-position wrapper))) - (close-port wrapper) - (values hash size))))) - ;; TODO: Factorize with that in (gnu build install). (define (reset-timestamps file) "Reset the modification time on FILE and on all the files it contains, if @@ -211,7 +177,7 @@ (define (reset-timestamps file) (define* (register-path path #:key (references '()) deriver prefix - state-directory) + state-directory (deduplicate? #t)) ;; Priority for options: first what is given, then environment variables, ;; then defaults. %state-directory, %store-directory, and ;; %store-database-directory already handle the "environment variables / @@ -262,4 +228,7 @@ (define* (register-path path #:deriver deriver #:hash (string-append "sha256:" (bytevector->base16-string hash)) - #:nar-size nar-size)))) + #:nar-size nar-size) + + (when deduplicate? + (deduplicate real-path hash #:store store-dir))))) diff --git a/guix/store/deduplication.scm b/guix/store/deduplication.scm new file mode 100644 index 0000000000..4b4ac01f64 --- /dev/null +++ b/guix/store/deduplication.scm @@ -0,0 +1,148 @@ +;;; GNU Guix --- Functional package management for GNU +;;; Copyright © 2017 Caleb Ristvedt +;;; Copyright © 2018 Ludovic Courtès +;;; +;;; This file is part of GNU Guix. +;;; +;;; GNU Guix is free software; you can redistribute it and/or modify it +;;; under the terms of the GNU General Public License as published by +;;; the Free Software Foundation; either version 3 of the License, or (at +;;; your option) any later version. +;;; +;;; GNU Guix is distributed in the hope that it will be useful, but +;;; WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. 
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with GNU Guix.  If not, see <http://www.gnu.org/licenses/>.
+
+;;; This houses stuff we do to files when they arrive at the store - resetting
+;;; timestamps, deduplicating, etc.
+
+(define-module (guix store deduplication)
+  #:use-module (guix hash)
+  #:use-module (guix build utils)
+  #:use-module (guix base16)
+  #:use-module (srfi srfi-11)
+  #:use-module (rnrs io ports)
+  #:use-module (ice-9 ftw)
+  #:use-module (guix serialization)
+  #:export (nar-sha256
+            deduplicate))
+
+;; Would it be better to just make WRITE-FILE give size as well?  I question
+;; the general utility of this approach.
+(define (counting-wrapper-port output-port)
+  "Return a binary output port that forwards all it receives to OUTPUT-PORT and
+counts the bytes written, so that PORT-POSITION works on it even when
+OUTPUT-PORT lacks GET-POSITION.  Closing the wrapper closes OUTPUT-PORT."
+  (let ((byte-count 0))
+    (make-custom-binary-output-port "counting-wrapper"
+                                    (lambda (bytes offset count) ;write!
+                                      (set! byte-count
+                                        (+ byte-count count))
+                                      (put-bytevector output-port bytes
+                                                      offset count)
+                                      count)
+                                    (lambda ()          ;get-position
+                                      byte-count)
+                                    #f                  ;no set-position!
+                                    (lambda ()          ;close
+                                      (close-port output-port)))))
+
+(define (nar-sha256 file)
+  "Return two values: the sha256 hash of FILE's nar form and that nar's size."
+  (let-values (((port get-hash) (open-sha256-port)))
+    (let ((wrapper (counting-wrapper-port port)))
+      (write-file file wrapper)
+      (force-output wrapper)              ;flush both layers before
+      (force-output port)                 ;reading the hash and the count
+      (let ((hash (get-hash))
+            (size (port-position wrapper)))
+        (close-port wrapper)              ;also closes PORT
+        (values hash size)))))
+
+(define (tempname-in directory)
+  "Return a name of the form DIRECTORY/.tmp-link-PID-HEX that is currently
+unused.  It may be taken by the time you create it, but it's a good shot."
+  (let ((const-part (string-append directory "/.tmp-link-"
+                                   (number->string (getpid)))))
+    (let try ((guess-part
+               (number->string (random most-positive-fixnum) 16)))
+      (if (file-exists? (string-append const-part "-" guess-part))
+          (try (number->string (random most-positive-fixnum) 16))
+          (string-append const-part "-" guess-part)))))
+
+(define* (get-temp-link target #:optional (link-prefix (dirname target)))
+  "Like mkstemp!, but instead of creating a new file and giving you the name,
+it creates a new hardlink to TARGET and gives you the name, retrying with a
+fresh name on EEXIST.  Since cross-filesystem hardlinks don't work, the temp
+link must be created on the same filesystem - where in that filesystem it is
+can be controlled by LINK-PREFIX."
+  (let try ((tempname (tempname-in link-prefix)))
+    (catch 'system-error
+      (lambda ()
+        (link target tempname)
+        tempname)
+      (lambda args              ;ARGS is the whole throw: key plus arguments
+        (if (= (system-error-errno args) EEXIST)
+            (try (tempname-in link-prefix))
+            (apply throw args))))))
+
+;; There are 3 main kinds of errors we can get from hardlinking: "Too many
+;; things link to this" (EMLINK), "this link already exists" (EEXIST), and
+;; "can't fit more stuff in this directory" (ENOSPC).
+
+(define (replace-with-link target to-replace)
+  "Atomically replace the file TO-REPLACE with a link to TARGET.  Note: TARGET
+and TO-REPLACE must be on the same file system."
+  (let ((temp-link (get-temp-link target (dirname to-replace))))
+    (rename-file temp-link to-replace)))
+
+(define-syntax-rule (false-if-system-error (errors ...) exp ...)
+  "Given ERRORS, a list of system error codes to ignore, evaluate EXP... and
+return #f if any of the system error codes in the given list is thrown."
+  (catch 'system-error
+    (lambda ()
+      exp ...)
+    (lambda args
+      (if (member (system-error-errno args) (list errors ...))
+          #f
+          (apply throw args)))))
+
+(define* (deduplicate path hash #:key (store (%store-directory)))
+  "Check if a store item with sha256 hash HASH already exists.  If so,
+replace PATH with a hardlink to the already-existing one.  If not, register
+PATH so that future duplicates can hardlink to it.  PATH is assumed to be
+under STORE; directories are traversed and each entry is deduplicated."
+  (let* ((links-directory (string-append store "/.links"))
+         (link-file       (string-append links-directory "/"
+                                         (bytevector->base16-string hash))))
+    (mkdir-p links-directory)
+    (if (file-is-directory? path)
+        ;; Can't hardlink directories, so hardlink their atoms.
+        (for-each (lambda (file)
+                    (unless (member file '("." ".."))
+                      (let ((file (string-append path "/" file)))
+                        (deduplicate file (nar-sha256 file) #:store store))))
+                  (scandir path))                 ;bare entry names
+        (if (file-exists? link-file)
+            (false-if-system-error (EMLINK)
+              (replace-with-link link-file path))
+            (catch 'system-error
+              (lambda ()
+                (link path link-file))
+              (lambda args
+                (let ((errno (system-error-errno args)))
+                  (cond ((= errno EEXIST)
+                         ;; Someone else put an entry for PATH in
+                         ;; LINKS-DIRECTORY before we could.  Let's use it.
+                         (false-if-system-error (EMLINK)
+                           (replace-with-link link-file path)))
+                        ((= errno ENOSPC)
+                         ;; There's not enough room in the directory index for
+                         ;; more entries in .links, but that's fine: we can
+                         ;; just stop.
+                         #f)
+                        (else (apply throw args))))))))))
diff --git a/tests/store-deduplication.scm b/tests/store-deduplication.scm
new file mode 100644
index 0000000000..04817a193a
--- /dev/null
+++ b/tests/store-deduplication.scm
@@ -0,0 +1,64 @@
+;;; GNU Guix --- Functional package management for GNU
+;;; Copyright © 2018 Ludovic Courtès
+;;;
+;;; This file is part of GNU Guix.
+;;;
+;;; GNU Guix is free software; you can redistribute it and/or modify it
+;;; under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 3 of the License, or (at
+;;; your option) any later version.
+;;;
+;;; GNU Guix is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with GNU Guix.  If not, see <http://www.gnu.org/licenses/>.
+
+(define-module (test-store-deduplication)
+  #:use-module (guix tests)
+  #:use-module (guix store deduplication)
+  #:use-module (guix hash)
+  #:use-module ((guix utils) #:select (call-with-temporary-directory))
+  #:use-module (guix build utils)
+  #:use-module (rnrs bytevectors)
+  #:use-module (ice-9 binary-ports)
+  #:use-module (srfi srfi-1)
+  #:use-module (srfi srfi-64))
+
+(test-begin "store-deduplication")
+
+;; Identical files must end up sharing one inode; a distinct file must not.
+(test-equal "deduplicate"
+  (cons* #t #f                                    ;inode comparisons
+         2 (make-list 5 6))                       ;'nlink' values
+
+  (call-with-temporary-directory
+   (lambda (store)
+     (define (in-store name) (string-append store "/" name))
+     (define (dump bv file)
+       (call-with-output-file file
+         (lambda (port)
+           (put-bytevector port bv))))
+
+     (let* ((payload (string->utf8 "Hello, world!"))
+            (copies  (map (compose in-store number->string) (iota 5)))
+            (loner   (in-store "unique")))
+       (for-each (lambda (copy) (dump payload copy)) copies)
+       (dump (string->utf8 "This is unique.") loner)
+
+       (for-each (lambda (copy)
+                   (deduplicate copy (sha256 payload) #:store store))
+                 copies)
+       (deduplicate loner (nar-sha256 loner) #:store store)
+
+       ;; (system (string-append "ls -lRia " store))
+       (let ((inode (compose stat:ino stat))
+             (links (compose stat:nlink stat)))
+         (cons* (apply = (map inode copies))
+                (= (inode loner) (inode (car copies)))
+                (links loner)
+                (map links copies)))))))
+
+(test-end "store-deduplication")
-- 
cgit v1.2.3