aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLudovic Courtès <ludo@gnu.org>2024-04-19 22:00:44 +0200
committerLudovic Courtès <ludo@gnu.org>2024-05-25 16:44:42 +0200
commit5a7cb59648d102168bd4ecd16f36b69e0f594be1 (patch)
treee29fb395373468bc9e041f8e39de7a30112cdfe9
parent73b3f941d7d911a1b2bb2bf77d37cb3a12ed4291 (diff)
downloadguix-5a7cb59648d102168bd4ecd16f36b69e0f594be1.tar
guix-5a7cb59648d102168bd4ecd16f36b69e0f594be1.tar.gz
deduplication: Detect holes and create sparse files.
This reduces disk usage of sparse files that are substituted such as Guile object files (ELF files). As of Guile 3.0.9, .go files are sparse due to ELF sections being aligned on 64 KiB boundaries. This reduces disk usage reported by “du -sh” by 9% for the ‘guix’ package, by 23% for ‘guile’, and by 35% for ‘guile-git’. * guix/store/deduplication.scm (hole-size, find-holes): New procedures. (tee)[seekable?]: New variable. [read!]: Add case when SEEKABLE? is true. * tests/store-deduplication.scm (cartesian-product): New procedure. ("copy-file/deduplicate, sparse files (holes: ~a/~a/~a)"): New test set. Change-Id: Iad2ab7830dcb1220e2026f4a127a6c718afa8964
-rw-r--r--guix/store/deduplication.scm79
-rw-r--r--tests/store-deduplication.scm58
2 files changed, 134 insertions, 3 deletions
diff --git a/guix/store/deduplication.scm b/guix/store/deduplication.scm
index 129574c073..2005653c95 100644
--- a/guix/store/deduplication.scm
+++ b/guix/store/deduplication.scm
@@ -1,6 +1,6 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2017 Caleb Ristvedt <caleb.ristvedt@cune.org>
-;;; Copyright © 2018-2022 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2018-2022, 2024 Ludovic Courtès <ludo@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -28,6 +28,7 @@
#:use-module (srfi srfi-11)
#:use-module (srfi srfi-34)
#:use-module (srfi srfi-35)
+ #:use-module (rnrs bytevectors)
#:use-module (rnrs io ports)
#:use-module (ice-9 match)
#:use-module (guix serialization)
@@ -206,6 +207,48 @@ under STORE."
#f)
(else (apply throw args)))))))))))
+(define (hole-size bv start size)
+ "Return a lower bound of the number of leading zeros in the first SIZE bytes
+of BV, starting at offset START."
+ (let ((end (+ start size)))
+ (let loop ((offset start))
+ (if (> offset (- end 4))
+ (- offset start)
+ (if (zero? (bytevector-u32-native-ref bv offset))
+ (loop (+ offset 4))
+ (- offset start))))))
+
+(define (find-holes bv start size)
+ "Return the list of offset/size pairs representing \"holes\" (sequences of
+zeros) in the SIZE bytes starting at START in BV."
+ (define granularity
+ ;; Disk block size is traditionally 512 bytes; focus on larger holes to
+ ;; reduce the computational effort.
+ 1024)
+
+ (define (align offset)
+ (match (modulo offset granularity)
+ (0 offset)
+ (mod (+ offset (- granularity mod)))))
+
+ (define end
+ (+ start size))
+
+ (let loop ((offset start)
+ (size size)
+ (holes '()))
+ (if (>= offset end)
+ (reverse! holes)
+ (let ((hole (hole-size bv offset size)))
+ (if (and hole (>= hole granularity))
+ (let ((next (align (+ offset hole))))
+ (loop next
+ (- size (- next offset))
+ (cons (cons offset hole) holes)))
+ (loop (+ offset granularity)
+ (- size granularity)
+ holes))))))
+
(define (tee input len output)
"Return a port that reads up to LEN bytes from INPUT and writes them to
OUTPUT as it goes."
@@ -217,6 +260,10 @@ OUTPUT as it goes."
(&nar-error (port input)
(file (port-filename output))))))
+ (define seekable?
+ ;; Whether OUTPUT can be a sparse file.
+ (file-port? output))
+
(define (read! bv start count)
;; Read at most LEN bytes in total.
(let ((count (min count (- len bytes-read))))
@@ -229,7 +276,35 @@ OUTPUT as it goes."
;; Do not return zero since zero means EOF, so try again.
(loop (get-bytevector-n! input bv start count)))
(else
- (put-bytevector output bv start ret)
+ (if seekable?
+ ;; Render long-enough sequences of zeros as "holes".
+ (match (find-holes bv start ret)
+ (()
+ (put-bytevector output bv start ret))
+ (holes
+ (let loop ((offset start)
+ (size ret)
+ (holes holes))
+ (match holes
+ (()
+ (if (> size 0)
+ (put-bytevector output bv offset size)
+ (when (= len (+ bytes-read ret))
+ ;; We created a hole in OUTPUT by seeking
+ ;; forward but that hole only comes into
+ ;; existence if we write something after it.
+ ;; Make the hole one byte smaller and write a
+ ;; final zero.
+ (seek output -1 SEEK_CUR)
+ (put-u8 output 0))))
+ (((hole-start . hole-size) . rest)
+ (let ((prefix-len (- hole-start offset)))
+ (put-bytevector output bv offset prefix-len)
+ (seek output hole-size SEEK_CUR)
+ (loop (+ hole-start hole-size)
+ (- size prefix-len hole-size)
+ rest)))))))
+ (put-bytevector output bv start ret))
(set! bytes-read (+ bytes-read ret))
ret)))))
diff --git a/tests/store-deduplication.scm b/tests/store-deduplication.scm
index f1845035d8..f116ff9834 100644
--- a/tests/store-deduplication.scm
+++ b/tests/store-deduplication.scm
@@ -1,5 +1,5 @@
;;; GNU Guix --- Functional package management for GNU
-;;; Copyright © 2018, 2020-2022 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2018, 2020-2022, 2024 Ludovic Courtès <ludo@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -24,10 +24,27 @@
#:use-module (guix build utils)
#:use-module (rnrs bytevectors)
#:use-module (ice-9 binary-ports)
+ #:use-module (ice-9 match)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-26)
#:use-module (srfi srfi-64))
+(define (cartesian-product . lst)
+ "Return the Cartesian product of all the given lists."
+ (match lst
+ ((head)
+ (map list head))
+ ((head . rest)
+ (let ((others (apply cartesian-product rest)))
+ (append-map (lambda (init)
+ (map (lambda (lst)
+ (cons init lst))
+ others))
+ head)))
+ (()
+ '())))
+
+
(test-begin "store-deduplication")
(test-equal "deduplicate, below %deduplication-minimum-size"
@@ -166,4 +183,43 @@
(cut string-append store <>))
'("/a" "/b" "/c"))))))))
+(for-each (match-lambda
+ ((initial-gap middle-gap final-gap)
+ (test-assert
+ (format #f "copy-file/deduplicate, sparse files (holes: ~a/~a/~a)"
+ initial-gap middle-gap final-gap)
+ (call-with-temporary-directory
+ (lambda (store)
+ (let ((source (string-append store "/source")))
+ (call-with-output-file source
+ (lambda (port)
+ (seek port initial-gap SEEK_CUR)
+ (display "hi!" port)
+ (seek port middle-gap SEEK_CUR)
+ (display "bye." port)
+ (when (> final-gap 0)
+ (seek port (- final-gap 1) SEEK_CUR)
+ (put-u8 port 0))))
+
+ (for-each (lambda (target)
+ (copy-file/deduplicate source
+ (string-append store target)
+ #:store store))
+ '("/a" "/b" "/c"))
+ (system* "du" "-h" source)
+ (system* "du" "-h" "--apparent-size" source)
+ (system* "du" "-h" (string-append store "/a"))
+ (system* "du" "-h" "--apparent-size" (string-append store "/a"))
+ (and (directory-exists? (string-append store "/.links"))
+ (file=? source (string-append store "/a"))
+ (apply = (map (compose stat:ino stat
+ (cut string-append store <>))
+ '("/a" "/b" "/c")))
+ (let ((st (pk 'S (stat (string-append store "/a")))))
+ (<= (* 512 (stat:blocks st))
+ (stat:size st))))))))))
+ (cartesian-product '(0 3333 8192)
+ '(8192 9999 16384 22222)
+ '(0 8192)))
+
(test-end "store-deduplication")