summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--config-daemon.ac11
-rw-r--r--guix/store/deduplication.scm69
-rw-r--r--nix/libstore/gc.cc4
-rw-r--r--nix/libstore/local-store.hh3
-rw-r--r--nix/libstore/optimise-store.cc15
-rw-r--r--tests/derivations.scm14
-rw-r--r--tests/nar.scm7
-rw-r--r--tests/store-deduplication.scm41
-rw-r--r--tests/store.scm4
9 files changed, 126 insertions, 42 deletions
diff --git a/config-daemon.ac b/config-daemon.ac
index 5ddc740600..86306effe1 100644
--- a/config-daemon.ac
+++ b/config-daemon.ac
@@ -94,17 +94,6 @@ if test "x$guix_build_daemon" = "xyes"; then
   AC_CHECK_FUNCS([lutimes lchown posix_fallocate sched_setaffinity \
      statvfs nanosleep strsignal statx])
 
-  dnl Check whether the store optimiser can optimise symlinks.
-  AC_MSG_CHECKING([whether it is possible to create a link to a symlink])
-  ln -s bla tmp_link
-  if ln tmp_link tmp_link2 2> /dev/null; then
-      AC_MSG_RESULT(yes)
-      AC_DEFINE(CAN_LINK_SYMLINK, 1, [Whether link() works on symlinks.])
-  else
-      AC_MSG_RESULT(no)
-  fi
-  rm -f tmp_link tmp_link2
-
   dnl Check for <locale>.
   AC_LANG_PUSH(C++)
   AC_CHECK_HEADERS([locale])
diff --git a/guix/store/deduplication.scm b/guix/store/deduplication.scm
index cd9660174c..370df4a74c 100644
--- a/guix/store/deduplication.scm
+++ b/guix/store/deduplication.scm
@@ -1,6 +1,6 @@
 ;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2017 Caleb Ristvedt <caleb.ristvedt@cune.org>
-;;; Copyright © 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2018-2021 Ludovic Courtès <ludo@gnu.org>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -22,12 +22,13 @@
 
 (define-module (guix store deduplication)
   #:use-module (gcrypt hash)
-  #:use-module (guix build utils)
+  #:use-module ((guix build utils) #:hide (dump-port))
   #:use-module (guix build syscalls)
   #:use-module (guix base32)
   #:use-module (srfi srfi-11)
   #:use-module (srfi srfi-34)
   #:use-module (srfi srfi-35)
+  #:use-module (rnrs bytevectors)
   #:use-module (rnrs io ports)
   #:use-module (ice-9 ftw)
   #:use-module (ice-9 match)
@@ -37,6 +38,31 @@
             dump-file/deduplicate
             copy-file/deduplicate))
 
+;; TODO: Remove once 'dump-port' in (guix build utils) has an optional 'len'
+;; parameter.
+(define* (dump-port in out
+                    #:optional len
+                    #:key (buffer-size 16384))
+  "Read LEN bytes from IN (or as much as possible if LEN is #f) and write it
+to OUT, using chunks of BUFFER-SIZE bytes."
+  (define buffer
+    (make-bytevector buffer-size))
+
+  (let loop ((total 0)
+             (bytes (get-bytevector-n! in buffer 0
+                                       (if len
+                                           (min len buffer-size)
+                                           buffer-size))))
+    (or (eof-object? bytes)
+        (and len (= total len))
+        (let ((total (+ total bytes)))
+          (put-bytevector out buffer 0 bytes)
+          (loop total
+                (get-bytevector-n! in buffer 0
+                                   (if len
+                                       (min (- len total) buffer-size)
+                                       buffer-size)))))))
+
 (define (nar-sha256 file)
   "Gives the sha256 hash of a file and the size of the file in nar form."
   (let-values (((port get-hash) (open-sha256-port)))
@@ -127,11 +153,27 @@ Note: TARGET, TO-REPLACE, and SWAP-DIRECTORY must be on the same file system."
           (unless (= EMLINK (system-error-errno args))
             (apply throw args)))))))
 
+(define %deduplication-minimum-size
+  ;; Size below which files are not deduplicated.  This avoids adding too many
+  ;; entries to '.links', which would slow down 'removeUnusedLinks' while
+  ;; saving little space.  Keep in sync with optimize-store.cc.
+  8192)
+
 (define* (deduplicate path hash #:key (store (%store-directory)))
   "Check if a store item with sha256 hash HASH already exists.  If so,
 replace PATH with a hardlink to the already-existing one.  If not, register
 PATH so that future duplicates can hardlink to it.  PATH is assumed to be
 under STORE."
+  ;; Lightweight promises.
+  (define-syntax-rule (delay exp)
+    (let ((value #f))
+      (lambda ()
+        (unless value
+          (set! value exp))
+        value)))
+  (define-syntax-rule (force promise)
+    (promise))
+
   (define links-directory
     (string-append store "/.links"))
 
@@ -144,13 +186,18 @@ under STORE."
                     ((file . properties)
                      (unless (member file '("." ".."))
                        (let* ((file (string-append path "/" file))
+                              (st   (delay (lstat file)))
                               (type (match (assoc-ref properties 'type)
                                       ((or 'unknown #f)
-                                       (stat:type (lstat file)))
+                                       (stat:type (force st)))
                                       (type type))))
-                         (loop file type
-                               (and (not (eq? 'directory type))
-                                    (nar-sha256 file)))))))
+                         (when (or (eq? 'directory type)
+                                   (and (eq? 'regular type)
+                                        (>= (stat:size (force st))
+                                            %deduplication-minimum-size)))
+                           (loop file type
+                                 (and (not (eq? 'directory type))
+                                      (nar-sha256 file))))))))
                   (scandir* path))
         (let ((link-file (string-append links-directory "/"
                                         (bytevector->nix-base32-string hash))))
@@ -222,9 +269,9 @@ OUTPUT as it goes."
 
 This procedure is suitable as a #:dump-file argument to 'restore-file'.  When
 used that way, it deduplicates files on the fly as they are restored, thereby
-removing the need to a deduplication pass that would re-read all the files
+removing the need for a deduplication pass that would re-read all the files
 down the road."
-  (define hash
+  (define (dump-and-compute-hash)
     (call-with-output-file file
       (lambda (output)
         (let-values (((hash-port get-hash)
@@ -236,7 +283,11 @@ down the road."
           (close-port hash-port)
           (get-hash)))))
 
-  (deduplicate file hash #:store store))
+  (if (>= size %deduplication-minimum-size)
+      (deduplicate file (dump-and-compute-hash) #:store store)
+      (call-with-output-file file
+        (lambda (output)
+          (dump-port input output size)))))
 
 (define* (copy-file/deduplicate source target
                                 #:key (store (%store-directory)))
diff --git a/nix/libstore/gc.cc b/nix/libstore/gc.cc
index e1d0765154..16519116e4 100644
--- a/nix/libstore/gc.cc
+++ b/nix/libstore/gc.cc
@@ -606,7 +606,9 @@ void LocalStore::removeUnusedLinks(const GCState & state)
             throw SysError(format("statting `%1%'") % path);
 #endif
 
-        if (st.st_nlink != 1) {
+	/* Drop links for files smaller than 'deduplicationMinSize', even if
+	   they have more than one hard link.  */
+        if (st.st_nlink != 1 && st.st_size >= deduplicationMinSize) {
             actualSize += st.st_size;
             unsharedSize += (st.st_nlink - 1) * st.st_size;
             continue;
diff --git a/nix/libstore/local-store.hh b/nix/libstore/local-store.hh
index 9ba37219da..20d3c3c893 100644
--- a/nix/libstore/local-store.hh
+++ b/nix/libstore/local-store.hh
@@ -292,4 +292,7 @@ void canonicaliseTimestampAndPermissions(const Path & path);
 
 MakeError(PathInUse, Error);
 
+/* Size below which a file is not considered for deduplication.  */
+extern const size_t deduplicationMinSize;
+
 }
diff --git a/nix/libstore/optimise-store.cc b/nix/libstore/optimise-store.cc
index eb303ab4c3..9fd6f3cb35 100644
--- a/nix/libstore/optimise-store.cc
+++ b/nix/libstore/optimise-store.cc
@@ -15,6 +15,9 @@
 
 namespace nix {
 
+/* Any file smaller than this is not considered for deduplication.
+   Keep in sync with (guix store deduplication).  */
+const size_t deduplicationMinSize = 8192;
 
 static void makeWritable(const Path & path)
 {
@@ -105,12 +108,12 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, InodeHa
         return;
     }
 
-    /* We can hard link regular files and maybe symlinks. */
-    if (!S_ISREG(st.st_mode)
-#if CAN_LINK_SYMLINK
-        && !S_ISLNK(st.st_mode)
-#endif
-        ) return;
+    /* We can hard link regular files (and maybe symlinks), but do that only
+       for files larger than some threshold.  This avoids adding too many
+       entries to '.links', which would slow down 'removeUnusedLinks' while
+       saving little space.  */
+    if (!S_ISREG(st.st_mode) || ((size_t) st.st_size) < deduplicationMinSize)
+	return;
 
     /* Sometimes SNAFUs can cause files in the store to be
        modified, in particular when running programs as root under
diff --git a/tests/derivations.scm b/tests/derivations.scm
index cd165d1be6..0775719ea3 100644
--- a/tests/derivations.scm
+++ b/tests/derivations.scm
@@ -1,5 +1,5 @@
 ;;; GNU Guix --- Functional package management for GNU
-;;; Copyright © 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2012-2021 Ludovic Courtès <ludo@gnu.org>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -170,11 +170,15 @@
         #f))))
 
 (test-assert "identical files are deduplicated"
-  (let* ((build1  (add-text-to-store %store "one.sh"
-                                     "echo hello, world > \"$out\"\n"
+  ;; Note: DATA must be longer than %DEDUPLICATION-MINIMUM-SIZE.
+  (let* ((data    (make-string 9000 #\a))
+         (build1  (add-text-to-store %store "one.sh"
+                                     (string-append "echo -n " data
+                                                    " > \"$out\"\n")
                                      '()))
          (build2  (add-text-to-store %store "two.sh"
-                                     "# Hey!\necho hello, world > \"$out\"\n"
+                                     (string-append "# Hey!\necho -n "
+                                                    data " > \"$out\"\n")
                                      '()))
          (drv1    (derivation %store "foo"
                               %bash `(,build1)
@@ -187,7 +191,7 @@
                (file2 (derivation->output-path drv2)))
            (and (valid-path? %store file1) (valid-path? %store file2)
                 (string=? (call-with-input-file file1 get-string-all)
-                          "hello, world\n")
+                          data)
                 (= (stat:ino (lstat file1))
                    (stat:ino (lstat file2))))))))
 
diff --git a/tests/nar.scm b/tests/nar.scm
index ba4881caaa..98752f2088 100644
--- a/tests/nar.scm
+++ b/tests/nar.scm
@@ -1,5 +1,5 @@
 ;;; GNU Guix --- Functional package management for GNU
-;;; Copyright © 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2012-2021 Ludovic Courtès <ludo@gnu.org>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -486,8 +486,9 @@
   ;; their mtime and permissions were not reset.  Ensure that this bug is
   ;; gone.
   (with-store store
-    (let* ((text1 (random-text))
-           (text2 (random-text))
+    ;; Note: TEXT1 and TEXT2 must be longer than %DEDUPLICATION-MINIMUM-SIZE.
+    (let* ((text1 (string-concatenate (make-list 200 (random-text))))
+           (text2 (string-concatenate (make-list 200 (random-text))))
            (tree  `("tree" directory
                     ("a" regular (data ,text1))
                     ("b" directory
diff --git a/tests/store-deduplication.scm b/tests/store-deduplication.scm
index b1c2d93bbd..2950fbc1a3 100644
--- a/tests/store-deduplication.scm
+++ b/tests/store-deduplication.scm
@@ -1,5 +1,5 @@
 ;;; GNU Guix --- Functional package management for GNU
-;;; Copyright © 2018, 2020 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2018, 2020-2021 Ludovic Courtès <ludo@gnu.org>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -30,13 +30,40 @@
 
 (test-begin "store-deduplication")
 
+(test-equal "deduplicate, below %deduplication-minimum-size"
+  (list #t (make-list 5 1))
+
+  (call-with-temporary-directory
+   (lambda (store)
+     ;; Note: DATA must be longer than %DEDUPLICATION-MINIMUM-SIZE.
+     (let ((data      "Hello, world!")
+           (identical (map (lambda (n)
+                             (string-append store "/" (number->string n)
+                                            "/a/b/c"))
+                           (iota 5))))
+       (for-each (lambda (file)
+                   (mkdir-p (dirname file))
+                   (call-with-output-file file
+                     (lambda (port)
+                       (put-bytevector port (string->utf8 data)))))
+                 identical)
+
+       (deduplicate store (nar-sha256 store) #:store store)
+
+       ;; (system (string-append "ls -lRia " store))
+       (list (= (length (delete-duplicates
+                         (map (compose stat:ino stat) identical)))
+                (length identical))
+             (map (compose stat:nlink stat) identical))))))
+
 (test-equal "deduplicate"
   (cons* #t #f                                    ;inode comparisons
          2 (make-list 5 6))                       ;'nlink' values
 
   (call-with-temporary-directory
    (lambda (store)
-     (let ((data      (string->utf8 "Hello, world!"))
+     ;; Note: DATA must be longer than %DEDUPLICATION-MINIMUM-SIZE.
+     (let ((data      (string-concatenate (make-list 1000 "Hello, world!")))
            (identical (map (lambda (n)
                              (string-append store "/" (number->string n)
                                             "/a/b/c"))
@@ -46,7 +73,7 @@
                    (mkdir-p (dirname file))
                    (call-with-output-file file
                      (lambda (port)
-                       (put-bytevector port data))))
+                       (put-bytevector port (string->utf8 data)))))
                  identical)
        ;; Make the parent of IDENTICAL read-only.  This should not prevent
        ;; deduplication from inserting its hard link.
@@ -54,7 +81,7 @@
 
        (call-with-output-file unique
          (lambda (port)
-           (put-bytevector port (string->utf8 "This is unique."))))
+           (put-bytevector port (string->utf8 (string-reverse data)))))
 
        (deduplicate store (nar-sha256 store) #:store store)
 
@@ -77,8 +104,10 @@
    (lambda (store)
      (let ((true-link link)
            (links     0)
-           (data1     (string->utf8 "Hello, world!"))
-           (data2     (string->utf8 "Hi, world!"))
+           (data1     (string->utf8
+                       (string-concatenate (make-list 1000 "Hello, world!"))))
+           (data2     (string->utf8
+                       (string-concatenate (make-list 1000 "Hi, world!"))))
            (identical (map (lambda (n)
                              (string-append store "/" (number->string n)
                                             "/a/b/c"))
diff --git a/tests/store.scm b/tests/store.scm
index 2150a0048c..5c9f651d6c 100644
--- a/tests/store.scm
+++ b/tests/store.scm
@@ -759,7 +759,9 @@
 
 (test-assert "substitute, deduplication"
   (with-store s
-    (let* ((c   (random-text))                     ; contents of the output
+    ;; Note: C must be longer than %DEDUPLICATION-MINIMUM-SIZE.
+    (let* ((c   (string-concatenate
+                 (make-list 200 (random-text))))  ; contents of the output
            (g   (package-derivation s %bootstrap-guile))
            (d1  (build-expression->derivation s "substitute-me"
                                               `(begin ,c (exit 1))