From 5a1add373ab427a3b336981d857252e703a9f8d1 Mon Sep 17 00:00:00 2001 From: Mark H Weaver Date: Wed, 9 Mar 2016 01:23:53 -0500 Subject: grafts: Make grafting faster. * guix/build/graft.scm (replace-store-references): Reimplement for faster grafting. Use binary I/O instead of textual I/O. Replace 'mapping' argument (an alist) with 'replacement-table' (a vhash). (rewrite-directory): Adapt to mapping argument change in 'replace-store-references'. Remove 'with-fluids' that previously set '%default-port-encoding' to #f, since we now use binary I/O. (define-inline, hash-length): New macros. (nix-base32-char?): New variable. --- guix/build/graft.scm | 221 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 169 insertions(+), 52 deletions(-) diff --git a/guix/build/graft.scm b/guix/build/graft.scm index fb21fc3af3..f85d485554 100644 --- a/guix/build/graft.scm +++ b/guix/build/graft.scm @@ -1,5 +1,6 @@ ;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2014, 2015, 2016 Ludovic Courtès +;;; Copyright © 2016 Mark H Weaver ;;; ;;; This file is part of GNU Guix. ;;; @@ -20,8 +21,12 @@ #:use-module (guix build utils) #:use-module (rnrs bytevectors) #:use-module (rnrs io ports) + #:use-module (ice-9 vlist) #:use-module (ice-9 match) #:use-module (ice-9 threads) + #:use-module (ice-9 binary-ports) + #:use-module (srfi srfi-1) ; list library + #:use-module (srfi srfi-26) ; cut and cute #:export (replace-store-references rewrite-directory)) @@ -38,50 +43,134 @@ ;;; ;;; Code: -(define* (replace-store-references input output mapping +(define-syntax-rule (define-inline name val) + (define-syntax name (identifier-syntax val))) + +(define-inline hash-length 32) + +(define nix-base32-char? + (cute char-set-contains? + ;; ASCII digits and lower case letters except e o t u + (string->char-set "0123456789abcdfghijklmnpqrsvwxyz") + <>)) + +(define* (replace-store-references input output replacement-table #:optional (store (%store-directory))) - "Read data from INPUT, replacing store references according to MAPPING, and -writing the result to OUTPUT." - (define pattern - (let ((nix-base32-chars - '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 - #\a #\b #\c #\d #\f #\g #\h #\i #\j #\k #\l #\m #\n - #\p #\q #\r #\s #\v #\w #\x #\y #\z))) - `(,@(map char-set (string->list store)) - ,(char-set #\/) - ,@(make-list 32 (list->char-set nix-base32-chars)) - ,(char-set #\-)))) - - ;; We cannot use `regexp-exec' here because it cannot deal with strings - ;; containing NUL characters, hence 'fold-port-matches'. - (with-fluids ((%default-port-encoding #f)) - (when (file-port? input) - (setvbuf input _IOFBF 65536)) - (when (file-port? output) - (setvbuf output _IOFBF 65536)) - - (let* ((len (+ 34 (string-length store))) - (mapping (map (match-lambda - ((origin . replacement) - (unless (string=? (string-drop origin len) - (string-drop replacement len)) - (error "invalid replacement" origin replacement)) - (cons (string-take origin len) - (string-take replacement len)))) - mapping))) - (fold-port-matches (lambda (string result) - (match (assoc-ref mapping string) - (#f - (put-bytevector output (string->utf8 string))) - ((= string->utf8 replacement) - (put-bytevector output replacement))) - #t) - #f - pattern - input - (lambda (char result) ;unmatched - (put-u8 output (char->integer char)) - result))))) + "Read data from INPUT, replacing store references according to +REPLACEMENT-TABLE, and writing the result to OUTPUT. REPLACEMENT-TABLE is a +vhash that maps strings (original hashes) to bytevectors (replacement hashes). +Note: We use string keys to work around the fact that guile-2.0 hashes all +bytevectors to the same value." + + (define (lookup-replacement s) + (match (vhash-assoc s replacement-table) + ((origin . replacement) + replacement) + (#f #f))) + + (define (optimize-u8-predicate pred) + (cute vector-ref + (list->vector (map pred (iota 256))) + <>)) + + (define nix-base32-byte? + (optimize-u8-predicate + (compose nix-base32-char? + integer->char))) + + (define (dash? byte) (= byte 45)) + + (define request-size (expt 2 20)) ; 1 MiB + + ;; We scan the file for the following 33-byte pattern: 32 bytes of + ;; nix-base32 characters followed by a dash. To accommodate large files, + ;; we do not read the entire file, but instead work on buffers of up to + ;; 'request-size' bytes. To ensure that every 33-byte sequence appears + ;; entirely within exactly one buffer, adjacent buffers must overlap, + ;; i.e. they must share 32 byte positions. We accomplish this by + ;; "ungetting" the last 32 bytes of each buffer before reading the next + ;; buffer, unless we know that we've reached the end-of-file. + (let ((buffer (make-bytevector request-size))) + (let loop () + ;; Note: We avoid 'get-bytevector-n' to work around + ;; . + (match (get-bytevector-n! input buffer 0 request-size) + ((? eof-object?) 'done) + (end + ;; We scan the buffer for dashes that might be preceded by a + ;; nix-base32 hash. The key optimization here is that whenever we + ;; find a NON-nix-base32 character at position 'i', we know that it + ;; cannot be part of a hash, so the earliest position where the next + ;; hash could start is i+1 with the following dash at position i+33. + ;; + ;; Since nix-base32 characters comprise only 1/8 of the 256 possible + ;; byte values, and exclude some of the most common letters in + ;; English text (e t o u), in practice we can advance by 33 positions + ;; most of the time. + (let scan-from ((i hash-length) (written 0)) + ;; 'i' is the first position where we look for a dash. 'written' + ;; is the number of bytes in the buffer that have already been + ;; written. + (if (< i end) + (let ((byte (bytevector-u8-ref buffer i))) + (cond ((and (dash? byte) + ;; We've found a dash. Note that we do not know + ;; whether the preceeding 32 bytes are nix-base32 + ;; characters, but we do not need to know. If + ;; they are not, the following lookup will fail. + (lookup-replacement + (string-tabulate (lambda (j) + (integer->char + (bytevector-u8-ref buffer + (+ j (- i hash-length))))) + hash-length))) + => (lambda (replacement) + ;; We've found a hash that needs to be replaced. + ;; First, write out all bytes preceding the hash + ;; that have not yet been written. + (put-bytevector output buffer written + (- i hash-length written)) + ;; Now write the replacement hash. + (put-bytevector output replacement) + ;; Since the byte at position 'i' is a dash, + ;; which is not a nix-base32 char, the earliest + ;; position where the next hash might start is + ;; i+1, and the earliest position where the + ;; following dash might start is (+ i 1 + ;; hash-length). Also, we have now written up to + ;; position 'i' in the buffer. + (scan-from (+ i 1 hash-length) i))) + ;; If the byte at position 'i' is a nix-base32 char, + ;; then the dash we're looking for might be as early as + ;; the following byte, so we can only advance by 1. + ((nix-base32-byte? byte) + (scan-from (+ i 1) written)) + ;; If the byte at position 'i' is NOT a nix-base32 + ;; char, then the earliest position where the next hash + ;; might start is i+1, with the following dash at + ;; position (+ i 1 hash-length). + (else + (scan-from (+ i 1 hash-length) written)))) + + ;; We have finished scanning the buffer. Now we determine how + ;; many bytes have not yet been written, and how many bytes to + ;; "unget". If 'end' is less than 'request-size' then we read + ;; less than we asked for, which indicates that we are at EOF, + ;; so we needn't unget anything. Otherwise, we unget up to + ;; 'hash-length' bytes (32 bytes). However, we must be careful + ;; not to unget bytes that have already been written, because + ;; that would cause them to be written again from the next + ;; buffer. In practice, this case occurs when a replacement is + ;; made near the end of the buffer. + (let* ((unwritten (- end written)) + (unget-size (if (= end request-size) + (min hash-length unwritten) + 0)) + (write-size (- unwritten unget-size))) + (put-bytevector output buffer written write-size) + (unget-bytevector input buffer (+ written write-size) + unget-size) + (loop))))))))) (define (rename-matching-files directory mapping) "Apply MAPPING to the names of all the files in DIRECTORY, where MAPPING is @@ -122,6 +211,35 @@ an exception is caught." #:optional (store (%store-directory))) "Copy DIRECTORY to OUTPUT, replacing strings according to MAPPING, a list of file name pairs." + + (define hash-mapping + (let* ((prefix (string-append store "/")) + (start (string-length prefix)) + (end (+ start hash-length))) + (define (valid-hash? h) + (every nix-base32-char? (string->list h))) + (define (valid-suffix? s) + (string-prefix? "-" s)) + (define (hash+suffix s) + (and (< end (string-length s)) + (let ((hash (substring s start end)) + (suffix (substring s end))) + (and (string-prefix? prefix s) + (valid-hash? hash) + (valid-suffix? suffix) + (list hash suffix))))) + (map (match-lambda + (((= hash+suffix (origin-hash suffix)) + . + (= hash+suffix (replacement-hash suffix))) + (cons origin-hash (string->utf8 replacement-hash))) + ((origin . replacement) + (error "invalid replacement" origin replacement))) + mapping))) + + (define replacement-table + (alist->vhash hash-mapping)) + (define prefix-len (string-length directory)) @@ -138,18 +256,17 @@ file name pairs." (symlink (call-with-output-string (lambda (output) (replace-store-references (open-input-string target) - output mapping + output replacement-table store))) dest))) ((regular) - (with-fluids ((%default-port-encoding #f)) - (call-with-input-file file - (lambda (input) - (call-with-output-file dest - (lambda (output) - (replace-store-references input output mapping - store) - (chmod output (stat:perms stat)))))))) + (call-with-input-file file + (lambda (input) + (call-with-output-file dest + (lambda (output) + (replace-store-references input output replacement-table + store) + (chmod output (stat:perms stat))))))) ((directory) (mkdir-p dest)) (else -- cgit 1.4.1