diff options
author | Mark H Weaver <mhw@netris.org> | 2021-01-07 15:15:24 -0500 |
---|---|---|
committer | Maxim Cournoyer <maxim.cournoyer@gmail.com> | 2021-01-08 14:10:33 -0500 |
commit | 485ac28235bf8775914b905060266b09629c1bac (patch) | |
tree | 439f53946741042630e8f24ec12580837b6a96c9 | |
parent | 8027711ee8708c130b30be43da7908e0e04d6972 (diff) | |
download | guix-485ac28235bf8775914b905060266b09629c1bac.tar.gz |
utils: Allow text substitution even in the presence of NUL characters.
Fixes <https://issues.guix.gnu.org/30116>. Before this change, the presence of a NUL character on a line meant that the (glibc) regexp engine used by Guile would either 1. stop scanning the string or 2. crash with the error "string contains #\\nul character", depending on the locale used. This change works around this limitation by first replacing the NUL character by an unused Unicode code point, doing the substitution, then reverting the replacement. * guix/build/utils.scm (unused-private-use-code-point) (replace-char): New procedures. (substitute): Make use of the above procedures to work around the NUL character regexp engine limitation. * tests/build-utils.scm: Add tests. Co-authored-by: Maxim Cournoyer <maxim.cournoyer@gmail.com> Signed-off-by: Maxim Cournoyer <maxim.cournoyer@gmail.com>
-rw-r--r-- | guix/build/utils.scm | 57 | ||||
-rw-r--r-- | tests/build-utils.scm | 24 |
2 files changed, 69 insertions, 12 deletions
diff --git a/guix/build/utils.scm b/guix/build/utils.scm index f0ea83085e..76180e67e0 100644 --- a/guix/build/utils.scm +++ b/guix/build/utils.scm @@ -2,7 +2,7 @@ ;;; Copyright © 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org> ;;; Copyright © 2013 Andreas Enge <andreas@enge.fr> ;;; Copyright © 2013 Nikita Karetnikov <nikita@karetnikov.org> -;;; Copyright © 2015, 2018 Mark H Weaver <mhw@netris.org> +;;; Copyright © 2015, 2018, 2021 Mark H Weaver <mhw@netris.org> ;;; Copyright © 2018 Arun Isaac <arunisaac@systemreboot.net> ;;; Copyright © 2018, 2019 Ricardo Wurmus <rekado@elephly.net> ;;; Copyright © 2020 Efraim Flashner <efraim@flashner.co.il> @@ -784,6 +784,31 @@ PROC's result is returned." (lambda (key . args) (false-if-exception (delete-file template)))))) +(define (unused-private-use-code-point s) + "Find a code point within a Unicode Private Use Area that is not +present in S, and return the corresponding character object. If one +cannot be found, return false." + (define (scan lo hi) + (and (<= lo hi) + (let ((c (integer->char lo))) + (if (string-index s c) + (scan (+ lo 1) hi) + c)))) + (or (scan #xE000 #xF8FF) + (scan #xF0000 #xFFFFD) + (scan #x100000 #x10FFFD))) + +(define (replace-char c1 c2 s) + "Return a string which is equal to S except with all instances of C1 +replaced by C2. If C1 and C2 are equal, return S." + (if (char=? c1 c2) + s + (string-map (lambda (c) + (if (char=? c c1) + c2 + c)) + s))) + (define (substitute file pattern+procs) "PATTERN+PROCS is a list of regexp/two-argument-procedure pairs. For each line of FILE, and for each PATTERN that it matches, call the corresponding @@ -802,16 +827,26 @@ end of a line; by itself it won't match the terminating newline of a line." (let loop ((line (read-line in 'concat))) (if (eof-object? line) #t - (let ((line (fold (lambda (r+p line) - (match r+p - ((regexp . proc) - (match (list-matches regexp line) - ((and m+ (_ _ ...)) - (proc line m+)) - (_ line))))) - line - rx+proc))) - (display line out) + ;; Work around the fact that Guile's regexp-exec does not handle + ;; NUL characters (a limitation of the underlying GNU libc's + ;; regexec) by temporarily replacing them by an unused private + ;; Unicode code point. + ;; TODO: Use SRFI-115 instead, once available in Guile. + (let* ((nul* (or (and (string-index line #\nul) + (unused-private-use-code-point line)) + #\nul)) + (line* (replace-char #\nul nul* line)) + (line1* (fold (lambda (r+p line) + (match r+p + ((regexp . proc) + (match (list-matches regexp line) + ((and m+ (_ _ ...)) + (proc line m+)) + (_ line))))) + line* + rx+proc)) + (line1 (replace-char nul* #\nul line1*))) + (display line1 out) (loop (read-line in 'concat))))))))) diff --git a/tests/build-utils.scm b/tests/build-utils.scm index 654b480ed9..31be7ff80f 100644 --- a/tests/build-utils.scm +++ b/tests/build-utils.scm @@ -1,6 +1,7 @@ ;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2012, 2015, 2016, 2019, 2020 Ludovic Courtès <ludo@gnu.org> ;;; Copyright © 2019 Ricardo Wurmus <rekado@elephly.net> +;;; Copyright © 2021 Maxim Cournoyer <maxim.cournoyer@gmail.com> ;;; ;;; This file is part of GNU Guix. ;;; @@ -18,7 +19,7 @@ ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. -(define-module (test-build-utils) +(define-module (test build-utils) #:use-module (guix tests) #:use-module (guix build utils) #:use-module ((guix utils) @@ -241,4 +242,25 @@ print('hello world')")) "/some/other/path"))) #f))))) +(test-equal "substitute*, text contains a NUL byte, UTF-8" + "c\0d" + (with-fluids ((%default-port-encoding "UTF-8") + (%default-port-conversion-strategy 'error)) + ;; The GNU libc is locale sensitive. Depending on the value of LANG, the + ;; test could fail with "string contains #\\nul character: ~S" or "cannot + ;; convert wide string to output locale". + (setlocale LC_ALL "en_US.UTF-8") + (call-with-temporary-output-file + (lambda (file port) + (format port "a\0b") + (flush-output-port port) + + (substitute* file + (("a") "c") + (("b") "d")) + + (with-input-from-file file + (lambda _ + (get-string-all (current-input-port)))))))) + (test-end) |