summary refs log tree commit diff
diff options
context:
space:
mode:
authorLudovic Courtès <ludo@gnu.org>2022-11-22 07:17:17 +0100
committerLudovic Courtès <ludo@gnu.org>2022-11-22 09:43:54 +0100
commitb2b9571935f9188086b2e7b434840eeda6c42805 (patch)
tree96ad67225c65cfbd5af5f145c4cb18b3d9713bb4
parent60bea075938cae10147d1d8fd414dc0140f4118f (diff)
downloadguix-b2b9571935f9188086b2e7b434840eeda6c42805.tar.gz
offload: Gracefully handle 'guix repl' protocol errors.
Fixes <https://issues.guix.gnu.org/59447>.
Reported by Mathieu Othacehe <othacehe@gnu.org>.

Previously, if a machine had a buggy 'guix repl', 'guix offload' would
crash with a backtrace instead of just ignoring the machine.

* guix/scripts/offload.scm (remote-inferior*): New procedure.
(check-machine-availability)[if-true]: New procedure.
Use 'remote-inferior*' and 'if-true'.
(check-machine-status): Use 'remote-inferior*'.
-rw-r--r--guix/scripts/offload.scm38
1 files changed, 26 insertions, 12 deletions
diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm
index 925325ef5f..8ab393c0ac 100644
--- a/guix/scripts/offload.scm
+++ b/guix/scripts/offload.scm
@@ -25,7 +25,7 @@
   #:autoload   (ssh auth) (userauth-public-key!)
   #:autoload   (ssh session) (make-session
                               connect! get-error
-                              disconnect! session-set!)
+                              disconnect! session-set! session-get)
   #:autoload   (ssh version) (zlib-support?)
   #:use-module (guix config)
   #:use-module (guix records)
@@ -34,7 +34,8 @@
                            send-files retrieve-files retrieve-files*
                            remote-inferior report-guile-error)
   #:use-module (guix store)
-  #:autoload   (guix inferior) (inferior-eval close-inferior inferior?)
+  #:autoload   (guix inferior) (inferior-eval close-inferior
+                                inferior? inferior-protocol-error?)
   #:autoload   (guix derivations) (read-derivation-from-file
                                    derivation-file-name
                                    build-derivations)
@@ -473,6 +474,15 @@ logical cores available, to give a rough estimation of CPU usage.  Return
               (vector-set! vec j (vector-ref vec (- i 1)))
               (loop (cons val result) (- i 1))))))))
 
+(define (remote-inferior* session)
+  "Like 'remote-inferior', but upon error return #f."
+  (or (guard (c ((inferior-protocol-error? c) #f))
+        (remote-inferior session))
+      (begin
+        (warning (G_ "failed to run 'guix repl' on machine '~a'~%")
+                 (session-get session 'host))
+        #f)))
+
 (define (choose-build-machine machines)
   "Return two values: the best machine among MACHINES and its build
 slot (which must later be released with 'release-build-slot'), or #f and #f."
@@ -511,7 +521,7 @@ slot (which must later be released with 'release-build-slot'), or #f and #f."
        ;; too costly to call it once for every machine.
        (let* ((session (false-if-exception (open-ssh-session best
                                                              %short-timeout)))
-              (node    (and session (remote-inferior session)))
+              (node    (and session (remote-inferior* session)))
               (load    (and node (node-load node)))
               (threshold (build-machine-overload-threshold best))
               (space   (and node (node-free-disk-space node))))
@@ -708,6 +718,11 @@ machine."
     (and (string=? (build-machine-name m1) (build-machine-name m2))
          (= (build-machine-port m1) (build-machine-port m2))))
 
+  (define (if-true proc)
+    (lambda args
+      (when (every ->bool args)
+        (apply proc args))))
+
   ;; A given build machine may appear several times (e.g., once for
   ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
   (let ((machines (filter pred
@@ -718,12 +733,12 @@ machine."
     (let* ((names    (map build-machine-name machines))
            (sockets  (map build-machine-daemon-socket machines))
            (sessions (map (cut open-ssh-session <> %short-timeout) machines))
-           (nodes    (map remote-inferior sessions)))
-      (for-each assert-node-has-guix nodes names)
-      (for-each assert-node-repl nodes names)
-      (for-each assert-node-can-import sessions nodes names sockets)
-      (for-each assert-node-can-export sessions nodes names sockets)
-      (for-each close-inferior nodes)
+           (nodes    (map remote-inferior* sessions)))
+      (for-each (if-true assert-node-has-guix) nodes names)
+      (for-each (if-true assert-node-repl) nodes names)
+      (for-each (if-true assert-node-can-import) sessions nodes names sockets)
+      (for-each (if-true assert-node-can-export) sessions nodes names sockets)
+      (for-each (if-true close-inferior) nodes)
       (for-each disconnect! sessions))))
 
 (define (check-machine-status machine-file pred)
@@ -743,10 +758,9 @@ machine."
                 (define session
                   (open-ssh-session machine %short-timeout))
 
-                (match (remote-inferior session)
+                (match (remote-inferior* session)
                   (#f
-                   (warning (G_ "failed to run 'guix repl' on machine '~a'~%")
-                            (build-machine-name machine)))
+                   #f)
                   ((? inferior? inferior)
                    (let ((now (car (gettimeofday))))
                      (match (inferior-eval '(list (uname)