summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--doc/guix.texi30
-rw-r--r--guix/scripts/offload.scm54
2 files changed, 62 insertions, 22 deletions
diff --git a/doc/guix.texi b/doc/guix.texi
index 3fc76c8670..553a3b8ae9 100644
--- a/doc/guix.texi
+++ b/doc/guix.texi
@@ -1081,7 +1081,28 @@ architecture natively supports it, via emulation (@pxref{Transparent
 Emulation with QEMU}), or both.  Missing prerequisites for the build are
 copied over SSH to the target machine, which then proceeds with the
 build; upon success the output(s) of the build are copied back to the
-initial machine.
+initial machine.  The offload facility comes with a basic scheduler that
+attempts to select the best machine.  The best machine is chosen among
+the available machines based on criteria such as:
+
+@enumerate
+@item
+The availability of a build slot.  A build machine can have as many
+build slots (connections) as the value of the @code{parallel-builds}
+field of its @code{build-machine} object.
+
+@item
+Its relative speed, as defined via the @code{speed} field of its
+@code{build-machine} object.
+
+@item
+Its load.  The normalized machine load must be lower than a threshold
+value, configurable via the @code{overload-threshold} field of its
+@code{build-machine} object.
+
+@item
+Disk space availability.  More than a 100 MiB must be available.
+@end enumerate
 
 The @file{/etc/guix/machines.scm} file typically looks like this:
 
@@ -1185,6 +1206,13 @@ when transferring files to and from build machines.
 File name of the Unix-domain socket @command{guix-daemon} is listening
 to on that machine.
 
+@item @code{overload-threshold} (default: @code{0.6})
+The load threshold above which a potential offload machine is
+disregarded by the offload scheduler.  The value roughly translates to
+the total processor usage of the build machine, ranging from 0.0 (0%) to
+1.0 (100%).  It can also be disabled by setting
+@code{overload-threshold} to @code{#f}.
+
 @item @code{parallel-builds} (default: @code{1})
 The number of builds that may run in parallel on the machine.
 
diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm
index 3dc8ccefcb..a5fe98b675 100644
--- a/guix/scripts/offload.scm
+++ b/guix/scripts/offload.scm
@@ -88,6 +88,10 @@
                      (default 3))
   (daemon-socket   build-machine-daemon-socket    ; string
                    (default "/var/guix/daemon-socket/socket"))
+  ;; A #f value tells the offload scheduler to disregard the load of the build
+  ;; machine when selecting the best offload machine.
+  (overload-threshold build-machine-overload-threshold ; inexact real between
+                      (default 0.6))                   ; 0.0 and 1.0 | #f
   (parallel-builds build-machine-parallel-builds  ; number
                    (default 1))
   (speed           build-machine-speed            ; inexact real
@@ -391,30 +395,34 @@ of free disk space on '~a'~%")
   (* 100 (expt 2 20)))                            ;100 MiB
 
 (define (node-load node)
-  "Return the load on NODE.  Return +∞ if NODE is misbehaving."
+  "Return the load on NODE, a normalized value between 0.0 and 1.0.  The value
+is derived from /proc/loadavg and normalized according to the number of
+logical cores available, to give a rough estimation of CPU usage.  Return
+1.0 (fully loaded) if NODE is misbehaving."
   (let ((line (inferior-eval '(begin
                                 (use-modules (ice-9 rdelim))
                                 (call-with-input-file "/proc/loadavg"
                                   read-string))
-                             node)))
-    (if (eof-object? line)
-        +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
+                             node))
+        (ncores (inferior-eval '(begin
+                                  (use-modules (ice-9 threads))
+                                  (current-processor-count))
+                               node)))
+    (if (or (eof-object? line) (eof-object? ncores))
+        1.0    ;MACHINE does not respond, so assume it is fully loaded
         (match (string-tokenize line)
           ((one five fifteen . x)
-           (string->number one))
+           (let ((load (/ (string->number one) ncores)))
+             (if (> load 1.0)
+                 1.0
+                 load)))
           (x
-           +inf.0)))))
-
-(define (normalized-load machine load)
-  "Divide LOAD by the number of parallel builds of MACHINE."
-  (if (rational? load)
-      (let* ((jobs       (build-machine-parallel-builds machine))
-             (normalized (/ load jobs)))
-        (format (current-error-port) "load on machine '~a' is ~s\
- (normalized: ~s)~%"
-                (build-machine-name machine) load normalized)
-        normalized)
-      load))
+           1.0)))))
+
+(define (report-load machine load)
+  (format (current-error-port)
+          "normalized load on machine '~a' is ~,2f~%"
+          (build-machine-name machine) load))
 
 (define (random-seed)
   (logxor (getpid) (car (gettimeofday))))
@@ -472,11 +480,15 @@ slot (which must later be released with 'release-build-slot'), or #f and #f."
        (let* ((session (false-if-exception (open-ssh-session best
                                                              %short-timeout)))
               (node    (and session (remote-inferior session)))
-              (load    (and node (normalized-load best (node-load node))))
+              (load    (and node (node-load node)))
+              (threshold (build-machine-overload-threshold best))
               (space   (and node (node-free-disk-space node))))
+         (when load (report-load best load))
          (when node (close-inferior node))
          (when session (disconnect! session))
-         (if (and node (< load 2.) (>= space %minimum-disk-space))
+         (if (and node
+                  (or (not threshold) (< load threshold))
+                  (>= space %minimum-disk-space))
              (match others
                (((machines slots) ...)
                 ;; Release slots from the uninteresting machines.
@@ -708,13 +720,13 @@ machine."
                               (free (node-free-disk-space inferior)))
                           (close-inferior inferior)
                           (format #t "~a~%  kernel: ~a ~a~%  architecture: ~a~%\
-  host name: ~a~%  normalized load: ~a~%  free disk space: ~,2f MiB~%\
+  host name: ~a~%  normalized load: ~,2f~%  free disk space: ~,2f MiB~%\
   time difference: ~a s~%"
                                   (build-machine-name machine)
                                   (utsname:sysname uts) (utsname:release uts)
                                   (utsname:machine uts)
                                   (utsname:nodename uts)
-                                  (normalized-load machine load)
+                                  load
                                   (/ free (expt 2 20) 1.)
                                   (- time now))))))))