summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMathieu Othacehe <othacehe@gnu.org>2020-09-21 18:41:44 +0200
committerMathieu Othacehe <othacehe@gnu.org>2020-09-29 17:02:45 +0200
commiteb8d1b88adcb63b12986855f40a454e2c59d0b9c (patch)
tree0bc7ff3f3b9a41620fe355a0fe762366037a5093
parent0ffcb80ebbaa2b177f03548035a2ef21ae7ac41d (diff)
downloadcuirass-eb8d1b88adcb63b12986855f40a454e2c59d0b9c.tar
cuirass-eb8d1b88adcb63b12986855f40a454e2c59d0b9c.tar.gz
Add watchdog support.
* src/cuirass/watchdog.scm: New file. * Makefile.am (dist_pkgmodule_DATA): Add it. * src/cuirass/utils.scm (with-timeout, get-message-with-timeout): Export them. * bin/cuirass.in (main): Start the watchdog.
-rw-r--r--Makefile.am5
-rw-r--r--bin/cuirass.in3
-rw-r--r--src/cuirass/utils.scm3
-rw-r--r--src/cuirass/watchdog.scm88
4 files changed, 96 insertions, 3 deletions
diff --git a/Makefile.am b/Makefile.am
index 60b1e24..a575755 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -50,8 +50,9 @@ dist_pkgmodule_DATA = \
src/cuirass/metrics.scm \
src/cuirass/send-events.scm \
src/cuirass/ui.scm \
- src/cuirass/utils.scm \
- src/cuirass/templates.scm
+ src/cuirass/utils.scm \
+ src/cuirass/templates.scm \
+ src/cuirass/watchdog.scm
nodist_pkgmodule_DATA = \
src/cuirass/config.scm
diff --git a/bin/cuirass.in b/bin/cuirass.in
index d6c2695..55e92b6 100644
--- a/bin/cuirass.in
+++ b/bin/cuirass.in
@@ -32,6 +32,7 @@ exec ${GUILE:-@GUILE@} --no-auto-compile -e main -s "$0" "$@"
(cuirass logging)
(cuirass metrics)
(cuirass utils)
+ (cuirass watchdog)
(guix ui)
((guix build utils) #:select (mkdir-p))
(fibers)
@@ -153,7 +154,7 @@ exec ${GUILE:-@GUILE@} --no-auto-compile -e main -s "$0" "$@"
(if one-shot?
(process-specs (db-get-specifications))
(let ((exit-channel (make-channel)))
-
+ (start-watchdog)
(if (option-ref opts 'web #f)
(begin
(spawn-fiber
diff --git a/src/cuirass/utils.scm b/src/cuirass/utils.scm
index 00cfef6..7ce4b83 100644
--- a/src/cuirass/utils.scm
+++ b/src/cuirass/utils.scm
@@ -37,6 +37,9 @@
define-enumeration
unwind-protect
+ with-timeout
+ get-message-with-timeout
+
make-worker-thread-channel
call-with-worker-thread
with-worker-thread
diff --git a/src/cuirass/watchdog.scm b/src/cuirass/watchdog.scm
new file mode 100644
index 0000000..5c5c1df
--- /dev/null
+++ b/src/cuirass/watchdog.scm
@@ -0,0 +1,88 @@
+;;; watchdog.scm -- Monitor fibers scheduling.
+;;; This file is part of Cuirass.
+;;;
+;;; Cuirass is free software: you can redistribute it and/or modify
+;;; it under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation, either version 3 of the License, or
+;;; (at your option) any later version.
+;;;
+;;; Cuirass is distributed in the hope that it will be useful,
+;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with Cuirass. If not, see <http://www.gnu.org/licenses/>.
+
+(define-module (cuirass watchdog)
+ #:use-module (cuirass logging)
+ #:use-module (cuirass utils)
+ #:use-module (fibers)
+ #:use-module (fibers channels)
+ #:use-module (fibers internal)
+ #:use-module (fibers operations)
+ #:use-module (ice-9 match)
+ #:use-module (ice-9 threads)
+ #:export (start-watchdog))
+
+(define* (watchdog-fiber scheduler channel
+ #:key
+ (period 1))
+ "Spawn a fiber running on SCHEDULER that sends over CHANNEL, every PERIOD
+seconds, the scheduler name and the current time."
+ (spawn-fiber
+ (lambda ()
+ (while #t
+ (put-message channel (list (scheduler-name scheduler)
+ (current-time)))
+ (sleep period)))
+ scheduler))
+
+(define* (start-watchdog #:key (timeout 5))
+ "Start a watchdog checking that each Fibers scheduler is not blocked for
+more than TIMEOUT seconds.
+
+The watchdog mechanism consists in spawning a dedicated fiber per running
+Fiber scheduler, using the above watchdog-fiber method. Those fibers send a
+ping signal periodically to a separate thread. If no signal is received from
+one of the schedulers for more than TIMEOUT seconds, a warning message is
+printed."
+ (define (check-timeouts pings last-check)
+ (let* ((check-period timeout)
+ (cur-time (current-time))
+ (diff-check (- cur-time last-check)))
+ (if (> diff-check check-period)
+ (begin
+ (for-each
+ (match-lambda
+ ((scheduler . time)
+ (let ((diff-ping (- cur-time time)))
+ (when (> diff-ping timeout)
+ (log-message "Scheduler ~a blocked since ~a seconds."
+ scheduler diff-ping)))))
+ pings)
+ cur-time)
+ last-check)))
+
+ (let ((watchdog-channel (make-channel)))
+ (parameterize (((@@ (fibers internal) current-fiber) #f))
+ (call-with-new-thread
+ (lambda ()
+ (let loop ((pings '())
+ (last-check 0))
+ (let ((operation-timeout 10))
+ (match (perform-operation
+ (with-timeout
+ (get-operation watchdog-channel)
+ #:seconds operation-timeout
+ #:wrap (const 'timeout)))
+ ((scheduler ping)
+ (loop (assq-set! pings scheduler ping)
+ (check-timeouts pings last-check)))
+ ('timeout
+ (loop pings
+ (check-timeouts pings last-check)))))))))
+ (fold-all-schedulers
+ (lambda (name scheduler seed)
+ (watchdog-fiber scheduler watchdog-channel))
+ '())))