aboutsummaryrefslogtreecommitdiff
path: root/guix-build-coordinator/coordinator.scm
diff options
context:
space:
mode:
Diffstat (limited to 'guix-build-coordinator/coordinator.scm')
-rw-r--r--guix-build-coordinator/coordinator.scm372
1 files changed, 291 insertions, 81 deletions
diff --git a/guix-build-coordinator/coordinator.scm b/guix-build-coordinator/coordinator.scm
index b75b40f..dc72ca9 100644
--- a/guix-build-coordinator/coordinator.scm
+++ b/guix-build-coordinator/coordinator.scm
@@ -25,6 +25,7 @@
#:use-module (srfi srfi-11)
#:use-module (srfi srfi-19)
#:use-module (srfi srfi-26)
+ #:use-module (srfi srfi-43)
#:use-module (srfi srfi-71)
#:use-module (ice-9 ftw)
#:use-module (ice-9 vlist)
@@ -50,6 +51,7 @@
#:use-module (prometheus)
#:use-module ((guix build syscalls)
#:select (set-thread-name))
+ #:use-module (guix store)
#:use-module (guix derivations)
#:use-module (guix build utils)
#:use-module (guix-build-coordinator utils)
@@ -97,6 +99,10 @@
build-coordinator-prompt-hook-processing-for-event
start-hook-processing-threads
+ build-coordinator-allocation-plan-stats
+ build-coordinator-trigger-build-allocation
+ build-coordinator-list-allocation-plan-builds
+
build-output-file-location
build-log-file-destination
build-log-file-location
@@ -116,7 +122,8 @@
(define-record-type <build-coordinator>
(make-build-coordinator-record datastore hooks metrics-registry
- allocation-strategy logger)
+ allocation-strategy allocator-channel
+ logger)
build-coordinator?
(datastore build-coordinator-datastore)
(hooks build-coordinator-hooks)
@@ -124,8 +131,10 @@
set-build-coordinator-hook-condvars!)
(metrics-registry build-coordinator-metrics-registry)
(allocation-strategy build-coordinator-allocation-strategy)
- (allocator-thread build-coordinator-allocator-thread
- set-build-coordinator-allocator-thread!)
+ (trigger-build-allocation
+ build-coordinator-trigger-build-allocation
+ set-build-coordinator-trigger-build-allocation!)
+ (allocator-channel build-coordinator-allocator-channel)
(logger build-coordinator-logger)
(events-channel build-coordinator-events-channel
set-build-coordinator-events-channel!)
@@ -363,7 +372,8 @@
(not (client-error? exn))))))
hooks
(allocation-strategy
- basic-build-allocation-strategy))
+ basic-build-allocation-strategy)
+ (timestamp-log-output? #t))
(and (or (list? hooks)
(begin
(simple-format
@@ -397,9 +407,11 @@
;; In guile-lib v0.2.8 onwards, the formatter is
;; called with more arguments
(lambda args ; lvl, time, str
- (format #f "~a (~5a): ~a~%"
- (strftime "%F %H:%M:%S" (localtime
- (second args)))
+ (format #f "~a(~5a): ~a~%"
+ (if timestamp-log-output?
+ (strftime "%F %H:%M:%S " (localtime
+ (second args)))
+ "")
(first args)
(third args)))))
(build-coordinator
@@ -407,6 +419,7 @@
hooks
metrics-registry
allocation-strategy
+ (make-channel)
lgr)))
(add-handler! lgr port-log)
@@ -423,6 +436,11 @@
;; The logger assumes this
(set-port-encoding! (current-output-port) "UTF-8")
+ ;; Work around my broken with-store/non-blocking in Guix
+ (let ((socket-file (%daemon-socket-uri)))
+ (%daemon-socket-uri
+ (string-append "file://" socket-file)))
+
(with-exception-handler
(lambda (exn)
(simple-format #t "failed enabling core dumps: ~A\n" exn))
@@ -452,7 +470,7 @@
(when update-datastore?
(datastore-update (build-coordinator-datastore build-coordinator)))
- (set-build-coordinator-allocator-thread!
+ (set-build-coordinator-trigger-build-allocation!
build-coordinator
(make-build-allocator-thread build-coordinator))
@@ -544,6 +562,8 @@
(spawn-fiber-to-watch-for-deferred-builds build-coordinator)
+ (spawn-build-allocation-plan-management-fiber build-coordinator)
+
(set-build-coordinator-scheduler! build-coordinator
(current-scheduler))
@@ -588,8 +608,8 @@
finished?)
(wait finished?))
- #:hz 10
- #:parallelism 2))
+ #:hz 0
+ #:parallelism 1))
finished?)))))
(define* (submit-build build-coordinator derivation-file
@@ -622,12 +642,19 @@
(derivation
(if derivation-exists-in-database?
#f ; unnecessary to fetch derivation
- (with-fibers-port-timeouts
- (lambda ()
- (call-with-delay-logging read-drv
- #:threshold 10
- #:args (list derivation-file)))
- #:timeout 30)))
+ ;; Bit of a hack, but offload reading the derivation to a
+ ;; thread so that it doesn't block the fibers thread, since
+ ;; local I/O doesn't cooperate with fibers
+ (datastore-call-with-transaction
+ datastore
+ (lambda _
+ (with-fibers-port-timeouts
+ (lambda ()
+ (call-with-delay-logging read-drv
+ #:threshold 10
+ #:args (list derivation-file)))
+ #:timeout 240))
+ #:readonly? #t)))
(system
(or system-from-database
@@ -739,6 +766,19 @@
(build-coordinator-metrics-registry build-coordinator)
"coordinator_submit_build_duration_seconds"
(lambda ()
+ (unless (derivation-path? derivation-file)
+ (raise-exception
+ (make-client-error 'invalid-derivation-file)))
+
+ (string-for-each
+ (lambda (c)
+ (unless (or (char-alphabetic? c)
+ (char-numeric? c)
+ (member c '(#\+ #\- #\. #\_ #\? #\=)))
+ (raise-exception
+ (make-client-error 'invalid-character-in-derivation-file))))
+ (store-path-base derivation-file))
+
(match (check-whether-to-store-build)
('continue
;; Store the derivation first, so that listing related derivations
@@ -746,12 +786,19 @@
(unless (datastore-find-derivation datastore derivation-file)
(datastore-store-derivation
datastore
- (with-fibers-port-timeouts
- (lambda ()
- (call-with-delay-logging read-drv
- #:threshold 10
- #:args (list derivation-file)))
- #:timeout 30)))
+ ;; Bit of a hack, but offload reading the derivation to a thread so
+ ;; that it doesn't block the fibers thread, since local I/O doesn't
+ ;; cooperate with fibers
+ (datastore-call-with-transaction
+ datastore
+ (lambda _
+ (with-fibers-port-timeouts
+ (lambda ()
+ (call-with-delay-logging read-drv
+ #:threshold 10
+ #:args (list derivation-file)))
+ #:timeout 30))
+ #:readonly? #t)))
(let ((related-derivations-lacking-builds
(if ensure-all-related-derivation-outputs-have-builds?
@@ -828,7 +875,9 @@
(make-transaction-rollback-exception
'skipped-as-build-required-by-another)))
- (datastore-remove-build-from-allocation-plan datastore uuid)
+ (build-coordinator-remove-build-from-allocation-plan
+ build-coordinator
+ uuid)
(datastore-cancel-build datastore uuid)
(datastore-insert-unprocessed-hook-event datastore
"build-canceled"
@@ -966,7 +1015,7 @@
(processor_count . ,processor-count))))
(define (trigger-build-allocation build-coordinator)
- ((build-coordinator-allocator-thread build-coordinator)))
+ ((build-coordinator-trigger-build-allocation build-coordinator)))
(define (build-coordinator-prompt-hook-processing-for-event build-coordinator
event-name)
@@ -987,16 +1036,20 @@
(allocator-proc datastore
#:metrics-registry (build-coordinator-metrics-registry
build-coordinator)))))
- (datastore-replace-build-allocation-plan datastore new-plan)
-
- (let ((build-count-per-agent
- (datastore-count-build-allocation-plan-entries
- datastore)))
- (build-coordinator-send-event
- build-coordinator
- "allocation-plan-update"
- `((allocation_plan_counts . ,build-count-per-agent)))))
- #t)
+ (build-coordinator-replace-build-allocation-plan
+ build-coordinator
+ new-plan)
+
+ (build-coordinator-send-event
+ build-coordinator
+ "allocation-plan-update"
+ `((allocation_plan_counts
+ . ,(map
+ (match-lambda
+ ((agent-id . builds)
+ (cons agent-id (length builds))))
+ new-plan))))
+ #t))
(define (make-build-allocator-thread build-coordinator)
(define mtx (make-mutex))
@@ -1068,31 +1121,187 @@
exn)
(exit 1))
(lambda ()
- (let ((build-allocation-plan-total
- (make-gauge-metric
- (build-coordinator-metrics-registry build-coordinator)
- "build_allocation_plan_total"
- #:labels '(agent_id))))
- (with-time-logging
- "allocator initialise metrics"
- (for-each (match-lambda
- ((agent-id . count)
- (metric-set build-allocation-plan-total
- count
- #:label-values
- `((agent_id . ,agent-id)))))
- (datastore-count-build-allocation-plan-entries
- (build-coordinator-datastore build-coordinator)))))
-
(update-build-allocation-plan-loop)))))
trigger-build-allocation)
+(define (spawn-build-allocation-plan-management-fiber coordinator)
+ (define allocation-plan '())
+
+ (define allocation-plan-metric
+ (make-gauge-metric
+ (build-coordinator-metrics-registry coordinator)
+ "build_allocation_plan_total"
+ #:labels '(agent_id)))
+
+ (define (update-build-allocation-plan-metrics!)
+ (for-each
+ (match-lambda
+ ((agent-id . builds)
+ (metric-set allocation-plan-metric
+ (length builds)
+ #:label-values
+ `((agent_id . ,agent-id)))))
+ allocation-plan)
+ #t)
+
+ (spawn-fiber
+ (lambda ()
+ (while #t
+ (with-exception-handler
+ (lambda (exn)
+ (simple-format (current-error-port)
+ "exception in allocation plan fiber: ~A\n"
+ exn))
+ (lambda ()
+ (with-throw-handler #t
+ (lambda ()
+ (match (get-message (build-coordinator-allocator-channel coordinator))
+ (('stats reply)
+ (put-message
+ reply
+ (map
+ (match-lambda
+ ((agent-id . builds)
+ (cons agent-id (length builds))))
+ allocation-plan)))
+ (('fetch-agent-plan agent-id reply)
+ (put-message
+ reply
+ (or (assoc-ref allocation-plan agent-id) '())))
+ (('fetch-plan reply)
+ (put-message reply allocation-plan))
+ (('remove-build uuid reply)
+ (set!
+ allocation-plan
+ (map
+ (match-lambda
+ ((agent-id . builds)
+ (cons agent-id
+ (remove
+ (lambda (build-uuid)
+ (string=? build-uuid uuid))
+ builds))))
+ allocation-plan))
+
+ (put-message reply #t))
+ (('replace new-plan reply)
+
+ (set! allocation-plan new-plan)
+
+ (update-build-allocation-plan-metrics!)
+
+ (put-message reply #t))))
+ (lambda _
+ (backtrace))))
+ #:unwind? #t)))))
+
+(define (build-coordinator-allocation-plan-stats coordinator)
+ (let ((reply (make-channel)))
+ (put-message (build-coordinator-allocator-channel coordinator)
+ (list 'stats reply))
+ (get-message reply)))
+
+(define (build-coordinator-fetch-agent-allocation-plan coordinator agent-id)
+ (let ((reply (make-channel)))
+ (put-message (build-coordinator-allocator-channel coordinator)
+ (list 'fetch-agent-plan agent-id reply))
+ (get-message reply)))
+
+(define (build-coordinator-allocation-plan coordinator)
+ (let ((reply (make-channel)))
+ (put-message (build-coordinator-allocator-channel coordinator)
+ (list 'fetch-plan reply))
+ (get-message reply)))
+
+(define (build-coordinator-build-in-allocation-plan? coordinator uuid)
+ (any
+ (match-lambda
+ ((agent-id . build-uuids)
+ (->bool (member uuid build-uuids string=?))))
+ (build-coordinator-allocation-plan coordinator)))
+
+(define (build-coordinator-remove-build-from-allocation-plan coordinator uuid)
+ (let ((reply (make-channel)))
+ (put-message (build-coordinator-allocator-channel coordinator)
+ (list 'remove-build uuid reply))
+ (get-message reply)))
+
+(define (build-coordinator-replace-build-allocation-plan coordinator plan)
+ (let ((reply (make-channel)))
+ (put-message (build-coordinator-allocator-channel coordinator)
+ (list 'replace plan reply))
+ (get-message reply)))
+
+(define (build-coordinator-fetch-build-to-allocate coordinator
+ agent-id)
+ (define datastore
+ (build-coordinator-datastore coordinator))
+
+ (let loop ((planned-builds
+ (build-coordinator-fetch-agent-allocation-plan coordinator
+ agent-id)))
+
+ (if (null? planned-builds)
+ #f
+ (match (datastore-fetch-build-to-allocate datastore (first planned-builds))
+ (#f #f)
+ (#(uuid derivation-id derivation-name derived_priority)
+
+ (if (datastore-check-if-derivation-conflicts?
+ datastore
+ agent-id
+ derivation-id)
+ (loop (cdr planned-builds))
+ `((uuid . ,uuid)
+ (derivation_name . ,derivation-name)
+ (derived_priority . ,derived_priority))))))))
+
+(define* (build-coordinator-list-allocation-plan-builds coordinator
+ agent-id
+ #:key limit)
+ (define (take* lst i)
+ (if (< (length lst) i)
+ lst
+ (take lst i)))
+
+ (define datastore
+ (build-coordinator-datastore coordinator))
+
+ (let ((build-ids
+ (build-coordinator-fetch-agent-allocation-plan coordinator
+ agent-id)))
+ (filter-map
+ (lambda (build-id)
+ (match (datastore-fetch-build-to-allocate datastore build-id)
+ (#(uuid derivation_id derivation_name derived_priority)
+ (let ((build-details (datastore-find-build datastore uuid)))
+ `((uuid . ,uuid)
+ (derivation_name . ,derivation_name)
+ (system . ,(datastore-find-build-derivation-system
+ datastore
+ uuid))
+ (priority . ,(assq-ref build-details 'priority))
+ (derived_priority . ,derived_priority)
+ (tags . ,(vector-map
+ (lambda (_ tag)
+ (match tag
+ ((key . value)
+ `((key . ,key)
+ (value . ,value)))))
+ (datastore-fetch-build-tags
+ datastore
+ uuid))))))
+ (#f #f)))
+ (if limit
+ (take* build-ids limit)
+ build-ids))))
+
(define (spawn-fiber-to-watch-for-deferred-builds coordinator)
(spawn-fiber
(lambda ()
+ (sleep 10)
(while #t
- (sleep 60) ; 1 minute
(with-exception-handler
(lambda (exn)
(simple-format (current-error-port)
@@ -1100,14 +1309,21 @@
exn))
(lambda ()
(let ((first-deferred-build
- (datastore-find-first-unallocated-deferred-build
- (build-coordinator-datastore coordinator))))
- (when (and first-deferred-build
- (time<=? (date->time-utc
- (assq-ref first-deferred-build 'deferred-until))
- (current-time)))
- (simple-format #t "guix-build-coordinator: triggering build allocation for deferred build: ~A\n" (assq-ref first-deferred-build 'uuid))
- (trigger-build-allocation coordinator))))
+ (datastore-find-deferred-build
+ (build-coordinator-datastore coordinator)
+ (lambda (build-details)
+ (build-coordinator-build-in-allocation-plan?
+ coordinator
+ (assq-ref build-details 'uuid))))))
+ (if (and first-deferred-build
+ (time<=? (date->time-utc
+ (assq-ref first-deferred-build 'deferred-until))
+ (current-time)))
+ (begin
+ (simple-format #t "guix-build-coordinator: triggering build allocation for deferred build: ~A\n" (assq-ref first-deferred-build 'uuid))
+ (trigger-build-allocation coordinator)
+ (sleep 10))
+ (sleep 60))))
#:unwind? #t)))
#:parallel? #t))
@@ -1255,12 +1471,18 @@
(datastore-list-unprocessed-hook-events
datastore
event-name
- (+ 1 (length in-progress-ids)))))
- (find
- (match-lambda
- ((id rest ...)
- (not (member id in-progress-ids))))
- potential-jobs)))
+ (+ 1 (length in-progress-ids))))
+ (job
+ (find
+ (match-lambda
+ ((id rest ...)
+ (not (member id in-progress-ids))))
+ potential-jobs)))
+ (log-msg
+ (build-coordinator-logger build-coordinator)
+ 'DEBUG
+ event-name " work queue, got job " job)
+ job))
(lambda (id event arguments)
(process-event id event arguments handler))
#:name (symbol->string event-name))))
@@ -1286,11 +1508,13 @@
(build-coordinator-datastore build-coordinator))
(define (allocate-one-build agent-id)
- (let ((build-details (datastore-fetch-build-to-allocate datastore agent-id)))
+ (let ((build-details (build-coordinator-fetch-build-to-allocate
+ build-coordinator agent-id)))
(if build-details
(let ((build-id (assq-ref build-details 'uuid)))
(datastore-insert-to-allocated-builds datastore agent-id (list build-id))
- (datastore-remove-builds-from-plan datastore (list build-id))
+ (build-coordinator-remove-build-from-allocation-plan
+ build-coordinator build-id)
build-details)
#f)))
@@ -1322,20 +1546,6 @@
(let ((new-builds
(allocate-several-builds agent
(- target-count start-count))))
- (unless (null? new-builds)
- (let ((allocation-plan-metric
- (metrics-registry-fetch-metric
- (slot-ref datastore 'metrics-registry)
- "build_allocation_plan_total")))
- (for-each
- (match-lambda
- ((agent-id . count)
- (metric-set allocation-plan-metric
- count
- #:label-values
- `((agent_id . ,agent-id)))))
- (datastore-count-build-allocation-plan-entries datastore))))
-
;; Previously allocate builds just returned newly allocated
;; builds, but if max-builds is provided, return all the
;; builds. This means the agent can handle this in a idempotent