diff options
Diffstat (limited to 'guix-build-coordinator/coordinator.scm')
-rw-r--r-- | guix-build-coordinator/coordinator.scm | 372 |
1 files changed, 291 insertions, 81 deletions
diff --git a/guix-build-coordinator/coordinator.scm b/guix-build-coordinator/coordinator.scm index b75b40f..dc72ca9 100644 --- a/guix-build-coordinator/coordinator.scm +++ b/guix-build-coordinator/coordinator.scm @@ -25,6 +25,7 @@ #:use-module (srfi srfi-11) #:use-module (srfi srfi-19) #:use-module (srfi srfi-26) + #:use-module (srfi srfi-43) #:use-module (srfi srfi-71) #:use-module (ice-9 ftw) #:use-module (ice-9 vlist) @@ -50,6 +51,7 @@ #:use-module (prometheus) #:use-module ((guix build syscalls) #:select (set-thread-name)) + #:use-module (guix store) #:use-module (guix derivations) #:use-module (guix build utils) #:use-module (guix-build-coordinator utils) @@ -97,6 +99,10 @@ build-coordinator-prompt-hook-processing-for-event start-hook-processing-threads + build-coordinator-allocation-plan-stats + build-coordinator-trigger-build-allocation + build-coordinator-list-allocation-plan-builds + build-output-file-location build-log-file-destination build-log-file-location @@ -116,7 +122,8 @@ (define-record-type <build-coordinator> (make-build-coordinator-record datastore hooks metrics-registry - allocation-strategy logger) + allocation-strategy allocator-channel + logger) build-coordinator? (datastore build-coordinator-datastore) (hooks build-coordinator-hooks) @@ -124,8 +131,10 @@ set-build-coordinator-hook-condvars!) (metrics-registry build-coordinator-metrics-registry) (allocation-strategy build-coordinator-allocation-strategy) - (allocator-thread build-coordinator-allocator-thread - set-build-coordinator-allocator-thread!) + (trigger-build-allocation + build-coordinator-trigger-build-allocation + set-build-coordinator-trigger-build-allocation!) + (allocator-channel build-coordinator-allocator-channel) (logger build-coordinator-logger) (events-channel build-coordinator-events-channel set-build-coordinator-events-channel!) @@ -363,7 +372,8 @@ (not (client-error? exn)))))) hooks (allocation-strategy - basic-build-allocation-strategy)) + basic-build-allocation-strategy) + (timestamp-log-output? #t)) (and (or (list? hooks) (begin (simple-format @@ -397,9 +407,11 @@ ;; In guile-lib v0.2.8 onwards, the formatter is ;; called with more arguments (lambda args ; lvl, time, str - (format #f "~a (~5a): ~a~%" - (strftime "%F %H:%M:%S" (localtime - (second args))) + (format #f "~a(~5a): ~a~%" + (if timestamp-log-output? + (strftime "%F %H:%M:%S " (localtime + (second args))) + "") (first args) (third args))))) (build-coordinator @@ -407,6 +419,7 @@ hooks metrics-registry allocation-strategy + (make-channel) lgr))) (add-handler! lgr port-log) @@ -423,6 +436,11 @@ ;; The logger assumes this (set-port-encoding! (current-output-port) "UTF-8") + ;; Work around my broken with-store/non-blocking in Guix + (let ((socket-file (%daemon-socket-uri))) + (%daemon-socket-uri + (string-append "file://" socket-file))) + (with-exception-handler (lambda (exn) (simple-format #t "failed enabling core dumps: ~A\n" exn)) @@ -452,7 +470,7 @@ (when update-datastore? (datastore-update (build-coordinator-datastore build-coordinator))) - (set-build-coordinator-allocator-thread! + (set-build-coordinator-trigger-build-allocation! build-coordinator (make-build-allocator-thread build-coordinator)) @@ -544,6 +562,8 @@ (spawn-fiber-to-watch-for-deferred-builds build-coordinator) + (spawn-build-allocation-plan-management-fiber build-coordinator) + (set-build-coordinator-scheduler! build-coordinator (current-scheduler)) @@ -588,8 +608,8 @@ finished?) (wait finished?)) - #:hz 10 - #:parallelism 2)) + #:hz 0 + #:parallelism 1)) finished?))))) (define* (submit-build build-coordinator derivation-file @@ -622,12 +642,19 @@ (derivation (if derivation-exists-in-database? #f ; unnecessary to fetch derivation - (with-fibers-port-timeouts - (lambda () - (call-with-delay-logging read-drv - #:threshold 10 - #:args (list derivation-file))) - #:timeout 30))) + ;; Bit of a hack, but offload reading the derivation to a + ;; thread so that it doesn't block the fibers thread, since + ;; local I/O doesn't cooperate with fibers + (datastore-call-with-transaction + datastore + (lambda _ + (with-fibers-port-timeouts + (lambda () + (call-with-delay-logging read-drv + #:threshold 10 + #:args (list derivation-file))) + #:timeout 240)) + #:readonly? #t))) (system (or system-from-database @@ -739,6 +766,19 @@ (build-coordinator-metrics-registry build-coordinator) "coordinator_submit_build_duration_seconds" (lambda () + (unless (derivation-path? derivation-file) + (raise-exception + (make-client-error 'invalid-derivation-file))) + + (string-for-each + (lambda (c) + (unless (or (char-alphabetic? c) + (char-numeric? c) + (member c '(#\+ #\- #\. #\_ #\? #\=))) + (raise-exception + (make-client-error 'invalid-character-in-derivation-file)))) + (store-path-base derivation-file)) + (match (check-whether-to-store-build) ('continue ;; Store the derivation first, so that listing related derivations @@ -746,12 +786,19 @@ (unless (datastore-find-derivation datastore derivation-file) (datastore-store-derivation datastore - (with-fibers-port-timeouts - (lambda () - (call-with-delay-logging read-drv - #:threshold 10 - #:args (list derivation-file))) - #:timeout 30))) + ;; Bit of a hack, but offload reading the derivation to a thread so + ;; that it doesn't block the fibers thread, since local I/O doesn't + ;; cooperate with fibers + (datastore-call-with-transaction + datastore + (lambda _ + (with-fibers-port-timeouts + (lambda () + (call-with-delay-logging read-drv + #:threshold 10 + #:args (list derivation-file))) + #:timeout 30)) + #:readonly? #t))) (let ((related-derivations-lacking-builds (if ensure-all-related-derivation-outputs-have-builds? @@ -828,7 +875,9 @@ (make-transaction-rollback-exception 'skipped-as-build-required-by-another))) - (datastore-remove-build-from-allocation-plan datastore uuid) + (build-coordinator-remove-build-from-allocation-plan + build-coordinator + uuid) (datastore-cancel-build datastore uuid) (datastore-insert-unprocessed-hook-event datastore "build-canceled" @@ -966,7 +1015,7 @@ (processor_count . ,processor-count)))) (define (trigger-build-allocation build-coordinator) - ((build-coordinator-allocator-thread build-coordinator))) + ((build-coordinator-trigger-build-allocation build-coordinator))) (define (build-coordinator-prompt-hook-processing-for-event build-coordinator event-name) @@ -987,16 +1036,20 @@ (allocator-proc datastore #:metrics-registry (build-coordinator-metrics-registry build-coordinator))))) - (datastore-replace-build-allocation-plan datastore new-plan) - - (let ((build-count-per-agent - (datastore-count-build-allocation-plan-entries - datastore))) - (build-coordinator-send-event - build-coordinator - "allocation-plan-update" - `((allocation_plan_counts . ,build-count-per-agent))))) - #t) + (build-coordinator-replace-build-allocation-plan + build-coordinator + new-plan) + + (build-coordinator-send-event + build-coordinator + "allocation-plan-update" + `((allocation_plan_counts + . ,(map + (match-lambda + ((agent-id . builds) + (cons agent-id (length builds)))) + new-plan)))) + #t)) (define (make-build-allocator-thread build-coordinator) (define mtx (make-mutex)) @@ -1068,31 +1121,187 @@ exn) (exit 1)) (lambda () - (let ((build-allocation-plan-total - (make-gauge-metric - (build-coordinator-metrics-registry build-coordinator) - "build_allocation_plan_total" - #:labels '(agent_id)))) - (with-time-logging - "allocator initialise metrics" - (for-each (match-lambda - ((agent-id . count) - (metric-set build-allocation-plan-total - count - #:label-values - `((agent_id . ,agent-id))))) - (datastore-count-build-allocation-plan-entries - (build-coordinator-datastore build-coordinator))))) - (update-build-allocation-plan-loop))))) trigger-build-allocation) +(define (spawn-build-allocation-plan-management-fiber coordinator) + (define allocation-plan '()) + + (define allocation-plan-metric + (make-gauge-metric + (build-coordinator-metrics-registry coordinator) + "build_allocation_plan_total" + #:labels '(agent_id))) + + (define (update-build-allocation-plan-metrics!) + (for-each + (match-lambda + ((agent-id . builds) + (metric-set allocation-plan-metric + (length builds) + #:label-values + `((agent_id . ,agent-id))))) + allocation-plan) + #t) + + (spawn-fiber + (lambda () + (while #t + (with-exception-handler + (lambda (exn) + (simple-format (current-error-port) + "exception in allocation plan fiber: ~A\n" + exn)) + (lambda () + (with-throw-handler #t + (lambda () + (match (get-message (build-coordinator-allocator-channel coordinator)) + (('stats reply) + (put-message + reply + (map + (match-lambda + ((agent-id . builds) + (cons agent-id (length builds)))) + allocation-plan))) + (('fetch-agent-plan agent-id reply) + (put-message + reply + (or (assoc-ref allocation-plan agent-id) '()))) + (('fetch-plan reply) + (put-message reply allocation-plan)) + (('remove-build uuid reply) + (set! + allocation-plan + (map + (match-lambda + ((agent-id . builds) + (cons agent-id + (remove + (lambda (build-uuid) + (string=? build-uuid uuid)) + builds)))) + allocation-plan)) + + (put-message reply #t)) + (('replace new-plan reply) + + (set! allocation-plan new-plan) + + (update-build-allocation-plan-metrics!) + + (put-message reply #t)))) + (lambda _ + (backtrace)))) + #:unwind? #t))))) + +(define (build-coordinator-allocation-plan-stats coordinator) + (let ((reply (make-channel))) + (put-message (build-coordinator-allocator-channel coordinator) + (list 'stats reply)) + (get-message reply))) + +(define (build-coordinator-fetch-agent-allocation-plan coordinator agent-id) + (let ((reply (make-channel))) + (put-message (build-coordinator-allocator-channel coordinator) + (list 'fetch-agent-plan agent-id reply)) + (get-message reply))) + +(define (build-coordinator-allocation-plan coordinator) + (let ((reply (make-channel))) + (put-message (build-coordinator-allocator-channel coordinator) + (list 'fetch-plan reply)) + (get-message reply))) + +(define (build-coordinator-build-in-allocation-plan? coordinator uuid) + (any + (match-lambda + ((agent-id . build-uuids) + (->bool (member uuid build-uuids string=?)))) + (build-coordinator-allocation-plan coordinator))) + +(define (build-coordinator-remove-build-from-allocation-plan coordinator uuid) + (let ((reply (make-channel))) + (put-message (build-coordinator-allocator-channel coordinator) + (list 'remove-build uuid reply)) + (get-message reply))) + +(define (build-coordinator-replace-build-allocation-plan coordinator plan) + (let ((reply (make-channel))) + (put-message (build-coordinator-allocator-channel coordinator) + (list 'replace plan reply)) + (get-message reply))) + +(define (build-coordinator-fetch-build-to-allocate coordinator + agent-id) + (define datastore + (build-coordinator-datastore coordinator)) + + (let loop ((planned-builds + (build-coordinator-fetch-agent-allocation-plan coordinator + agent-id))) + + (if (null? planned-builds) + #f + (match (datastore-fetch-build-to-allocate datastore (first planned-builds)) + (#f #f) + (#(uuid derivation-id derivation-name derived_priority) + + (if (datastore-check-if-derivation-conflicts? + datastore + agent-id + derivation-id) + (loop (cdr planned-builds)) + `((uuid . ,uuid) + (derivation_name . ,derivation-name) + (derived_priority . ,derived_priority)))))))) + +(define* (build-coordinator-list-allocation-plan-builds coordinator + agent-id + #:key limit) + (define (take* lst i) + (if (< (length lst) i) + lst + (take lst i))) + + (define datastore + (build-coordinator-datastore coordinator)) + + (let ((build-ids + (build-coordinator-fetch-agent-allocation-plan coordinator + agent-id))) + (filter-map + (lambda (build-id) + (match (datastore-fetch-build-to-allocate datastore build-id) + (#(uuid derivation_id derivation_name derived_priority) + (let ((build-details (datastore-find-build datastore uuid))) + `((uuid . ,uuid) + (derivation_name . ,derivation_name) + (system . ,(datastore-find-build-derivation-system + datastore + uuid)) + (priority . ,(assq-ref build-details 'priority)) + (derived_priority . ,derived_priority) + (tags . ,(vector-map + (lambda (_ tag) + (match tag + ((key . value) + `((key . ,key) + (value . ,value))))) + (datastore-fetch-build-tags + datastore + uuid)))))) + (#f #f))) + (if limit + (take* build-ids limit) + build-ids)))) + (define (spawn-fiber-to-watch-for-deferred-builds coordinator) (spawn-fiber (lambda () + (sleep 10) (while #t - (sleep 60) ; 1 minute (with-exception-handler (lambda (exn) (simple-format (current-error-port) @@ -1100,14 +1309,21 @@ exn)) (lambda () (let ((first-deferred-build - (datastore-find-first-unallocated-deferred-build - (build-coordinator-datastore coordinator)))) - (when (and first-deferred-build - (time<=? (date->time-utc - (assq-ref first-deferred-build 'deferred-until)) - (current-time))) - (simple-format #t "guix-build-coordinator: triggering build allocation for deferred build: ~A\n" (assq-ref first-deferred-build 'uuid)) - (trigger-build-allocation coordinator)))) + (datastore-find-deferred-build + (build-coordinator-datastore coordinator) + (lambda (build-details) + (build-coordinator-build-in-allocation-plan? + coordinator + (assq-ref build-details 'uuid)))))) + (if (and first-deferred-build + (time<=? (date->time-utc + (assq-ref first-deferred-build 'deferred-until)) + (current-time))) + (begin + (simple-format #t "guix-build-coordinator: triggering build allocation for deferred build: ~A\n" (assq-ref first-deferred-build 'uuid)) + (trigger-build-allocation coordinator) + (sleep 10)) + (sleep 60)))) #:unwind? #t))) #:parallel? #t)) @@ -1255,12 +1471,18 @@ (datastore-list-unprocessed-hook-events datastore event-name - (+ 1 (length in-progress-ids))))) - (find - (match-lambda - ((id rest ...) - (not (member id in-progress-ids)))) - potential-jobs))) + (+ 1 (length in-progress-ids)))) + (job + (find + (match-lambda + ((id rest ...) + (not (member id in-progress-ids)))) + potential-jobs))) + (log-msg + (build-coordinator-logger build-coordinator) + 'DEBUG + event-name " work queue, got job " job) + job)) (lambda (id event arguments) (process-event id event arguments handler)) #:name (symbol->string event-name)))) @@ -1286,11 +1508,13 @@ (build-coordinator-datastore build-coordinator)) (define (allocate-one-build agent-id) - (let ((build-details (datastore-fetch-build-to-allocate datastore agent-id))) + (let ((build-details (build-coordinator-fetch-build-to-allocate + build-coordinator agent-id))) (if build-details (let ((build-id (assq-ref build-details 'uuid))) (datastore-insert-to-allocated-builds datastore agent-id (list build-id)) - (datastore-remove-builds-from-plan datastore (list build-id)) + (build-coordinator-remove-build-from-allocation-plan + build-coordinator build-id) build-details) #f))) @@ -1322,20 +1546,6 @@ (let ((new-builds (allocate-several-builds agent (- target-count start-count)))) - (unless (null? new-builds) - (let ((allocation-plan-metric - (metrics-registry-fetch-metric - (slot-ref datastore 'metrics-registry) - "build_allocation_plan_total"))) - (for-each - (match-lambda - ((agent-id . count) - (metric-set allocation-plan-metric - count - #:label-values - `((agent_id . ,agent-id))))) - (datastore-count-build-allocation-plan-entries datastore)))) - ;; Previously allocate builds just returned newly allocated ;; builds, but if max-builds is provided, return all the ;; builds. This means the agent can handle this in a idempotent |