;;; Guix Build Coordinator ;;; ;;; Copyright © 2020 Christopher Baines ;;; ;;; This file is part of the guix-build-coordinator. ;;; ;;; The Guix Build Coordinator is free software; you can redistribute ;;; it and/or modify it under the terms of the GNU General Public ;;; License as published by the Free Software Foundation; either ;;; version 3 of the License, or (at your option) any later version. ;;; ;;; The Guix Build Coordinator is distributed in the hope that it will ;;; be useful, but WITHOUT ANY WARRANTY; without even the implied ;;; warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ;;; See the GNU General Public License for more details. ;;; ;;; You should have received a copy of the GNU General Public License ;;; along with the guix-data-service. If not, see ;;; . (define-module (guix-build-coordinator coordinator) #:use-module (srfi srfi-1) #:use-module (srfi srfi-9) #:use-module (srfi srfi-26) #:use-module (ice-9 ftw) #:use-module (ice-9 match) #:use-module (ice-9 format) #:use-module (ice-9 atomic) #:use-module (ice-9 threads) #:use-module (ice-9 exceptions) #:use-module (web uri) #:use-module (oop goops) #:use-module (logging logger) #:use-module (logging port-log) #:use-module (gcrypt random) #:use-module (fibers) #:use-module (fibers channels) #:use-module (fibers conditions) #:use-module (prometheus) #:use-module (guix derivations) #:use-module (guix build utils) #:use-module (guix-build-coordinator utils) #:use-module (guix-build-coordinator utils fibers) #:use-module (guix-build-coordinator config) #:use-module (guix-build-coordinator datastore) #:use-module (guix-build-coordinator build-allocator) #:use-module (guix-build-coordinator agent-messaging http server) #:use-module (guix-build-coordinator client-communication) #:export (make-agent-error ; TODO Remove export once unused agent-error? agent-error-details make-build-coordinator build-coordinator-datastore build-coordinator-hooks build-coordinator-metrics-registry build-coordinator-allocation-strategy build-coordinator-logger %default-agent-uri %default-client-uri perform-coordinator-service-startup run-coordinator-service submit-build cancel-build new-agent new-agent-password fetch-builds agent-details trigger-build-allocation build-coordinator-prompt-hook-processing-for-event start-hook-processing-threads build-output-file-location build-log-file-destination build-log-file-location handle-build-start-report handle-build-result handle-setup-failure-report)) (define-exception-type &agent-error &error make-agent-error agent-error? (details agent-error-details)) (define-record-type (make-build-coordinator-record datastore hooks metrics-registry allocation-strategy logger) build-coordinator? (datastore build-coordinator-datastore) (hooks build-coordinator-hooks) (hook-condvars build-coordinator-hook-condvars set-build-coordinator-hook-condvars!) (metrics-registry build-coordinator-metrics-registry) (allocation-strategy build-coordinator-allocation-strategy) (allocator-thread build-coordinator-allocator-thread set-build-coordinator-allocator-thread!) (logger build-coordinator-logger)) (define %known-hooks '(build-submitted build-started build-success build-failure build-canceled build-missing-inputs)) (define* (make-build-coordinator #:key datastore hooks (metrics-registry (make-metrics-registry #:namespace "guixbuildcoordinator")) (allocation-strategy basic-build-allocation-strategy)) (and (or (list? hooks) (begin (simple-format #t "warning: guix-build-coordinator: hooks should be a list\n") #f)) (for-each (match-lambda ((hook-name . hook) (unless (member hook-name %known-hooks) (simple-format #t "warning: guix-build-coordinator: hook name ~A not recognised it should be one of: ~A\n" hook-name (string-join (map symbol->string %known-hooks) ", "))) (unless (procedure? hook) (simple-format #t "warning: guix-build-coordinator: hook ~A value is not a procedure ~A\n" hook-name hook))) (unknown (simple-format #t "warning: guix-build-coordinator: hooks entry is not a pair: ~A\n" unknown))) hooks)) (let* ((lgr (make )) (port-log (make #:port (current-output-port) #:formatter (lambda (lvl time str) (format #f "~a (~5a): ~a~%" (strftime "%F %H:%M:%S" (localtime time)) lvl str)))) (build-coordinator (make-build-coordinator-record datastore hooks metrics-registry allocation-strategy lgr))) (add-handler! lgr port-log) (open-log! lgr) (set-build-coordinator-allocator-thread! build-coordinator (make-build-allocator-thread build-coordinator)) (set-build-coordinator-hook-condvars! build-coordinator (start-hook-processing-threads build-coordinator)) build-coordinator)) (define* (perform-coordinator-service-startup build-coordinator #:key (update-datastore? #t) (pid-file #f) (trigger-build-allocation? #t)) (when update-datastore? (datastore-update (build-coordinator-datastore build-coordinator))) (when pid-file (call-with-output-file pid-file (lambda (port) (simple-format port "~A\n" (getpid))))) (log-msg (build-coordinator-logger build-coordinator) 'INFO "initialising metrics") (with-time-logging "datastore initialise metrics" (datastore-initialise-metrics! (build-coordinator-datastore build-coordinator))) (when trigger-build-allocation? (trigger-build-allocation build-coordinator))) (define %default-agent-uri (string->uri "http://0.0.0.0:8745")) (define %default-client-uri (string->uri "http://127.0.0.1:8746")) (define* (run-coordinator-service build-coordinator #:key (update-datastore? #t) (pid-file #f) (agent-communication-uri %default-agent-uri) (client-communication-uri %default-client-uri) secret-key-base) (perform-coordinator-service-startup build-coordinator #:update-datastore? update-datastore? #:pid-file pid-file) ;; Create some worker thread channels, which need to be created prior ;; to run-fibers being called. (let ((chunked-request-channel ;; There are fibers issues when trying to read the chunked ;; requests, so do this in dedicated threads. (make-worker-thread-channel (const '()) #:parallelism 8)) (substitutes-channel (make-worker-thread-channel (const '()) #:parallelism 2))) (let ((finished? (make-condition))) (call-with-sigint (lambda () (run-fibers (lambda () (datastore-spawn-fibers (build-coordinator-datastore build-coordinator)) ;; Start the agent messaging server (match (uri-scheme agent-communication-uri) ('http (let ((host (uri-host agent-communication-uri)) (port (uri-port agent-communication-uri))) (http-agent-messaging-start-server port host secret-key-base build-coordinator chunked-request-channel) (log-msg 'INFO "listening on " host ":" port)))) ;; Start the client messaging server (start-client-request-server secret-key-base (uri-host client-communication-uri) (uri-port client-communication-uri) build-coordinator substitutes-channel) (wait finished?)))) finished?)))) (define* (submit-build build-coordinator derivation-file #:key requested-uuid (priority 0) (ignore-if-build-for-derivation-exists? #f) (ignore-if-build-for-outputs-exists? #f) (ensure-all-related-derivation-outputs-have-builds? #f) (tags '())) (define datastore (build-coordinator-datastore build-coordinator)) (define (build-for-derivation-exists?) (not (null? (datastore-list-builds-for-derivation datastore derivation-file)))) (define (build-for-output-already-exists?) (let ((system (datastore-find-derivation-system datastore derivation-file))) (any (lambda (output-details) (let ((builds-for-output (datastore-list-builds-for-output-and-system datastore (assq-ref output-details 'output) system))) (not (null? builds-for-output)))) (datastore-find-derivation-outputs datastore derivation-file)))) (define (store-build derivation-name uuid priority tags) (datastore-insert-build datastore uuid derivation-name priority) (datastore-insert-unprocessed-hook-event datastore "build-submitted" (list uuid)) (unless (null? tags) (datastore-insert-build-tags datastore uuid tags)) #t) (define build-id (or requested-uuid (random-v4-uuid))) (define (perform-datastore-changes db) ;; Actually create a build (when ensure-all-related-derivation-outputs-have-builds? (let ((derivations-lacking-builds (datastore-list-related-derivations-with-no-build-for-outputs datastore derivation-file))) (for-each (lambda (related-derivation) (let ((related-uuid (random-v4-uuid))) (simple-format #t "submtiting ~A for related ~A\n" related-uuid related-derivation) (store-build related-derivation related-uuid ;; Let the scheduler take care of ;; the prioritisation 0 tags))) derivations-lacking-builds))) (store-build derivation-file build-id priority tags) #t) (call-with-duration-metric (build-coordinator-metrics-registry build-coordinator) "coordinator_submit_build_duration_seconds" (lambda () (if (and ignore-if-build-for-derivation-exists? (build-for-derivation-exists?)) '((no-build-submitted . build-already-exists-for-this-derivation)) (if (and ignore-if-build-for-outputs-exists? (build-for-output-already-exists?)) '((no-build-submitted . build-already-exists-for-a-output)) (begin (datastore-call-with-transaction datastore perform-datastore-changes #:duration-metric-name "store_build") (build-coordinator-prompt-hook-processing-for-event build-coordinator 'build-submitted) (trigger-build-allocation build-coordinator) `((build-submitted . ,build-id)))))))) (define (cancel-build build-coordinator uuid) (define datastore (build-coordinator-datastore build-coordinator)) (datastore-call-with-transaction datastore (lambda (db) (let ((build-details (datastore-find-build datastore uuid))) (when (assq-ref build-details 'canceled) (raise-exception (make-exception-with-message "cannot cancel and already canceled build"))) (when (assq-ref build-details 'processed) (raise-exception (make-exception-with-message "cannot cancel and already processed build")))) (datastore-remove-build-from-allocation-plan datastore uuid) (datastore-cancel-build datastore uuid) (datastore-insert-unprocessed-hook-event datastore "build-canceled" (list uuid)))) (build-coordinator-prompt-hook-processing-for-event build-coordinator 'build-canceled) #t) (define* (new-agent datastore #:key requested-uuid description) (let ((uuid (or requested-uuid (random-v4-uuid)))) (datastore-new-agent datastore uuid description) uuid)) (define* (new-agent-password datastore #:key agent) (let ((password (random-token))) (datastore-new-agent-password datastore agent password) password)) (define (trigger-build-allocation build-coordinator) ((build-coordinator-allocator-thread build-coordinator))) (define (build-coordinator-prompt-hook-processing-for-event build-coordinator event-name) (and=> (assoc-ref (build-coordinator-hook-condvars build-coordinator) event-name) (lambda (condvar) (signal-condition-variable condvar) #t))) (define (allocate-builds build-coordinator) (define datastore (build-coordinator-datastore build-coordinator)) (let* ((allocator-proc (build-coordinator-allocation-strategy build-coordinator)) (new-plan (with-time-logging "allocating builds" (allocator-proc datastore #:metrics-registry (build-coordinator-metrics-registry build-coordinator))))) (datastore-replace-build-allocation-plan datastore new-plan)) #t) (define (make-build-allocator-thread build-coordinator) (define mtx (make-mutex)) (define v (make-condition-variable)) (define allocation-needed (make-atomic-box #f)) (define (trigger-build-allocation) (atomic-box-set! allocation-needed #t) (signal-condition-variable v)) (define success-counter-metric (make-counter-metric (build-coordinator-metrics-registry build-coordinator) "allocator_allocations_total")) (define failure-counter-metric (make-counter-metric (build-coordinator-metrics-registry build-coordinator) "allocator_failures_total")) (define (allocate-builds-loop) (while #t (with-mutex mtx (let ((previous-allocation-needed-value (atomic-box-swap! allocation-needed #f))) (when (eq? #f previous-allocation-needed-value) (wait-condition-variable v mtx) (atomic-box-set! allocation-needed #f))) (call-with-duration-metric (build-coordinator-metrics-registry build-coordinator) "allocate_builds_duration_seconds" (lambda () (with-exception-handler (lambda (exn) (simple-format (current-error-port) "build-allocator-thread: exception: ~A\n" exn) (metric-increment failure-counter-metric) (atomic-box-set! allocation-needed #t)) (lambda () (with-exception-handler (lambda (exn) (with-exception-handler (lambda (exn) (simple-format (current-error-port) "exception when printing backtrace: ~A\n" exn) (backtrace) (raise-exception exn)) (lambda () (backtrace) (simple-format #t "\nfinished printing backtrace\n") (force-output))) (raise-exception exn)) (lambda () (with-timeout (* 1000 60 10) ; 10 minutes (raise-exception (make-exception-with-message "timeout allocating builds")) (allocate-builds build-coordinator)) (metric-increment success-counter-metric)))) #:unwind? #t)) #:buckets ((@@ (prometheus) exponential-histogram-buckets) ; TODO #:start 1 #:end (* 30 60)))))) (parameterize (((@@ (fibers internal) current-fiber) #f)) (call-with-new-thread (lambda () (with-exception-handler (lambda (exn) (simple-format (current-error-port) "error: allocator thread: ~A\n" exn) (exit 1)) (lambda () (allocate-builds-loop)))))) trigger-build-allocation) (define (start-hook-processing-threads build-coordinator) (define wait-timeout-seconds (* 60 5)) (define datastore (build-coordinator-datastore build-coordinator)) (define success-counter-metric (make-counter-metric (build-coordinator-metrics-registry build-coordinator) "hook_success_total" #:labels '(event))) (define failure-counter-metric (make-counter-metric (build-coordinator-metrics-registry build-coordinator) "hook_failure_total" #:labels '(event))) (define (process-event event-name handler) (match (datastore-list-unprocessed-hook-events datastore event-name 1) (() #f) (((id event arguments)) (catch #t (lambda () (log-msg (build-coordinator-logger build-coordinator) 'DEBUG "processing " event " event: " arguments) (call-with-duration-metric (build-coordinator-metrics-registry build-coordinator) "hook_duration_seconds" (lambda () (apply handler build-coordinator arguments)) #:labels '(event) #:label-values `((event . ,event))) (log-msg (build-coordinator-logger build-coordinator) 'DEBUG event " handler finished") (datastore-delete-unprocessed-hook-event datastore id) ;; If this is the hook for a successful build, once the hook ;; completed successfully, delete the nar files for this build. (when (eq? 'build-success event) (match arguments ((build-id) (let ((data-location (build-data-location build-id))) (when (file-exists? data-location) (delete-file-recursively data-location)))))) (metric-increment success-counter-metric #:label-values `((event . ,event)))) (lambda (key . args) (log-msg (build-coordinator-logger build-coordinator) 'ERROR "error running " event " hook: " key " " args) (metric-increment failure-counter-metric #:label-values `((event . ,event))))) #t))) (map (match-lambda ((event-name . handler) (let ((mtx (make-mutex)) (condvar (make-condition-variable))) (parameterize (((@@ (fibers internal) current-fiber) #f)) (call-with-new-thread (lambda () (lock-mutex mtx) (with-exception-handler (lambda (exn) (log-msg (build-coordinator-logger build-coordinator) 'CRITICAL "hook processing thread " event-name " exception: " exn) (primitive-exit 1)) (lambda () (while #t (unless (process-event event-name handler) (wait-condition-variable condvar mtx (+ (current-time) wait-timeout-seconds (random 30)))))))))) (cons event-name condvar)))) (build-coordinator-hooks build-coordinator))) (define (fetch-builds build-coordinator agent systems max-builds deprecated-requested-count) (call-with-duration-metric (build-coordinator-metrics-registry build-coordinator) "coordinator_fetch_builds_duration_seconds" (lambda () (let ((update-made (datastore-update-agent-requested-systems (build-coordinator-datastore build-coordinator) agent systems))) (when update-made (trigger-build-allocation build-coordinator))) (datastore-allocate-builds-to-agent (build-coordinator-datastore build-coordinator) agent max-builds deprecated-requested-count)))) (define (agent-details datastore agent-id) (let ((agent (datastore-find-agent datastore agent-id)) (allocated-builds (datastore-list-agent-builds datastore agent-id))) `(,@agent ; description (builds . ,(list->vector allocated-builds))))) (define (build-data-location build-id ) (string-append (%config 'builds-dir) "/" build-id)) (define (build-output-file-location datastore build-id output-name) (let ((output (datastore-find-build-output datastore build-id output-name))) (string-append (build-data-location build-id) "/outputs/" output-name "/" (basename output) ".nar.lz"))) (define (build-log-file-directory build-id) (string-append (%config 'build-logs-dir) "/" build-id)) (define (build-log-file-destination build-id format) (string-append (build-log-file-directory build-id) "/" (cond ((string=? format "bzip2") "log.bz2") ((string=? format "gzip") "log.gz") (else (error "unknown log format" format))))) (define (build-log-file-location build-id) (let* ((directory (build-log-file-directory build-id)) (potential-files (scandir directory (lambda (file) (and (not (member file '("." ".."))) (not (string-suffix? ".tmp" file))))))) (match potential-files ((file) (string-append directory "/" file)) (() #f) (#f #f) ; directory doesn't exist (files (error (simple-format #f "found multiple files for ~A: ~A" build-id files)))))) (define (handle-build-result build-coordinator agent-id build-id result-json) (define datastore (build-coordinator-datastore build-coordinator)) (call-with-duration-metric (build-coordinator-metrics-registry build-coordinator) "coordinator_handle_build_result_duration_seconds" (lambda () (let* ((result (assoc-ref result-json "result")) (success? (string=? result "success"))) (let ((build-details (datastore-find-build datastore build-id))) (when (assq-ref build-details 'processed) (raise-exception (make-agent-error 'build_already_processed)))) (when success? (unless (build-log-file-location build-id) (raise-exception (make-agent-error 'missing_build_log_file))) (for-each (lambda (output) (let ((output-location (build-output-file-location datastore build-id (assq-ref output 'name)))) (unless (file-exists? output-location) (raise-exception (make-agent-error `((missing_output . ,(assq-ref output 'name)))))))) (datastore-list-build-outputs datastore build-id))) (datastore-store-build-result datastore build-id agent-id (if success? "success" "failure") #f ; failure reason, TODO (if success? (vector->list (assoc-ref result-json "outputs")) #f) ;; TODO Check what the value of this is (assoc-ref result-json "end_time")) (build-coordinator-prompt-hook-processing-for-event build-coordinator (if success? 'build-success 'build-failure)) #t)))) (define (handle-build-start-report build-coordinator agent-id build-id) (datastore-store-build-start (build-coordinator-datastore build-coordinator) build-id agent-id) (build-coordinator-prompt-hook-processing-for-event build-coordinator 'build-started)) (define (handle-setup-failure-report build-coordinator agent-id build-id report-json) (define datastore (build-coordinator-datastore build-coordinator)) (let ((failure-reason (assoc-ref report-json "failure_reason"))) (if (string=? failure-reason "missing_inputs") ;; For missing inputs, we need to store the inputs that were missing, ;; so that has a special function (let ((missing-inputs (vector->list (assoc-ref report-json "missing_inputs")))) (datastore-store-setup-failure/missing-inputs datastore build-id agent-id missing-inputs) (build-coordinator-prompt-hook-processing-for-event build-coordinator 'build-missing-inputs)) (datastore-store-setup-failure datastore build-id agent-id failure-reason))))