;;; Guix Build Coordinator ;;; ;;; Copyright © 2020 Christopher Baines ;;; ;;; This file is part of the guix-build-coordinator. ;;; ;;; The Guix Build Coordinator is free software; you can redistribute ;;; it and/or modify it under the terms of the GNU General Public ;;; License as published by the Free Software Foundation; either ;;; version 3 of the License, or (at your option) any later version. ;;; ;;; The Guix Build Coordinator is distributed in the hope that it will ;;; be useful, but WITHOUT ANY WARRANTY; without even the implied ;;; warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ;;; See the GNU General Public License for more details. ;;; ;;; You should have received a copy of the GNU General Public License ;;; along with the guix-data-service. If not, see ;;; . (define-module (guix-build-coordinator build-allocator) #:use-module (srfi srfi-1) #:use-module (srfi srfi-11) #:use-module (srfi srfi-43) #:use-module (ice-9 match) #:use-module (prometheus) #:use-module (guix memoization) #:use-module (guix-build-coordinator utils) #:use-module (guix-build-coordinator datastore) #:use-module (guix-build-coordinator coordinator) #:export (basic-build-allocation-strategy derivation-ordered-build-allocation-strategy)) (define (agent-tags-match-build-tags agent-tags tags-for-build agent-id build-id) (let ((agent-tags (assoc-ref agent-tags agent-id))) (or (null? agent-tags) (let ((build-tags (tags-for-build build-id))) (every (match-lambda ((agent-key . agent-value) (let ((matching-build-tags (vector-fold (lambda (_ result tag) (if (string=? (car tag) agent-key) (cons tag result) result)) '() build-tags))) (if (null? matching-build-tags) #t (any (match-lambda ((_ . build-value) (string=? agent-value build-value))) matching-build-tags))))) (vector->list agent-tags)))))) (define* (basic-build-allocation-strategy datastore #:key (planned-builds-for-agent-limit 4096) metrics-registry) (define (log . args) (when #f (simple-format #t "allocator: ~A\n" (string-join (map (lambda (arg) (simple-format #f "~A" arg)) args) " ")))) (define allocator-considered-builds-metric (when metrics-registry (let ((name "allocator_considered_builds")) (or (metrics-registry-fetch-metric metrics-registry name) (make-gauge-metric metrics-registry name #:labels '(system)))))) (define cached/list-builds-for-output (mlambda (output) (datastore-list-builds-for-output datastore output))) (let* ((agents (filter (lambda (agent) (assoc-ref agent 'active)) (datastore-list-agents datastore))) (agent-tags (map (lambda (agent-details) (let ((agent-id (assq-ref agent-details 'uuid))) (cons agent-id (datastore-fetch-agent-tags datastore agent-id)))) agents)) (requested-systems-by-agent (map (lambda (agent-details) (let ((agent-id (assq-ref agent-details 'uuid))) (cons agent-id (datastore-agent-requested-systems datastore agent-id)))) agents)) (builds (datastore-list-unprocessed-builds datastore)) (builds-count (length builds)) (setup-failures-hash (datastore-fetch-setup-failures datastore)) (derived-build-priorities-hash ;; Mapping from build_id to priority, initialised at the individual ;; priorities for the builds (let ((table (make-hash-table builds-count))) (for-each (lambda (build) (hash-set! table (assq-ref build 'uuid) (assq-ref build 'priority))) builds) table)) ;; build_id -> (list build_id ...) indicating that for the build_id ;; used as the key, the build_ids given as the value should happen ;; first. (build-ordering-hints-hash (make-hash-table)) (builds-ready-to-go-hash (make-hash-table))) (define systems-for-builds ;; TODO Should be one query (let ((table (make-hash-table builds-count))) (for-each (lambda (build) (let ((build-id (assq-ref build 'uuid))) (hash-set! table build-id (datastore-find-build-derivation-system datastore build-id)))) builds) table)) (define tags-for-build (let ((build-tags (make-hash-table))) (lambda (build-id) (let ((tags (hash-ref build-tags build-id))) (if (eq? #f tags) (let ((tags (datastore-fetch-build-tags datastore build-id))) (hash-set! build-tags build-id tags) tags) tags))))) (define (filter-builds-for-agent agent-id) (define requested-systems (assoc-ref requested-systems-by-agent agent-id)) (define (relevant-setup-failure? setup-failure) (log "setup failure:" setup-failure) (let ((failure-reason (assq-ref setup-failure 'failure-reason))) (cond ((string=? failure-reason "missing_inputs") ;; missing_inputs setup failures that should be resolved have been ;; filtered out by this point, so this is a relevant setup failure #t) ((string=? failure-reason "could_not_delete_outputs") ;; This problem might go away, but just don't try the same agent ;; again for now. (string=? (assq-ref setup-failure 'agent-id) agent-id)) ((string=? failure-reason "error_fetching_derivation") ;; This problem might go away, but just don't try the same agent ;; again for now. (string=? (assq-ref setup-failure 'agent-id) agent-id)) (else (error "Unknown setup failure " failure-reason))))) (lambda (build) (log "build:" (assq-ref build 'uuid)) (and (or (null? requested-systems) (let ((build-system (hash-ref systems-for-builds (assq-ref build 'uuid)))) (member build-system requested-systems))) (agent-tags-match-build-tags agent-tags tags-for-build agent-id (assq-ref build 'uuid)) (let* ((build-id (assq-ref build 'uuid)) (setup-failures-for-build (or (hash-ref setup-failures-hash build-id) '())) (relevant-setup-failures (filter relevant-setup-failure? setup-failures-for-build))) (log "relevant setup failures:" relevant-setup-failures) (if (null? relevant-setup-failures) #t #f))))) (define (build-sorting-function-for-agent agent-id) (lambda (a b) (let ((a-priority (hash-ref derived-build-priorities-hash (assq-ref a 'uuid))) (b-priority (hash-ref derived-build-priorities-hash (assq-ref b 'uuid)))) (< b-priority a-priority)))) (define (limit-planned-builds builds) (if planned-builds-for-agent-limit (if (> (length builds) planned-builds-for-agent-limit) (take builds planned-builds-for-agent-limit) builds) builds)) (define (limit-processed-sublists lists) (if planned-builds-for-agent-limit (let loop ((remaining-lists lists) (kept-lists '()) (count 0)) (if (or (>= count planned-builds-for-agent-limit) (null? remaining-lists)) (reverse kept-lists) (loop (cdr remaining-lists) (cons (first remaining-lists) kept-lists) (+ count (length (first remaining-lists)))))) lists)) (define (break-builds-in-to-priority-sublists all-builds) (define (build-priority build) (hash-ref derived-build-priorities-hash (assq-ref build 'uuid))) (let loop ((result '()) (builds all-builds) (current-priority-builds '()) (current-priority (build-priority (first all-builds)))) (if (null? builds) (reverse (cons (reverse current-priority-builds) result)) (let ((build (car builds))) (if (= (build-priority build) current-priority) (loop result (cdr builds) (cons build current-priority-builds) current-priority) (loop (cons (reverse current-priority-builds) result) (cdr builds) (list build) (build-priority build))))))) (define (sort-priority-sublist builds-list) (define (build-id build) (assq-ref build 'uuid)) (define (builds-that-should-happen-first build-id) (hash-ref build-ordering-hints-hash build-id '())) (define seen-builds-hash (make-hash-table)) (define deferred-builds-last-count 0) (define (all-inputs-built? derivation) (let ((inputs (datastore-find-derivation-inputs datastore derivation))) (every (lambda (input-details) (let ((output (assq-ref input-details 'output))) (any (lambda (output-build) (string=? (or (assq-ref output-build 'result) "unknown") "success")) (cached/list-builds-for-output output)))) inputs))) (define (push-deferred-builds-to-the-back builds-list) (let loop ((result '()) (builds builds-list) (builds-to-defer '())) (if (null? builds) (if (null? builds-to-defer) result (if (= (length builds-to-defer) deferred-builds-last-count) ;; There can be loops in the graph of missing inputs, so ;; give up if the ordering doesn't seem to end (append result builds-to-defer) (begin (set! deferred-builds-last-count (length builds-to-defer)) (loop result builds-to-defer '())))) (let ((build (car builds))) (if (every (lambda (required-build-id) (hash-ref seen-builds-hash required-build-id)) (builds-that-should-happen-first (build-id build))) (begin (hash-set! seen-builds-hash (build-id build) #t) (loop (cons build result) (cdr builds) builds-to-defer)) (loop result (cdr builds) (cons build builds-to-defer))))))) (let-values (((ready-builds other-builds) (partition (lambda (build) (or (hash-ref builds-ready-to-go-hash (assq-ref build 'uuid)) (all-inputs-built? (assq-ref build 'derivation-name)))) builds-list))) (append ready-builds (push-deferred-builds-to-the-back other-builds)))) (define (treat-build-as-required build-id priority required-build-id required-build-derived-priority) ;; Add an entry to the build-ordering-hints-hash to indicate that ;; required-build-id should happen prior to build-id (hash-set! build-ordering-hints-hash build-id (cons required-build-id (hash-ref build-ordering-hints-hash build-id '()))) (when (> priority required-build-derived-priority) ;; Bump the priority of the build (hash-set! derived-build-priorities-hash required-build-id priority))) (when metrics-registry (let ((counts (hash-fold (lambda (_ system result) `(,@(alist-delete system result) (,system . ,(+ 1 (or (assoc-ref result system) 0))))) '() systems-for-builds))) (for-each (match-lambda ((system . count) (metric-set allocator-considered-builds-metric count #:label-values `((system . ,system))))) counts))) ;; Go through the setup failures and update the ;; derived-build-priorities-hash to reflect the priorities of builds based ;; on the builds that would be "unblocked" if they were completed. (for-each (lambda (setup-failure-build-id) (let ((setup-failures (hash-ref setup-failures-hash setup-failure-build-id)) (setup-failure-build-derived-priority (hash-ref derived-build-priorities-hash setup-failure-build-id))) (for-each (lambda (setup-failure) (let ((outputs-should-be-available (map (lambda (output) (let ((builds (cached/list-builds-for-output output))) (if (any (lambda (output-build) (string=? (or (assq-ref output-build 'result) "unknown") "success")) builds) #t (begin (for-each (lambda (build) (let* ((required-build-id (assq-ref build 'uuid)) (required-build-derived-priority (hash-ref derived-build-priorities-hash required-build-id))) (when (and (not (assq-ref build 'processed)) ;; The build might not be included ;; in this allocation, so skip it ;; if it isn't (number? required-build-derived-priority)) (treat-build-as-required setup-failure-build-id setup-failure-build-derived-priority required-build-id required-build-derived-priority)))) builds) #f)))) (datastore-list-setup-failure-missing-inputs datastore (assq-ref setup-failure 'id))))) (when (every (lambda (x) (eq? x #t)) outputs-should-be-available) (hash-set! builds-ready-to-go-hash setup-failure-build-id #t) ;; At least one build for each missing input has been ;; successful, so delete the setup failure from the list of ;; setup failures in the hash (hash-set! setup-failures-hash setup-failure-build-id (delete setup-failure (hash-ref setup-failures-hash setup-failure-build-id)))))) setup-failures))) (hash-map->list (lambda (key value) key) setup-failures-hash)) (let ((result (map (lambda (agent-id) (log "considering builds for" agent-id) (let ((builds-sorted-by-derived-priority (sort-list (filter (filter-builds-for-agent agent-id) builds) (build-sorting-function-for-agent agent-id)))) (cons agent-id (if (null? builds-sorted-by-derived-priority) '() (let ((final-ordered-builds (concatenate (map sort-priority-sublist (limit-processed-sublists (break-builds-in-to-priority-sublists builds-sorted-by-derived-priority)))))) (let ((builds-for-agent (limit-planned-builds final-ordered-builds))) (map (lambda (build) (assq-ref build 'uuid)) builds-for-agent))))))) (map (lambda (agent) (assq-ref agent 'uuid)) agents)))) (log "finished") result))) (define* (derivation-ordered-build-allocation-strategy datastore #:key (planned-builds-for-agent-limit 256) (builds-created-after #f) ;; "datetime('now', '-60 days')") metrics-registry) (define (log . args) (when #f (simple-format #t "allocator: ~A\n" (string-join (map (lambda (arg) (simple-format #f "~A" arg)) args) " ")))) (define allocator-considered-builds-metric (when metrics-registry (let ((name "allocator_considered_builds")) (or (metrics-registry-fetch-metric metrics-registry name) (make-gauge-metric metrics-registry name #:labels '(system)))))) (let* ((agents (filter (lambda (agent) (assoc-ref agent 'active)) (datastore-list-agents datastore))) (agent-tags (map (lambda (agent-details) (let ((agent-id (assq-ref agent-details 'uuid))) (cons agent-id (datastore-fetch-agent-tags datastore agent-id)))) agents)) (requested-systems-by-agent (map (lambda (agent-details) (let ((agent-id (assq-ref agent-details 'uuid))) (cons agent-id (datastore-agent-requested-systems datastore agent-id)))) agents)) (setup-failures-hash (datastore-fetch-setup-failures datastore))) (let ((prioritised-builds (datastore-fetch-prioritised-unprocessed-builds datastore))) (define systems-for-builds ;; TODO Should be one query (let ((table (make-hash-table))) (for-each (lambda (build-id) (hash-set! table build-id (datastore-find-build-derivation-system datastore build-id))) prioritised-builds) table)) (define tags-for-build (let ((build-tags (make-hash-table))) (lambda (build-id) (let ((tags (hash-ref build-tags build-id))) (if (eq? #f tags) (let ((tags (datastore-fetch-build-tags datastore build-id))) (hash-set! build-tags build-id tags) tags) tags))))) (define (filter-builds-for-agent agent-id) (define requested-systems (assoc-ref requested-systems-by-agent agent-id)) (define (relevant-setup-failure? setup-failure) (log "setup failure:" setup-failure) (let ((failure-reason (assq-ref setup-failure 'failure-reason))) (cond ((string=? failure-reason "missing_inputs") ;; This problem might go away, but just don't try the same agent ;; again for now. (string=? (assq-ref setup-failure 'agent-id) agent-id)) ((string=? failure-reason "could_not_delete_outputs") ;; This problem might go away, but just don't try the same agent ;; again for now. (string=? (assq-ref setup-failure 'agent-id) agent-id)) ((string=? failure-reason "error_fetching_derivation") ;; This problem might go away, but just don't try the same agent ;; again for now. (string=? (assq-ref setup-failure 'agent-id) agent-id)) (else (error "Unknown setup failure " failure-reason))))) (lambda (build-id) (log "build:" build-id) (and (or (null? requested-systems) (let ((build-system (hash-ref systems-for-builds build-id))) (member build-system requested-systems))) (agent-tags-match-build-tags agent-tags tags-for-build agent-id build-id) (let* ((setup-failures-for-build (or (hash-ref setup-failures-hash build-id) '())) (relevant-setup-failures (filter relevant-setup-failure? setup-failures-for-build))) (log "relevant setup failures:" relevant-setup-failures) (if (null? relevant-setup-failures) #t #f))))) (when metrics-registry (let ((counts (hash-fold (lambda (_ system result) `(,@(alist-delete system result) (,system . ,(+ 1 (or (assoc-ref result system) 0))))) '() systems-for-builds))) (for-each (match-lambda ((system . count) (metric-set allocator-considered-builds-metric count #:label-values `((system . ,system))))) counts))) (let ((result (map (lambda (agent-id) (log "considering builds for" agent-id) (let* ((filter-proc (filter-builds-for-agent agent-id)) (build-ids (let loop ((count 0) (build-ids '()) (potential-build-ids prioritised-builds)) (if (or (and planned-builds-for-agent-limit (>= count planned-builds-for-agent-limit)) (null? potential-build-ids)) (reverse build-ids) ;; highest priority last, so ;; reverse (let ((potential-build (first potential-build-ids))) (if (filter-proc potential-build) (loop (+ 1 count) (cons potential-build build-ids) (cdr potential-build-ids)) (loop count build-ids (cdr potential-build-ids)))))))) (cons agent-id build-ids))) (map (lambda (agent) (assq-ref agent 'uuid)) agents)))) (log "finished") result))))