From 074b9d02f1ca01007f39adbc019763027a51d9bd Mon Sep 17 00:00:00 2001 From: Ludovic Courtès Date: Thu, 5 Apr 2018 22:17:45 +0200 Subject: base: Let sqlite handle deduplication of the list of pending derivations. Previously we would make a SQL query that would return many build jobs, and then call 'delete-duplicates' on that. This was extremely wasteful because the list of returned by the query was huge leading to a heap of several tens of GiB on a big database, and 'delete-duplicates' would lead to more GC and it would take ages. Furthermore, since 'delete-duplicates' is written in C as of Guile 2.2.3, it is uninterruptible from Fiber's viewpoint. Consequently, the kernel thread running the 'restart-builds' fiber would never schedule other fibers, which could lead to deadlocks--e.g., since fibers are scheduled on a circular shuffled list of kernel threads, once every N times, a web server fiber would be sent to that kernel thread and not be serviced. * src/cuirass/base.scm (shuffle-jobs): Remove. (shuffle-derivations): New procedure. (spawn-builds): Take a list of derivations instead of a list of jobs. (restart-builds): Remove 'builds' parameter. Remove 'delete-duplicates' call. Remove done/remaining partitioning. (build-packages): Adjust to pass 'spawn-builds' a list of derivations. * bin/cuirass.in (main): Remove computation of PENDING. Remove second parameter in call to 'restart-builds'. --- src/cuirass/base.scm | 82 ++++++++++++++++++---------------------------------- 1 file changed, 28 insertions(+), 54 deletions(-) (limited to 'src') diff --git a/src/cuirass/base.scm b/src/cuirass/base.scm index a96f640..c9c5ec1 100644 --- a/src/cuirass/base.scm +++ b/src/cuirass/base.scm @@ -362,15 +362,10 @@ Essentially this procedure inverts the inversion-of-control that ;;; Building packages. ;;; -(define (shuffle-jobs jobs) - "Shuffle JOBS, a list of job alists." +(define (shuffle-derivations drv) + "Shuffle DRV, a list of derivation file names." ;; Our shuffling algorithm is simple: we sort by .drv file name. :-) - (define (job count max-batch-size) - (split-at jobs max-batch-size) - (values jobs '()))) - ((drv) - (map (lambda (job) - (assq-ref job #:derivation)) - batch))) + (split-at drv max-batch-size) + (values drv '())))) (guard (c ((nix-protocol-error? c) (log-message "batch of builds (partially) failed:\ ~a (status: ~a)" (nix-protocol-error-message c) (nix-protocol-error-status c)))) - (log-message "building batch of ~a jobs (~a/~a)" + (log-message "building batch of ~a derivations (~a/~a)" max-batch-size (- total count) total) (let-values (((port finish) (build-derivations& store drv))) @@ -526,43 +516,26 @@ procedure is meant to be called at startup." (- (time-second (current-time time-utc)) age) ";")) -(define (restart-builds db builds) +(define (restart-builds db) "Restart builds whose status in DB is \"pending\" (scheduled or started)." (with-store store - (let*-values (((builds) - (delete-duplicates builds build-derivation=?)) - ((valid stale) - (partition (lambda (build) - (let ((drv (assq-ref build #:derivation))) - (valid-path? store drv))) - builds))) + ;; Note: On a big database, 'db-get-pending-derivations' can take a couple + ;; of minutes, hence 'non-blocking'. + (log-message "retrieving list of pending builds...") + (let*-values (((valid stale) + (partition (cut valid-path? store <>) + (non-blocking (db-get-pending-derivations db))))) ;; We cannot restart builds listed in STALE, so mark them as canceled. (log-message "canceling ~a stale builds" (length stale)) - (for-each (lambda (build) - (db-update-build-status! db (assq-ref build #:derivation) - (build-status canceled))) + (for-each (lambda (drv) + (db-update-build-status! db drv (build-status canceled))) stale) - ;; Those in VALID can be restarted, but some of them may actually be - ;; done already--either because our database is outdated, or because it - ;; was not built by Cuirass. - (let-values (((done remaining) - (partition (lambda (build) - (match (assq-ref build #:outputs) - (((name ((#:path . item))) _ ...) - (valid-path? store item)) - (_ #f))) - valid))) - (log-message "~a of the pending builds had actually completed" - (length done)) - (for-each (lambda (build) - (db-update-build-status! db (assq-ref build #:derivation) - (build-status succeeded))) - done) - - (log-message "restarting ~a pending builds" (length remaining)) - (spawn-builds store db remaining) - (log-message "done with restarted builds"))))) + ;; Those in VALID can be restarted. If some of them were built in the + ;; meantime behind our back, that's fine: 'spawn-builds' will DTRT. + (log-message "restarting ~a pending builds" (length valid)) + (spawn-builds store db valid) + (log-message "done with restarted builds")))) (define (build-packages store db jobs) "Build JOBS and return a list of Build results." @@ -595,7 +568,8 @@ procedure is meant to be called at startup." (define build-ids (map register jobs)) - (spawn-builds store db jobs) + (spawn-builds store db + (map (cut assq-ref <> #:derivation) jobs)) (let* ((results (filter-map (cut db-get-build db <>) build-ids)) (status (map (cut assq-ref <> #:status) results)) -- cgit v1.2.3