diff options
author | Christopher Baines <mail@cbaines.net> | 2019-08-04 09:39:40 +0100 |
---|---|---|
committer | Christopher Baines <mail@cbaines.net> | 2019-08-04 09:39:40 +0100 |
commit | 84197686ce227b21dae48745f2919a51e070c3c5 (patch) | |
tree | 0ed4cc11138628575e490b803e766e8749a337d6 | |
parent | 1724bc485f13922d768bc09309a6a7975678c321 (diff) | |
download | data-service-84197686ce227b21dae48745f2919a51e070c3c5.tar data-service-84197686ce227b21dae48745f2919a51e070c3c5.tar.gz |
Fix some duplicated values in tables
The licenses table, along with the package_metadata table had duplicate
values. This could happen as the unique constraints on those tables didn't
properly account for the nullable fields.
The duplicates in those tables also affected the license_sets, packages,
package_derivations tables in a similar way. Finally, the
guix_revision_package_derivations table was also affected.
This commit adds a migration to fix the data, as well as the constraints. THe
code to populate the licenses and package_metadata tables is also updated.
-rw-r--r-- | guix-data-service/model/license.scm | 57 | ||||
-rw-r--r-- | guix-data-service/model/package-metadata.scm | 22 | ||||
-rw-r--r-- | guix-data-service/model/utils.scm | 8 | ||||
-rw-r--r-- | sqitch/deploy/fix_duplicated_licenses.sql | 218 | ||||
-rw-r--r-- | sqitch/revert/fix_duplicated_licenses.sql | 7 | ||||
-rw-r--r-- | sqitch/sqitch.plan | 1 | ||||
-rw-r--r-- | sqitch/verify/fix_duplicated_licenses.sql | 7 | ||||
-rw-r--r-- | tests/model-license-set.scm | 6 | ||||
-rw-r--r-- | tests/model-license.scm | 7 | ||||
-rw-r--r-- | tests/model-package-metadata.scm | 18 |
10 files changed, 313 insertions, 38 deletions
diff --git a/guix-data-service/model/license.scm b/guix-data-service/model/license.scm index 0b28343..73df612 100644 --- a/guix-data-service/model/license.scm +++ b/guix-data-service/model/license.scm @@ -43,28 +43,6 @@ (inferior-eval '(use-modules (guix licenses)) inf) (inferior-eval (proc packages) inf)) -(define (select-licenses license-values) - (string-append - "SELECT id, licenses.name, licenses.uri, licenses.comment " - "FROM licenses " - "JOIN (VALUES " - (string-join - (map (lambda (values) - (string-append - "(" - (string-join - (map value->quoted-string-or-null - values) - ", ") - ")")) - license-values) - ", ") - ") AS vals (name, uri, comment) " - "ON " - "licenses.name = vals.name AND " - "licenses.uri = vals.uri AND " - "licenses.comment = vals.comment")) - (define (insert-licenses values) (string-append "INSERT INTO licenses " @@ -91,15 +69,39 @@ (map number->string (sort (map string->number ids) <))) + (define (non-string-to-false lst) + (map (lambda (value) + (if (string? value) + value + #f)) + lst)) + + (define (empty-string-to-false lst) + ;; TODO squee returns empty strings for null values, which will probably + ;; cause problems + (map (lambda (value) + (if (string? value) + (if (string-null? value) + #f + value) + value)) + lst)) + (let* ((unique-license-tuples (filter (lambda (license-tuple) (not (null? license-tuple))) (delete-duplicates - (concatenate license-data)))) + (map + (lambda (lst) + (non-string-to-false + (empty-string-to-false lst))) + (concatenate license-data))))) (existing-license-entries (exec-query->vhash conn - (select-licenses unique-license-tuples) - cdr + "SELECT id, name, uri, comment FROM licenses" + (lambda (vals) + (non-string-to-false + (empty-string-to-false (cdr vals)))) first)) ;; id (missing-license-entries (delete-duplicates @@ -128,5 +130,8 @@ (begin (error "missing license entry" license-values))))) - license-value-lists))) + (map (lambda (lst) + (non-string-to-false + (empty-string-to-false lst))) + license-value-lists)))) license-data))) diff --git a/guix-data-service/model/package-metadata.scm b/guix-data-service/model/package-metadata.scm index 37a8da8..178d783 100644 --- a/guix-data-service/model/package-metadata.scm +++ b/guix-data-service/model/package-metadata.scm @@ -52,8 +52,14 @@ "ON " (string-join (map (lambda (field) - (string-append - "package_metadata." field " = vals." field)) + (if (member field '("home_page" "location_id" + "license_set_id")) + (string-append + "(package_metadata." field " = vals." field " OR " + "(package_metadata." field " IS NULL AND" + " vals." field " IS NULL))") + (string-append + "package_metadata." field " = vals." field))) fields) " AND "))) @@ -129,7 +135,8 @@ WHERE packages.id IN ( (map (lambda (package license-set-id) (list (inferior-package-synopsis package) (inferior-package-description package) - (inferior-package-home-page package) + (non-empty-string-or-false + (inferior-package-home-page package)) (location->location-id conn (inferior-package-location package)) @@ -140,7 +147,14 @@ WHERE packages.id IN ( (let* ((existing-package-metadata-entries (exec-query->vhash conn (select-package-metadata package-metadata) - cdr + (match-lambda + ((id synopsis description home-page + location-id license-set-id) + (list synopsis + description + (non-empty-string-or-false home-page) + location-id + license-set-id))) first)) ;; id)) (missing-package-metadata-entries (delete-duplicates diff --git a/guix-data-service/model/utils.scm b/guix-data-service/model/utils.scm index 1ece31a..0ce9fea 100644 --- a/guix-data-service/model/utils.scm +++ b/guix-data-service/model/utils.scm @@ -5,6 +5,7 @@ #:use-module (squee) #:export (quote-string value->quoted-string-or-null + non-empty-string-or-false exec-query->vhash two-lists->vhash deduplicate-strings @@ -18,6 +19,13 @@ (string-append "$STR$" value "$STR$") "NULL")) +(define (non-empty-string-or-false s) + (if (string? s) + (if (string-null? s) + #f + s) + #f)) + (define (exec-query->vhash conn query field-function value-function) (fold (lambda (row result) (vhash-cons (field-function row) diff --git a/sqitch/deploy/fix_duplicated_licenses.sql b/sqitch/deploy/fix_duplicated_licenses.sql new file mode 100644 index 0000000..33820ca --- /dev/null +++ b/sqitch/deploy/fix_duplicated_licenses.sql @@ -0,0 +1,218 @@ +-- Deploy guix-data-service:fix_duplicated_licenses to pg + +BEGIN; + +SET CONSTRAINTS ALL DEFERRED; + +-- Remove unique constraint from license_sets + +ALTER TABLE license_sets DROP CONSTRAINT license_sets_pkey; + +-- Change all license sets to refer to canonical licenses + +UPDATE license_sets AS master SET license_ids = ARRAY( + SELECT new_licenses_2.id FROM ( + SELECT a.elem AS id, a.nr AS index + FROM license_sets, unnest(license_sets.license_ids) + WITH ORDINALITY a(elem, nr) + WHERE id = master.id + ) AS old_licenses + INNER JOIN ( + SELECT licenses.id AS old_id, new_licenses.* + FROM licenses INNER JOIN ( + SELECT MIN(id) AS id, name, uri, comment + FROM licenses + GROUP BY name, uri, comment + ORDER BY name + ) AS new_licenses + ON licenses.name = new_licenses.name AND + (licenses.uri = new_licenses.uri OR ( + licenses.uri IS NULL AND new_licenses.uri IS NULL + )) AND ( + licenses.comment = new_licenses.comment OR ( + licenses.comment IS NULL AND new_licenses.comment IS NULL + )) + ) AS new_licenses_2 + ON old_licenses.id = new_licenses_2.old_id + ORDER BY old_licenses.index); + +-- Remove unique constraint from package_metadata + +ALTER TABLE package_metadata DROP CONSTRAINT package_metadata_unique_fields; + +-- Update package_metadata to refer to canonical license_sets + +WITH data AS ( + SELECT MIN(id) AS id, ARRAY_AGG(id) AS old_ids + FROM license_sets + GROUP BY license_ids +) +UPDATE package_metadata AS master +SET license_set_id = data.id +FROM data +WHERE license_set_id = ANY(data.old_ids); + +-- Remove unique constraint from packages + +ALTER TABLE packages DROP CONSTRAINT packages_pkey; + +-- Update packages to refer to canonical package_metadata entries + +WITH data AS ( + SELECT MIN(package_metadata.id) AS id, ARRAY_AGG(package_metadata.id) AS old_ids + FROM package_metadata + GROUP BY package_metadata.synopsis, package_metadata.description, + package_metadata.home_page, package_metadata.location_id, + package_metadata.license_set_id + HAVING COUNT(package_metadata.id) > 1 +) +UPDATE packages SET package_metadata_id = data.id +FROM data +WHERE package_metadata_id = ANY(data.old_ids); + +-- Remove unique constraint from package_derivations + +ALTER TABLE package_derivations DROP CONSTRAINT package_derivations_pkey; + +-- Update package_derivations to refer to canonical packages entries + +WITH data AS ( + SELECT unnest(old_ids) AS old, id FROM ( + SELECT MIN(packages.id) AS id, ARRAY_AGG(packages.id) AS old_ids + FROM packages + GROUP BY name, version, package_metadata_id + HAVING COUNT(id) > 1 + ) AS d2 +) +UPDATE package_derivations SET package_id = data.id +FROM data +WHERE package_id = data.old; + +-- Update guix_revision_package_derivations to refer to canonical +-- package_derivations entries + +WITH data AS ( + SELECT unnest(old_ids) AS old, id FROM ( + SELECT MIN(package_derivations.id) AS id, ARRAY_AGG(package_derivations.id) AS old_ids + FROM package_derivations + GROUP BY package_id, derivation_id, system, target + HAVING COUNT(id) > 1 + ) AS d2 +) +UPDATE guix_revision_package_derivations SET package_derivation_id = data.id +FROM data +WHERE package_derivation_id = data.old; + +-- Drop the foreign key constraint as an attempt to speed up deleting from +-- package_derivations. + +ALTER TABLE guix_revision_package_derivations + DROP CONSTRAINT guix_revision_package_derivations_package_derivation_id_fkey; + +-- Delete non-canonical package_dervations entries + +DELETE FROM package_derivations AS pd WHERE id NOT IN ( + SELECT MIN(id) + FROM package_derivations + GROUP BY ( + package_id, + derivation_id, + system, + target + ) +); + +-- Reinstate the deleted constraint + +ALTER TABLE guix_revision_package_derivations + ADD CONSTRAINT guix_revision_package_derivations_package_derivation_id_fkey + FOREIGN KEY (package_derivation_id) REFERENCES package_derivations(id); + +-- Delete non-canonical packages entries + +DELETE FROM packages AS p WHERE id NOT IN ( + SELECT MIN(id) + FROM packages + GROUP BY (name, version, package_metadata_id) +); + +-- Add referential constraints + +ALTER TABLE package_derivations + ADD CONSTRAINT package_derivations_package_id_fkey + FOREIGN KEY (package_id) REFERENCES packages (id); + +ALTER TABLE package_derivations + ADD CONSTRAINT package_derivations_derivation_id_fkey + FOREIGN KEY (derivation_id) REFERENCES derivations (id); + +-- Delete non-canonical package_metadata entries + +ALTER TABLE packages DROP CONSTRAINT package_metadata_id; + +DELETE FROM package_metadata AS pm WHERE id NOT IN ( + SELECT MIN(id) + FROM package_metadata + GROUP BY (synopsis, description, home_page, location_id, license_set_id) +); + +ALTER TABLE packages ADD CONSTRAINT package_metadata_id + FOREIGN KEY (package_metadata_id) REFERENCES package_metadata(id); + +-- Delete non-canonical license_sets entries + +ALTER TABLE package_metadata DROP CONSTRAINT package_metadata_license_set_id_fkey; + +DELETE FROM license_sets AS ls WHERE id NOT IN ( + SELECT MIN(id) + FROM license_sets + GROUP BY license_ids +); + +ALTER TABLE package_metadata ADD CONSTRAINT package_metadata_license_set_id_fkey + FOREIGN KEY (license_set_id) REFERENCES license_sets(id); + +-- Delete non-canonical licenses entries + +DELETE FROM licenses AS l WHERE id NOT IN ( + SELECT MIN(id) + FROM licenses + GROUP BY (name, uri, comment) +); + +-- Restore unique constraints + +CREATE UNIQUE INDEX ON licenses (name) + WHERE uri IS NULL AND comment IS NULL; +CREATE UNIQUE INDEX ON licenses (name, uri) + WHERE uri IS NOT NULL AND comment IS NULL; +CREATE UNIQUE INDEX ON licenses (name, comment) + WHERE uri IS NULL AND comment IS NOT NULL; +CREATE UNIQUE INDEX ON licenses (name, uri, comment) + WHERE uri IS NOT NULL AND comment IS NOT NULL; + +ALTER TABLE license_sets ADD PRIMARY KEY (license_ids); + +ALTER TABLE package_metadata ALTER synopsis SET NOT NULL; +ALTER TABLE package_metadata ALTER description SET NOT NULL; + +CREATE UNIQUE INDEX ON package_metadata ( + synopsis, + description, + coalesce(location_id, -1), + coalesce(license_set_id, -1) +) WHERE home_page IS NULL; + +CREATE UNIQUE INDEX ON package_metadata ( + synopsis, + description, + home_page, + coalesce(location_id, -1), + coalesce(license_set_id, -1) +) WHERE home_page IS NOT NULL; + +ALTER TABLE packages ADD PRIMARY KEY (name, version, package_metadata_id); + +ALTER TABLE package_derivations ADD PRIMARY KEY (package_id, derivation_id, system, target); + +COMMIT; diff --git a/sqitch/revert/fix_duplicated_licenses.sql b/sqitch/revert/fix_duplicated_licenses.sql new file mode 100644 index 0000000..cec3d71 --- /dev/null +++ b/sqitch/revert/fix_duplicated_licenses.sql @@ -0,0 +1,7 @@ +-- Revert guix-data-service:fix_duplicated_licenses from pg + +BEGIN; + +-- XXX Add DDLs here. + +COMMIT; diff --git a/sqitch/sqitch.plan b/sqitch/sqitch.plan index 7a37548..bd513cc 100644 --- a/sqitch/sqitch.plan +++ b/sqitch/sqitch.plan @@ -15,3 +15,4 @@ dates_to_load_new_guix_revision_jobs 2019-06-02T07:39:49Z Christopher Baines <ma load_new_guix_revision_job_events 2019-06-02T15:44:41Z Christopher Baines <mail@cbaines.net> # Add new table for guix_revision_job_events load_new_guix_revision_job_logs 2019-06-21T14:33:09Z chris <chris@phact> # Add load_new_guix_revision_job_logs change_load_new_guix_revision_job_logs_contents_to_be_nullable 2019-07-07T20:10:54Z Christopher Baines <mail@cbaines.net> # Change the contents field in the load_new_guix_revision_job_logs table\nto be nullable.\n\nwith '#' will # be ignored, and an empty message aborts the add. #\nChange to add: # #\nchange_load_new_guix_revision_job_logs_contents_to_be_nullable #\nsqitch/deploy/change_load_new_guix_revision_job_logs_contents_to_be_nullable.sql\nsqitch/revert/change_load_new_guix_revision_job_logs_contents_to_be_nullable.sql\nsqitch/verify/change_load_new_guix_revision_job_logs_contents_to_be_nullable.sql +fix_duplicated_licenses 2019-07-30T05:48:17Z Christopher Baines <mail@cbaines.net> # Fix duplicated licenses, and add constraints diff --git a/sqitch/verify/fix_duplicated_licenses.sql b/sqitch/verify/fix_duplicated_licenses.sql new file mode 100644 index 0000000..47379eb --- /dev/null +++ b/sqitch/verify/fix_duplicated_licenses.sql @@ -0,0 +1,7 @@ +-- Verify guix-data-service:fix_duplicated_licenses on pg + +BEGIN; + +-- XXX Add verifications here. + +ROLLBACK; diff --git a/tests/model-license-set.scm b/tests/model-license-set.scm index 9cd949d..c39171f 100644 --- a/tests/model-license-set.scm +++ b/tests/model-license-set.scm @@ -19,10 +19,10 @@ "https://example.com/why-license-1")) (("License 1" "https://gnu.org/licenses/test-1.html" - "https://example.com/why-license-1") + #f) ("License 2" - "https://gnu.org/licenses/test-2.html" - "https://example.com/why-license-2"))))) + #f + #f))))) (with-postgresql-connection "test-model-license-set" diff --git a/tests/model-license.scm b/tests/model-license.scm index 7888117..8f4b0c3 100644 --- a/tests/model-license.scm +++ b/tests/model-license.scm @@ -17,10 +17,13 @@ "https://example.com/why-license-1")) (("License 1" "https://gnu.org/licenses/test-1.html" - "https://example.com/why-license-1") + #f) ("License 2" "https://gnu.org/licenses/test-2.html" - "https://example.com/why-license-2"))))) + #f) + ("License 3" + #f + #f))))) (with-postgresql-connection "test-model-license" diff --git a/tests/model-package-metadata.scm b/tests/model-package-metadata.scm index ea0cdbe..015a0b2 100644 --- a/tests/model-package-metadata.scm +++ b/tests/model-package-metadata.scm @@ -18,6 +18,15 @@ (home-page "https://example.com") (location (location "file.scm" 5 0)))) +(define mock-inferior-package-foo-2 + (mock-inferior-package + (name "foo") + (version "2") + (synopsis "Foo") + (description "Foo description") + (home-page #f) + (location #f))) + (define (test-license-set-ids conn) (mock ((guix-data-service model license) @@ -46,7 +55,8 @@ (match (inferior-packages->package-metadata-ids conn - (list mock-inferior-package-foo) + (list mock-inferior-package-foo + mock-inferior-package-foo-2) (test-license-set-ids conn)) ((x) (string? x)))) #:always-rollback? #t)) @@ -57,11 +67,13 @@ (test-equal "inferior-packages->package-metadata-ids" (inferior-packages->package-metadata-ids conn - (list mock-inferior-package-foo) + (list mock-inferior-package-foo + mock-inferior-package-foo-2) (test-license-set-ids conn)) (inferior-packages->package-metadata-ids conn - (list mock-inferior-package-foo) + (list mock-inferior-package-foo + mock-inferior-package-foo-2) (test-license-set-ids conn))) #:always-rollback? #t)))))) |