aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristopher Baines <mail@cbaines.net>2019-08-04 09:39:40 +0100
committerChristopher Baines <mail@cbaines.net>2019-08-04 09:39:40 +0100
commit84197686ce227b21dae48745f2919a51e070c3c5 (patch)
tree0ed4cc11138628575e490b803e766e8749a337d6
parent1724bc485f13922d768bc09309a6a7975678c321 (diff)
downloaddata-service-84197686ce227b21dae48745f2919a51e070c3c5.tar
data-service-84197686ce227b21dae48745f2919a51e070c3c5.tar.gz
Fix some duplicated values in tables
The licenses table, along with the package_metadata table had duplicate values. This could happen as the unique constraints on those tables didn't properly account for the nullable fields. The duplicates in those tables also affected the license_sets, packages, package_derivations tables in a similar way. Finally, the guix_revision_package_derivations table was also affected. This commit adds a migration to fix the data, as well as the constraints. THe code to populate the licenses and package_metadata tables is also updated.
-rw-r--r--guix-data-service/model/license.scm57
-rw-r--r--guix-data-service/model/package-metadata.scm22
-rw-r--r--guix-data-service/model/utils.scm8
-rw-r--r--sqitch/deploy/fix_duplicated_licenses.sql218
-rw-r--r--sqitch/revert/fix_duplicated_licenses.sql7
-rw-r--r--sqitch/sqitch.plan1
-rw-r--r--sqitch/verify/fix_duplicated_licenses.sql7
-rw-r--r--tests/model-license-set.scm6
-rw-r--r--tests/model-license.scm7
-rw-r--r--tests/model-package-metadata.scm18
10 files changed, 313 insertions, 38 deletions
diff --git a/guix-data-service/model/license.scm b/guix-data-service/model/license.scm
index 0b28343..73df612 100644
--- a/guix-data-service/model/license.scm
+++ b/guix-data-service/model/license.scm
@@ -43,28 +43,6 @@
(inferior-eval '(use-modules (guix licenses)) inf)
(inferior-eval (proc packages) inf))
-(define (select-licenses license-values)
- (string-append
- "SELECT id, licenses.name, licenses.uri, licenses.comment "
- "FROM licenses "
- "JOIN (VALUES "
- (string-join
- (map (lambda (values)
- (string-append
- "("
- (string-join
- (map value->quoted-string-or-null
- values)
- ", ")
- ")"))
- license-values)
- ", ")
- ") AS vals (name, uri, comment) "
- "ON "
- "licenses.name = vals.name AND "
- "licenses.uri = vals.uri AND "
- "licenses.comment = vals.comment"))
-
(define (insert-licenses values)
(string-append
"INSERT INTO licenses "
@@ -91,15 +69,39 @@
(map number->string
(sort (map string->number ids) <)))
+ (define (non-string-to-false lst)
+ (map (lambda (value)
+ (if (string? value)
+ value
+ #f))
+ lst))
+
+ (define (empty-string-to-false lst)
+ ;; TODO squee returns empty strings for null values, which will probably
+ ;; cause problems
+ (map (lambda (value)
+ (if (string? value)
+ (if (string-null? value)
+ #f
+ value)
+ value))
+ lst))
+
(let* ((unique-license-tuples
(filter (lambda (license-tuple)
(not (null? license-tuple)))
(delete-duplicates
- (concatenate license-data))))
+ (map
+ (lambda (lst)
+ (non-string-to-false
+ (empty-string-to-false lst)))
+ (concatenate license-data)))))
(existing-license-entries
(exec-query->vhash conn
- (select-licenses unique-license-tuples)
- cdr
+ "SELECT id, name, uri, comment FROM licenses"
+ (lambda (vals)
+ (non-string-to-false
+ (empty-string-to-false (cdr vals))))
first)) ;; id
(missing-license-entries
(delete-duplicates
@@ -128,5 +130,8 @@
(begin
(error "missing license entry"
license-values)))))
- license-value-lists)))
+ (map (lambda (lst)
+ (non-string-to-false
+ (empty-string-to-false lst)))
+ license-value-lists))))
license-data)))
diff --git a/guix-data-service/model/package-metadata.scm b/guix-data-service/model/package-metadata.scm
index 37a8da8..178d783 100644
--- a/guix-data-service/model/package-metadata.scm
+++ b/guix-data-service/model/package-metadata.scm
@@ -52,8 +52,14 @@
"ON "
(string-join
(map (lambda (field)
- (string-append
- "package_metadata." field " = vals." field))
+ (if (member field '("home_page" "location_id"
+ "license_set_id"))
+ (string-append
+ "(package_metadata." field " = vals." field " OR "
+ "(package_metadata." field " IS NULL AND"
+ " vals." field " IS NULL))")
+ (string-append
+ "package_metadata." field " = vals." field)))
fields)
" AND ")))
@@ -129,7 +135,8 @@ WHERE packages.id IN (
(map (lambda (package license-set-id)
(list (inferior-package-synopsis package)
(inferior-package-description package)
- (inferior-package-home-page package)
+ (non-empty-string-or-false
+ (inferior-package-home-page package))
(location->location-id
conn
(inferior-package-location package))
@@ -140,7 +147,14 @@ WHERE packages.id IN (
(let* ((existing-package-metadata-entries
(exec-query->vhash conn
(select-package-metadata package-metadata)
- cdr
+ (match-lambda
+ ((id synopsis description home-page
+ location-id license-set-id)
+ (list synopsis
+ description
+ (non-empty-string-or-false home-page)
+ location-id
+ license-set-id)))
first)) ;; id))
(missing-package-metadata-entries
(delete-duplicates
diff --git a/guix-data-service/model/utils.scm b/guix-data-service/model/utils.scm
index 1ece31a..0ce9fea 100644
--- a/guix-data-service/model/utils.scm
+++ b/guix-data-service/model/utils.scm
@@ -5,6 +5,7 @@
#:use-module (squee)
#:export (quote-string
value->quoted-string-or-null
+ non-empty-string-or-false
exec-query->vhash
two-lists->vhash
deduplicate-strings
@@ -18,6 +19,13 @@
(string-append "$STR$" value "$STR$")
"NULL"))
+(define (non-empty-string-or-false s)
+ (if (string? s)
+ (if (string-null? s)
+ #f
+ s)
+ #f))
+
(define (exec-query->vhash conn query field-function value-function)
(fold (lambda (row result)
(vhash-cons (field-function row)
diff --git a/sqitch/deploy/fix_duplicated_licenses.sql b/sqitch/deploy/fix_duplicated_licenses.sql
new file mode 100644
index 0000000..33820ca
--- /dev/null
+++ b/sqitch/deploy/fix_duplicated_licenses.sql
@@ -0,0 +1,218 @@
+-- Deploy guix-data-service:fix_duplicated_licenses to pg
+
+BEGIN;
+
+SET CONSTRAINTS ALL DEFERRED;
+
+-- Remove unique constraint from license_sets
+
+ALTER TABLE license_sets DROP CONSTRAINT license_sets_pkey;
+
+-- Change all license sets to refer to canonical licenses
+
+UPDATE license_sets AS master SET license_ids = ARRAY(
+ SELECT new_licenses_2.id FROM (
+ SELECT a.elem AS id, a.nr AS index
+ FROM license_sets, unnest(license_sets.license_ids)
+ WITH ORDINALITY a(elem, nr)
+ WHERE id = master.id
+ ) AS old_licenses
+ INNER JOIN (
+ SELECT licenses.id AS old_id, new_licenses.*
+ FROM licenses INNER JOIN (
+ SELECT MIN(id) AS id, name, uri, comment
+ FROM licenses
+ GROUP BY name, uri, comment
+ ORDER BY name
+ ) AS new_licenses
+ ON licenses.name = new_licenses.name AND
+ (licenses.uri = new_licenses.uri OR (
+ licenses.uri IS NULL AND new_licenses.uri IS NULL
+ )) AND (
+ licenses.comment = new_licenses.comment OR (
+ licenses.comment IS NULL AND new_licenses.comment IS NULL
+ ))
+ ) AS new_licenses_2
+ ON old_licenses.id = new_licenses_2.old_id
+ ORDER BY old_licenses.index);
+
+-- Remove unique constraint from package_metadata
+
+ALTER TABLE package_metadata DROP CONSTRAINT package_metadata_unique_fields;
+
+-- Update package_metadata to refer to canonical license_sets
+
+WITH data AS (
+ SELECT MIN(id) AS id, ARRAY_AGG(id) AS old_ids
+ FROM license_sets
+ GROUP BY license_ids
+)
+UPDATE package_metadata AS master
+SET license_set_id = data.id
+FROM data
+WHERE license_set_id = ANY(data.old_ids);
+
+-- Remove unique constraint from packages
+
+ALTER TABLE packages DROP CONSTRAINT packages_pkey;
+
+-- Update packages to refer to canonical package_metadata entries
+
+WITH data AS (
+ SELECT MIN(package_metadata.id) AS id, ARRAY_AGG(package_metadata.id) AS old_ids
+ FROM package_metadata
+ GROUP BY package_metadata.synopsis, package_metadata.description,
+ package_metadata.home_page, package_metadata.location_id,
+ package_metadata.license_set_id
+ HAVING COUNT(package_metadata.id) > 1
+)
+UPDATE packages SET package_metadata_id = data.id
+FROM data
+WHERE package_metadata_id = ANY(data.old_ids);
+
+-- Remove unique constraint from package_derivations
+
+ALTER TABLE package_derivations DROP CONSTRAINT package_derivations_pkey;
+
+-- Update package_derivations to refer to canonical packages entries
+
+WITH data AS (
+ SELECT unnest(old_ids) AS old, id FROM (
+ SELECT MIN(packages.id) AS id, ARRAY_AGG(packages.id) AS old_ids
+ FROM packages
+ GROUP BY name, version, package_metadata_id
+ HAVING COUNT(id) > 1
+ ) AS d2
+)
+UPDATE package_derivations SET package_id = data.id
+FROM data
+WHERE package_id = data.old;
+
+-- Update guix_revision_package_derivations to refer to canonical
+-- package_derivations entries
+
+WITH data AS (
+ SELECT unnest(old_ids) AS old, id FROM (
+ SELECT MIN(package_derivations.id) AS id, ARRAY_AGG(package_derivations.id) AS old_ids
+ FROM package_derivations
+ GROUP BY package_id, derivation_id, system, target
+ HAVING COUNT(id) > 1
+ ) AS d2
+)
+UPDATE guix_revision_package_derivations SET package_derivation_id = data.id
+FROM data
+WHERE package_derivation_id = data.old;
+
+-- Drop the foreign key constraint as an attempt to speed up deleting from
+-- package_derivations.
+
+ALTER TABLE guix_revision_package_derivations
+ DROP CONSTRAINT guix_revision_package_derivations_package_derivation_id_fkey;
+
+-- Delete non-canonical package_dervations entries
+
+DELETE FROM package_derivations AS pd WHERE id NOT IN (
+ SELECT MIN(id)
+ FROM package_derivations
+ GROUP BY (
+ package_id,
+ derivation_id,
+ system,
+ target
+ )
+);
+
+-- Reinstate the deleted constraint
+
+ALTER TABLE guix_revision_package_derivations
+ ADD CONSTRAINT guix_revision_package_derivations_package_derivation_id_fkey
+ FOREIGN KEY (package_derivation_id) REFERENCES package_derivations(id);
+
+-- Delete non-canonical packages entries
+
+DELETE FROM packages AS p WHERE id NOT IN (
+ SELECT MIN(id)
+ FROM packages
+ GROUP BY (name, version, package_metadata_id)
+);
+
+-- Add referential constraints
+
+ALTER TABLE package_derivations
+ ADD CONSTRAINT package_derivations_package_id_fkey
+ FOREIGN KEY (package_id) REFERENCES packages (id);
+
+ALTER TABLE package_derivations
+ ADD CONSTRAINT package_derivations_derivation_id_fkey
+ FOREIGN KEY (derivation_id) REFERENCES derivations (id);
+
+-- Delete non-canonical package_metadata entries
+
+ALTER TABLE packages DROP CONSTRAINT package_metadata_id;
+
+DELETE FROM package_metadata AS pm WHERE id NOT IN (
+ SELECT MIN(id)
+ FROM package_metadata
+ GROUP BY (synopsis, description, home_page, location_id, license_set_id)
+);
+
+ALTER TABLE packages ADD CONSTRAINT package_metadata_id
+ FOREIGN KEY (package_metadata_id) REFERENCES package_metadata(id);
+
+-- Delete non-canonical license_sets entries
+
+ALTER TABLE package_metadata DROP CONSTRAINT package_metadata_license_set_id_fkey;
+
+DELETE FROM license_sets AS ls WHERE id NOT IN (
+ SELECT MIN(id)
+ FROM license_sets
+ GROUP BY license_ids
+);
+
+ALTER TABLE package_metadata ADD CONSTRAINT package_metadata_license_set_id_fkey
+ FOREIGN KEY (license_set_id) REFERENCES license_sets(id);
+
+-- Delete non-canonical licenses entries
+
+DELETE FROM licenses AS l WHERE id NOT IN (
+ SELECT MIN(id)
+ FROM licenses
+ GROUP BY (name, uri, comment)
+);
+
+-- Restore unique constraints
+
+CREATE UNIQUE INDEX ON licenses (name)
+ WHERE uri IS NULL AND comment IS NULL;
+CREATE UNIQUE INDEX ON licenses (name, uri)
+ WHERE uri IS NOT NULL AND comment IS NULL;
+CREATE UNIQUE INDEX ON licenses (name, comment)
+ WHERE uri IS NULL AND comment IS NOT NULL;
+CREATE UNIQUE INDEX ON licenses (name, uri, comment)
+ WHERE uri IS NOT NULL AND comment IS NOT NULL;
+
+ALTER TABLE license_sets ADD PRIMARY KEY (license_ids);
+
+ALTER TABLE package_metadata ALTER synopsis SET NOT NULL;
+ALTER TABLE package_metadata ALTER description SET NOT NULL;
+
+CREATE UNIQUE INDEX ON package_metadata (
+ synopsis,
+ description,
+ coalesce(location_id, -1),
+ coalesce(license_set_id, -1)
+) WHERE home_page IS NULL;
+
+CREATE UNIQUE INDEX ON package_metadata (
+ synopsis,
+ description,
+ home_page,
+ coalesce(location_id, -1),
+ coalesce(license_set_id, -1)
+) WHERE home_page IS NOT NULL;
+
+ALTER TABLE packages ADD PRIMARY KEY (name, version, package_metadata_id);
+
+ALTER TABLE package_derivations ADD PRIMARY KEY (package_id, derivation_id, system, target);
+
+COMMIT;
diff --git a/sqitch/revert/fix_duplicated_licenses.sql b/sqitch/revert/fix_duplicated_licenses.sql
new file mode 100644
index 0000000..cec3d71
--- /dev/null
+++ b/sqitch/revert/fix_duplicated_licenses.sql
@@ -0,0 +1,7 @@
+-- Revert guix-data-service:fix_duplicated_licenses from pg
+
+BEGIN;
+
+-- XXX Add DDLs here.
+
+COMMIT;
diff --git a/sqitch/sqitch.plan b/sqitch/sqitch.plan
index 7a37548..bd513cc 100644
--- a/sqitch/sqitch.plan
+++ b/sqitch/sqitch.plan
@@ -15,3 +15,4 @@ dates_to_load_new_guix_revision_jobs 2019-06-02T07:39:49Z Christopher Baines <ma
load_new_guix_revision_job_events 2019-06-02T15:44:41Z Christopher Baines <mail@cbaines.net> # Add new table for guix_revision_job_events
load_new_guix_revision_job_logs 2019-06-21T14:33:09Z chris <chris@phact> # Add load_new_guix_revision_job_logs
change_load_new_guix_revision_job_logs_contents_to_be_nullable 2019-07-07T20:10:54Z Christopher Baines <mail@cbaines.net> # Change the contents field in the load_new_guix_revision_job_logs table\nto be nullable.\n\nwith '#' will # be ignored, and an empty message aborts the add. #\nChange to add: # #\nchange_load_new_guix_revision_job_logs_contents_to_be_nullable #\nsqitch/deploy/change_load_new_guix_revision_job_logs_contents_to_be_nullable.sql\nsqitch/revert/change_load_new_guix_revision_job_logs_contents_to_be_nullable.sql\nsqitch/verify/change_load_new_guix_revision_job_logs_contents_to_be_nullable.sql
+fix_duplicated_licenses 2019-07-30T05:48:17Z Christopher Baines <mail@cbaines.net> # Fix duplicated licenses, and add constraints
diff --git a/sqitch/verify/fix_duplicated_licenses.sql b/sqitch/verify/fix_duplicated_licenses.sql
new file mode 100644
index 0000000..47379eb
--- /dev/null
+++ b/sqitch/verify/fix_duplicated_licenses.sql
@@ -0,0 +1,7 @@
+-- Verify guix-data-service:fix_duplicated_licenses on pg
+
+BEGIN;
+
+-- XXX Add verifications here.
+
+ROLLBACK;
diff --git a/tests/model-license-set.scm b/tests/model-license-set.scm
index 9cd949d..c39171f 100644
--- a/tests/model-license-set.scm
+++ b/tests/model-license-set.scm
@@ -19,10 +19,10 @@
"https://example.com/why-license-1"))
(("License 1"
"https://gnu.org/licenses/test-1.html"
- "https://example.com/why-license-1")
+ #f)
("License 2"
- "https://gnu.org/licenses/test-2.html"
- "https://example.com/why-license-2")))))
+ #f
+ #f)))))
(with-postgresql-connection
"test-model-license-set"
diff --git a/tests/model-license.scm b/tests/model-license.scm
index 7888117..8f4b0c3 100644
--- a/tests/model-license.scm
+++ b/tests/model-license.scm
@@ -17,10 +17,13 @@
"https://example.com/why-license-1"))
(("License 1"
"https://gnu.org/licenses/test-1.html"
- "https://example.com/why-license-1")
+ #f)
("License 2"
"https://gnu.org/licenses/test-2.html"
- "https://example.com/why-license-2")))))
+ #f)
+ ("License 3"
+ #f
+ #f)))))
(with-postgresql-connection
"test-model-license"
diff --git a/tests/model-package-metadata.scm b/tests/model-package-metadata.scm
index ea0cdbe..015a0b2 100644
--- a/tests/model-package-metadata.scm
+++ b/tests/model-package-metadata.scm
@@ -18,6 +18,15 @@
(home-page "https://example.com")
(location (location "file.scm" 5 0))))
+(define mock-inferior-package-foo-2
+ (mock-inferior-package
+ (name "foo")
+ (version "2")
+ (synopsis "Foo")
+ (description "Foo description")
+ (home-page #f)
+ (location #f)))
+
(define (test-license-set-ids conn)
(mock
((guix-data-service model license)
@@ -46,7 +55,8 @@
(match
(inferior-packages->package-metadata-ids
conn
- (list mock-inferior-package-foo)
+ (list mock-inferior-package-foo
+ mock-inferior-package-foo-2)
(test-license-set-ids conn))
((x) (string? x))))
#:always-rollback? #t))
@@ -57,11 +67,13 @@
(test-equal "inferior-packages->package-metadata-ids"
(inferior-packages->package-metadata-ids
conn
- (list mock-inferior-package-foo)
+ (list mock-inferior-package-foo
+ mock-inferior-package-foo-2)
(test-license-set-ids conn))
(inferior-packages->package-metadata-ids
conn
- (list mock-inferior-package-foo)
+ (list mock-inferior-package-foo
+ mock-inferior-package-foo-2)
(test-license-set-ids conn)))
#:always-rollback? #t))))))