diff options
author | Jochen Topf <jochen@topf.org> | 2015-02-26 15:41:23 +0100 |
---|---|---|
committer | Jochen Topf <jochen@topf.org> | 2015-02-26 15:41:23 +0100 |
commit | 4a2073856b3ca10b61c158a12cc36aaaf8842f46 (patch) | |
tree | 5a31a736950594e3e66fe1586b3677d5d9fdbc52 /sources/db | |
parent | 0ebe372964cdbb3a1caec14699bdcb0c43c0eafe (diff) | |
download | taginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar taginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar.gz |
Add similarity program to check for similar keys.
This adds the 'similarity' program that checks for similar keys in the
taginfo-db database. It is not run yet by the update script, but the
supporting database tables etc. are added by this commit.
Diffstat (limited to 'sources/db')
-rw-r--r-- | sources/db/post.sql | 33 | ||||
-rw-r--r-- | sources/db/pre.sql | 12 |
2 files changed, 45 insertions, 0 deletions
diff --git a/sources/db/post.sql b/sources/db/post.sql index bd23643..592df9d 100644 --- a/sources/db/post.sql +++ b/sources/db/post.sql @@ -12,6 +12,39 @@ PRAGMA count_changes = OFF; PRAGMA temp_store = MEMORY; PRAGMA cache_size = 5000000; +-- ============================================================================ + +-- For all keys found to be similar earlier, we get the counts how often they +-- appear in the OSM database and store this data in the same table for easy +-- access. +UPDATE similar_keys SET count_all1=(SELECT k.count_all FROM keys k WHERE k.key=similar_keys.key1); +UPDATE similar_keys SET count_all2=(SELECT k.count_all FROM keys k WHERE k.key=similar_keys.key2); + +CREATE INDEX similar_keys_key1_idx ON similar_keys (key1); +CREATE INDEX similar_keys_key2_idx ON similar_keys (key2); + +ANALYZE similar_keys; + +DROP TABLE IF EXISTS similar_keys_common_rare; + +CREATE TABLE similar_keys_common_rare ( + key_common VARCHAR, + key_rare VARCHAR, + count_all_common INTEGER DEFAULT 0, + count_all_rare INTEGER DEFAULT 0, + similarity INTEGER +); + +INSERT INTO similar_keys_common_rare (key_common, key_rare, count_all_common, count_all_rare, similarity) + SELECT key1, key2, count_all1, count_all2, similarity + FROM similar_keys WHERE count_all1 >= 1000 AND count_all2 <= 10 AND count_all2 > 0; + +INSERT INTO similar_keys_common_rare (key_common, key_rare, count_all_common, count_all_rare, similarity) + SELECT key2, key1, count_all2, count_all1, similarity + FROM similar_keys WHERE count_all2 >= 1000 AND count_all1 <= 10 AND count_all1 > 0; + +-- ============================================================================ + CREATE UNIQUE INDEX keys_key_idx ON keys (key); CREATE INDEX tags_key_idx ON tags (key); -- CREATE UNIQUE INDEX tags_key_value_idx ON tags (key, value); diff --git a/sources/db/pre.sql b/sources/db/pre.sql index 1c752be..586fe6c 100644 --- a/sources/db/pre.sql +++ b/sources/db/pre.sql @@ -49,6 +49,18 @@ CREATE TABLE key_distributions ( png BLOB ); + +DROP TABLE IF EXISTS similar_keys; + +CREATE TABLE similar_keys ( + key1 VARCHAR, + key2 VARCHAR, + count_all1 INTEGER DEFAULT 0, + count_all2 INTEGER DEFAULT 0, + similarity INTEGER +); + + DROP TABLE IF EXISTS tag_distributions; CREATE TABLE tag_distributions ( |