Add similarity program to check for similar keys.

This adds the 'similarity' program that checks for similar keys in the taginfo-db database. It is not run yet by the update script, but the supporting database tables etc. are added by this commit.
author: Jochen Topf <jochen@topf.org> 2015-02-26 15:41:23 +0100
committer: Jochen Topf <jochen@topf.org> 2015-02-26 15:41:23 +0100
commit: 4a2073856b3ca10b61c158a12cc36aaaf8842f46 (patch)
tree: 5a31a736950594e3e66fe1586b3677d5d9fdbc52 /sources
parent: 0ebe372964cdbb3a1caec14699bdcb0c43c0eafe (diff)
download: taginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar
taginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar.gz
2 files changed, 45 insertions, 0 deletions
diff --git a/sources/db/post.sql b/sources/db/post.sql
index bd23643..592df9d 100644
--- a/sources/db/post.sql
+++ b/sources/db/post.sql
@@ -12,6 +12,39 @@ PRAGMA count_changes = OFF;
 PRAGMA temp_store    = MEMORY;
 PRAGMA cache_size    = 5000000;
 
+-- ============================================================================
+
+-- For all keys found to be similar earlier, we get the counts how often they
+-- appear in the OSM database and store this data in the same table for easy
+-- access.
+UPDATE similar_keys SET count_all1=(SELECT k.count_all FROM keys k WHERE k.key=similar_keys.key1);
+UPDATE similar_keys SET count_all2=(SELECT k.count_all FROM keys k WHERE k.key=similar_keys.key2);
+
+CREATE INDEX similar_keys_key1_idx ON similar_keys (key1);
+CREATE INDEX similar_keys_key2_idx ON similar_keys (key2);
+
+ANALYZE similar_keys;
+
+DROP TABLE IF EXISTS similar_keys_common_rare;
+
+CREATE TABLE similar_keys_common_rare (
+  key_common       VARCHAR,
+  key_rare         VARCHAR,
+  count_all_common INTEGER DEFAULT 0,
+  count_all_rare   INTEGER DEFAULT 0,
+  similarity       INTEGER
+);
+
+INSERT INTO similar_keys_common_rare (key_common, key_rare, count_all_common, count_all_rare, similarity)
+    SELECT key1, key2, count_all1, count_all2, similarity
+        FROM similar_keys WHERE count_all1 >= 1000 AND count_all2 <= 10 AND count_all2 > 0;
+
+INSERT INTO similar_keys_common_rare (key_common, key_rare, count_all_common, count_all_rare, similarity)
+    SELECT key2, key1, count_all2, count_all1, similarity
+        FROM similar_keys WHERE count_all2 >= 1000 AND count_all1 <= 10 AND count_all1 > 0;
+
+-- ============================================================================
+
 CREATE UNIQUE INDEX keys_key_idx ON keys (key);
 CREATE        INDEX tags_key_idx ON tags (key);
 -- CREATE UNIQUE INDEX tags_key_value_idx ON tags (key, value);
diff --git a/sources/db/pre.sql b/sources/db/pre.sql
index 1c752be..586fe6c 100644
--- a/sources/db/pre.sql
+++ b/sources/db/pre.sql
@@ -49,6 +49,18 @@ CREATE TABLE key_distributions (
   png              BLOB
 );
 
+
+DROP TABLE IF EXISTS similar_keys;
+
+CREATE TABLE similar_keys (
+  key1       VARCHAR,
+  key2       VARCHAR,
+  count_all1 INTEGER DEFAULT 0,
+  count_all2 INTEGER DEFAULT 0,
+  similarity INTEGER
+);
+
+
 DROP TABLE IF EXISTS tag_distributions;
 
 CREATE TABLE tag_distributions (
author	Jochen Topf <jochen@topf.org>	2015-02-26 15:41:23 +0100
committer	Jochen Topf <jochen@topf.org>	2015-02-26 15:41:23 +0100
commit	4a2073856b3ca10b61c158a12cc36aaaf8842f46 (patch)
tree	5a31a736950594e3e66fe1586b3677d5d9fdbc52 /sources
parent	0ebe372964cdbb3a1caec14699bdcb0c43c0eafe (diff)
download	taginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar taginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar.gz