Add similarity program to check for similar keys.

This adds the 'similarity' program that checks for similar keys in the taginfo-db database. It is not run yet by the update script, but the supporting database tables etc. are added by this commit.
author: Jochen Topf <jochen@topf.org> 2015-02-26 15:41:23 +0100
committer: Jochen Topf <jochen@topf.org> 2015-02-26 15:41:23 +0100
commit: 4a2073856b3ca10b61c158a12cc36aaaf8842f46 (patch)
tree: 5a31a736950594e3e66fe1586b3677d5d9fdbc52
parent: 0ebe372964cdbb3a1caec14699bdcb0c43c0eafe (diff)
download: taginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar
taginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar.gz
6 files changed, 181 insertions, 1 deletions
diff --git a/sources/db/post.sql b/sources/db/post.sql
index bd23643..592df9d 100644
--- a/sources/db/post.sql
+++ b/sources/db/post.sql
@@ -12,6 +12,39 @@ PRAGMA count_changes = OFF;
 PRAGMA temp_store    = MEMORY;
 PRAGMA cache_size    = 5000000;
 
+-- ============================================================================
+
+-- For all keys found to be similar earlier, we get the counts how often they
+-- appear in the OSM database and store this data in the same table for easy
+-- access.
+UPDATE similar_keys SET count_all1=(SELECT k.count_all FROM keys k WHERE k.key=similar_keys.key1);
+UPDATE similar_keys SET count_all2=(SELECT k.count_all FROM keys k WHERE k.key=similar_keys.key2);
+
+CREATE INDEX similar_keys_key1_idx ON similar_keys (key1);
+CREATE INDEX similar_keys_key2_idx ON similar_keys (key2);
+
+ANALYZE similar_keys;
+
+DROP TABLE IF EXISTS similar_keys_common_rare;
+
+CREATE TABLE similar_keys_common_rare (
+  key_common       VARCHAR,
+  key_rare         VARCHAR,
+  count_all_common INTEGER DEFAULT 0,
+  count_all_rare   INTEGER DEFAULT 0,
+  similarity       INTEGER
+);
+
+INSERT INTO similar_keys_common_rare (key_common, key_rare, count_all_common, count_all_rare, similarity)
+    SELECT key1, key2, count_all1, count_all2, similarity
+        FROM similar_keys WHERE count_all1 >= 1000 AND count_all2 <= 10 AND count_all2 > 0;
+
+INSERT INTO similar_keys_common_rare (key_common, key_rare, count_all_common, count_all_rare, similarity)
+    SELECT key2, key1, count_all2, count_all1, similarity
+        FROM similar_keys WHERE count_all2 >= 1000 AND count_all1 <= 10 AND count_all1 > 0;
+
+-- ============================================================================
+
 CREATE UNIQUE INDEX keys_key_idx ON keys (key);
 CREATE        INDEX tags_key_idx ON tags (key);
 -- CREATE UNIQUE INDEX tags_key_value_idx ON tags (key, value);
diff --git a/sources/db/pre.sql b/sources/db/pre.sql
index 1c752be..586fe6c 100644
--- a/sources/db/pre.sql
+++ b/sources/db/pre.sql
@@ -49,6 +49,18 @@ CREATE TABLE key_distributions (
   png              BLOB
 );
 
+
+DROP TABLE IF EXISTS similar_keys;
+
+CREATE TABLE similar_keys (
+  key1       VARCHAR,
+  key2       VARCHAR,
+  count_all1 INTEGER DEFAULT 0,
+  count_all2 INTEGER DEFAULT 0,
+  similarity INTEGER
+);
+
+
 DROP TABLE IF EXISTS tag_distributions;
 
 CREATE TABLE tag_distributions (
diff --git a/tagstats/.gitignore b/tagstats/.gitignore
index fa38f18..2d82327 100644
--- a/tagstats/.gitignore
+++ b/tagstats/.gitignore
@@ -1,3 +1,4 @@
+similarity
 tagstats
 osmstats
 taginfo-db.db
diff --git a/tagstats/Makefile b/tagstats/Makefile
index 0358a67..0752b49 100644
--- a/tagstats/Makefile
+++ b/tagstats/Makefile
@@ -43,7 +43,7 @@ LIB_SQLITE := -lsqlite3
 
 .PHONY: all check indent install clean
 
-all: tagstats osmstats
+all: tagstats osmstats similarity
 
 osmstats: osmstats.cpp statistics_handler.hpp
 	$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE)
@@ -51,6 +51,9 @@ osmstats: osmstats.cpp statistics_handler.hpp
 tagstats: tagstats.cpp tagstats_handler.hpp statistics_handler.hpp string_store.hpp geodistribution.hpp sqlite.hpp
 	$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE) $(LIB_GD)
 
+similarity: similarity.cpp sqlite.hpp
+	$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_SQLITE)
+
 check:
 	cppcheck --enable=all tagstats.cpp osmstats.cpp
 
diff --git a/tagstats/similarity.cpp b/tagstats/similarity.cpp
new file mode 100644
index 0000000..a9e7445
--- /dev/null
+++ b/tagstats/similarity.cpp
@@ -0,0 +1,120 @@
+
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <iostream>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <vector>
+
+#include "sqlite.hpp"
+
+const int MIN_STRLEN = 4;
+const int MAX_STRLEN = 120;
+const int MAX_EDIT_DISTANCE = 2;
+
+/**
+ * Compute Levenshtein edit distance. This is a very simple implementation of
+ * the Levenshtein algorithm I copied from somewhere on the Internet, but it
+ * is fast enough for our purpose here.
+ */
+unsigned int edit_distance(const char* str1, int len1, const char* str2, int len2) {
+    static unsigned d[MAX_STRLEN][MAX_STRLEN];
+
+    d[0][0] = 0;
+    for (int i = 1; i <= len1; ++i) d[i][0] = i;
+    for (int i = 1; i <= len2; ++i) d[0][i] = i;
+
+    for (int i = 1; i <= len1; ++i) {
+        for (int j = 1; j <= len2; ++j) {
+            d[i][j] = std::min( std::min(d[i - 1][j] + 1,d[i][j - 1] + 1),
+                                d[i - 1][j - 1] + (str1[i - 1] == str2[j - 1] ? 0 : 1) );
+        }
+    }
+
+    return d[len1][len2];
+}
+
+/**
+ * Are the two given strings similar according to some metric?
+ */
+int similarity(const char* str1, int len1, const char* str2, int len2) {
+    // Do not check very short strings, because they create too many false
+    // positives.
+    if (len1 < MIN_STRLEN || len2 < MIN_STRLEN) {
+        return -1;
+    }
+
+    // Do not check very long strings. This keeps memory use and run time for
+    // Levenshtein algorithm in check.
+    if (len1 >= MAX_STRLEN || len2 >= MAX_STRLEN) {
+        return -1;
+    }
+
+    // Check if one string is a substring of the other. This will also check
+    // if both strings differ only in case.
+    if (strcasestr(str1, str2) || strcasestr(str2, str1)) {
+        return 0;
+    }
+
+    // Do not check strings if they have very different lengths, they can't
+    // be similar according to Levenshtein anyway.
+    if (abs(len1 - len2) >= MAX_EDIT_DISTANCE) {
+        return -1;
+    }
+
+    // Check Levenshtein edit distance.
+    int distance = edit_distance(str1, len1, str2, len2);
+    if (distance <= MAX_EDIT_DISTANCE) {
+        return distance;
+    }
+
+    return -1;
+}
+
+/**
+ * Iterate over all (null-terminated) strings in the memory between begin and
+ * end and find similar strings.
+ */
+void find_similarities(const char* begin, const char* end, Sqlite::Statement& insert) {
+    int len1;
+    for (const char* str1 = begin; str1 != end; str1 += len1 + 1) {
+        len1 = strlen(str1);
+        int len2;
+        for (const char* str2 = str1 + len1; str2 != end; str2 += len2 + 1) {
+            len2 = strlen(str2);
+            int sim = similarity(str1, len1, str2, len2);
+            if (sim >= 0) {
+                //std::cout << "[" << str1 << "][" << str2 << "]\n";
+                insert.bind_text(str1).bind_text(str2).bind_int(sim).execute();
+            }
+        }
+    }
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        std::cerr << "similarity DATABASE\n";
+        return 1;
+    }
+
+    std::string data;
+
+    Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE);
+    Sqlite::Statement select(db, "SELECT key FROM keys ORDER BY key");
+    while (select.read()) {
+        data += select.get_string(0);
+        data += '\0';
+    }
+
+    Sqlite::Statement insert(db, "INSERT INTO similar_keys (key1, key2, similarity) VALUES (?, ?, ?)");
+    db.begin_transaction();
+    find_similarities(data.c_str(), data.c_str() + data.size(), insert);
+    db.commit();
+
+    return 0;
+}
+
diff --git a/tagstats/sqlite.hpp b/tagstats/sqlite.hpp
index 45dd2e2..8860fbf 100644
--- a/tagstats/sqlite.hpp
+++ b/tagstats/sqlite.hpp
@@ -196,6 +196,17 @@ namespace Sqlite {
             return std::string(textptr);
         }
 
+        const char* get_string(int column) {
+            if (column >= column_count()) {
+                throw Sqlite::Exception("Column larger than max columns", "");
+            }
+            const char* textptr = reinterpret_cast<const char*>(sqlite3_column_text(m_statement, column));
+            if (!textptr) {
+                throw Sqlite::Exception("Error reading text column", m_db.errmsg());
+            }
+            return textptr;
+        }
+
         int get_int(int column) {
             if (column >= column_count()) {
                 throw Sqlite::Exception("Column larger than max columns", m_db.errmsg());
author	Jochen Topf <jochen@topf.org>	2015-02-26 15:41:23 +0100
committer	Jochen Topf <jochen@topf.org>	2015-02-26 15:41:23 +0100
commit	4a2073856b3ca10b61c158a12cc36aaaf8842f46 (patch)
tree	5a31a736950594e3e66fe1586b3677d5d9fdbc52
parent	0ebe372964cdbb3a1caec14699bdcb0c43c0eafe (diff)
download	taginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar taginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar.gz