summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJochen Topf <jochen@topf.org>2015-02-26 15:41:23 +0100
committerJochen Topf <jochen@topf.org>2015-02-26 15:41:23 +0100
commit4a2073856b3ca10b61c158a12cc36aaaf8842f46 (patch)
tree5a31a736950594e3e66fe1586b3677d5d9fdbc52
parent0ebe372964cdbb3a1caec14699bdcb0c43c0eafe (diff)
downloadtaginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar
taginfo-4a2073856b3ca10b61c158a12cc36aaaf8842f46.tar.gz
Add similarity program to check for similar keys.
This adds the 'similarity' program that checks for similar keys in the taginfo-db database. It is not run yet by the update script, but the supporting database tables etc. are added by this commit.
-rw-r--r--sources/db/post.sql33
-rw-r--r--sources/db/pre.sql12
-rw-r--r--tagstats/.gitignore1
-rw-r--r--tagstats/Makefile5
-rw-r--r--tagstats/similarity.cpp120
-rw-r--r--tagstats/sqlite.hpp11
6 files changed, 181 insertions, 1 deletions
diff --git a/sources/db/post.sql b/sources/db/post.sql
index bd23643..592df9d 100644
--- a/sources/db/post.sql
+++ b/sources/db/post.sql
@@ -12,6 +12,39 @@ PRAGMA count_changes = OFF;
PRAGMA temp_store = MEMORY;
PRAGMA cache_size = 5000000;
+-- ============================================================================
+
+-- For all keys found to be similar earlier, we get the counts how often they
+-- appear in the OSM database and store this data in the same table for easy
+-- access.
+UPDATE similar_keys SET count_all1=(SELECT k.count_all FROM keys k WHERE k.key=similar_keys.key1);
+UPDATE similar_keys SET count_all2=(SELECT k.count_all FROM keys k WHERE k.key=similar_keys.key2);
+
+CREATE INDEX similar_keys_key1_idx ON similar_keys (key1);
+CREATE INDEX similar_keys_key2_idx ON similar_keys (key2);
+
+ANALYZE similar_keys;
+
+DROP TABLE IF EXISTS similar_keys_common_rare;
+
+CREATE TABLE similar_keys_common_rare (
+ key_common VARCHAR,
+ key_rare VARCHAR,
+ count_all_common INTEGER DEFAULT 0,
+ count_all_rare INTEGER DEFAULT 0,
+ similarity INTEGER
+);
+
+INSERT INTO similar_keys_common_rare (key_common, key_rare, count_all_common, count_all_rare, similarity)
+ SELECT key1, key2, count_all1, count_all2, similarity
+ FROM similar_keys WHERE count_all1 >= 1000 AND count_all2 <= 10 AND count_all2 > 0;
+
+INSERT INTO similar_keys_common_rare (key_common, key_rare, count_all_common, count_all_rare, similarity)
+ SELECT key2, key1, count_all2, count_all1, similarity
+ FROM similar_keys WHERE count_all2 >= 1000 AND count_all1 <= 10 AND count_all1 > 0;
+
+-- ============================================================================
+
CREATE UNIQUE INDEX keys_key_idx ON keys (key);
CREATE INDEX tags_key_idx ON tags (key);
-- CREATE UNIQUE INDEX tags_key_value_idx ON tags (key, value);
diff --git a/sources/db/pre.sql b/sources/db/pre.sql
index 1c752be..586fe6c 100644
--- a/sources/db/pre.sql
+++ b/sources/db/pre.sql
@@ -49,6 +49,18 @@ CREATE TABLE key_distributions (
png BLOB
);
+
+DROP TABLE IF EXISTS similar_keys;
+
+CREATE TABLE similar_keys (
+ key1 VARCHAR,
+ key2 VARCHAR,
+ count_all1 INTEGER DEFAULT 0,
+ count_all2 INTEGER DEFAULT 0,
+ similarity INTEGER
+);
+
+
DROP TABLE IF EXISTS tag_distributions;
CREATE TABLE tag_distributions (
diff --git a/tagstats/.gitignore b/tagstats/.gitignore
index fa38f18..2d82327 100644
--- a/tagstats/.gitignore
+++ b/tagstats/.gitignore
@@ -1,3 +1,4 @@
+similarity
tagstats
osmstats
taginfo-db.db
diff --git a/tagstats/Makefile b/tagstats/Makefile
index 0358a67..0752b49 100644
--- a/tagstats/Makefile
+++ b/tagstats/Makefile
@@ -43,7 +43,7 @@ LIB_SQLITE := -lsqlite3
.PHONY: all check indent install clean
-all: tagstats osmstats
+all: tagstats osmstats similarity
osmstats: osmstats.cpp statistics_handler.hpp
$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE)
@@ -51,6 +51,9 @@ osmstats: osmstats.cpp statistics_handler.hpp
tagstats: tagstats.cpp tagstats_handler.hpp statistics_handler.hpp string_store.hpp geodistribution.hpp sqlite.hpp
$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE) $(LIB_GD)
+similarity: similarity.cpp sqlite.hpp
+ $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_SQLITE)
+
check:
cppcheck --enable=all tagstats.cpp osmstats.cpp
diff --git a/tagstats/similarity.cpp b/tagstats/similarity.cpp
new file mode 100644
index 0000000..a9e7445
--- /dev/null
+++ b/tagstats/similarity.cpp
@@ -0,0 +1,120 @@
+
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <iostream>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <vector>
+
+#include "sqlite.hpp"
+
+const int MIN_STRLEN = 4;
+const int MAX_STRLEN = 120;
+const int MAX_EDIT_DISTANCE = 2;
+
+/**
+ * Compute Levenshtein edit distance. This is a very simple implementation of
+ * the Levenshtein algorithm I copied from somewhere on the Internet, but it
+ * is fast enough for our purpose here.
+ */
+unsigned int edit_distance(const char* str1, int len1, const char* str2, int len2) {
+ static unsigned d[MAX_STRLEN][MAX_STRLEN];
+
+ d[0][0] = 0;
+ for (int i = 1; i <= len1; ++i) d[i][0] = i;
+ for (int i = 1; i <= len2; ++i) d[0][i] = i;
+
+ for (int i = 1; i <= len1; ++i) {
+ for (int j = 1; j <= len2; ++j) {
+ d[i][j] = std::min( std::min(d[i - 1][j] + 1,d[i][j - 1] + 1),
+ d[i - 1][j - 1] + (str1[i - 1] == str2[j - 1] ? 0 : 1) );
+ }
+ }
+
+ return d[len1][len2];
+}
+
+/**
+ * Are the two given strings similar according to some metric?
+ */
+int similarity(const char* str1, int len1, const char* str2, int len2) {
+ // Do not check very short strings, because they create too many false
+ // positives.
+ if (len1 < MIN_STRLEN || len2 < MIN_STRLEN) {
+ return -1;
+ }
+
+ // Do not check very long strings. This keeps memory use and run time for
+ // Levenshtein algorithm in check.
+ if (len1 >= MAX_STRLEN || len2 >= MAX_STRLEN) {
+ return -1;
+ }
+
+ // Check if one string is a substring of the other. This will also check
+ // if both strings differ only in case.
+ if (strcasestr(str1, str2) || strcasestr(str2, str1)) {
+ return 0;
+ }
+
+ // Do not check strings if they have very different lengths, they can't
+ // be similar according to Levenshtein anyway.
+ if (abs(len1 - len2) >= MAX_EDIT_DISTANCE) {
+ return -1;
+ }
+
+ // Check Levenshtein edit distance.
+ int distance = edit_distance(str1, len1, str2, len2);
+ if (distance <= MAX_EDIT_DISTANCE) {
+ return distance;
+ }
+
+ return -1;
+}
+
+/**
+ * Iterate over all (null-terminated) strings in the memory between begin and
+ * end and find similar strings.
+ */
+void find_similarities(const char* begin, const char* end, Sqlite::Statement& insert) {
+ int len1;
+ for (const char* str1 = begin; str1 != end; str1 += len1 + 1) {
+ len1 = strlen(str1);
+ int len2;
+ for (const char* str2 = str1 + len1; str2 != end; str2 += len2 + 1) {
+ len2 = strlen(str2);
+ int sim = similarity(str1, len1, str2, len2);
+ if (sim >= 0) {
+ //std::cout << "[" << str1 << "][" << str2 << "]\n";
+ insert.bind_text(str1).bind_text(str2).bind_int(sim).execute();
+ }
+ }
+ }
+}
+
+int main(int argc, char *argv[]) {
+ if (argc != 2) {
+ std::cerr << "similarity DATABASE\n";
+ return 1;
+ }
+
+ std::string data;
+
+ Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE);
+ Sqlite::Statement select(db, "SELECT key FROM keys ORDER BY key");
+ while (select.read()) {
+ data += select.get_string(0);
+ data += '\0';
+ }
+
+ Sqlite::Statement insert(db, "INSERT INTO similar_keys (key1, key2, similarity) VALUES (?, ?, ?)");
+ db.begin_transaction();
+ find_similarities(data.c_str(), data.c_str() + data.size(), insert);
+ db.commit();
+
+ return 0;
+}
+
diff --git a/tagstats/sqlite.hpp b/tagstats/sqlite.hpp
index 45dd2e2..8860fbf 100644
--- a/tagstats/sqlite.hpp
+++ b/tagstats/sqlite.hpp
@@ -196,6 +196,17 @@ namespace Sqlite {
return std::string(textptr);
}
+ const char* get_string(int column) {
+ if (column >= column_count()) {
+ throw Sqlite::Exception("Column larger than max columns", "");
+ }
+ const char* textptr = reinterpret_cast<const char*>(sqlite3_column_text(m_statement, column));
+ if (!textptr) {
+ throw Sqlite::Exception("Error reading text column", m_db.errmsg());
+ }
+ return textptr;
+ }
+
int get_int(int column) {
if (column >= column_count()) {
throw Sqlite::Exception("Column larger than max columns", m_db.errmsg());