From 4a2073856b3ca10b61c158a12cc36aaaf8842f46 Mon Sep 17 00:00:00 2001 From: Jochen Topf Date: Thu, 26 Feb 2015 15:41:23 +0100 Subject: Add similarity program to check for similar keys. This adds the 'similarity' program that checks for similar keys in the taginfo-db database. It is not run yet by the update script, but the supporting database tables etc. are added by this commit. --- sources/db/post.sql | 33 +++++++++++++ sources/db/pre.sql | 12 +++++ tagstats/.gitignore | 1 + tagstats/Makefile | 5 +- tagstats/similarity.cpp | 120 ++++++++++++++++++++++++++++++++++++++++++++++++ tagstats/sqlite.hpp | 11 +++++ 6 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 tagstats/similarity.cpp diff --git a/sources/db/post.sql b/sources/db/post.sql index bd23643..592df9d 100644 --- a/sources/db/post.sql +++ b/sources/db/post.sql @@ -12,6 +12,39 @@ PRAGMA count_changes = OFF; PRAGMA temp_store = MEMORY; PRAGMA cache_size = 5000000; +-- ============================================================================ + +-- For all keys found to be similar earlier, we get the counts how often they +-- appear in the OSM database and store this data in the same table for easy +-- access. +UPDATE similar_keys SET count_all1=(SELECT k.count_all FROM keys k WHERE k.key=similar_keys.key1); +UPDATE similar_keys SET count_all2=(SELECT k.count_all FROM keys k WHERE k.key=similar_keys.key2); + +CREATE INDEX similar_keys_key1_idx ON similar_keys (key1); +CREATE INDEX similar_keys_key2_idx ON similar_keys (key2); + +ANALYZE similar_keys; + +DROP TABLE IF EXISTS similar_keys_common_rare; + +CREATE TABLE similar_keys_common_rare ( + key_common VARCHAR, + key_rare VARCHAR, + count_all_common INTEGER DEFAULT 0, + count_all_rare INTEGER DEFAULT 0, + similarity INTEGER +); + +INSERT INTO similar_keys_common_rare (key_common, key_rare, count_all_common, count_all_rare, similarity) + SELECT key1, key2, count_all1, count_all2, similarity + FROM similar_keys WHERE count_all1 >= 1000 AND count_all2 <= 10 AND count_all2 > 0; + +INSERT INTO similar_keys_common_rare (key_common, key_rare, count_all_common, count_all_rare, similarity) + SELECT key2, key1, count_all2, count_all1, similarity + FROM similar_keys WHERE count_all2 >= 1000 AND count_all1 <= 10 AND count_all1 > 0; + +-- ============================================================================ + CREATE UNIQUE INDEX keys_key_idx ON keys (key); CREATE INDEX tags_key_idx ON tags (key); -- CREATE UNIQUE INDEX tags_key_value_idx ON tags (key, value); diff --git a/sources/db/pre.sql b/sources/db/pre.sql index 1c752be..586fe6c 100644 --- a/sources/db/pre.sql +++ b/sources/db/pre.sql @@ -49,6 +49,18 @@ CREATE TABLE key_distributions ( png BLOB ); + +DROP TABLE IF EXISTS similar_keys; + +CREATE TABLE similar_keys ( + key1 VARCHAR, + key2 VARCHAR, + count_all1 INTEGER DEFAULT 0, + count_all2 INTEGER DEFAULT 0, + similarity INTEGER +); + + DROP TABLE IF EXISTS tag_distributions; CREATE TABLE tag_distributions ( diff --git a/tagstats/.gitignore b/tagstats/.gitignore index fa38f18..2d82327 100644 --- a/tagstats/.gitignore +++ b/tagstats/.gitignore @@ -1,3 +1,4 @@ +similarity tagstats osmstats taginfo-db.db diff --git a/tagstats/Makefile b/tagstats/Makefile index 0358a67..0752b49 100644 --- a/tagstats/Makefile +++ b/tagstats/Makefile @@ -43,7 +43,7 @@ LIB_SQLITE := -lsqlite3 .PHONY: all check indent install clean -all: tagstats osmstats +all: tagstats osmstats similarity osmstats: osmstats.cpp statistics_handler.hpp $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE) @@ -51,6 +51,9 @@ osmstats: osmstats.cpp statistics_handler.hpp tagstats: tagstats.cpp tagstats_handler.hpp statistics_handler.hpp string_store.hpp geodistribution.hpp sqlite.hpp $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE) $(LIB_GD) +similarity: similarity.cpp sqlite.hpp + $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_SQLITE) + check: cppcheck --enable=all tagstats.cpp osmstats.cpp diff --git a/tagstats/similarity.cpp b/tagstats/similarity.cpp new file mode 100644 index 0000000..a9e7445 --- /dev/null +++ b/tagstats/similarity.cpp @@ -0,0 +1,120 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sqlite.hpp" + +const int MIN_STRLEN = 4; +const int MAX_STRLEN = 120; +const int MAX_EDIT_DISTANCE = 2; + +/** + * Compute Levenshtein edit distance. This is a very simple implementation of + * the Levenshtein algorithm I copied from somewhere on the Internet, but it + * is fast enough for our purpose here. + */ +unsigned int edit_distance(const char* str1, int len1, const char* str2, int len2) { + static unsigned d[MAX_STRLEN][MAX_STRLEN]; + + d[0][0] = 0; + for (int i = 1; i <= len1; ++i) d[i][0] = i; + for (int i = 1; i <= len2; ++i) d[0][i] = i; + + for (int i = 1; i <= len1; ++i) { + for (int j = 1; j <= len2; ++j) { + d[i][j] = std::min( std::min(d[i - 1][j] + 1,d[i][j - 1] + 1), + d[i - 1][j - 1] + (str1[i - 1] == str2[j - 1] ? 0 : 1) ); + } + } + + return d[len1][len2]; +} + +/** + * Are the two given strings similar according to some metric? + */ +int similarity(const char* str1, int len1, const char* str2, int len2) { + // Do not check very short strings, because they create too many false + // positives. + if (len1 < MIN_STRLEN || len2 < MIN_STRLEN) { + return -1; + } + + // Do not check very long strings. This keeps memory use and run time for + // Levenshtein algorithm in check. + if (len1 >= MAX_STRLEN || len2 >= MAX_STRLEN) { + return -1; + } + + // Check if one string is a substring of the other. This will also check + // if both strings differ only in case. + if (strcasestr(str1, str2) || strcasestr(str2, str1)) { + return 0; + } + + // Do not check strings if they have very different lengths, they can't + // be similar according to Levenshtein anyway. + if (abs(len1 - len2) >= MAX_EDIT_DISTANCE) { + return -1; + } + + // Check Levenshtein edit distance. + int distance = edit_distance(str1, len1, str2, len2); + if (distance <= MAX_EDIT_DISTANCE) { + return distance; + } + + return -1; +} + +/** + * Iterate over all (null-terminated) strings in the memory between begin and + * end and find similar strings. + */ +void find_similarities(const char* begin, const char* end, Sqlite::Statement& insert) { + int len1; + for (const char* str1 = begin; str1 != end; str1 += len1 + 1) { + len1 = strlen(str1); + int len2; + for (const char* str2 = str1 + len1; str2 != end; str2 += len2 + 1) { + len2 = strlen(str2); + int sim = similarity(str1, len1, str2, len2); + if (sim >= 0) { + //std::cout << "[" << str1 << "][" << str2 << "]\n"; + insert.bind_text(str1).bind_text(str2).bind_int(sim).execute(); + } + } + } +} + +int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "similarity DATABASE\n"; + return 1; + } + + std::string data; + + Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE); + Sqlite::Statement select(db, "SELECT key FROM keys ORDER BY key"); + while (select.read()) { + data += select.get_string(0); + data += '\0'; + } + + Sqlite::Statement insert(db, "INSERT INTO similar_keys (key1, key2, similarity) VALUES (?, ?, ?)"); + db.begin_transaction(); + find_similarities(data.c_str(), data.c_str() + data.size(), insert); + db.commit(); + + return 0; +} + diff --git a/tagstats/sqlite.hpp b/tagstats/sqlite.hpp index 45dd2e2..8860fbf 100644 --- a/tagstats/sqlite.hpp +++ b/tagstats/sqlite.hpp @@ -196,6 +196,17 @@ namespace Sqlite { return std::string(textptr); } + const char* get_string(int column) { + if (column >= column_count()) { + throw Sqlite::Exception("Column larger than max columns", ""); + } + const char* textptr = reinterpret_cast(sqlite3_column_text(m_statement, column)); + if (!textptr) { + throw Sqlite::Exception("Error reading text column", m_db.errmsg()); + } + return textptr; + } + int get_int(int column) { if (column >= column_count()) { throw Sqlite::Exception("Column larger than max columns", m_db.errmsg()); -- cgit v1.2.3