From 4a2073856b3ca10b61c158a12cc36aaaf8842f46 Mon Sep 17 00:00:00 2001 From: Jochen Topf Date: Thu, 26 Feb 2015 15:41:23 +0100 Subject: Add similarity program to check for similar keys. This adds the 'similarity' program that checks for similar keys in the taginfo-db database. It is not run yet by the update script, but the supporting database tables etc. are added by this commit. --- tagstats/.gitignore | 1 + tagstats/Makefile | 5 +- tagstats/similarity.cpp | 120 ++++++++++++++++++++++++++++++++++++++++++++++++ tagstats/sqlite.hpp | 11 +++++ 4 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 tagstats/similarity.cpp (limited to 'tagstats') diff --git a/tagstats/.gitignore b/tagstats/.gitignore index fa38f18..2d82327 100644 --- a/tagstats/.gitignore +++ b/tagstats/.gitignore @@ -1,3 +1,4 @@ +similarity tagstats osmstats taginfo-db.db diff --git a/tagstats/Makefile b/tagstats/Makefile index 0358a67..0752b49 100644 --- a/tagstats/Makefile +++ b/tagstats/Makefile @@ -43,7 +43,7 @@ LIB_SQLITE := -lsqlite3 .PHONY: all check indent install clean -all: tagstats osmstats +all: tagstats osmstats similarity osmstats: osmstats.cpp statistics_handler.hpp $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE) @@ -51,6 +51,9 @@ osmstats: osmstats.cpp statistics_handler.hpp tagstats: tagstats.cpp tagstats_handler.hpp statistics_handler.hpp string_store.hpp geodistribution.hpp sqlite.hpp $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE) $(LIB_GD) +similarity: similarity.cpp sqlite.hpp + $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_SQLITE) + check: cppcheck --enable=all tagstats.cpp osmstats.cpp diff --git a/tagstats/similarity.cpp b/tagstats/similarity.cpp new file mode 100644 index 0000000..a9e7445 --- /dev/null +++ b/tagstats/similarity.cpp @@ -0,0 +1,120 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sqlite.hpp" + +const int MIN_STRLEN = 4; +const int MAX_STRLEN = 120; +const int MAX_EDIT_DISTANCE = 2; + +/** + * Compute Levenshtein edit distance. This is a very simple implementation of + * the Levenshtein algorithm I copied from somewhere on the Internet, but it + * is fast enough for our purpose here. + */ +unsigned int edit_distance(const char* str1, int len1, const char* str2, int len2) { + static unsigned d[MAX_STRLEN][MAX_STRLEN]; + + d[0][0] = 0; + for (int i = 1; i <= len1; ++i) d[i][0] = i; + for (int i = 1; i <= len2; ++i) d[0][i] = i; + + for (int i = 1; i <= len1; ++i) { + for (int j = 1; j <= len2; ++j) { + d[i][j] = std::min( std::min(d[i - 1][j] + 1,d[i][j - 1] + 1), + d[i - 1][j - 1] + (str1[i - 1] == str2[j - 1] ? 0 : 1) ); + } + } + + return d[len1][len2]; +} + +/** + * Are the two given strings similar according to some metric? + */ +int similarity(const char* str1, int len1, const char* str2, int len2) { + // Do not check very short strings, because they create too many false + // positives. + if (len1 < MIN_STRLEN || len2 < MIN_STRLEN) { + return -1; + } + + // Do not check very long strings. This keeps memory use and run time for + // Levenshtein algorithm in check. + if (len1 >= MAX_STRLEN || len2 >= MAX_STRLEN) { + return -1; + } + + // Check if one string is a substring of the other. This will also check + // if both strings differ only in case. + if (strcasestr(str1, str2) || strcasestr(str2, str1)) { + return 0; + } + + // Do not check strings if they have very different lengths, they can't + // be similar according to Levenshtein anyway. + if (abs(len1 - len2) >= MAX_EDIT_DISTANCE) { + return -1; + } + + // Check Levenshtein edit distance. + int distance = edit_distance(str1, len1, str2, len2); + if (distance <= MAX_EDIT_DISTANCE) { + return distance; + } + + return -1; +} + +/** + * Iterate over all (null-terminated) strings in the memory between begin and + * end and find similar strings. + */ +void find_similarities(const char* begin, const char* end, Sqlite::Statement& insert) { + int len1; + for (const char* str1 = begin; str1 != end; str1 += len1 + 1) { + len1 = strlen(str1); + int len2; + for (const char* str2 = str1 + len1; str2 != end; str2 += len2 + 1) { + len2 = strlen(str2); + int sim = similarity(str1, len1, str2, len2); + if (sim >= 0) { + //std::cout << "[" << str1 << "][" << str2 << "]\n"; + insert.bind_text(str1).bind_text(str2).bind_int(sim).execute(); + } + } + } +} + +int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "similarity DATABASE\n"; + return 1; + } + + std::string data; + + Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE); + Sqlite::Statement select(db, "SELECT key FROM keys ORDER BY key"); + while (select.read()) { + data += select.get_string(0); + data += '\0'; + } + + Sqlite::Statement insert(db, "INSERT INTO similar_keys (key1, key2, similarity) VALUES (?, ?, ?)"); + db.begin_transaction(); + find_similarities(data.c_str(), data.c_str() + data.size(), insert); + db.commit(); + + return 0; +} + diff --git a/tagstats/sqlite.hpp b/tagstats/sqlite.hpp index 45dd2e2..8860fbf 100644 --- a/tagstats/sqlite.hpp +++ b/tagstats/sqlite.hpp @@ -196,6 +196,17 @@ namespace Sqlite { return std::string(textptr); } + const char* get_string(int column) { + if (column >= column_count()) { + throw Sqlite::Exception("Column larger than max columns", ""); + } + const char* textptr = reinterpret_cast(sqlite3_column_text(m_statement, column)); + if (!textptr) { + throw Sqlite::Exception("Error reading text column", m_db.errmsg()); + } + return textptr; + } + int get_int(int column) { if (column >= column_count()) { throw Sqlite::Exception("Column larger than max columns", m_db.errmsg()); -- cgit v1.2.3