diff options
author | Jochen Topf <jochen@topf.org> | 2015-03-23 17:48:55 +0100 |
---|---|---|
committer | Jochen Topf <jochen@topf.org> | 2015-04-28 21:55:12 +0200 |
commit | c8a334b10512a708d19ed5fd42aa60424290a371 (patch) | |
tree | beabdda90200e1d6629285def16674b4c5553b90 | |
parent | de091284afa48851da7de95f544e75d6800d4e8a (diff) | |
download | taginfo-c8a334b10512a708d19ed5fd42aa60424290a371.tar taginfo-c8a334b10512a708d19ed5fd42aa60424290a371.tar.gz |
Some experimental code to find Unicode properties in keys.
OSM allows nearly all Unicode characters in keys and values. This is some
experimental code to find keys that contain unusual characters. Currently
this is not run or used in the normal setup, but I thought I'd throw it
out there in case somebody wants to play with it. I am not sure yet myself
where this is going.
-rw-r--r-- | sources/db/pre.sql | 13 | ||||
-rwxr-xr-x | sources/db/update.sh | 3 | ||||
-rw-r--r-- | tagstats/.gitignore | 1 | ||||
-rw-r--r-- | tagstats/Makefile | 8 | ||||
-rw-r--r-- | tagstats/taginfo_unicode.cpp | 163 |
5 files changed, 187 insertions, 1 deletions
diff --git a/sources/db/pre.sql b/sources/db/pre.sql index 9faf613..165dd38 100644 --- a/sources/db/pre.sql +++ b/sources/db/pre.sql @@ -139,3 +139,16 @@ CREATE TABLE prevalent_roles ( fraction REAL ); +DROP TABLE IF EXISTS key_characters; + +CREATE TABLE key_characters ( + key TEXT, + num INTEGER, + utf8 TEXT, + codepoint TEXT, + block INTEGER, + category TEXT, + direction INTEGER, + name TEXT +); + diff --git a/sources/db/update.sh b/sources/db/update.sh index faa5cde..add5a29 100755 --- a/sources/db/update.sh +++ b/sources/db/update.sh @@ -76,6 +76,9 @@ sqlite3 $DATABASE <post_similar_keys.sql echo "`$DATECMD` Running update_characters... " ./update_characters.rb $DIR +#echo "`$DATECMD` Running taginfo_unicode... " +#./taginfo_unicode $DATABASE + echo "`$DATECMD` Running post_grades.sql... " sqlite3 $DATABASE <post_grades.sql diff --git a/tagstats/.gitignore b/tagstats/.gitignore index 2d82327..a5a613a 100644 --- a/tagstats/.gitignore +++ b/tagstats/.gitignore @@ -1,4 +1,5 @@ similarity +taginfo_unicode tagstats osmstats taginfo-db.db diff --git a/tagstats/Makefile b/tagstats/Makefile index b266fe9..220ce56 100644 --- a/tagstats/Makefile +++ b/tagstats/Makefile @@ -40,10 +40,13 @@ LIB_EXPAT := -lexpat LIB_PBF := -pthread -lz -lprotobuf-lite -losmpbf LIB_GD := -lgd -lz -lm LIB_SQLITE := -lsqlite3 +LIB_ICU := `pkg-config --libs icu-uc icu-io` + +CXXFLAGS_ICU := `pkg-config --cflags icu-uc icu-io` .PHONY: all check indent install clean -all: tagstats osmstats similarity +all: tagstats osmstats similarity taginfo_unicode osmstats: osmstats.cpp statistics_handler.hpp $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE) @@ -54,6 +57,9 @@ tagstats: tagstats.cpp tagstats_handler.hpp statistics_handler.hpp string_store. 
similarity: similarity.cpp sqlite.hpp $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_SQLITE) +taginfo_unicode: taginfo_unicode.cpp sqlite.hpp + $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) $(CXXFLAGS_ICU) -o $@ $< $(LDFLAGS) $(LIB_SQLITE) $(LIB_ICU) + check: cppcheck --enable=all tagstats.cpp osmstats.cpp diff --git a/tagstats/taginfo_unicode.cpp b/tagstats/taginfo_unicode.cpp new file mode 100644 index 0000000..bc58956 --- /dev/null +++ b/tagstats/taginfo_unicode.cpp @@ -0,0 +1,163 @@ + +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <iostream> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <vector> + +#include <unicode/schriter.h> +#include <unicode/uchar.h> +#include <unicode/unistr.h> + +#include "sqlite.hpp" + +const char* category_to_string(int8_t category) { + switch (category) { + // letters + case 1: return "Lu"; // uppercase letter + case 2: return "Ll"; // lowercase letter + case 3: return "Lt"; // titlecase letter + case 4: return "Lm"; // modifier letter + case 5: return "Lo"; // other letter + // marks + case 6: return "Mn"; // non-spacing mark + case 7: return "Me"; // enclosing mark + case 8: return "Mc"; // combining spacing mark + // numbers + case 9: return "Nd"; // decimal digit number + case 10: return "Nl"; // letter number + case 11: return "No"; // other number + // separators + case 12: return "Zs"; // space separator + case 13: return "Zl"; // line separator + case 14: return "Zp"; // paragraph separator + // control characters etc. 
+ case 15: return "Cc"; // control char + case 16: return "Cf"; // format char + case 17: return "Co"; // private use char + case 18: return "Cs"; // surrogate + // punctuations + case 19: return "Pd"; // dash punctuation + case 20: return "Ps"; // start punctuation + case 21: return "Pe"; // end punctuation + case 22: return "Pc"; // connector punctuation + case 23: return "Po"; // other punctuation + // symbols + case 24: return "Sm"; // math symbol + case 25: return "Sc"; // currency symbol + case 26: return "Sk"; // modifier symbol + case 27: return "So"; // other symbol + // punctuations cont. + case 28: return "Pi"; // initial punctuation + case 29: return "Pf"; // final punctuation + default: + return "UNKNOWN"; + } +} + +void get_unicode_info(const char* text, const icu::UnicodeString& us, Sqlite::Statement& insert) { + bool allokay = true; + for (const char* t = text; *t; ++t) { + if (!(std::isalnum(*t) || *t == '_' || *t == ':' || *t == ' ' || *t == '.' || *t == '-')) { + allokay = false; + break; + } + } + + if (allokay) { + return; + } + + bool unusual = false; + for (icu::StringCharacterIterator it(us); it.hasNext(); it.next()) { + UChar32 codepoint = it.current32(); + int8_t chartype = u_charType(codepoint); + if (! 
u_isprint(codepoint)) { + unusual = true; + break; + } + if (u_charDirection(codepoint) != 0) { + unusual = true; + break; + } + if (chartype != 1 && // UPPERCASE_LETTER + chartype != 2 && // LOWERCASE_LETTER + chartype != 9 && // DECIMAL_DIGIT_NUMBER + chartype != 12 && // SPACE_SEPARATOR + chartype != 19 && // DASH_PUNCTUATION + chartype != 22 && // CONNECTOR_PUNCTUATION + chartype != 23) { // OTHER_PUNCTUATION + unusual = true; + break; + } + } + + if (unusual) { + int num = 0; + for (icu::StringCharacterIterator it(us); it.hasNext(); it.next(), ++num) { + UChar32 codepoint = it.current32(); + + int8_t chartype = u_charType(codepoint); + + char buffer[100]; + UErrorCode errorCode = U_ZERO_ERROR; + u_charName(codepoint, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode); + + UCharDirection direction = u_charDirection(codepoint); + int32_t block = u_getIntPropertyValue(codepoint, UCHAR_BLOCK); + + icu::UnicodeString::UnicodeString ustr(codepoint); + std::string str; + ustr.toUTF8String(str); + + char uplus[10]; + snprintf(uplus, 10, "U+%04x", codepoint); + + insert. + bind_text(text). + bind_int(num). + bind_text(str.c_str()). + bind_text(uplus). + bind_int(block). + bind_text(category_to_string(chartype)). + bind_int(direction). + bind_text(buffer). 
+ execute(); + } + } +} + +void find_unicode_info(const char* begin, const char* end, Sqlite::Statement& insert) { + for (; begin != end; begin += strlen(begin) + 1) { + get_unicode_info(begin, icu::UnicodeString::fromUTF8(begin), insert); + } +} + +int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "taginfo_unicode DATABASE\n"; + return 1; + } + + std::string data; + + Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE); + Sqlite::Statement select(db, "SELECT key FROM keys WHERE characters NOT IN ('plain', 'colon') ORDER BY key"); + while (select.read()) { + data += select.get_string(0); + data += '\0'; + } + + + Sqlite::Statement insert(db, "INSERT INTO key_characters (key, num, utf8, codepoint, block, category, direction, name) VALUES (?, ?, ?, ?, ?, ?, ?, ?)"); + db.begin_transaction(); + find_unicode_info(data.c_str(), data.c_str() + data.size(), insert); + db.commit(); + + return 0; +} + |