From c8a334b10512a708d19ed5fd42aa60424290a371 Mon Sep 17 00:00:00 2001 From: Jochen Topf Date: Mon, 23 Mar 2015 17:48:55 +0100 Subject: Some experimental code to find unicode properties in keys. OSM allows nearly all unicode characters in keys and values. This is some experimental code to find keys that contain unusual characters. Currently this is not run or used in the normal setup, but I thought I'd throw it out there in case somebody wants to play with it. I am not sure yet myself where this is going. --- sources/db/pre.sql | 13 ++++ sources/db/update.sh | 3 + tagstats/.gitignore | 1 + tagstats/Makefile | 8 ++- tagstats/taginfo_unicode.cpp | 163 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 tagstats/taginfo_unicode.cpp diff --git a/sources/db/pre.sql b/sources/db/pre.sql index 9faf613..165dd38 100644 --- a/sources/db/pre.sql +++ b/sources/db/pre.sql @@ -139,3 +139,16 @@ CREATE TABLE prevalent_roles ( fraction REAL ); +DROP TABLE IF EXISTS key_characters; + +CREATE TABLE key_characters ( + key TEXT, + num INTEGER, + utf8 TEXT, + codepoint TEXT, + block INTEGER, + category TEXT, + direction INTEGER, + name TEXT +); + diff --git a/sources/db/update.sh b/sources/db/update.sh index faa5cde..add5a29 100755 --- a/sources/db/update.sh +++ b/sources/db/update.sh @@ -76,6 +76,9 @@ sqlite3 $DATABASE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "sqlite.hpp" + +const char* category_to_string(int8_t category) { + switch (category) { + // letters + case 1: return "Lu"; // uppercase letter + case 2: return "Ll"; // lowercase letter + case 3: return "Lt"; // titlecase letter + case 4: return "Lm"; // modifier letter + case 5: return "Lo"; // other letter + // marks + case 6: return "Mn"; // non-spacing mark + case 7: return "Me"; // enclosing mark + case 8: return "Mc"; // combining spacing mark + // numbers + case 9: return "Nd"; // decimal digit number + case 10: return "Nl"; // letter number + case 11: return "No"; // other number + // separators + case 12: return "Zs"; // space separator + case 13: return "Zl"; // line separator + case 14: return "Zp"; // paragraph separator + // control characters etc. + case 15: return "Cc"; // control char + case 16: return "Cf"; // format char + case 17: return "Co"; // private use char + case 18: return "Cs"; // surrogate + // punctuations + case 19: return "Pd"; // dash punctuation + case 20: return "Ps"; // start punctuation + case 21: return "Pe"; // end punctuation + case 22: return "Pc"; // connector punctuation + case 23: return "Po"; // other punctuation + // symbols + case 24: return "Sm"; // math symbol + case 25: return "Sc"; // currency symbol + case 26: return "Sk"; // modifier symbol + case 27: return "So"; // other symbol + // punctuations cont. + case 28: return "Pi"; // initial punctuation + case 29: return "Pf"; // final punctuation + default: + return "UNKNOWN"; + } +} + +void get_unicode_info(const char* text, const icu::UnicodeString& us, Sqlite::Statement& insert) { + bool allokay = true; + for (const char* t = text; *t; ++t) { + if (!(std::isalnum(*t) || *t == '_' || *t == ':' || *t == ' ' || *t == '.' || *t == '-')) { + allokay = false; + break; + } + } + + if (allokay) { + return; + } + + bool unusual = false; + for (icu::StringCharacterIterator it(us); it.hasNext(); it.next()) { + UChar32 codepoint = it.current32(); + int8_t chartype = u_charType(codepoint); + if (! u_isprint(codepoint)) { + unusual = true; + break; + } + if (u_charDirection(codepoint) != 0) { + unusual = true; + break; + } + if (chartype != 1 && // UPPERCASE_LETTER + chartype != 2 && // LOWERCASE_LETTER + chartype != 9 && // DECIMAL_DIGIT_NUMBER + chartype != 12 && // SPACE_SEPARATOR + chartype != 19 && // DASH_PUNCTUATION + chartype != 22 && // CONNECTOR_PUNCTUATION + chartype != 23) { // OTHER_PUNCTUATION + unusual = true; + break; + } + } + + if (unusual) { + int num = 0; + for (icu::StringCharacterIterator it(us); it.hasNext(); it.next(), ++num) { + UChar32 codepoint = it.current32(); + + int8_t chartype = u_charType(codepoint); + + char buffer[100]; + UErrorCode errorCode = U_ZERO_ERROR; + u_charName(codepoint, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode); + + UCharDirection direction = u_charDirection(codepoint); + int32_t block = u_getIntPropertyValue(codepoint, UCHAR_BLOCK); + + icu::UnicodeString::UnicodeString ustr(codepoint); + std::string str; + ustr.toUTF8String(str); + + char uplus[10]; + snprintf(uplus, 10, "U+%04x", codepoint); + + insert. + bind_text(text). + bind_int(num). + bind_text(str.c_str()). + bind_text(uplus). + bind_int(block). + bind_text(category_to_string(chartype)). + bind_int(direction). + bind_text(buffer). + execute(); + } + } +} + +void find_unicode_info(const char* begin, const char* end, Sqlite::Statement& insert) { + for (; begin != end; begin += strlen(begin) + 1) { + get_unicode_info(begin, icu::UnicodeString::fromUTF8(begin), insert); + } +} + +int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "taginfo_unicode DATABASE\n"; + return 1; + } + + std::string data; + + Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE); + Sqlite::Statement select(db, "SELECT key FROM keys WHERE characters NOT IN ('plain', 'colon') ORDER BY key"); + while (select.read()) { + data += select.get_string(0); + data += '\0'; + } + + + Sqlite::Statement insert(db, "INSERT INTO key_characters (key, num, utf8, codepoint, block, category, direction, name) VALUES (?, ?, ?, ?, ?, ?, ?, ?)"); + db.begin_transaction(); + find_unicode_info(data.c_str(), data.c_str() + data.size(), insert); + db.commit(); + + return 0; +} + -- cgit v1.2.3