5 files changed, 187 insertions, 1 deletions
diff --git a/sources/db/pre.sql b/sources/db/pre.sql
index 9faf613..165dd38 100644
--- a/sources/db/pre.sql
+++ b/sources/db/pre.sql
@@ -139,3 +139,16 @@ CREATE TABLE prevalent_roles (
   fraction REAL
 );
 
+DROP TABLE IF EXISTS key_characters;
+
+CREATE TABLE key_characters (
+  key       TEXT,    -- key the character occurs in
+  num       INTEGER, -- position of the character within the key, counted from 0
+  utf8      TEXT,    -- the character itself, UTF-8 encoded
+  codepoint TEXT,    -- code point written as "U+" plus at least 4 lowercase hex digits
+  block     INTEGER, -- ICU UCHAR_BLOCK property value of the character
+  category  TEXT,    -- two-letter general category (e.g. "Lu") or "UNKNOWN"
+  direction INTEGER, -- numeric result of ICU u_charDirection()
+  name      TEXT     -- character name as returned by ICU u_charName()
+);
+
diff --git a/sources/db/update.sh b/sources/db/update.sh
index faa5cde..add5a29 100755
--- a/sources/db/update.sh
+++ b/sources/db/update.sh
@@ -76,6 +76,9 @@ sqlite3 $DATABASE <post_similar_keys.sql
 echo "`$DATECMD` Running update_characters... "
 ./update_characters.rb $DIR
 
+#echo "`$DATECMD` Running taginfo_unicode... "
+#./taginfo_unicode $DATABASE
+
 echo "`$DATECMD` Running post_grades.sql... "
 sqlite3 $DATABASE <post_grades.sql
 
diff --git a/tagstats/.gitignore b/tagstats/.gitignore
index 2d82327..a5a613a 100644
--- a/tagstats/.gitignore
+++ b/tagstats/.gitignore
@@ -1,4 +1,5 @@
 similarity
+taginfo_unicode
 tagstats
 osmstats
 taginfo-db.db
diff --git a/tagstats/Makefile b/tagstats/Makefile
index b266fe9..220ce56 100644
--- a/tagstats/Makefile
+++ b/tagstats/Makefile
@@ -40,10 +40,13 @@ LIB_EXPAT  := -lexpat
 LIB_PBF    := -pthread -lz -lprotobuf-lite -losmpbf
 LIB_GD     := -lgd -lz -lm
 LIB_SQLITE := -lsqlite3
+LIB_ICU    := `pkg-config --libs icu-uc icu-io`
+
+CXXFLAGS_ICU := `pkg-config --cflags icu-uc icu-io`
 
 .PHONY: all check indent install clean
 
-all: tagstats osmstats similarity
+all: tagstats osmstats similarity taginfo_unicode
 
 osmstats: osmstats.cpp statistics_handler.hpp
 	$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE)
@@ -54,6 +57,9 @@ tagstats: tagstats.cpp tagstats_handler.hpp statistics_handler.hpp string_store.
 similarity: similarity.cpp sqlite.hpp
 	$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_SQLITE)
 
+taginfo_unicode: taginfo_unicode.cpp sqlite.hpp
+	$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) $(CXXFLAGS_ICU) -o $@ $< $(LDFLAGS) $(LIB_SQLITE) $(LIB_ICU)
+
 check:
 	cppcheck --enable=all tagstats.cpp osmstats.cpp
 
diff --git a/tagstats/taginfo_unicode.cpp b/tagstats/taginfo_unicode.cpp
new file mode 100644
index 0000000..bc58956
--- /dev/null
+++ b/tagstats/taginfo_unicode.cpp
@@ -0,0 +1,163 @@
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <vector>
+
+#include <unicode/schriter.h>
+#include <unicode/uchar.h>
+#include <unicode/unistr.h>
+
+#include "sqlite.hpp"
+
+// Maps a numeric general category code to its two-letter abbreviation;
+// returns "UNKNOWN" for code 0 and for any code outside the table.
+const char* category_to_string(int8_t category) {
+    static const char* const abbreviations[] = {
+        "UNKNOWN", //  0: no abbreviation mapped
+        "Lu",      //  1: uppercase letter
+        "Ll",      //  2: lowercase letter
+        "Lt",      //  3: titlecase letter
+        "Lm",      //  4: modifier letter
+        "Lo",      //  5: other letter
+        "Mn",      //  6: non-spacing mark
+        "Me",      //  7: enclosing mark
+        "Mc",      //  8: combining spacing mark
+        "Nd",      //  9: decimal digit number
+        "Nl",      // 10: letter number
+        "No",      // 11: other number
+        "Zs",      // 12: space separator
+        "Zl",      // 13: line separator
+        "Zp",      // 14: paragraph separator
+        "Cc",      // 15: control char
+        "Cf",      // 16: format char
+        "Co",      // 17: private use char
+        "Cs",      // 18: surrogate
+        "Pd",      // 19: dash punctuation
+        "Ps",      // 20: start punctuation
+        "Pe",      // 21: end punctuation
+        "Pc",      // 22: connector punctuation
+        "Po",      // 23: other punctuation
+        "Sm",      // 24: math symbol
+        "Sc",      // 25: currency symbol
+        "Sk",      // 26: modifier symbol
+        "So",      // 27: other symbol
+        "Pi",      // 28: initial punctuation
+        "Pf",      // 29: final punctuation
+    };
+    const int table_size = static_cast<int>(sizeof(abbreviations) / sizeof(abbreviations[0]));
+
+    const int index = category;
+    if (index < 0 || index >= table_size) {
+        return "UNKNOWN";
+    }
+    return abbreviations[index];
+}
+
+// Records every character of a key in key_characters if the key contains
+// anything beyond "ordinary" characters; ordinary keys produce no rows.
+//   text   - the key as NUL-terminated UTF-8; also bound as the key column
+//   us     - the same key as an ICU string, iterated code point by code point
+//   insert - prepared INSERT taking the eight values bound at the end
+void get_unicode_info(const char* text, const icu::UnicodeString& us, Sqlite::Statement& insert) {
+    // Quick exit: keys consisting only of these ASCII characters are never
+    // reported. The cast matters because UTF-8 lead/continuation bytes are
+    // negative where char is signed, which std::isalnum() must not be given.
+    bool allokay = true;
+    for (const char* t = text; *t; ++t) {
+        if (!(std::isalnum(static_cast<unsigned char>(*t)) || *t == '_' || *t == ':' || *t == ' ' || *t == '.' || *t == '-')) {
+            allokay = false;
+            break;
+        }
+    }
+    if (allokay) {
+        return;
+    }
+
+    // A key is unusual if a code point is unprintable, not left-to-right
+    // (direction 0) or outside the categories below. Iterate with next32():
+    // next() moves by UTF-16 code unit and repeats non-BMP characters.
+    bool unusual = false;
+    for (icu::StringCharacterIterator it(us); it.hasNext(); it.next32()) {
+        const UChar32 codepoint = it.current32();
+        const int8_t chartype = u_charType(codepoint);
+        if (!u_isprint(codepoint) || u_charDirection(codepoint) != 0 ||
+            (chartype !=  1 && // UPPERCASE_LETTER
+             chartype !=  2 && // LOWERCASE_LETTER
+             chartype !=  9 && // DECIMAL_DIGIT_NUMBER
+             chartype != 12 && // SPACE_SEPARATOR
+             chartype != 19 && // DASH_PUNCTUATION
+             chartype != 22 && // CONNECTOR_PUNCTUATION
+             chartype != 23)) { // OTHER_PUNCTUATION
+            unusual = true;
+            break;
+        }
+    }
+    if (!unusual) {
+        return;
+    }
+
+    // One row per code point; num counts code points from 0.
+    int num = 0;
+    for (icu::StringCharacterIterator it(us); it.hasNext(); it.next32(), ++num) {
+        const UChar32 codepoint = it.current32();
+        const int8_t chartype = u_charType(codepoint);
+        const UCharDirection direction = u_charDirection(codepoint);
+        const int32_t block = u_getIntPropertyValue(codepoint, UCHAR_BLOCK);
+        // No usable name (ICU error or missing terminator): store "".
+        char buffer[100] = "";
+        UErrorCode errorCode = U_ZERO_ERROR;
+        u_charName(codepoint, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
+        if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
+            buffer[0] = '\0';
+        }
+
+        icu::UnicodeString ustr(codepoint);
+        std::string str;
+        ustr.toUTF8String(str);
+        char uplus[10];
+        snprintf(uplus, sizeof(uplus), "U+%04x", static_cast<unsigned int>(codepoint));
+
+        insert.bind_text(text).bind_int(num).
+            bind_text(str.c_str()).bind_text(uplus).
+            bind_int(block).bind_text(category_to_string(chartype)).
+            bind_int(direction).bind_text(buffer).
+            execute();
+    }
+}
+
+void find_unicode_info(const char* begin, const char* end, Sqlite::Statement& insert) {
+    // [begin, end) holds back-to-back NUL-terminated keys; analyse each in turn.
+    for (const char* key = begin; key != end; key += strlen(key) + 1)
+        get_unicode_info(key, icu::UnicodeString::fromUTF8(key), insert);
+}
+
+// Loads the keys picked by the SELECT below and, inside one transaction,
+// stores per-character rows for them via find_unicode_info().
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        std::cerr << "taginfo_unicode DATABASE\n";
+        return 1;
+    }
+
+    // Every selected key, each followed by a NUL byte, in one buffer.
+    std::string packed_keys;
+    Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE);
+    Sqlite::Statement select(db, "SELECT key FROM keys WHERE characters NOT IN ('plain', 'colon') ORDER BY key");
+    while (select.read()) {
+        packed_keys += select.get_string(0);
+        packed_keys.push_back('\0');
+    }
+
+    Sqlite::Statement insert(db, "INSERT INTO key_characters (key, num, utf8, codepoint, block, category, direction, name) VALUES (?, ?, ?, ?, ?, ?, ?, ?)");
+    db.begin_transaction();
+    find_unicode_info(packed_keys.data(), packed_keys.data() + packed_keys.size(), insert);
+    db.commit();
+    return 0;
+}
+