Some experimental code to find Unicode properties in keys.

OSM allows nearly all Unicode characters in keys and values. This is some experimental code to find keys that contain unusual characters. Currently this is not run or used in the normal setup, but I thought I'd throw it out there in case somebody wants to play with it. I am not sure yet myself where this is going.
author: Jochen Topf <jochen@topf.org> 2015-03-23 17:48:55 +0100
committer: Jochen Topf <jochen@topf.org> 2015-04-28 21:55:12 +0200
commit: c8a334b10512a708d19ed5fd42aa60424290a371 (patch)
tree: beabdda90200e1d6629285def16674b4c5553b90
parent: de091284afa48851da7de95f544e75d6800d4e8a (diff)
download: taginfo-c8a334b10512a708d19ed5fd42aa60424290a371.tar
taginfo-c8a334b10512a708d19ed5fd42aa60424290a371.tar.gz
5 files changed, 187 insertions, 1 deletions
diff --git a/sources/db/pre.sql b/sources/db/pre.sql
index 9faf613..165dd38 100644
--- a/sources/db/pre.sql
+++ b/sources/db/pre.sql
@@ -139,3 +139,16 @@ CREATE TABLE prevalent_roles (
   fraction REAL
 );
 
+DROP TABLE IF EXISTS key_characters;
+
+CREATE TABLE key_characters ( -- one row per character of a key; rows are inserted by taginfo_unicode
+  key       TEXT,    -- the key the character occurs in
+  num       INTEGER, -- running index of the character within the key, starting at 0
+  utf8      TEXT,    -- the character itself, UTF-8 encoded
+  codepoint TEXT,    -- code point formatted as "U+" followed by at least four hex digits
+  block     INTEGER, -- value of the ICU UCHAR_BLOCK property for the character
+  category  TEXT,    -- two-letter general category such as "Lu", or "UNKNOWN"
+  direction INTEGER, -- numeric value returned by ICU u_charDirection()
+  name      TEXT     -- character name from ICU u_charName() with U_UNICODE_CHAR_NAME
+);
+
diff --git a/sources/db/update.sh b/sources/db/update.sh
index faa5cde..add5a29 100755
--- a/sources/db/update.sh
+++ b/sources/db/update.sh
@@ -76,6 +76,9 @@ sqlite3 $DATABASE <post_similar_keys.sql
 echo "`$DATECMD` Running update_characters... "
 ./update_characters.rb $DIR
 
+#echo "`$DATECMD` Running taginfo_unicode... "
+#./taginfo_unicode $DATABASE
+
 echo "`$DATECMD` Running post_grades.sql... "
 sqlite3 $DATABASE <post_grades.sql
 
diff --git a/tagstats/.gitignore b/tagstats/.gitignore
index 2d82327..a5a613a 100644
--- a/tagstats/.gitignore
+++ b/tagstats/.gitignore
@@ -1,4 +1,5 @@
 similarity
+taginfo_unicode
 tagstats
 osmstats
 taginfo-db.db
diff --git a/tagstats/Makefile b/tagstats/Makefile
index b266fe9..220ce56 100644
--- a/tagstats/Makefile
+++ b/tagstats/Makefile
@@ -40,10 +40,13 @@ LIB_EXPAT  := -lexpat
 LIB_PBF    := -pthread -lz -lprotobuf-lite -losmpbf
 LIB_GD     := -lgd -lz -lm
 LIB_SQLITE := -lsqlite3
+LIB_ICU    := `pkg-config --libs icu-uc icu-io`
+
+CXXFLAGS_ICU := `pkg-config --cflags icu-uc icu-io`
 
 .PHONY: all check indent install clean
 
-all: tagstats osmstats similarity
+all: tagstats osmstats similarity taginfo_unicode
 
 osmstats: osmstats.cpp statistics_handler.hpp
 	$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE)
@@ -54,6 +57,9 @@ tagstats: tagstats.cpp tagstats_handler.hpp statistics_handler.hpp string_store.
 similarity: similarity.cpp sqlite.hpp
 	$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_SQLITE)
 
+taginfo_unicode: taginfo_unicode.cpp sqlite.hpp
+	$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) $(CXXFLAGS_ICU) -o $@ $< $(LDFLAGS) $(LIB_SQLITE) $(LIB_ICU)
+
 check:
 	cppcheck --enable=all tagstats.cpp osmstats.cpp
 
diff --git a/tagstats/taginfo_unicode.cpp b/tagstats/taginfo_unicode.cpp
new file mode 100644
index 0000000..bc58956
--- /dev/null
+++ b/tagstats/taginfo_unicode.cpp
@@ -0,0 +1,163 @@
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <vector>
+
+#include <unicode/schriter.h>
+#include <unicode/uchar.h>
+#include <unicode/unistr.h>
+
+#include "sqlite.hpp"
+
+const char* category_to_string(int8_t category) {
+    // Two-letter general category abbreviations, indexed by the numeric
+    // category value (callers pass the result of u_charType()). Value 0 and
+    // anything outside the table yield "UNKNOWN".
+    static const char* const abbreviations[] = {
+        "UNKNOWN", //  0: no abbreviation assigned here
+        "Lu",      //  1: uppercase letter
+        "Ll",      //  2: lowercase letter
+        "Lt",      //  3: titlecase letter
+        "Lm",      //  4: modifier letter
+        "Lo",      //  5: other letter
+        "Mn",      //  6: non-spacing mark
+        "Me",      //  7: enclosing mark
+        "Mc",      //  8: combining spacing mark
+        "Nd",      //  9: decimal digit number
+        "Nl",      // 10: letter number
+        "No",      // 11: other number
+        "Zs",      // 12: space separator
+        "Zl",      // 13: line separator
+        "Zp",      // 14: paragraph separator
+        "Cc",      // 15: control char
+        "Cf",      // 16: format char
+        "Co",      // 17: private use char
+        "Cs",      // 18: surrogate
+        "Pd",      // 19: dash punctuation
+        "Ps",      // 20: start punctuation
+        "Pe",      // 21: end punctuation
+        "Pc",      // 22: connector punctuation
+        "Po",      // 23: other punctuation
+        "Sm",      // 24: math symbol
+        "Sc",      // 25: currency symbol
+        "Sk",      // 26: modifier symbol
+        "So",      // 27: other symbol
+        "Pi",      // 28: initial punctuation
+        "Pf"       // 29: final punctuation
+    };
+    const int count = static_cast<int>(sizeof(abbreviations) / sizeof(abbreviations[0]));
+
+    if (category < 0 || category >= count) {
+        return "UNKNOWN";
+    }
+    return abbreviations[category];
+}
+
+// Inserts one row per code point for a key that contains unusual characters.
+// text is the NUL-terminated key; us holds the same key as an ICU string (see caller).
+// Pure-ASCII keys, and keys whose characters all pass the checks below, insert nothing.
+void get_unicode_info(const char* text, const icu::UnicodeString& us, Sqlite::Statement& insert) {
+    bool allokay = true;
+    for (const char* t = text; *t; ++t) {
+        // isalnum() is undefined for negative values, i.e. for non-ASCII bytes held in a char.
+        const unsigned char c = static_cast<unsigned char>(*t);
+        if (!(std::isalnum(c) || c == '_' || c == ':' || c == ' ' || c == '.' || c == '-')) {
+            allokay = false;
+            break;
+        }
+    }
+    if (allokay) {
+        return;
+    }
+
+    // next32() steps by code point; next() would visit a surrogate pair twice.
+    bool unusual = false;
+    for (icu::StringCharacterIterator it(us); it.hasNext(); it.next32()) {
+        UChar32 codepoint = it.current32();
+        int8_t chartype = u_charType(codepoint);
+        if (!u_isprint(codepoint) || u_charDirection(codepoint) != 0) { // 0 is U_LEFT_TO_RIGHT
+            unusual = true;
+            break;
+        }
+        if (chartype !=  1 && // UPPERCASE_LETTER
+            chartype !=  2 && // LOWERCASE_LETTER
+            chartype !=  9 && // DECIMAL_DIGIT_NUMBER
+            chartype != 12 && // SPACE_SEPARATOR
+            chartype != 19 && // DASH_PUNCTUATION
+            chartype != 22 && // CONNECTOR_PUNCTUATION
+            chartype != 23) { // OTHER_PUNCTUATION
+            unusual = true;
+            break;
+        }
+    }
+
+    if (unusual) {
+        int num = 0; // index of the code point within the key
+        for (icu::StringCharacterIterator it(us); it.hasNext(); it.next32(), ++num) {
+            UChar32 codepoint = it.current32();
+            int8_t chartype = u_charType(codepoint);
+            UCharDirection direction = u_charDirection(codepoint);
+            int32_t block = u_getIntPropertyValue(codepoint, UCHAR_BLOCK);
+            char buffer[100];
+            UErrorCode errorCode = U_ZERO_ERROR;
+            u_charName(codepoint, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
+            if (U_FAILURE(errorCode)) {
+                buffer[0] = '\0'; // do not bind a possibly unterminated name
+            }
+
+            icu::UnicodeString ustr(codepoint); // one-code-point string, converted to UTF-8 below
+            std::string str;
+            ustr.toUTF8String(str);
+            char uplus[10]; // "U+10ffff" plus the terminating NUL needs 9 bytes
+            snprintf(uplus, sizeof(uplus), "U+%04x", static_cast<unsigned int>(codepoint));
+
+            insert.
+                bind_text(text).
+                bind_int(num).
+                bind_text(str.c_str()).
+                bind_text(uplus).
+                bind_int(block).
+                bind_text(category_to_string(chartype)).
+                bind_int(direction).
+                bind_text(buffer).
+                execute();
+        }
+    }
+}
+
+void find_unicode_info(const char* begin, const char* end, Sqlite::Statement& insert) {
+    for (const char* key = begin; key != end; key += strlen(key) + 1) { // step over each key and its NUL
+        get_unicode_info(key, icu::UnicodeString::fromUTF8(key), insert);
+    }
+}
+
+// Usage: taginfo_unicode DATABASE. Reads keys from table keys, writes rows to key_characters.
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        std::cerr << "taginfo_unicode DATABASE\n";
+        return 1;
+    }
+
+    // All selected keys, concatenated, each one followed by a NUL byte.
+    std::string keys;
+    Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE);
+    Sqlite::Statement select(db, "SELECT key FROM keys WHERE characters NOT IN ('plain', 'colon') ORDER BY key");
+    while (select.read()) {
+        keys.append(select.get_string(0));
+        keys.push_back('\0');
+    }
+
+    // Run every insert inside a single transaction.
+    Sqlite::Statement insert(db, "INSERT INTO key_characters (key, num, utf8, codepoint, block, category, direction, name) VALUES (?, ?, ?, ?, ?, ?, ?, ?)");
+    db.begin_transaction();
+    find_unicode_info(keys.c_str(), keys.c_str() + keys.size(), insert);
+    db.commit();
+    return 0;
+}
+
author	Jochen Topf <jochen@topf.org>	2015-03-23 17:48:55 +0100
committer	Jochen Topf <jochen@topf.org>	2015-04-28 21:55:12 +0200
commit	c8a334b10512a708d19ed5fd42aa60424290a371 (patch)
tree	beabdda90200e1d6629285def16674b4c5553b90
parent	de091284afa48851da7de95f544e75d6800d4e8a (diff)
download	taginfo-c8a334b10512a708d19ed5fd42aa60424290a371.tar taginfo-c8a334b10512a708d19ed5fd42aa60424290a371.tar.gz