aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJochen Topf <jochen@topf.org>2015-03-23 17:48:55 +0100
committerJochen Topf <jochen@topf.org>2015-04-28 21:55:12 +0200
commitc8a334b10512a708d19ed5fd42aa60424290a371 (patch)
treebeabdda90200e1d6629285def16674b4c5553b90
parentde091284afa48851da7de95f544e75d6800d4e8a (diff)
downloadtaginfo-c8a334b10512a708d19ed5fd42aa60424290a371.tar
taginfo-c8a334b10512a708d19ed5fd42aa60424290a371.tar.gz
Some experimental code to find Unicode properties in keys.
OSM allows nearly all Unicode characters in keys and values. This is some experimental code to find keys that contain unusual characters. Currently this is not run or used in the normal setup, but I thought I'd throw it out there in case somebody wants to play with it. I am not sure yet myself where this is going.
-rw-r--r--sources/db/pre.sql13
-rwxr-xr-xsources/db/update.sh3
-rw-r--r--tagstats/.gitignore1
-rw-r--r--tagstats/Makefile8
-rw-r--r--tagstats/taginfo_unicode.cpp163
5 files changed, 187 insertions, 1 deletions
diff --git a/sources/db/pre.sql b/sources/db/pre.sql
index 9faf613..165dd38 100644
--- a/sources/db/pre.sql
+++ b/sources/db/pre.sql
@@ -139,3 +139,16 @@ CREATE TABLE prevalent_roles (
fraction REAL
);
+-- Per-character details for keys containing unusual characters. Rows are
+-- inserted by tagstats/taginfo_unicode.cpp, one row per character of a key:
+--   key       - the key the character belongs to
+--   num       - 0-based position of the character within the key
+--   utf8      - the character itself, encoded as UTF-8
+--   codepoint - the code point formatted as text ("U+" followed by hex digits)
+--   block     - integer value of the ICU UCHAR_BLOCK property
+--   category  - two-letter general category abbreviation, or "UNKNOWN"
+--   direction - integer value returned by ICU u_charDirection()
+--   name      - character name as returned by ICU u_charName()
+DROP TABLE IF EXISTS key_characters;
+
+CREATE TABLE key_characters (
+ key TEXT,
+ num INTEGER,
+ utf8 TEXT,
+ codepoint TEXT,
+ block INTEGER,
+ category TEXT,
+ direction INTEGER,
+ name TEXT
+);
+
diff --git a/sources/db/update.sh b/sources/db/update.sh
index faa5cde..add5a29 100755
--- a/sources/db/update.sh
+++ b/sources/db/update.sh
@@ -76,6 +76,9 @@ sqlite3 $DATABASE <post_similar_keys.sql
echo "`$DATECMD` Running update_characters... "
./update_characters.rb $DIR
+#echo "`$DATECMD` Running taginfo_unicode... "
+#./taginfo_unicode $DATABASE
+
echo "`$DATECMD` Running post_grades.sql... "
sqlite3 $DATABASE <post_grades.sql
diff --git a/tagstats/.gitignore b/tagstats/.gitignore
index 2d82327..a5a613a 100644
--- a/tagstats/.gitignore
+++ b/tagstats/.gitignore
@@ -1,4 +1,5 @@
similarity
+taginfo_unicode
tagstats
osmstats
taginfo-db.db
diff --git a/tagstats/Makefile b/tagstats/Makefile
index b266fe9..220ce56 100644
--- a/tagstats/Makefile
+++ b/tagstats/Makefile
@@ -40,10 +40,13 @@ LIB_EXPAT := -lexpat
LIB_PBF := -pthread -lz -lprotobuf-lite -losmpbf
LIB_GD := -lgd -lz -lm
LIB_SQLITE := -lsqlite3
+LIB_ICU := `pkg-config --libs icu-uc icu-io`
+
+CXXFLAGS_ICU := `pkg-config --cflags icu-uc icu-io`
.PHONY: all check indent install clean
-all: tagstats osmstats similarity
+all: tagstats osmstats similarity taginfo_unicode
osmstats: osmstats.cpp statistics_handler.hpp
$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE)
@@ -54,6 +57,9 @@ tagstats: tagstats.cpp tagstats_handler.hpp statistics_handler.hpp string_store.
similarity: similarity.cpp sqlite.hpp
$(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_SQLITE)
+# Experimental tool reporting the Unicode properties of characters in keys.
+# Unlike the other targets it is compiled and linked against ICU (CXXFLAGS_ICU / LIB_ICU).
+taginfo_unicode: taginfo_unicode.cpp sqlite.hpp
+ $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) $(CXXFLAGS_ICU) -o $@ $< $(LDFLAGS) $(LIB_SQLITE) $(LIB_ICU)
+
check:
cppcheck --enable=all tagstats.cpp osmstats.cpp
diff --git a/tagstats/taginfo_unicode.cpp b/tagstats/taginfo_unicode.cpp
new file mode 100644
index 0000000..bc58956
--- /dev/null
+++ b/tagstats/taginfo_unicode.cpp
@@ -0,0 +1,163 @@
+
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <vector>
+
+#include <unicode/schriter.h>
+#include <unicode/uchar.h>
+#include <unicode/unistr.h>
+
+#include "sqlite.hpp"
+
+/**
+ * Translate a numeric general category value (the kind of value obtained
+ * from u_charType()) into its two-letter Unicode abbreviation.
+ *
+ * @param category Numeric category; values 1 through 29 are recognised.
+ * @return Pointer to a static string with the abbreviation, or "UNKNOWN"
+ *         for any value outside the recognised range.
+ */
+const char* category_to_string(int8_t category) {
+    // Entry i holds the abbreviation for category value i + 1.
+    static const char* const abbreviations[] = {
+        "Lu", "Ll", "Lt", "Lm", "Lo",  // letters: upper, lower, title, modifier, other
+        "Mn", "Me", "Mc",              // marks: non-spacing, enclosing, combining spacing
+        "Nd", "Nl", "No",              // numbers: decimal digit, letter, other
+        "Zs", "Zl", "Zp",              // separators: space, line, paragraph
+        "Cc", "Cf", "Co", "Cs",        // control, format, private use, surrogate
+        "Pd", "Ps", "Pe", "Pc", "Po",  // punctuation: dash, start, end, connector, other
+        "Sm", "Sc", "Sk", "So",        // symbols: math, currency, modifier, other
+        "Pi", "Pf"                     // punctuation cont.: initial, final
+    };
+    const int known = static_cast<int>(sizeof(abbreviations) / sizeof(abbreviations[0]));
+
+    const int index = static_cast<int>(category) - 1;
+    if (index < 0 || index >= known) {
+        return "UNKNOWN";
+    }
+    return abbreviations[index];
+}
+
+// Returns true if every byte of the key is accepted by std::isalnum() or is
+// one of '_', ':', ' ', '.', '-'. Such keys are not analysed any further.
+static bool is_plain_key(const char* text) {
+    for (const char* t = text; *t; ++t) {
+        // Convert to unsigned char first: the bytes of multi-byte UTF-8
+        // sequences are negative where char is signed, and passing a negative
+        // value to std::isalnum() is undefined behaviour.
+        const unsigned char c = static_cast<unsigned char>(*t);
+        if (!(std::isalnum(c) || c == '_' || c == ':' || c == ' ' || c == '.' || c == '-')) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Returns true if at least one code point of the string is not printable,
+// does not have left-to-right direction, or has a general category other
+// than Lu, Ll, Nd, Zs, Pd, Pc or Po.
+static bool has_unusual_character(const icu::UnicodeString& us) {
+    icu::StringCharacterIterator it(us);
+    while (it.hasNext()) {
+        // next32PostInc() steps over whole code points, so a surrogate pair
+        // is looked at exactly once.
+        const UChar32 codepoint = it.next32PostInc();
+        if (!u_isprint(codepoint)) {
+            return true;
+        }
+        if (u_charDirection(codepoint) != U_LEFT_TO_RIGHT) {
+            return true;
+        }
+        switch (u_charType(codepoint)) {
+            case U_UPPERCASE_LETTER:
+            case U_LOWERCASE_LETTER:
+            case U_DECIMAL_DIGIT_NUMBER:
+            case U_SPACE_SEPARATOR:
+            case U_DASH_PUNCTUATION:
+            case U_CONNECTOR_PUNCTUATION:
+            case U_OTHER_PUNCTUATION:
+                break;
+            default:
+                return true;
+        }
+    }
+    return false;
+}
+
+/**
+ * Record the Unicode properties of every character of a key, if the key
+ * contains unusual characters.
+ *
+ * Keys made up only of plain characters (see is_plain_key()) and keys
+ * without any unusual character (see has_unusual_character()) are skipped.
+ * For all other keys one row per code point is written through the insert
+ * statement, binding in this order: key, position, UTF-8 text, "U+xxxx"
+ * code point, block, general category, direction, character name.
+ *
+ * @param text   The key as a NUL-terminated UTF-8 string.
+ * @param us     The same key as an ICU string.
+ * @param insert Statement with eight placeholders that is executed once
+ *               per code point.
+ */
+void get_unicode_info(const char* text, const icu::UnicodeString& us, Sqlite::Statement& insert) {
+    if (is_plain_key(text) || !has_unusual_character(us)) {
+        return;
+    }
+
+    int num = 0;
+    icu::StringCharacterIterator it(us);
+    while (it.hasNext()) {
+        // Iterate by code point, not by UTF-16 code unit, so characters
+        // outside the BMP produce a single row and num counts characters.
+        const UChar32 codepoint = it.next32PostInc();
+
+        const int8_t chartype = u_charType(codepoint);
+
+        char buffer[100] = "";
+        UErrorCode errorCode = U_ZERO_ERROR;
+        u_charName(codepoint, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
+        if (U_FAILURE(errorCode)) {
+            // Do not store a partial or unterminated name.
+            buffer[0] = '\0';
+        }
+        buffer[sizeof(buffer) - 1] = '\0';
+
+        const UCharDirection direction = u_charDirection(codepoint);
+        const int32_t block = u_getIntPropertyValue(codepoint, UCHAR_BLOCK);
+
+        const icu::UnicodeString ustr(codepoint);
+        std::string str;
+        ustr.toUTF8String(str);
+
+        char uplus[10];
+        snprintf(uplus, sizeof(uplus), "U+%04x", static_cast<unsigned int>(codepoint));
+
+        insert.
+            bind_text(text).
+            bind_int(num).
+            bind_text(str.c_str()).
+            bind_text(uplus).
+            bind_int(block).
+            bind_text(category_to_string(chartype)).
+            bind_int(direction).
+            bind_text(buffer).
+            execute();
+
+        ++num;
+    }
+}
+
+/**
+ * Walk over a buffer of consecutive NUL-terminated UTF-8 keys and hand each
+ * one to get_unicode_info().
+ *
+ * @param begin  Start of the first key.
+ * @param end    One past the terminating NUL of the last key.
+ * @param insert Statement passed on to get_unicode_info() for every key.
+ */
+void find_unicode_info(const char* begin, const char* end, Sqlite::Statement& insert) {
+    const char* key = begin;
+    while (key != end) {
+        const icu::UnicodeString converted = icu::UnicodeString::fromUTF8(key);
+        get_unicode_info(key, converted, insert);
+
+        // Skip the key and its terminating NUL to reach the next one.
+        key += strlen(key) + 1;
+    }
+}
+
+/**
+ * Entry point: taginfo_unicode DATABASE
+ *
+ * Reads all keys whose "characters" classification is neither 'plain' nor
+ * 'colon' from the keys table of the given database, then writes the
+ * per-character details of the keys with unusual characters into the
+ * key_characters table inside a single transaction.
+ *
+ * @return 1 if the number of arguments is wrong, 0 otherwise.
+ */
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        std::cerr << "taginfo_unicode DATABASE\n";
+        return 1;
+    }
+
+    // All selected keys, stored back to back, each followed by a NUL byte.
+    std::string keys;
+
+    Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE);
+    Sqlite::Statement select(db, "SELECT key FROM keys WHERE characters NOT IN ('plain', 'colon') ORDER BY key");
+    while (select.read()) {
+        keys.append(select.get_string(0));
+        keys.push_back('\0');
+    }
+
+    Sqlite::Statement insert(db, "INSERT INTO key_characters (key, num, utf8, codepoint, block, category, direction, name) VALUES (?, ?, ?, ?, ?, ?, ?, ?)");
+
+    const char* const first = keys.c_str();
+    const char* const last = first + keys.size();
+
+    db.begin_transaction();
+    find_unicode_info(first, last, insert);
+    db.commit();
+
+    return 0;
+}
+