diff options
-rw-r--r-- | sources/db/pre.sql | 13 | ||||
-rwxr-xr-x | sources/db/update.sh | 3 | ||||
-rw-r--r-- | tagstats/.gitignore | 1 | ||||
-rw-r--r-- | tagstats/Makefile | 8 | ||||
-rw-r--r-- | tagstats/taginfo_unicode.cpp | 163 |
5 files changed, 187 insertions, 1 deletions
diff --git a/sources/db/pre.sql b/sources/db/pre.sql index 9faf613..165dd38 100644 --- a/sources/db/pre.sql +++ b/sources/db/pre.sql @@ -139,3 +139,16 @@ CREATE TABLE prevalent_roles ( fraction REAL ); +DROP TABLE IF EXISTS key_characters; + +CREATE TABLE key_characters ( + key TEXT, + num INTEGER, + utf8 TEXT, + codepoint TEXT, + block INTEGER, + category TEXT, + direction INTEGER, + name TEXT +); + diff --git a/sources/db/update.sh b/sources/db/update.sh index faa5cde..add5a29 100755 --- a/sources/db/update.sh +++ b/sources/db/update.sh @@ -76,6 +76,9 @@ sqlite3 $DATABASE <post_similar_keys.sql echo "`$DATECMD` Running update_characters... " ./update_characters.rb $DIR +#echo "`$DATECMD` Running taginfo_unicode... " +#./taginfo_unicode $DATABASE + echo "`$DATECMD` Running post_grades.sql... " sqlite3 $DATABASE <post_grades.sql diff --git a/tagstats/.gitignore b/tagstats/.gitignore index 2d82327..a5a613a 100644 --- a/tagstats/.gitignore +++ b/tagstats/.gitignore @@ -1,4 +1,5 @@ similarity +taginfo_unicode tagstats osmstats taginfo-db.db diff --git a/tagstats/Makefile b/tagstats/Makefile index b266fe9..220ce56 100644 --- a/tagstats/Makefile +++ b/tagstats/Makefile @@ -40,10 +40,13 @@ LIB_EXPAT := -lexpat LIB_PBF := -pthread -lz -lprotobuf-lite -losmpbf LIB_GD := -lgd -lz -lm LIB_SQLITE := -lsqlite3 +LIB_ICU := `pkg-config --libs icu-uc icu-io` + +CXXFLAGS_ICU := `pkg-config --cflags icu-uc icu-io` .PHONY: all check indent install clean -all: tagstats osmstats similarity +all: tagstats osmstats similarity taginfo_unicode osmstats: osmstats.cpp statistics_handler.hpp $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) -o $@ $< $(LDFLAGS) $(LIB_EXPAT) $(LIB_PBF) $(LIB_SQLITE) @@ -54,6 +57,9 @@ tagstats: tagstats.cpp tagstats_handler.hpp statistics_handler.hpp string_store. similarity: similarity.cpp sqlite.hpp $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) -o $@ $< $(LDFLAGS) $(LIB_SQLITE) +taginfo_unicode: taginfo_unicode.cpp sqlite.hpp + $(CXX) $(CXXFLAGS) $(CXXFLAGS_WARNINGS) $(CXXFLAGS_FEATURES) $(CXXFLAGS_ICU) -o $@ $< $(LDFLAGS) $(LIB_SQLITE) $(LIB_ICU) + check: cppcheck --enable=all tagstats.cpp osmstats.cpp diff --git a/tagstats/taginfo_unicode.cpp b/tagstats/taginfo_unicode.cpp new file mode 100644 index 0000000..bc58956 --- /dev/null +++ b/tagstats/taginfo_unicode.cpp @@ -0,0 +1,163 @@ + +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <iostream> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <vector> + +#include <unicode/schriter.h> +#include <unicode/uchar.h> +#include <unicode/unistr.h> + +#include "sqlite.hpp" + +const char* category_to_string(int8_t category) { + switch (category) { + // letters + case 1: return "Lu"; // uppercase letter + case 2: return "Ll"; // lowercase letter + case 3: return "Lt"; // titlecase letter + case 4: return "Lm"; // modifier letter + case 5: return "Lo"; // other letter + // marks + case 6: return "Mn"; // non-spacing mark + case 7: return "Me"; // enclosing mark + case 8: return "Mc"; // combining spacing mark + // numbers + case 9: return "Nd"; // decimal digit number + case 10: return "Nl"; // letter number + case 11: return "No"; // other number + // separators + case 12: return "Zs"; // space separator + case 13: return "Zl"; // line separator + case 14: return "Zp"; // paragraph separator + // control characters etc. + case 15: return "Cc"; // control char + case 16: return "Cf"; // format char + case 17: return "Co"; // private use char + case 18: return "Cs"; // surrogate + // punctuations + case 19: return "Pd"; // dash punctuation + case 20: return "Ps"; // start punctuation + case 21: return "Pe"; // end punctuation + case 22: return "Pc"; // connector punctuation + case 23: return "Po"; // other punctuation + // symbols + case 24: return "Sm"; // math symbol + case 25: return "Sc"; // currency symbol + case 26: return "Sk"; // modifier symbol + case 27: return "So"; // other symbol + // punctuations cont. + case 28: return "Pi"; // initial punctuation + case 29: return "Pf"; // final punctuation + default: + return "UNKNOWN"; + } +} + +void get_unicode_info(const char* text, const icu::UnicodeString& us, Sqlite::Statement& insert) { + bool allokay = true; + for (const char* t = text; *t; ++t) { + if (!(std::isalnum(*t) || *t == '_' || *t == ':' || *t == ' ' || *t == '.' || *t == '-')) { + allokay = false; + break; + } + } + + if (allokay) { + return; + } + + bool unusual = false; + for (icu::StringCharacterIterator it(us); it.hasNext(); it.next()) { + UChar32 codepoint = it.current32(); + int8_t chartype = u_charType(codepoint); + if (! u_isprint(codepoint)) { + unusual = true; + break; + } + if (u_charDirection(codepoint) != 0) { + unusual = true; + break; + } + if (chartype != 1 && // UPPERCASE_LETTER + chartype != 2 && // LOWERCASE_LETTER + chartype != 9 && // DECIMAL_DIGIT_NUMBER + chartype != 12 && // SPACE_SEPARATOR + chartype != 19 && // DASH_PUNCTUATION + chartype != 22 && // CONNECTOR_PUNCTUATION + chartype != 23) { // OTHER_PUNCTUATION + unusual = true; + break; + } + } + + if (unusual) { + int num = 0; + for (icu::StringCharacterIterator it(us); it.hasNext(); it.next(), ++num) { + UChar32 codepoint = it.current32(); + + int8_t chartype = u_charType(codepoint); + + char buffer[100]; + UErrorCode errorCode = U_ZERO_ERROR; + u_charName(codepoint, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode); + + UCharDirection direction = u_charDirection(codepoint); + int32_t block = u_getIntPropertyValue(codepoint, UCHAR_BLOCK); + + icu::UnicodeString::UnicodeString ustr(codepoint); + std::string str; + ustr.toUTF8String(str); + + char uplus[10]; + snprintf(uplus, 10, "U+%04x", codepoint); + + insert. + bind_text(text). + bind_int(num). + bind_text(str.c_str()). + bind_text(uplus). + bind_int(block). + bind_text(category_to_string(chartype)). + bind_int(direction). + bind_text(buffer). + execute(); + } + } +} + +void find_unicode_info(const char* begin, const char* end, Sqlite::Statement& insert) { + for (; begin != end; begin += strlen(begin) + 1) { + get_unicode_info(begin, icu::UnicodeString::fromUTF8(begin), insert); + } +} + +int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "taginfo_unicode DATABASE\n"; + return 1; + } + + std::string data; + + Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE); + Sqlite::Statement select(db, "SELECT key FROM keys WHERE characters NOT IN ('plain', 'colon') ORDER BY key"); + while (select.read()) { + data += select.get_string(0); + data += '\0'; + } + + + Sqlite::Statement insert(db, "INSERT INTO key_characters (key, num, utf8, codepoint, block, category, direction, name) VALUES (?, ?, ?, ?, ?, ?, ?, ?)"); + db.begin_transaction(); + find_unicode_info(data.c_str(), data.c_str() + data.size(), insert); + db.commit(); + + return 0; +} + |