tagstats/taginfo_unicode.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163

#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <vector>

#include <unicode/schriter.h>
#include <unicode/uchar.h>
#include <unicode/unistr.h>

#include "sqlite.hpp"

/**
 * Map a numeric character category to its two-letter abbreviation.
 *
 * Categories 1 through 29 each have an abbreviation (letters, marks,
 * numbers, separators, control characters, punctuation and symbols).
 * Every other value, including 0 and negative numbers, yields "UNKNOWN".
 *
 * @param category Numeric category code.
 * @return Pointer to a static string; never null.
 */
const char* category_to_string(int8_t category) {
    // Indexed by category number. Slot 0 only exists so that the index
    // equals the category; it has no abbreviation of its own.
    static const char* const abbreviations[] = {
        "UNKNOWN", //  0: no abbreviation assigned here
        "Lu",      //  1: uppercase letter
        "Ll",      //  2: lowercase letter
        "Lt",      //  3: titlecase letter
        "Lm",      //  4: modifier letter
        "Lo",      //  5: other letter
        "Mn",      //  6: non-spacing mark
        "Me",      //  7: enclosing mark
        "Mc",      //  8: combining spacing mark
        "Nd",      //  9: decimal digit number
        "Nl",      // 10: letter number
        "No",      // 11: other number
        "Zs",      // 12: space separator
        "Zl",      // 13: line separator
        "Zp",      // 14: paragraph separator
        "Cc",      // 15: control char
        "Cf",      // 16: format char
        "Co",      // 17: private use char
        "Cs",      // 18: surrogate
        "Pd",      // 19: dash punctuation
        "Ps",      // 20: start punctuation
        "Pe",      // 21: end punctuation
        "Pc",      // 22: connector punctuation
        "Po",      // 23: other punctuation
        "Sm",      // 24: math symbol
        "Sc",      // 25: currency symbol
        "Sk",      // 26: modifier symbol
        "So",      // 27: other symbol
        "Pi",      // 28: initial punctuation
        "Pf"       // 29: final punctuation
    };
    const int entries = static_cast<int>(sizeof(abbreviations) / sizeof(abbreviations[0]));

    const int index = category;
    if (index < 0 || index >= entries) {
        return "UNKNOWN";
    }
    return abbreviations[index];
}

/**
 * Store one row per code point for a key that contains "unusual" characters.
 *
 * The function works in three steps:
 *  1. If every byte of @p text is alphanumeric (per std::isalnum) or one of
 *     '_', ':', ' ', '.', '-', nothing is written.
 *  2. Otherwise @p us is scanned; the key counts as unusual as soon as one
 *     code point is not printable, has a direction other than 0, or has a
 *     category outside the accepted set listed below.
 *  3. For an unusual key, @p insert is executed once per code point with
 *     (text, position, UTF-8 form, "U+xxxx", block, category abbreviation,
 *     direction, character name).
 *
 * @param text   Key as a NUL-terminated byte string; bound as the first value.
 * @param us     ICU string that is iterated; expected to hold the same key
 *               as @p text.
 * @param insert Statement with eight placeholders, executed once per code
 *               point of an unusual key.
 */
void get_unicode_info(const char* text, const icu::UnicodeString& us, Sqlite::Statement& insert) {
    bool allokay = true;
    for (const char* t = text; *t; ++t) {
        // Convert to unsigned char before calling std::isalnum: passing a
        // negative value (any non-ASCII byte where char is signed) is
        // undefined behaviour.
        const unsigned char c = static_cast<unsigned char>(*t);
        if (!(std::isalnum(c) || c == '_' || c == ':' || c == ' ' || c == '.' || c == '-')) {
            allokay = false;
            break;
        }
    }

    if (allokay) {
        return;
    }

    // Iterate by code point (next32), not by UTF-16 code unit (next), so a
    // character stored as a surrogate pair is looked at exactly once.
    bool unusual = false;
    for (icu::StringCharacterIterator it(us); it.hasNext(); it.next32()) {
        const UChar32 codepoint = it.current32();
        const int8_t chartype = u_charType(codepoint);
        if (! u_isprint(codepoint)) {
            unusual = true;
            break;
        }
        if (u_charDirection(codepoint) != 0) {
            unusual = true;
            break;
        }
        if (chartype !=  1 && // UPPERCASE_LETTER
            chartype !=  2 && // LOWERCASE_LETTER
            chartype !=  9 && // DECIMAL_DIGIT_NUMBER
            chartype != 12 && // SPACE_SEPARATOR
            chartype != 19 && // DASH_PUNCTUATION
            chartype != 22 && // CONNECTOR_PUNCTUATION
            chartype != 23) { // OTHER_PUNCTUATION
            unusual = true;
            break;
        }
    }

    if (!unusual) {
        return;
    }

    // num is the zero-based position of the code point within the key.
    int num = 0;
    for (icu::StringCharacterIterator it(us); it.hasNext(); it.next32(), ++num) {
        const UChar32 codepoint = it.current32();

        const int8_t chartype = u_charType(codepoint);

        // Start with an empty name and force termination afterwards so the
        // buffer is always a valid C string, even if the lookup fails or
        // the name had to be truncated.
        char buffer[100] = "";
        UErrorCode errorCode = U_ZERO_ERROR;
        u_charName(codepoint, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
        buffer[sizeof(buffer) - 1] = '\0';

        const UCharDirection direction = u_charDirection(codepoint);
        const int32_t block = u_getIntPropertyValue(codepoint, UCHAR_BLOCK);

        // UTF-8 encoding of just this one code point.
        icu::UnicodeString ustr(codepoint);
        std::string str;
        ustr.toUTF8String(str);

        // "U+" plus at most six hex digits plus the terminator fits in 10.
        char uplus[10];
        snprintf(uplus, sizeof(uplus), "U+%04x", static_cast<unsigned int>(codepoint));

        insert.
            bind_text(text).
            bind_int(num).
            bind_text(str.c_str()).
            bind_text(uplus).
            bind_int(block).
            bind_text(category_to_string(chartype)).
            bind_int(direction).
            bind_text(buffer).
            execute();
    }
}

void find_unicode_info(const char* begin, const char* end, Sqlite::Statement& insert) {
    for (; begin != end; begin += strlen(begin) + 1) {
        get_unicode_info(begin, icu::UnicodeString::fromUTF8(begin), insert);
    }
}

/**
 * Entry point: taginfo_unicode DATABASE
 *
 * Opens the given database read-write, collects every key whose
 * "characters" value is neither 'plain' nor 'colon', and fills the
 * key_characters table via find_unicode_info() inside one transaction.
 *
 * @return 1 if the argument count is wrong, 0 otherwise.
 */
int main(int argc, char *argv[]) {
    const bool have_database_argument = (argc == 2);
    if (!have_database_argument) {
        std::cerr << "taginfo_unicode DATABASE\n";
        return 1;
    }

    // All selected keys, each followed by a NUL byte, packed into one
    // buffer so find_unicode_info() can walk them as C strings.
    std::string packed_keys;

    Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE);
    Sqlite::Statement select(db, "SELECT key FROM keys WHERE characters NOT IN ('plain', 'colon') ORDER BY key");
    while (select.read()) {
        packed_keys += select.get_string(0);
        packed_keys.push_back('\0');
    }

    Sqlite::Statement insert(db, "INSERT INTO key_characters (key, num, utf8, codepoint, block, category, direction, name) VALUES (?, ?, ?, ?, ?, ?, ?, ?)");

    db.begin_transaction();
    const char* const first = packed_keys.c_str();
    const char* const last = first + packed_keys.size();
    find_unicode_info(first, last, insert);
    db.commit();

    return 0;
}