1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
|
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <unicode/schriter.h>
#include <unicode/uchar.h>
#include <unicode/unistr.h>
#include "sqlite.hpp"
/**
 * Translate a numeric character category (the value this file obtains
 * from u_charType()) into its two-letter Unicode general category
 * abbreviation.
 *
 * @param category Numeric category value.
 * @return Pointer to a static string: the abbreviation for values 1..29,
 *         or "UNKNOWN" for every other value (including 0 and negatives).
 */
const char* category_to_string(int8_t category) {
    // Lookup table indexed directly by the category value. Slot 0 is a
    // placeholder because only the values 1..29 have an abbreviation here.
    static const char* const abbreviations[] = {
        nullptr, //  0: no abbreviation
        "Lu",    //  1: uppercase letter
        "Ll",    //  2: lowercase letter
        "Lt",    //  3: titlecase letter
        "Lm",    //  4: modifier letter
        "Lo",    //  5: other letter
        "Mn",    //  6: non-spacing mark
        "Me",    //  7: enclosing mark
        "Mc",    //  8: combining spacing mark
        "Nd",    //  9: decimal digit number
        "Nl",    // 10: letter number
        "No",    // 11: other number
        "Zs",    // 12: space separator
        "Zl",    // 13: line separator
        "Zp",    // 14: paragraph separator
        "Cc",    // 15: control char
        "Cf",    // 16: format char
        "Co",    // 17: private use char
        "Cs",    // 18: surrogate
        "Pd",    // 19: dash punctuation
        "Ps",    // 20: start punctuation
        "Pe",    // 21: end punctuation
        "Pc",    // 22: connector punctuation
        "Po",    // 23: other punctuation
        "Sm",    // 24: math symbol
        "Sc",    // 25: currency symbol
        "Sk",    // 26: modifier symbol
        "So",    // 27: other symbol
        "Pi",    // 28: initial punctuation
        "Pf"     // 29: final punctuation
    };
    const int entry_count = static_cast<int>(sizeof(abbreviations) / sizeof(abbreviations[0]));
    const int index = category;
    if (index < 1 || index >= entry_count) {
        return "UNKNOWN";
    }
    return abbreviations[index];
}
/**
 * Inspect one key and, if it contains unusual characters, write one row
 * per code point through the given insert statement.
 *
 * A key is skipped when
 *  - every byte is an alphanumeric character or one of "_ : . -" or a
 *    space, or
 *  - every code point is printable, has left-to-right direction and
 *    belongs to one of the categories Lu, Ll, Nd, Zs, Pd, Pc or Po.
 *
 * Otherwise every code point of the key is reported, not only the
 * unusual ones.
 *
 * @param text   The key as a NUL-terminated byte string.
 * @param us     The same key as an ICU string; iterated code point by
 *               code point.
 * @param insert Statement that receives, in this order: key, index of the
 *               code point within the key, the code point as UTF-8, its
 *               "U+xxxx" notation, Unicode block, category abbreviation,
 *               direction and character name.
 */
void get_unicode_info(const char* text, const icu::UnicodeString& us, Sqlite::Statement& insert) {
    // Cheap byte-level pre-check so that simple keys never reach ICU.
    bool allokay = true;
    for (const char* t = text; *t; ++t) {
        // std::isalnum() must not be given a negative value, which a plain
        // char holding a non-ASCII UTF-8 byte can be.
        const unsigned char byte = static_cast<unsigned char>(*t);
        if (!(std::isalnum(byte) || byte == '_' || byte == ':' || byte == ' ' || byte == '.' || byte == '-')) {
            allokay = false;
            break;
        }
    }
    if (allokay) {
        return;
    }

    // First pass: decide whether the key contains anything unusual.
    // next32() steps by whole code points; stepping by code unit would
    // visit a character outside the BMP twice.
    bool unusual = false;
    for (icu::StringCharacterIterator it(us); it.hasNext(); it.next32()) {
        const UChar32 codepoint = it.current32();
        const int8_t chartype = u_charType(codepoint);
        if (!u_isprint(codepoint)) {
            unusual = true;
            break;
        }
        if (u_charDirection(codepoint) != U_LEFT_TO_RIGHT) {
            unusual = true;
            break;
        }
        if (chartype != U_UPPERCASE_LETTER &&
            chartype != U_LOWERCASE_LETTER &&
            chartype != U_DECIMAL_DIGIT_NUMBER &&
            chartype != U_SPACE_SEPARATOR &&
            chartype != U_DASH_PUNCTUATION &&
            chartype != U_CONNECTOR_PUNCTUATION &&
            chartype != U_OTHER_PUNCTUATION) {
            unusual = true;
            break;
        }
    }
    if (!unusual) {
        return;
    }

    // Second pass: emit one row for every code point of the key.
    int num = 0;
    for (icu::StringCharacterIterator it(us); it.hasNext(); it.next32(), ++num) {
        const UChar32 codepoint = it.current32();
        const int8_t chartype = u_charType(codepoint);

        // Zero-initialised, and one byte is withheld from ICU, so the
        // buffer is always NUL-terminated.
        char buffer[100] = "";
        UErrorCode errorCode = U_ZERO_ERROR;
        u_charName(codepoint, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer) - 1, &errorCode);
        if (U_FAILURE(errorCode)) {
            // Store an empty name rather than partial or undefined content.
            buffer[0] = '\0';
        }

        const UCharDirection direction = u_charDirection(codepoint);
        const int32_t block = u_getIntPropertyValue(codepoint, UCHAR_BLOCK);

        // UTF-8 encoding of just this code point.
        icu::UnicodeString ustr(codepoint);
        std::string str;
        ustr.toUTF8String(str);

        // "U+" plus up to six hex digits plus the terminator fits in 10 bytes.
        char uplus[10];
        std::snprintf(uplus, sizeof(uplus), "U+%04x", static_cast<unsigned int>(codepoint));

        insert.
            bind_text(text).
            bind_int(num).
            bind_text(str.c_str()).
            bind_text(uplus).
            bind_int(block).
            bind_text(category_to_string(chartype)).
            bind_int(direction).
            bind_text(buffer).
            execute();
    }
}
void find_unicode_info(const char* begin, const char* end, Sqlite::Statement& insert) {
for (; begin != end; begin += strlen(begin) + 1) {
get_unicode_info(begin, icu::UnicodeString::fromUTF8(begin), insert);
}
}
int main(int argc, char *argv[]) {
if (argc != 2) {
std::cerr << "taginfo_unicode DATABASE\n";
return 1;
}
std::string data;
Sqlite::Database db(argv[1], SQLITE_OPEN_READWRITE);
Sqlite::Statement select(db, "SELECT key FROM keys WHERE characters NOT IN ('plain', 'colon') ORDER BY key");
while (select.read()) {
data += select.get_string(0);
data += '\0';
}
Sqlite::Statement insert(db, "INSERT INTO key_characters (key, num, utf8, codepoint, block, category, direction, name) VALUES (?, ?, ?, ?, ?, ?, ?, ?)");
db.begin_transaction();
find_unicode_info(data.c_str(), data.c_str() + data.size(), insert);
db.commit();
return 0;
}
|