diff options
author | Jochen Topf <jochen@topf.org> | 2015-03-19 16:36:41 +0100 |
---|---|---|
committer | Jochen Topf <jochen@topf.org> | 2015-03-19 16:36:41 +0100 |
commit | 70f35022161f625e26dba93bcba6c6383e798623 (patch) | |
tree | ed76e754e2313b285d2d3c9125f478dc4fc02b6f /sources | |
parent | c616b58a82011608e4632e9699aa7d195f6f3731 (diff) | |
download | taginfo-70f35022161f625e26dba93bcba6c6383e798623.tar taginfo-70f35022161f625e26dba93bcba6c6383e798623.tar.gz |
An attempt to classify all keys into 'good', 'bad', and 'unknown'.
Of course, this is very rough. It could be used to show "bad" keys in editors, etc.
Diffstat (limited to 'sources')
-rw-r--r-- | sources/db/post.sql | 7 | ||||
-rw-r--r-- | sources/db/post_grades.sql | 46 | ||||
-rw-r--r-- | sources/db/pre.sql | 3 | ||||
-rwxr-xr-x | sources/db/update.sh | 3 |
4 files changed, 58 insertions, 1 deletions
diff --git a/sources/db/post.sql b/sources/db/post.sql
index c321377..49cc634 100644
--- a/sources/db/post.sql
+++ b/sources/db/post.sql
@@ -38,6 +38,13 @@ INSERT INTO stats (key, value) SELECT 'characters_in_keys_space', count(*) FRO
 INSERT INTO stats (key, value) SELECT 'characters_in_keys_problem', count(*) FROM keys WHERE characters='problem';
 INSERT INTO stats (key, value) SELECT 'characters_in_keys_rest', count(*) FROM keys WHERE characters='rest';
 
+INSERT INTO stats (key, value) SELECT 'grade_bad', count(*) FROM keys WHERE grade='b';
+INSERT INTO stats (key, value) SELECT 'grade_unknown', count(*) FROM keys WHERE grade='u';
+INSERT INTO stats (key, value) SELECT 'grade_good', count(*) FROM keys WHERE grade='g';
+INSERT INTO stats (key, value) SELECT 'grade_key_count_bad', coalesce(sum(count_all), 0) FROM keys WHERE grade='b';
+INSERT INTO stats (key, value) SELECT 'grade_key_count_unknown', coalesce(sum(count_all), 0) FROM keys WHERE grade='u';
+INSERT INTO stats (key, value) SELECT 'grade_key_count_good', coalesce(sum(count_all), 0) FROM keys WHERE grade='g';
+
 INSERT INTO stats (key, value) VALUES ('objects', (SELECT sum(value) FROM stats WHERE key IN ('nodes', 'ways', 'relations')));
 INSERT INTO stats (key, value) VALUES ('object_tags', (SELECT sum(value) FROM stats WHERE key IN ('node_tags', 'way_tags', 'relation_tags')));
 
diff --git a/sources/db/post_grades.sql b/sources/db/post_grades.sql
new file mode 100644
index 0000000..980df01
--- /dev/null
+++ b/sources/db/post_grades.sql
@@ -0,0 +1,46 @@
+--
+-- Taginfo source: Database
+--
+-- post_grades.sql
+--
+
+.bail ON
+
+PRAGMA journal_mode = OFF;
+PRAGMA synchronous = OFF;
+PRAGMA temp_store = MEMORY;
+PRAGMA cache_size = 5000000;
+
+-- ============================================================================
+
+-- BAD KEYS:
+
+-- All keys containing whitespace or other problematic characters.
+UPDATE keys SET grade='b' WHERE characters IN ('space', 'problem');
+
+-- All keys documented in the wiki but never used.
+UPDATE keys SET grade='b' WHERE characters IS NULL;
+
+-- All other keys not used at least 10 times with strange characters in them.
+UPDATE keys SET grade='b' WHERE count_all < 10 AND characters='rest';
+
+-- ============================================================================
+
+-- GOOD KEYS:
+
+-- Documented in the wiki or used more than 100 times if they use letters,
+-- underscores and colons only.
+UPDATE keys SET grade='g' WHERE ((in_wiki=1 AND count_all > 0) OR (count_all > 100)) AND characters IN ('plain', 'colon', 'letters');
+
+-- Languages can contain '-' characters, so we have a few extra "good" keys.
+UPDATE keys SET grade='g' WHERE key LIKE '%name:%-%';
+
+-- Everything used more than 1000 times is good. Of course that's not the case,
+-- but we avoid overwhelming users with stuff they think they need to fix.
+UPDATE keys SET grade='g' WHERE count_all > 1000;
+
+-- ============================================================================
+
+-- SELECT grade, count(*), sum(count_all) FROM keys GROUP BY grade;
+
+
diff --git a/sources/db/pre.sql b/sources/db/pre.sql
index eab3e47..9faf613 100644
--- a/sources/db/pre.sql
+++ b/sources/db/pre.sql
@@ -28,7 +28,8 @@ CREATE TABLE keys (
     cells_ways INTEGER DEFAULT 0,
     in_wiki INTEGER DEFAULT 0,
     in_projects INTEGER DEFAULT 0,
-    characters VARCHAR
+    characters VARCHAR,
+    grade CHAR DEFAULT 'u'
 );
 
 DROP TABLE IF EXISTS prevalent_values;
diff --git a/sources/db/update.sh b/sources/db/update.sh
index 44f1fa6..faa5cde 100755
--- a/sources/db/update.sh
+++ b/sources/db/update.sh
@@ -76,6 +76,9 @@ sqlite3 $DATABASE <post_similar_keys.sql
 echo "`$DATECMD` Running update_characters... "
 ./update_characters.rb $DIR
 
+echo "`$DATECMD` Running post_grades.sql... "
+sqlite3 $DATABASE <post_grades.sql
+
 echo "`$DATECMD` Running post_indexes.sql... "
 sqlite3 $DATABASE <post_indexes.sql
 