diff options
-rw-r--r-- | sources/db/post.sql | 7 | ||||
-rw-r--r-- | sources/db/pre.sql | 1 | ||||
-rwxr-xr-x | sources/db/update.sh | 3 | ||||
-rwxr-xr-x | sources/db/update_characters.pl | 66 | ||||
-rw-r--r-- | web/views/reports/characters_in_keys.erb | 37 |
5 files changed, 113 insertions, 1 deletions
diff --git a/sources/db/post.sql b/sources/db/post.sql index 2316657..8f0b2fb 100644 --- a/sources/db/post.sql +++ b/sources/db/post.sql @@ -38,6 +38,13 @@ INSERT INTO stats (key, value) SELECT 'num_keypairs_on_nodes', count(*) FROM INSERT INTO stats (key, value) SELECT 'num_keypairs_on_ways', count(*) FROM keypairs WHERE count_ways > 0; INSERT INTO stats (key, value) SELECT 'num_keypairs_on_relations', count(*) FROM keypairs WHERE count_relations > 0; +INSERT INTO stats (key, value) SELECT 'characters_in_keys_plain', count(*) FROM keys WHERE characters='plain'; +INSERT INTO stats (key, value) SELECT 'characters_in_keys_colon', count(*) FROM keys WHERE characters='colon'; +INSERT INTO stats (key, value) SELECT 'characters_in_keys_letters', count(*) FROM keys WHERE characters='letters'; +INSERT INTO stats (key, value) SELECT 'characters_in_keys_space', count(*) FROM keys WHERE characters='space'; +INSERT INTO stats (key, value) SELECT 'characters_in_keys_problem', count(*) FROM keys WHERE characters='problem'; +INSERT INTO stats (key, value) SELECT 'characters_in_keys_rest', count(*) FROM keys WHERE characters='rest'; + UPDATE keys SET prevalent_values=( SELECT group_concat(value, '|') FROM ( SELECT value FROM tags t diff --git a/sources/db/pre.sql b/sources/db/pre.sql index 0163089..998b20e 100644 --- a/sources/db/pre.sql +++ b/sources/db/pre.sql @@ -47,6 +47,7 @@ CREATE TABLE keys ( in_josm INTEGER DEFAULT 0, in_potlatch INTEGER DEFAULT 0, in_merkaartor INTEGER DEFAULT 0, + characters VARCHAR, prevalent_values TEXT ); diff --git a/sources/db/update.sh b/sources/db/update.sh index 3a48e74..4782219 100755 --- a/sources/db/update.sh +++ b/sources/db/update.sh @@ -30,6 +30,9 @@ cd $DIR bzcat $DIR/planet.osm.bz2 | $HERE/osmium_tagstats - cd $HERE +echo "Running update_characters..." +./update_characters.pl + echo "Running post.sql..." perl -pe "s|__DIR__|$DIR|" post.sql | sqlite3 $DATABASE diff --git a/sources/db/update_characters.pl b/sources/db/update_characters.pl new file mode 100755 index 0000000..2161260 --- /dev/null +++ b/sources/db/update_characters.pl @@ -0,0 +1,66 @@ +#!/usr/bin/perl +#------------------------------------------------------------------------------ +# +# Taginfo source: DB +# +# update_characters.pl +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2010 Jochen Topf <jochen@remote.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +#------------------------------------------------------------------------------ + +use DBI; + +my $dir = $ARGV[0] || '.'; + +my $dbh = DBI->connect("dbi:SQLite:dbname=$dir/taginfo-db.db", '', ''); +$dbh->{unicode} = 1; +#$dbh->{sqlite_unicode} = 1; + +my @regexes = ( + ['plain', qr{^[a-z]([a-z_]*[a-z])?$}], + ['colon', qr{^[a-z][a-z_:]*[a-z]$}], + ['letters', qr{^[\p{L}\p{M}]([\p{L}\p{M}\p{N}_:]*[\p{L}\p{M}\p{N}])?$}], + ['space', qr{[\s\p{Z}]}], + ['problem', qr{[=+/&<>;\@'"?%#\\,\p{C}]}] +); + +my %keys; +my $results = $dbh->selectcol_arrayref('SELECT key FROM keys'); + +ROW: foreach my $key (@$results) { + $keys{$key} = 'rest'; + foreach my $r (@regexes) { + if ($key =~ $r->[1]) { + $keys{$key} = $r->[0]; + next ROW; + } + } +} + +$dbh->do('BEGIN TRANSACTION'); + +foreach my $key (keys %keys) { + $dbh->do('UPDATE keys SET characters=? WHERE key=?', undef, $keys{$key}, $key); +} + +$dbh->do('COMMIT'); + + +#-- THE END ------------------------------------------------------------------- diff --git a/web/views/reports/characters_in_keys.erb b/web/views/reports/characters_in_keys.erb index 1396ae9..5c56710 100644 --- a/web/views/reports/characters_in_keys.erb +++ b/web/views/reports/characters_in_keys.erb @@ -1,4 +1,39 @@ <h1><%= @title %></h1> -<p>Any valid Unicode character can appear in an OSM key but usually only lower case latin letters (a-z) and the underscore (_) are used.</p> +<p>Any valid Unicode character can appear in an OSM key but usually only lower +case latin letters (a-z) and the underscore (_) are used, sometimes the colon +(:) is added as a separator character.</p> + +<table class="list"> + <tr> + <td class="number"><%= @stats['characters_in_keys_plain'] %></td> + <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_plain'] * 100.0 / @stats['num_keys'] %>%</td> + <td>Plain keys (only latin lowercase letters (a-z) and underscore (_), first and last characters are letters).</td> + </tr> + <tr> + <td class="number"><%= @stats['characters_in_keys_colon'] %></td> + <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_colon'] * 100.0 / @stats['num_keys'] %>%</td> + <td>Like plain keys but with one or more colons (:) inside.</td> + </tr> + <tr> + <td class="number"><%= @stats['characters_in_keys_letters'] %></td> + <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_letters'] * 100.0 / @stats['num_keys'] %>%</td> + <td>Like plain keys but with uppercase latin letters or letters from other alphabets (and possibly with colons).</td> + </tr> + <tr> + <td class="number"><%= @stats['characters_in_keys_space'] %></td> + <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_space'] * 100.0 / @stats['num_keys'] %>%</td> + <td>Contains whitespace (space, tab, new line, carriage return, or whitespace characters from other alphabets).</td> + </tr> + <tr> + <td class="number"><%= @stats['characters_in_keys_problem'] %></td> + <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_problem'] * 100.0 / @stats['num_keys'] %>%</td> + <td>Contains possibly problematic characters (=+/&<>;'"?%#@\,) or control characters.</td> + </tr> + <tr> + <td class="number"><%= @stats['characters_in_keys_rest'] %></td> + <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_rest'] * 100.0 / @stats['num_keys'] %>%</td> + <td>Everything else.</td> + </tr> +</table> |