aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--sources/db/post.sql7
-rw-r--r--sources/db/pre.sql1
-rwxr-xr-xsources/db/update.sh3
-rwxr-xr-xsources/db/update_characters.pl66
-rw-r--r--web/views/reports/characters_in_keys.erb37
5 files changed, 113 insertions, 1 deletions
diff --git a/sources/db/post.sql b/sources/db/post.sql
index 2316657..8f0b2fb 100644
--- a/sources/db/post.sql
+++ b/sources/db/post.sql
@@ -38,6 +38,13 @@ INSERT INTO stats (key, value) SELECT 'num_keypairs_on_nodes', count(*) FROM
INSERT INTO stats (key, value) SELECT 'num_keypairs_on_ways', count(*) FROM keypairs WHERE count_ways > 0;
INSERT INTO stats (key, value) SELECT 'num_keypairs_on_relations', count(*) FROM keypairs WHERE count_relations > 0;
+INSERT INTO stats (key, value) SELECT 'characters_in_keys_plain', count(*) FROM keys WHERE characters='plain';
+INSERT INTO stats (key, value) SELECT 'characters_in_keys_colon', count(*) FROM keys WHERE characters='colon';
+INSERT INTO stats (key, value) SELECT 'characters_in_keys_letters', count(*) FROM keys WHERE characters='letters';
+INSERT INTO stats (key, value) SELECT 'characters_in_keys_space', count(*) FROM keys WHERE characters='space';
+INSERT INTO stats (key, value) SELECT 'characters_in_keys_problem', count(*) FROM keys WHERE characters='problem';
+INSERT INTO stats (key, value) SELECT 'characters_in_keys_rest', count(*) FROM keys WHERE characters='rest';
+
UPDATE keys SET prevalent_values=(
SELECT group_concat(value, '|') FROM (
SELECT value FROM tags t
diff --git a/sources/db/pre.sql b/sources/db/pre.sql
index 0163089..998b20e 100644
--- a/sources/db/pre.sql
+++ b/sources/db/pre.sql
@@ -47,6 +47,7 @@ CREATE TABLE keys (
in_josm INTEGER DEFAULT 0,
in_potlatch INTEGER DEFAULT 0,
in_merkaartor INTEGER DEFAULT 0,
+ characters VARCHAR,
prevalent_values TEXT
);
diff --git a/sources/db/update.sh b/sources/db/update.sh
index 3a48e74..4782219 100755
--- a/sources/db/update.sh
+++ b/sources/db/update.sh
@@ -30,6 +30,9 @@ cd $DIR
bzcat $DIR/planet.osm.bz2 | $HERE/osmium_tagstats -
cd $HERE
+echo "Running update_characters..."
+./update_characters.pl
+
echo "Running post.sql..."
perl -pe "s|__DIR__|$DIR|" post.sql | sqlite3 $DATABASE
diff --git a/sources/db/update_characters.pl b/sources/db/update_characters.pl
new file mode 100755
index 0000000..2161260
--- /dev/null
+++ b/sources/db/update_characters.pl
@@ -0,0 +1,66 @@
+#!/usr/bin/perl
+#------------------------------------------------------------------------------
+#
+# Taginfo source: DB
+#
+# update_characters.pl
+#
+#------------------------------------------------------------------------------
+#
+# Copyright (C) 2010 Jochen Topf <jochen@remote.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+use DBI;
+
+my $dir = $ARGV[0] || '.';
+
+my $dbh = DBI->connect("dbi:SQLite:dbname=$dir/taginfo-db.db", '', '');
+$dbh->{unicode} = 1;
+#$dbh->{sqlite_unicode} = 1;
+
+my @regexes = (
+ ['plain', qr{^[a-z]([a-z_]*[a-z])?$}],
+ ['colon', qr{^[a-z][a-z_:]*[a-z]$}],
+ ['letters', qr{^[\p{L}\p{M}]([\p{L}\p{M}\p{N}_:]*[\p{L}\p{M}\p{N}])?$}],
+ ['space', qr{[\s\p{Z}]}],
+ ['problem', qr{[=+/&<>;\@'"?%#\\,\p{C}]}]
+);
+
+my %keys;
+my $results = $dbh->selectcol_arrayref('SELECT key FROM keys');
+
+ROW: foreach my $key (@$results) {
+ $keys{$key} = 'rest';
+ foreach my $r (@regexes) {
+ if ($key =~ $r->[1]) {
+ $keys{$key} = $r->[0];
+ next ROW;
+ }
+ }
+}
+
+$dbh->do('BEGIN TRANSACTION');
+
+foreach my $key (keys %keys) {
+ $dbh->do('UPDATE keys SET characters=? WHERE key=?', undef, $keys{$key}, $key);
+}
+
+$dbh->do('COMMIT');
+
+
+#-- THE END -------------------------------------------------------------------
diff --git a/web/views/reports/characters_in_keys.erb b/web/views/reports/characters_in_keys.erb
index 1396ae9..5c56710 100644
--- a/web/views/reports/characters_in_keys.erb
+++ b/web/views/reports/characters_in_keys.erb
@@ -1,4 +1,39 @@
<h1><%= @title %></h1>
-<p>Any valid Unicode character can appear in an OSM key but usually only lower case latin letters (a-z) and the underscore (_) are used.</p>
+<p>Any valid Unicode character can appear in an OSM key but usually only lower
+case latin letters (a-z) and the underscore (_) are used, sometimes the colon
+(:) is added as a separator character.</p>
+
+<table class="list">
+ <tr>
+ <td class="number"><%= @stats['characters_in_keys_plain'] %></td>
+ <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_plain'] * 100.0 / @stats['num_keys'] %>%</td>
+ <td>Plain keys (only latin lowercase letters (a-z) and underscore (_), first and last characters are letters).</td>
+ </tr>
+ <tr>
+ <td class="number"><%= @stats['characters_in_keys_colon'] %></td>
+ <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_colon'] * 100.0 / @stats['num_keys'] %>%</td>
+ <td>Like plain keys but with one or more colons (:) inside.</td>
+ </tr>
+ <tr>
+ <td class="number"><%= @stats['characters_in_keys_letters'] %></td>
+ <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_letters'] * 100.0 / @stats['num_keys'] %>%</td>
+ <td>Like plain keys but with uppercase latin letters or letters from other alphabets (and possibly with colons).</td>
+ </tr>
+ <tr>
+ <td class="number"><%= @stats['characters_in_keys_space'] %></td>
+ <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_space'] * 100.0 / @stats['num_keys'] %>%</td>
+ <td>Contains whitespace (space, tab, new line, carriage return, or whitespace characters from other alphabets).</td>
+ </tr>
+ <tr>
+ <td class="number"><%= @stats['characters_in_keys_problem'] %></td>
+ <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_problem'] * 100.0 / @stats['num_keys'] %>%</td>
+ <td>Contains possibly problematic characters (=+/&amp;&lt;&gt;;'"?%#@\,) or control characters.</td>
+ </tr>
+ <tr>
+ <td class="number"><%= @stats['characters_in_keys_rest'] %></td>
+ <td class="number"><%= sprintf '%.2f', @stats['characters_in_keys_rest'] * 100.0 / @stats['num_keys'] %>%</td>
+ <td>Everything else.</td>
+ </tr>
+</table>