diff options
Diffstat (limited to 'sources')
-rwxr-xr-x | sources/db/osmium_tagstats | bin | 0 -> 135169 bytes | |||
-rw-r--r-- | sources/db/post.sql | 55 | ||||
-rw-r--r-- | sources/db/pre.sql | 76 | ||||
-rwxr-xr-x | sources/db/update.sh | 37 | ||||
-rwxr-xr-x | sources/josm/import_josm.rb | 97 | ||||
-rw-r--r-- | sources/josm/post.sql | 14 | ||||
-rw-r--r-- | sources/josm/pre.sql | 60 | ||||
-rwxr-xr-x | sources/josm/update.sh | 38 | ||||
-rw-r--r-- | sources/master/languages.sql | 50 | ||||
-rw-r--r-- | sources/master/master.sql | 90 | ||||
-rwxr-xr-x | sources/master/update.sh | 27 | ||||
-rwxr-xr-x | sources/update_all.sh | 47 | ||||
-rw-r--r-- | sources/wiki/README | 22 | ||||
-rwxr-xr-x | sources/wiki/get_page_list.rb | 123 | ||||
-rwxr-xr-x | sources/wiki/get_wiki_data.rb | 311 | ||||
-rw-r--r-- | sources/wiki/lib/mediawikiapi.rb | 65 | ||||
-rw-r--r-- | sources/wiki/post.sql | 28 | ||||
-rw-r--r-- | sources/wiki/pre.sql | 80 | ||||
-rwxr-xr-x | sources/wiki/update.sh | 40 |
19 files changed, 1260 insertions, 0 deletions
diff --git a/sources/db/osmium_tagstats b/sources/db/osmium_tagstats Binary files differnew file mode 100755 index 0000000..618ee6d --- /dev/null +++ b/sources/db/osmium_tagstats diff --git a/sources/db/post.sql b/sources/db/post.sql new file mode 100644 index 0000000..2316657 --- /dev/null +++ b/sources/db/post.sql @@ -0,0 +1,55 @@ +-- +-- Taginfo source: Database +-- +-- post.sql +-- + +.bail ON + +PRAGMA journal_mode = OFF; +PRAGMA synchronous = OFF; +PRAGMA count_changes = OFF; +PRAGMA temp_store = MEMORY; +PRAGMA cache_size = 5000000; + +ATTACH DATABASE '__DIR__/count.db' AS count; +INSERT INTO stats SELECT * FROM count.stats; +DETACH DATABASE count; + +CREATE UNIQUE INDEX keys_key_idx ON keys (key); +CREATE INDEX tags_key_idx ON tags (key); +-- CREATE UNIQUE INDEX tags_key_value_idx ON tags (key, value); +CREATE INDEX keypairs_key1_idx ON keypairs (key1); +CREATE INDEX keypairs_key2_idx ON keypairs (key2); +CREATE UNIQUE INDEX key_distributions_key_idx ON key_distributions (key); + +INSERT INTO stats (key, value) SELECT 'num_keys', count(*) FROM keys; +INSERT INTO stats (key, value) SELECT 'num_keys_on_nodes', count(*) FROM keys WHERE count_nodes > 0; +INSERT INTO stats (key, value) SELECT 'num_keys_on_ways', count(*) FROM keys WHERE count_ways > 0; +INSERT INTO stats (key, value) SELECT 'num_keys_on_relations', count(*) FROM keys WHERE count_relations > 0; + +INSERT INTO stats (key, value) SELECT 'num_tags', count(*) FROM tags; +INSERT INTO stats (key, value) SELECT 'num_tags_on_nodes', count(*) FROM tags WHERE count_nodes > 0; +INSERT INTO stats (key, value) SELECT 'num_tags_on_ways', count(*) FROM tags WHERE count_ways > 0; +INSERT INTO stats (key, value) SELECT 'num_tags_on_relations', count(*) FROM tags WHERE count_relations > 0; + +INSERT INTO stats (key, value) SELECT 'num_keypairs', count(*) FROM keypairs; +INSERT INTO stats (key, value) SELECT 'num_keypairs_on_nodes', count(*) FROM keypairs WHERE count_nodes > 0; +INSERT INTO stats (key, value) SELECT 'num_keypairs_on_ways', count(*) FROM keypairs WHERE count_ways > 0; +INSERT INTO stats (key, value) SELECT 'num_keypairs_on_relations', count(*) FROM keypairs WHERE count_relations > 0; + +UPDATE keys SET prevalent_values=( + SELECT group_concat(value, '|') FROM ( + SELECT value FROM tags t + WHERE t.key = keys.key + AND t.count_all > keys.count_all / 100 + ORDER BY t.count_all DESC + LIMIT 10 + + ) + ) WHERE values_all < 1000; + +ANALYZE; + +UPDATE meta SET update_end=datetime('now'); + diff --git a/sources/db/pre.sql b/sources/db/pre.sql new file mode 100644 index 0000000..1bb00ea --- /dev/null +++ b/sources/db/pre.sql @@ -0,0 +1,76 @@ +-- +-- Taginfo source: Database +-- +-- pre.sql +-- + +.bail ON + +DROP TABLE IF EXISTS meta; + +CREATE TABLE meta ( + source_id TEXT, + source_name TEXT, + update_start TEXT, + update_end TEXT, + data_until TEXT +); + +INSERT INTO meta (source_id, source_name, update_start) SELECT 'db', 'Database', datetime('now'); + +DROP TABLE IF EXISTS stats; + +CREATE TABLE stats ( + key TEXT, + value INT64 +); + +DROP TABLE IF EXISTS keys; + +CREATE TABLE keys ( + key VARCHAR, + count_all INTEGER, + count_nodes INTEGER, + count_ways INTEGER, + count_relations INTEGER, + values_all INTEGER, + values_nodes INTEGER, + values_ways INTEGER, + values_relations INTEGER, + users_all INTEGER, + users_nodes INTEGER, + users_ways INTEGER, + users_relations INTEGER, + grids INTEGER, + prevalent_values TEXT +); + +DROP TABLE IF EXISTS key_distributions; + +CREATE TABLE key_distributions ( + key VARCHAR, + png BLOB +); + +DROP TABLE IF EXISTS tags; + +CREATE TABLE tags ( + key VARCHAR, + value VARCHAR, + count_all INTEGER, + count_nodes INTEGER, + count_ways INTEGER, + count_relations INTEGER +); + +DROP TABLE IF EXISTS keypairs; + +CREATE TABLE keypairs ( + key1 VARCHAR, + key2 VARCHAR, + count_all INTEGER, + count_nodes INTEGER, + count_ways INTEGER, + count_relations INTEGER +); + diff --git a/sources/db/update.sh b/sources/db/update.sh new file mode 100755 index 0000000..3a48e74 --- /dev/null +++ b/sources/db/update.sh @@ -0,0 +1,37 @@ +#!/bin/sh +# +# Taginfo source: DB +# +# update.sh DIR +# + +set -e + +DIR=$1 + +if [ "x" = "x$DIR" ]; then + echo "Usage: update.sh DIR" + exit 1 +fi + +echo -n "Start db: "; date + +DATABASE=$DIR/taginfo-db.db + +rm -f $DATABASE +rm -f $DIR/count.db + +echo "Running pre.sql..." +sqlite3 $DATABASE <pre.sql + +echo "Running count..." +HERE=`pwd` +cd $DIR +bzcat $DIR/planet.osm.bz2 | $HERE/osmium_tagstats - +cd $HERE + +echo "Running post.sql..." +perl -pe "s|__DIR__|$DIR|" post.sql | sqlite3 $DATABASE + +echo -n "Done db: "; date + diff --git a/sources/josm/import_josm.rb b/sources/josm/import_josm.rb new file mode 100755 index 0000000..9f42a40 --- /dev/null +++ b/sources/josm/import_josm.rb @@ -0,0 +1,97 @@ +#!/usr/bin/ruby +#------------------------------------------------------------------------------ +# +# Taginfo source: JOSM +# +# import_josm.rb +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2010 Jochen Topf <jochen@remote.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +#------------------------------------------------------------------------------ + +require 'pp' +require 'sqlite3' +require 'rexml/document' + +class Rule + + attr_accessor :k, :v, :b + attr_accessor :scale_min, :scale_max + attr_accessor :icon_source + attr_accessor :line_width, :line_realwidth + + attr_reader :rule + + def initialize(rule) + @rule = rule + end + + def insert(db) + db.execute( + 'INSERT INTO josm_style_rules (k, v, b, scale_min, scale_max, icon_source, line_width, line_realwidth, rule) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', + k, + v, + b, + scale_min, + scale_max, + icon_source, + line_width, + line_realwidth, + rule + ) + end + +end + +dir = ARGV[0] || '.' + +db = SQLite3::Database.new(dir + '/taginfo-josm.db') + +db.execute('BEGIN TRANSACTION'); + +file = File.new(dir + '/elemstyles.xml') +doc = REXML::Document.new(file) + +doc.elements.each('/rules/rule') do |rule_element| + rule = Rule.new(rule_element.to_s) + rule_element.elements.each do |element| + case element.name + when 'condition' + rule.k = element.attributes['k'] + rule.v = element.attributes['v'] + rule.b = element.attributes['b'] + when 'scale_min' + rule.scale_min = element.text + when 'scale_max' + rule.scale_max = element.text + when 'icon' + rule.icon_source = element.attributes['src'] + when 'line' + rule.line_width = element.attributes['width'] + rule.line_realwidth = element.attributes['realwidth'] + end + end +# pp "rule #{rule.k} #{rule.v}" + rule.insert(db) +end + +db.execute('COMMIT'); + + +#-- THE END ------------------------------------------------------------------- diff --git a/sources/josm/post.sql b/sources/josm/post.sql new file mode 100644 index 0000000..d02f8c8 --- /dev/null +++ b/sources/josm/post.sql @@ -0,0 +1,14 @@ +-- +-- Taginfo source: JOSM +-- +-- post.sql +-- + +.bail ON + +CREATE INDEX josm_style_rules_idx ON josm_style_rules (style, k, v); + +ANALYZE; + +UPDATE meta SET update_end=datetime('now'); + diff --git a/sources/josm/pre.sql b/sources/josm/pre.sql new file mode 100644 index 0000000..42a4213 --- /dev/null +++ b/sources/josm/pre.sql @@ -0,0 +1,60 @@ +-- +-- Taginfo source: JOSM +-- +-- pre.sql +-- + +.bail ON + +DROP TABLE IF EXISTS meta; + +CREATE TABLE meta ( + source_id TEXT, + source_name TEXT, + update_start TEXT, + update_end TEXT, + data_until TEXT +); + +INSERT INTO meta (source_id, source_name, update_start, data_until) SELECT 'josm', 'JOSM', datetime('now'), datetime('now'); + +DROP TABLE IF EXISTS stats; + +CREATE TABLE stats ( + key TEXT, + value INT64 +); + +-- +-- josm_styles +-- +-- Contains list of JOSM styles. +-- + +DROP TABLE IF EXISTS josm_styles; + +CREATE TABLE josm_styles ( + style VARCHAR +); + +-- +-- josm_style_rules +-- +-- Contains data about JOSM style rules. +-- + +DROP TABLE IF EXISTS josm_style_rules; + +CREATE TABLE josm_style_rules ( + style VARCHAR, + k VARCHAR, + v VARCHAR, + b VARCHAR, + scale_min INTEGER, + scale_max INTEGER, + icon_source VARCHAR, + line_width INTEGER, + line_realwidth INTEGER, + rule VARCHAR +); + diff --git a/sources/josm/update.sh b/sources/josm/update.sh new file mode 100755 index 0000000..6eca722 --- /dev/null +++ b/sources/josm/update.sh @@ -0,0 +1,38 @@ +#!/bin/sh +# +# Taginfo source: JOSM +# +# update.sh DIR +# + +set -e + +DIR=$1 + +if [ "x" = "x$DIR" ]; then + echo "Usage: update.sh DIR" + exit 1 +fi + +echo -n "Start josm: "; date + +DATABASE=$DIR/taginfo-josm.db +ELEMSTYLES=$DIR/elemstyles.xml + +rm -f $DATABASE +rm -f $ELEMSTYLES + +echo "Getting styles..." +wget -O $ELEMSTYLES http://josm.openstreetmap.de/svn/trunk/styles/standard/elemstyles.xml + +echo "Running pre.sql..." +sqlite3 $DATABASE <pre.sql + +echo "Running import..." +./import_josm.rb $DIR + +echo "Running post.sql..." +sqlite3 $DATABASE <post.sql + +echo -n "Done josm: "; date + diff --git a/sources/master/languages.sql b/sources/master/languages.sql new file mode 100644 index 0000000..a04db33 --- /dev/null +++ b/sources/master/languages.sql @@ -0,0 +1,50 @@ +-- +-- Taginfo +-- +-- languages.sql +-- + +.bail ON + +DROP TABLE IF EXISTS languages; + +CREATE TABLE languages ( + code VARCHAR, + english_name VARCHAR, + native_name VARCHAR +); + +INSERT INTO languages VALUES ('ar', 'Arabic', 'العربية'); +INSERT INTO languages VALUES ('bg', 'Bulgarian', 'български език'); +INSERT INTO languages VALUES ('cz', 'Czech', 'česky'); +INSERT INTO languages VALUES ('da', 'Danish', 'dansk'); +INSERT INTO languages VALUES ('de', 'German', 'Deutsch'); +INSERT INTO languages VALUES ('en', 'English', 'English'); +INSERT INTO languages VALUES ('es', 'Spanish', 'español'); +INSERT INTO languages VALUES ('et', 'Estonian', 'eesti'); +INSERT INTO languages VALUES ('fi', 'Finish', 'suomi'); +INSERT INTO languages VALUES ('fr', 'French', 'français'); +INSERT INTO languages VALUES ('he', 'Hebrew', 'עברית'); +INSERT INTO languages VALUES ('hr', 'Croatian', 'hrvatski'); +INSERT INTO languages VALUES ('hu', 'Hungarian', 'Magyar'); +INSERT INTO languages VALUES ('it', 'Italian', 'Italiano'); +INSERT INTO languages VALUES ('ja', 'Japanese', '日本語'); +INSERT INTO languages VALUES ('nl', 'Dutch', 'Nederlands'); +INSERT INTO languages VALUES ('no', 'Norwegian', 'Norsk'); +INSERT INTO languages VALUES ('pl', 'Polish', 'polski'); +INSERT INTO languages VALUES ('pt', 'Portuguese', 'Português'); +INSERT INTO languages VALUES ('pt-br', 'Brazilian Portuguese', 'Português do Brasil'); +INSERT INTO languages VALUES ('ro', 'Romanian', 'română'); +INSERT INTO languages VALUES ('ro-md', 'Moldovan', 'română (Moldova)'); +INSERT INTO languages VALUES ('ru', 'Russian', 'русский язык'); +-- INSERT INTO languages VALUES ('sh', '', ''); -- not in ISO 639-1 +INSERT INTO languages VALUES ('sq', 'Albanian', 'Shqip'); +INSERT INTO languages VALUES ('sv', 'Swedish', 'svenska'); +INSERT INTO languages VALUES ('tr', 'Turkish', 'Türkçe'); +INSERT INTO languages VALUES ('uk', 'Ukrainian', 'українська'); +INSERT INTO languages VALUES ('zh', 'Chinese', '中文'); +INSERT INTO languages VALUES ('zh-hans', 'Chinese', '中文'); -- hans? +-- INSERT INTO languages VALUES ('', '', ''); + +ANALYZE; + diff --git a/sources/master/master.sql b/sources/master/master.sql new file mode 100644 index 0000000..c446ae7 --- /dev/null +++ b/sources/master/master.sql @@ -0,0 +1,90 @@ +-- +-- Taginfo +-- +-- master.sql +-- + +.bail ON + +ATTACH DATABASE '__DIR__/db/taginfo-db.db' AS db; +ATTACH DATABASE '__DIR__/wiki/taginfo-wiki.db' AS wiki; +ATTACH DATABASE '__DIR__/josm/taginfo-josm.db' AS josm; + +DROP TABLE IF EXISTS master_meta; + +CREATE TABLE master_meta ( + source_id TEXT, + source_name TEXT, + update_start TEXT, + update_end TEXT, + data_until TEXT +); + +INSERT INTO master_meta SELECT * FROM db.meta + UNION SELECT * FROM josm.meta + UNION SELECT * FROM wiki.meta; + +DROP TABLE IF EXISTS master_stats; + +CREATE TABLE master_stats ( + key TEXT, + value INT64 +); + +INSERT INTO master_stats SELECT * FROM db.stats + UNION SELECT * FROM josm.stats + UNION SELECT * FROM wiki.stats; + +DROP TABLE IF EXISTS popular_keys; + +CREATE TABLE popular_keys ( + key VARCHAR, + count INTEGER, + users INTEGER, + wikipages INTEGER DEFAULT 0, + in_wiki INTEGER DEFAULT 0, + in_josm INTEGER DEFAULT 0, + scale_count REAL, + scale_users REAL, + scale_wiki REAL, + scale_josm REAL, + scale_name REAL, + scale1 REAL, + scale2 REAL +); + +INSERT INTO popular_keys (key, count, users) + SELECT key, count_all, users_all FROM db.keys WHERE count_all > 1000 GROUP BY key; + +-- count number of wikipages for each key +UPDATE popular_keys SET wikipages = (SELECT count(*) FROM wiki.wikipages w WHERE w.key=popular_keys.key); + +UPDATE popular_keys SET in_wiki=1 WHERE key IN (SELECT key FROM wiki.wikipages); +UPDATE popular_keys SET in_josm=1 WHERE key IN (SELECT k FROM josm.josm_style_rules); + +DROP TABLE IF EXISTS popular_metadata; + +CREATE TABLE popular_metadata ( + keys INTEGER, + count_min INTEGER, + count_max INTEGER, + count_delta INTEGER, + users_min INTEGER, + users_max INTEGER, + users_delta INTEGER +); + +INSERT INTO popular_metadata (keys, count_min, count_max, count_delta, users_min, users_max, users_delta) + SELECT count(*), min(count), max(count), max(count) - min(count), min(users), max(users), max(users) - min(users) FROM popular_keys; + +UPDATE popular_keys SET scale_count = CAST (count - (SELECT count_min FROM popular_metadata) AS REAL) / (SELECT count_delta FROM popular_metadata); +UPDATE popular_keys SET scale_users = CAST (users - (SELECT users_min FROM popular_metadata) AS REAL) / (SELECT users_delta FROM popular_metadata); +UPDATE popular_keys SET scale_wiki = CAST (wikipages AS REAL) / (SELECT max(wikipages) FROM popular_keys); +UPDATE popular_keys SET scale_josm = in_josm; +UPDATE popular_keys SET scale_name = 1; +UPDATE popular_keys SET scale_name = 0 WHERE key LIKE '%:%'; + +UPDATE popular_keys SET scale1 = 10 * scale_count + 8 * scale_users + 2 * scale_wiki + 1 * scale_josm + 2 * scale_name; + +ANALYZE; + diff --git a/sources/master/update.sh b/sources/master/update.sh new file mode 100755 index 0000000..9b7040e --- /dev/null +++ b/sources/master/update.sh @@ -0,0 +1,27 @@ +#!/bin/sh +# +# Taginfo Master DB +# +# update.sh DIR +# + +set -e + +DIR=$1 + +if [ "x" = "x$DIR" ]; then + echo "Usage: update.sh DIR" + exit 1 +fi + +echo -n "Start master: "; date + +DATABASE=$DIR/taginfo-master.db + +rm -f $DATABASE + +perl -pe "s|__DIR__|$DIR|" master.sql | sqlite3 $DATABASE +sqlite3 $DATABASE <languages.sql + +echo -n "Done master: "; date + diff --git a/sources/update_all.sh b/sources/update_all.sh new file mode 100755 index 0000000..4011a2d --- /dev/null +++ b/sources/update_all.sh @@ -0,0 +1,47 @@ +#!/bin/sh +# +# Taginfo +# +# update_all.sh DIR +# + +SOURCES="db josm wiki" + +set -e + +DIR=$1 + +if [ "x" = "x$DIR" ]; then + echo "Usage: update.sh DIR" + exit 1 +fi + +exec >$DIR/update_all.log 2>&1 + +echo -n "Start: "; date + +mkdir -p $DIR/download + +for source in $SOURCES; do + echo "=====================================" + echo "Running $source/update.sh..." + mkdir -p $DIR/$source + cd $source + ./update.sh $DIR/$source + cd .. + echo "Running bzip2..." + bzip2 -9 -c $DIR/$source/taginfo-$source.db >$DIR/download/taginfo-$source.db.bz2 + echo "Done." +done + +echo "=====================================" +echo "Running master/update.sh..." +cd master +./update.sh $DIR +cd .. +echo "Running bzip2..." +bzip2 -9 -c $DIR/taginfo-master.db >$DIR/download/taginfo-master.db.bz2 +echo "Done." + +echo -n "Done: "; date + diff --git a/sources/wiki/README b/sources/wiki/README new file mode 100644 index 0000000..6c3a621 --- /dev/null +++ b/sources/wiki/README @@ -0,0 +1,22 @@ + +Taginfo Sources: Wiki +===================== + +PREREQUISITES +------------- + +On Debian/Ubuntu you need the following packages: + libjson-ruby sqlite3 packages libsqlite3-ruby1.8 + + +RUNNING +------- + +Run ./update.sh DIR + +where DIR is the name of a directory where the data will be stored. + +The file 'allpages.list' is currently not used. But because we get the +information from the wiki anyway, we just store it. Maybe in the future +it is of some use. + diff --git a/sources/wiki/get_page_list.rb b/sources/wiki/get_page_list.rb new file mode 100755 index 0000000..649efbe --- /dev/null +++ b/sources/wiki/get_page_list.rb @@ -0,0 +1,123 @@ +#!/usr/bin/ruby +#------------------------------------------------------------------------------ +# +# get_page_list.rb [DIR] +# +#------------------------------------------------------------------------------ +# +# Gets the list of all wiki pages from the OSM wiki. +# +# Two files will be written: 'allpages.list' contains all pages in the wiki, +# 'tagpages.list' contains all pages about tags from the wiki. Both have the +# format: +# +# <type> TAB <namespace> TAB <title> +# +# The <type> is either 'page' or 'redirect', depending on whether this is a +# proper wiki page or a redirect to another wiki page, respectively. +# +# The <namespaces> gives the namespace this page is in. This is empty for the +# main namespace. +# +# <title> is the full title of the wiki page including leading namespaces etc. +# +# The files will be created in DIR or in the current directory, if no directory +# was given on the command line. +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2010 Jochen Topf <jochen@remote.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +#------------------------------------------------------------------------------ + +require 'pp' + +require 'net/http' +require 'uri' +require 'json' + +require 'lib/mediawikiapi.rb' + +#------------------------------------------------------------------------------ + +def get_namespaces(api) + data = api.query(:meta => 'siteinfo', :siprop => 'namespaces') + namespaces = {} + data['query']['namespaces'].values.each do |ns| + if ns['canonical'] =~ /^[A-Z]{2}$/ + namespaces[ns['canonical']] = ns['id'] + end + end + namespaces +end + +def get_page_list(api, namespaceid, options) + apfrom = '' + loop do + data = api.query(:list => 'allpages', :aplimit => 'max', :apfrom => apfrom, :apnamespace => namespaceid, :apfilterredir => options[:redirect] ? 'redirects' : 'nonredirects') +# pp data + data['query']['allpages'].each do |h| + yield h['title'].gsub(/\s/, '_') + end + if data['query-continue'] + apfrom = data['query-continue']['allpages']['apfrom'].gsub(/\s/, '_') +# puts "apfrom=#{apfrom}" + else + return + end + end +end + +#------------------------------------------------------------------------------ + +dir = ARGV[0] || '.' + +api = MediaWikiAPI::API.new('wiki.openstreetmap.org') +api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)') + +namespaces = get_namespaces(api) + +# add main namespace +namespaces[''] = 0 + +allpages = File.open(dir + '/allpages.list', 'w') +tagpages = File.open(dir + '/tagpages.list', 'w') + +namespaces.keys.sort.each do |namespace| + id = namespaces[namespace] + + get_page_list(api, id, :redirect => false) do |page| + line = ['page', namespace, page].join("\t") + allpages.puts line + if page =~ /^([^:]+:)?(Key|Tag):(.+)$/ + tagpages.puts line + end + end + + get_page_list(api, id, :redirect => true) do |page| + line = ['redirect', namespace, page].join("\t") + allpages.puts line + if page =~ /^([^:]+:)?(Key|Tag):(.+)$/ + tagpages.puts line + end + end +end + +tagpages.close +allpages.close + +#-- THE END ------------------------------------------------------------------- diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb new file mode 100755 index 0000000..742e520 --- /dev/null +++ b/sources/wiki/get_wiki_data.rb @@ -0,0 +1,311 @@ +#!/usr/bin/ruby +#------------------------------------------------------------------------------ +# +# get_wiki_data.rb [DIR] +# +#------------------------------------------------------------------------------ +# +# Reads all the wiki pages from 'tagpages.list' and gets their content from +# the OSM wiki. The pages are parsed and the information stored in the +# sqlite database 'taginfo-wiki.db' which must have been initialized before. +# +# All files are in DIR or the current directory if no directory was given on +# the command line. +# +# This script writes copious debugging information to STDOUT. You might want +# to redirect that to a file. +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2010 Jochen Topf <jochen@remote.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +#------------------------------------------------------------------------------ + +require 'pp' + +require 'json' +require 'net/http' +require 'uri' +require 'sqlite3' + +require 'lib/mediawikiapi.rb' + +#------------------------------------------------------------------------------ + +class WikiPage + + @@pages = {} + + attr_accessor :content + attr_accessor :description, :image, :group, :onNode, :onWay, :onArea, :onRelation, :has_templ + attr_reader :type, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed + + def initialize(type, namespace, title) + @type = type # 'page' or 'redirect' + @namespace = namespace # 'XX' (mediawiki namespace or '') + @title = title # wiki page title + + @tag = title.gsub(/^([^:]+:)?(Key|Tag):/, '') # complete tag (key=value) + @key = @tag.sub(/=.*/, '') # key + if @tag =~ /=/ + @value = @tag.sub(/.*?=/, '') # value (if any) + end + if title =~ /^(.*):(Key|Tag):/ + @lang = $1.downcase # IETF language tag + @ttype = $2.downcase # 'tag' or 'key' + else + @lang = 'en' + end + + @has_templ = false + + @tags_implies = [] + @tags_combination = [] + @tags_linked = [] + + @group = '' + @onNode = false + @onWay = false + @onArea = false + @onRelation = false + + @parsed = nil + + @@pages[@title] = self + end + + def self.pages + @@pages.values.sort{ |a,b| a.title <=> b.title } + end + + def self.find(name) + @@pages[name] + end + + # Has this wiki page a name that we can understand and process? + def valid? + return false if @lang !~ /^[a-z]{2}(-[a-z0-9-]+)?$/ + return false if @ttype == 'key' && ! @value.nil? + return false if @ttype == 'tag' && @value.nil? + return false if @key =~ %r{/} + return false if @value =~ %r{/} + return true + end + + # Return parameters for API call to read this page. + def params + { :title => title, :action => 'raw' } + end + + def add_tag_link(tag) + @tags_linked << tag + end + + def insert(db) + db.execute( + "INSERT INTO wikipages (lang, tag, key, value, title, tgroup, type, has_templ, parsed, description, image, on_node, on_way, on_area, on_relation, tags_implies, tags_combination, tags_linked) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + lang, + tag, + key, + value, + title, + group, + type, + has_templ, + parsed ? 1 : 0, + description, + image, + onNode ? 1 : 0, + onWay ? 1 : 0, + onArea ? 1 : 0, + onRelation ? 1 : 0, + tags_implies. sort.uniq.join(','), + tags_combination.sort.uniq.join(','), + tags_linked. sort.uniq.join(',') + ) + end + + # Parse content of the wiki page. This will find the templates + # and their parameters. + def parse_content + @parsed = true + text = @content + + # dummy template as base context + context = [ Template.new ] + + loop do + # split text into ('before', 'token', 'after') + m = /^(.*?)(\{\{|\}\}|[|=])(.*)$/m.match(text) + + # we are done if there are no more tokens + if m.nil? + return + end + + # do the right thing depending on next token + case m[2] + when '{{' # start of template + context.last.add_parameter(m[1].strip) + context << Template.new() + when '}}' # end of template + context.last.add_parameter(m[1].strip) + c = context.pop + yield c + context.last.add_parameter(c) + when '|' # template parameter + context.last.add_parameter(m[1].strip) + context.last.parname(nil) + when '=' # named template parameter + parameter_name = (m[1].strip == ':') ? 'subkey' : m[1].strip + context.last.parname(parameter_name) + end + + # 'after' is our next 'text' + text = m[3] + end + rescue + puts "Parsing of page #{title} failed" + @parsed = false + end + +end + +#------------------------------------------------------------------------------ + +class Template + + attr_reader :name, :parameters, :named_parameters + + def initialize() + @name = nil + @parname = nil + @parameters = [] + @named_parameters = {} + end + + def parname(name) + @parname = name + end + + def add_parameter(value) + if value != '' + if @parname.nil? # positional parameter + # first parameter is really the name of this template + if @name.nil? + @name = value + else + @parameters << value + end + else # named parameter + @named_parameters[@parname] ||= [] + @named_parameters[@parname] << value + end + end + end + +end + +#------------------------------------------------------------------------------ + +dir = ARGV[0] || '.' + +api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?') +api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)') + +db = SQLite3::Database.new(dir + '/taginfo-wiki.db') + +db.execute('BEGIN TRANSACTION'); + +File.open(dir + '/tagpages.list') do |wikipages| + wikipages.each do |line| + line.chomp! + t = line.split("\t") + page = WikiPage.new(t[0], t[1], t[2]) + puts "page: (#{page.title}) (#{page.type}) (#{page.namespace}) (#{page.tag})" + + if page.valid? + res = api.get(page.params) + page.content = res.body + + page.parse_content do |template| + puts "Template: #{template.name} [#{template.parameters.join(',')}] #{template.named_parameters.inspect}" + if template.name == 'Key' || template.name == 'Tag' + tag = template.parameters[0] + if template.parameters[1] + tag += '=' + template.parameters[1] + end + page.add_tag_link(tag) + end + if template.name =~ /(Key|Value)Description$/ + page.has_templ = true + end + if template.named_parameters['description'] + desc = [] + template.named_parameters['description'].each do |i| + if i.class == Template + desc << ' ' << i.parameters.join('=') << ' ' + else + desc << i + end + page.description = desc.join('').strip + end + end + if template.named_parameters['image'] + page.image = template.named_parameters['image'][0] + end + if template.named_parameters['group'] + page.group = template.named_parameters['group'][0] + end + if template.named_parameters['onNode'] == ['yes'] + page.onNode = true + end + if template.named_parameters['onWay'] == ['yes'] + page.onWay = true + end + if template.named_parameters['onArea'] == ['yes'] + page.onArea = true + end + if template.named_parameters['onRelation'] == ['yes'] + page.onRelation = true + end + if template.named_parameters['implies'] + template.named_parameters['implies'].each do |i| + if i.class == Template + page.tags_implies << i.parameters.join('=') + end + end + end + if template.named_parameters['combination'] + template.named_parameters['combination'].each do |i| + if i.class == Template + page.tags_combination << i.parameters.join('=') + end + end + end + end + page.insert(db) + else + puts "invalid page: #{page.title}" + end + end +end + +db.execute('COMMIT'); + + +#-- THE END ------------------------------------------------------------------- diff --git a/sources/wiki/lib/mediawikiapi.rb b/sources/wiki/lib/mediawikiapi.rb new file mode 100644 index 0000000..7df131c --- /dev/null +++ b/sources/wiki/lib/mediawikiapi.rb @@ -0,0 +1,65 @@ +#------------------------------------------------------------------------------ +# +# MediaWikiAPI +# +#------------------------------------------------------------------------------ +# +# Simple helper class to access the Mediawiki API. +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2010 Jochen Topf <jochen@remote.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +#------------------------------------------------------------------------------ + +module MediaWikiAPI + + class API + + def initialize(host, port=80, path='/w/api.php?') + @host = host + @port = port + @path = path + @headers = {} + end + + def add_header(name, value) + @headers[name] = value + end + + def build_path(params) + @path + params.to_a.map{ |el| el.join('=') }.join('&') + end + + def get(params) + path = build_path(params) + http = Net::HTTP.start(@host, @port) + http.get(path, @headers) + end + + def query(params) + params[:action] = 'query' + params[:format] = 'json' + result = get(params) + JSON.parse(result.body) + end + + end + +end + +#-- THE END ------------------------------------------------------------------- diff --git a/sources/wiki/post.sql b/sources/wiki/post.sql new file mode 100644 index 0000000..28e5673 --- /dev/null +++ b/sources/wiki/post.sql @@ -0,0 +1,28 @@ +-- +-- Taginfo source: Wiki +-- +-- post.sql +-- + +.bail ON + +UPDATE wikipages SET status='r' WHERE type='redirect'; +UPDATE wikipages SET status='p' WHERE type='page' AND has_templ='false'; +UPDATE wikipages SET status='t' WHERE type='page' AND has_templ='true' AND parsed=1; +UPDATE wikipages SET status='e' WHERE type='page' AND has_templ='true' AND parsed=0; + +CREATE INDEX wikipages_key_value_idx ON wikipages(key, value); + +INSERT INTO wikipages_keys (key, langs) SELECT key, group_concat(lang || ' ' || status) FROM wikipages WHERE value IS NULL GROUP BY key; +INSERT INTO wikipages_tags (key, value, langs) SELECT key, value, group_concat(lang || ' ' || status) FROM wikipages WHERE value IS NOT NULL GROUP BY key, value; + +INSERT INTO wiki_languages (language, count_pages) SELECT lang, count(*) FROM wikipages GROUP BY lang; + +INSERT INTO stats (key, value) SELECT 'wikipages_keys', count(*) FROM wikipages_keys; +INSERT INTO stats (key, value) SELECT 'wikipages_tags', count(*) FROM wikipages_tags; +INSERT INTO stats (key, value) SELECT 'wikipages_languages', count(*) FROM wiki_languages; + +ANALYZE; + +UPDATE meta SET update_end=datetime('now'); + diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql new file mode 100644 index 0000000..2cb3bfd --- /dev/null +++ b/sources/wiki/pre.sql @@ -0,0 +1,80 @@ +-- +-- Taginfo source: Wiki +-- +-- pre.sql +-- + +.bail ON + +DROP TABLE IF EXISTS meta; + +CREATE TABLE meta ( + source_id TEXT, + source_name TEXT, + update_start TEXT, + update_end TEXT, + data_until TEXT +); + +INSERT INTO meta (source_id, source_name, update_start, data_until) SELECT 'wiki', 'Wiki', datetime('now'), datetime('now'); + +DROP TABLE IF EXISTS stats; + +CREATE TABLE stats ( + key TEXT, + value INT64 +); + +DROP TABLE IF EXISTS wikipages; + +CREATE TABLE wikipages ( + lang VARCHAR, + tag VARCHAR, + key VARCHAR, + value VARCHAR, + title VARCHAR, + tgroup VARCHAR, + type VARCHAR, + has_templ INTEGER, + parsed INTEGER, + description VARCHAR, + image VARCHAR, + on_node INTEGER, + on_way INTEGER, + on_area INTEGER, + on_relation INTEGER, + tags_implies VARCHAR, + tags_combination VARCHAR, + tags_linked VARCHAR, + status VARCHAR +); + +DROP TABLE IF EXISTS wikipages_keys; + +CREATE TABLE wikipages_keys ( + key VARCHAR, + langs VARCHAR +); + +DROP TABLE IF EXISTS wikipages_tags; + +CREATE TABLE wikipages_tags ( + key VARCHAR, + value VARCHAR, + langs VARCHAR +); + +DROP TABLE IF EXISTS wiki_languages; + +CREATE TABLE wiki_languages ( + language VARCHAR, + count_pages INT +); + +DROP TABLE IF EXISTS stats; + +CREATE TABLE stats ( + key VARCHAR, + value INT64 +); + diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh new file mode 100755 index 0000000..b6f63a6 --- /dev/null +++ b/sources/wiki/update.sh @@ -0,0 +1,40 @@ +#!/bin/sh +# +# Taginfo source: Wiki +# +# update.sh DIR +# + +set -e + +DIR=$1 + +if [ "x" = "x$DIR" ]; then + echo "Usage: update.sh DIR" + exit 1 +fi + +echo -n "Start wiki: "; date + +DATABASE=$DIR/taginfo-wiki.db +LOGFILE=$DIR/get_wiki_data.log + +rm -f $DIR/allpages.list +rm -f $DIR/tagpages.list +rm -f $LOGFILE +rm -f $DATABASE + +echo "Running pre.sql..." +sqlite3 $DATABASE <pre.sql + +echo "Getting page list..." +./get_page_list.rb $DIR + +echo "Getting wiki data..." +./get_wiki_data.rb $DIR >$LOGFILE + +echo "Running post.sql..." +sqlite3 $DATABASE <post.sql + +echo -n "Done wiki: "; date + |