diff options
author | Jochen Topf <jochen@topf.org> | 2010-10-04 18:41:53 +0200 |
---|---|---|
committer | Jochen Topf <jochen@topf.org> | 2010-10-04 18:41:53 +0200 |
commit | 9918c2c4c266a29848ce39fe2496876c66c3a48e (patch) | |
tree | b49fe450d33dcb3c30b37f7bff68fbb475ecec66 /sources/wiki | |
download | taginfo-9918c2c4c266a29848ce39fe2496876c66c3a48e.tar taginfo-9918c2c4c266a29848ce39fe2496876c66c3a48e.tar.gz |
first commit
Diffstat (limited to 'sources/wiki')
-rw-r--r-- | sources/wiki/README | 22 | ||||
-rwxr-xr-x | sources/wiki/get_page_list.rb | 123 | ||||
-rwxr-xr-x | sources/wiki/get_wiki_data.rb | 311 | ||||
-rw-r--r-- | sources/wiki/lib/mediawikiapi.rb | 65 | ||||
-rw-r--r-- | sources/wiki/post.sql | 28 | ||||
-rw-r--r-- | sources/wiki/pre.sql | 80 | ||||
-rwxr-xr-x | sources/wiki/update.sh | 40 |
7 files changed, 669 insertions, 0 deletions
diff --git a/sources/wiki/README b/sources/wiki/README new file mode 100644 index 0000000..6c3a621 --- /dev/null +++ b/sources/wiki/README @@ -0,0 +1,22 @@ + +Taginfo Sources: Wiki +===================== + +PREREQUISITES +------------- + +On Debian/Ubuntu you need the following packages: + libjson-ruby sqlite3 libsqlite3-ruby1.8 + + +RUNNING +------- + +Run ./update.sh DIR + +where DIR is the name of a directory where the data will be stored. + +The file 'allpages.list' is currently not used. But because we get the +information from the wiki anyway, we just store it. Maybe in the future +it is of some use. + diff --git a/sources/wiki/get_page_list.rb b/sources/wiki/get_page_list.rb new file mode 100755 index 0000000..649efbe --- /dev/null +++ b/sources/wiki/get_page_list.rb @@ -0,0 +1,123 @@ +#!/usr/bin/ruby +#------------------------------------------------------------------------------ +# +# get_page_list.rb [DIR] +# +#------------------------------------------------------------------------------ +# +# Gets the list of all wiki pages from the OSM wiki. +# +# Two files will be written: 'allpages.list' contains all pages in the wiki, +# 'tagpages.list' contains all pages about tags from the wiki. Both have the +# format: +# +# <type> TAB <namespace> TAB <title> +# +# The <type> is either 'page' or 'redirect', depending on whether this is a +# proper wiki page or a redirect to another wiki page, respectively. +# +# The <namespace> gives the namespace this page is in. This is empty for the +# main namespace. +# +# <title> is the full title of the wiki page including leading namespaces etc. +# +# The files will be created in DIR or in the current directory, if no directory +# was given on the command line. 
#------------------------------------------------------------------------------
# get_page_list.rb: helpers for reading the page list through the MediaWiki API
#------------------------------------------------------------------------------

# Get the language namespaces of the wiki.
#
# api - object responding to #query(params) that returns the decoded JSON
#       reply of the MediaWiki API as a Hash.
#
# Returns a Hash mapping the canonical namespace name (two uppercase letters,
# e.g. 'DE') to the numeric namespace id. Namespaces without a canonical name
# or with a different kind of name are left out.
def get_namespaces(api)
  data = api.query(:meta => 'siteinfo', :siprop => 'namespaces')
  namespaces = {}
  data['query']['namespaces'].values.each do |ns|
    # \A and \z anchor the whole string, ^ and $ would only anchor a line
    if ns['canonical'] =~ /\A[A-Z]{2}\z/
      namespaces[ns['canonical']] = ns['id']
    end
  end
  namespaces
end

# Iterate over the titles of all pages in one namespace.
#
# api         - object responding to #query(params), see get_namespaces.
# namespaceid - numeric id of the namespace to list.
# options     - Hash; if options[:redirect] is true only redirects are
#               listed, otherwise only non-redirects.
#
# Yields each page title with whitespace replaced by underscores. Without a
# block an Enumerator over the titles is returned.
#
# The list is fetched in batches. Both continuation formats are understood:
# the legacy 'query-continue' element and the 'continue' element used by
# current MediaWiki versions. Handling only the legacy one would silently
# stop after the first batch on a current server.
#
# Raises RuntimeError if a reply does not contain a page list.
def get_page_list(api, namespaceid, options)
  return to_enum(:get_page_list, api, namespaceid, options) unless block_given?

  filter = options[:redirect] ? 'redirects' : 'nonredirects'
  continuation = { :apfrom => '' }

  loop do
    # build a fresh hash for every request, api.query may modify it
    params = {
      :list          => 'allpages',
      :aplimit       => 'max',
      :apnamespace   => namespaceid,
      :apfilterredir => filter
    }.merge(continuation)

    data = api.query(params)
    pages = data['query'] && data['query']['allpages']
    if pages.nil?
      raise "Unexpected reply from wiki API (no page list) for namespace #{namespaceid}"
    end

    pages.each do |h|
      yield h['title'].gsub(/\s/, '_')
    end

    legacy = data['query-continue'] && data['query-continue']['allpages']
    modern = data['continue']
    if legacy && legacy['apfrom']
      continuation = { :apfrom => legacy['apfrom'].gsub(/\s/, '_') }
    elsif legacy && legacy['apcontinue']
      continuation = { :apcontinue => legacy['apcontinue'] }
    elsif modern && modern['apcontinue']
      continuation = { :apcontinue => modern['apcontinue'] }
    else
      return
    end
  end
end
#------------------------------------------------------------------------------
# get_page_list.rb: script entry point
#
# Writes 'allpages.list' (all wiki pages) and 'tagpages.list' (pages about
# keys and tags) into the directory given as first argument, or into the
# current directory if no argument is given.
#------------------------------------------------------------------------------

require 'pp'

require 'net/http'
require 'uri'
require 'json'

# Load the helper relative to this script, so the script does not depend on
# the current directory being in the load path.
require File.join(File.dirname(File.expand_path(__FILE__)), 'lib', 'mediawikiapi.rb')

dir = ARGV[0] || '.'

api = MediaWikiAPI::API.new('wiki.openstreetmap.org')
api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')

namespaces = get_namespaces(api)

# add main namespace
namespaces[''] = 0

# titles of pages describing a key or tag, with optional language prefix
tag_page = /^([^:]+:)?(Key|Tag):(.+)$/

# The block form of File.open closes both files even if a request fails.
File.open(dir + '/allpages.list', 'w') do |allpages|
  File.open(dir + '/tagpages.list', 'w') do |tagpages|
    namespaces.keys.sort.each do |namespace|
      id = namespaces[namespace]

      # first all proper pages of the namespace, then all redirects
      [['page', false], ['redirect', true]].each do |type, redirect|
        get_page_list(api, id, :redirect => redirect) do |page|
          line = [type, namespace, page].join("\t")
          allpages.puts line
          tagpages.puts line if page =~ tag_page
        end
      end
    end
  end
end

#-- THE END -------------------------------------------------------------------
+# +# All files are in DIR or the current directory if no directory was given on +# the command line. +# +# This script writes copious debugging information to STDOUT. You might want +# to redirect that to a file. +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2010 Jochen Topf <jochen@remote.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +#------------------------------------------------------------------------------ + +require 'pp' + +require 'json' +require 'net/http' +require 'uri' +require 'sqlite3' + +require 'lib/mediawikiapi.rb' + +#------------------------------------------------------------------------------ + +class WikiPage + + @@pages = {} + + attr_accessor :content + attr_accessor :description, :image, :group, :onNode, :onWay, :onArea, :onRelation, :has_templ + attr_reader :type, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed + + def initialize(type, namespace, title) + @type = type # 'page' or 'redirect' + @namespace = namespace # 'XX' (mediawiki namespace or '') + @title = title # wiki page title + + @tag = title.gsub(/^([^:]+:)?(Key|Tag):/, '') # complete tag (key=value) + @key = @tag.sub(/=.*/, '') # key + if @tag =~ /=/ + @value = @tag.sub(/.*?=/, '') # value (if any) + end + if title =~ /^(.*):(Key|Tag):/ + @lang = $1.downcase # IETF language tag + @ttype = $2.downcase # 'tag' or 'key' + else + @lang = 'en' + end + + @has_templ = false + + @tags_implies = [] + @tags_combination = [] + @tags_linked = [] + + @group = '' + @onNode = false + @onWay = false + @onArea = false + @onRelation = false + + @parsed = nil + + @@pages[@title] = self + end + + def self.pages + @@pages.values.sort{ |a,b| a.title <=> b.title } + end + + def self.find(name) + @@pages[name] + end + + # Has this wiki page a name that we can understand and process? + def valid? + return false if @lang !~ /^[a-z]{2}(-[a-z0-9-]+)?$/ + return false if @ttype == 'key' && ! @value.nil? + return false if @ttype == 'tag' && @value.nil? + return false if @key =~ %r{/} + return false if @value =~ %r{/} + return true + end + + # Return parameters for API call to read this page. 
+ def params + { :title => title, :action => 'raw' } + end + + def add_tag_link(tag) + @tags_linked << tag + end + + def insert(db) + db.execute( + "INSERT INTO wikipages (lang, tag, key, value, title, tgroup, type, has_templ, parsed, description, image, on_node, on_way, on_area, on_relation, tags_implies, tags_combination, tags_linked) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + lang, + tag, + key, + value, + title, + group, + type, + has_templ, + parsed ? 1 : 0, + description, + image, + onNode ? 1 : 0, + onWay ? 1 : 0, + onArea ? 1 : 0, + onRelation ? 1 : 0, + tags_implies. sort.uniq.join(','), + tags_combination.sort.uniq.join(','), + tags_linked. sort.uniq.join(',') + ) + end + + # Parse content of the wiki page. This will find the templates + # and their parameters. + def parse_content + @parsed = true + text = @content + + # dummy template as base context + context = [ Template.new ] + + loop do + # split text into ('before', 'token', 'after') + m = /^(.*?)(\{\{|\}\}|[|=])(.*)$/m.match(text) + + # we are done if there are no more tokens + if m.nil? + return + end + + # do the right thing depending on next token + case m[2] + when '{{' # start of template + context.last.add_parameter(m[1].strip) + context << Template.new() + when '}}' # end of template + context.last.add_parameter(m[1].strip) + c = context.pop + yield c + context.last.add_parameter(c) + when '|' # template parameter + context.last.add_parameter(m[1].strip) + context.last.parname(nil) + when '=' # named template parameter + parameter_name = (m[1].strip == ':') ? 
'subkey' : m[1].strip + context.last.parname(parameter_name) + end + + # 'after' is our next 'text' + text = m[3] + end + rescue + puts "Parsing of page #{title} failed" + @parsed = false + end + +end + +#------------------------------------------------------------------------------ + +class Template + + attr_reader :name, :parameters, :named_parameters + + def initialize() + @name = nil + @parname = nil + @parameters = [] + @named_parameters = {} + end + + def parname(name) + @parname = name + end + + def add_parameter(value) + if value != '' + if @parname.nil? # positional parameter + # first parameter is really the name of this template + if @name.nil? + @name = value + else + @parameters << value + end + else # named parameter + @named_parameters[@parname] ||= [] + @named_parameters[@parname] << value + end + end + end + +end + +#------------------------------------------------------------------------------ + +dir = ARGV[0] || '.' + +api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?') +api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)') + +db = SQLite3::Database.new(dir + '/taginfo-wiki.db') + +db.execute('BEGIN TRANSACTION'); + +File.open(dir + '/tagpages.list') do |wikipages| + wikipages.each do |line| + line.chomp! + t = line.split("\t") + page = WikiPage.new(t[0], t[1], t[2]) + puts "page: (#{page.title}) (#{page.type}) (#{page.namespace}) (#{page.tag})" + + if page.valid? 
#------------------------------------------------------------------------------
# get_wiki_data.rb: script entry point
#
# Reads the page titles from 'tagpages.list', gets the content of every page
# with a valid title from the OSM wiki, and stores what was found in the
# templates on the page in the sqlite database 'taginfo-wiki.db'. Both files
# are in the directory given as first argument or in the current directory.
#------------------------------------------------------------------------------

require 'pp'

require 'json'
require 'net/http'
require 'uri'
require 'sqlite3'

# Load the helper relative to this script, so the script does not depend on
# the current directory being in the load path.
require File.join(File.dirname(File.expand_path(__FILE__)), 'lib', 'mediawikiapi.rb')

# Copy the information found in one template into the page.
#
# page     - the WikiPage the template was found on.
# template - a Template yielded by WikiPage#parse_content.
def apply_template(page, template)
  named = template.named_parameters

  if template.name == 'Key' || template.name == 'Tag'
    tag   = template.parameters[0]
    value = template.parameters[1]
    # A {{Key}} or {{Tag}} without a usable first parameter links to
    # nothing. Skip it instead of raising, which would make parse_content
    # flag the whole page as unparsable.
    if tag.is_a?(String)
      tag += '=' + value if value.is_a?(String)
      page.add_tag_link(tag)
    end
  end

  if template.name =~ /(Key|Value)Description$/
    page.has_templ = true
  end

  if named['description']
    # nested templates (typically {{Tag|key|value}}) are shown as key=value
    parts = named['description'].map do |i|
      i.is_a?(Template) ? ' ' + i.parameters.join('=') + ' ' : i
    end
    page.description = parts.join('').strip
  end

  page.image = named['image'][0] if named['image']
  page.group = named['group'][0] if named['group']

  page.onNode     = true if named['onNode']     == ['yes']
  page.onWay      = true if named['onWay']      == ['yes']
  page.onArea     = true if named['onArea']     == ['yes']
  page.onRelation = true if named['onRelation'] == ['yes']

  # only tags given as nested templates are understood here
  (named['implies'] || []).each do |i|
    page.tags_implies << i.parameters.join('=') if i.is_a?(Template)
  end
  (named['combination'] || []).each do |i|
    page.tags_combination << i.parameters.join('=') if i.is_a?(Template)
  end
end

#------------------------------------------------------------------------------

dir = ARGV[0] || '.'

api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?')
api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')

db = SQLite3::Database.new(dir + '/taginfo-wiki.db')

# one transaction for all inserts
db.execute('BEGIN TRANSACTION')

File.open(dir + '/tagpages.list') do |wikipages|
  wikipages.each do |line|
    line.chomp!
    # format: <type> TAB <namespace> TAB <title>
    t = line.split("\t")
    page = WikiPage.new(t[0], t[1], t[2])
    puts "page: (#{page.title}) (#{page.type}) (#{page.namespace}) (#{page.tag})"

    if page.valid?
      res = api.get(page.params)
      page.content = res.body

      page.parse_content do |template|
        puts "Template: #{template.name} [#{template.parameters.join(',')}] #{template.named_parameters.inspect}"
        apply_template(page, template)
      end
      page.insert(db)
    else
      puts "invalid page: #{page.title}"
    end
  end
end

db.execute('COMMIT')

#-- THE END -------------------------------------------------------------------
@@ -0,0 +1,65 @@ +#------------------------------------------------------------------------------ +# +# MediaWikiAPI +# +#------------------------------------------------------------------------------ +# +# Simple helper class to access the Mediawiki API. +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2010 Jochen Topf <jochen@remote.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
#------------------------------------------------------------------------------
# MediaWikiAPI: simple helper class to access the Mediawiki API.
#------------------------------------------------------------------------------

require 'net/http'
require 'uri'
require 'json'

module MediaWikiAPI

  # Sends GET requests to one script (by default the API) of a wiki.
  class API

    # host - name of the wiki host.
    # port - TCP port; TLS is used if this is 443.
    # path - path of the script including the trailing '?'. The encoded
    #        parameters of a request are appended to it.
    def initialize(host, port=80, path='/w/api.php?')
      @host = host
      @port = port
      @path = path
      @headers = {}
    end

    # Set a HTTP header that will be sent with every request.
    def add_header(name, value)
      @headers[name] = value
    end

    # Build the request path from a Hash of parameters.
    #
    # Names and values are converted to Strings and form-encoded. Page
    # titles can contain '=', '&', '?', '#' or non-ASCII characters which
    # would otherwise produce a different (or invalid) request.
    def build_path(params)
      pairs = params.to_a.map do |name, value|
        URI.encode_www_form_component(name.to_s) + '=' + URI.encode_www_form_component(value.to_s)
      end
      @path + pairs.join('&')
    end

    # Send a GET request with the given parameters.
    #
    # Returns the Net::HTTPResponse. The connection is closed when the
    # request is done (block form of Net::HTTP.start), so a long run does
    # not pile up one open socket per page.
    def get(params)
      path = build_path(params)
      Net::HTTP.start(@host, @port, :use_ssl => (@port == 443)) do |http|
        http.get(path, @headers)
      end
    end

    # Send a query to the API and return the decoded JSON reply.
    #
    # params - Hash of API parameters. 'action' and 'format' are set here;
    #          the Hash given by the caller is not changed.
    #
    # Raises RuntimeError if the server does not answer with a 2xx status
    # and JSON::ParserError if the body is not valid JSON.
    def query(params)
      result = get(params.merge(:action => 'query', :format => 'json'))
      unless result.is_a?(Net::HTTPSuccess)
        raise "Wiki API request failed: #{result.code} #{result.message}"
      end
      JSON.parse(result.body)
    end

  end

end

#-- THE END -------------------------------------------------------------------
-- ============================================================================
--
--  Taginfo source: Wiki
--
--  post.sql
--
--  Run after the wiki data has been imported: sets the page status, builds
--  the aggregated tables and the statistics, and records the end time.
--
-- ============================================================================

.bail ON

-- status: r = redirect, p = page without description template,
--         t = page with template that was parsed, e = page with template
--         that could not be parsed
-- NOTE(review): has_templ is declared INTEGER in pre.sql but compared with
-- the strings 'true'/'false' here; this relies on how the import script
-- binds Ruby booleans -- confirm with the sqlite3 binding in use.
UPDATE wikipages SET status='r' WHERE type='redirect';
UPDATE wikipages SET status='p' WHERE type='page' AND has_templ='false';
UPDATE wikipages SET status='t' WHERE type='page' AND has_templ='true' AND parsed=1;
UPDATE wikipages SET status='e' WHERE type='page' AND has_templ='true' AND parsed=0;

CREATE INDEX wikipages_key_value_idx ON wikipages(key, value);

-- one row per key (or tag) with the list of languages it is documented in
INSERT INTO wikipages_keys (key,        langs) SELECT key,        group_concat(lang || ' ' || status) FROM wikipages WHERE value IS     NULL GROUP BY key;
INSERT INTO wikipages_tags (key, value, langs) SELECT key, value, group_concat(lang || ' ' || status) FROM wikipages WHERE value IS NOT NULL GROUP BY key, value;

INSERT INTO wiki_languages (language, count_pages) SELECT lang, count(*) FROM wikipages GROUP BY lang;

INSERT INTO stats (key, value) SELECT 'wikipages_keys',      count(*) FROM wikipages_keys;
INSERT INTO stats (key, value) SELECT 'wikipages_tags',      count(*) FROM wikipages_tags;
INSERT INTO stats (key, value) SELECT 'wikipages_languages', count(*) FROM wiki_languages;

ANALYZE;

UPDATE meta SET update_end=datetime('now');

-- ============================================================================
--
--  Taginfo source: Wiki
--
--  pre.sql
--
--  Run before the wiki data is imported: (re)creates all tables and records
--  the start time.
--
-- ============================================================================

.bail ON

DROP TABLE IF EXISTS meta;

CREATE TABLE meta (
    source_id    TEXT,
    source_name  TEXT,
    update_start TEXT,
    update_end   TEXT,
    data_until   TEXT
);

INSERT INTO meta (source_id, source_name, update_start, data_until) SELECT 'wiki', 'Wiki', datetime('now'), datetime('now');

DROP TABLE IF EXISTS wikipages;

CREATE TABLE wikipages (
    lang             VARCHAR,
    tag              VARCHAR,
    key              VARCHAR,
    value            VARCHAR,
    title            VARCHAR,
    tgroup           VARCHAR,
    type             VARCHAR,
    has_templ        INTEGER,
    parsed           INTEGER,
    description      VARCHAR,
    image            VARCHAR,
    on_node          INTEGER,
    on_way           INTEGER,
    on_area          INTEGER,
    on_relation      INTEGER,
    tags_implies     VARCHAR,
    tags_combination VARCHAR,
    tags_linked      VARCHAR,
    status           VARCHAR
);

DROP TABLE IF EXISTS wikipages_keys;

CREATE TABLE wikipages_keys (
    key   VARCHAR,
    langs VARCHAR
);

DROP TABLE IF EXISTS wikipages_tags;

CREATE TABLE wikipages_tags (
    key   VARCHAR,
    value VARCHAR,
    langs VARCHAR
);

DROP TABLE IF EXISTS wiki_languages;

CREATE TABLE wiki_languages (
    language    VARCHAR,
    count_pages INT
);

-- The stats table used to be dropped and created twice in this file with
-- different column types; only the last definition had any effect, so that
-- is the one kept.
DROP TABLE IF EXISTS stats;

CREATE TABLE stats (
    key   VARCHAR,
    value INT64
);
#!/bin/sh
#
#  Taginfo source: Wiki
#
#  update.sh DIR
#
#  Rebuilds the wiki database from scratch in DIR: removes the files of the
#  last run, creates the tables (pre.sql), gets the page list and the page
#  contents from the wiki, and builds the aggregated tables (post.sql).
#
#  Must be run from the directory containing this script, because pre.sql,
#  post.sql and the Ruby scripts are referenced by relative path.
#

# stop at the first command that fails
set -e

DIR=$1

if [ -z "$DIR" ]; then
    echo "Usage: update.sh DIR"
    exit 1
fi

# printf instead of 'echo -n': the latter is not portable across /bin/sh
# implementations
printf 'Start wiki: '; date

# all expansions are quoted so that DIR may contain spaces
DATABASE="$DIR/taginfo-wiki.db"
LOGFILE="$DIR/get_wiki_data.log"

rm -f "$DIR/allpages.list"
rm -f "$DIR/tagpages.list"
rm -f "$LOGFILE"
rm -f "$DATABASE"

echo "Running pre.sql..."
sqlite3 "$DATABASE" <pre.sql

echo "Getting page list..."
./get_page_list.rb "$DIR"

echo "Getting wiki data..."
# the script writes its debugging output to STDOUT
./get_wiki_data.rb "$DIR" >"$LOGFILE"

echo "Running post.sql..."
sqlite3 "$DATABASE" <post.sql

printf 'Done wiki: '; date