aboutsummaryrefslogtreecommitdiff
path: root/sources/wiki
diff options
context:
space:
mode:
authorJochen Topf <jochen@topf.org>2010-10-04 18:41:53 +0200
committerJochen Topf <jochen@topf.org>2010-10-04 18:41:53 +0200
commit9918c2c4c266a29848ce39fe2496876c66c3a48e (patch)
treeb49fe450d33dcb3c30b37f7bff68fbb475ecec66 /sources/wiki
downloadtaginfo-9918c2c4c266a29848ce39fe2496876c66c3a48e.tar
taginfo-9918c2c4c266a29848ce39fe2496876c66c3a48e.tar.gz
first commit
Diffstat (limited to 'sources/wiki')
-rw-r--r--sources/wiki/README22
-rwxr-xr-xsources/wiki/get_page_list.rb123
-rwxr-xr-xsources/wiki/get_wiki_data.rb311
-rw-r--r--sources/wiki/lib/mediawikiapi.rb65
-rw-r--r--sources/wiki/post.sql28
-rw-r--r--sources/wiki/pre.sql80
-rwxr-xr-xsources/wiki/update.sh40
7 files changed, 669 insertions, 0 deletions
diff --git a/sources/wiki/README b/sources/wiki/README
new file mode 100644
index 0000000..6c3a621
--- /dev/null
+++ b/sources/wiki/README
@@ -0,0 +1,22 @@
+
+Taginfo Sources: Wiki
+=====================
+
+PREREQUISITES
+-------------
+
+On Debian/Ubuntu you need the following packages:
+ libjson-ruby sqlite3 libsqlite3-ruby1.8
+
+
+RUNNING
+-------
+
+Run ./update.sh DIR
+
+where DIR is the name of a directory where the data will be stored.
+
+The file 'allpages.list' is currently not used. But because we get the
+information from the wiki anyway, we just store it. Maybe in the future
+it is of some use.
+
diff --git a/sources/wiki/get_page_list.rb b/sources/wiki/get_page_list.rb
new file mode 100755
index 0000000..649efbe
--- /dev/null
+++ b/sources/wiki/get_page_list.rb
@@ -0,0 +1,123 @@
+#!/usr/bin/ruby
+#------------------------------------------------------------------------------
+#
+# get_page_list.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+# Gets the list of all wiki pages from the OSM wiki.
+#
+# Two files will be written: 'allpages.list' contains all pages in the wiki,
+# 'tagpages.list' contains all pages about tags from the wiki. Both have the
+# format:
+#
+# <type> TAB <namespace> TAB <title>
+#
+# The <type> is either 'page' or 'redirect', depending on whether this is a
+# proper wiki page or a redirect to another wiki page, respectively.
+#
+# The <namespace> gives the namespace this page is in. This is empty for the
+# main namespace.
+#
+# <title> is the full title of the wiki page including leading namespaces etc.
+#
+# The files will be created in DIR or in the current directory, if no directory
+# was given on the command line.
+#
+#------------------------------------------------------------------------------
+#
+# Copyright (C) 2010 Jochen Topf <jochen@remote.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'pp'
+
+require 'net/http'
+require 'uri'
+require 'json'
+
+require 'lib/mediawikiapi.rb'
+
+#------------------------------------------------------------------------------
+
+# Ask the wiki for its namespaces and return a hash that maps the canonical
+# name to the numeric id of every namespace whose canonical name consists of
+# exactly two upper-case letters (presumably the language namespaces such as
+# 'DE'). All other namespaces are left out.
+#
+# api:: object responding to query(params); its answer must contain
+#       ['query']['namespaces'].
+def get_namespaces(api)
+    response = api.query(:meta => 'siteinfo', :siprop => 'namespaces')
+    language_namespaces = {}
+    response['query']['namespaces'].each_value do |info|
+        canonical = info['canonical']
+        next unless canonical =~ /^[A-Z]{2}$/
+        language_namespaces[canonical] = info['id']
+    end
+    language_namespaces
+end
+
+# Walk through the 'allpages' list of one namespace and yield every page
+# title to the given block, with whitespace in the title replaced by '_'.
+#
+# api::         object responding to query(params)
+# namespaceid:: numeric id of the namespace to list
+# options::     hash; a true value under :redirect lists only redirects,
+#               anything else lists only non-redirects
+#
+# The wiki answers in batches. As long as an answer carries a
+# 'query-continue' entry, the next request starts at the title given there.
+def get_page_list(api, namespaceid, options)
+    redirect_filter = options[:redirect] ? 'redirects' : 'nonredirects'
+    start_title = ''
+    loop do
+        response = api.query(
+            :list          => 'allpages',
+            :aplimit       => 'max',
+            :apfrom        => start_title,
+            :apnamespace   => namespaceid,
+            :apfilterredir => redirect_filter
+        )
+        response['query']['allpages'].each do |entry|
+            yield entry['title'].gsub(/\s/, '_')
+        end
+        continuation = response['query-continue']
+        return unless continuation
+        start_title = continuation['allpages']['apfrom'].gsub(/\s/, '_')
+    end
+end
+
+#------------------------------------------------------------------------------
+
+# Main program: write 'allpages.list' and 'tagpages.list' into DIR (first
+# command line argument, default: current directory).
+
+dir = ARGV[0] || '.'
+
+api = MediaWikiAPI::API.new('wiki.openstreetmap.org')
+api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')
+
+namespaces = get_namespaces(api)
+
+# add main namespace
+namespaces[''] = 0
+
+# The block form of File.open closes both files even when one of the wiki
+# requests raises half way through the run.
+File.open(dir + '/allpages.list', 'w') do |allpages|
+    File.open(dir + '/tagpages.list', 'w') do |tagpages|
+        namespaces.keys.sort.each do |namespace|
+            id = namespaces[namespace]
+
+            # for every namespace: proper pages first, then redirects
+            [['page', false], ['redirect', true]].each do |type, redirect|
+                get_page_list(api, id, :redirect => redirect) do |page|
+                    line = [type, namespace, page].join("\t")
+                    allpages.puts line
+                    # only 'Key:' and 'Tag:' pages (with optional prefix) are tag pages
+                    tagpages.puts line if page =~ /^([^:]+:)?(Key|Tag):(.+)$/
+                end
+            end
+        end
+    end
+end
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb
new file mode 100755
index 0000000..742e520
--- /dev/null
+++ b/sources/wiki/get_wiki_data.rb
@@ -0,0 +1,311 @@
+#!/usr/bin/ruby
+#------------------------------------------------------------------------------
+#
+# get_wiki_data.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+# Reads all the wiki pages from 'tagpages.list' and gets their content from
+# the OSM wiki. The pages are parsed and the information stored in the
+# sqlite database 'taginfo-wiki.db' which must have been initialized before.
+#
+# All files are in DIR or the current directory if no directory was given on
+# the command line.
+#
+# This script writes copious debugging information to STDOUT. You might want
+# to redirect that to a file.
+#
+#------------------------------------------------------------------------------
+#
+# Copyright (C) 2010 Jochen Topf <jochen@remote.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'pp'
+
+require 'json'
+require 'net/http'
+require 'uri'
+require 'sqlite3'
+
+require 'lib/mediawikiapi.rb'
+
+#------------------------------------------------------------------------------
+
+# One 'Key:...' or 'Tag:...' page of the wiki, as listed in 'tagpages.list'.
+#
+# The title is taken apart into language, page kind ('key' or 'tag'), key and
+# value when the object is created. The page text is attached later through
+# #content= and analysed with #parse_content. Every instance registers itself
+# in a class-wide table indexed by title (see WikiPage.pages, WikiPage.find).
+class WikiPage
+
+    @@pages = {}
+
+    attr_accessor :content
+    attr_accessor :description, :image, :group, :onNode, :onWay, :onArea, :onRelation, :has_templ
+    attr_reader :type, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed
+
+    # type::      'page' or 'redirect'
+    # namespace:: mediawiki namespace ('XX') or '' for the main namespace
+    # title::     full wiki page title, e.g. 'DE:Tag:highway=primary'
+    def initialize(type, namespace, title)
+        @type      = type
+        @namespace = namespace
+        @title     = title
+
+        @tag = title.gsub(/^([^:]+:)?(Key|Tag):/, '') # complete tag (key=value)
+        @key = @tag.sub(/=.*/, '')                    # key
+        if @tag =~ /=/
+            @value = @tag.sub(/.*?=/, '')             # value (if any)
+        end
+        if title =~ /^(.*):(Key|Tag):/
+            @lang  = $1.downcase                      # IETF language tag
+            @ttype = $2.downcase                      # 'tag' or 'key'
+        else
+            @lang = 'en'
+            # Titles without a language prefix start directly with 'Key:' or
+            # 'Tag:'. The kind has to be recorded for them too, otherwise the
+            # key/tag checks in #valid? are silently skipped for these pages.
+            @ttype = $1.downcase if title =~ /^(Key|Tag):/
+        end
+
+        @has_templ = false
+
+        @tags_implies     = []
+        @tags_combination = []
+        @tags_linked      = []
+
+        @group      = ''
+        @onNode     = false
+        @onWay      = false
+        @onArea     = false
+        @onRelation = false
+
+        @parsed = nil
+
+        @@pages[@title] = self
+    end
+
+    # All pages created so far, sorted by title.
+    def self.pages
+        @@pages.values.sort{ |a,b| a.title <=> b.title }
+    end
+
+    # The page with exactly this title, or nil.
+    def self.find(name)
+        @@pages[name]
+    end
+
+    # Has this wiki page a name that we can understand and process?
+    #
+    # The language must look like 'xx' or 'xx-something', a key page must not
+    # have a value, a tag page must have one, and neither key nor value may
+    # contain a slash.
+    def valid?
+        return false if @lang !~ /^[a-z]{2}(-[a-z0-9-]+)?$/
+        return false if @ttype == 'key' && ! @value.nil?
+        return false if @ttype == 'tag' &&   @value.nil?
+        return false if @key   =~ %r{/}
+        return false if @value =~ %r{/}
+        return true
+    end
+
+    # Return parameters for API call to read this page.
+    def params
+        { :title => title, :action => 'raw' }
+    end
+
+    # Remember a tag that is mentioned on this page.
+    def add_tag_link(tag)
+        @tags_linked << tag
+    end
+
+    # Write this page as one row into the 'wikipages' table of db.
+    #
+    # NOTE(review): has_templ is bound as a Ruby boolean while the other
+    # flags are converted to 1/0. post.sql compares has_templ against the
+    # strings 'true' and 'false', so do not change the binding here without
+    # changing that script as well.
+    def insert(db)
+        db.execute(
+            "INSERT INTO wikipages (lang, tag, key, value, title, tgroup, type, has_templ, parsed, description, image, on_node, on_way, on_area, on_relation, tags_implies, tags_combination, tags_linked) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+            lang,
+            tag,
+            key,
+            value,
+            title,
+            group,
+            type,
+            has_templ,
+            parsed ? 1 : 0,
+            description,
+            image,
+            onNode     ? 1 : 0,
+            onWay      ? 1 : 0,
+            onArea     ? 1 : 0,
+            onRelation ? 1 : 0,
+            tags_implies.    sort.uniq.join(','),
+            tags_combination.sort.uniq.join(','),
+            tags_linked.     sort.uniq.join(',')
+        )
+    end
+
+    # Parse content of the wiki page. This will find the templates
+    # and their parameters.
+    #
+    # Every template is yielded to the given block as soon as its closing
+    # '}}' has been seen, so nested templates are yielded before the template
+    # that contains them. Afterwards the template is added as a parameter to
+    # the enclosing one.
+    #
+    # Sets @parsed to true. If anything raises a StandardError while parsing
+    # (this includes errors raised inside the block), a message is printed
+    # and @parsed is set to false instead.
+    def parse_content
+        @parsed = true
+        text = @content
+
+        # dummy template as base context
+        context = [ Template.new ]
+
+        loop do
+            # split text into ('before', 'token', 'after')
+            m = /^(.*?)(\{\{|\}\}|[|=])(.*)$/m.match(text)
+
+            # we are done if there are no more tokens
+            if m.nil?
+                return
+            end
+
+            # do the right thing depending on next token
+            case m[2]
+                when '{{' # start of template
+                    context.last.add_parameter(m[1].strip)
+                    context << Template.new()
+                when '}}' # end of template
+                    context.last.add_parameter(m[1].strip)
+                    c = context.pop
+                    yield c
+                    context.last.add_parameter(c)
+                when '|' # template parameter
+                    context.last.add_parameter(m[1].strip)
+                    context.last.parname(nil)
+                when '=' # named template parameter
+                    # a parameter called ':' is stored under the name 'subkey'
+                    parameter_name = (m[1].strip == ':') ? 'subkey' : m[1].strip
+                    context.last.parname(parameter_name)
+            end
+
+            # 'after' is our next 'text'
+            text = m[3]
+        end
+    rescue
+        puts "Parsing of page #{title} failed"
+        @parsed = false
+    end
+
+end
+
+#------------------------------------------------------------------------------
+
+# A template call ('{{name|value|key=value}}') as collected by
+# WikiPage#parse_content.
+#
+# name::             first non-empty positional value
+# parameters::       the remaining positional values, in order
+# named_parameters:: hash from parameter name to the array of values that
+#                    were added while this name was selected
+class Template
+
+    attr_reader :name, :parameters, :named_parameters
+
+    def initialize
+        @name, @parname   = nil, nil
+        @parameters       = []
+        @named_parameters = {}
+    end
+
+    # Choose the parameter name that following values are stored under.
+    # nil switches back to positional parameters.
+    def parname(name)
+        @parname = name
+    end
+
+    # Store a value; empty strings are ignored.
+    def add_parameter(value)
+        return nil if value == ''
+
+        if !@parname.nil?
+            # named parameter
+            (@named_parameters[@parname] ||= []) << value
+        elsif @name.nil?
+            # first positional parameter is really the name of this template
+            @name = value
+        else
+            @parameters << value
+        end
+    end
+
+end
+
+#------------------------------------------------------------------------------
+
+# Main program: read 'tagpages.list' from DIR (first command line argument,
+# default: current directory), fetch and parse every valid page and store the
+# result in DIR/taginfo-wiki.db. All inserts happen in a single transaction.
+
+dir = ARGV[0] || '.'
+
+api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?')
+api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')
+
+db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
+
+db.execute('BEGIN TRANSACTION')
+
+File.open(dir + '/tagpages.list') do |wikipages|
+    wikipages.each do |line|
+        line.chomp!
+        t = line.split("\t")
+        page = WikiPage.new(t[0], t[1], t[2])
+        puts "page: (#{page.title}) (#{page.type}) (#{page.namespace}) (#{page.tag})"
+
+        if page.valid?
+            res = api.get(page.params)
+            page.content = res.body
+
+            page.parse_content do |template|
+                puts "Template: #{template.name} [#{template.parameters.join(',')}] #{template.named_parameters.inspect}"
+                named = template.named_parameters
+
+                # {{Key|k}} and {{Tag|k|v}} are links to other keys/tags
+                if template.name == 'Key' || template.name == 'Tag'
+                    tag = template.parameters[0]
+                    if template.parameters[1]
+                        tag += '=' + template.parameters[1]
+                    end
+                    # A template without a plain-text first parameter yields
+                    # nil (or a nested Template) here. Storing that would make
+                    # the sort in WikiPage#insert raise, so it is skipped.
+                    page.add_tag_link(tag) if tag.is_a?(String)
+                end
+
+                if template.name =~ /(Key|Value)Description$/
+                    page.has_templ = true
+                end
+
+                # Description: plain text pieces are used as they are, nested
+                # templates are replaced by their parameters joined with '='.
+                if named['description']
+                    desc = named['description'].map do |i|
+                        (i.class == Template) ? ' ' + i.parameters.join('=') + ' ' : i
+                    end
+                    page.description = desc.join('').strip
+                end
+
+                if named['image']
+                    page.image = named['image'][0]
+                end
+                if named['group']
+                    page.group = named['group'][0]
+                end
+
+                # A flag is only set if the parameter is exactly 'yes'.
+                %w(onNode onWay onArea onRelation).each do |flag|
+                    page.send(flag + '=', true) if named[flag] == ['yes']
+                end
+
+                # Only nested templates count as implied/combination tags.
+                { 'implies' => page.tags_implies, 'combination' => page.tags_combination }.each do |parameter, target|
+                    (named[parameter] || []).each do |i|
+                        target << i.parameters.join('=') if i.class == Template
+                    end
+                end
+            end
+            page.insert(db)
+        else
+            puts "invalid page: #{page.title}"
+        end
+    end
+end
+
+db.execute('COMMIT')
+
+
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/lib/mediawikiapi.rb b/sources/wiki/lib/mediawikiapi.rb
new file mode 100644
index 0000000..7df131c
--- /dev/null
+++ b/sources/wiki/lib/mediawikiapi.rb
@@ -0,0 +1,65 @@
+#------------------------------------------------------------------------------
+#
+# MediaWikiAPI
+#
+#------------------------------------------------------------------------------
+#
+# Simple helper class to access the Mediawiki API.
+#
+#------------------------------------------------------------------------------
+#
+# Copyright (C) 2010 Jochen Topf <jochen@remote.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'cgi' # CGI.escape, used by MediaWikiAPI::API#build_path
+
+module MediaWikiAPI
+
+    # Minimal HTTP client for a MediaWiki installation.
+    #
+    # Net::HTTP and JSON are expected to be loaded by the script that uses
+    # this class.
+    class API
+
+        # host:: host name of the wiki
+        # port:: TCP port
+        # path:: request path up to and including the '?'; the encoded
+        #        parameters are appended to it
+        def initialize(host, port=80, path='/w/api.php?')
+            @host = host
+            @port = port
+            @path = path
+            @headers = {}
+        end
+
+        # Set an HTTP header that is sent with every request.
+        def add_header(name, value)
+            @headers[name] = value
+        end
+
+        # Return the request path for the given parameters (hash or list of
+        # [name, value] pairs). Names and values are URL-escaped, so values
+        # such as page titles may contain '&', '=', '#', blanks or non-ASCII
+        # characters.
+        def build_path(params)
+            @path + params.to_a.map{ |name, value| CGI.escape(name.to_s) + '=' + CGI.escape(value.to_s) }.join('&')
+        end
+
+        # Send a GET request with the given parameters and return the
+        # response object. The connection is closed before this method
+        # returns; the response body has been read by then.
+        def get(params)
+            path = build_path(params)
+            Net::HTTP.start(@host, @port) do |http|
+                http.get(path, @headers)
+            end
+        end
+
+        # Run an API query and return the parsed JSON answer. 'action' and
+        # 'format' are always set to 'query' and 'json'. The hash passed in
+        # is not modified.
+        def query(params)
+            result = get(params.merge(:action => 'query', :format => 'json'))
+            JSON.parse(result.body)
+        end
+
+    end
+
+end
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/post.sql b/sources/wiki/post.sql
new file mode 100644
index 0000000..28e5673
--- /dev/null
+++ b/sources/wiki/post.sql
@@ -0,0 +1,28 @@
+--
+-- Taginfo source: Wiki
+--
+-- post.sql
+--
+-- Run after the wikipages table has been filled: derives the status column,
+-- builds the per-key, per-tag and per-language summary tables, records
+-- counts in stats and stamps the end of the update in meta.
+--
+
+-- sqlite3 shell setting: stop at the first error
+.bail ON
+
+-- One-letter status per page, derived from type, has_templ and parsed:
+--   'r'  type is 'redirect'
+--   'p'  type is 'page', has_templ is 'false'
+--   't'  type is 'page', has_templ is 'true', parsed is 1
+--   'e'  type is 'page', has_templ is 'true', parsed is 0
+-- NOTE(review): has_templ is compared against the strings 'true'/'false';
+-- this relies on how the inserting script binds its boolean - confirm when
+-- changing the sqlite3 binding.
+UPDATE wikipages SET status='r' WHERE type='redirect';
+UPDATE wikipages SET status='p' WHERE type='page' AND has_templ='false';
+UPDATE wikipages SET status='t' WHERE type='page' AND has_templ='true' AND parsed=1;
+UPDATE wikipages SET status='e' WHERE type='page' AND has_templ='true' AND parsed=0;
+
+CREATE INDEX wikipages_key_value_idx ON wikipages(key, value);
+
+-- langs is a comma separated list of '<lang> <status>' entries; rows without
+-- a value go to wikipages_keys, rows with a value go to wikipages_tags
+INSERT INTO wikipages_keys (key, langs) SELECT key, group_concat(lang || ' ' || status) FROM wikipages WHERE value IS NULL GROUP BY key;
+INSERT INTO wikipages_tags (key, value, langs) SELECT key, value, group_concat(lang || ' ' || status) FROM wikipages WHERE value IS NOT NULL GROUP BY key, value;
+
+-- number of pages per language
+INSERT INTO wiki_languages (language, count_pages) SELECT lang, count(*) FROM wikipages GROUP BY lang;
+
+-- row counts of the three summary tables
+INSERT INTO stats (key, value) SELECT 'wikipages_keys', count(*) FROM wikipages_keys;
+INSERT INTO stats (key, value) SELECT 'wikipages_tags', count(*) FROM wikipages_tags;
+INSERT INTO stats (key, value) SELECT 'wikipages_languages', count(*) FROM wiki_languages;
+
+ANALYZE;
+
+UPDATE meta SET update_end=datetime('now');
+
+
diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql
new file mode 100644
index 0000000..2cb3bfd
--- /dev/null
+++ b/sources/wiki/pre.sql
@@ -0,0 +1,80 @@
+--
+-- Taginfo source: Wiki
+--
+-- pre.sql
+--
+-- Creates all tables of the wiki database from scratch (existing tables are
+-- dropped first) and records the start of the update in meta.
+--
+
+-- sqlite3 shell setting: stop at the first error
+.bail ON
+
+DROP TABLE IF EXISTS meta;
+
+CREATE TABLE meta (
+    source_id    TEXT,
+    source_name  TEXT,
+    update_start TEXT,
+    update_end   TEXT,
+    data_until   TEXT
+);
+
+INSERT INTO meta (source_id, source_name, update_start, data_until) SELECT 'wiki', 'Wiki', datetime('now'), datetime('now');
+
+DROP TABLE IF EXISTS wikipages;
+
+-- one row per wiki page read by get_wiki_data.rb; status is filled in by
+-- post.sql
+CREATE TABLE wikipages (
+    lang             VARCHAR,
+    tag              VARCHAR,
+    key              VARCHAR,
+    value            VARCHAR,
+    title            VARCHAR,
+    tgroup           VARCHAR,
+    type             VARCHAR,
+    has_templ        INTEGER,
+    parsed           INTEGER,
+    description      VARCHAR,
+    image            VARCHAR,
+    on_node          INTEGER,
+    on_way           INTEGER,
+    on_area          INTEGER,
+    on_relation      INTEGER,
+    tags_implies     VARCHAR,
+    tags_combination VARCHAR,
+    tags_linked      VARCHAR,
+    status           VARCHAR
+);
+
+DROP TABLE IF EXISTS wikipages_keys;
+
+CREATE TABLE wikipages_keys (
+    key   VARCHAR,
+    langs VARCHAR
+);
+
+DROP TABLE IF EXISTS wikipages_tags;
+
+CREATE TABLE wikipages_tags (
+    key   VARCHAR,
+    value VARCHAR,
+    langs VARCHAR
+);
+
+DROP TABLE IF EXISTS wiki_languages;
+
+CREATE TABLE wiki_languages (
+    language    VARCHAR,
+    count_pages INT
+);
+
+-- The stats table used to be defined twice in this script; the earlier
+-- definition was dropped again right here, so only this one is kept.
+DROP TABLE IF EXISTS stats;
+
+CREATE TABLE stats (
+    key   VARCHAR,
+    value INT64
+);
+
diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh
new file mode 100755
index 0000000..b6f63a6
--- /dev/null
+++ b/sources/wiki/update.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+#
+# Taginfo source: Wiki
+#
+# update.sh DIR
+#
+# Rebuilds DIR/taginfo-wiki.db from scratch: removes the old page lists, log
+# file and database, creates the tables (pre.sql), fetches the page list and
+# the page contents from the wiki and finally runs post.sql.
+#
+# pre.sql, post.sql and the two Ruby scripts are referenced relative to the
+# current directory, so run this script from the directory it lives in.
+#
+
+# stop at the first failing command
+set -e
+
+DIR=$1
+
+if [ "x" = "x$DIR" ]; then
+    echo "Usage: update.sh DIR"
+    exit 1
+fi
+
+# printf instead of 'echo -n': the latter is not portable across /bin/sh
+# implementations
+printf 'Start wiki: '; date
+
+DATABASE="$DIR/taginfo-wiki.db"
+LOGFILE="$DIR/get_wiki_data.log"
+
+# all expansions are quoted so that DIR may contain blanks
+rm -f "$DIR/allpages.list"
+rm -f "$DIR/tagpages.list"
+rm -f "$LOGFILE"
+rm -f "$DATABASE"
+
+echo "Running pre.sql..."
+sqlite3 "$DATABASE" <pre.sql
+
+echo "Getting page list..."
+./get_page_list.rb "$DIR"
+
+echo "Getting wiki data..."
+./get_wiki_data.rb "$DIR" >"$LOGFILE"
+
+echo "Running post.sql..."
+sqlite3 "$DATABASE" <post.sql
+
+printf 'Done wiki: '; date
+