first commit

author: Jochen Topf <jochen@topf.org> 2010-10-04 18:41:53 +0200
committer: Jochen Topf <jochen@topf.org> 2010-10-04 18:41:53 +0200
commit: 9918c2c4c266a29848ce39fe2496876c66c3a48e (patch)
tree: b49fe450d33dcb3c30b37f7bff68fbb475ecec66 /sources/wiki
download: taginfo-9918c2c4c266a29848ce39fe2496876c66c3a48e.tar
taginfo-9918c2c4c266a29848ce39fe2496876c66c3a48e.tar.gz
7 files changed, 669 insertions, 0 deletions
diff --git a/sources/wiki/README b/sources/wiki/README
new file mode 100644
index 0000000..6c3a621
--- /dev/null
+++ b/sources/wiki/README
@@ -0,0 +1,22 @@
+
+Taginfo Sources: Wiki
+=====================
+
+PREREQUISITES
+-------------
+
+On Debian/Ubuntu you need the following packages:
+    libjson-ruby sqlite3 libsqlite3-ruby1.8
+
+
+RUNNING
+-------
+
+Run ./update.sh DIR
+
+where DIR is the name of a directory where the data will be stored.
+
+The file 'allpages.list' is currently not used, but since we get the
+information from the wiki anyway, we store it. It may be of some use
+in the future.
+
diff --git a/sources/wiki/get_page_list.rb b/sources/wiki/get_page_list.rb
new file mode 100755
index 0000000..649efbe
--- /dev/null
+++ b/sources/wiki/get_page_list.rb
@@ -0,0 +1,123 @@
+#!/usr/bin/ruby
+#------------------------------------------------------------------------------
+#
+#  get_page_list.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+#  Gets the list of all wiki pages from the OSM wiki.
+#
+#  Two files will be written: 'allpages.list' contains all pages in the wiki,
+#  'tagpages.list' contains all pages about tags from the wiki. Both have the
+#  format:
+#
+#  <type> TAB <namespace> TAB <title>
+#
+#  The <type> is either 'page' or 'redirect', depending on whether this is a
+#  proper wiki page or a redirect to another wiki page, respectively.
+#
+#  The <namespace> gives the namespace this page is in. This is empty for the
+#  main namespace.
+#
+#  <title> is the full title of the wiki page including leading namespaces etc.
+#
+#  The files will be created in DIR or in the current directory, if no directory
+#  was given on the command line.
+#
+#------------------------------------------------------------------------------
+#
+#  Copyright (C) 2010  Jochen Topf <jochen@remote.org>
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License along
+#  with this program; if not, write to the Free Software Foundation, Inc.,
+#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'pp'
+
+require 'net/http'
+require 'uri'
+require 'json'
+
+require 'lib/mediawikiapi.rb'
+
+#------------------------------------------------------------------------------
+
+# Ask the wiki for its namespaces and keep those whose canonical name
+# matches /^[A-Z]{2}$/ (two uppercase letters; presumably the
+# language-specific namespaces).
+#
+# api:: object whose +query+ method returns the decoded 'siteinfo' response
+#
+# Returns a hash mapping the canonical namespace name to the namespace id.
+def get_namespaces(api)
+    siteinfo = api.query(:meta => 'siteinfo', :siprop => 'namespaces')
+    siteinfo['query']['namespaces'].values.inject({}) do |found, entry|
+        canonical = entry['canonical']
+        found[canonical] = entry['id'] if canonical =~ /^[A-Z]{2}$/
+        found
+    end
+end
+
+# Yield the title of every page in one namespace of the wiki, fetching the
+# list batch by batch until the API reports no further continuation.
+#
+# api::         object whose +query+ method returns the decoded API response
+# namespaceid:: id of the namespace to list
+# options::     hash; a true :redirect lists redirects only, otherwise
+#               only non-redirect pages are listed
+#
+# Whitespace in the yielded titles is replaced by '_'.
+#
+# Both continuation formats of the MediaWiki API are understood: the older
+# 'query-continue' => { 'allpages' => { ... } } and the newer top-level
+# 'continue' => { ... }. All parameters found there are sent back with the
+# next request, so the name of the continuation parameter does not matter.
+def get_page_list(api, namespaceid, options)
+    continuation = { :apfrom => '' }
+    loop do
+        params = {
+            :list          => 'allpages',
+            :aplimit       => 'max',
+            :apnamespace   => namespaceid,
+            :apfilterredir => options[:redirect] ? 'redirects' : 'nonredirects'
+        }.merge(continuation)
+        data = api.query(params)
+        data['query']['allpages'].each do |h|
+            yield h['title'].gsub(/\s/, '_')
+        end
+
+        next_params = data['continue'] || (data['query-continue'] && data['query-continue']['allpages'])
+        return if next_params.nil?
+
+        # Whitespace is replaced by '_' so that the values can be placed in
+        # a URL query string as they are.
+        continuation = {}
+        next_params.each do |name, value|
+            continuation[name.to_sym] = value.to_s.gsub(/\s/, '_')
+        end
+    end
+end
+
+#------------------------------------------------------------------------------
+
+# Directory the list files are written to; defaults to the current directory.
+dir = ARGV[0] || '.'
+
+api = MediaWikiAPI::API.new('wiki.openstreetmap.org')
+api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')
+
+namespaces = get_namespaces(api)
+
+# add main namespace
+namespaces[''] = 0
+
+# The block form of File.open closes both files even if a request raises.
+File.open(File.join(dir, 'allpages.list'), 'w') do |allpages|
+    File.open(File.join(dir, 'tagpages.list'), 'w') do |tagpages|
+        namespaces.keys.sort.each do |namespace|
+            id = namespaces[namespace]
+
+            # For each namespace first the proper pages, then the redirects.
+            [ ['page', false], ['redirect', true] ].each do |type, redirect|
+                get_page_list(api, id, :redirect => redirect) do |page|
+                    line = [type, namespace, page].join("\t")
+                    allpages.puts line
+                    # Only pages about keys or tags go into tagpages.list.
+                    if page =~ /^([^:]+:)?(Key|Tag):(.+)$/
+                        tagpages.puts line
+                    end
+                end
+            end
+        end
+    end
+end
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb
new file mode 100755
index 0000000..742e520
--- /dev/null
+++ b/sources/wiki/get_wiki_data.rb
@@ -0,0 +1,311 @@
+#!/usr/bin/ruby
+#------------------------------------------------------------------------------
+#
+#  get_wiki_data.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+#  Reads all the wiki pages from 'tagpages.list' and gets their content from
+#  the OSM wiki. The pages are parsed and the information stored in the
+#  sqlite database 'taginfo-wiki.db' which must have been initialized before.
+#
+#  All files are in DIR or the current directory if no directory was given on
+#  the command line.
+#
+#  This script writes copious debugging information to STDOUT. You might want
+#  to redirect that to a file.
+#
+#------------------------------------------------------------------------------
+#
+#  Copyright (C) 2010  Jochen Topf <jochen@remote.org>
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License along
+#  with this program; if not, write to the Free Software Foundation, Inc.,
+#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'pp'
+
+require 'json'
+require 'net/http'
+require 'uri'
+require 'sqlite3'
+
+require 'lib/mediawikiapi.rb'
+
+#------------------------------------------------------------------------------
+
+# A wiki page describing a key or a tag.
+#
+# The page title is taken apart into language, kind of page ('key' or
+# 'tag'), key and value. After the content has been set, +parse_content+
+# finds the templates used on the page. Every created page is remembered
+# under its title and can be looked up with WikiPage.find.
+class WikiPage
+
+    # all pages created so far, indexed by title
+    @@pages = {}
+
+    attr_accessor :content
+    attr_accessor :description, :image, :group, :onNode, :onWay, :onArea, :onRelation, :has_templ
+    attr_reader :type, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed
+
+    # type::      'page' or 'redirect'
+    # namespace:: mediawiki namespace ('XX') or '' for the main namespace
+    # title::     full title of the wiki page
+    def initialize(type, namespace, title)
+        @type      = type       # 'page' or 'redirect'
+        @namespace = namespace  # 'XX' (mediawiki namespace or '')
+        @title     = title      # wiki page title
+
+        @tag       = title.gsub(/^([^:]+:)?(Key|Tag):/, '') # complete tag (key=value)
+        @key       = @tag.sub(/=.*/, '')                    # key
+        if @tag =~ /=/
+            @value = @tag.sub(/.*?=/, '')                   # value (if any)
+        end
+        if title =~ /^(.*):(Key|Tag):/
+            @lang  = $1.downcase                            # IETF language tag
+            @ttype = $2.downcase                            # 'tag' or 'key'
+        else
+            @lang  = 'en'
+            # Titles without a language prefix still say whether they are
+            # about a key or a tag; without this valid? could not check
+            # English pages for a matching value.
+            @ttype = $1.downcase if title =~ /^(Key|Tag):/
+        end
+
+        @has_templ  = false
+
+        @tags_implies     = []
+        @tags_combination = []
+        @tags_linked      = []
+
+        @group      = ''
+        @onNode     = false
+        @onWay      = false
+        @onArea     = false
+        @onRelation = false
+
+        @parsed = nil
+
+        @@pages[@title] = self
+    end
+
+    # All pages created so far, sorted by title.
+    def self.pages
+        @@pages.values.sort_by { |page| page.title }
+    end
+
+    # The page with the given title or nil if there is none.
+    def self.find(name)
+        @@pages[name]
+    end
+
+    # Has this wiki page a name that we can understand and process?
+    #
+    # The language must look like a language tag, a 'key' page must not
+    # have a value, a 'tag' page must have one, and neither key nor value
+    # may contain a slash.
+    def valid?
+        return false if @lang  !~ /^[a-z]{2}(-[a-z0-9-]+)?$/
+        return false if @ttype == 'key' && ! @value.nil?
+        return false if @ttype == 'tag' &&   @value.nil?
+        return false if @key   =~ %r{/}
+        return false if @value =~ %r{/}
+        return true
+    end
+
+    # Return parameters for API call to read this page.
+    def params
+        { :title => title, :action => 'raw' }
+    end
+
+    # Remember a tag (or key) that this page links to.
+    def add_tag_link(tag)
+        @tags_linked << tag
+    end
+
+    # Write this page as one row into the 'wikipages' table of +db+.
+    #
+    # The tag lists are stored sorted, without duplicates and joined
+    # with ','.
+    #
+    # NOTE(review): has_templ is passed as a Ruby boolean while the other
+    # flags are converted to 1/0. post.sql compares has_templ against
+    # 'true'/'false', so both have to be changed together.
+    def insert(db)
+        db.execute(
+            "INSERT INTO wikipages (lang, tag, key, value, title, tgroup, type, has_templ, parsed, description, image, on_node, on_way, on_area, on_relation, tags_implies, tags_combination, tags_linked) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+            lang,
+            tag,
+            key,
+            value,
+            title,
+            group,
+            type,
+            has_templ,
+            parsed     ? 1 : 0,
+            description,
+            image,
+            onNode     ? 1 : 0,
+            onWay      ? 1 : 0,
+            onArea     ? 1 : 0,
+            onRelation ? 1 : 0,
+            tags_implies.    sort.uniq.join(','),
+            tags_combination.sort.uniq.join(','),
+            tags_linked.     sort.uniq.join(',')
+        )
+    end
+
+    # Parse content of the wiki page. This will find the templates
+    # and their parameters.
+    #
+    # Each template is yielded to the block when its closing '}}' is
+    # reached, so nested templates are yielded before the template that
+    # contains them; afterwards the nested template is added as a
+    # parameter to the enclosing one.
+    #
+    # If anything raises a StandardError while parsing (including inside
+    # the block), a message is printed and +parsed+ is set to false.
+    def parse_content
+        @parsed = true
+        text = @content
+
+        # dummy template as base context
+        context = [ Template.new ]
+
+        loop do
+            # split text into ('before', 'token', 'after')
+            m = /^(.*?)(\{\{|\}\}|[|=])(.*)$/m.match(text)
+
+            # we are done if there are no more tokens
+            return if m.nil?
+
+            before = m[1].strip
+
+            # do the right thing depending on next token
+            case m[2]
+                when '{{' # start of template
+                    context.last.add_parameter(before)
+                    context << Template.new()
+                when '}}' # end of template
+                    context.last.add_parameter(before)
+                    c = context.pop
+                    yield c
+                    context.last.add_parameter(c)
+                when '|' # template parameter
+                    context.last.add_parameter(before)
+                    context.last.parname(nil)
+                when '=' # named template parameter
+                    # a parameter named ':' is stored under the name 'subkey'
+                    parameter_name = (before == ':') ? 'subkey' : before
+                    context.last.parname(parameter_name)
+            end
+
+            # 'after' is our next 'text'
+            text = m[3]
+        end
+    rescue
+        puts "Parsing of page #{title} failed"
+        @parsed = false
+    end
+
+end
+
+#------------------------------------------------------------------------------
+
+# One template call found on a wiki page: its name, its positional
+# parameters and its named parameters.
+class Template
+
+    attr_reader :name, :parameters, :named_parameters
+
+    def initialize()
+        @name             = nil
+        @parname          = nil
+        @parameters       = []
+        @named_parameters = {}
+    end
+
+    # Set the name under which the following parameters are stored;
+    # nil switches back to positional parameters.
+    def parname(name)
+        @parname = name
+    end
+
+    # Add a parameter value. Empty strings are ignored. While no parameter
+    # name is set, the first value becomes the name of the template and
+    # later values become positional parameters; otherwise the value is
+    # appended to the list kept for the current parameter name.
+    def add_parameter(value)
+        return nil if value == ''
+
+        unless @parname.nil? # named parameter
+            return (@named_parameters[@parname] ||= []) << value
+        end
+
+        # positional parameter; the first one is really the template name
+        @name.nil? ? (@name = value) : (@parameters << value)
+    end
+
+end
+
+#------------------------------------------------------------------------------
+
+# Return the tags named by the nested templates in +values+ (the list
+# stored for one named parameter). Plain text entries are skipped.
+def nested_tags(values)
+    values.select { |v| v.is_a?(Template) }.map { |t| t.parameters.join('=') }
+end
+
+# Build the description text from the values of a 'description'
+# parameter: nested templates are replaced by their parameters joined
+# with '=', surrounded by spaces.
+def description_text(values)
+    desc = []
+    values.each do |i|
+        if i.is_a?(Template)
+            desc << ' ' << i.parameters.join('=') << ' '
+        else
+            desc << i
+        end
+    end
+    desc.join('').strip
+end
+
+# Copy the information found in one parsed template into the page.
+def apply_template(page, template)
+    named = template.named_parameters
+
+    if template.name == 'Key' || template.name == 'Tag'
+        tag = template.parameters[0]
+        # A {{Key}} or {{Tag}} without a usable first parameter names no
+        # tag. Recording nil here would make sorting the linked tags fail
+        # when the page is inserted, aborting the whole run.
+        if tag.is_a?(String)
+            if template.parameters[1]
+                tag += '=' + template.parameters[1]
+            end
+            page.add_tag_link(tag)
+        end
+    end
+
+    if template.name =~ /(Key|Value)Description$/
+        page.has_templ = true
+    end
+
+    if named['description']
+        page.description = description_text(named['description'])
+    end
+    if named['image']
+        page.image = named['image'][0]
+    end
+    if named['group']
+        page.group = named['group'][0]
+    end
+
+    page.onNode     = true if named['onNode']     == ['yes']
+    page.onWay      = true if named['onWay']      == ['yes']
+    page.onArea     = true if named['onArea']     == ['yes']
+    page.onRelation = true if named['onRelation'] == ['yes']
+
+    if named['implies']
+        page.tags_implies.concat(nested_tags(named['implies']))
+    end
+    if named['combination']
+        page.tags_combination.concat(nested_tags(named['combination']))
+    end
+end
+
+#------------------------------------------------------------------------------
+
+# Directory with 'tagpages.list' and the database; defaults to the
+# current directory.
+dir = ARGV[0] || '.'
+
+api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?')
+api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')
+
+db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
+
+# All rows are inserted in a single transaction.
+db.execute('BEGIN TRANSACTION');
+
+File.open(dir + '/tagpages.list') do |wikipages|
+    wikipages.each do |line|
+        line.chomp!
+        # each line is: <type> TAB <namespace> TAB <title>
+        t = line.split("\t")
+        page = WikiPage.new(t[0], t[1], t[2])
+        puts "page: (#{page.title}) (#{page.type}) (#{page.namespace}) (#{page.tag})"
+
+        if page.valid?
+            res = api.get(page.params)
+            page.content = res.body
+
+            page.parse_content do |template|
+                puts "Template: #{template.name} [#{template.parameters.join(',')}] #{template.named_parameters.inspect}"
+                apply_template(page, template)
+            end
+            page.insert(db)
+        else
+            puts "invalid page: #{page.title}"
+        end
+    end
+end
+
+db.execute('COMMIT');
+
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/lib/mediawikiapi.rb b/sources/wiki/lib/mediawikiapi.rb
new file mode 100644
index 0000000..7df131c
--- /dev/null
+++ b/sources/wiki/lib/mediawikiapi.rb
@@ -0,0 +1,65 @@
+#------------------------------------------------------------------------------
+#
+#  MediaWikiAPI
+#
+#------------------------------------------------------------------------------
+#
+#  Simple helper class to access the Mediawiki API.
+#
+#------------------------------------------------------------------------------
+#
+#  Copyright (C) 2010  Jochen Topf <jochen@remote.org>
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License along
+#  with this program; if not, write to the Free Software Foundation, Inc.,
+#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'cgi'
+
+module MediaWikiAPI
+
+    # Minimal HTTP client for a MediaWiki installation. Requests are sent
+    # as GET requests to +path+ on +host+:+port+ with the parameters
+    # appended as query string.
+    class API
+
+        # host:: host name of the wiki
+        # port:: TCP port of the web server
+        # path:: request path up to and including the '?' that starts
+        #        the query string
+        def initialize(host, port=80, path='/w/api.php?')
+            @host = host
+            @port = port
+            @path = path
+            @headers = {}
+        end
+
+        # Set an HTTP header that is sent with every request.
+        def add_header(name, value)
+            @headers[name] = value
+        end
+
+        # Return the request path for the given parameters. Names and
+        # values are URL-escaped, so titles containing characters such
+        # as '&', '#', '+' or non-ASCII letters are transmitted intact.
+        def build_path(params)
+            @path + params.map { |name, value| CGI.escape(name.to_s) + '=' + CGI.escape(value.to_s) }.join('&')
+        end
+
+        # Send a GET request with the given parameters and return the
+        # response object. The connection is closed when the request
+        # is done.
+        def get(params)
+            path = build_path(params)
+            Net::HTTP.start(@host, @port) do |http|
+                http.get(path, @headers)
+            end
+        end
+
+        # Send an 'action=query' request asking for JSON and return the
+        # decoded response. The hash passed in is left unchanged.
+        def query(params)
+            result = get(params.merge(:action => 'query', :format => 'json'))
+            JSON.parse(result.body)
+        end
+
+    end
+
+end
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/post.sql b/sources/wiki/post.sql
new file mode 100644
index 0000000..28e5673
--- /dev/null
+++ b/sources/wiki/post.sql
@@ -0,0 +1,28 @@
+--
+--  Taginfo source: Wiki
+--
+--  post.sql
+--
+
+-- Stop at the first error.
+.bail ON
+
+-- Derive a one-letter status for every wiki page:
+--   'r'  redirect
+--   'p'  page without a description template
+--   't'  page with a description template that was parsed
+--   'e'  page with a description template where parsing failed
+-- has_templ is compared as text ('true'/'false'), which has to match
+-- what the import script stores in that column.
+UPDATE wikipages SET status='r' WHERE type='redirect';
+UPDATE wikipages SET status='p' WHERE type='page' AND has_templ='false';
+UPDATE wikipages SET status='t' WHERE type='page' AND has_templ='true' AND parsed=1;
+UPDATE wikipages SET status='e' WHERE type='page' AND has_templ='true' AND parsed=0;
+
+CREATE INDEX wikipages_key_value_idx ON wikipages(key, value);
+
+-- One row per key (pages without a value) and one row per tag (pages
+-- with a value); 'langs' lists '<lang> <status>' for each page found.
+INSERT INTO wikipages_keys (key, langs) SELECT key, group_concat(lang || ' ' || status) FROM wikipages WHERE value IS NULL GROUP BY key;
+INSERT INTO wikipages_tags (key, value, langs) SELECT key, value, group_concat(lang || ' ' || status) FROM wikipages WHERE value IS NOT NULL GROUP BY key, value;
+
+-- Number of wiki pages per language.
+INSERT INTO wiki_languages (language, count_pages) SELECT lang, count(*) FROM wikipages GROUP BY lang;
+
+-- Row counts of the summary tables.
+INSERT INTO stats (key, value) SELECT 'wikipages_keys',      count(*) FROM wikipages_keys;
+INSERT INTO stats (key, value) SELECT 'wikipages_tags',      count(*) FROM wikipages_tags;
+INSERT INTO stats (key, value) SELECT 'wikipages_languages', count(*) FROM wiki_languages;
+
+ANALYZE;
+
+-- Record when the update finished.
+UPDATE meta SET update_end=datetime('now');
+
diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql
new file mode 100644
index 0000000..2cb3bfd
--- /dev/null
+++ b/sources/wiki/pre.sql
@@ -0,0 +1,80 @@
+--
+--  Taginfo source: Wiki
+--
+--  pre.sql
+--
+
+-- Stop at the first error.
+.bail ON
+
+-- Information about this source and about the update run.
+DROP TABLE IF EXISTS meta;
+
+CREATE TABLE meta (
+    source_id    TEXT,
+    source_name  TEXT,
+    update_start TEXT,
+    update_end   TEXT,
+    data_until   TEXT
+);
+
+-- update_end is filled in by post.sql.
+INSERT INTO meta (source_id, source_name, update_start, data_until) SELECT 'wiki', 'Wiki', datetime('now'), datetime('now');
+
+-- One row per wiki page about a key or tag; 'status' is filled in by
+-- post.sql.
+DROP TABLE IF EXISTS wikipages;
+
+CREATE TABLE wikipages (
+    lang             VARCHAR,
+    tag              VARCHAR,
+    key              VARCHAR,
+    value            VARCHAR,
+    title            VARCHAR,
+    tgroup           VARCHAR,
+    type             VARCHAR,
+    has_templ        INTEGER,
+    parsed           INTEGER,
+    description      VARCHAR,
+    image            VARCHAR,
+    on_node          INTEGER,
+    on_way           INTEGER,
+    on_area          INTEGER,
+    on_relation      INTEGER,
+    tags_implies     VARCHAR,
+    tags_combination VARCHAR,
+    tags_linked      VARCHAR,
+    status           VARCHAR
+);
+
+-- Summary tables, filled by post.sql.
+DROP TABLE IF EXISTS wikipages_keys;
+
+CREATE TABLE wikipages_keys (
+    key   VARCHAR,
+    langs VARCHAR
+);
+
+DROP TABLE IF EXISTS wikipages_tags;
+
+CREATE TABLE wikipages_tags (
+    key   VARCHAR,
+    value VARCHAR,
+    langs VARCHAR
+);
+
+DROP TABLE IF EXISTS wiki_languages;
+
+CREATE TABLE wiki_languages (
+    language    VARCHAR,
+    count_pages INT
+);
+
+-- Statistics, filled by post.sql. (This table used to be created twice
+-- in this file; only the final definition is kept.)
+DROP TABLE IF EXISTS stats;
+
+CREATE TABLE stats (
+    key   VARCHAR,
+    value INT64
+);
+
diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh
new file mode 100755
index 0000000..b6f63a6
--- /dev/null
+++ b/sources/wiki/update.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+#
+#  Taginfo source: Wiki
+#
+#  update.sh DIR
+#
+
+# Abort on the first failing command.
+set -e
+
+DIR=$1
+
+if [ -z "$DIR" ]; then
+    echo "Usage: update.sh DIR"
+    exit 1
+fi
+
+# printf is used instead of 'echo -n' because the -n option of echo is
+# not portable between /bin/sh implementations.
+printf "Start wiki: "; date
+
+# All expansions are quoted so that directory names containing spaces
+# or glob characters work.
+DATABASE="$DIR/taginfo-wiki.db"
+LOGFILE="$DIR/get_wiki_data.log"
+
+# Remove the results of a previous run.
+rm -f "$DIR/allpages.list"
+rm -f "$DIR/tagpages.list"
+rm -f "$LOGFILE"
+rm -f "$DATABASE"
+
+# The SQL files and Ruby scripts are referenced relative to the current
+# directory, so this script has to be run from the directory it is in.
+echo "Running pre.sql..."
+sqlite3 "$DATABASE" <pre.sql
+
+echo "Getting page list..."
+./get_page_list.rb "$DIR"
+
+echo "Getting wiki data..."
+./get_wiki_data.rb "$DIR" >"$LOGFILE"
+
+echo "Running post.sql..."
+sqlite3 "$DATABASE" <post.sql
+
+printf "Done wiki: "; date
+
author	Jochen Topf <jochen@topf.org>	2010-10-04 18:41:53 +0200
committer	Jochen Topf <jochen@topf.org>	2010-10-04 18:41:53 +0200
commit	9918c2c4c266a29848ce39fe2496876c66c3a48e (patch)
tree	b49fe450d33dcb3c30b37f7bff68fbb475ecec66 /sources/wiki
download	taginfo-9918c2c4c266a29848ce39fe2496876c66c3a48e.tar taginfo-9918c2c4c266a29848ce39fe2496876c66c3a48e.tar.gz