summary | refs | log | tree | commit | diff
path: root/sources/wiki
diff options
context:
space:
mode:
Diffstat (limited to 'sources/wiki')
-rwxr-xr-x sources/wiki/get_image_info.rb 118
-rwxr-xr-x sources/wiki/get_page_list.rb 16
-rwxr-xr-x sources/wiki/get_wiki_data.rb 20
-rw-r--r-- sources/wiki/lib/mediawikiapi.rb 5
-rw-r--r-- sources/wiki/post.sql 2
-rw-r--r-- sources/wiki/pre.sql 21
6 files changed, 167 insertions, 15 deletions
diff --git a/sources/wiki/get_image_info.rb b/sources/wiki/get_image_info.rb
new file mode 100755
index 0000000..06707ca
--- /dev/null
+++ b/sources/wiki/get_image_info.rb
@@ -0,0 +1,118 @@
+#!/usr/bin/ruby
+#------------------------------------------------------------------------------
+#
+# get_image_info.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+# Gets meta information about images from the OSM wiki.
+#
+# Reads the list of all images used in Key: and Tag: pages from the local
+# database and requests meta information (width, height, mime type, URL, ...)
+# for those images. Writes this data into the wiki_images table.
+#
+# The database must be in DIR or in the current directory, if no directory
+# was given on the command line.
+#
+#------------------------------------------------------------------------------
+#
+# Copyright (C) 2013 Jochen Topf <jochen@remote.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'rubygems'
+
+require 'pp'
+
+require 'net/http'
+require 'uri'
+require 'json'
+require 'sqlite3'
+
+require 'lib/mediawikiapi.rb'
+
+#------------------------------------------------------------------------------
+
+dir = ARGV[0] || '.'
+
+api = MediaWikiAPI::API.new('wiki.openstreetmap.org')
+api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')
+
+db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
+db.results_as_hash = true
+image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages").map{ |row| row['title'] }.select{ |title| !title.nil? && title.match(%r{^(file|image):}i) }
+
+db.execute('BEGIN TRANSACTION');
+
+until image_titles.empty?
+ some_titles = image_titles.slice!(0, 10)
+# puts some_titles.join(",") + "\n"
+
+ begin
+ data = api.query(:prop => 'imageinfo', :iiprop => 'url|size|mime', :titles => some_titles.join('|'), :iiurlwidth => 200, :iiurlheight => 200)
+
+ if !data['query']
+ STDERR.puts "Wiki API call failed (no 'query' field):"
+ pp data
+ next
+ end
+
+ normalized = data['query']['normalized']
+ if normalized
+ normalized.each do |n|
+ db.execute('UPDATE wikipages SET image=? WHERE image=?', n['to'], n['from'])
+ end
+ end
+
+ if !data['query']['pages']
+ STDERR.puts "Wiki API call failed (no 'pages' field):"
+ pp data
+ next
+ end
+
+ data['query']['pages'].each do |k,v|
+ if v['imageinfo']
+ info = v['imageinfo'][0]
+ if info['thumburl'].match(%r{^(.*/)[0-9]{1,4}(px-.*)$})
+ prefix = $1
+ suffix = $2
+ else
+ prefix = nil
+ suffix = nil
+ end
+ db.execute("INSERT INTO wiki_images (image, width, height, size, mime, image_url, thumb_url_prefix, thumb_url_suffix) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
+ v['title'],
+ info['width'],
+ info['height'],
+ info['size'],
+ info['mime'],
+ info['url'],
+ prefix,
+ suffix
+ )
+ end
+ end
+ rescue
+ puts "Wiki API call error:"
+ pp data
+ end
+end
+
+db.execute('COMMIT');
+
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/get_page_list.rb b/sources/wiki/get_page_list.rb
index 88afe10..7cdbafa 100755
--- a/sources/wiki/get_page_list.rb
+++ b/sources/wiki/get_page_list.rb
@@ -70,13 +70,13 @@ end
def get_page_list(api, namespaceid, options)
apfrom = ''
loop do
- data = api.query(:list => 'allpages', :aplimit => 'max', :apfrom => apfrom, :apnamespace => namespaceid, :apfilterredir => options[:redirect] ? 'redirects' : 'nonredirects')
+ data = api.query(:generator => 'allpages', :gaplimit => 'max', :gapfrom => apfrom, :gapnamespace => namespaceid, :gapfilterredir => options[:redirect] ? 'redirects' : 'nonredirects', :prop => 'info')
# pp data
- data['query']['allpages'].each do |h|
- yield h['title'].gsub(/\s/, '_')
+ data['query']['pages'].each do |k,v|
+ yield v['touched'], v['title'].gsub(/\s/, '_')
end
if data['query-continue']
- apfrom = data['query-continue']['allpages']['apfrom'].gsub(/\s/, '_')
+ apfrom = data['query-continue']['allpages']['gapfrom'].gsub(/\s/, '_')
# puts "apfrom=#{apfrom}"
else
return
@@ -102,16 +102,16 @@ tagpages = File.open(dir + '/tagpages.list', 'w')
namespaces.keys.sort.each do |namespace|
id = namespaces[namespace]
- get_page_list(api, id, :redirect => false) do |page|
- line = ['page', namespace, page].join("\t")
+ get_page_list(api, id, :redirect => false) do |timestamp, page|
+ line = ['page', timestamp, namespace, page].join("\t")
allpages.puts line
if page =~ /^([^:]+:)?(Key|Tag):(.+)$/
tagpages.puts line
end
end
- get_page_list(api, id, :redirect => true) do |page|
- line = ['redirect', namespace, page].join("\t")
+ get_page_list(api, id, :redirect => true) do |timestamp, page|
+ line = ['redirect', timestamp, namespace, page].join("\t")
allpages.puts line
if page =~ /^([^:]+:)?(Key|Tag):(.+)$/
tagpages.puts line
diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb
index 302db28..34d1182 100755
--- a/sources/wiki/get_wiki_data.rb
+++ b/sources/wiki/get_wiki_data.rb
@@ -54,10 +54,11 @@ class WikiPage
attr_accessor :content
attr_accessor :description, :image, :group, :onNode, :onWay, :onArea, :onRelation, :has_templ
- attr_reader :type, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed
+ attr_reader :type, :timestamp, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed
- def initialize(type, namespace, title)
+ def initialize(type, timestamp, namespace, title)
@type = type # 'page' or 'redirect'
+ @timestamp = timestamp # page last touched
@namespace = namespace # 'XX' (mediawiki namespace or '')
@title = title # wiki page title
@@ -129,7 +130,7 @@ class WikiPage
content,
group,
type,
- has_templ,
+ has_templ ? 1 : 0,
parsed ? 1 : 0,
description,
image,
@@ -239,8 +240,8 @@ File.open(dir + '/tagpages.list') do |wikipages|
wikipages.each do |line|
line.chomp!
t = line.split("\t")
- page = WikiPage.new(t[0], t[1], t[2])
- puts "page: (#{page.title}) (#{page.type}) (#{page.namespace}) (#{page.tag})"
+ page = WikiPage.new(t[0], t[1], t[2], t[3])
+ puts "page: (#{page.title}) (#{page.type}) (#{page.timestamp}) (#{page.namespace}) (#{page.tag})"
reason = page.check_title
if reason == :ok
@@ -271,7 +272,14 @@ File.open(dir + '/tagpages.list') do |wikipages|
end
end
if template.named_parameters['image']
- page.image = template.named_parameters['image'][0]
+ ititle = template.named_parameters['image'][0]
+ if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i)
+ page.image = "File:#{$2}"
+ else
+ puts "invalid image: #{reason} #{page.title} #{ititle}"
+ db.execute('INSERT INTO invalid_image_titles (reason, page_title, image_title) VALUES (?, ?, ?)', reason, page.title, ititle)
+ page.image = ''
+ end
end
if template.named_parameters['group']
page.group = template.named_parameters['group'][0]
diff --git a/sources/wiki/lib/mediawikiapi.rb b/sources/wiki/lib/mediawikiapi.rb
index a231cee..293aa6f 100644
--- a/sources/wiki/lib/mediawikiapi.rb
+++ b/sources/wiki/lib/mediawikiapi.rb
@@ -26,6 +26,8 @@
#
#------------------------------------------------------------------------------
+require 'cgi'
+
module MediaWikiAPI
class API
@@ -42,12 +44,13 @@ module MediaWikiAPI
end
# Returns @path with the request parameters appended as key=value pairs joined
# by '&'. The added (+) line converts every key and value to a string and
# CGI-escapes it; the removed (-) line joined them unescaped.
def build_path(params)
- @path + params.to_a.map{ |el| el.join('=') }.join('&')
+ @path + params.to_a.map{ |el| CGI::escape(el[0].to_s) + '=' + CGI::escape(el[1].to_s) }.join('&')
end
# Sends an HTTP GET for the path built from params (see build_path) to
# @host:@port with @headers and returns the result of Net::HTTP#get.
#
# Net::HTTP.start is called with a block so that the connection is finished
# when the request is done. Without a block it returns an open connection
# that the caller has to finish, which never happened here, so every call
# left a socket open.
def get(params)
    path = build_path(params)
+#    puts "Getting path [#{path}]"
    Net::HTTP.start(@host, @port) do |http|
        http.get(path, @headers)
    end
end
diff --git a/sources/wiki/post.sql b/sources/wiki/post.sql
index 773a04d..99eb26f 100644
--- a/sources/wiki/post.sql
+++ b/sources/wiki/post.sql
@@ -13,6 +13,8 @@ UPDATE wikipages SET status='e' WHERE type='page' AND has_templ='true' AND parse
CREATE INDEX wikipages_key_value_idx ON wikipages(key, value);
+CREATE INDEX wiki_images_image ON wiki_images(image); -- index for finding wiki_images rows by image title
+
INSERT INTO wikipages_keys (key, langs, lang_count) SELECT key, group_concat(lang || ' ' || status), count(*) FROM wikipages WHERE value IS NULL GROUP BY key;
INSERT INTO wikipages_tags (key, value, langs, lang_count) SELECT key, value, group_concat(lang || ' ' || status), count(*) FROM wikipages WHERE value IS NOT NULL GROUP BY key, value;
diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql
index ebb80d3..8a515e7 100644
--- a/sources/wiki/pre.sql
+++ b/sources/wiki/pre.sql
@@ -33,6 +33,19 @@ CREATE TABLE wikipages (
status TEXT
);
+DROP TABLE IF EXISTS wiki_images;
+
+CREATE TABLE wiki_images ( -- one row per image for which get_image_info.rb received an 'imageinfo' entry
+ image TEXT, -- page title of the image as returned by the wiki API
+ width INTEGER, -- 'width' from the imageinfo entry
+ height INTEGER, -- 'height' from the imageinfo entry
+ size INTEGER, -- 'size' from the imageinfo entry
+ mime TEXT, -- 'mime' from the imageinfo entry
+ image_url TEXT, -- 'url' from the imageinfo entry
+ thumb_url_prefix TEXT, -- part of the thumbnail URL before its 1-4 digit width, ending in '/'
+ thumb_url_suffix TEXT -- part of the thumbnail URL after the width, starting with 'px-'
+);
+
DROP TABLE IF EXISTS wikipages_keys;
CREATE TABLE wikipages_keys (
@@ -64,6 +77,14 @@ CREATE TABLE invalid_page_titles (
title TEXT
);
+DROP TABLE IF EXISTS invalid_image_titles;
+
+CREATE TABLE invalid_image_titles ( -- image parameters that get_wiki_data.rb rejected (not starting with "File:" or "Image:")
+ reason TEXT, -- NOTE(review): get_wiki_data.rb inserts its `reason` variable, which is set from page.check_title - confirm this is the intended value
+ page_title TEXT, -- title of the wiki page that carried the image parameter
+ image_title TEXT -- the rejected image parameter value as found in the template
+);
+
DROP TABLE IF EXISTS words;
CREATE TABLE words (