author     Jochen Topf <jochen@topf.org>    2013-01-09 19:18:55 +0100
committer  Jochen Topf <jochen@topf.org>    2013-01-09 19:18:55 +0100
commit     64a047f622a5ed15dea94b5e52dd8c948fce9e95 (patch)
tree       1dfeecec0fa0477c1a246ef6567dffc56592d325 /sources/wiki
parent     9cadfd89c12c9223e7c572646680d0bcce57310c (diff)
Better support for wiki images.
Key and tag wiki pages can contain images. Until now we only got the titles of those images; now we also get the URL of the image, URLs for thumbnails, the width, the height, and the MIME type. This information is exposed in the API and is used to show the images in the Overview tab of the key and tag pages.

While we are changing the update process anyway, the program that gets the list of all pages now also outputs the time each page was last changed. This information is not used yet, but it could be used to cache those pages locally, making the update much faster and putting less strain on the wiki server.
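For reference, the two MediaWiki API requests behind these changes can be reproduced with plain net/http and json, independently of the MediaWikiAPI helper class used in the diff below. This is a minimal sketch, not part of the commit: the endpoint path, the example image title, and the namespace id are assumptions for illustration only.

#!/usr/bin/ruby
# Sketch of the 'imageinfo' query that get_image_info.rb sends for each
# batch of image titles (endpoint and title are placeholders).
require 'net/http'
require 'uri'
require 'json'

uri = URI('https://wiki.openstreetmap.org/w/api.php')   # assumed endpoint
uri.query = URI.encode_www_form(
    :action      => 'query',
    :format      => 'json',
    :prop        => 'imageinfo',
    :iiprop      => 'url|size|mime',
    :iiurlwidth  => 200,
    :iiurlheight => 200,
    :titles      => 'File:Example.jpg'                   # placeholder title
)

data = JSON.parse(Net::HTTP.get(uri))
data['query']['pages'].each do |_id, page|
    next unless page['imageinfo']                        # skip missing files
    info = page['imageinfo'][0]
    puts "#{page['title']}: #{info['width']}x#{info['height']} #{info['mime']} #{info['thumburl']}"
end

# Sketch of the switch in get_page_list.rb from list=allpages to
# generator=allpages with prop=info, which exposes the 'touched'
# timestamp for every page (namespace 0 chosen arbitrarily here).
uri.query = URI.encode_www_form(
    :action       => 'query',
    :format       => 'json',
    :generator    => 'allpages',
    :gaplimit     => 'max',
    :gapnamespace => 0,
    :prop         => 'info'
)

JSON.parse(Net::HTTP.get(uri))['query']['pages'].each do |_id, page|
    puts [page['touched'], page['title'].gsub(/\s/, '_')].join("\t")
end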
Diffstat (limited to 'sources/wiki')
-rwxr-xr-x  sources/wiki/get_image_info.rb     118
-rwxr-xr-x  sources/wiki/get_page_list.rb       16
-rwxr-xr-x  sources/wiki/get_wiki_data.rb       20
-rw-r--r--  sources/wiki/lib/mediawikiapi.rb     5
-rw-r--r--  sources/wiki/post.sql                2
-rw-r--r--  sources/wiki/pre.sql                21
6 files changed, 167 insertions, 15 deletions
diff --git a/sources/wiki/get_image_info.rb b/sources/wiki/get_image_info.rb
new file mode 100755
index 0000000..06707ca
--- /dev/null
+++ b/sources/wiki/get_image_info.rb
@@ -0,0 +1,118 @@
+#!/usr/bin/ruby
+#------------------------------------------------------------------------------
+#
+# get_image_info.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+# Gets meta information about images from the OSM wiki.
+#
+# Reads the list of all images used in Key: and Tag: pages from the local
+# database and requests meta information (width, height, mime type, URL, ...)
+# for those images. Writes this data into the wiki_images table.
+#
+# The database must be in DIR or in the current directory, if no directory
+# was given on the command line.
+#
+#------------------------------------------------------------------------------
+#
+# Copyright (C) 2013 Jochen Topf <jochen@remote.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'rubygems'
+
+require 'pp'
+
+require 'net/http'
+require 'uri'
+require 'json'
+require 'sqlite3'
+
+require 'lib/mediawikiapi.rb'
+
+#------------------------------------------------------------------------------
+
+dir = ARGV[0] || '.'
+
+api = MediaWikiAPI::API.new('wiki.openstreetmap.org')
+api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')
+
+db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
+db.results_as_hash = true
+image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages").map{ |row| row['title'] }.select{ |title| !title.nil? && title.match(%r{^(file|image):}i) }
+
+db.execute('BEGIN TRANSACTION');
+
+until image_titles.empty?
+ some_titles = image_titles.slice!(0, 10)
+# puts some_titles.join(",") + "\n"
+
+ begin
+ data = api.query(:prop => 'imageinfo', :iiprop => 'url|size|mime', :titles => some_titles.join('|'), :iiurlwidth => 200, :iiurlheight => 200)
+
+ if !data['query']
+ STDERR.puts "Wiki API call failed (no 'query' field):"
+ pp data
+ next
+ end
+
+ normalized = data['query']['normalized']
+ if normalized
+ normalized.each do |n|
+ db.execute('UPDATE wikipages SET image=? WHERE image=?', n['to'], n['from'])
+ end
+ end
+
+ if !data['query']['pages']
+ STDERR.puts "Wiki API call failed (no 'pages' field):"
+ pp data
+ next
+ end
+
+ data['query']['pages'].each do |k,v|
+ if v['imageinfo']
+ info = v['imageinfo'][0]
+ if info['thumburl'].match(%r{^(.*/)[0-9]{1,4}(px-.*)$})
+ prefix = $1
+ suffix = $2
+ else
+ prefix = nil
+ suffix = nil
+ end
+ db.execute("INSERT INTO wiki_images (image, width, height, size, mime, image_url, thumb_url_prefix, thumb_url_suffix) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
+ v['title'],
+ info['width'],
+ info['height'],
+ info['size'],
+ info['mime'],
+ info['url'],
+ prefix,
+ suffix
+ )
+ end
+ end
+ rescue
+ puts "Wiki API call error:"
+ pp data
+ end
+end
+
+db.execute('COMMIT');
+
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/get_page_list.rb b/sources/wiki/get_page_list.rb
index 88afe10..7cdbafa 100755
--- a/sources/wiki/get_page_list.rb
+++ b/sources/wiki/get_page_list.rb
@@ -70,13 +70,13 @@ end
def get_page_list(api, namespaceid, options)
apfrom = ''
loop do
- data = api.query(:list => 'allpages', :aplimit => 'max', :apfrom => apfrom, :apnamespace => namespaceid, :apfilterredir => options[:redirect] ? 'redirects' : 'nonredirects')
+ data = api.query(:generator => 'allpages', :gaplimit => 'max', :gapfrom => apfrom, :gapnamespace => namespaceid, :gapfilterredir => options[:redirect] ? 'redirects' : 'nonredirects', :prop => 'info')
# pp data
- data['query']['allpages'].each do |h|
- yield h['title'].gsub(/\s/, '_')
+ data['query']['pages'].each do |k,v|
+ yield v['touched'], v['title'].gsub(/\s/, '_')
end
if data['query-continue']
- apfrom = data['query-continue']['allpages']['apfrom'].gsub(/\s/, '_')
+ apfrom = data['query-continue']['allpages']['gapfrom'].gsub(/\s/, '_')
# puts "apfrom=#{apfrom}"
else
return
@@ -102,16 +102,16 @@ tagpages = File.open(dir + '/tagpages.list', 'w')
namespaces.keys.sort.each do |namespace|
id = namespaces[namespace]
- get_page_list(api, id, :redirect => false) do |page|
- line = ['page', namespace, page].join("\t")
+ get_page_list(api, id, :redirect => false) do |timestamp, page|
+ line = ['page', timestamp, namespace, page].join("\t")
allpages.puts line
if page =~ /^([^:]+:)?(Key|Tag):(.+)$/
tagpages.puts line
end
end
- get_page_list(api, id, :redirect => true) do |page|
- line = ['redirect', namespace, page].join("\t")
+ get_page_list(api, id, :redirect => true) do |timestamp, page|
+ line = ['redirect', timestamp, namespace, page].join("\t")
allpages.puts line
if page =~ /^([^:]+:)?(Key|Tag):(.+)$/
tagpages.puts line
diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb
index 302db28..34d1182 100755
--- a/sources/wiki/get_wiki_data.rb
+++ b/sources/wiki/get_wiki_data.rb
@@ -54,10 +54,11 @@ class WikiPage
attr_accessor :content
attr_accessor :description, :image, :group, :onNode, :onWay, :onArea, :onRelation, :has_templ
- attr_reader :type, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed
+ attr_reader :type, :timestamp, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed
- def initialize(type, namespace, title)
+ def initialize(type, timestamp, namespace, title)
@type = type # 'page' or 'redirect'
+ @timestamp = timestamp # page last touched
@namespace = namespace # 'XX' (mediawiki namespace or '')
@title = title # wiki page title
@@ -129,7 +130,7 @@ class WikiPage
content,
group,
type,
- has_templ,
+ has_templ ? 1 : 0,
parsed ? 1 : 0,
description,
image,
@@ -239,8 +240,8 @@ File.open(dir + '/tagpages.list') do |wikipages|
wikipages.each do |line|
line.chomp!
t = line.split("\t")
- page = WikiPage.new(t[0], t[1], t[2])
- puts "page: (#{page.title}) (#{page.type}) (#{page.namespace}) (#{page.tag})"
+ page = WikiPage.new(t[0], t[1], t[2], t[3])
+ puts "page: (#{page.title}) (#{page.type}) (#{page.timestamp}) (#{page.namespace}) (#{page.tag})"
reason = page.check_title
if reason == :ok
@@ -271,7 +272,14 @@ File.open(dir + '/tagpages.list') do |wikipages|
end
end
if template.named_parameters['image']
- page.image = template.named_parameters['image'][0]
+ ititle = template.named_parameters['image'][0]
+ if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i)
+ page.image = "File:#{$2}"
+ else
+ puts "invalid image: #{reason} #{page.title} #{ititle}"
+ db.execute('INSERT INTO invalid_image_titles (reason, page_title, image_title) VALUES (?, ?, ?)', reason, page.title, ititle)
+ page.image = ''
+ end
end
if template.named_parameters['group']
page.group = template.named_parameters['group'][0]
diff --git a/sources/wiki/lib/mediawikiapi.rb b/sources/wiki/lib/mediawikiapi.rb
index a231cee..293aa6f 100644
--- a/sources/wiki/lib/mediawikiapi.rb
+++ b/sources/wiki/lib/mediawikiapi.rb
@@ -26,6 +26,8 @@
#
#------------------------------------------------------------------------------
+require 'cgi'
+
module MediaWikiAPI
class API
@@ -42,12 +44,13 @@ module MediaWikiAPI
end
def build_path(params)
- @path + params.to_a.map{ |el| el.join('=') }.join('&')
+ @path + params.to_a.map{ |el| CGI::escape(el[0].to_s) + '=' + CGI::escape(el[1].to_s) }.join('&')
end
def get(params)
path = build_path(params)
http = Net::HTTP.start(@host, @port)
+# puts "Getting path [#{path}]"
http.get(path, @headers)
end
diff --git a/sources/wiki/post.sql b/sources/wiki/post.sql
index 773a04d..99eb26f 100644
--- a/sources/wiki/post.sql
+++ b/sources/wiki/post.sql
@@ -13,6 +13,8 @@ UPDATE wikipages SET status='e' WHERE type='page' AND has_templ='true' AND parse
CREATE INDEX wikipages_key_value_idx ON wikipages(key, value);
+CREATE INDEX wiki_images_image ON wiki_images(image);
+
INSERT INTO wikipages_keys (key, langs, lang_count) SELECT key, group_concat(lang || ' ' || status), count(*) FROM wikipages WHERE value IS NULL GROUP BY key;
INSERT INTO wikipages_tags (key, value, langs, lang_count) SELECT key, value, group_concat(lang || ' ' || status), count(*) FROM wikipages WHERE value IS NOT NULL GROUP BY key, value;
diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql
index ebb80d3..8a515e7 100644
--- a/sources/wiki/pre.sql
+++ b/sources/wiki/pre.sql
@@ -33,6 +33,19 @@ CREATE TABLE wikipages (
status TEXT
);
+DROP TABLE IF EXISTS wiki_images;
+
+CREATE TABLE wiki_images (
+ image TEXT,
+ width INTEGER,
+ height INTEGER,
+ size INTEGER,
+ mime TEXT,
+ image_url TEXT,
+ thumb_url_prefix TEXT,
+ thumb_url_suffix TEXT
+);
+
DROP TABLE IF EXISTS wikipages_keys;
CREATE TABLE wikipages_keys (
@@ -64,6 +77,14 @@ CREATE TABLE invalid_page_titles (
title TEXT
);
+DROP TABLE IF EXISTS invalid_image_titles;
+
+CREATE TABLE invalid_image_titles (
+ reason TEXT,
+ page_title TEXT,
+ image_title TEXT
+);
+
DROP TABLE IF EXISTS words;
CREATE TABLE words (