From 64a047f622a5ed15dea94b5e52dd8c948fce9e95 Mon Sep 17 00:00:00 2001
From: Jochen Topf
Date: Wed, 9 Jan 2013 19:18:55 +0100
Subject: Better support for wiki images.

Key and tag wiki pages can contain images. Until now we only got the titles
of those images. Now we also get the URL to the image, URL to thumbnails,
width, height, and mime type. This information is now exposed in the API and
it is used to show the images in the Overview tab of the key and tag pages.

While we are changing the update process anyway, I changed the program that
gets the list of all pages to also output the time those pages changed last.
This information is currently not used, but it could be used to cache those
pages locally making the update much faster and adding less strain to the
wiki server.
---
 sources/wiki/get_image_info.rb   | 127 +++++++++++++++++++++++++++++++++++++++
 sources/wiki/get_page_list.rb    |  16 +++---
 sources/wiki/get_wiki_data.rb    |  20 +++++--
 sources/wiki/lib/mediawikiapi.rb |   5 +-
 sources/wiki/post.sql            |   2 +
 sources/wiki/pre.sql             |  21 +++++++
 6 files changed, 176 insertions(+), 15 deletions(-)
 create mode 100755 sources/wiki/get_image_info.rb

(limited to 'sources/wiki')

diff --git a/sources/wiki/get_image_info.rb b/sources/wiki/get_image_info.rb
new file mode 100755
index 0000000..06707ca
--- /dev/null
+++ b/sources/wiki/get_image_info.rb
@@ -0,0 +1,127 @@
+#!/usr/bin/ruby
+#------------------------------------------------------------------------------
+#
+# get_image_info.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+# Gets meta information about images from the OSM wiki.
+#
+# Reads the list of all images used in Key: and Tag: pages from the local
+# database and requests meta information (width, height, mime type, URL, ...)
+# for those images. Writes this data into the wiki_images table.
+#
+# The database must be in DIR or in the current directory, if no directory
+# was given on the command line.
+#
+#------------------------------------------------------------------------------
+#
+# Copyright (C) 2013 Jochen Topf
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'rubygems'
+
+require 'pp'
+
+require 'net/http'
+require 'uri'
+require 'json'
+require 'sqlite3'
+
+require 'lib/mediawikiapi.rb'
+
+#------------------------------------------------------------------------------
+
+dir = ARGV[0] || '.'
+
+api = MediaWikiAPI::API.new('wiki.openstreetmap.org')
+api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')
+
+db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
+db.results_as_hash = true
+image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages").map{ |row| row['title'] }.select{ |title| !title.nil? && title.match(%r{^(file|image):}i) }
+
+db.execute('BEGIN TRANSACTION');
+
+# Ask the API about at most 10 titles per request.
+until image_titles.empty?
+    some_titles = image_titles.slice!(0, 10)
+#    puts some_titles.join(",") + "\n"
+    data = nil # reset so the rescue below never prints the previous batch's response
+
+    begin
+        data = api.query(:prop => 'imageinfo', :iiprop => 'url|size|mime', :titles => some_titles.join('|'), :iiurlwidth => 200, :iiurlheight => 200)
+
+        if !data['query']
+            STDERR.puts "Wiki API call failed (no 'query' field):"
+            pp data
+            next
+        end
+
+        # The API reports the titles it normalized. Rewrite wikipages.image to
+        # the normalized form (presumably so it matches wiki_images.image).
+        normalized = data['query']['normalized']
+        if normalized
+            normalized.each do |n|
+                db.execute('UPDATE wikipages SET image=? WHERE image=?', n['to'], n['from'])
+            end
+        end
+
+        if !data['query']['pages']
+            STDERR.puts "Wiki API call failed (no 'pages' field):"
+            pp data
+            next
+        end
+
+        data['query']['pages'].each do |k,v|
+            if v['imageinfo']
+                info = v['imageinfo'][0]
+                # thumburl may be missing; guard it so one such image does not
+                # raise and abort the rest of this batch.
+                thumburl = info['thumburl']
+                # Split the thumbnail URL around its pixel width (".../200px-...").
+                if thumburl && thumburl.match(%r{^(.*/)[0-9]{1,4}(px-.*)$})
+                    prefix = $1
+                    suffix = $2
+                else
+                    prefix = nil
+                    suffix = nil
+                end
+                db.execute("INSERT INTO wiki_images (image, width, height, size, mime, image_url, thumb_url_prefix, thumb_url_suffix) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
+                    v['title'],
+                    info['width'],
+                    info['height'],
+                    info['size'],
+                    info['mime'],
+                    info['url'],
+                    prefix,
+                    suffix
+                )
+            end
+        end
+    rescue => ex
+        # Report on STDERR like the other failure paths, and say what failed.
+        STDERR.puts "Wiki API call error: #{ex.class}: #{ex.message}"
+        pp data
+    end
+end
+
+db.execute('COMMIT');
+
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/get_page_list.rb b/sources/wiki/get_page_list.rb
index 88afe10..7cdbafa 100755
--- a/sources/wiki/get_page_list.rb
+++ b/sources/wiki/get_page_list.rb
@@ -70,13 +70,13 @@ end
 def get_page_list(api, namespaceid, options)
     apfrom = ''
     loop do
-        data = api.query(:list => 'allpages', :aplimit => 'max', :apfrom => apfrom, :apnamespace => namespaceid, :apfilterredir => options[:redirect] ? 'redirects' : 'nonredirects')
+        data = api.query(:generator => 'allpages', :gaplimit => 'max', :gapfrom => apfrom, :gapnamespace => namespaceid, :gapfilterredir => options[:redirect] ? 'redirects' : 'nonredirects', :prop => 'info')
 #        pp data
-        data['query']['allpages'].each do |h|
-            yield h['title'].gsub(/\s/, '_')
+        data['query']['pages'].each do |k,v|
+            yield v['touched'], v['title'].gsub(/\s/, '_')
         end
         if data['query-continue']
-            apfrom = data['query-continue']['allpages']['apfrom'].gsub(/\s/, '_')
+            apfrom = data['query-continue']['allpages']['gapfrom'].gsub(/\s/, '_')
 #            puts "apfrom=#{apfrom}"
         else
             return
@@ -102,16 +102,16 @@ tagpages = File.open(dir + '/tagpages.list', 'w')
 
 namespaces.keys.sort.each do |namespace|
     id = namespaces[namespace]
-    get_page_list(api, id, :redirect => false) do |page|
-        line = ['page', namespace, page].join("\t")
+    get_page_list(api, id, :redirect => false) do |timestamp, page|
+        line = ['page', timestamp, namespace, page].join("\t")
         allpages.puts line
         if page =~ /^([^:]+:)?(Key|Tag):(.+)$/
             tagpages.puts line
         end
     end
 
-    get_page_list(api, id, :redirect => true) do |page|
-        line = ['redirect', namespace, page].join("\t")
+    get_page_list(api, id, :redirect => true) do |timestamp, page|
+        line = ['redirect', timestamp, namespace, page].join("\t")
         allpages.puts line
         if page =~ /^([^:]+:)?(Key|Tag):(.+)$/
             tagpages.puts line
diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb
index 302db28..34d1182 100755
--- a/sources/wiki/get_wiki_data.rb
+++ b/sources/wiki/get_wiki_data.rb
@@ -54,10 +54,11 @@ class WikiPage
 
     attr_accessor :content
     attr_accessor :description, :image, :group, :onNode, :onWay, :onArea, :onRelation, :has_templ
-    attr_reader :type, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed
+    attr_reader :type, :timestamp, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed
 
-    def initialize(type, namespace, title)
+    def initialize(type, timestamp, namespace, title)
         @type = type # 'page' or 'redirect'
+        @timestamp = timestamp # page last touched
         @namespace = namespace # 'XX' (mediawiki namespace or '')
         @title = title # wiki page title
 
@@ -129,7 +130,7 @@ class WikiPage
             content,
             group,
             type,
-            has_templ,
+            has_templ ? 1 : 0,
             parsed ? 1 : 0,
             description,
             image,
@@ -239,8 +240,8 @@ File.open(dir + '/tagpages.list') do |wikipages|
     wikipages.each do |line|
         line.chomp!
         t = line.split("\t")
-        page = WikiPage.new(t[0], t[1], t[2])
-        puts "page: (#{page.title}) (#{page.type}) (#{page.namespace}) (#{page.tag})"
+        page = WikiPage.new(t[0], t[1], t[2], t[3])
+        puts "page: (#{page.title}) (#{page.type}) (#{page.timestamp}) (#{page.namespace}) (#{page.tag})"
 
         reason = page.check_title
         if reason == :ok
@@ -271,7 +272,14 @@ File.open(dir + '/tagpages.list') do |wikipages|
                     end
                 end
                 if template.named_parameters['image']
-                    page.image = template.named_parameters['image'][0]
+                    ititle = template.named_parameters['image'][0]
+                    if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i)
+                        page.image = "File:#{$2}"
+                    else
+                        puts "invalid image: #{reason} #{page.title} #{ititle}"
+                        db.execute('INSERT INTO invalid_image_titles (reason, page_title, image_title) VALUES (?, ?, ?)', reason, page.title, ititle)
+                        page.image = ''
+                    end
                 end
                 if template.named_parameters['group']
                     page.group = template.named_parameters['group'][0]
diff --git a/sources/wiki/lib/mediawikiapi.rb b/sources/wiki/lib/mediawikiapi.rb
index a231cee..293aa6f 100644
--- a/sources/wiki/lib/mediawikiapi.rb
+++ b/sources/wiki/lib/mediawikiapi.rb
@@ -26,6 +26,8 @@
 #
 #------------------------------------------------------------------------------
 
+require 'cgi'
+
 module MediaWikiAPI
 
     class API
@@ -42,12 +44,13 @@ module MediaWikiAPI
         end
 
         def build_path(params)
-            @path + params.to_a.map{ |el| el.join('=') }.join('&')
+            @path + params.to_a.map{ |el| CGI::escape(el[0].to_s) + '=' + CGI::escape(el[1].to_s) }.join('&')
         end
 
         def get(params)
             path = build_path(params)
             http = Net::HTTP.start(@host, @port)
+#            puts "Getting path [#{path}]"
             http.get(path, @headers)
         end
 
diff --git a/sources/wiki/post.sql b/sources/wiki/post.sql
index 773a04d..99eb26f 100644
--- a/sources/wiki/post.sql
+++ b/sources/wiki/post.sql
@@ -13,6 +13,8 @@ UPDATE wikipages SET status='e' WHERE type='page' AND has_templ='true' AND parse
 
 CREATE INDEX wikipages_key_value_idx ON wikipages(key, value);
 
+CREATE INDEX wiki_images_image ON wiki_images(image);
+
 INSERT INTO wikipages_keys (key, langs, lang_count) SELECT key, group_concat(lang || ' ' || status), count(*) FROM wikipages WHERE value IS NULL GROUP BY key;
 INSERT INTO wikipages_tags (key, value, langs, lang_count) SELECT key, value, group_concat(lang || ' ' || status), count(*) FROM wikipages WHERE value IS NOT NULL GROUP BY key, value;
 
diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql
index ebb80d3..8a515e7 100644
--- a/sources/wiki/pre.sql
+++ b/sources/wiki/pre.sql
@@ -33,6 +33,19 @@ CREATE TABLE wikipages (
     status TEXT
 );
 
+DROP TABLE IF EXISTS wiki_images;
+
+CREATE TABLE wiki_images (
+    image TEXT,
+    width INTEGER,
+    height INTEGER,
+    size INTEGER,
+    mime TEXT,
+    image_url TEXT,
+    thumb_url_prefix TEXT,
+    thumb_url_suffix TEXT
+);
+
 DROP TABLE IF EXISTS wikipages_keys;
 
 CREATE TABLE wikipages_keys (
@@ -64,6 +77,14 @@ CREATE TABLE invalid_page_titles (
     title TEXT
 );
 
+DROP TABLE IF EXISTS invalid_image_titles;
+
+CREATE TABLE invalid_image_titles (
+    reason TEXT,
+    page_title TEXT,
+    image_title TEXT
+);
+
 DROP TABLE IF EXISTS words;
 
 CREATE TABLE words (
-- 
cgit v1.2.3