diff options
Diffstat (limited to 'sources/wiki')
-rwxr-xr-x | sources/wiki/get_image_info.rb | 118 | ||||
-rwxr-xr-x | sources/wiki/get_page_list.rb | 16 | ||||
-rwxr-xr-x | sources/wiki/get_wiki_data.rb | 20 | ||||
-rw-r--r-- | sources/wiki/lib/mediawikiapi.rb | 5 | ||||
-rw-r--r-- | sources/wiki/post.sql | 2 | ||||
-rw-r--r-- | sources/wiki/pre.sql | 21 |
6 files changed, 167 insertions, 15 deletions
diff --git a/sources/wiki/get_image_info.rb b/sources/wiki/get_image_info.rb new file mode 100755 index 0000000..06707ca --- /dev/null +++ b/sources/wiki/get_image_info.rb @@ -0,0 +1,118 @@ +#!/usr/bin/ruby +#------------------------------------------------------------------------------ +# +# get_image_info.rb [DIR] +# +#------------------------------------------------------------------------------ +# +# Gets meta information about images from the OSM wiki. +# +# Reads the list of all images used in Key: and Tag: pages from the local +# database and requests meta information (width, height, mime type, URL, ...) +# for those images. Writes this data into the wiki_images table. +# +# The database must be in DIR or in the current directory, if no directory +# was given on the command line. +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2013 Jochen Topf <jochen@remote.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +#------------------------------------------------------------------------------ + +require 'rubygems' + +require 'pp' + +require 'net/http' +require 'uri' +require 'json' +require 'sqlite3' + +require 'lib/mediawikiapi.rb' + +#------------------------------------------------------------------------------ + +dir = ARGV[0] || '.' + +api = MediaWikiAPI::API.new('wiki.openstreetmap.org') +api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)') + +db = SQLite3::Database.new(dir + '/taginfo-wiki.db') +db.results_as_hash = true +image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages").map{ |row| row['title'] }.select{ |title| !title.nil? && title.match(%r{^(file|image):}i) } + +db.execute('BEGIN TRANSACTION'); + +until image_titles.empty? + some_titles = image_titles.slice!(0, 10) +# puts some_titles.join(",") + "\n" + + begin + data = api.query(:prop => 'imageinfo', :iiprop => 'url|size|mime', :titles => some_titles.join('|'), :iiurlwidth => 200, :iiurlheight => 200) + + if !data['query'] + STDERR.puts "Wiki API call failed (no 'query' field):" + pp data + next + end + + normalized = data['query']['normalized'] + if normalized + normalized.each do |n| + db.execute('UPDATE wikipages SET image=? WHERE image=?', n['to'], n['from']) + end + end + + if !data['query']['pages'] + STDERR.puts "Wiki API call failed (no 'pages' field):" + pp data + next + end + + data['query']['pages'].each do |k,v| + if v['imageinfo'] + info = v['imageinfo'][0] + if info['thumburl'].match(%r{^(.*/)[0-9]{1,4}(px-.*)$}) + prefix = $1 + suffix = $2 + else + prefix = nil + suffix = nil + end + db.execute("INSERT INTO wiki_images (image, width, height, size, mime, image_url, thumb_url_prefix, thumb_url_suffix) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", + v['title'], + info['width'], + info['height'], + info['size'], + info['mime'], + info['url'], + prefix, + suffix + ) + end + end + rescue + puts "Wiki API call error:" + pp data + end +end + +db.execute('COMMIT'); + + +#-- THE END ------------------------------------------------------------------- diff --git a/sources/wiki/get_page_list.rb b/sources/wiki/get_page_list.rb index 88afe10..7cdbafa 100755 --- a/sources/wiki/get_page_list.rb +++ b/sources/wiki/get_page_list.rb @@ -70,13 +70,13 @@ end def get_page_list(api, namespaceid, options) apfrom = '' loop do - data = api.query(:list => 'allpages', :aplimit => 'max', :apfrom => apfrom, :apnamespace => namespaceid, :apfilterredir => options[:redirect] ? 'redirects' : 'nonredirects') + data = api.query(:generator => 'allpages', :gaplimit => 'max', :gapfrom => apfrom, :gapnamespace => namespaceid, :gapfilterredir => options[:redirect] ? 'redirects' : 'nonredirects', :prop => 'info') # pp data - data['query']['allpages'].each do |h| - yield h['title'].gsub(/\s/, '_') + data['query']['pages'].each do |k,v| + yield v['touched'], v['title'].gsub(/\s/, '_') end if data['query-continue'] - apfrom = data['query-continue']['allpages']['apfrom'].gsub(/\s/, '_') + apfrom = data['query-continue']['allpages']['gapfrom'].gsub(/\s/, '_') # puts "apfrom=#{apfrom}" else return @@ -102,16 +102,16 @@ tagpages = File.open(dir + '/tagpages.list', 'w') namespaces.keys.sort.each do |namespace| id = namespaces[namespace] - get_page_list(api, id, :redirect => false) do |page| - line = ['page', namespace, page].join("\t") + get_page_list(api, id, :redirect => false) do |timestamp, page| + line = ['page', timestamp, namespace, page].join("\t") allpages.puts line if page =~ /^([^:]+:)?(Key|Tag):(.+)$/ tagpages.puts line end end - get_page_list(api, id, :redirect => true) do |page| - line = ['redirect', namespace, page].join("\t") + get_page_list(api, id, :redirect => true) do |timestamp, page| + line = ['redirect', timestamp, namespace, page].join("\t") allpages.puts line if page =~ /^([^:]+:)?(Key|Tag):(.+)$/ tagpages.puts line diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb index 302db28..34d1182 100755 --- a/sources/wiki/get_wiki_data.rb +++ b/sources/wiki/get_wiki_data.rb @@ -54,10 +54,11 @@ class WikiPage attr_accessor :content attr_accessor :description, :image, :group, :onNode, :onWay, :onArea, :onRelation, :has_templ - attr_reader :type, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed + attr_reader :type, :timestamp, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed - def initialize(type, namespace, title) + def initialize(type, timestamp, namespace, title) @type = type # 'page' or 'redirect' + @timestamp = timestamp # page last touched @namespace = namespace # 'XX' (mediawiki namespace or '') @title = title # wiki page title @@ -129,7 +130,7 @@ class WikiPage content, group, type, - has_templ, + has_templ ? 1 : 0, parsed ? 1 : 0, description, image, @@ -239,8 +240,8 @@ File.open(dir + '/tagpages.list') do |wikipages| wikipages.each do |line| line.chomp! t = line.split("\t") - page = WikiPage.new(t[0], t[1], t[2]) - puts "page: (#{page.title}) (#{page.type}) (#{page.namespace}) (#{page.tag})" + page = WikiPage.new(t[0], t[1], t[2], t[3]) + puts "page: (#{page.title}) (#{page.type}) (#{page.timestamp}) (#{page.namespace}) (#{page.tag})" reason = page.check_title if reason == :ok @@ -271,7 +272,14 @@ File.open(dir + '/tagpages.list') do |wikipages| end end if template.named_parameters['image'] - page.image = template.named_parameters['image'][0] + ititle = template.named_parameters['image'][0] + if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i) + page.image = "File:#{$2}" + else + puts "invalid image: #{reason} #{page.title} #{ititle}" + db.execute('INSERT INTO invalid_image_titles (reason, page_title, image_title) VALUES (?, ?, ?)', reason, page.title, ititle) + page.image = '' + end end if template.named_parameters['group'] page.group = template.named_parameters['group'][0] diff --git a/sources/wiki/lib/mediawikiapi.rb b/sources/wiki/lib/mediawikiapi.rb index a231cee..293aa6f 100644 --- a/sources/wiki/lib/mediawikiapi.rb +++ b/sources/wiki/lib/mediawikiapi.rb @@ -26,6 +26,8 @@ # #------------------------------------------------------------------------------ +require 'cgi' + module MediaWikiAPI class API @@ -42,12 +44,13 @@ module MediaWikiAPI end def build_path(params) - @path + params.to_a.map{ |el| el.join('=') }.join('&') + @path + params.to_a.map{ |el| CGI::escape(el[0].to_s) + '=' + CGI::escape(el[1].to_s) }.join('&') end def get(params) path = build_path(params) http = Net::HTTP.start(@host, @port) +# puts "Getting path [#{path}]" http.get(path, @headers) end diff --git a/sources/wiki/post.sql b/sources/wiki/post.sql index 773a04d..99eb26f 100644 --- a/sources/wiki/post.sql +++ b/sources/wiki/post.sql @@ -13,6 +13,8 @@ UPDATE wikipages SET status='e' WHERE type='page' AND has_templ='true' AND parse CREATE INDEX wikipages_key_value_idx ON wikipages(key, value); +CREATE INDEX wiki_images_image ON wiki_images(image); + INSERT INTO wikipages_keys (key, langs, lang_count) SELECT key, group_concat(lang || ' ' || status), count(*) FROM wikipages WHERE value IS NULL GROUP BY key; INSERT INTO wikipages_tags (key, value, langs, lang_count) SELECT key, value, group_concat(lang || ' ' || status), count(*) FROM wikipages WHERE value IS NOT NULL GROUP BY key, value; diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql index ebb80d3..8a515e7 100644 --- a/sources/wiki/pre.sql +++ b/sources/wiki/pre.sql @@ -33,6 +33,19 @@ CREATE TABLE wikipages ( status TEXT ); +DROP TABLE IF EXISTS wiki_images; + +CREATE TABLE wiki_images ( + image TEXT, + width INTEGER, + height INTEGER, + size INTEGER, + mime TEXT, + image_url TEXT, + thumb_url_prefix TEXT, + thumb_url_suffix TEXT +); + DROP TABLE IF EXISTS wikipages_keys; CREATE TABLE wikipages_keys ( @@ -64,6 +77,14 @@ CREATE TABLE invalid_page_titles ( title TEXT ); +DROP TABLE IF EXISTS invalid_image_titles; + +CREATE TABLE invalid_image_titles ( + reason TEXT, + page_title TEXT, + image_title TEXT +); + DROP TABLE IF EXISTS words; CREATE TABLE words ( |