about | summary | refs | log | tree | commit | diff
path: root/sources/wiki
diff options
context:
space:
mode:
author Jochen Topf <jochen@topf.org> 2013-01-24 15:54:19 +0100
committer Jochen Topf <jochen@topf.org> 2013-01-24 15:54:19 +0100
commit 38095b40997a08a0b7e5155da442fa77165ce556 (patch)
tree b46b031fad2b6a7cbff9ef11e44997455cf5f63d /sources/wiki
parent 55550a88ff517d48ca6e56f720c4080fdb569ac9 (diff)
download taginfo-38095b40997a08a0b7e5155da442fa77165ce556.tar
taginfo-38095b40997a08a0b7e5155da442fa77165ce556.tar.gz
Updated ruby import scripts
* Removed rubygems require which isn't needed any more in Ruby 1.9
* Updated transaction syntax to use blocks
* Updated copyright year
Diffstat (limited to 'sources/wiki')
-rwxr-xr-x sources/wiki/extract_words.rb 15
-rwxr-xr-x sources/wiki/get_image_info.rb 108
-rwxr-xr-x sources/wiki/get_page_list.rb 9
-rwxr-xr-x sources/wiki/get_wiki_data.rb 72
4 files changed, 99 insertions, 105 deletions
diff --git a/sources/wiki/extract_words.rb b/sources/wiki/extract_words.rb
index 8b018d8..70c483d 100755
--- a/sources/wiki/extract_words.rb
+++ b/sources/wiki/extract_words.rb
@@ -9,7 +9,7 @@
#
#------------------------------------------------------------------------------
#
-# Copyright (C) 2012 Jochen Topf <jochen@remote.org>
+# Copyright (C) 2013 Jochen Topf <jochen@remote.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -27,8 +27,6 @@
#
#------------------------------------------------------------------------------
-require 'rubygems'
-
require 'sqlite3'
#------------------------------------------------------------------------------
@@ -126,6 +124,8 @@ dir = ARGV[0] || '.'
db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
db.results_as_hash = true
+#------------------------------------------------------------------------------
+
words = Words.new
we = WordExtractor.new(words)
@@ -141,10 +141,11 @@ words.invert
# puts "#{key}=#{value}: #{words}"
#end
-db.execute('BEGIN TRANSACTION');
-words.dump do |key, value, words|
- db.execute('INSERT INTO words (key, value, words) VALUES (?, ?, ?)', key, value, words)
+db.transaction do |db|
+ words.dump do |key, value, words|
+ db.execute('INSERT INTO words (key, value, words) VALUES (?, ?, ?)', key, value, words)
+ end
end
-db.execute('COMMIT');
+
#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/get_image_info.rb b/sources/wiki/get_image_info.rb
index 8430c49..6254208 100755
--- a/sources/wiki/get_image_info.rb
+++ b/sources/wiki/get_image_info.rb
@@ -34,8 +34,6 @@
#
#------------------------------------------------------------------------------
-require 'rubygems'
-
require 'pp'
require 'net/http'
@@ -48,79 +46,79 @@ require './lib/mediawikiapi.rb'
#------------------------------------------------------------------------------
dir = ARGV[0] || '.'
+db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
+db.results_as_hash = true
+
+#------------------------------------------------------------------------------
api = MediaWikiAPI::API.new('wiki.openstreetmap.org')
-db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
-db.results_as_hash = true
image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages WHERE image IS NOT NULL AND image != '' UNION SELECT DISTINCT(image) AS title FROM relation_pages WHERE image IS NOT NULL AND image != ''").
map{ |row| row['title'] }.
select{ |title| title.match(%r{^(file|image):}i) }
-db.execute('BEGIN TRANSACTION');
+db.transaction do |db|
+ puts "Found #{ image_titles.size } different image titles"
-puts "Found #{ image_titles.size } different image titles"
+ images_added = {}
-images_added = {}
+ until image_titles.empty?
+ some_titles = image_titles.slice!(0, 10)
+ puts "Get image info for: #{ some_titles.join(' ') }"
-until image_titles.empty?
- some_titles = image_titles.slice!(0, 10)
- puts "Get image info for: #{ some_titles.join(' ') }"
+ begin
+ data = api.query(:prop => 'imageinfo', :iiprop => 'url|size|mime', :titles => some_titles.join('|'), :iiurlwidth => 10, :iiurlheight => 10)
- begin
- data = api.query(:prop => 'imageinfo', :iiprop => 'url|size|mime', :titles => some_titles.join('|'), :iiurlwidth => 10, :iiurlheight => 10)
-
- if !data['query']
- puts "Wiki API call failed (no 'query' field):"
- pp data
- next
- end
+ if !data['query']
+ puts "Wiki API call failed (no 'query' field):"
+ pp data
+ next
+ end
- normalized = data['query']['normalized']
- if normalized
- normalized.each do |n|
- db.execute('UPDATE wikipages SET image=? WHERE image=?', n['to'], n['from'])
- db.execute('UPDATE relation_pages SET image=? WHERE image=?', n['to'], n['from'])
+ normalized = data['query']['normalized']
+ if normalized
+ normalized.each do |n|
+ db.execute('UPDATE wikipages SET image=? WHERE image=?', n['to'], n['from'])
+ db.execute('UPDATE relation_pages SET image=? WHERE image=?', n['to'], n['from'])
+ end
end
- end
- if !data['query']['pages']
- puts "Wiki API call failed (no 'pages' field):"
- pp data
- next
- end
+ if !data['query']['pages']
+ puts "Wiki API call failed (no 'pages' field):"
+ pp data
+ next
+ end
- data['query']['pages'].each do |k,v|
- if v['imageinfo'] && ! images_added[v['title']]
- info = v['imageinfo'][0]
- if info['thumburl'].match(%r{^(.*/)[0-9]{1,4}(px-.*)$})
- prefix = $1
- suffix = $2
- else
- prefix = nil
- suffix = nil
- puts "Wrong thumbnail format: '#{info['thumburl']}'"
+ data['query']['pages'].each do |k,v|
+ if v['imageinfo'] && ! images_added[v['title']]
+ info = v['imageinfo'][0]
+ if info['thumburl'].match(%r{^(.*/)[0-9]{1,4}(px-.*)$})
+ prefix = $1
+ suffix = $2
+ else
+ prefix = nil
+ suffix = nil
+ puts "Wrong thumbnail format: '#{info['thumburl']}'"
+ end
+ images_added[v['title']] = 1
+ db.execute("INSERT INTO wiki_images (image, width, height, size, mime, image_url, thumb_url_prefix, thumb_url_suffix) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
+ v['title'],
+ info['width'],
+ info['height'],
+ info['size'],
+ info['mime'],
+ info['url'],
+ prefix,
+ suffix
+ )
end
- images_added[v['title']] = 1
- db.execute("INSERT INTO wiki_images (image, width, height, size, mime, image_url, thumb_url_prefix, thumb_url_suffix) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
- v['title'],
- info['width'],
- info['height'],
- info['size'],
- info['mime'],
- info['url'],
- prefix,
- suffix
- )
end
+ rescue
+ puts "Wiki API call error:"
+ pp data
end
- rescue
- puts "Wiki API call error:"
- pp data
end
end
-db.execute('COMMIT');
-
#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/get_page_list.rb b/sources/wiki/get_page_list.rb
index 3c4b219..05812c3 100755
--- a/sources/wiki/get_page_list.rb
+++ b/sources/wiki/get_page_list.rb
@@ -26,7 +26,7 @@
#
#------------------------------------------------------------------------------
#
-# Copyright (C) 2012 Jochen Topf <jochen@remote.org>
+# Copyright (C) 2013 Jochen Topf <jochen@remote.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -44,15 +44,10 @@
#
#------------------------------------------------------------------------------
-require 'rubygems'
-
-require 'pp'
-
require 'net/http'
require 'uri'
require 'json'
-
require './lib/mediawikiapi.rb'
#------------------------------------------------------------------------------
@@ -72,7 +67,6 @@ def get_page_list(api, namespaceid, options)
apfrom = ''
loop do
data = api.query(:generator => 'allpages', :gaplimit => 'max', :gapfrom => apfrom, :gapnamespace => namespaceid, :gapfilterredir => options[:redirect] ? 'redirects' : 'nonredirects', :prop => 'info')
-# pp data
data['query']['pages'].each do |k,v|
yield v['touched'], v['title'].gsub(/\s/, '_')
end
@@ -122,4 +116,5 @@ end
tagpages.close
allpages.close
+
#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb
index d593d9d..f90f4f5 100755
--- a/sources/wiki/get_wiki_data.rb
+++ b/sources/wiki/get_wiki_data.rb
@@ -35,10 +35,6 @@
#
#------------------------------------------------------------------------------
-require 'rubygems'
-
-require 'pp'
-
require 'json'
require 'net/http'
require 'uri'
@@ -210,6 +206,8 @@ class WikiPage
end
end
+#------------------------------------------------------------------------------
+
class KeyOrTagPage < WikiPage
def initialize(type, timestamp, namespace, title)
@@ -262,6 +260,8 @@ class KeyOrTagPage < WikiPage
end
+#------------------------------------------------------------------------------
+
class KeyPage < KeyOrTagPage
end
@@ -384,49 +384,49 @@ end
#------------------------------------------------------------------------------
dir = ARGV[0] || '.'
-
-api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?')
-
db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
db.results_as_hash = true
-cache = Cache.new(dir, db, api)
+#------------------------------------------------------------------------------
-db.execute('BEGIN TRANSACTION')
+api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?')
-File.open(dir + '/tagpages.list') do |wikipages|
- wikipages.each do |line|
- line.chomp!
- (type, timestamp, namespace, title) = line.split("\t")
+cache = Cache.new(dir, db, api)
- if title =~ /(^|:)Key:/
- page = KeyPage.new(type, timestamp, namespace, title)
- elsif title =~ /(^|:)Tag:/
- page = TagPage.new(type, timestamp, namespace, title)
- elsif title =~ /(^|:)Relation:/
- page = RelationPage.new(type, timestamp, namespace, title)
- else
- puts "Wiki page has wrong format: '#{title}'"
- next
- end
+db.transaction do |db|
+
+ File.open(dir + '/tagpages.list') do |wikipages|
+ wikipages.each do |line|
+ line.chomp!
+ (type, timestamp, namespace, title) = line.split("\t")
+
+ if title =~ /(^|:)Key:/
+ page = KeyPage.new(type, timestamp, namespace, title)
+ elsif title =~ /(^|:)Tag:/
+ page = TagPage.new(type, timestamp, namespace, title)
+ elsif title =~ /(^|:)Relation:/
+ page = RelationPage.new(type, timestamp, namespace, title)
+ else
+ puts "Wiki page has wrong format: '#{title}'"
+ next
+ end
- puts "Parsing page: title='#{page.title}' type='#{page.type}' timestamp='#{page.timestamp}' namespace='#{page.namespace}'"
+ puts "Parsing page: title='#{page.title}' type='#{page.type}' timestamp='#{page.timestamp}' namespace='#{page.namespace}'"
- reason = page.check_title
- if reason == :ok
- cache.get_page(page)
- page.parse_content(db)
- page.insert(db)
- else
- puts "invalid page: #{reason} #{page.title}"
- db.execute('INSERT INTO invalid_page_titles (reason, title) VALUES (?, ?)', reason.to_s, page.title)
+ reason = page.check_title
+ if reason == :ok
+ cache.get_page(page)
+ page.parse_content(db)
+ page.insert(db)
+ else
+ puts "invalid page: #{reason} #{page.title}"
+ db.execute('INSERT INTO invalid_page_titles (reason, title) VALUES (?, ?)', reason.to_s, page.title)
+ end
end
end
-end
-cache.cleanup
-
-db.execute('COMMIT')
+ cache.cleanup
+end
#-- THE END -------------------------------------------------------------------