diff options
author | Jochen Topf <jochen@topf.org> | 2013-01-24 15:54:19 +0100 |
---|---|---|
committer | Jochen Topf <jochen@topf.org> | 2013-01-24 15:54:19 +0100 |
commit | 38095b40997a08a0b7e5155da442fa77165ce556 (patch) | |
tree | b46b031fad2b6a7cbff9ef11e44997455cf5f63d /sources/wiki | |
parent | 55550a88ff517d48ca6e56f720c4080fdb569ac9 (diff) | |
download | taginfo-38095b40997a08a0b7e5155da442fa77165ce556.tar taginfo-38095b40997a08a0b7e5155da442fa77165ce556.tar.gz |
Updated ruby import scripts
* Removed rubygems require which isn't needed any more in Ruby 1.9
* Updated transaction syntax to use blocks
* Updated copyright year
Diffstat (limited to 'sources/wiki')
-rwxr-xr-x | sources/wiki/extract_words.rb | 15 | ||||
-rwxr-xr-x | sources/wiki/get_image_info.rb | 108 | ||||
-rwxr-xr-x | sources/wiki/get_page_list.rb | 9 | ||||
-rwxr-xr-x | sources/wiki/get_wiki_data.rb | 72 |
4 files changed, 99 insertions, 105 deletions
diff --git a/sources/wiki/extract_words.rb b/sources/wiki/extract_words.rb index 8b018d8..70c483d 100755 --- a/sources/wiki/extract_words.rb +++ b/sources/wiki/extract_words.rb @@ -9,7 +9,7 @@ # #------------------------------------------------------------------------------ # -# Copyright (C) 2012 Jochen Topf <jochen@remote.org> +# Copyright (C) 2013 Jochen Topf <jochen@remote.org> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -27,8 +27,6 @@ # #------------------------------------------------------------------------------ -require 'rubygems' - require 'sqlite3' #------------------------------------------------------------------------------ @@ -126,6 +124,8 @@ dir = ARGV[0] || '.' db = SQLite3::Database.new(dir + '/taginfo-wiki.db') db.results_as_hash = true +#------------------------------------------------------------------------------ + words = Words.new we = WordExtractor.new(words) @@ -141,10 +141,11 @@ words.invert # puts "#{key}=#{value}: #{words}" #end -db.execute('BEGIN TRANSACTION'); -words.dump do |key, value, words| - db.execute('INSERT INTO words (key, value, words) VALUES (?, ?, ?)', key, value, words) +db.transaction do |db| + words.dump do |key, value, words| + db.execute('INSERT INTO words (key, value, words) VALUES (?, ?, ?)', key, value, words) + end end -db.execute('COMMIT'); + #-- THE END ------------------------------------------------------------------- diff --git a/sources/wiki/get_image_info.rb b/sources/wiki/get_image_info.rb index 8430c49..6254208 100755 --- a/sources/wiki/get_image_info.rb +++ b/sources/wiki/get_image_info.rb @@ -34,8 +34,6 @@ # #------------------------------------------------------------------------------ -require 'rubygems' - require 'pp' require 'net/http' @@ -48,79 +46,79 @@ require './lib/mediawikiapi.rb' #------------------------------------------------------------------------------ dir = ARGV[0] || '.' +db = SQLite3::Database.new(dir + '/taginfo-wiki.db') +db.results_as_hash = true + +#------------------------------------------------------------------------------ api = MediaWikiAPI::API.new('wiki.openstreetmap.org') -db = SQLite3::Database.new(dir + '/taginfo-wiki.db') -db.results_as_hash = true image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages WHERE image IS NOT NULL AND image != '' UNION SELECT DISTINCT(image) AS title FROM relation_pages WHERE image IS NOT NULL AND image != ''"). map{ |row| row['title'] }. select{ |title| title.match(%r{^(file|image):}i) } -db.execute('BEGIN TRANSACTION'); +db.transaction do |db| + puts "Found #{ image_titles.size } different image titles" -puts "Found #{ image_titles.size } different image titles" + images_added = {} -images_added = {} + until image_titles.empty? + some_titles = image_titles.slice!(0, 10) + puts "Get image info for: #{ some_titles.join(' ') }" -until image_titles.empty? - some_titles = image_titles.slice!(0, 10) - puts "Get image info for: #{ some_titles.join(' ') }" + begin + data = api.query(:prop => 'imageinfo', :iiprop => 'url|size|mime', :titles => some_titles.join('|'), :iiurlwidth => 10, :iiurlheight => 10) - begin - data = api.query(:prop => 'imageinfo', :iiprop => 'url|size|mime', :titles => some_titles.join('|'), :iiurlwidth => 10, :iiurlheight => 10) - - if !data['query'] - puts "Wiki API call failed (no 'query' field):" - pp data - next - end + if !data['query'] + puts "Wiki API call failed (no 'query' field):" + pp data + next + end - normalized = data['query']['normalized'] - if normalized - normalized.each do |n| - db.execute('UPDATE wikipages SET image=? WHERE image=?', n['to'], n['from']) - db.execute('UPDATE relation_pages SET image=? WHERE image=?', n['to'], n['from']) + normalized = data['query']['normalized'] + if normalized + normalized.each do |n| + db.execute('UPDATE wikipages SET image=? WHERE image=?', n['to'], n['from']) + db.execute('UPDATE relation_pages SET image=? WHERE image=?', n['to'], n['from']) + end end - end - if !data['query']['pages'] - puts "Wiki API call failed (no 'pages' field):" - pp data - next - end + if !data['query']['pages'] + puts "Wiki API call failed (no 'pages' field):" + pp data + next + end - data['query']['pages'].each do |k,v| - if v['imageinfo'] && ! images_added[v['title']] - info = v['imageinfo'][0] - if info['thumburl'].match(%r{^(.*/)[0-9]{1,4}(px-.*)$}) - prefix = $1 - suffix = $2 - else - prefix = nil - suffix = nil - puts "Wrong thumbnail format: '#{info['thumburl']}'" + data['query']['pages'].each do |k,v| + if v['imageinfo'] && ! images_added[v['title']] + info = v['imageinfo'][0] + if info['thumburl'].match(%r{^(.*/)[0-9]{1,4}(px-.*)$}) + prefix = $1 + suffix = $2 + else + prefix = nil + suffix = nil + puts "Wrong thumbnail format: '#{info['thumburl']}'" + end + images_added[v['title']] = 1 + db.execute("INSERT INTO wiki_images (image, width, height, size, mime, image_url, thumb_url_prefix, thumb_url_suffix) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", + v['title'], + info['width'], + info['height'], + info['size'], + info['mime'], + info['url'], + prefix, + suffix + ) end - images_added[v['title']] = 1 - db.execute("INSERT INTO wiki_images (image, width, height, size, mime, image_url, thumb_url_prefix, thumb_url_suffix) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", - v['title'], - info['width'], - info['height'], - info['size'], - info['mime'], - info['url'], - prefix, - suffix - ) end + rescue + puts "Wiki API call error:" + pp data end - rescue - puts "Wiki API call error:" - pp data end end -db.execute('COMMIT'); - #-- THE END ------------------------------------------------------------------- diff --git a/sources/wiki/get_page_list.rb b/sources/wiki/get_page_list.rb index 3c4b219..05812c3 100755 --- a/sources/wiki/get_page_list.rb +++ b/sources/wiki/get_page_list.rb @@ -26,7 +26,7 @@ # #------------------------------------------------------------------------------ # -# Copyright (C) 2012 Jochen Topf <jochen@remote.org> +# Copyright (C) 2013 Jochen Topf <jochen@remote.org> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -44,15 +44,10 @@ # #------------------------------------------------------------------------------ -require 'rubygems' - -require 'pp' - require 'net/http' require 'uri' require 'json' - require './lib/mediawikiapi.rb' #------------------------------------------------------------------------------ @@ -72,7 +67,6 @@ def get_page_list(api, namespaceid, options) apfrom = '' loop do data = api.query(:generator => 'allpages', :gaplimit => 'max', :gapfrom => apfrom, :gapnamespace => namespaceid, :gapfilterredir => options[:redirect] ? 'redirects' : 'nonredirects', :prop => 'info') -# pp data data['query']['pages'].each do |k,v| yield v['touched'], v['title'].gsub(/\s/, '_') end @@ -122,4 +116,5 @@ end tagpages.close allpages.close + #-- THE END ------------------------------------------------------------------- diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb index d593d9d..f90f4f5 100755 --- a/sources/wiki/get_wiki_data.rb +++ b/sources/wiki/get_wiki_data.rb @@ -35,10 +35,6 @@ # #------------------------------------------------------------------------------ -require 'rubygems' - -require 'pp' - require 'json' require 'net/http' require 'uri' @@ -210,6 +206,8 @@ class WikiPage end end +#------------------------------------------------------------------------------ + class KeyOrTagPage < WikiPage def initialize(type, timestamp, namespace, title) @@ -262,6 +260,8 @@ class KeyOrTagPage < WikiPage end +#------------------------------------------------------------------------------ + class KeyPage < KeyOrTagPage end @@ -384,49 +384,49 @@ end #------------------------------------------------------------------------------ dir = ARGV[0] || '.' - -api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?') - db = SQLite3::Database.new(dir + '/taginfo-wiki.db') db.results_as_hash = true -cache = Cache.new(dir, db, api) +#------------------------------------------------------------------------------ -db.execute('BEGIN TRANSACTION') +api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?') -File.open(dir + '/tagpages.list') do |wikipages| - wikipages.each do |line| - line.chomp! - (type, timestamp, namespace, title) = line.split("\t") +cache = Cache.new(dir, db, api) - if title =~ /(^|:)Key:/ - page = KeyPage.new(type, timestamp, namespace, title) - elsif title =~ /(^|:)Tag:/ - page = TagPage.new(type, timestamp, namespace, title) - elsif title =~ /(^|:)Relation:/ - page = RelationPage.new(type, timestamp, namespace, title) - else - puts "Wiki page has wrong format: '#{title}'" - next - end +db.transaction do |db| + + File.open(dir + '/tagpages.list') do |wikipages| + wikipages.each do |line| + line.chomp! + (type, timestamp, namespace, title) = line.split("\t") + + if title =~ /(^|:)Key:/ + page = KeyPage.new(type, timestamp, namespace, title) + elsif title =~ /(^|:)Tag:/ + page = TagPage.new(type, timestamp, namespace, title) + elsif title =~ /(^|:)Relation:/ + page = RelationPage.new(type, timestamp, namespace, title) + else + puts "Wiki page has wrong format: '#{title}'" + next + end - puts "Parsing page: title='#{page.title}' type='#{page.type}' timestamp='#{page.timestamp}' namespace='#{page.namespace}'" + puts "Parsing page: title='#{page.title}' type='#{page.type}' timestamp='#{page.timestamp}' namespace='#{page.namespace}'" - reason = page.check_title - if reason == :ok - cache.get_page(page) - page.parse_content(db) - page.insert(db) - else - puts "invalid page: #{reason} #{page.title}" - db.execute('INSERT INTO invalid_page_titles (reason, title) VALUES (?, ?)', reason.to_s, page.title) + reason = page.check_title + if reason == :ok + cache.get_page(page) + page.parse_content(db) + page.insert(db) + else + puts "invalid page: #{reason} #{page.title}" + db.execute('INSERT INTO invalid_page_titles (reason, title) VALUES (?, ?)', reason.to_s, page.title) + end end end -end -cache.cleanup - -db.execute('COMMIT') + cache.cleanup +end #-- THE END ------------------------------------------------------------------- |