diff options
author | Jochen Topf <jochen@topf.org> | 2013-01-17 22:35:42 +0100 |
---|---|---|
committer | Jochen Topf <jochen@topf.org> | 2013-01-17 22:35:42 +0100 |
commit | 2cc4f68ca65b7d9c8a20aed1875ed464ee8afa4d (patch) | |
tree | 26461042801daca6310c5b07f98d00753bbec956 /sources/wiki | |
parent | d22886fe9671af0f24e8619501c8df422f0b708f (diff) | |
download | taginfo-2cc4f68ca65b7d9c8a20aed1875ed464ee8afa4d.tar taginfo-2cc4f68ca65b7d9c8a20aed1875ed464ee8afa4d.tar.gz |
Add parsing of Relation:* pages to wiki source.
Diffstat (limited to 'sources/wiki')
-rwxr-xr-x | sources/wiki/get_page_list.rb | 4 | ||||
-rwxr-xr-x | sources/wiki/get_wiki_data.rb | 375 | ||||
-rw-r--r-- | sources/wiki/post.sql | 7 | ||||
-rw-r--r-- | sources/wiki/pre.sql | 17 |
4 files changed, 247 insertions, 156 deletions
diff --git a/sources/wiki/get_page_list.rb b/sources/wiki/get_page_list.rb index 0ea5976..eb5ddaa 100755 --- a/sources/wiki/get_page_list.rb +++ b/sources/wiki/get_page_list.rb @@ -104,7 +104,7 @@ namespaces.keys.sort.each do |namespace| get_page_list(api, id, :redirect => false) do |timestamp, page| line = ['page', timestamp, namespace, page].join("\t") allpages.puts line - if page =~ /^([^:]+:)?(Key|Tag):(.+)$/ + if page =~ /^([^:]+:)?(Key|Tag|Relation):(.+)$/ tagpages.puts line end end @@ -112,7 +112,7 @@ namespaces.keys.sort.each do |namespace| get_page_list(api, id, :redirect => true) do |timestamp, page| line = ['redirect', timestamp, namespace, page].join("\t") allpages.puts line - if page =~ /^([^:]+:)?(Key|Tag):(.+)$/ + if page =~ /^([^:]+:)?(Key|Tag|Relation):(.+)$/ tagpages.puts line end end diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb index 5ff8cbb..2803d33 100755 --- a/sources/wiki/get_wiki_data.rb +++ b/sources/wiki/get_wiki_data.rb @@ -53,8 +53,15 @@ class WikiPage @@pages = {} attr_accessor :content - attr_accessor :description, :image, :group, :onNode, :onWay, :onArea, :onRelation, :has_templ - attr_reader :type, :timestamp, :namespace, :title, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed + attr_reader :type, :timestamp, :namespace, :title, :description, :image, :tag, :key, :value, :lang, :ttype, :tags_implies, :tags_combination, :tags_linked, :parsed, :has_templ, :group, :onNode, :onWay, :onArea, :onRelation + + def self.pages + @@pages.values.sort{ |a,b| a.title <=> b.title } + end + + def self.find(name) + @@pages[name] + end def initialize(type, timestamp, namespace, title) @type = type # 'page' or 'redirect' @@ -62,41 +69,13 @@ class WikiPage @namespace = namespace # 'XX' (mediawiki namespace or '') @title = title # wiki page title - @tag = title.gsub(/^([^:]+:)?(Key|Tag):/, '') # complete tag (key=value) - @key = @tag.sub(/=.*/, '') # key - if @tag =~ /=/ - @value = @tag.sub(/.*?=/, '') # value (if any) - end - if title =~ /^(.*):(Key|Tag):/ - @lang = $1.downcase # IETF language tag - @ttype = $2.downcase # 'tag' or 'key' - else - @lang = 'en' - end - @has_templ = false - - @tags_implies = [] - @tags_combination = [] - @tags_linked = [] - - @group = '' - @onNode = false - @onWay = false - @onArea = false - @onRelation = false - @parsed = nil - @@pages[@title] = self - end - - def self.pages - @@pages.values.sort{ |a,b| a.title <=> b.title } - end + @tags_linked = [] + @group = '' - def self.find(name) - @@pages[name] + @@pages[@title] = self end # Has this wiki page a name that we can understand and process? @@ -119,34 +98,9 @@ class WikiPage @tags_linked << tag end - def insert(db) - db.execute( - "INSERT INTO wikipages (lang, tag, key, value, title, body, tgroup, type, has_templ, parsed, description, image, on_node, on_way, on_area, on_relation, tags_implies, tags_combination, tags_linked) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", - lang, - tag, - key, - value, - title, - content, - group, - type, - has_templ ? 1 : 0, - parsed ? 1 : 0, - description, - image, - onNode ? 1 : 0, - onWay ? 1 : 0, - onArea ? 1 : 0, - onRelation ? 1 : 0, - tags_implies. sort.uniq.join(','), - tags_combination.sort.uniq.join(','), - tags_linked. sort.uniq.join(',') - ) - end - # Parse content of the wiki page. This will find the templates # and their parameters. - def parse_content + def parse_content(db) @parsed = true text = @content.gsub(%r{<!--.*?-->}, '') @@ -170,7 +124,7 @@ class WikiPage when '}}' # end of template context.last.add_parameter(m[1].strip) c = context.pop - yield c + parse_template(c, db) context.last.add_parameter(c) when '|' # template parameter context.last.add_parameter(m[1].strip) @@ -183,11 +137,166 @@ class WikiPage # 'after' is our next 'text' text = m[3] end - rescue - puts "Parsing of page #{title} failed" + rescue => ex + puts "Parsing of page #{title} failed '#{ex.message}'" @parsed = false end + def parse_template(template, db) + puts "Template: #{template.name} [#{template.parameters.join(',')}] #{template.named_parameters.inspect}" + if template.name == 'Key' || template.name == 'Tag' + tag = template.parameters[0] + if template.parameters[1] + tag += '=' + template.parameters[1] + end + add_tag_link(tag) + end + if template.name =~ /(Key|Value|Relation)Description$/ + @has_templ = true + if template.named_parameters['description'] + desc = [] + template.named_parameters['description'].each do |i| + if i.class == Template + desc << ' ' << i.parameters.join('=') << ' ' + else + desc << i + end + @description = desc.join('').strip + end + end + if template.named_parameters['image'] + ititle = template.named_parameters['image'][0] + if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i) + @image = "File:#{$2}" + else + puts "invalid image: page='#{title}' image='#{ititle}'" + db.execute('INSERT INTO invalid_image_titles (page_title, image_title) VALUES (?, ?)', title, ititle) + @image = '' + end + end + if template.named_parameters['group'] + @group = template.named_parameters['group'][0] + end + if template.named_parameters['onNode'] == ['yes'] + @onNode = true + end + if template.named_parameters['onWay'] == ['yes'] + @onWay = true + end + if template.named_parameters['onArea'] == ['yes'] + @onArea = true + end + if template.named_parameters['onRelation'] == ['yes'] + @onRelation = true + end + if template.named_parameters['implies'] + template.named_parameters['implies'].each do |i| + if i.class == Template + tags_implies << i.parameters.join('=') + end + end + end + if template.named_parameters['combination'] + template.named_parameters['combination'].each do |i| + if i.class == Template + tags_combination << i.parameters.join('=') + end + end + end + end + end +end + +class KeyOrTagPage < WikiPage + + def initialize(type, timestamp, namespace, title) + super(type, timestamp, namespace, title) + + @tag = title.gsub(/^([^:]+:)?(Key|Tag):/, '') # complete tag (key=value) + @key = @tag.sub(/=.*/, '') # key + if @tag =~ /=/ + @value = @tag.sub(/.*?=/, '') # value (if any) + end + if title =~ /^(.*):(Key|Tag):/ + @lang = $1.downcase # IETF language tag + @ttype = $2.downcase # 'tag' or 'key' + else + @lang = 'en' + end + + @tags_implies = [] + @tags_combination = [] + @onNode = false + @onWay = false + @onArea = false + @onRelation = false + end + + def insert(db) + db.execute( + "INSERT INTO wikipages (lang, tag, key, value, title, body, tgroup, type, has_templ, parsed, description, image, on_node, on_way, on_area, on_relation, tags_implies, tags_combination, tags_linked) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + lang, + tag, + key, + value, + title, + content, + group, + type, + has_templ ? 1 : 0, + parsed ? 1 : 0, + description, + image, + onNode ? 1 : 0, + onWay ? 1 : 0, + onArea ? 1 : 0, + onRelation ? 1 : 0, + tags_implies. sort.uniq.join(','), + tags_combination.sort.uniq.join(','), + tags_linked. sort.uniq.join(',') + ) + end + +end + +class KeyPage < KeyOrTagPage +end + +class TagPage < KeyOrTagPage +end + +class RelationPage < WikiPage + + attr_reader :rtype + + def initialize(type, timestamp, namespace, title) + super(type, timestamp, namespace, title) + + @rtype = title.gsub(/^([^:]+:)?Relation:/, '') # relation type + if title =~ /^(.*):Relation:/ + @lang = $1.downcase # IETF language tag + else + @lang = 'en' + end + end + + def insert(db) + db.execute( + "INSERT INTO relation_pages (lang, rtype, title, body, tgroup, type, has_templ, parsed, description, image, tags_linked) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + lang, + rtype, + title, + content, + group, + type, + has_templ ? 1 : 0, + parsed ? 1 : 0, + description, + image, + tags_linked.sort.uniq.join(',') + ) + end + end #------------------------------------------------------------------------------ @@ -227,28 +336,42 @@ end #------------------------------------------------------------------------------ -def get_page(db, api, page) - db.execute("SELECT * FROM cache.cache_pages WHERE title=? AND timestamp=?", page.title, page.timestamp) do |row| - page.content = row['body'] - puts "Page #{ page.title } in cache (#{ page.timestamp })" - return +class Cache + + def initialize(dir, db, api) + @db = db + @api = api + @db.execute("ATTACH DATABASE ? AS cache", dir + '/wikicache.db') + @current_pagetitles = {} end - db.execute("DELETE FROM cache.cache_pages WHERE title=?", page.title); - res = api.get(page.params) - page.content = res.body - db.execute("INSERT INTO cache.cache_pages (title, timestamp, body) VALUES (?, ?, ?)", page.title, page.timestamp, page.content); - puts "Page #{ page.title } not in cache (#{ page.timestamp })" -end -def cleanup_cache(db, current_pagetitles) - db.execute("SELECT title FROM cache.cache_pages") do |row| - current_pagetitles.delete(row['title']) + def get_page(page) + @current_pagetitles[page.title] = page.timestamp + @db.execute("SELECT * FROM cache.cache_pages WHERE title=? AND timestamp=?", page.title, page.timestamp) do |row| + page.content = row['body'] + puts "Page #{ page.title } in cache (#{ page.timestamp })" + return + end + @db.execute("DELETE FROM cache.cache_pages WHERE title=?", page.title); + res = @api.get(page.params) + page.content = res.body + @db.execute("INSERT INTO cache.cache_pages (title, timestamp, body) VALUES (?, ?, ?)", page.title, page.timestamp, page.content); + puts "Page #{ page.title } not in cache (#{ page.timestamp })" end - to_delete = current_pagetitles.keys - puts "Deleting pages from cache: #{ to_delete.join(' ') }" - to_delete.each do |title| - db.execute("DELETE FROM cache.cache_pages WHERE title=?", title); + + # Removes pages from cache that are not in the wiki any more + def cleanup + @db.execute("SELECT title FROM cache.cache_pages") do |row| + @current_pagetitles.delete(row['title']) + end + + to_delete = @current_pagetitles.keys + puts "Deleting pages from cache: #{ to_delete.join(' ') }" + to_delete.each do |title| + @db.execute("DELETE FROM cache.cache_pages WHERE title=?", title); + end end + end #------------------------------------------------------------------------------ @@ -260,88 +383,32 @@ api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?') db = SQLite3::Database.new(dir + '/taginfo-wiki.db') db.results_as_hash = true -db.execute("ATTACH DATABASE '#{dir}/wikicache.db' AS cache") +cache = Cache.new(dir, db, api) db.execute('BEGIN TRANSACTION') -current_pagetitles = {} - File.open(dir + '/tagpages.list') do |wikipages| wikipages.each do |line| line.chomp! - t = line.split("\t") - page = WikiPage.new(t[0], t[1], t[2], t[3]) - puts "page: (#{page.title}) (#{page.type}) (#{page.timestamp}) (#{page.namespace}) (#{page.tag})" + (type, timestamp, namespace, title) = line.split("\t") + + if title =~ /(^|:)Key:/ + page = KeyPage.new(type, timestamp, namespace, title) + elsif title =~ /(^|:)Tag:/ + page = TagPage.new(type, timestamp, namespace, title) + elsif title =~ /(^|:)Relation:/ + page = RelationPage.new(type, timestamp, namespace, title) + else + puts "Wiki page has wrong format: '#{title}'" + next + end + + puts "Parsing page: title='#{page.title}' type='#{page.type}' timestamp='#{page.timestamp}' namespace='#{page.namespace}'" reason = page.check_title if reason == :ok - current_pagetitles[page.title] = page.timestamp - - get_page(db, api, page) - - page.parse_content do |template| - #puts "Template: #{template.name} [#{template.parameters.join(',')}] #{template.named_parameters.inspect}" - if template.name == 'Key' || template.name == 'Tag' - tag = template.parameters[0] - if template.parameters[1] - tag += '=' + template.parameters[1] - end - page.add_tag_link(tag) - end - if template.name =~ /(Key|Value)Description$/ - page.has_templ = true - if template.named_parameters['description'] - desc = [] - template.named_parameters['description'].each do |i| - if i.class == Template - desc << ' ' << i.parameters.join('=') << ' ' - else - desc << i - end - page.description = desc.join('').strip - end - end - if template.named_parameters['image'] - ititle = template.named_parameters['image'][0] - if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i) - page.image = "File:#{$2}" - else - puts "invalid image: page='#{page.title}' image='#{ititle}'" - db.execute('INSERT INTO invalid_image_titles (page_title, image_title) VALUES (?, ?)', page.title, ititle) - page.image = '' - end - end - if template.named_parameters['group'] - page.group = template.named_parameters['group'][0] - end - if template.named_parameters['onNode'] == ['yes'] - page.onNode = true - end - if template.named_parameters['onWay'] == ['yes'] - page.onWay = true - end - if template.named_parameters['onArea'] == ['yes'] - page.onArea = true - end - if template.named_parameters['onRelation'] == ['yes'] - page.onRelation = true - end - if template.named_parameters['implies'] - template.named_parameters['implies'].each do |i| - if i.class == Template - page.tags_implies << i.parameters.join('=') - end - end - end - if template.named_parameters['combination'] - template.named_parameters['combination'].each do |i| - if i.class == Template - page.tags_combination << i.parameters.join('=') - end - end - end - end - end + cache.get_page(page) + page.parse_content(db) page.insert(db) else puts "invalid page: #{reason} #{page.title}" @@ -350,7 +417,7 @@ File.open(dir + '/tagpages.list') do |wikipages| end end -cleanup_cache(db, current_pagetitles) +cache.cleanup db.execute('COMMIT') diff --git a/sources/wiki/post.sql b/sources/wiki/post.sql index 99eb26f..8bff9fd 100644 --- a/sources/wiki/post.sql +++ b/sources/wiki/post.sql @@ -13,6 +13,13 @@ UPDATE wikipages SET status='e' WHERE type='page' AND has_templ='true' AND parse CREATE INDEX wikipages_key_value_idx ON wikipages(key, value); +UPDATE relation_pages SET status='r' WHERE type='redirect'; +UPDATE relation_pages SET status='p' WHERE type='page' AND has_templ='false'; +UPDATE relation_pages SET status='t' WHERE type='page' AND has_templ='true' AND parsed=1; +UPDATE relation_pages SET status='e' WHERE type='page' AND has_templ='true' AND parsed=0; + +CREATE INDEX relation_pages_rtype_idx ON relation_pages(rtype); + CREATE INDEX wiki_images_image ON wiki_images(image); INSERT INTO wikipages_keys (key, langs, lang_count) SELECT key, group_concat(lang || ' ' || status), count(*) FROM wikipages WHERE value IS NULL GROUP BY key; diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql index 8a515e7..280949a 100644 --- a/sources/wiki/pre.sql +++ b/sources/wiki/pre.sql @@ -33,6 +33,23 @@ CREATE TABLE wikipages ( status TEXT ); +DROP TABLE IF EXISTS relation_pages; + +CREATE TABLE relation_pages ( + lang TEXT, + rtype TEXT, + title TEXT, + body TEXT, + tgroup TEXT, + type TEXT, + has_templ INTEGER, + parsed INTEGER, + description TEXT, + image TEXT, + tags_linked TEXT, + status TEXT +); + DROP TABLE IF EXISTS wiki_images; CREATE TABLE wiki_images ( |