From 47cb7bf89567ecf603e23068fc5ac07aa801d487 Mon Sep 17 00:00:00 2001 From: Jochen Topf Date: Tue, 24 Mar 2015 18:00:41 +0100 Subject: Get and classify all links in wiki to Key/Tag/Relation pages. This adds two scripts, the first (get_links.rb) uses the Mediawiki API to get all pages that link to any Key/Tag/Relation pages. The second (classify_links.rb) puts those links into different categories. The update script is prepared to call those scripts, but the code is disabled at the moment. --- sources/wiki/classify_links.rb | 127 +++++++++++++++++++++++++++++++++++++++++ sources/wiki/get_links.rb | 81 ++++++++++++++++++++++++++ sources/wiki/pre.sql | 14 +++++ sources/wiki/update.sh | 6 ++ 4 files changed, 228 insertions(+) create mode 100755 sources/wiki/classify_links.rb create mode 100755 sources/wiki/get_links.rb diff --git a/sources/wiki/classify_links.rb b/sources/wiki/classify_links.rb new file mode 100755 index 0000000..907eadc --- /dev/null +++ b/sources/wiki/classify_links.rb @@ -0,0 +1,127 @@ +#!/usr/bin/env ruby +#------------------------------------------------------------------------------ +# +# classify_links.rb [DIR] +# +#------------------------------------------------------------------------------ +# +# Read the links we got from get_links.rb, classify them, and add them to the +# taginfo-wiki.db database. 
+# +# Classification (link_class): +# +# category - From a Category: page +# how_to_map - From any "How to map a" page +# import - From any "Import" page +# key_to_tag - From a Key to one of its Tags +# ktr - From any other Key/Tag/Relation page +# map_features - From any "Map Features" page +# proposed - From any "Proposed features" page +# rest - From anything else +# same - From one language variant to another of the same Key/Tag/Relation +# tag_to_key - From a Tag to its Key +# template - From any "Template:" page +# user - From any "User:" or "User talk:" page +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2015 Jochen Topf +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +#------------------------------------------------------------------------------ + +require 'sqlite3' + +dir = ARGV[0] || '.' + +db = SQLite3::Database.new(dir + '/taginfo-wiki.db') +db.results_as_hash = true + +# Regular expression matching Key/Tag/Relation pages in all languages +regexp_ktr = Regexp.new('^(?:(.*):)?(Key|Tag|Relation):(.*)$') + +db.transaction do |db| + + File.open(dir + '/links.list') do |linkfile| + linkfile.each do |line| + line.chomp! 
+ (from, to) = line.split("\t") + + link_class = 'rest' + + if from =~ /^Category:/ + link_class = 'category' + end + + if from =~ /^(([A-Za-z]+):)?Template(_talk)?:/ + link_class = 'template' + end + + if from =~ /Map_Features/i + link_class = 'map_features' + end + + if from =~ /Import/i + link_class = 'import' + end + + if from =~ /How_to_map_a$/ + link_class = 'how_to_map' + end + + if from =~ /Proposed_features/i + link_class = 'proposed' + end + + if from =~ /^(([A-Za-z]+):)?User(_talk)?:/ + link_class = 'user' + end + + fm = from.match regexp_ktr + if fm + from_lang = fm[1] + from_type = fm[2] + from_name = fm[3] + end + + tm = to.match regexp_ktr + if tm + to_lang = tm[1] + to_type = tm[2] + to_name = tm[3] + end + + if fm && tm + if from_type == to_type && from_name == to_name + link_class = 'same' + elsif from_type == 'Tag' && to_type == 'Key' && from_name.sub(/=.*/, '') == to_name + link_class = 'tag_to_key' + elsif from_type == 'Key' && to_type == 'Tag' && to_name.sub(/=.*/, '') == from_name + link_class = 'key_to_tag' + else + link_class = 'ktr' + end + end + + db.execute("INSERT INTO wiki_links (link_class, from_title, from_lang, from_type, from_name, to_title, to_lang, to_type, to_name) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + link_class, + from, from_lang, from_type, from_name, + to, to_lang, to_type, to_name + ) +# puts "#{link_class}\t#{from}\t#{from_lang}\t#{from_type}\t#{from_name}\t#{to}\t#{to_lang}\t#{to_type}\t#{to_name}" + end + end +end diff --git a/sources/wiki/get_links.rb b/sources/wiki/get_links.rb new file mode 100755 index 0000000..19c06c8 --- /dev/null +++ b/sources/wiki/get_links.rb @@ -0,0 +1,81 @@ +#!/usr/bin/env ruby +#------------------------------------------------------------------------------ +# +# get_links.rb [DIR] +# +#------------------------------------------------------------------------------ +# +# Get a list of pages linking to all Key/Tag/Relation pages from the OSM +# wiki. 
This list will include links from other language versions of the +# same Key/Tag/Relation, links from other Key/Tag/Relation pages and links +# from all other wiki pages. +# +# Output is on STDOUT with the title of the page the link is from, a TAB +# character, and the title of the page the link is to. The underscore (_) is +# used where there are spaces in a title. +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2015 Jochen Topf +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +#------------------------------------------------------------------------------ + +require 'net/http' +require 'uri' +require 'json' +require 'pp' + +require './lib/mediawikiapi.rb' + +#------------------------------------------------------------------------------ + +def what_links_to(api, title) + blcontinue = nil + loop do + options = { :action => 'query', :list => 'backlinks', :bltitle => title, :bllimit => 500 } + if blcontinue + options[:blcontinue] = blcontinue + end + data = api.query(options) + data['query']['backlinks'].each do |bl| + bl['title'].gsub!(/\s/, '_') + puts "#{bl['title']}\t#{title}" + end + if data['query-continue'] + blcontinue = data['query-continue']['backlinks']['blcontinue'].gsub(/\s/, '_') + else + return + end + end +end + +#------------------------------------------------------------------------------ + +dir = ARGV[0] || '.' + +api = MediaWikiAPI::API.new('wiki.openstreetmap.org') + +File.open(dir + '/tagpages.list') do |tagpages| + tagpages.each do |line| + line.chomp! + (type, timestamp, namespace, title) = line.split("\t") + what_links_to(api, title) + end +end + + +#-- THE END ------------------------------------------------------------------- diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql index 280949a..742d8f0 100644 --- a/sources/wiki/pre.sql +++ b/sources/wiki/pre.sql @@ -87,6 +87,20 @@ CREATE TABLE wiki_languages ( count_pages INTEGER ); +DROP TABLE IF EXISTS wiki_links; + +CREATE TABLE wiki_links ( + link_class TEXT, + from_title TEXT, + from_lang TEXT, + from_type TEXT, + from_name TEXT, + to_title TEXT, + to_lang TEXT, + to_type TEXT, + to_name TEXT +); + DROP TABLE IF EXISTS invalid_page_title; CREATE TABLE invalid_page_titles ( diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh index e922710..b8ad763 100755 --- a/sources/wiki/update.sh +++ b/sources/wiki/update.sh @@ -53,6 +53,12 @@ $EXEC_RUBY ./get_wiki_data.rb $DIR >$LOGFILE_WIKI_DATA echo "`$DATECMD` Getting image info..." 
$EXEC_RUBY ./get_image_info.rb $DIR >$LOGFILE_IMAGE_INFO +#echo "`$DATECMD` Getting links to Key/Tag/Relation pages..." +#$EXEC_RUBY ./get_links.rb $DIR >$DIR/links.list + +#echo "`$DATECMD` Classifying links..." +#$EXEC_RUBY ./classify_links.rb $DIR + echo "`$DATECMD` Extracting words..." $EXEC_RUBY ./extract_words.rb $DIR -- cgit v1.2.3