summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJochen Topf <jochen@topf.org>2015-03-24 18:00:41 +0100
committerJochen Topf <jochen@topf.org>2015-04-28 21:55:12 +0200
commit47cb7bf89567ecf603e23068fc5ac07aa801d487 (patch)
tree57cbe52fa8c49d7eb1f068f3ef394d36a49bd807
parentc8a334b10512a708d19ed5fd42aa60424290a371 (diff)
downloadtaginfo-47cb7bf89567ecf603e23068fc5ac07aa801d487.tar
taginfo-47cb7bf89567ecf603e23068fc5ac07aa801d487.tar.gz
Get and classify all links in wiki to Key/Tag/Relation pages.
This adds two scripts, the first (get_links.rb) uses the Mediawiki API to get all pages that link to any Key/Tag/Relation pages. The second (classify_links.rb) puts those links into different categories. The update script is prepared to call those scripts, but the code is disabled at the moment.
-rwxr-xr-xsources/wiki/classify_links.rb127
-rwxr-xr-xsources/wiki/get_links.rb81
-rw-r--r--sources/wiki/pre.sql14
-rwxr-xr-xsources/wiki/update.sh6
4 files changed, 228 insertions, 0 deletions
diff --git a/sources/wiki/classify_links.rb b/sources/wiki/classify_links.rb
new file mode 100755
index 0000000..907eadc
--- /dev/null
+++ b/sources/wiki/classify_links.rb
@@ -0,0 +1,127 @@
+#!/usr/bin/env ruby
+#------------------------------------------------------------------------------
+#
+# classify_links.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+# Read the links we got from get_links.rb, classify them, and add them to the
+# taginfo-wiki.db database.
+#
+# Classification (link_class):
+#
+# category - From a Category: page
+# how_to_map - From any "How to map" page
+# import - From any "Import" page
+# key_to_tag - From a Key to one of its Tags
+# ktr - From any Key/Tag/Relation page
+# map_features - From any "Map Features" page
+# proposed - From any "Proposed" page
+# rest - From anything else
+# same - From one language variant to another of the same Key/Tag/Relation
+# tag_to_key - From a Tag to its Key
+# template - From any "Template:" page
+# user - From any "User:" or "User talk:" page
+#
+#------------------------------------------------------------------------------
+#
+# Copyright (C) 2015 Jochen Topf <jochen@remote.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'sqlite3'
+
+dir = ARGV[0] || '.' # directory holding taginfo-wiki.db and links.list; defaults to the current directory
+
+db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
+db.results_as_hash = true
+
+# Regular expression matching Key/Tag/Relation pages in all languages
+regexp_ktr = Regexp.new('^(?:(.*):)?(Key|Tag|Relation):(.*)$') # captures: 1 = optional prefix (stored as *_lang), 2 = page type, 3 = name
+
+db.transaction do |db| # all inserts happen inside a single transaction
+
+ File.open(dir + '/links.list') do |linkfile|
+ linkfile.each do |line|
+ line.chomp!
+ (from, to) = line.split("\t") # each line: title of the linking page, TAB, title of the linked page
+
+ link_class = 'rest' # default; every matching check below overwrites it, so the last match wins
+
+ if from =~ /^Category:/
+ link_class = 'category'
+ end
+
+ if from =~ /^(([A-Za-z]+):)?Template(_talk)?:/
+ link_class = 'template'
+ end
+
+ if from =~ /Map_Features/i
+ link_class = 'map_features'
+ end
+
+ if from =~ /Import/i
+ link_class = 'import'
+ end
+
+ if from =~ /How_to_map_a$/ # case-sensitive and must be at the end of the title, unlike the other keyword checks
+ link_class = 'how_to_map'
+ end
+
+ if from =~ /Proposed_features/i
+ link_class = 'proposed'
+ end
+
+ if from =~ /^(([A-Za-z]+):)?User(_talk)?:/
+ link_class = 'user'
+ end
+
+ fm = from.match regexp_ktr # nil unless the linking page is a Key/Tag/Relation page
+ if fm
+ from_lang = fm[1]
+ from_type = fm[2]
+ from_name = fm[3]
+ end
+
+ tm = to.match regexp_ktr # NOTE(review): to is nil when the line contains no TAB, in which case this call raises
+ if tm
+ to_lang = tm[1]
+ to_type = tm[2]
+ to_name = tm[3]
+ end
+
+ if fm && tm # both ends are Key/Tag/Relation pages: this result replaces any class chosen above
+ if from_type == to_type && from_name == to_name # same page type and name; the language prefix is not compared
+ link_class = 'same'
+ elsif from_type == 'Tag' && to_type == 'Key' && from_name.sub(/=.*/, '') == to_name # tag name up to the first '=' equals the key name
+ link_class = 'tag_to_key'
+ elsif from_type == 'Key' && to_type == 'Tag' && to_name.sub(/=.*/, '') == from_name # key name equals the tag name up to the first '='
+ link_class = 'key_to_tag'
+ else
+ link_class = 'ktr'
+ end
+ end
+
+ db.execute("INSERT INTO wiki_links (link_class, from_title, from_lang, from_type, from_name, to_title, to_lang, to_type, to_name) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
+ link_class,
+ from, from_lang, from_type, from_name, # the from_* parts are nil when from did not match regexp_ktr
+ to, to_lang, to_type, to_name # the to_* parts are nil when to did not match regexp_ktr
+ )
+# puts "#{link_class}\t#{from}\t#{from_lang}\t#{from_type}\t#{from_name}\t#{to}\t#{to_lang}\t#{to_type}\t#{to_name}"
+ end
+ end
+end
diff --git a/sources/wiki/get_links.rb b/sources/wiki/get_links.rb
new file mode 100755
index 0000000..19c06c8
--- /dev/null
+++ b/sources/wiki/get_links.rb
@@ -0,0 +1,81 @@
+#!/usr/bin/env ruby
+#------------------------------------------------------------------------------
+#
+# get_links.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+# Get a list of pages linking to all Key/Tag/Relation pages from the OSM
+# wiki. This list will include links from other language versions of the
+# same Key/Tag/Relation, links from other Key/Tag/Relation pages and links
+# from all other wiki pages.
+#
+# Output is on STDOUT with the title of the page the link is from, a TAB
+# character, and the title of the page the link is to. The underscore (_) is
+# used where there are spaces in a title.
+#
+#------------------------------------------------------------------------------
+#
+# Copyright (C) 2015 Jochen Topf <jochen@remote.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'net/http'
+require 'uri'
+require 'json'
+require 'pp'
+
+require './lib/mediawikiapi.rb'
+
+#------------------------------------------------------------------------------
+
+def what_links_to(api, title) # prints one "LINKING_PAGE<TAB>title" line to STDOUT per backlink returned by api.query
+ blcontinue = nil # continuation value from the previous response; nil for the first request
+ loop do # one API request per iteration
+ options = { :action => 'query', :list => 'backlinks', :bltitle => title, :bllimit => 500 }
+ if blcontinue # not the first request: pass the continuation value along
+ options[:blcontinue] = blcontinue
+ end
+ data = api.query(options) # expected to return a Hash with data['query']['backlinks'] as a list of Hashes with a 'title' key
+ data['query']['backlinks'].each do |bl|
+ bl['title'].gsub!(/\s/, '_') # replace each whitespace character with an underscore, in place
+ puts "#{bl['title']}\t#{title}"
+ end
+ if data['query-continue'] # response carries continuation data: request the next batch
+ blcontinue = data['query-continue']['backlinks']['blcontinue'].gsub(/\s/, '_') # whitespace in the value is replaced by underscores as well
+ else
+ return # no continuation data: nothing more to request
+ end
+ end
+end
+
+#------------------------------------------------------------------------------
+
+dir = ARGV[0] || '.' # directory containing tagpages.list; defaults to the current directory
+
+api = MediaWikiAPI::API.new('wiki.openstreetmap.org')
+
+File.open(dir + '/tagpages.list') do |tagpages|
+ tagpages.each do |line|
+ line.chomp!
+ (type, timestamp, namespace, title) = line.split("\t") # four TAB-separated fields per line; only title is used here
+ what_links_to(api, title) # prints all pages linking to this title
+ end
+end
+
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql
index 280949a..742d8f0 100644
--- a/sources/wiki/pre.sql
+++ b/sources/wiki/pre.sql
@@ -87,6 +87,20 @@ CREATE TABLE wiki_languages (
count_pages INTEGER
);
+DROP TABLE IF EXISTS wiki_links;
+
+CREATE TABLE wiki_links ( -- one row per link, inserted by classify_links.rb
+ link_class TEXT, -- classification assigned by classify_links.rb ('rest' by default)
+ from_title TEXT, -- full title of the linking page
+ from_lang TEXT, -- prefix before Key:/Tag:/Relation: in from_title; NULL if none or not such a page
+ from_type TEXT, -- 'Key', 'Tag' or 'Relation'; NULL if from_title is not such a page
+ from_name TEXT, -- part of from_title after the type prefix; NULL if from_title is not such a page
+ to_title TEXT, -- full title of the linked page
+ to_lang TEXT, -- prefix before Key:/Tag:/Relation: in to_title; NULL if none or not such a page
+ to_type TEXT, -- 'Key', 'Tag' or 'Relation'; NULL if to_title is not such a page
+ to_name TEXT -- part of to_title after the type prefix; NULL if to_title is not such a page
+);
+
DROP TABLE IF EXISTS invalid_page_title;
CREATE TABLE invalid_page_titles (
diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh
index e922710..b8ad763 100755
--- a/sources/wiki/update.sh
+++ b/sources/wiki/update.sh
@@ -53,6 +53,12 @@ $EXEC_RUBY ./get_wiki_data.rb $DIR >$LOGFILE_WIKI_DATA
echo "`$DATECMD` Getting image info..."
$EXEC_RUBY ./get_image_info.rb $DIR >$LOGFILE_IMAGE_INFO
+#echo "`$DATECMD` Getting links to Key/Tag/Relation pages..."
+#$EXEC_RUBY ./get_links.rb $DIR >$DIR/links.list
+
+#echo "`$DATECMD` Classifying links..."
+#$EXEC_RUBY ./classify_links.rb $DIR
+
echo "`$DATECMD` Extracting words..."
$EXEC_RUBY ./extract_words.rb $DIR