1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
#!/usr/bin/env ruby
#------------------------------------------------------------------------------
#
# get_page_list.rb [DIR]
#
#------------------------------------------------------------------------------
#
# Gets the list of all wiki pages from the OSM wiki.
#
# Two files will be written: 'allpages.list' contains all pages in the wiki,
# 'tagpages.list' contains all pages about tags from the wiki. Both have the
# format:
#
# <type> TAB <timestamp> TAB <namespace> TAB <title>
#
# The <type> is either 'page' or 'redirect', depending on whether this is a
# proper wiki page or a redirect to another wiki page, respectively.
#
# The <timestamp> is the 'touched' value the wiki API reports for the page.
#
# The <namespace> gives the namespace this page is in. This is empty for the
# main namespace.
#
# <title> is the full title of the wiki page including leading namespaces etc.
# Whitespace in the title is replaced by underscores.
#
# The files will be created in DIR or in the current directory, if no directory
# was given on the command line.
#
#------------------------------------------------------------------------------
#
# Copyright (C) 2013 Jochen Topf <jochen@remote.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#------------------------------------------------------------------------------
require 'net/http'
require 'uri'
require 'json'
require './lib/mediawikiapi.rb'
#------------------------------------------------------------------------------
# Ask the wiki for its namespaces and keep those whose canonical name
# consists of exactly two capital letters.
#
# api - object whose query(params) returns the decoded siteinfo response
#
# Returns a Hash mapping the canonical namespace name to the namespace id.
def get_namespaces(api)
  siteinfo = api.query(:meta => 'siteinfo', :siprop => 'namespaces')

  siteinfo['query']['namespaces'].each_value.each_with_object({}) do |entry, ids_by_name|
    canonical = entry['canonical']
    # Entries without a canonical name or with any other name are skipped
    next unless canonical =~ /^[A-Z]{2}$/

    ids_by_name[canonical] = entry['id']
  end
end
# Walk through all pages of one namespace, batch by batch, and yield each
# page to the given block.
#
# api         - object whose query(params) returns the decoded API response
# namespaceid - id of the namespace to list
# options     - Hash; a truthy :redirect lists only redirects, otherwise
#               only non-redirect pages are listed
#
# Yields the page's 'touched' value and its title with every whitespace
# character replaced by an underscore.
#
# Returns nil once the response carries no 'query-continue' entry.
def get_page_list(api, namespaceid, options)
  apfrom = ''
  redirect_filter = options[:redirect] ? 'redirects' : 'nonredirects'

  loop do
    data = api.query(:generator => 'allpages', :gaplimit => 'max', :gapfrom => apfrom, :gapnamespace => namespaceid, :gapfilterredir => redirect_filter, :prop => 'info')

    # A response without 'query' or 'pages' (nothing matched the filter in
    # this namespace) is treated as an empty batch instead of crashing on nil.
    pages = (data['query'] || {})['pages'] || {}
    pages.each_value do |page|
      yield page['touched'], page['title'].gsub(/\s/, '_')
    end

    # NOTE(review): only the legacy 'query-continue' continuation format is
    # understood here - confirm the API wrapper requests raw continuation.
    continuation = data['query-continue']
    break unless continuation

    apfrom = continuation['allpages']['gapcontinue'].gsub(/\s/, '_')
  end
end
#------------------------------------------------------------------------------
# Output directory: first command line argument, else the current directory
dir = ARGV[0] || '.'

api = MediaWikiAPI::API.new('wiki.openstreetmap.org')

namespaces = get_namespaces(api)

# add main namespace
namespaces[''] = 0

# Titles of pages about tags: an optional leading "<prefix>:" followed by
# "Key:", "Tag:" or "Relation:" and the rest of the title
tagpage_pattern = /^([^:]+:)?(Key|Tag|Relation):(.+)$/

# For every namespace the proper pages are listed first, then the redirects
page_kinds = [['page', false], ['redirect', true]]

# Block form makes sure both files are closed even if a query or write fails
File.open(File.join(dir, 'allpages.list'), 'w') do |allpages|
  File.open(File.join(dir, 'tagpages.list'), 'w') do |tagpages|
    namespaces.keys.sort.each do |namespace|
      id = namespaces[namespace]

      page_kinds.each do |type, redirect|
        get_page_list(api, id, :redirect => redirect) do |timestamp, page|
          line = [type, timestamp, namespace, page].join("\t")
          allpages.puts line
          tagpages.puts line if page =~ tagpage_pattern
        end
      end
    end
  end
end
#-- THE END -------------------------------------------------------------------
|