summaryrefslogtreecommitdiff
path: root/sources/wiki/get_page_list.rb
blob: 817481246c56b7bca05741ffa1ec2d68a4d9597c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/ruby
#------------------------------------------------------------------------------
#
#  get_page_list.rb [DIR]
#
#------------------------------------------------------------------------------
#
#  Gets the list of all wiki pages from the OSM wiki.
#
#  Two files will be written: 'allpages.list' contains all pages in the wiki,
#  'tagpages.list' contains all pages about tags from the wiki. Both have the
#  format:
#
#  <type> TAB <timestamp> TAB <namespace> TAB <title>
#
#  The <type> is either 'page' or 'redirect', depending on whether this is a
#  proper wiki page or a redirect to another wiki page, respectively.
#
#  The <timestamp> is the 'touched' value the wiki API reports for the page.
#
#  The <namespace> gives the namespace this page is in. This is empty for the
#  main namespace.
#
#  <title> is the full title of the wiki page including leading namespaces etc.
#
#  The files will be created in DIR or in the current directory, if no directory
#  was given on the command line.
#
#------------------------------------------------------------------------------
#
#  Copyright (C) 2013  Jochen Topf <jochen@remote.org>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
#------------------------------------------------------------------------------

require 'net/http'
require 'uri'
require 'json'

require './lib/mediawikiapi.rb'

#------------------------------------------------------------------------------

# Asks the wiki for its namespace table and picks out the namespaces whose
# canonical name consists of two uppercase letters A-Z (presumably the
# language namespaces such as 'DE' -- confirm against the wiki's siteinfo).
#
# api - object whose query(params) method returns the decoded API response.
#
# Returns a Hash mapping canonical namespace name to namespace id. Namespaces
# without a 'canonical' entry are skipped.
def get_namespaces(api)
    siteinfo = api.query(:meta => 'siteinfo', :siprop => 'namespaces')
    siteinfo['query']['namespaces'].values.each_with_object({}) do |entry, ids_by_name|
        canonical = entry['canonical']
        next unless canonical =~ /^[A-Z]{2}$/
        ids_by_name[canonical] = entry['id']
    end
end

# Iterates over all pages of one namespace, yielding each one to the block.
#
# api         - object whose query(params) method returns the decoded API
#               response, indexable with string keys.
# namespaceid - id of the namespace to list (sent as :gapnamespace).
# options     - Hash; a truthy options[:redirect] lists only redirects,
#               otherwise only non-redirects are listed.
#
# Yields the page's 'touched' value and its title with every whitespace
# character replaced by an underscore.
#
# Pages are fetched in batches. The title to resume from is taken from the
# legacy 'query-continue' section if the response has one, otherwise from the
# 'continue' section. Iteration ends when neither supplies a 'gapcontinue'
# value. A batch without a 'query'/'pages' section (e.g. a namespace without
# any matching page) simply yields nothing.
#
# Returns nil.
def get_page_list(api, namespaceid, options)
    redirect_filter = options[:redirect] ? 'redirects' : 'nonredirects'
    apfrom = ''
    loop do
        data = api.query(:generator => 'allpages', :gaplimit => 'max', :gapfrom => apfrom, :gapnamespace => namespaceid, :gapfilterredir => redirect_filter, :prop => 'info')

        pages = data['query'] && data['query']['pages']
        (pages || {}).each do |_pageid, page|
            yield page['touched'], page['title'].gsub(/\s/, '_')
        end

        if data['query-continue']
            # legacy continuation format
            resume_at = data['query-continue']['allpages']['gapcontinue']
        else
            # current continuation format; without a 'gapcontinue' entry the
            # generator cannot be advanced, so stop instead of looping forever
            resume_at = data['continue'] && data['continue']['gapcontinue']
        end
        return unless resume_at

        apfrom = resume_at.gsub(/\s/, '_')
    end
end

#------------------------------------------------------------------------------

# Output directory: first command line argument, or the current directory.
dir = ARGV[0] || '.'

api = MediaWikiAPI::API.new('wiki.openstreetmap.org')

namespaces = get_namespaces(api)

# add main namespace
namespaces[''] = 0

# Titles of the form 'Key:...', 'Tag:...' or 'Relation:...', optionally
# preceded by one more 'xxx:' prefix, also go into tagpages.list.
tag_page_pattern = /^([^:]+:)?(Key|Tag|Relation):(.+)$/

# Block form of File.open so both files are closed even if an API call raises.
File.open(File.join(dir, 'allpages.list'), 'w') do |allpages|
    File.open(File.join(dir, 'tagpages.list'), 'w') do |tagpages|
        namespaces.keys.sort.each do |namespace|
            id = namespaces[namespace]

            # For each namespace list the proper pages first, then the redirects.
            [['page', false], ['redirect', true]].each do |type, redirect|
                get_page_list(api, id, :redirect => redirect) do |timestamp, page|
                    line = [type, timestamp, namespace, page].join("\t")
                    allpages.puts line
                    tagpages.puts line if page =~ tag_page_pattern
                end
            end
        end
    end
end


#-- THE END -------------------------------------------------------------------