summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- sources/wiki/cache.sql 16
-rwxr-xr-x sources/wiki/get_image_info.rb 8
-rwxr-xr-x sources/wiki/get_wiki_data.rb 126
-rwxr-xr-x sources/wiki/update.sh 12
4 files changed, 111 insertions, 51 deletions
diff --git a/sources/wiki/cache.sql b/sources/wiki/cache.sql
new file mode 100644
index 0000000..7c56b0f
--- /dev/null
+++ b/sources/wiki/cache.sql
@@ -0,0 +1,16 @@
+--
+-- Taginfo source: Wiki
+--
+-- cache.sql
+--
+
+.bail ON
+
+CREATE TABLE cache_pages (
+ title TEXT,
+ timestamp TEXT,
+ body TEXT
+);
+
+CREATE INDEX cache_pages_title_timestamp ON cache_pages (title, timestamp);
+
diff --git a/sources/wiki/get_image_info.rb b/sources/wiki/get_image_info.rb
index a6756e1..47d90b8 100755
--- a/sources/wiki/get_image_info.rb
+++ b/sources/wiki/get_image_info.rb
@@ -54,13 +54,17 @@ api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')
db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
db.results_as_hash = true
-image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages").map{ |row| row['title'] }.select{ |title| !title.nil? && title.match(%r{^(file|image):}i) }
+image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages WHERE image IS NOT NULL AND image != ''").
+ map{ |row| row['title'] }.
+ select{ |title| title.match(%r{^(file|image):}i) }
db.execute('BEGIN TRANSACTION');
+puts "Found #{ image_titles.size } different image titles"
+
until image_titles.empty?
some_titles = image_titles.slice!(0, 10)
-# puts some_titles.join(",") + "\n"
+ puts "Get image info for: #{ some_titles.join(' ') }"
begin
data = api.query(:prop => 'imageinfo', :iiprop => 'url|size|mime', :titles => some_titles.join('|'), :iiurlwidth => 10, :iiurlheight => 10)
diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb
index 35600bb..0213b1c 100755
--- a/sources/wiki/get_wiki_data.rb
+++ b/sources/wiki/get_wiki_data.rb
@@ -227,14 +227,45 @@ end
#------------------------------------------------------------------------------
+def get_page(db, api, page)
+ db.execute("SELECT * FROM cache.cache_pages WHERE title=? AND timestamp=?", page.title, page.timestamp) do |row|
+ page.content = row['body']
+ puts "Page #{ page.title } in cache (#{ page.timestamp })"
+ return
+ end
+ db.execute("DELETE FROM cache.cache_pages WHERE title=?", page.title);
+ res = api.get(page.params)
+ page.content = res.body
+ db.execute("INSERT INTO cache.cache_pages (title, timestamp, body) VALUES (?, ?, ?)", page.title, page.timestamp, page.content);
+ puts "Page #{ page.title } not in cache (#{ page.timestamp })"
+end
+
+def cleanup_cache(db, current_pagetitles)
+ db.execute("SELECT title FROM cache.cache_pages") do |row|
+ current_pagetitles.delete(row['title'])
+ end
+ to_delete = current_pagetitles.keys
+ puts "Deleting pages from cache: #{ to_delete.join(' ') }"
+ to_delete.each do |title|
+ db.execute("DELETE FROM cache.cache_pages WHERE title=?", title);
+ end
+end
+
+#------------------------------------------------------------------------------
+
dir = ARGV[0] || '.'
api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?')
api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')
db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
+db.results_as_hash = true
+
+db.execute("ATTACH DATABASE '#{dir}/wikicache.db' AS cache")
+
+db.execute('BEGIN TRANSACTION')
-db.execute('BEGIN TRANSACTION');
+current_pagetitles = {}
File.open(dir + '/tagpages.list') do |wikipages|
wikipages.each do |line|
@@ -245,8 +276,9 @@ File.open(dir + '/tagpages.list') do |wikipages|
reason = page.check_title
if reason == :ok
- res = api.get(page.params)
- page.content = res.body
+ current_pagetitles[page.title] = page.timestamp
+
+ get_page(db, api, page)
page.parse_content do |template|
puts "Template: #{template.name} [#{template.parameters.join(',')}] #{template.named_parameters.inspect}"
@@ -259,54 +291,54 @@ File.open(dir + '/tagpages.list') do |wikipages|
end
if template.name =~ /(Key|Value)Description$/
page.has_templ = true
- end
- if template.named_parameters['description']
- desc = []
- template.named_parameters['description'].each do |i|
- if i.class == Template
- desc << ' ' << i.parameters.join('=') << ' '
+ if template.named_parameters['description']
+ desc = []
+ template.named_parameters['description'].each do |i|
+ if i.class == Template
+ desc << ' ' << i.parameters.join('=') << ' '
+ else
+ desc << i
+ end
+ page.description = desc.join('').strip
+ end
+ end
+ if template.named_parameters['image']
+ ititle = template.named_parameters['image'][0]
+ if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i)
+ page.image = "File:#{$2}"
else
- desc << i
+ puts "invalid image: page='#{page.title}' image='#{ititle}'"
+ db.execute('INSERT INTO invalid_image_titles (page_title, image_title) VALUES (?, ?)', page.title, ititle)
+ page.image = ''
end
- page.description = desc.join('').strip
end
- end
- if template.named_parameters['image']
- ititle = template.named_parameters['image'][0]
- if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i)
- page.image = "File:#{$2}"
- else
- puts "invalid image: page='#{page.title}' image='#{ititle}'"
- db.execute('INSERT INTO invalid_image_titles (page_title, image_title) VALUES (?, ?)', page.title, ititle)
- page.image = ''
+ if template.named_parameters['group']
+ page.group = template.named_parameters['group'][0]
end
- end
- if template.named_parameters['group']
- page.group = template.named_parameters['group'][0]
- end
- if template.named_parameters['onNode'] == ['yes']
- page.onNode = true
- end
- if template.named_parameters['onWay'] == ['yes']
- page.onWay = true
- end
- if template.named_parameters['onArea'] == ['yes']
- page.onArea = true
- end
- if template.named_parameters['onRelation'] == ['yes']
- page.onRelation = true
- end
- if template.named_parameters['implies']
- template.named_parameters['implies'].each do |i|
- if i.class == Template
- page.tags_implies << i.parameters.join('=')
+ if template.named_parameters['onNode'] == ['yes']
+ page.onNode = true
+ end
+ if template.named_parameters['onWay'] == ['yes']
+ page.onWay = true
+ end
+ if template.named_parameters['onArea'] == ['yes']
+ page.onArea = true
+ end
+ if template.named_parameters['onRelation'] == ['yes']
+ page.onRelation = true
+ end
+ if template.named_parameters['implies']
+ template.named_parameters['implies'].each do |i|
+ if i.class == Template
+ page.tags_implies << i.parameters.join('=')
+ end
end
end
- end
- if template.named_parameters['combination']
- template.named_parameters['combination'].each do |i|
- if i.class == Template
- page.tags_combination << i.parameters.join('=')
+ if template.named_parameters['combination']
+ template.named_parameters['combination'].each do |i|
+ if i.class == Template
+ page.tags_combination << i.parameters.join('=')
+ end
end
end
end
@@ -319,7 +351,9 @@ File.open(dir + '/tagpages.list') do |wikipages|
end
end
-db.execute('COMMIT');
+cleanup_cache(db, current_pagetitles)
+
+db.execute('COMMIT')
#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh
index 7f7ed25..308c1b4 100755
--- a/sources/wiki/update.sh
+++ b/sources/wiki/update.sh
@@ -19,13 +19,19 @@ fi
echo "`$DATECMD` Start wiki..."
DATABASE=$DIR/taginfo-wiki.db
-LOGFILE=$DIR/get_wiki_data.log
+CACHEDB=$DIR/wikicache.db
+LOGFILE_WIKI_DATA=$DIR/get_wiki_data.log
+LOGFILE_IMAGE_INFO=$DIR/get_image_info.log
rm -f $DIR/allpages.list
rm -f $DIR/tagpages.list
rm -f $LOGFILE
rm -f $DATABASE
+if [ ! -e $CACHEDB ]; then
+ sqlite3 $CACHEDB <cache.sql
+fi
+
echo "`$DATECMD` Running init.sql..."
sqlite3 $DATABASE <../init.sql
@@ -36,10 +42,10 @@ echo "`$DATECMD` Getting page list..."
./get_page_list.rb $DIR
echo "`$DATECMD` Getting wiki data..."
-./get_wiki_data.rb $DIR >$LOGFILE
+./get_wiki_data.rb $DIR >$LOGFILE_WIKI_DATA
echo "`$DATECMD` Getting image info..."
-./get_image_info.rb $DIR >$LOGFILE
+./get_image_info.rb $DIR >$LOGFILE_IMAGE_INFO
echo "`$DATECMD` Extracting words..."
./extract_words.rb $DIR