diff options
-rw-r--r-- | sources/wiki/cache.sql | 16 | ||||
-rwxr-xr-x | sources/wiki/get_image_info.rb | 8 | ||||
-rwxr-xr-x | sources/wiki/get_wiki_data.rb | 126 | ||||
-rwxr-xr-x | sources/wiki/update.sh | 12 |
4 files changed, 111 insertions, 51 deletions
diff --git a/sources/wiki/cache.sql b/sources/wiki/cache.sql new file mode 100644 index 0000000..7c56b0f --- /dev/null +++ b/sources/wiki/cache.sql @@ -0,0 +1,16 @@ +-- +-- Taginfo source: Wiki +-- +-- cache.sql +-- + +.bail ON + +CREATE TABLE cache_pages ( + title TEXT, + timestamp TEXT, + body TEXT +); + +CREATE INDEX cache_pages_title_timestamp ON cache_pages (title, timestamp); + diff --git a/sources/wiki/get_image_info.rb b/sources/wiki/get_image_info.rb index a6756e1..47d90b8 100755 --- a/sources/wiki/get_image_info.rb +++ b/sources/wiki/get_image_info.rb @@ -54,13 +54,17 @@ api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)') db = SQLite3::Database.new(dir + '/taginfo-wiki.db') db.results_as_hash = true -image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages").map{ |row| row['title'] }.select{ |title| !title.nil? && title.match(%r{^(file|image):}i) } +image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages WHERE image IS NOT NULL AND image != ''"). + map{ |row| row['title'] }. + select{ |title| title.match(%r{^(file|image):}i) } db.execute('BEGIN TRANSACTION'); +puts "Found #{ image_titles.size } different image titles" + until image_titles.empty? some_titles = image_titles.slice!(0, 10) -# puts some_titles.join(",") + "\n" + puts "Get image info for: #{ some_titles.join(' ') }" begin data = api.query(:prop => 'imageinfo', :iiprop => 'url|size|mime', :titles => some_titles.join('|'), :iiurlwidth => 10, :iiurlheight => 10) diff --git a/sources/wiki/get_wiki_data.rb b/sources/wiki/get_wiki_data.rb index 35600bb..0213b1c 100755 --- a/sources/wiki/get_wiki_data.rb +++ b/sources/wiki/get_wiki_data.rb @@ -227,14 +227,45 @@ end #------------------------------------------------------------------------------ +def get_page(db, api, page) + db.execute("SELECT * FROM cache.cache_pages WHERE title=? AND timestamp=?", page.title, page.timestamp) do |row| + page.content = row['body'] + puts "Page #{ page.title } in cache (#{ page.timestamp })" + return + end + db.execute("DELETE FROM cache.cache_pages WHERE title=?", page.title); + res = api.get(page.params) + page.content = res.body + db.execute("INSERT INTO cache.cache_pages (title, timestamp, body) VALUES (?, ?, ?)", page.title, page.timestamp, page.content); + puts "Page #{ page.title } not in cache (#{ page.timestamp })" +end + +def cleanup_cache(db, current_pagetitles) + db.execute("SELECT title FROM cache.cache_pages") do |row| + current_pagetitles.delete(row['title']) + end + to_delete = current_pagetitles.keys + puts "Deleting pages from cache: #{ to_delete.join(' ') }" + to_delete.each do |title| + db.execute("DELETE FROM cache.cache_pages WHERE title=?", title); + end +end + +#------------------------------------------------------------------------------ + dir = ARGV[0] || '.' api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?') api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)') db = SQLite3::Database.new(dir + '/taginfo-wiki.db') +db.results_as_hash = true + +db.execute("ATTACH DATABASE '#{dir}/wikicache.db' AS cache") + +db.execute('BEGIN TRANSACTION') -db.execute('BEGIN TRANSACTION'); +current_pagetitles = {} File.open(dir + '/tagpages.list') do |wikipages| wikipages.each do |line| @@ -245,8 +276,9 @@ File.open(dir + '/tagpages.list') do |wikipages| reason = page.check_title if reason == :ok - res = api.get(page.params) - page.content = res.body + current_pagetitles[page.title] = page.timestamp + + get_page(db, api, page) page.parse_content do |template| puts "Template: #{template.name} [#{template.parameters.join(',')}] #{template.named_parameters.inspect}" @@ -259,54 +291,54 @@ File.open(dir + '/tagpages.list') do |wikipages| end if template.name =~ /(Key|Value)Description$/ page.has_templ = true - end - if template.named_parameters['description'] - desc = [] - template.named_parameters['description'].each do |i| - if i.class == Template - desc << ' ' << i.parameters.join('=') << ' ' + if template.named_parameters['description'] + desc = [] + template.named_parameters['description'].each do |i| + if i.class == Template + desc << ' ' << i.parameters.join('=') << ' ' + else + desc << i + end + page.description = desc.join('').strip + end + end + if template.named_parameters['image'] + ititle = template.named_parameters['image'][0] + if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i) + page.image = "File:#{$2}" else - desc << i + puts "invalid image: page='#{page.title}' image='#{ititle}'" + db.execute('INSERT INTO invalid_image_titles (page_title, image_title) VALUES (?, ?)', page.title, ititle) + page.image = '' end - page.description = desc.join('').strip end - end - if template.named_parameters['image'] - ititle = template.named_parameters['image'][0] - if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i) - page.image = "File:#{$2}" - else - puts "invalid image: page='#{page.title}' image='#{ititle}'" - db.execute('INSERT INTO invalid_image_titles (page_title, image_title) VALUES (?, ?)', page.title, ititle) - page.image = '' + if template.named_parameters['group'] + page.group = template.named_parameters['group'][0] end - end - if template.named_parameters['group'] - page.group = template.named_parameters['group'][0] - end - if template.named_parameters['onNode'] == ['yes'] - page.onNode = true - end - if template.named_parameters['onWay'] == ['yes'] - page.onWay = true - end - if template.named_parameters['onArea'] == ['yes'] - page.onArea = true - end - if template.named_parameters['onRelation'] == ['yes'] - page.onRelation = true - end - if template.named_parameters['implies'] - template.named_parameters['implies'].each do |i| - if i.class == Template - page.tags_implies << i.parameters.join('=') + if template.named_parameters['onNode'] == ['yes'] + page.onNode = true + end + if template.named_parameters['onWay'] == ['yes'] + page.onWay = true + end + if template.named_parameters['onArea'] == ['yes'] + page.onArea = true + end + if template.named_parameters['onRelation'] == ['yes'] + page.onRelation = true + end + if template.named_parameters['implies'] + template.named_parameters['implies'].each do |i| + if i.class == Template + page.tags_implies << i.parameters.join('=') + end end end - end - if template.named_parameters['combination'] - template.named_parameters['combination'].each do |i| - if i.class == Template - page.tags_combination << i.parameters.join('=') + if template.named_parameters['combination'] + template.named_parameters['combination'].each do |i| + if i.class == Template + page.tags_combination << i.parameters.join('=') + end end end end @@ -319,7 +351,9 @@ File.open(dir + '/tagpages.list') do |wikipages| end end -db.execute('COMMIT'); +cleanup_cache(db, current_pagetitles) + +db.execute('COMMIT') #-- THE END ------------------------------------------------------------------- diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh index 7f7ed25..308c1b4 100755 --- a/sources/wiki/update.sh +++ b/sources/wiki/update.sh @@ -19,13 +19,19 @@ fi echo "`$DATECMD` Start wiki..." DATABASE=$DIR/taginfo-wiki.db -LOGFILE=$DIR/get_wiki_data.log +CACHEDB=$DIR/wikicache.db +LOGFILE_WIKI_DATA=$DIR/get_wiki_data.log +LOGFILE_IMAGE_INFO=$DIR/get_image_info.log rm -f $DIR/allpages.list rm -f $DIR/tagpages.list rm -f $LOGFILE rm -f $DATABASE +if [ ! -e $CACHEDB ]; then + sqlite3 $CACHEDB <cache.sql +fi + echo "`$DATECMD` Running init.sql..." sqlite3 $DATABASE <../init.sql @@ -36,10 +42,10 @@ echo "`$DATECMD` Getting page list..." ./get_page_list.rb $DIR echo "`$DATECMD` Getting wiki data..." -./get_wiki_data.rb $DIR >$LOGFILE +./get_wiki_data.rb $DIR >$LOGFILE_WIKI_DATA echo "`$DATECMD` Getting image info..." -./get_image_info.rb $DIR >$LOGFILE +./get_image_info.rb $DIR >$LOGFILE_IMAGE_INFO echo "`$DATECMD` Extracting words..." ./extract_words.rb $DIR |