diff options
author | Jochen Topf <jochen@topf.org> | 2012-10-14 19:32:20 +0200 |
---|---|---|
committer | Jochen Topf <jochen@topf.org> | 2012-10-14 19:32:20 +0200 |
commit | 47eecdd35cac768566fc8a79afe7760bbf99b75a (patch) | |
tree | 9257beb7ee8e54b06121d7a7cdb931e92847e00d /sources/wiki | |
parent | d8296b1b2e916070e8de2c5435254d60e3b9bd3c (diff) | |
download | taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar.gz |
Add minimal support for wiki pages fulltext search
Diffstat (limited to 'sources/wiki')
-rwxr-xr-x | sources/wiki/extract_words.rb | 150 | ||||
-rw-r--r-- | sources/wiki/pre.sql | 8 | ||||
-rwxr-xr-x | sources/wiki/update.sh | 3 |
3 files changed, 161 insertions, 0 deletions
diff --git a/sources/wiki/extract_words.rb b/sources/wiki/extract_words.rb new file mode 100755 index 0000000..8b018d8 --- /dev/null +++ b/sources/wiki/extract_words.rb @@ -0,0 +1,150 @@ +#!/usr/bin/ruby +#------------------------------------------------------------------------------ +# +# extract_words.rb [DIR] +# +#------------------------------------------------------------------------------ +# +# Extracts words from wiki pages into their own table for full-text search. +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2012 Jochen Topf <jochen@remote.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
#
#------------------------------------------------------------------------------

require 'rubygems'

require 'sqlite3'

#------------------------------------------------------------------------------

# Index of the words found on wiki pages. While pages are parsed it maps
# each word to the [key, value] pairs it was seen with; #invert and #dump
# then turn this around into one comma-separated word list per pair.
class Words

  def initialize
    @words = {}
    @kvw = []   # filled by #invert; empty until then so #dump is always safe
  end

  # Record that +word+ was seen on the page for +key+/+value+.
  # +lang+ is part of the interface but is currently unused.
  def add(key, value, lang, word)
    (@words[word] ||= []) << [key, value]
  end

  # Remove words that appear too often, i.e. words that have +limit+ or
  # more recorded entries (default 10, the original fixed threshold).
  def cleanup(limit = 10)
    @words.delete_if { |_word, entries| entries.size >= limit }
  end

  # Rebuild the flat list of [key, value, word] triples used by #dump.
  # A nil value is stored as the empty string.
  def invert
    @kvw = []
    @words.each do |word, entries|
      entries.each do |key, value|
        @kvw << [key, value || '', word]
      end
    end
  end

  # Yield key, value and the comma-joined, sorted, de-duplicated words for
  # every distinct (key, value) pair, in sorted order of the pairs.
  #
  # Every word of a pair is included (the first word of each group used to
  # be dropped), and nothing is yielded when there are no triples (an
  # empty '' / '' / '' row used to be emitted before the first real pair).
  def dump
    current = nil
    collected = []
    @kvw.sort.uniq.each do |key, value, word|
      pair = [key, value]
      if pair != current
        # A new pair starts: flush the finished one, if there was one.
        yield current[0], current[1], collected.join(',') unless current.nil?
        current = pair
        collected = []
      end
      collected << word
    end
    yield current[0], current[1], collected.join(',') unless current.nil?
  end

end

#------------------------------------------------------------------------------

# Splits page text into words and feeds the interesting ones into a Words
# index.
class WordExtractor

  # +words+ is the Words instance that receives the extracted words.
  def initialize(words)
    @words = words
  end

  # Decide whether +word+ is worth indexing. +key+, +value+ and +lang+ are
  # accepted for future filtering rules but are currently unused.
  def interested_in(word, key, value, lang)
    # not interested in very short words
    return false if word.size <= 2

    # digits make for bad words
    return false unless word.index(/\d/).nil?

    true
  end

  # Extract the words of +text+ and add the interesting ones for the page
  # identified by +key+, +value+ and +lang+.
  #
  # Words are lower-cased BEFORE duplicates are removed, so a word that
  # occurs in different capitalisations on one page is added only once and
  # does not count several times toward the Words#cleanup threshold.
  # A nil +text+ is treated as an empty page.
  def parse(key, value, lang, text)
    text.to_s.scan(/\w+/).map { |raw| raw.downcase }.uniq.each do |word|
      @words.add(key, value, lang, word) if interested_in(word, key, value, lang)
    end
  end

end

#------------------------------------------------------------------------------

dir = ARGV[0] || '.'
+db = SQLite3::Database.new(dir + '/taginfo-wiki.db') +db.results_as_hash = true + +words = Words.new +we = WordExtractor.new(words) + +db.execute("SELECT * FROM wikipages") do |row| +# puts "key=#{ row['key'] } value=#{ row['value'] } lang=#{ row['lang'] }" + we.parse(row['key'], row['value'], row['lang'], row['body']) +end + +words.cleanup +words.invert + +#words.dump do |key, value, words| +# puts "#{key}=#{value}: #{words}" +#end + +db.execute('BEGIN TRANSACTION'); +words.dump do |key, value, words| + db.execute('INSERT INTO words (key, value, words) VALUES (?, ?, ?)', key, value, words) +end +db.execute('COMMIT'); + +#-- THE END ------------------------------------------------------------------- diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql index 56e9538..ebb80d3 100644 --- a/sources/wiki/pre.sql +++ b/sources/wiki/pre.sql @@ -64,6 +64,14 @@ CREATE TABLE invalid_page_titles ( title TEXT ); +DROP TABLE IF EXISTS words; + +CREATE TABLE words ( + key TEXT, + value TEXT, + words TEXT +); + DROP TABLE IF EXISTS stats; CREATE TABLE stats ( diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh index 7d7dc6b..635b45c 100755 --- a/sources/wiki/update.sh +++ b/sources/wiki/update.sh @@ -38,6 +38,9 @@ echo "`$DATECMD` Getting page list..." echo "`$DATECMD` Getting wiki data..." ./get_wiki_data.rb $DIR >$LOGFILE +echo "`$DATECMD` Extracting words..." +./extract_words.rb $DIR + echo "`$DATECMD` Running post.sql..." sqlite3 $DATABASE <post.sql |