Add minimal support for wiki pages fulltext search

author: Jochen Topf <jochen@topf.org> 2012-10-14 19:32:20 +0200
committer: Jochen Topf <jochen@topf.org> 2012-10-14 19:32:20 +0200
commit: 47eecdd35cac768566fc8a79afe7760bbf99b75a (patch)
tree: 9257beb7ee8e54b06121d7a7cdb931e92847e00d /sources
parent: d8296b1b2e916070e8de2c5435254d60e3b9bd3c (diff)
download: taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar
taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar.gz
3 files changed, 161 insertions, 0 deletions
diff --git a/sources/wiki/extract_words.rb b/sources/wiki/extract_words.rb
new file mode 100755
index 0000000..8b018d8
--- /dev/null
+++ b/sources/wiki/extract_words.rb
@@ -0,0 +1,150 @@
+#!/usr/bin/ruby
+#------------------------------------------------------------------------------
+#
+#  extract_words.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+#  Extracts words from wiki pages into their own table for full-text search.
+#
+#------------------------------------------------------------------------------
+#
+#  Copyright (C) 2012  Jochen Topf <jochen@remote.org>
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License along
+#  with this program; if not, write to the Free Software Foundation, Inc.,
+#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'rubygems'
+
+require 'sqlite3'
+
+#------------------------------------------------------------------------------
+
+# Collects, per word, the wiki pages (key/value pairs) it occurs on and
+# turns that index into one comma-joined word list per key/value pair.
+# Intended call order: add (repeatedly), cleanup, invert, dump.
+class Words
+
+    def initialize
+        @words = {}   # word => list of [key, value] entries
+        @kvw   = []   # [key, value, word] triples, filled by invert
+    end
+
+    # Record that +word+ occurs on the page for +key+/+value+.
+    # +lang+ is accepted but currently not stored.
+    def add(key, value, lang, word)
+        (@words[word] ||= []) << [key, value]
+    end
+
+    # Remove words that appear too often: a word with +limit+ or more
+    # recorded entries is dropped from the index.
+    def cleanup(limit = 10)
+        @words.delete_if { |_word, entries| entries.size >= limit }
+    end
+
+    # Flatten the index into [key, value, word] triples for dump;
+    # a nil value is stored as ''.
+    def invert
+        @kvw = []
+        @words.each do |word, entries|
+            entries.each { |key, value| @kvw << [key, value || '', word] }
+        end
+    end
+
+    # Yield key, value and the comma-joined words once per key/value pair,
+    # in sorted order and without duplicate words. Nothing is yielded when
+    # there are no triples. The word that opens a new group is collected
+    # like any other (it must not be dropped), and no ('', '', '') row is
+    # emitted ahead of the first group.
+    def dump
+        current = nil   # [key, value] of the group being collected
+        words = []
+        @kvw.sort.uniq.each do |key, value, word|
+            if current != [key, value]
+                yield current[0], current[1], words.join(',') if current
+                current = [key, value]
+                words = []
+            end
+            words << word
+        end
+        yield current[0], current[1], words.join(',') if current
+    end
+
+end
+
+#------------------------------------------------------------------------------
+
+# Splits wiki page text into words and hands the interesting ones to the
+# collector passed to the constructor.
+# Words shorter than three characters or containing digits are skipped.
+class WordExtractor
+
+    # +words+ must respond to add(key, value, lang, word).
+    def initialize(words)
+        @words = words
+    end
+
+    # True if +word+ is worth indexing: longer than two characters and
+    # free of digits. +key+, +value+ and +lang+ are currently unused
+    # (a check skipping words equal to the key or value was disabled).
+    def interested_in(word, key, value, lang)
+        return false if word.size <= 2   # very short words
+        return false if word =~ /\d/     # digits make for bad words
+        true
+    end
+
+    # Add every distinct interesting word of +text+ once for this page.
+    # Words are lower-cased BEFORE de-duplication, so "Road" and "road"
+    # count as a single occurrence and do not inflate the per-word entry
+    # count. A nil +text+ is treated as an empty page.
+    def parse(key, value, lang, text)
+        found = text.to_s.scan(/\w+/).map { |word| word.downcase }.uniq.sort
+        found.each do |word|
+            next unless interested_in(word, key, value, lang)
+            @words.add(key, value, lang, word)
+        end
+    end
+
+end
+
+#------------------------------------------------------------------------------
+
+# Open the wiki database in DIR (first argument, default: current directory).
+dir = ARGV[0] || '.'
+db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
+db.results_as_hash = true   # rows are addressed by column name below
+
+words = Words.new
+we = WordExtractor.new(words)
+
+# Pass 1: collect the words of every wiki page body.
+db.execute("SELECT * FROM wikipages") do |row|
+    we.parse(row['key'], row['value'], row['lang'], row['body'])
+end
+
+# Drop overly common words, then regroup by key/value for output.
+words.cleanup
+words.invert
+
+# Pass 2: write one row per key/value pair. The block form commits when
+# the block finishes and rolls back if an INSERT raises.
+db.transaction do
+    words.dump do |key, value, wordlist|
+        db.execute('INSERT INTO words (key, value, words) VALUES (?, ?, ?)', [key, value, wordlist])
+    end
+end
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql
index 56e9538..ebb80d3 100644
--- a/sources/wiki/pre.sql
+++ b/sources/wiki/pre.sql
@@ -64,6 +64,14 @@ CREATE TABLE invalid_page_titles (
     title  TEXT
 );
 
+DROP TABLE IF EXISTS words;
+
+CREATE TABLE words (
+    key   TEXT,  -- key column of the source wikipages row
+    value TEXT,  -- value column of that row; '' when it was NULL there
+    words TEXT   -- comma-separated words extracted from the page body
+);
+
 DROP TABLE IF EXISTS stats;
 
 CREATE TABLE stats (
diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh
index 7d7dc6b..635b45c 100755
--- a/sources/wiki/update.sh
+++ b/sources/wiki/update.sh
@@ -38,6 +38,9 @@ echo "`$DATECMD` Getting page list..."
 echo "`$DATECMD` Getting wiki data..."
 ./get_wiki_data.rb $DIR >$LOGFILE
 
+echo "`$DATECMD` Extracting words..."
+./extract_words.rb $DIR
+
 echo "`$DATECMD` Running post.sql..."
 sqlite3 $DATABASE <post.sql
author	Jochen Topf <jochen@topf.org>	2012-10-14 19:32:20 +0200
committer	Jochen Topf <jochen@topf.org>	2012-10-14 19:32:20 +0200
commit	47eecdd35cac768566fc8a79afe7760bbf99b75a (patch)
tree	9257beb7ee8e54b06121d7a7cdb931e92847e00d /sources
parent	d8296b1b2e916070e8de2c5435254d60e3b9bd3c (diff)
download	taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar.gz