summary refs log tree commit diff
path: root/sources
diff options
context:
space:
mode:
author Jochen Topf <jochen@topf.org> 2012-10-14 19:32:20 +0200
committer Jochen Topf <jochen@topf.org> 2012-10-14 19:32:20 +0200
commit 47eecdd35cac768566fc8a79afe7760bbf99b75a (patch)
tree 9257beb7ee8e54b06121d7a7cdb931e92847e00d /sources
parent d8296b1b2e916070e8de2c5435254d60e3b9bd3c (diff)
download taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar
taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar.gz
Add minimal support for wiki pages fulltext search
Diffstat (limited to 'sources')
-rwxr-xr-x sources/wiki/extract_words.rb 150
-rw-r--r-- sources/wiki/pre.sql 8
-rwxr-xr-x sources/wiki/update.sh 3
3 files changed, 161 insertions, 0 deletions
diff --git a/sources/wiki/extract_words.rb b/sources/wiki/extract_words.rb
new file mode 100755
index 0000000..8b018d8
--- /dev/null
+++ b/sources/wiki/extract_words.rb
@@ -0,0 +1,150 @@
+#!/usr/bin/ruby
+#------------------------------------------------------------------------------
+#
+# extract_words.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+# Extracts words from wiki pages into their own table for full-text search.
+#
+#------------------------------------------------------------------------------
+#
+# Copyright (C) 2012 Jochen Topf <jochen@remote.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'rubygems'
+
+require 'sqlite3'
+
+#------------------------------------------------------------------------------
+
+# Index of the words found on wiki pages.
+#
+# Intended call order: add for every word occurrence, cleanup to drop overly
+# common words, invert to build the row list, and finally dump to get one
+# comma-separated word list per key/value pair.
+#
+# Occurrences are counted per add call, so a word added for several pages
+# (or several times for the same page) counts each time in cleanup.
+class Words
+
+    def initialize
+        # word => list of [key, value] pairs, one per recorded occurrence
+        @words = Hash.new { |hash, word| hash[word] = [] }
+        # [key, value, word] rows, filled by invert
+        @kvw = []
+    end
+
+    # Record that word occurs on the page describing key/value.
+    # lang is accepted so callers can pass it along, but it is not stored.
+    def add(key, value, lang, word)
+        @words[word] << [key, value]
+    end
+
+    # Remove words that appear too often: a word with limit or more recorded
+    # occurrences is dropped entirely.
+    def cleanup(limit = 10)
+        @words.delete_if { |_word, entries| entries.size >= limit }
+    end
+
+    # Flatten the index into [key, value, word] rows. A nil value becomes ''
+    # because nil and String cannot be compared when dump sorts the rows.
+    def invert
+        @kvw = []
+        @words.each do |word, entries|
+            entries.each { |key, value| @kvw << [key, value || '', word] }
+        end
+    end
+
+    # Yield key, value and words once per distinct key/value pair, where
+    # words is the sorted, de-duplicated, comma-joined list of ALL words
+    # recorded for that pair. Yields nothing when there are no rows.
+    # Call invert first; before that the row list is empty.
+    def dump
+        # sort + uniq first so every word list is ordered and duplicate-free
+        groups = @kvw.sort.uniq.group_by { |key, value, _word| [key, value] }
+        groups.each do |(key, value), rows|
+            yield key, value, rows.map { |row| row[2] }.join(',')
+        end
+    end
+
+end
+
+#------------------------------------------------------------------------------
+
+# Splits wiki page text into words and hands the interesting ones to a
+# collector object that responds to add(key, value, lang, word).
+class WordExtractor
+
+    def initialize(words)
+        @words = words
+    end
+
+    # Return true if word should be indexed: it must be longer than two
+    # characters and contain no digit. key, value and lang are unused for now.
+    def interested_in(word, key, value, lang)
+#        # not interested if word == key or == value
+#        key.downcase!
+#        value.downcase! unless value.nil?
+#        return false if word == key || word == value
+        word.size > 2 && word !~ /\d/
+    end
+
+    # Add every distinct interesting word of text, lower-cased, for the page
+    # key/value/lang. Words are lower-cased BEFORE de-duplication so that
+    # "Road" and "road" are added once, not twice. A nil text adds nothing.
+    def parse(key, value, lang, text)
+        return if text.nil?
+
+        text.scan(/\w+/).map { |word| word.downcase }.uniq.each do |word|
+            if interested_in(word, key, value, lang)
+                @words.add(key, value, lang, word)
+            end
+        end
+    end
+
+end
+
+#------------------------------------------------------------------------------
+
+# Main program: index the body of every wikipages row in DIR/taginfo-wiki.db
+# (DIR defaults to '.') and store one row per key/value pair in words.
+dir = ARGV[0] || '.'
+db = SQLite3::Database.new(File.join(dir, 'taginfo-wiki.db'))
+db.results_as_hash = true
+
+words = Words.new
+we = WordExtractor.new(words)
+
+db.execute("SELECT * FROM wikipages") do |row|
+#    puts "key=#{ row['key'] } value=#{ row['value'] } lang=#{ row['lang'] }"
+    we.parse(row['key'], row['value'], row['lang'], row['body'])
+end
+
+words.cleanup
+words.invert
+
+# The block form of transaction commits when the block finishes and rolls
+# back if an INSERT raises, so a failure never leaves a transaction open.
+db.transaction do
+    words.dump do |key, value, wordlist|
+        db.execute('INSERT INTO words (key, value, words) VALUES (?, ?, ?)', key, value, wordlist)
+    end
+end
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql
index 56e9538..ebb80d3 100644
--- a/sources/wiki/pre.sql
+++ b/sources/wiki/pre.sql
@@ -64,6 +64,14 @@ CREATE TABLE invalid_page_titles (
title TEXT
);
+DROP TABLE IF EXISTS words;
+
+CREATE TABLE words ( -- filled by wiki/extract_words.rb, one row per INSERT
+ key TEXT, -- key handed to the INSERT by extract_words.rb
+ value TEXT, -- value handed to the INSERT; '' when the page value is NULL
+ words TEXT -- word list for this key/value, joined with ','
+);
+
DROP TABLE IF EXISTS stats;
CREATE TABLE stats (
diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh
index 7d7dc6b..635b45c 100755
--- a/sources/wiki/update.sh
+++ b/sources/wiki/update.sh
@@ -38,6 +38,9 @@ echo "`$DATECMD` Getting page list..."
echo "`$DATECMD` Getting wiki data..."
./get_wiki_data.rb $DIR >$LOGFILE
+echo "`$DATECMD` Extracting words..."
+./extract_words.rb $DIR # reads wikipages and fills words in $DIR/taginfo-wiki.db
+
echo "`$DATECMD` Running post.sql..."
sqlite3 $DATABASE <post.sql