summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jochen Topf <jochen@topf.org> 2012-10-14 19:32:20 +0200
committer Jochen Topf <jochen@topf.org> 2012-10-14 19:32:20 +0200
commit 47eecdd35cac768566fc8a79afe7760bbf99b75a (patch)
tree 9257beb7ee8e54b06121d7a7cdb931e92847e00d
parent d8296b1b2e916070e8de2c5435254d60e3b9bd3c (diff)
download taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar
taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar.gz
Add minimal support for wiki pages fulltext search
-rwxr-xr-x sources/wiki/extract_words.rb 150
-rw-r--r-- sources/wiki/pre.sql 8
-rwxr-xr-x sources/wiki/update.sh 3
-rw-r--r-- web/i18n/de.yml 2
-rw-r--r-- web/i18n/en.yml 2
-rw-r--r-- web/lib/api/search.rb 25
-rw-r--r-- web/views/search.erb 7
-rw-r--r-- web/viewsjs/search.js.erb 22
8 files changed, 219 insertions, 0 deletions
diff --git a/sources/wiki/extract_words.rb b/sources/wiki/extract_words.rb
new file mode 100755
index 0000000..8b018d8
--- /dev/null
+++ b/sources/wiki/extract_words.rb
@@ -0,0 +1,150 @@
+#!/usr/bin/ruby
+#------------------------------------------------------------------------------
+#
+# extract_words.rb [DIR]
+#
+#------------------------------------------------------------------------------
+#
+# Extracts words from wiki pages into their own table for full-text search.
+#
+#------------------------------------------------------------------------------
+#
+# Copyright (C) 2012 Jochen Topf <jochen@remote.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+#------------------------------------------------------------------------------
+
+require 'rubygems'
+
+require 'sqlite3'
+
+#------------------------------------------------------------------------------
+
+# Collects, for every word, the key/value pairs it was seen with and turns
+# that index around into one word list per key/value pair.
+#
+# Intended call order: #add (repeatedly), #cleanup, #invert, #dump.
+class Words
+
+  # A word recorded for this many entries (or more) is dropped by #cleanup.
+  TOO_COMMON = 10
+
+  def initialize
+    # word => list of [key, value] entries the word was recorded for
+    @words = Hash.new { |hash, word| hash[word] = [] }
+    # flat list of [key, value, word] triples, filled by #invert
+    @kvw = []
+  end
+
+  # Records that +word+ belongs to the entry +key+/+value+.
+  # +value+ may be nil; +lang+ is accepted but not stored.
+  def add(key, value, lang, word)
+    @words[word] << [key, value]
+  end
+
+  # Remove words that appear too often
+  def cleanup
+    @words.delete_if { |_word, entries| entries.size >= TOO_COMMON }
+  end
+
+  # Builds the list of [key, value, word] triples from the word index.
+  # A nil value is stored as the empty string.
+  def invert
+    @kvw = @words.flat_map do |word, entries|
+      entries.map { |key, value| [key, value || '', word] }
+    end
+  end
+
+  # Yields key, value and the comma-separated, sorted, duplicate-free word
+  # list once for every key/value pair, in sorted order.
+  #
+  # Every word of a pair is included (the first word of each group used to be
+  # lost) and nothing is yielded when there is no data (an empty record used
+  # to be emitted first).
+  def dump
+    groups = @kvw.sort.uniq.group_by { |key, value, _word| [key, value] }
+    groups.each do |(key, value), triples|
+      yield key, value, triples.map(&:last).join(',')
+    end
+  end
+
+end
+
+#------------------------------------------------------------------------------
+
+# Splits page text into words and records the interesting ones in the word
+# collection passed to the constructor.
+class WordExtractor
+
+  # +words+ must respond to add(key, value, lang, word).
+  def initialize(words)
+    @words = words
+  end
+
+  # Returns true if +word+ should be recorded. +key+, +value+ and +lang+ are
+  # accepted but not looked at. (A check rejecting words equal to the key or
+  # the value was disabled in the original code and stays disabled.)
+  def interested_in(word, key, value, lang)
+    # not interested in very short words
+    return false if word.size <= 2
+
+    # digits make for bad words
+    return false if word =~ /\d/
+
+    true
+  end
+
+  # Records every interesting word of +text+ for +key+/+value+.
+  # A nil +text+ is treated as empty instead of raising.
+  #
+  # NOTE(review): on Ruby 1.9+ \w matches ASCII word characters only, so words
+  # containing non-ASCII letters are split - confirm whether that is intended.
+  def parse(key, value, lang, text)
+    return if text.nil?
+
+    # Lower-case before de-duplicating so that "Road" and "road" are recorded
+    # once per text, not twice.
+    text.scan(/\w+/).map(&:downcase).uniq.sort.each do |word|
+      @words.add(key, value, lang, word) if interested_in(word, key, value, lang)
+    end
+  end
+
+end
+
+#------------------------------------------------------------------------------
+
+# Main program: read all wiki pages from taginfo-wiki.db in DIR (default: the
+# current directory), extract their words and store one word list per
+# key/value pair in the "words" table.
+dir = ARGV[0] || '.'
+db = SQLite3::Database.new(File.join(dir, 'taginfo-wiki.db'))
+db.results_as_hash = true
+
+words = Words.new
+we = WordExtractor.new(words)
+
+db.execute("SELECT * FROM wikipages") do |row|
+  we.parse(row['key'], row['value'], row['lang'], row['body'])
+end
+
+words.cleanup
+words.invert
+
+# The block form commits when the block finishes and rolls back if it raises,
+# so a failed run does not leave a transaction open.
+db.transaction do
+  # "wordlist" rather than "words": the block parameter must not shadow the
+  # Words instance above.
+  words.dump do |key, value, wordlist|
+    db.execute('INSERT INTO words (key, value, words) VALUES (?, ?, ?)', [key, value, wordlist])
+  end
+end
+
+#-- THE END -------------------------------------------------------------------
diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql
index 56e9538..ebb80d3 100644
--- a/sources/wiki/pre.sql
+++ b/sources/wiki/pre.sql
@@ -64,6 +64,14 @@ CREATE TABLE invalid_page_titles (
title TEXT
);
+DROP TABLE IF EXISTS words;
+
+-- Word lists for the full-text search, filled by extract_words.rb:
+-- one row per key/value pair, "words" is a comma-separated list of words.
+CREATE TABLE words (
+ key TEXT,
+ value TEXT,
+ words TEXT
+);
+
DROP TABLE IF EXISTS stats;
CREATE TABLE stats (
diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh
index 7d7dc6b..635b45c 100755
--- a/sources/wiki/update.sh
+++ b/sources/wiki/update.sh
@@ -38,6 +38,9 @@ echo "`$DATECMD` Getting page list..."
echo "`$DATECMD` Getting wiki data..."
./get_wiki_data.rb $DIR >$LOGFILE
+echo "`$DATECMD` Extracting words..."
+./extract_words.rb $DIR
+
echo "`$DATECMD` Running post.sql..."
sqlite3 $DATABASE <post.sql
diff --git a/web/i18n/de.yml b/web/i18n/de.yml
index 7bb86a3..cbd4aeb 100644
--- a/web/i18n/de.yml
+++ b/web/i18n/de.yml
@@ -124,9 +124,11 @@ pages:
search:
title: Suchergebnisse
you_were_searching_for: Du hast gesucht nach
+ fulltext: Volltext
no_keys: Keine Keys gefunden.
no_values: Keine Values gefunden.
no_tags: Keine Tags gefunden.
+ no_match: Nichts gefunden.
keys:
intro: |
Diese Tabelle zeigt alle Keys, die in der Datenbank oder einer anderen Quelle vorkommen.
diff --git a/web/i18n/en.yml b/web/i18n/en.yml
index 59b63d4..712341c 100644
--- a/web/i18n/en.yml
+++ b/web/i18n/en.yml
@@ -120,9 +120,11 @@ pages:
search:
title: Search results
you_were_searching_for: You were searching for
+ fulltext: Full text
no_keys: No keys found.
no_values: No values found.
no_tags: No tags found.
+ no_match: No matches.
keys:
intro: |
This table shows all tag keys that exist in the database or in any of the other sources.
diff --git a/web/lib/api/search.rb b/web/lib/api/search.rb
index 2f0ef75..eca1ed8 100644
--- a/web/lib/api/search.rb
+++ b/web/lib/api/search.rb
@@ -78,4 +78,29 @@ class Taginfo < Sinatra::Base
}.to_json
end
+ # Full-text search: returns the key/value pairs from wiki.words whose word
+ # list contains the lower-cased query string as a substring.
+ api(2, 'search/wikipages') do
+     term = params[:q].downcase
+     like_clause = "words LIKE ('%' || ? || '%')"
+
+     # Total number of matches, independent of paging.
+     total = @db.count('wiki.words').condition(like_clause, term).get_first_value.to_i
+
+     rows = @db.select("SELECT key, value FROM wiki.words WHERE #{like_clause}", term).
+         order_by(params[:sortname], params[:sortorder]) { |o|
+             o.key
+             o.value
+         }.
+         paging(params[:rp], params[:page]).
+         execute
+
+     data = rows.map do |row|
+         { :key => row['key'], :value => row['value'] }
+     end
+
+     return {
+         :page => params[:page].to_i,
+         :rp => params[:rp].to_i,
+         :total => total,
+         :data => data
+     }.to_json
+ end
+
end
diff --git a/web/views/search.erb b/web/views/search.erb
index 92285b7..30212f7 100644
--- a/web/views/search.erb
+++ b/web/views/search.erb
@@ -6,6 +6,7 @@
<ul>
<li><a href="#keys"><%= t.osm.keys %></a></li>
<li><a href="#values"><%= t.osm.values %></a></li>
+ <li><a href="#fulltext"><%= t.pages.search.fulltext %></a></li>
</ul>
<div id="keys">
<h2><%= t.osm.keys %></h2>
@@ -17,6 +18,12 @@
<table id="grid-values">
</table>
</div>
+ <div id="fulltext">
+ <h2><%= t.pages.search.fulltext %></h2>
+ <p class="boxpre" style="color: #f00000;">This search is experimental. It shows keys and tags that might be related to the word you searched for. This doesn't work if there are several words.</p>
+ <table id="grid-fulltext">
+ </table>
+ </div>
</div>
<% javascript do
JS.raw(%Q{
diff --git a/web/viewsjs/search.js.erb b/web/viewsjs/search.js.erb
index eb1c9cf..be909c5 100644
--- a/web/viewsjs/search.js.erb
+++ b/web/viewsjs/search.js.erb
@@ -71,6 +71,28 @@ var create_flexigrid_for = {
return data;
}
});
+ },
+ // Sets up the grid with the full-text search results for the given query.
+ // The whole query string is sent to the API as is (the unused split at '='
+ // has been removed).
+ fulltext: function(query) {
+     create_flexigrid('grid-fulltext', {
+         url: '/api/2/search/wikipages?q=' + encodeURIComponent(query),
+         colModel: [
+             { display: '<%= osm.key %>', name: 'key', width: 300, sortable: true },
+             { display: '<%= osm.value %>', name: 'value', width: 500, sortable: true }
+         ],
+         sortname: 'key',
+         sortorder: 'asc',
+         emptymsg: '<%= search.no_match %>',
+         // Turn each API result row into a grid row; the value cell stays
+         // empty when the row has no value.
+         preProcess: function(data) {
+             data.rows = jQuery.map(data.data, function(row, i) {
+                 return { 'cell': [
+                     link_to_key(row.key),
+                     row.value ? link_to_value(row.key, row.value) : ''
+                 ] };
+             });
+             return data;
+         }
+     });
+ }
};