diff options
author | Jochen Topf <jochen@topf.org> | 2012-10-14 19:32:20 +0200 |
---|---|---|
committer | Jochen Topf <jochen@topf.org> | 2012-10-14 19:32:20 +0200 |
commit | 47eecdd35cac768566fc8a79afe7760bbf99b75a (patch) | |
tree | 9257beb7ee8e54b06121d7a7cdb931e92847e00d | |
parent | d8296b1b2e916070e8de2c5435254d60e3b9bd3c (diff) | |
download | taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar taginfo-47eecdd35cac768566fc8a79afe7760bbf99b75a.tar.gz |
Add minimal support for wiki pages fulltext search
-rwxr-xr-x | sources/wiki/extract_words.rb | 150 | ||||
-rw-r--r-- | sources/wiki/pre.sql | 8 | ||||
-rwxr-xr-x | sources/wiki/update.sh | 3 | ||||
-rw-r--r-- | web/i18n/de.yml | 2 | ||||
-rw-r--r-- | web/i18n/en.yml | 2 | ||||
-rw-r--r-- | web/lib/api/search.rb | 25 | ||||
-rw-r--r-- | web/views/search.erb | 7 | ||||
-rw-r--r-- | web/viewsjs/search.js.erb | 22 |
8 files changed, 219 insertions, 0 deletions
diff --git a/sources/wiki/extract_words.rb b/sources/wiki/extract_words.rb new file mode 100755 index 0000000..8b018d8 --- /dev/null +++ b/sources/wiki/extract_words.rb @@ -0,0 +1,150 @@ +#!/usr/bin/ruby +#------------------------------------------------------------------------------ +# +# extract_words.rb [DIR] +# +#------------------------------------------------------------------------------ +# +# Extracts words from wiki pages into their own table for full-text search. +# +#------------------------------------------------------------------------------ +# +# Copyright (C) 2012 Jochen Topf <jochen@remote.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
#
#------------------------------------------------------------------------------

require 'rubygems'

require 'sqlite3'

#------------------------------------------------------------------------------

# In-memory index used to build the full-text search table.
#
# Usage is a three step pipeline: fill the index with #add, optionally drop
# overly common words with #cleanup, then call #invert followed by #dump to
# get one comma-separated word list per key/value pair.
class Words

  # Default threshold for #cleanup: a word with this many entries or more
  # is considered too common to be a useful search term.
  MAX_ENTRIES_PER_WORD = 10

  def initialize
    # word => list of [key, value] pairs the word was seen with
    @words = Hash.new
    # flat list of [key, value, word] triples, filled by #invert
    @kvw = []
  end

  # Record that +word+ occurs on the wiki page for +key+/+value+.
  #
  # key   - tag key of the page
  # value - tag value of the page (may be nil)
  # lang  - language of the page; accepted for interface symmetry with
  #         WordExtractor but not stored
  # word  - the word to index
  def add(key, value, lang, word)
    (@words[word] ||= []) << [key, value]
  end

  # Remove words that appear too often.
  #
  # limit - words with at least this many entries are removed
  #         (defaults to MAX_ENTRIES_PER_WORD)
  def cleanup(limit = MAX_ENTRIES_PER_WORD)
    @words.delete_if do |_word, entries|
      entries.size >= limit
    end
  end

  # Turn the word => entries index into a flat list of
  # [key, value, word] triples. A nil value is stored as ''.
  def invert
    @kvw = []
    @words.each do |word, entries|
      entries.each do |key, value|
        @kvw << [key, value || '', word]
      end
    end
  end

  # Yield (key, value, words) once per distinct key/value pair found by
  # #invert, in sorted order. +words+ is the comma-separated, sorted,
  # duplicate-free list of all words recorded for that pair.
  #
  # Nothing is yielded when the index is empty.
  def dump
    current = nil
    words = []
    @kvw.sort.uniq.each do |key, value, word|
      group = [key, value]
      if group != current
        # A new key/value pair starts: flush the previous one (if any).
        yield current[0], current[1], words.join(',') unless current.nil?
        current = group
        words = []
      end
      # Every triple contributes its word, including the first of a group.
      words << word
    end
    yield current[0], current[1], words.join(',') unless current.nil?
  end

end

#------------------------------------------------------------------------------

# Splits the text of a wiki page into words and feeds the interesting ones
# into a Words index.
class WordExtractor

  # words - the index object that receives the extracted words through
  #         add(key, value, lang, word)
  def initialize(words)
    @words = words
  end

  # Decide whether +word+ (already lower-cased) is worth indexing.
  # +key+, +value+ and +lang+ are currently not taken into account.
  def interested_in(word, key, value, lang)
    # not interested in very short words
    return false if word.size <= 2

    # digits make for bad words
    return false if word =~ /\d/

#   # not interested if word == key or == value
#   key.downcase!
#   value.downcase! unless value.nil?
#   return false if word == key || word == value

    return true
  end

  # Extract all words from +text+ and add the interesting ones to the index
  # for the page identified by +key+, +value+ and +lang+.
  #
  # Words are lower-cased before duplicates are removed, so that each word
  # is added at most once per page regardless of capitalisation. A nil
  # +text+ is ignored.
  def parse(key, value, lang, text)
    return if text.nil?

    text.scan(/\w+/).map { |word| word.downcase }.uniq.each do |word|
      if interested_in(word, key, value, lang)
        @words.add(key, value, lang, word)
      end
    end
  end

end

#------------------------------------------------------------------------------

dir = ARGV[0] || '.'
+db = SQLite3::Database.new(dir + '/taginfo-wiki.db') +db.results_as_hash = true + +words = Words.new +we = WordExtractor.new(words) + +db.execute("SELECT * FROM wikipages") do |row| +# puts "key=#{ row['key'] } value=#{ row['value'] } lang=#{ row['lang'] }" + we.parse(row['key'], row['value'], row['lang'], row['body']) +end + +words.cleanup +words.invert + +#words.dump do |key, value, words| +# puts "#{key}=#{value}: #{words}" +#end + +db.execute('BEGIN TRANSACTION'); +words.dump do |key, value, words| + db.execute('INSERT INTO words (key, value, words) VALUES (?, ?, ?)', key, value, words) +end +db.execute('COMMIT'); + +#-- THE END ------------------------------------------------------------------- diff --git a/sources/wiki/pre.sql b/sources/wiki/pre.sql index 56e9538..ebb80d3 100644 --- a/sources/wiki/pre.sql +++ b/sources/wiki/pre.sql @@ -64,6 +64,14 @@ CREATE TABLE invalid_page_titles ( title TEXT ); +DROP TABLE IF EXISTS words; + +CREATE TABLE words ( + key TEXT, + value TEXT, + words TEXT +); + DROP TABLE IF EXISTS stats; CREATE TABLE stats ( diff --git a/sources/wiki/update.sh b/sources/wiki/update.sh index 7d7dc6b..635b45c 100755 --- a/sources/wiki/update.sh +++ b/sources/wiki/update.sh @@ -38,6 +38,9 @@ echo "`$DATECMD` Getting page list..." echo "`$DATECMD` Getting wiki data..." ./get_wiki_data.rb $DIR >$LOGFILE +echo "`$DATECMD` Extracting words..." +./extract_words.rb $DIR + echo "`$DATECMD` Running post.sql..." sqlite3 $DATABASE <post.sql diff --git a/web/i18n/de.yml b/web/i18n/de.yml index 7bb86a3..cbd4aeb 100644 --- a/web/i18n/de.yml +++ b/web/i18n/de.yml @@ -124,9 +124,11 @@ pages: search: title: Suchergebnisse you_were_searching_for: Du hast gesucht nach + fulltext: Volltext no_keys: Keine Keys gefunden. no_values: Keine Values gefunden. no_tags: Keine Tags gefunden. + no_match: Nichts gefunden. keys: intro: | Diese Tabelle zeigt alle Keys, die in der Datenbank oder einer anderen Quelle vorkommen. 
diff --git a/web/i18n/en.yml b/web/i18n/en.yml index 59b63d4..712341c 100644 --- a/web/i18n/en.yml +++ b/web/i18n/en.yml @@ -120,9 +120,11 @@ pages: search: title: Search results you_were_searching_for: You were searching for + fulltext: Full text no_keys: No keys found. no_values: No values found. no_tags: No tags found. + no_match: No matches. keys: intro: | This table shows all tag keys that exist in the database or in any of the other sources. diff --git a/web/lib/api/search.rb b/web/lib/api/search.rb index 2f0ef75..eca1ed8 100644 --- a/web/lib/api/search.rb +++ b/web/lib/api/search.rb @@ -78,4 +78,29 @@ class Taginfo < Sinatra::Base }.to_json end + api(2, 'search/wikipages') do + query = params[:q].downcase + + total = @db.count('wiki.words').condition("words LIKE ('%' || ? || '%')", query).get_first_value().to_i + sel = @db.select("SELECT key, value FROM wiki.words WHERE words LIKE ('%' || ? || '%')", query) + + res = sel. + order_by(params[:sortname], params[:sortorder]) { |o| + o.key + o.value + }. + paging(params[:rp], params[:page]). + execute() + + return { + :page => params[:page].to_i, + :rp => params[:rp].to_i, + :total => total, + :data => res.map{ |row| { + :key => row['key'], + :value => row['value'] + }} + }.to_json + end + end diff --git a/web/views/search.erb b/web/views/search.erb index 92285b7..30212f7 100644 --- a/web/views/search.erb +++ b/web/views/search.erb @@ -6,6 +6,7 @@ <ul> <li><a href="#keys"><%= t.osm.keys %></a></li> <li><a href="#values"><%= t.osm.values %></a></li> + <li><a href="#fulltext"><%= t.pages.search.fulltext %></a></li> </ul> <div id="keys"> <h2><%= t.osm.keys %></h2> @@ -17,6 +18,12 @@ <table id="grid-values"> </table> </div> + <div id="fulltext"> + <h2><%= t.pages.search.fulltext %></h2> + <p class="boxpre" style="color: #f00000;">This search is experimental. It shows keys and tags that might be related to the word you searched for. 
This doesn't work if there are several words.</p> + <table id="grid-fulltext"> + </table> + </div> </div> <% javascript do JS.raw(%Q{ diff --git a/web/viewsjs/search.js.erb b/web/viewsjs/search.js.erb index eb1c9cf..be909c5 100644 --- a/web/viewsjs/search.js.erb +++ b/web/viewsjs/search.js.erb @@ -71,6 +71,28 @@ var create_flexigrid_for = { return data; } }); + }, + fulltext: function(query) { + var q = query.split('=', 2); + create_flexigrid('grid-fulltext', { + url: '/api/2/search/wikipages?q=' + encodeURIComponent(query), + colModel: [ + { display: '<%= osm.key %>', name: 'key', width: 300, sortable: true }, + { display: '<%= osm.value %>', name: 'value', width: 500, sortable: true } + ], + sortname: 'key', + sortorder: 'asc', + emptymsg: '<%= search.no_match %>', + preProcess: function(data) { + data.rows = jQuery.map(data.data, function(row, i) { + return { 'cell': [ + link_to_key(row.key), + row.value ? link_to_value(row.key, row.value) : '' + ] }; + }); + return data; + } + }); } }; |