From 63f87a5ff3a996352cd733b6ac2e432c9e86ab97 Mon Sep 17 00:00:00 2001 From: William Morgan Date: Wed, 24 Jun 2009 13:30:43 -0400 Subject: [PATCH] bugfix: dates need to be truncated for xapian to index If dates are way out of range, the current indexing process both dies and generates bad doc ids. This patch forces dates to be within a reasonable range (currently between 1969 and 2038). Not necessarily the best solution. --- lib/sup/xapian_index.rb | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/lib/sup/xapian_index.rb b/lib/sup/xapian_index.rb index d064ffc..7bbc41c 100644 --- a/lib/sup/xapian_index.rb +++ b/lib/sup/xapian_index.rb @@ -10,6 +10,12 @@ module Redwood class XapianIndex < BaseIndex STEM_LANGUAGE = "english" + ## dates are converted to integers for xapian, and are used for document ids, + ## so we must ensure they're reasonably valid. this typically only affect + ## spam. + MIN_DATE = Time.at 0 + MAX_DATE = Time.at(2**31) + def initialize dir=BASE_DIR super @@ -307,8 +313,8 @@ class XapianIndex < BaseIndex DOCID_SCALE = 2.0**32 TIME_SCALE = 2.0**27 MIDDLE_DATE = Time.gm(2011) - def assign_docid m - t = (m.date.to_i - MIDDLE_DATE.to_i).to_f + def assign_docid m, truncated_date + t = (truncated_date.to_i - MIDDLE_DATE.to_i).to_f docid = (DOCID_SCALE - DOCID_SCALE/(Math::E**(-(t/TIME_SCALE)) + 1)).to_i begin while @assigned_docids.member? 
[docid].pack("N") @@ -400,11 +406,25 @@ class XapianIndex < BaseIndex text << [body_text, PREFIX['body']] m.attachments.each { |a| text << [a, PREFIX['attachment']] } + truncated_date = if m.date < MIN_DATE + Redwood::log "warning: adjusting too-low date #{m.date} for indexing" + MIN_DATE + elsif m.date > MAX_DATE + Redwood::log "warning: adjusting too-high date #{m.date} for indexing" + MAX_DATE + else + m.date + end + # Date value for range queries - date_value = Xapian.sortable_serialise(m.date.to_i) + date_value = begin + Xapian.sortable_serialise truncated_date.to_i + rescue TypeError + Xapian.sortable_serialise 0 + end doc = Xapian::Document.new - docid = @docids[m.id] || assign_docid(m) + docid = @docids[m.id] || assign_docid(m, truncated_date) @term_generator.document = doc text.each { |text,prefix| @term_generator.index_text text, 1, prefix } -- 2.43.0