git.cworth.org Git - sup/blob - lib/sup/index.rb

   1 ## the index structure for redwood. interacts with ferret.
   2
   3 require 'thread'
   4 require 'fileutils'
   5 require_gem 'ferret', ">= 0.10.13"
   6
   7 module Redwood
   8
   9 class IndexError < StandardError
  10   attr_reader :source
  11
  12   def initialize source, s
  13     super s
  14     @source = source
  15   end
  16 end
  17
  18 class Index
  19   include Singleton
  20
  21   attr_reader :index # debugging only
  22
  ## sets up an empty, clean source table rooted at dir and registers
  ## this object as the singleton instance.
  def initialize dir=BASE_DIR
    @sources_dirty = false
    @sources = {}
    @mutex = Mutex.new
    @dir = dir

    self.class.i_am_the_instance self
  end
  31
  ## loads the source list first, then the index itself.
  def load; load_sources; load_index; end
  36
  ## makes sure the base directory exists, then saves the sources and
  ## the index (in that order).
  def save
    ## File.exist? rather than the long-deprecated File.exists?, which
    ## newer rubies no longer provide.
    FileUtils.mkdir_p @dir unless File.exist? @dir
    save_sources
    save_index
  end
  42
  ## registers source in the source table and marks the table dirty.
  ## a source without an id is given the first unused id at or above
  ## the current table size. returns the source.
  ##
  ## raises RuntimeError ("duplicate source!") if the source is already
  ## registered.
  def add_source source
    ## @sources is keyed by id, so the duplicate check has to look at
    ## the values, not the keys.
    raise "duplicate source!" if @sources.value? source
    @sources_dirty = true
    unless source.id
      ## ids need not be contiguous, so the table size alone may
      ## already be taken; skip forward rather than clobber an entry.
      fresh = @sources.size
      fresh += 1 while @sources.member? fresh
      source.id = fresh
    end
    @sources[source.id] = source
  end
  51
  ## first registered source that claims name (per is_source_for?), or
  ## nil if none does.
  def source_for name
    candidates = @sources.values
    candidates.find { |src| src.is_source_for? name }
  end
  ## all registered sources whose usual? flag is set.
  def usual_sources
    @sources.values.select { |src| src.usual? }
  end
  54
  ## opens the ferret index stored under dir, creating it (with the
  ## field layout below) if the directory does not exist yet. all
  ## fields use the whitespace analyzer except :body, which uses the
  ## standard analyzer. sets and returns @index.
  def load_index dir=File.join(@dir, "ferret")
    wsa = Ferret::Analysis::WhiteSpaceAnalyzer.new false
    sa = Ferret::Analysis::StandardAnalyzer.new
    analyzer = Ferret::Analysis::PerFieldAnalyzer.new wsa
    analyzer[:body] = sa

    ## File.exist? rather than the long-deprecated File.exists?, which
    ## newer rubies no longer provide.
    if File.exist? dir
      Redwood::log "loading index"
    else
      Redwood::log "creating index"
      field_infos = Ferret::Index::FieldInfos.new :store => :yes
      field_infos.add_field :message_id
      field_infos.add_field :source_id
      field_infos.add_field :source_info
      field_infos.add_field :date, :index => :untokenized
      field_infos.add_field :body, :store => :no
      field_infos.add_field :label
      field_infos.add_field :subject
      field_infos.add_field :from
      field_infos.add_field :to
      field_infos.add_field :refs
      field_infos.add_field :snippet, :index => :no, :term_vector => :no
      field_infos.create_index dir
    end

    ## the index is opened the same way whether or not it was just created
    @index = Ferret::Index::Index.new(:path => dir, :analyzer => analyzer)
  end
  82
  ## update the message by deleting and re-adding.
  ##
  ## source and source_info default to the values stored in the
  ## message's existing index entry. raises RuntimeError if neither an
  ## existing entry nor explicit values are available, or if the entry
  ## found does not carry this message's id. returns whatever
  ## add_message returns.
  def update_message m, source=nil, source_info=nil
    docid, entry = load_entry_for_id m.id
    if entry
      source ||= entry[:source_id].to_i
      source_info ||= entry[:source_info].to_i
    end
    raise "no entry and no source info for message #{m.id}" unless source && source_info

    ## with explicit source info but no existing entry there is nothing
    ## to delete (docid is nil), so go straight to adding.
    if entry
      raise "deleting non-corresponding entry #{docid}" unless @index[docid][:message_id] == m.id
      @index.delete docid
    end
    add_message m
  end
  96
  ## intentionally a no-op: nothing needs to be written out explicitly
  ## for the index, apparently. fn is accepted but unused.
  def save_index fn=File.join(@dir, "ferret")
    nil
  end
 100
 101   def contains_id? id
 102     @index.search(Ferret::Search::TermQuery.new(:message_id, id)).total_hits > 0
 103   end
 104   def contains? m; contains_id? m.id; end
 105   def size; @index.size; end
 106
 107   ## you should probably not call this on a block that doesn't break
 108   ## rather quickly because the results will probably be, as we say
 109   ## in scotland, frikkin' huuuge.
 110   EACH_BY_DATE_NUM = 100
 111   def each_id_by_date opts={}
 112     return if @index.size == 0 # otherwise ferret barfs
 113     query = build_query opts
 114     offset = 0
 115     while true
 116       results = @index.search(query, :sort => "date DESC", :limit => EACH_BY_DATE_NUM, :offset => offset)
 117       Redwood::log "got #{results.total_hits} results for query (offset #{offset}) #{query.inspect}"
 118       results.hits.each { |hit| yield @index[hit.doc][:message_id], lambda { build_message hit.doc } }
 119       break if offset >= results.total_hits - EACH_BY_DATE_NUM
 120       offset += EACH_BY_DATE_NUM
 121     end
 122   end
 123
 124   def num_results_for opts={}
 125     query = build_query opts
 126     x = @index.search(query).total_hits
 127     Redwood::log "num_results_for: have #{x} for query #{query}"
 128     x
 129   end
 130
 131   SAME_SUBJECT_DATE_LIMIT = 7
 132   def each_message_in_thread_for m, opts={}
 133     messages = {}
 134     searched = {}
 135     num_queries = 0
 136
 137     ## temporarily disabling subject searching because it's a
 138     ## significant slowdown.
 139     ##
 140     ## TODO: make this configurable, i guess
 141     if false
 142       date_min = m.date - (SAME_SUBJECT_DATE_LIMIT * 12 * 3600)
 143       date_max = m.date + (SAME_SUBJECT_DATE_LIMIT * 12 * 3600)
 144
 145       q = Ferret::Search::BooleanQuery.new true
 146       sq = Ferret::Search::PhraseQuery.new(:subject)
 147       wrap_subj(Message.normalize_subj(m.subj)).split(/\s+/).each do |t|
 148         sq.add_term t
 149       end
 150       q.add_query sq, :must
 151       q.add_query Ferret::Search::RangeQuery.new(:date, :>= => date_min.to_indexable_s, :<= => date_max.to_indexable_s), :must
 152
 153       pending = @index.search(q).hits.map { |hit| @index[hit.doc][:message_id] }
 154       Redwood::log "found #{pending.size} results for subject query #{q}"
 155     else
 156       pending = [m.id]
 157     end
 158
 159     until pending.empty? || (opts[:limit] && messages.size >= opts[:limit])
 160       id = pending.pop
 161       next if searched.member? id
 162       searched[id] = true
 163       q = Ferret::Search::BooleanQuery.new true
 164       q.add_query Ferret::Search::TermQuery.new(:message_id, id), :should
 165       q.add_query Ferret::Search::TermQuery.new(:refs, id), :should
 166
 167       num_queries += 1
 168       @index.search_each(q, :limit => :all) do |docid, score|
 169         break if opts[:limit] && messages.size >= opts[:limit]
 170         mid = @index[docid][:message_id]
 171         unless messages.member? mid
 172           messages[mid] ||= lambda { build_message docid }
 173           refs = @index[docid][:refs].split(" ")
 174           pending += refs
 175         end
 176       end
 177     end
 178     Redwood::log "ran #{num_queries} queries to build thread of #{messages.size} messages for #{m.id}"
 179     messages.each { |mid, builder| yield mid, builder }
 180   end
 181
 182   ## builds a message object from a ferret result
 183   def build_message docid
 184     doc = @index[docid]
 185     source = @sources[doc[:source_id].to_i]
 186     #puts "building message #{doc[:message_id]} (#{source}##{doc[:source_info]})"
 187     raise "invalid source #{doc[:source_id]}" unless source
 188
 189     m =
 190       if source.broken?
 191         nil
 192       else
 193         begin
 194           Message.new :source => source, :source_info => doc[:source_info].to_i,
 195                       :labels => doc[:label].split(" ").map { |s| s.intern },
 196                       :snippet => doc[:snippet]
 197         rescue MessageFormatError => e
 198           raise IndexError.new(source, "error building message #{doc[:message_id]} at #{source}/#{doc[:source_info]}: #{e.message}")
 199         rescue SourceError => e
 200           nil
 201         end
 202       end
 203
 204     unless m
 205       fake_header = {
 206         "date" => Time.at(doc[:date].to_i),
 207         "subject" => unwrap_subj(doc[:subject]),
 208         "from" => doc[:from],
 209         "to" => doc[:to],
 210         "message-id" => doc[:message_id],
 211         "references" => doc[:refs],
 212       }
 213
 214       m = Message.new :labels => doc[:label].split(" ").map { |s| s.intern },
 215                       :snippet => doc[:snippet], :header => fake_header,
 216                       :body => <<EOS
 217 #{doc[:snippet]}...
 218
 219 An error occurred while loading this message. It is possible that the source
 220 has changed, or (in the case of remote sources) is down.
 221
 222 The error message was:
 223   #{source.broken_msg}
 224 EOS
 225     end
 226     m
 227   end
 228
 229   def fresh_thread_id; @next_thread_id += 1; end
 230   def wrap_subj subj; "__START_SUBJECT__ #{subj} __END_SUBJECT__"; end
 231   def unwrap_subj subj; subj =~ /__START_SUBJECT__ (.*?) __END_SUBJECT__/ && $1; end
 232
 233   def add_message m
 234     return false if contains? m
 235
 236     source_id =
 237       if m.source.is_a? Integer
 238         m.source
 239       else
 240         m.source.id or raise "unregistered source #{m.source}"
 241       end
 242
 243     to = (m.to + m.cc + m.bcc).map { |x| x.email }.join(" ")
 244     d = {
 245       :message_id => m.id,
 246       :source_id => source_id,
 247       :source_info => m.source_info,
 248       :date => m.date.to_indexable_s,
 249       :body => m.content,
 250       :snippet => m.snippet,
 251       :label => m.labels.join(" "),
 252       :from => m.from ? m.from.email : "",
 253       :to => (m.to + m.cc + m.bcc).map { |x| x.email }.join(" "),
 254       :subject => wrap_subj(Message.normalize_subj(m.subj)),
 255       :refs => (m.refs + m.replytos).uniq.join(" "),
 256     }
 257
 258     @index.add_document d
 259
 260     ## TODO: figure out why this is sometimes triggered
 261     #docid, entry = load_entry_for_id m.id
 262     #raise "just added message #{m.id} but couldn't find it in a search" unless docid
 263     true
 264   end
 265
 266   def drop_entry docno; @index.delete docno; end
 267
 268   def load_entry_for_id mid
 269     results = @index.search(Ferret::Search::TermQuery.new(:message_id, mid))
 270     return if results.total_hits == 0
 271     docid = results.hits[0].doc
 272     [docid, @index[docid]]
 273   end
 274
 275   def load_contacts emails, h={}
 276     q = Ferret::Search::BooleanQuery.new true
 277     emails.each do |e|
 278       qq = Ferret::Search::BooleanQuery.new true
 279       qq.add_query Ferret::Search::TermQuery.new(:from, e), :should
 280       qq.add_query Ferret::Search::TermQuery.new(:to, e), :should
 281       q.add_query qq
 282     end
 283     q.add_query Ferret::Search::TermQuery.new(:label, "spam"), :must_not
 284
 285     Redwood::log "contact search: #{q}"
 286     contacts = {}
 287     num = h[:num] || 20
 288     @index.search_each(q, :sort => "date DESC", :limit => :all) do |docid, score|
 289       break if contacts.size >= num
 290       #Redwood::log "got message with to: #{@index[docid][:to].inspect} and from: #{@index[docid][:from].inspect}"
 291       f = @index[docid][:from]
 292       t = @index[docid][:to]
 293
 294       if AccountManager.is_account_email? f
 295         t.split(" ").each { |e| #Redwood::log "adding #{e} because there's a message to him from account email #{f}";
 296           contacts[Person.for(e)] = true }
 297       else
 298         #Redwood::log "adding from #{f} because there's a message from him to #{t}"
 299         contacts[Person.for(f)] = true
 300       end
 301     end
 302
 303     contacts.keys.compact
 304   end
 305
 306 protected
 307
 308   ## TODO: convert this to query objects rather than strings
 309   def build_query opts
 310     query = ""
 311     query += opts[:labels].map { |t| "+label:#{t}" }.join(" ") if opts[:labels]
 312     query += " +label:#{opts[:label]}" if opts[:label]
 313     query += " #{opts[:content]}" if opts[:content]
 314     if opts[:participants]
 315       query += "+(" +
 316         opts[:participants].map { |p| "from:#{p.email} OR to:#{p.email}" }.join(" OR ") + ")"
 317     end
 318
 319     query += " -label:spam" unless opts[:load_spam] || opts[:labels] == :spam ||
 320       (opts[:labels] && opts[:labels].include?(:spam))
 321     query += " -label:killed" unless opts[:load_killed] || opts[:labels] == :killed ||
 322       (opts[:labels] && opts[:labels].include?(:killed))
 323     query
 324   end
 325
 326   def load_sources fn=Redwood::SOURCE_FN
 327     @sources = Hash[*(Redwood::load_yaml_obj(fn) || []).map { |s| [s.id, s] }.flatten]
 328     @sources_dirty = false
 329   end
 330
 331   def save_sources fn=Redwood::SOURCE_FN
 332     if @sources_dirty || @sources.any? { |id, s| s.dirty? }
 333       bakfn = fn + ".bak"
 334       if File.exists? fn
 335         File.chmod 0600, fn
 336         FileUtils.mv fn, bakfn, :force => true unless File.exists?(bakfn) && File.size(bakfn) > File.size(fn)
 337       end
 338       Redwood::save_yaml_obj @sources.values, fn
 339       File.chmod 0600, fn
 340     end
 341     @sources_dirty = false
 342   end
 343 end
 344
 345 end