1 ## the index structure for redwood. interacts with ferret.
6 require_gem 'ferret', ">= 0.10.13"
## Error class for index failures; a StandardError subclass.
## The constructor takes a source and a second argument +s+ (callers in this
## file pass a message string there). The constructor body is not visible in
## this excerpt.
10 class IndexError < StandardError
13 def initialize source, s
22 attr_reader :index # debugging only
## Sets up the index object: analyzers, query parser and instance
## registration. +dir+ defaults to BASE_DIR. Some lines of this method are
## not visible in this excerpt (e.g. where @sources is first assigned).
24 def initialize dir=BASE_DIR
## nothing to persist yet
27 @sources_dirty = false
## NOTE(review): the trailing +true+ on both analyzers is presumably
## ferret's lowercase flag -- confirm against the ferret documentation.
29 wsa = Ferret::Analysis::WhiteSpaceAnalyzer.new true
## sa is built here; where it gets attached to a field is not visible.
30 sa = Ferret::Analysis::StandardAnalyzer.new Ferret::Analysis::FULL_ENGLISH_STOP_WORDS, true
## the whitespace analyzer is handed to the per-field analyzer
31 @analyzer = Ferret::Analysis::PerFieldAnalyzer.new wsa
## queries without an explicit field search :body
33 @qparser ||= Ferret::QueryParser.new :default_field => :body, :analyzer => @analyzer
## hand this object to the class (presumably a singleton registry -- confirm)
35 self.class.i_am_the_instance self
44 FileUtils.mkdir_p @dir unless File.exists? @dir
50 raise "duplicate source!" if @sources.include? source
52 source.id ||= @sources.size
53 ##TODO: why was this necessary?
54 ##source.id += 1 while @sources.member? source.id
55 @sources[source.id] = source
## Returns the first registered source whose is_source_for? predicate
## accepts +name+, or nil when no source claims it.
def source_for(name)
  @sources.values.find do |candidate|
    candidate.is_source_for?(name)
  end
end
## Returns an array of every registered source that reports usual?.
def usual_sources
  registered = @sources.values
  registered.select { |src| src.usual? }
end
## Opens the ferret index at +dir+ (default: File.join(@dir, "ferret")) and
## stores it in @index, using @analyzer.
##
## Two paths are visible: one logs "loading index" and opens the index
## directly; the other logs "creating index", declares the field schema,
## creates the index on disk and then opens it. The condition that chooses
## between them is not visible in this excerpt.
61 def load_index dir=File.join(@dir, "ferret")
63 Redwood::log "loading index"
64 @index = Ferret::Index::Index.new(:path => dir, :analyzer => @analyzer)
66 Redwood::log "creating index"
## :store => :yes is passed once here; presumably it is the default for
## every field added below unless a field overrides it -- confirm.
67 field_infos = Ferret::Index::FieldInfos.new :store => :yes
68 field_infos.add_field :message_id
69 field_infos.add_field :source_id
70 field_infos.add_field :source_info
## :date is declared with :index => :untokenized
71 field_infos.add_field :date, :index => :untokenized
## :body is the only field declared with :store => :no
72 field_infos.add_field :body, :store => :no
73 field_infos.add_field :label
74 field_infos.add_field :subject
75 field_infos.add_field :from
76 field_infos.add_field :to
77 field_infos.add_field :refs
## :snippet is declared with indexing and term vectors switched off
78 field_infos.add_field :snippet, :index => :no, :term_vector => :no
79 field_infos.create_index dir
80 @index = Ferret::Index::Index.new(:path => dir, :analyzer => @analyzer)
84 ## update the message by deleting and re-adding
##
## +source+ and +source_info+ are optional; when not given they are taken
## from the stored entry for m.id (converted with to_i). Raises a
## RuntimeError when either is still missing, or when the document at the
## looked-up docid carries a different :message_id than +m+. The delete and
## re-add steps themselves are not visible in this excerpt.
85 def update_message m, source=nil, source_info=nil
86 docid, entry = load_entry_for_id m.id
88 source ||= entry[:source_id].to_i
89 source_info ||= entry[:source_info].to_i
92 ## this happens sometimes. i'm not sure why. ferret bug?
93 raise "no entry and no source info for message #{m.id}" unless source && source_info
## sanity check before the entry is dropped: the docid must still point at m
95 raise "deleting non-corresponding entry #{docid}" unless @index[docid][:message_id] == m.id
## Hook for persisting the index. +fn+ defaults to File.join(@dir, "ferret").
## The only visible body line is the comment below, so no work is shown here.
100 def save_index fn=File.join(@dir, "ferret")
101 # don't have to do anything apparently
105 @index.search(Ferret::Search::TermQuery.new(:message_id, id)).total_hits > 0
## True when the message +m+ is already indexed; delegates to contains_id?
## with the message's id.
def contains?(m)
  message_id = m.id
  contains_id?(message_id)
end
## Returns whatever @index reports as its size.
def size
  @index.size
end
110 ## you should probably not call this on a block that doesn't break
111 ## rather quickly because the results will probably be, as we say
112 ## in scotland, frikkin' huuuge.
##
## EACH_BY_DATE_NUM is the page size passed as :limit to each search.
113 EACH_BY_DATE_NUM = 100
## For each hit of the query built from +opts+, yields the message id and a
## lambda that builds the full message on demand. Hits are requested sorted
## by "date DESC", one page of EACH_BY_DATE_NUM at a time via :offset.
## The enclosing loop and the initial value of +offset+ are not visible in
## this excerpt.
114 def each_id_by_date opts={}
115 return if @index.size == 0 # otherwise ferret barfs ###TODO: remove this once my ferret patch is accepted
116 query = build_query opts
119 results = @index.search(query, :sort => "date DESC", :limit => EACH_BY_DATE_NUM, :offset => offset)
120 Redwood::log "got #{results.total_hits} results for query (offset #{offset}) #{query.inspect}"
121 results.hits.each { |hit| yield @index[hit.doc][:message_id], lambda { build_message hit.doc } }
## stop once the next page would start at or past the last hit
122 break if offset >= results.total_hits - EACH_BY_DATE_NUM
123 offset += EACH_BY_DATE_NUM
## Runs the query built from +opts+ and logs its total hit count.
## NOTE(review): the return expression is not visible in this excerpt;
## presumably the count +x+ is returned -- confirm.
127 def num_results_for opts={}
128 query = build_query opts
129 x = @index.search(query).total_hits
130 Redwood::log "num_results_for: have #{x} for query #{query}"
## Used below as SAME_SUBJECT_DATE_LIMIT * 12 * 3600 on each side of m.date.
## NOTE(review): if m.date is a Time, that is 3.5 days either way, i.e. a
## window of 7 days in total -- presumably the unit is days; confirm.
134 SAME_SUBJECT_DATE_LIMIT = 7
## Yields (message id, builder lambda) for the messages collected as the
## thread of +m+. Visible steps: an optional subject+date query seeds
## +pending+ with message ids; then ids are expanded by querying for
## documents whose :message_id or :refs match; finally every collected
## message is yielded. opts[:limit], when set, caps the number collected.
## Several lines (initialisation of messages/searched/num_queries, how ids
## are taken from and added to +pending+) are not visible in this excerpt.
135 def each_message_in_thread_for m, opts={}
140 ## temporarily disabling subject searching because it's a
141 ## significant slowdown.
143 ## TODO: make this configurable, i guess
## NOTE(review): the condition guarding the subject query below is not
## visible here, so whether it currently runs cannot be told from this view.
145 date_min = m.date - (SAME_SUBJECT_DATE_LIMIT * 12 * 3600)
146 date_max = m.date + (SAME_SUBJECT_DATE_LIMIT * 12 * 3600)
148 q = Ferret::Search::BooleanQuery.new true
149 sq = Ferret::Search::PhraseQuery.new(:subject)
150 wrap_subj(Message.normalize_subj(m.subj)).split(/\s+/).each do |t|
## both the subject phrase and the date range are required (:must)
153 q.add_query sq, :must
154 q.add_query Ferret::Search::RangeQuery.new(:date, :>= => date_min.to_indexable_s, :<= => date_max.to_indexable_s), :must
156 pending = @index.search(q).hits.map { |hit| @index[hit.doc][:message_id] }
157 Redwood::log "found #{pending.size} results for subject query #{q}"
## keep going until there is nothing left to look up or the limit is reached
162 until pending.empty? || (opts[:limit] && messages.size >= opts[:limit])
## never query the same id twice
164 next if searched.member? id
## match either the message itself or anything that references it (:should)
166 q = Ferret::Search::BooleanQuery.new true
167 q.add_query Ferret::Search::TermQuery.new(:message_id, id), :should
168 q.add_query Ferret::Search::TermQuery.new(:refs, id), :should
171 @index.search_each(q, :limit => :all) do |docid, score|
172 break if opts[:limit] && messages.size >= opts[:limit]
173 mid = @index[docid][:message_id]
174 unless messages.member? mid
## store a lazy builder rather than the built message
175 messages[mid] ||= lambda { build_message docid }
## :refs is stored as a space-separated string
176 refs = @index[docid][:refs].split(" ")
181 Redwood::log "ran #{num_queries} queries to build thread of #{messages.size} messages for #{m.id}"
182 messages.each { |mid, builder| yield mid, builder }
185 ## builds a message object from a ferret result
##
## Looks up the source registered under the document's :source_id and
## raises a RuntimeError ("invalid source ...") when there is none. A
## MessageFormatError during construction is re-raised as IndexError. A
## SourceError is rescued and a Message is built from the indexed fields
## instead (fake header plus explanatory text). How +doc+ is obtained from
## +docid+, and the rest of the SourceError handling, are not visible in
## this excerpt.
186 def build_message docid
188 source = @sources[doc[:source_id].to_i]
189 #puts "building message #{doc[:message_id]} (#{source}##{doc[:source_info]})"
190 raise "invalid source #{doc[:source_id]}" unless source
## :label is stored as a space-separated string; each label becomes a symbol
197 Message.new :source => source, :source_info => doc[:source_info].to_i,
198 :labels => doc[:label].split(" ").map { |s| s.intern },
199 :snippet => doc[:snippet]
200 rescue MessageFormatError => e
201 raise IndexError.new(source, "error building message #{doc[:message_id]} at #{source}/#{doc[:source_info]}: #{e.message}")
202 rescue SourceError => e
209 "date" => Time.at(doc[:date].to_i),
210 "subject" => unwrap_subj(doc[:subject]),
211 "from" => doc[:from],
213 "message-id" => doc[:message_id],
214 "references" => doc[:refs],
217 m = Message.new :labels => doc[:label].split(" ").map { |s| s.intern },
218 :snippet => doc[:snippet], :header => fake_header,
222 An error occurred while loading this message. It is possible that the source
223 has changed, or (in the case of remote sources) is down.
225 The error message was:
## Advances the @next_thread_id counter by one and returns the new value.
def fresh_thread_id
  @next_thread_id = @next_thread_id + 1
end
## Surrounds +subj+ with the start/end marker words used when a subject is
## written to the index; unwrap_subj strips them again.
def wrap_subj(subj)
  "__START_SUBJECT__ #{subj} __END_SUBJECT__"
end
## Inverse of wrap_subj: returns the text between the start and end marker
## words, or nil when +subj+ is nil or does not contain the markers.
def unwrap_subj(subj)
  found = /__START_SUBJECT__ (.*?) __END_SUBJECT__/.match(subj)
  found && found[1]
end
237 return false if contains? m
240 if m.source.is_a? Integer
243 m.source.id or raise "unregistered source #{m.source}"
246 to = (m.to + m.cc + m.bcc).map { |x| x.email }.join(" ")
249 :source_id => source_id,
250 :source_info => m.source_info,
251 :date => m.date.to_indexable_s,
253 :snippet => m.snippet,
254 :label => m.labels.join(" "),
255 :from => m.from ? m.from.email : "",
256 :to => (m.to + m.cc + m.bcc).map { |x| x.email }.join(" "),
257 :subject => wrap_subj(Message.normalize_subj(m.subj)),
258 :refs => (m.refs + m.replytos).uniq.join(" "),
261 @index.add_document d
263 ## TODO: figure out why this is sometimes triggered
264 #docid, entry = load_entry_for_id m.id
265 #raise "just added message #{m.id} but couldn't find it in a search" unless docid
## Removes the entry +docno+ from @index and returns whatever the index's
## delete call returns.
def drop_entry(docno)
  @index.delete(docno)
end
## Looks up the index entry whose :message_id term equals +mid+.
## Returns nil when the search has no hits; otherwise returns the pair
## [docid, document] for the first hit.
271 def load_entry_for_id mid
272 results = @index.search(Ferret::Search::TermQuery.new(:message_id, mid))
273 return if results.total_hits == 0
## only the first hit is used
274 docid = results.hits[0].doc
275 [docid, @index[docid]]
## Collects Person objects for correspondents found in indexed messages and
## returns them as an array (contacts.keys.compact). Messages are searched
## by :from / :to terms, with anything labelled "spam" excluded, and walked
## in "date DESC" order until +num+ contacts have been gathered.
## Not visible in this excerpt: the loop over +emails+ that supplies +e+,
## how +h+ is used, and where +num+ and +contacts+ are initialised.
278 def load_contacts emails, h={}
279 q = Ferret::Search::BooleanQuery.new true
## per-address sub-query: the address may appear as sender or recipient
281 qq = Ferret::Search::BooleanQuery.new true
282 qq.add_query Ferret::Search::TermQuery.new(:from, e), :should
283 qq.add_query Ferret::Search::TermQuery.new(:to, e), :should
286 q.add_query Ferret::Search::TermQuery.new(:label, "spam"), :must_not
288 Redwood::log "contact search: #{q}"
291 @index.search_each(q, :sort => "date DESC", :limit => :all) do |docid, score|
292 break if contacts.size >= num
293 #Redwood::log "got message with to: #{@index[docid][:to].inspect} and from: #{@index[docid][:from].inspect}"
294 f = @index[docid][:from]
295 t = @index[docid][:to]
## when the sender is one of the user's accounts, every recipient in the
## space-separated :to field is recorded as a contact
297 if AccountManager.is_account_email? f
298 t.split(" ").each { |e| #Redwood::log "adding #{e} because there's a message to him from account email #{f}";
299 contacts[Person.for(e)] = true }
301 #Redwood::log "adding from #{f} because there's a message from him to #{t}"
302 contacts[Person.for(f)] = true
## contacts is used as a set (Person => true); compact drops a nil key
306 contacts.keys.compact
## Hands the user's query text +str+ to @qparser and returns its parse
## result unchanged.
def parse_user_query_string(str)
  @qparser.parse(str)
end
314 query = Ferret::Search::BooleanQuery.new
315 query.add_query opts[:qobj], :must if opts[:qobj]
316 labels = ([opts[:label]] + (opts[:labels] || [])).compact
317 labels.each { |t| query.add_query Ferret::Search::TermQuery.new("label", t.to_s), :must }
318 if opts[:participants]
319 q2 = Ferret::Search::BooleanQuery.new
320 opts[:participants].each do |p|
321 q2.add_query Ferret::Search::TermQuery.new("from", p.email), :should
322 q2.add_query Ferret::Search::TermQuery.new("to", p.email), :should
324 query.add_query q2, :must
327 query.add_query Ferret::Search::TermQuery.new("label", "spam"), :must_not unless opts[:load_spam] || labels.include?(:spam)
328 query.add_query Ferret::Search::TermQuery.new("label", "killed"), :must_not unless opts[:load_killed] || labels.include?(:killed)
## Reads the list of sources from the YAML file +fn+ (default
## Redwood::SOURCE_FN) and rebuilds @sources as a hash keyed by source id.
## A nil/false result from load_yaml_obj is treated as an empty list.
332 def load_sources fn=Redwood::SOURCE_FN
## [[id, source], ...] is flattened and splatted into Hash[] to build id => source
333 @sources = Hash[*(Redwood::load_yaml_obj(fn) || []).map { |s| [s.id, s] }.flatten]
## freshly loaded, so nothing needs saving
334 @sources_dirty = false
## Writes @sources.values as YAML to +fn+ (default Redwood::SOURCE_FN), but
## only when the dirty flag is set or some source reports dirty?. The
## definition of +bakfn+ is not visible in this excerpt.
337 def save_sources fn=Redwood::SOURCE_FN
338 if @sources_dirty || @sources.any? { |id, s| s.dirty? }
## move the current file to the backup name, unless a backup already exists
## and is larger than the current file
## NOTE(review): File.exists? is deprecated and was removed in Ruby 3.2;
## File.exist? is the supported spelling.
342 FileUtils.mv fn, bakfn, :force => true unless File.exists?(bakfn) && File.size(bakfn) > File.size(fn)
344 Redwood::save_yaml_obj @sources.values, fn
347 @sources_dirty = false