1 ## the index structure for redwood. interacts with ferret.
5 require_gem 'ferret', ">= 0.10.13"
9 class IndexError < StandardError
12 def initialize source, s
21 attr_reader :index # debugging only
23 def initialize dir=BASE_DIR
27 @sources_dirty = false
29 self.class.i_am_the_instance self
38 FileUtils.mkdir_p @dir unless File.exists? @dir
44 raise "duplicate source!" if @sources.include? source
46 source.id ||= @sources.size
47 ##TODO: why was this necessary?
48 ##source.id += 1 while @sources.member? source.id
49 @sources[source.id] = source
## returns the first registered source whose is_source_for? accepts
## +name+, or nil when no source claims it.
def source_for(name)
  @sources.values.detect do |candidate|
    candidate.is_source_for?(name)
  end
end
## returns an array of every registered source that answers true to
## usual?, in the order the sources are stored.
def usual_sources
  @sources.values.select { |candidate| candidate.usual? }
end
## opens the ferret index stored at +dir+ (by default the "ferret"
## entry under @dir) and keeps the handle in @index. both the
## "loading" and the "creating" path end by opening
## Ferret::Index::Index on +dir+ with the same per-field analyzer; the
## "creating" path first declares the fields and creates the index on
## disk.
55 def load_index dir=File.join(@dir, "ferret")
56 wsa = Ferret::Analysis::WhiteSpaceAnalyzer.new false
57 sa = Ferret::Analysis::StandardAnalyzer.new
## NOTE(review): wsa is presumably the default analyzer for fields
## without their own entry -- confirm against the ferret docs.
58 analyzer = Ferret::Analysis::PerFieldAnalyzer.new wsa
62 Redwood::log "loading index"
63 @index = Ferret::Index::Index.new(:path => dir, :analyzer => analyzer)
65 Redwood::log "creating index"
## every field is stored unless it overrides :store below.
66 field_infos = Ferret::Index::FieldInfos.new :store => :yes
67 field_infos.add_field :message_id
68 field_infos.add_field :source_id
69 field_infos.add_field :source_info
## the date is indexed as a single untokenized term.
70 field_infos.add_field :date, :index => :untokenized
## the body is the one field declared with :store => :no.
71 field_infos.add_field :body, :store => :no
72 field_infos.add_field :label
73 field_infos.add_field :subject
74 field_infos.add_field :from
75 field_infos.add_field :to
76 field_infos.add_field :refs
## the snippet is declared with indexing and term vectors turned off.
77 field_infos.add_field :snippet, :index => :no, :term_vector => :no
78 field_infos.create_index dir
79 @index = Ferret::Index::Index.new(:path => dir, :analyzer => analyzer)
83 ## update the message by deleting and re-adding
## +source+ and +source_info+, when not supplied by the caller, are
## taken from the existing index entry for m.id (each converted with
## to_i). raises a RuntimeError if either is still unset afterwards,
## or if the entry found for m.id does not carry m's message id.
84 def update_message m, source=nil, source_info=nil
85 docid, entry = load_entry_for_id m.id
87 source ||= entry[:source_id].to_i
88 source_info ||= entry[:source_info].to_i
90 raise "no entry and no source info for message #{m.id}" unless source && source_info
## sanity check that the looked-up document really is this message.
92 raise "deleting non-corresponding entry #{docid}" unless @index[docid][:message_id] == m.id
## +fn+ defaults to the same path load_index uses, i.e.
## File.join(@dir, "ferret").
97 def save_index fn=File.join(@dir, "ferret")
98 # don't have to do anything apparently
102 @index.search(Ferret::Search::TermQuery.new(:message_id, id)).total_hits > 0
## true-ish when the index already holds a message with m's id;
## simply delegates to contains_id?.
def contains?(m)
  message_id = m.id
  contains_id?(message_id)
end
## whatever @index reports as its size.
def size
  @index.size
end
107 ## you should probably not call this on a block that doesn't break
108 ## rather quickly because the results will probably be, as we say
109 ## in scotland, frikkin' huuuge.
110 EACH_BY_DATE_NUM = 100
## for every hit of the query built from +opts+, yields the hit's
## message id together with a lambda that builds the full message
## only when called. results are requested sorted by "date DESC" and
## fetched EACH_BY_DATE_NUM at a time via :limit/:offset.
111 def each_id_by_date opts={}
112 return if @index.size == 0 # otherwise ferret barfs
113 query = build_query opts
116 results = @index.search(query, :sort => "date DESC", :limit => EACH_BY_DATE_NUM, :offset => offset)
117 Redwood::log "got #{results.total_hits} results for query (offset #{offset}) #{query.inspect}"
118 results.hits.each { |hit| yield @index[hit.doc][:message_id], lambda { build_message hit.doc } }
## stop once the next page would start at or beyond the last hit.
119 break if offset >= results.total_hits - EACH_BY_DATE_NUM
120 offset += EACH_BY_DATE_NUM
## runs the query built from +opts+ against the index, takes the
## total hit count and logs it together with the query.
124 def num_results_for opts={}
125 query = build_query opts
126 x = @index.search(query).total_hits
127 Redwood::log "num_results_for: have #{x} for query #{query}"
## used below as SAME_SUBJECT_DATE_LIMIT * 12 * 3600 on either side of
## m.date, so the same-subject date range spans this many days in
## total (assuming m.date counts in seconds, e.g. a Time -- TODO
## confirm).
131 SAME_SUBJECT_DATE_LIMIT = 7
## yields (message id, builder lambda) pairs for the messages gathered
## around +m+. ids are expanded by searching for documents whose
## :message_id or :refs term equals the id being examined; when
## opts[:limit] is given, collection stops once that many messages
## have been gathered. each builder calls build_message lazily.
132 def each_message_in_thread_for m, opts={}
137 ## temporarily disabling subject searching because it's a
138 ## significant slowdown.
140 ## TODO: make this configurable, i guess
142 date_min = m.date - (SAME_SUBJECT_DATE_LIMIT * 12 * 3600)
143 date_max = m.date + (SAME_SUBJECT_DATE_LIMIT * 12 * 3600)
145 q = Ferret::Search::BooleanQuery.new true
146 sq = Ferret::Search::PhraseQuery.new(:subject)
147 wrap_subj(Message.normalize_subj(m.subj)).split(/\s+/).each do |t|
## both the subject phrase and the date range are required (:must).
150 q.add_query sq, :must
151 q.add_query Ferret::Search::RangeQuery.new(:date, :>= => date_min.to_indexable_s, :<= => date_max.to_indexable_s), :must
153 pending = @index.search(q).hits.map { |hit| @index[hit.doc][:message_id] }
154 Redwood::log "found #{pending.size} results for subject query #{q}"
159 until pending.empty? || (opts[:limit] && messages.size >= opts[:limit])
## ids already looked up are not searched again.
161 next if searched.member? id
## match documents that either are this id or reference it.
163 q = Ferret::Search::BooleanQuery.new true
164 q.add_query Ferret::Search::TermQuery.new(:message_id, id), :should
165 q.add_query Ferret::Search::TermQuery.new(:refs, id), :should
168 @index.search_each(q, :limit => :all) do |docid, score|
169 break if opts[:limit] && messages.size >= opts[:limit]
170 mid = @index[docid][:message_id]
171 unless messages.member? mid
172 messages[mid] ||= lambda { build_message docid }
173 refs = @index[docid][:refs].split(" ")
178 Redwood::log "ran #{num_queries} queries to build thread of #{messages.size} messages for #{m.id}"
179 messages.each { |mid, builder| yield mid, builder }
182 ## builds a message object from a ferret result
## the source is looked up in @sources by the entry's :source_id
## (as an integer); a RuntimeError is raised when there is no such
## source. the Message is created from that source, the entry's
## :source_info (as an integer), its space-separated :label field
## turned into symbols, and its :snippet. a MessageFormatError is
## re-raised as an IndexError carrying the source. on SourceError a
## Message is instead assembled from the header fields stored in the
## index (fake_header) plus explanatory text.
183 def build_message docid
185 source = @sources[doc[:source_id].to_i]
186 #puts "building message #{doc[:message_id]} (#{source}##{doc[:source_info]})"
187 raise "invalid source #{doc[:source_id]}" unless source
194 Message.new :source => source, :source_info => doc[:source_info].to_i,
195 :labels => doc[:label].split(" ").map { |s| s.intern },
196 :snippet => doc[:snippet]
197 rescue MessageFormatError => e
198 raise IndexError.new(source, "error building message #{doc[:message_id]} at #{source}/#{doc[:source_info]}: #{e.message}")
199 rescue SourceError => e
206 "date" => Time.at(doc[:date].to_i),
207 "subject" => unwrap_subj(doc[:subject]),
208 "from" => doc[:from],
210 "message-id" => doc[:message_id],
211 "references" => doc[:refs],
214 m = Message.new :labels => doc[:label].split(" ").map { |s| s.intern },
215 :snippet => doc[:snippet], :header => fake_header,
219 An error occurred while loading this message. It is possible that the source
220 has changed, or (in the case of remote sources) is down.
222 The error message was:
## bumps the @next_thread_id counter by one and returns the new value.
def fresh_thread_id
  @next_thread_id = @next_thread_id + 1
end
## surrounds +subj+ with the __START_SUBJECT__ / __END_SUBJECT__
## marker words, separated from it by single spaces. unwrap_subj
## reverses this.
def wrap_subj(subj)
  "__START_SUBJECT__ #{subj} __END_SUBJECT__"
end
## extracts the text between the __START_SUBJECT__ and __END_SUBJECT__
## markers (the shortest such span, on a single line). when the
## markers aren't found, the falsy result of the match is handed back
## unchanged.
def unwrap_subj(subj)
  hit = subj =~ /__START_SUBJECT__ (.*?) __END_SUBJECT__/
  return hit unless hit
  Regexp.last_match(1)
end
234 return false if contains? m
237 if m.source.is_a? Integer
240 m.source.id or raise "unregistered source #{m.source}"
243 to = (m.to + m.cc + m.bcc).map { |x| x.email }.join(" ")
246 :source_id => source_id,
247 :source_info => m.source_info,
248 :date => m.date.to_indexable_s,
250 :snippet => m.snippet,
251 :label => m.labels.join(" "),
252 :from => m.from ? m.from.email : "",
253 :to => (m.to + m.cc + m.bcc).map { |x| x.email }.join(" "),
254 :subject => wrap_subj(Message.normalize_subj(m.subj)),
255 :refs => (m.refs + m.replytos).uniq.join(" "),
258 @index.add_document d
260 ## TODO: figure out why this is sometimes triggered
261 #docid, entry = load_entry_for_id m.id
262 #raise "just added message #{m.id} but couldn't find it in a search" unless docid
## removes document +docno+ from the index by handing it to
## @index.delete; returns whatever that call returns.
def drop_entry(docno)
  @index.delete(docno)
end
## searches the index for +mid+ as a :message_id term. returns nil
## when there are no hits; otherwise returns a two-element array of
## the first hit's document number and the document stored under it.
268 def load_entry_for_id mid
269 results = @index.search(Ferret::Search::TermQuery.new(:message_id, mid))
270 return if results.total_hits == 0
## only the first hit is used.
271 docid = results.hits[0].doc
272 [docid, @index[docid]]
## collects Person objects from indexed messages. the query excludes
## anything labelled "spam" and is walked sorted by "date DESC".
## when a message's :from address is one of the user's account
## addresses (AccountManager.is_account_email?), every address in its
## space-separated :to field is recorded; the :from address itself is
## recorded on the other path (presumably the else branch -- TODO
## confirm). returns the recorded people with nils removed.
275 def load_contacts emails, h={}
276 q = Ferret::Search::BooleanQuery.new true
## per-address subquery: the address may appear in :from or in :to.
278 qq = Ferret::Search::BooleanQuery.new true
279 qq.add_query Ferret::Search::TermQuery.new(:from, e), :should
280 qq.add_query Ferret::Search::TermQuery.new(:to, e), :should
283 q.add_query Ferret::Search::TermQuery.new(:label, "spam"), :must_not
285 Redwood::log "contact search: #{q}"
288 @index.search_each(q, :sort => "date DESC", :limit => :all) do |docid, score|
## stop scanning as soon as num contacts have been gathered.
289 break if contacts.size >= num
290 #Redwood::log "got message with to: #{@index[docid][:to].inspect} and from: #{@index[docid][:from].inspect}"
291 f = @index[docid][:from]
292 t = @index[docid][:to]
294 if AccountManager.is_account_email? f
295 t.split(" ").each { |e| #Redwood::log "adding #{e} because there's a message to him from account email #{f}";
296 contacts[Person.for(e)] = true }
298 #Redwood::log "adding from #{f} because there's a message from him to #{t}"
299 contacts[Person.for(f)] = true
303 contacts.keys.compact
308 ## TODO: convert this to query objects rather than strings
311 query += opts[:labels].map { |t| "+label:#{t}" }.join(" ") if opts[:labels]
312 query += " +label:#{opts[:label]}" if opts[:label]
313 query += " #{opts[:content]}" if opts[:content]
314 if opts[:participants]
316 opts[:participants].map { |p| "from:#{p.email} OR to:#{p.email}" }.join(" OR ") + ")"
319 query += " -label:spam" unless opts[:load_spam] || opts[:labels] == :spam ||
320 (opts[:labels] && opts[:labels].include?(:spam))
321 query += " -label:killed" unless opts[:load_killed] || opts[:labels] == :killed ||
322 (opts[:labels] && opts[:labels].include?(:killed))
## rebuilds @sources from the object Redwood::load_yaml_obj returns
## for +fn+ (an empty list when it returns nil or false), keyed by
## each source's id, and clears the @sources_dirty flag.
326 def load_sources fn=Redwood::SOURCE_FN
## [[id, source], ...] is flattened into the argument list for Hash[].
327 @sources = Hash[*(Redwood::load_yaml_obj(fn) || []).map { |s| [s.id, s] }.flatten]
328 @sources_dirty = false
## writes @sources.values to +fn+ through Redwood::save_yaml_obj and
## clears @sources_dirty. the work is conditional on @sources_dirty
## being set or on some source answering true to dirty?
## (NOTE(review): presumably the write and the backup sit inside that
## condition -- confirm).
331 def save_sources fn=Redwood::SOURCE_FN
332 if @sources_dirty || @sources.any? { |id, s| s.dirty? }
## keep the current file as the backup, except when a backup already
## exists and is larger than the current file.
336 FileUtils.mv fn, bakfn, :force => true unless File.exists?(bakfn) && File.size(bakfn) > File.size(fn)
338 Redwood::save_yaml_obj @sources.values, fn
341 @sources_dirty = false