X-Git-Url: https://git.cworth.org/git?a=blobdiff_plain;f=lib%2Fsup%2Fthread.rb;h=99f21dc31a07bfcb3424a1fc206e20133e2ec8ab;hb=85f89a5ed5b7bc844410adc243d05cc528205a9f;hp=be3a7780c92f18b0196b731996a5a0032a16e696;hpb=4c8091314b3ad6005bc33d89793f8ebda145756b;p=sup diff --git a/lib/sup/thread.rb b/lib/sup/thread.rb index be3a778..99f21dc 100644 --- a/lib/sup/thread.rb +++ b/lib/sup/thread.rb @@ -1,4 +1,28 @@ -require 'date' +## Herein lies all the code responsible for threading messages. It's +## basically an online version of the JWZ threading algorithm: +## http://www.jwz.org/doc/threading.html +## +## I didn't implement it for efficiency, but thanks to our search +## engine backend, it's typically not applied to very many messages at +## once. +## +## At the top level, we have a ThreadSet, which represents a set of +## threads, e.g. a message folder or an inbox. Each ThreadSet contains +## zero or more Threads. A Thread represents all the message related +## to a particular subject. Each Thread has one or more Containers. A +## Container is a recursive structure that holds the message tree as +## determined by the references: and in-reply-to: headers. Each +## Container holds zero or one messages. In the case of zero messages, +## it means we've seen a reference to the message but haven't (yet) +## seen the message itself. +## +## A Thread can have multiple top-level Containers if we decide to +## group them together independent of tree structure, typically if +## (e.g. due to someone using a primitive MUA) the messages have the +## same subject but we don't have evidence from in-reply-to: or +## references: headers. In this case Thread#each can optionally yield +## a faked root object tying them all together into one tree +## structure. module Redwood @@ -7,7 +31,9 @@ class Thread attr_reader :containers def initialize - raise "wrong thread, buddy!" if block_given? + ## ah, the joys of a multithreaded application with a class called + ## "Thread". i keep instantiating the wrong one... + raise "wrong Thread class, buddy!" if block_given? @containers = [] end @@ -16,22 +42,23 @@ class Thread end def empty?; @containers.empty?; end - - def drop c - raise "bad drop" unless @containers.member? c - @containers.delete c - end - - def dump - puts "=== start thread #{self} with #{@containers.length} trees ===" - @containers.each { |c| c.dump_recursive } - puts "=== end thread ===" + def empty!; @containers.clear; end + def drop c; @containers.delete(c) or raise "bad drop"; end + + ## unused + def dump f=$stdout + f.puts "=== start thread with #{@containers.length} trees ===" + @containers.each { |c| c.dump_recursive f; f.puts } + f.puts "=== end thread ===" end - ## yields each message and some stuff + ## yields each message, its depth, and its parent. the message yield + ## parameter can be a Message object, or :fake_root, or nil (no + ## message found but the presence of one deduced from other + ## messages). def each fake_root=false adj = 0 - root = @containers.find_all { |c| !Message.subj_is_reply?(c) }.argmin { |c| c.date } + root = @containers.find_all { |c| c.message && !Message.subj_is_reply?(c.message.subj) }.argmin { |c| c.date } if root adj = 1 @@ -55,15 +82,21 @@ class Thread end end + def first; each { |m, *o| return m if m }; nil; end def dirty?; any? { |m, *o| m && m.dirty? }; end def date; map { |m, *o| m.date if m }.compact.max; end - def snippet; argfind { |m, *o| m && m.snippet }; end + def snippet + with_snippets = select { |m, *o| m && m.snippet && !m.snippet.empty? } + first_unread, * = with_snippets.select { |m, *o| m.has_label?(:unread) }.sort_by { |m, *o| m.date }.first + return first_unread.snippet if first_unread + last_read, * = with_snippets.sort_by { |m, *o| m.date }.last + return last_read.snippet if last_read + "" + end def authors; map { |m, *o| m.from if m }.compact.uniq; end def apply_label t; each { |m, *o| m && m.add_label(t) }; end - def remove_label t - each { |m, *o| m && m.remove_label(t) } - end + def remove_label t; each { |m, *o| m && m.remove_label(t) }; end def toggle_label label if has_label? label @@ -152,6 +185,10 @@ class Container def root?; @parent.nil?; end def root; root? ? self : @parent.root; end + ## skip over any containers which are empty and have only one child. we use + ## this make the threaded display a little nicer, and only stick in the + ## "missing message" line when it's graphically necessary, i.e. when the + ## missing message has more than one descendent. def first_useful_descendant if empty? && @children.size == 1 @children.first.first_useful_descendant @@ -170,7 +207,7 @@ class Container def subj; find_attr :subj; end def date; find_attr :date; end - def is_reply?; subj && Message.subject_is_reply?(subj); end + def is_reply?; subj && Message.subj_is_reply?(subj); end def to_s [ "<#{id}", @@ -179,178 +216,212 @@ class Container ].compact.join(" ") + ">" end - def dump_recursive indent=0, root=true, parent=nil + def dump_recursive f=$stdout, indent=0, root=true, parent=nil raise "inconsistency" unless parent.nil? || parent.children.include?(self) unless root - print " " * indent - print "+->" + f.print " " * indent + f.print "+->" end - line = #"[#{useful? ? 'U' : ' '}] " + + line = "[#{thread.nil? ? ' ' : '*'}] " + #"[#{useful? ? 'U' : ' '}] " + if @message - "[#{thread}] #{@message.subj} " ##{@message.refs.inspect} / #{@message.replytos.inspect}" + message.subj ##{@message.refs.inspect} / #{@message.replytos.inspect}" else "" end - puts "#{id} #{line}"#[0 .. (105 - indent)] + f.puts "#{id} #{line}"#[0 .. (105 - indent)] indent += 3 - @children.each { |c| c.dump_recursive indent, false, self } + @children.each { |c| c.dump_recursive f, indent, false, self } end end -## a set of threads (so a forest). builds the thread structures by -## reading messages from an index. +## A set of threads, so a forest. Is integrated with the index and +## builds thread structures by reading messages from it. +## +## If 'thread_by_subj' is true, puts messages with the same subject in +## one thread, even if they don't reference each other. This is +## helpful for crappy MUAs that don't set In-reply-to: or References: +## headers, but means that messages may be threaded unnecessarily. +## +## The following invariants are maintained: every Thread has at least one +## Container tree, and every Container tree has at least one Message. class ThreadSet attr_reader :num_messages + bool_reader :thread_by_subj - def initialize index + def initialize index, thread_by_subj=true @index = index @num_messages = 0 - @messages = {} ## map from message ids to container objects - @subj_thread = {} ## map from subject strings to thread objects + ## map from message ids to container objects + @messages = SavingHash.new { |id| Container.new id } + ## map from subject strings or (or root message ids) to thread objects + @threads = SavingHash.new { Thread.new } + @thread_by_subj = thread_by_subj end - def contains_id? id; @messages.member?(id) && !@messages[id].empty?; end - def thread_for m - (c = @messages[m.id]) && c.root.thread - end - - def delete_empties - @subj_thread.each { |k, v| @subj_thread.delete(k) if v.empty? } - end - private :delete_empties + def thread_for_id mid; @messages.member?(mid) && @messages[mid].root.thread end + def contains_id? id; @messages.member?(id) && !@messages[id].empty? end + def thread_for m; thread_for_id m.id end + def contains? m; contains_id? m.id end - def threads; delete_empties; @subj_thread.values; end - def size; delete_empties; @subj_thread.size; end + def threads; @threads.values end + def size; @threads.size end - def dump - @subj_thread.each do |s, t| - puts "**********************" - puts "** for subject #{s} **" - puts "**********************" - t.dump + def dump f=$stdout + @threads.each do |s, t| + f.puts "**********************" + f.puts "** for subject #{s} **" + f.puts "**********************" + t.dump f end end + ## link two containers def link p, c, overwrite=false if p == c || p.descendant_of?(c) || c.descendant_of?(p) # would create a loop -# puts "*** linking parent #{p} and child #{c} would create a loop" + #puts "*** linking parent #{p.id} and child #{c.id} would create a loop" return end - if c.parent.nil? || overwrite - c.parent.children.delete c if overwrite && c.parent - if c.thread - c.thread.drop c - c.thread = nil - end - p.children << c - c.parent = p - end + #puts "in link for #{p.id} to #{c.id}, perform? #{c.parent.nil?} || #{overwrite}" + + return unless c.parent.nil? || overwrite + remove_container c + p.children << c + c.parent = p + + ## if the child was previously a top-level container, it now ain't, + ## so ditch our thread and kill it if necessary + prune_thread_of c end private :link - def remove mid + def remove_container c + c.parent.children.delete c if c.parent # remove from tree + end + private :remove_container + + def prune_thread_of c + return unless c.thread + c.thread.drop c + @threads.delete_if { |k, v| v == c.thread } if c.thread.empty? + c.thread = nil + end + private :prune_thread_of + + def remove_id mid return unless(c = @messages[mid]) + remove_container c + prune_thread_of c + end - c.parent.children.delete c if c.parent - if c.thread - c.thread.drop c - c.thread = nil - end + def remove_thread_containing_id mid + c = @messages[mid] or return + t = c.root.thread + @threads.delete_if { |key, thread| t == thread } end ## load in (at most) num number of threads from the index def load_n_threads num, opts={} @index.each_id_by_date opts do |mid, builder| - break if size >= num + break if size >= num unless num == -1 next if contains_id? mid m = builder.call - add_message m - load_thread_for_message m - yield @subj_thread.size if block_given? + load_thread_for_message m, :skip_killed => opts[:skip_killed], :load_deleted => opts[:load_deleted], :load_spam => opts[:load_spam] + yield size if block_given? end end ## loads in all messages needed to thread m - def load_thread_for_message m - @index.each_message_in_thread_for m, :limit => 100 do |mid, builder| + ## may do nothing if m's thread is killed + def load_thread_for_message m, opts={} + good = @index.each_message_in_thread_for m, opts do |mid, builder| next if contains_id? mid add_message builder.call end + add_message m if good + end + + ## merges in a pre-loaded thread + def add_thread t + raise "duplicate" if @threads.values.member? t + t.each { |m, *o| add_message m } + end + + ## merges two threads together. both must be members of this threadset. + ## does its best, heuristically, to determine which is the parent. + def join_threads threads + return if threads.size < 2 + + containers = threads.map do |t| + c = @messages[t.first.id] + raise "not in threadset: #{t.first.id}" unless c && c.message + c + end + + ## use subject headers heuristically + parent = containers.find { |c| !c.is_reply? } + + ## no thread was rooted by a non-reply, so make a fake parent + parent ||= @messages["joining-ref-" + containers.map { |c| c.id }.join("-")] + + containers.each do |c| + next if c == parent + c.message.add_ref parent.id + link parent, c + end + + true end def is_relevant? m - m.refs.any? { |ref_id| @messages[ref_id] } + m.refs.any? { |ref_id| @messages.member? ref_id } end - ## an "online" version of the jwz threading algorithm. + ## the heart of the threading code def add_message message - id = message.id - el = (@messages[id] ||= Container.new id) - return if @messages[id].message # we've seen it before + el = @messages[message.id] + return if el.message # we've seen it before + + #puts "adding: #{message.id}, refs #{message.refs.inspect}" el.message = message oldroot = el.root ## link via references: - prev = nil - message.refs.each do |ref_id| - raise "non-String ref id #{ref_id.inspect} (full: #{message.refs.inspect})" unless ref_id.is_a?(String) - ref = (@messages[ref_id] ||= Container.new ref_id) + (message.refs + [el.id]).inject(nil) do |prev, ref_id| + ref = @messages[ref_id] link prev, ref if prev - prev = ref + ref end - link prev, el, true if prev ## link via in-reply-to: message.replytos.each do |ref_id| - ref = (@messages[ref_id] ||= Container.new ref_id) + ref = @messages[ref_id] link ref, el, true break # only do the first one end - ## update subject grouping root = el.root - # puts "> have #{el}, root #{root}, oldroot #{oldroot}" - # el.dump_recursive - - if root == oldroot - if oldroot.thread - # puts "*** root (#{root.subj}) == oldroot (#{oldroot.subj}); ignoring" + key = + if thread_by_subj? + Message.normalize_subj root.subj else - ## to disable subject grouping, use the next line instead - ## (and the same for below) - #Redwood::log "[1] normalized subject for #{id} is #{Message.normalize_subj(root.subj)}" - thread = (@subj_thread[Message.normalize_subj(root.subj)] ||= Thread.new) - #thread = (@subj_thread[root.id] ||= Thread.new) - - thread << root - root.thread = thread - # puts "# (1) added #{root} to #{thread}" - end - else - if oldroot.thread - ## new root. need to drop old one and put this one in its place - # puts "*** DROPPING #{oldroot} from #{oldroot.thread}" - oldroot.thread.drop oldroot - oldroot.thread = nil + root.id end - if root.thread - # puts "*** IGNORING cuz root already has a thread" - else - ## to disable subject grouping, use the next line instead - ## (and the same above) - #Redwood::log "[2] normalized subject for #{id} is #{Message.normalize_subj(root.subj)}" - thread = (@subj_thread[Message.normalize_subj(root.subj)] ||= Thread.new) - #thread = (@subj_thread[root.id] ||= Thread.new) - - thread << root - root.thread = thread - # puts "# (2) added #{root} to #{thread}" + ## check to see if the subject is still the same (in the case + ## that we first added a child message with a different + ## subject) + if root.thread + if @threads.member?(key) && @threads[key] != root.thread + @threads.delete key end + else + thread = @threads[key] + thread << root + root.thread = thread end ## last bit