From: wmorgan Date: Thu, 1 Feb 2007 21:20:21 +0000 (+0000) Subject: split sup-import into sup-add and sup-import; merged poll.rb and sup-import code... X-Git-Url: https://git.cworth.org/git?a=commitdiff_plain;h=af4509909beb1cb175aa30fd6d24f43e334fa991;p=sup split sup-import into sup-add and sup-import; merged poll.rb and sup-import code into Index methods git-svn-id: svn://rubyforge.org/var/svn/sup/trunk@291 5c8cc53c-5e98-4d25-b20a-d8db53a31250 --- diff --git a/HACKING b/HACKING index 47c072d..0d9f68e 100644 --- a/HACKING +++ b/HACKING @@ -7,13 +7,44 @@ ruby -I lib -w bin/sup Coding standards ---------------- -- Wrap code at 99999 characters. The days of 80-column displays are - long over. Wrap comments and other text at whatever Emacs meta-Q - does. +- Don't wrap code unless it really benefits from it. The days of + 80-column displays are long over. But do wrap comments and other + text at whatever Emacs meta-Q does. - Use as few parentheses as possible. - Use {} for one-liner blocks and do/end for multi-line blocks. -- For one-line functions, put a semicolon before "end", like this: - def bool_writer *args; attr_writer(*args); end - (I just started doing this for no real reason, and now I kinda like - it.) + +How messages are updated in the index +------------------------------------- + +Ferret doesn't have any concept of updating; to change message state +it must be deleted then re-added to the index. + +Thus there are a couple situations where we'll have a message to be +"added", but it already exists in the index, and we need to decide +which parts of which version to keep: + +1. The user has changed the state of the message, e.g. read it or + added a user label. In this case we want to use the state of the + version in memory, but keep everything else on disk. + + This is the behavior of Index#update_message + +2. We've received a new copy of the message. Crucially, this can + happen for two different reasons: + + a. 
The message was sent to a mailing list to which the user is + subscribed, and we're now getting that message back, possibly + with altered content (subject mangling, signature adding, etc.) + + b. The user has moved the message between sources. E.g. if the + primary inbox has a quota, and other sources are on local, + quota-less disk, the user may regularly move messages from the + inbox to the sources on disk. + + In both of these cases, the solution is to keep the state from the + index, but use the new message contents. + + This is the behavior of Index#update_or_add_message, which can + also be called for new messages. + diff --git a/Manifest.txt b/Manifest.txt index 6645ada..46aefd3 100644 --- a/Manifest.txt +++ b/Manifest.txt @@ -4,6 +4,7 @@ Manifest.txt README.txt Rakefile bin/sup +bin/sup-add bin/sup-import bin/sup-recover-sources doc/FAQ.txt diff --git a/bin/sup-add b/bin/sup-add new file mode 100644 index 0000000..4d33641 --- /dev/null +++ b/bin/sup-add @@ -0,0 +1,93 @@ +#!/usr/bin/env ruby + +require 'uri' +require 'rubygems' +require 'highline/import' +require 'trollop' +require "sup" + +Thread.abort_on_exception = true # make debugging possible + +$opts = Trollop::options do + version "sup-add (sup #{Redwood::VERSION})" + banner <+ + +where + is one or more source URIs or mbox filenames, e.g. +"imaps://my.imapserver.com", or "/var/spool/mail/me". + +Options are: +EOS + opt :archive, "Automatically archive all new messages from these sources." + opt :unusual, "Do not automatically poll these sources for new messages." + opt :force_new, "Create a new account for this source, even if one already exists." +end + +Trollop::die "require one or more sources" if ARGV.empty? + +## for sources that require login information, prompt the user for +## that. also provide a list of previously-defined login info to +## choose from, if any. 
+def get_login_info uri, sources + uri = URI(uri) + accounts = sources.map do |s| + next unless s.respond_to?(:username) + suri = URI(s.uri) + [suri.host, s.username, s.password] + end.compact.uniq.sort_by { |h, u, p| h == uri.host ? 0 : 1 } + + username, password = nil, nil + unless accounts.empty? || $opts[:force_new] + say "Would you like to use the same account as for a previous source for #{uri}?" + choose do |menu| + accounts.each do |host, olduser, oldpw| + menu.choice("Use the account info for #{olduser}@#{host}") { username, password = olduser, oldpw } + end + menu.choice("Use a new account") { } + menu.prompt = "Account selection? " + end + end + + unless username && password + username = ask("Username for #{uri.host}: "); + password = ask("Password for #{uri.host}: ") { |q| q.echo = false } + puts # why? + end + + [username, password] +end + +$terminal.wrap_at = :auto +Redwood::start +index = Redwood::Index.new +index.load + +ARGV.each do |uri| + uri = "mbox://#{uri}" unless uri =~ %r!://! + if !$opts[:force_new] && index.source_for(uri) + say "Already know about #{uri}; skipping." + next + end + source = + case uri + when %r!^mbox\+ssh://! + say "For SSH connections, if you will use public key authentication, you may leave the username and password blank." + say "" + username, password = get_login_info uri, index.sources + Redwood::MBox::SSHLoader.new(uri, username, password, nil, !$opts[:unusual], $opts[:archive]) + when %r!^imaps?://! + username, password = get_login_info uri, index.sources + Redwood::IMAP.new(uri, username, password, nil, !$opts[:unusual], $opts[:archive]) + else + Redwood::MBox::Loader.new(uri, nil, !$opts[:unusual], $opts[:archive]) + end + say "Adding #{source}..." + index.add_source source +end + +say "Saving source list..." 
+index.save +Redwood::finish diff --git a/bin/sup-import b/bin/sup-import index d5fec8a..a9487aa 100644 --- a/bin/sup-import +++ b/bin/sup-import @@ -2,7 +2,7 @@ require 'uri' require 'rubygems' -require 'highline/import' +require 'trollop' require "sup" Thread.abort_on_exception = true # make debugging possible @@ -24,203 +24,88 @@ def time Time.now - startt end -def educate_user - $stderr.puts <* -where * is zero or more source descriptions (e.g., mbox -filenames on disk, or imap/imaps URIs). - -If the sources listed are not already in the Sup source list, -they will be added to it, as parameterized by the following options: - --archive: messages from these sources will not appear in the inbox - --unusual: these sources will not be polled when the flag --the-usual - is called - -Regardless of whether the sources are new or not, they will be polled, -and any new messages will be added to the index, as parameterized by -the following options: - --force-archive: regardless of the source "archive" flag, any new - messages found will not appear in the inbox. - --force-read: any messages found will not be marked as new. - -The following options can also be specified: - --verbose: print message ids as they're processed - --the-usual: import new messages from all usual sources - --rebuild: rebuild the index for the specified sources rather than - just adding new messages. Useful if the sources - have changed in any way *other* than new messages - being added. Only updates messages if the offsets have - changed. - --force-rebuild: force a rebuild of all messages in the inbox, not just - ones that have changed. You probably won't need this - unless William changes the index format. - --overwrite-labels: if rebuilding, update message if the labels have - changed, not just the offset. - --optimize: optimize the index after adding any new messages. - --help: don't do anything, just show this message. 
-EOS - exit -end -#' stupid ruby-mode - -## for sources that require login information, prompt the user for -## that. also provide a list of previously-defined login info to -## choose from, if any. -def get_login_info uri, sources - uri = URI(uri) - accounts = sources.map do |s| - next unless s.respond_to?(:username) - suri = URI(s.uri) - [suri.host, s.username, s.password] - end.compact.uniq.sort_by { |h, u, p| h == uri.host ? 0 : 1 } - - username, password = nil, nil - unless accounts.empty? - say "Would you like to use the same account as for a previous source for #{uri}?" - choose do |menu| - accounts.each do |host, olduser, oldpw| - menu.choice("Use the account info for #{olduser}@#{host}") { username, password = olduser, oldpw } - end - menu.choice("Use a new account") { } - menu.prompt = "Account selection? " - end - end - - unless username && password - username = ask("Username for #{uri.host}: "); - password = ask("Password for #{uri.host}: ") { |q| q.echo = false } - puts # why? - end - - [username, password] -end -educate_user if ARGV.member? '--help' - -archive = ARGV.delete "--archive" -unusual = ARGV.delete "--unusual" -force_archive = ARGV.delete "--force-archive" -force_read = ARGV.delete "--force-read" -the_usual = ARGV.delete "--the-usual" -rebuild = ARGV.delete "--rebuild" -force_rebuild = ARGV.delete "--force-rebuild" -overwrite_labels = ARGV.delete "--overwrite-labels" -optimize = ARGV.delete "--optimize" -verbose = ARGV.delete "--verbose" -start_at = # ok really need to use optparse or something now - if(i = ARGV.index("--start-at")) - raise "start-at requires a numeric argument: #{ARGV[i + 1].inspect}" unless ARGV.length > (i + 1) && ARGV[i + 1] =~ /\d/ - ARGV.delete_at i - ARGV.delete_at(i).to_i # whoa! - end +where * is zero or more source URIs or mbox filenames, e.g. +"imaps://my.imapserver.com", or "/var/spool/mail/me". If no sources +are given, imports messages from all sources marked as "usual". 
-if(o = ARGV.find { |x| x =~ /^--/ }) - $stderr.puts "error: unknown option #{o}" - educate_user +Options are: +EOS + opt :archive, "Automatically archive any imported messages." + opt :read, "Automatically mark as read any imported messages." + opt :verbose, "Print message ids as they're processed." + opt :optimize, "As the last stage of the import, optimize the index." + text < :int + opt :overwrite_state, "For --full-rebuild, overwrite the message state to the default state for that source, obeying --archive and --read if given." end +Trollop::die :start_at, "must be non-negative" if (opts[:start_at] || 0) < 0 +Trollop::die :start_at, "requires either --rebuild or --full-rebuild" if opts[:start_at] && !(opts[:rebuild] || opts[:full_rebuild]) +Trollop::die :overwrite_state, "requires --full-rebuild" if opts[:overwrite_state] && !opts[:full_rebuild] +Trollop::die :force_rebuild, "cannot be specified with --rebuild" if opts[:full_rebuild] && opts[:rebuild] -$terminal.wrap_at = :auto Redwood::start index = Redwood::Index.new index.load sources = ARGV.map do |uri| uri = "mbox://#{uri}" unless uri =~ %r!://! - source = index.source_for uri - unless source - source = - case uri - when %r!^mbox\+ssh://! - say "For SSH connections, if you will use public key authentication, you may leave the username and password blank." - say "\n" - username, password = get_login_info uri, index.sources - Redwood::MBox::SSHLoader.new(uri, username, password, nil, !unusual, !!archive) - when %r!^imaps?://! - username, password = get_login_info uri, index.sources - Redwood::IMAP.new(uri, username, password, nil, !unusual, !!archive) - else - Redwood::MBox::Loader.new(uri, nil, !unusual, !!archive) - end - index.add_source source - end - source + index.source_for uri or raise "Unknown source: #{uri}" end -sources = (sources + index.usual_sources).uniq if the_usual -if rebuild || force_rebuild - if start_at - sources.each { |s| s.seek_to! 
start_at } +sources = index.usual_sources if sources.empty? + +if opts[:rebuild] || opts[:full_rebuild] + if opts[:start_at] + sources.each { |s| s.seek_to! opts[:start_at] } else sources.each { |s| s.reset! } end end -found = {} start = Time.now +found = {} begin sources.each do |source| - if source.broken? - $stderr.puts "error loading messages from #{source}: #{source.broken_msg}" - next - end - next if source.done? - puts "loading from #{source}... " num = 0 - start_offset = nil - source.each do |offset, labels| - labels.each { |l| Redwood::LabelManager << l } - - start_offset ||= offset - labels -= [:inbox] if force_archive || archive - labels -= [:unread] if force_read - begin - m = Redwood::Message.new :source => source, :source_info => offset, :labels => labels - if found[m.id] - puts "skipping duplicate message #{m.id}" - next - else - found[m.id] = true - end - - if m.source_marked_read? - m.remove_label :unread - labels -= [:unread] - end - puts "# message at #{offset}, labels: #{labels * ' '}" if verbose - - ## possibly rebuild the message - if (rebuild || force_rebuild) && (docid, entry = index.load_entry_for_id(m.id)) && entry - oldlabels = entry[:label].split(" ").sort - newlabels = labels.map { |x| x.to_s }.sort - - if force_rebuild || entry[:source_info].to_i != offset || (overwrite_labels && (oldlabels != newlabels)) - if overwrite_labels - puts "replacing message #{m.id}: offset #{entry[:source_info]} => #{offset}, labels #{oldlabels * ' '} => #{newlabels * ' '}" - m.labels = newlabels.map { |l| l.intern } - else - puts "replacing message #{m.id}: offset #{entry[:source_info]} => #{offset}" - m.labels = oldlabels - end - num += 1 if index.update_message m, source, offset - end - else - num += 1 if index.add_message m - end - rescue Redwood::MessageFormatError, Redwood::SourceError => e - $stderr.puts "ignoring erroneous message at #{source}##{offset}: #{e.message}" - end + index.add_new_messages_from source do |m, offset, source_labels, entry| + 
found[m.id] = true + m.labels = source_labels if opts[:overwrite_state] + m.labels -= [:inbox] if opts[:archive] + m.labels -= [:unread] if opts[:read] + + num += 1 if num % 1000 == 0 && num > 0 elapsed = Time.now - start pctdone = source.pct_done remaining = (100.0 - pctdone) * (elapsed.to_f / pctdone) puts "## #{num} (#{pctdone}% done) read; #{elapsed.to_time_s} elapsed; est. #{remaining.to_time_s} remaining" end + + ## update if... + if entry.nil? || # it's a new message; or + opts[:full_rebuild] || # we're updating everyone; or + (opts[:rebuild] && (entry[:source_id].to_i != source.id || entry[:source_info].to_i != offset)) # we're updating just the changed ones + puts "# message at #{offset}, labels: #{m.labels * ' '}" if opts[:verbose] + m + else + nil + end end - puts "loaded #{num} messages" unless num == 0 + puts "loaded #{num} messages from #{source}" unless num == 0 end ensure $stderr.puts "saving index and sources..." @@ -228,27 +113,33 @@ ensure Redwood::finish end -if rebuild || force_rebuild +## delete any messages in the index that claim they're from one of +## these sources, but that we didn't see. +## +## kinda crappy code here, because we delve directly into the Ferret +## API. +## +## TODO: move this to Index, i suppose. +if opts[:rebuild] || opts[:full_rebuild] puts "deleting missing messages from the index..." numdel = num = 0 sources.each do |source| raise "no source id for #{source}" unless source.id q = "+source_id:#{source.id}" - q += " +source_info: >= #{start_at}" if start_at - #p q + q += " +source_info: >= #{opts[:start_at]}" if opts[:start_at] num += index.index.search_each(q, :limit => :all) do |docid, score| mid = index.index[docid][:message_id] +# puts "got #{mid}" next if found[mid] puts "deleting #{mid}" index.index.delete docid numdel += 1 end - #p num end puts "deleted #{numdel} / #{num} messages" end -if optimize +if opts[:optimize] puts "optimizing index..." 
optt = time { index.index.optimize } puts "optimized index of size #{index.size} in #{optt}s." diff --git a/doc/FAQ.txt b/doc/FAQ.txt index 6a3b756..7155176 100644 --- a/doc/FAQ.txt +++ b/doc/FAQ.txt @@ -62,8 +62,9 @@ Q: I want to move messages from one source to another. (E.g., my preserving message state? A: Move the messages from the source to the target using whatever tool you'd like. Then (and this is the important part), sup-import - --rescan the target, and THEN the source. If you do it the other - way around, you'll lose any message state. + --rescan both sources at once. If you do it one at a time, you may + lose message state. (Depending, actually, on which order you do it + in. But just do them both at once.) Q: How is Sup possible? A: Sup is only possible through the hard work of Dave Balmain, the diff --git a/doc/TODO b/doc/TODO index c6a81e8..8b2385d 100644 --- a/doc/TODO +++ b/doc/TODO @@ -4,9 +4,6 @@ bugfix: resuming a draft asks before discard bugfix: killed threads bugfix: new messages, drafts sometimes not showing up in inbox bugfix: changing IMAP ids -use trollop to handle sup-devel args -add a flag to sup-import to force the creation of a new source (see http://rubyforge.org/forum/forum.php?thread_id=10973&forum_id=10340) -clean up import code and share between poll.rb and sup-import de-archived messages should be auto-added to inbox maildir undo @@ -32,6 +29,11 @@ pop be able to mark individual messages as spam in thread-view-mode toggle wrapping +done +---- +x add a flag to sup-import to force the creation of a new source (see http://rubyforge.org/forum/forum.php?thread_id=10973&forum_id=10340) +x use trollop to handle sup-devel args +x clean up import code and share between poll.rb and sup-import x on startup, multi-threadedly call #connect on all sources x bugfix: first time viewing a message only gets the first to:; subsequent views get them all (wtf) x search for other messages from author in thread-view-mode diff --git a/lib/sup/imap.rb 
b/lib/sup/imap.rb index 3daa3d4..b3c4b2c 100644 --- a/lib/sup/imap.rb +++ b/lib/sup/imap.rb @@ -25,8 +25,8 @@ require 'time' ## slow for large mailboxes, and we'll just have to hope that there ## are no collisions. ho ho! a perfectly reasonable solution! -## fuck you, imap committee. you managed to design something as shitty -## as mbox but goddamn THIRTY YEARS LATER. +## fuck you, imap committee. you managed to design something nearly as +## shitty as mbox but goddamn THIRTY YEARS LATER. module Redwood class IMAP < Source @@ -64,6 +64,7 @@ class IMAP < Source x.nil? || x.empty? ? 'INBOX' : x end def ssl?; @parsed_uri.scheme == 'imaps' end + def == o; o.is_a?(IMAP) && o.uri == self.uri && o.username == self.username; end def load_header id MBox::read_header StringIO.new(raw_header(id)) diff --git a/lib/sup/index.rb b/lib/sup/index.rb index 24e6cee..cb56efe 100644 --- a/lib/sup/index.rb +++ b/lib/sup/index.rb @@ -3,7 +3,6 @@ require 'thread' require 'fileutils' require 'ferret' -#require_gem 'ferret', ">= 0.10.13" module Redwood @@ -73,24 +72,67 @@ class Index end end - ## update the message by deleting and re-adding - def update_message m, source=nil, source_info=nil - docid, entry = load_entry_for_id m.id - if entry - source ||= entry[:source_id].to_i - source_info ||= entry[:source_info].to_i + ## Update the message state on disk, by deleting and re-adding it. + ## The message must exist in the index. docid and entry are found + ## unless given. + def update_message m, docid=nil, entry=nil + unless docid && entry + docid, entry = load_entry_for_id m.id + raise ArgumentError, "cannot find #{m.id} in the index" unless entry end - ## this happens sometimes. i'm not sure why. ferret bug? 
- raise "no entry and no source info for message #{m.id}: source #{source.inspect}, info #{source_info.inspect}, entry #{entry.inspect}, query #{Ferret::Search::TermQuery.new(:message_id, m.id)}, results #{@index.search(Ferret::Search::TermQuery.new(:message_id, m.id)).inspect}" unless source && source_info + raise "no entry and no source info for message #{m.id}" unless m.source && m.source_info raise "deleting non-corresponding entry #{docid}" unless @index[docid][:message_id] == m.id + @index.delete docid add_message m end + ## for each new message from the source, yields a bunch of stuff, + ## gets the message back from the block, and adds it or updates it. + def add_new_messages_from source + found = {} + return if source.done? || source.broken? + + source.each do |offset, labels| + if source.broken? + Redwood::log "error loading messages from #{source}: #{source.broken_msg}" + return + end + + labels.each { |l| LabelManager << l } + + begin + m = Message.new :source => source, :source_info => offset, :labels => labels + if found[m.id] + puts "skipping duplicate message #{m.id}" + next + else + found[m.id] = true + end + + if m.source_marked_read? + m.remove_label :unread + labels.delete :unread + end + + docid, entry = load_entry_for_id m.id + m = yield m, offset, labels, entry + next unless m + if entry + update_message m, docid, entry + else + add_message m + end + rescue MessageFormatError, SourceError => e + Redwood::log "ignoring erroneous message at #{source}##{offset}: #{e.message}" + end + end + end + def save_index fn=File.join(@dir, "ferret") - # don't have to do anything apparently + # don't have to do anything, apparently end def contains_id? id @@ -203,8 +245,10 @@ class Index def wrap_subj subj; "__START_SUBJECT__ #{subj} __END_SUBJECT__"; end def unwrap_subj subj; subj =~ /__START_SUBJECT__ (.*?) __END_SUBJECT__/ && $1; end + ## Adds a message to the index. The message cannot already exist in + ## the index. 
def add_message m - return false if contains? m + raise ArgumentError, "index already contains #{m.id}" if contains? m source_id = if m.source.is_a? Integer @@ -230,8 +274,8 @@ class Index @index.add_document d - ## TODO: figure out why this is sometimes triggered docid, entry = load_entry_for_id m.id + ## this hasn't been triggered in a long time. TODO: decide whether it's still a problem. raise "just added message #{m.id} but couldn't find it in a search" unless docid true end diff --git a/lib/sup/poll.rb b/lib/sup/poll.rb index 0abe1d3..272126e 100644 --- a/lib/sup/poll.rb +++ b/lib/sup/poll.rb @@ -38,59 +38,23 @@ class PollManager end end - ## TODO: merge this with sup-import def do_poll total_num = total_numi = 0 @mutex.synchronize do found = {} Index.usual_sources.each do |source| - next if source.broken? || source.done? - - yield "Loading from #{source}... " - start_offset = nil + yield "Loading from #{source}... " unless source.done? || source.broken? num = 0 - num_inbox = 0 - - source.each do |offset, labels| - break if source.broken? - start_offset ||= offset - yield "Found message at #{offset} with labels #{labels * ', '}" - - begin - begin - m = Redwood::Message.new :source => source, :source_info => offset, :labels => labels - rescue MessageFormatError => e - yield "Non-fatal error loading message #{source}##{offset}: #{e.message}" - next - end - - if found[m.id] - yield "Skipping duplicate message #{m.id}" - next - end - found[m.id] = true - - if Index.add_message m - UpdateManager.relay :add, m - num += 1 - total_num += 1 - total_numi += 1 if m.labels.include? :inbox - end - - if num % 1000 == 0 && num > 0 - elapsed = Time.now - start - pctdone = source.pct_done - remaining = (100.0 - pctdone) * (elapsed.to_f / pctdone) - yield "## #{num} (#{pctdone}% done) read; #{elapsed.to_time_s} elapsed; est. 
#{remaining.to_time_s} remaining" - end - rescue SourceError => e - msg = "Fatal error loading from #{source}: #{e.message}" - Redwood::log msg - yield msg - break - end + numi = 0 + Index.add_new_messages_from source do |m, offset, source_labels, entry| + yield "Found message at #{offset} with labels #{m.labels * ', '}" + num += 1 + numi += 1 if m.labels.include? :inbox + m end - yield "Found #{num} messages" unless num == 0 + yield "Found #{num} messages, #{numi} to inbox" unless num == 0 + total_num += num + total_numi += numi end yield "Done polling; loaded #{total_num} new messages total"