#!/usr/bin/env ruby

## sup-sync: walk one or more message sources and add, update or delete
## the corresponding entries in the Sup index.
##
## Overall flow, as implemented below:
##   1. parse and validate command-line options;
##   2. optionally load a label dump (--restore);
##   3. lock and load the index, resolve the sources to scan;
##   4. feed every message of every source through
##      Redwood::PollManager.add_messages_from, deciding per message which
##      labels it should carry;
##   5. for --all / --changed, delete index entries whose message was not
##      seen during the scan;
##   6. save the index and optionally optimize it.

require 'uri'
require 'rubygems'
require 'trollop'
require "sup"

class Float
  ## Render every Float with two decimal places. This is a global override
  ## and is what formats the percentages and timings printed by this script.
  def to_s; sprintf '%.2f', self; end

  ## Format a duration in seconds as H:MM:SS, or "unknown" when the value
  ## is not a finite number. Both infinity (division by a zero percentage)
  ## and NaN (0.0 / 0.0, e.g. a source whose start and end offsets are
  ## equal) must be caught here: Numeric#to_time_s calls to_i, which raises
  ## FloatDomainError for either.
  def to_time_s
    finite? ? super : "unknown"
  end
end

class Numeric
  ## Format a duration in seconds as H:MM:SS (fractions are truncated).
  def to_time_s
    i = to_i
    sprintf "%d:%02d:%02d", i / 3600, (i / 60) % 60, i % 60
  end
end

## Run the given block and return the wall-clock seconds it took.
def time
  startt = Time.now
  yield
  Time.now - startt
end

opts = Trollop::options do
  version "sup-sync (sup #{Redwood::VERSION})"
  ## NOTE(review): the opening of this banner was unreadable in the reviewed
  ## copy of the file. Everything from "where" onwards is original; the
  ## summary sentence and the usage line are a reconstruction -- confirm
  ## against upstream before release.
  banner <<EOS
Synchronizes the Sup index with one or more message sources by adding
messages, deleting messages, or changing message state in the index as
appropriate.

Usage:
  sup-sync [options] <source>*

where <source>* is zero or more source URIs. If no sources are given,
sync from all usual sources. Supported source URI schemes can be seen
by running "sup-add --help".

Options controlling WHICH messages sup-sync operates on:
EOS
  opt :new, "Operate on new messages only. Don't scan over the entire source. (Default.)", :short => :none
  opt :changed, "Scan over the entire source for messages that have been deleted, altered, or moved from another source. (In the case of mbox sources, this includes all messages AFTER an altered message.)"
  opt :restored, "Operate only on those messages included in a dump file as specified by --restore which have changed state."
  opt :all, "Operate on all messages in the source, regardless of newness or changedness."
  opt :start_at, "For --changed and --all, start at a particular offset.", :type => :int

  ## NOTE(review): this section heading and the --asis description were
  ## unreadable in the reviewed copy. The option itself is required by the
  ## `conflicts` line and the `op` selection below; its wording here is
  ## reconstructed from the :asis branch of the label logic -- confirm.
  text <<EOS

Options controlling HOW message state is altered:
EOS
  opt :asis, "If the message is already in the index, preserve its state. Otherwise, use default source state. (Default.)", :short => :none
  opt :restore, "Restore message state from a dump file created with sup-dump. If a message is not in this dumpfile, act as --asis.", :type => String, :short => :none
  opt :discard, "Discard any message state in the index and use the default source state. Dangerous!", :short => :none
  opt :archive, "When using the default source state, mark messages as archived.", :short => "-x"
  opt :read, "When using the default source state, mark messages as read."
  opt :extra_labels, "When using the default source state, also apply these user-defined labels. Should be a comma-separated list.", :type => String, :short => :none

  ## NOTE(review): this section heading and the definitions of --verbose,
  ## --optimize and --all-sources were unreadable in the reviewed copy. The
  ## three options are read further down (opts[:verbose], opts[:optimize],
  ## opts[:all_sources]); their descriptions and short-flag settings are a
  ## reconstruction -- confirm against upstream.
  text <<EOS

Other options:
EOS
  opt :verbose, "Print message ids as they're processed."
  opt :optimize, "As the final operation, optimize the index."
  opt :all_sources, "Scan over all sources.", :short => :none
  opt :dry_run, "Don't actually modify the index. Probably only useful with --verbose.", :short => "-n"
  opt :version, "Show version information", :short => :none

  conflicts :changed, :all, :new, :restored
  conflicts :asis, :restore, :discard
end

## --restored only makes sense when a dump file is supplied.
Trollop::die :restored, "requires --restore" if opts[:restored] && !opts[:restore]
if opts[:start_at]
  Trollop::die :start_at, "must be non-negative" if opts[:start_at] < 0
  Trollop::die :start_at, "requires either --changed or --all" unless opts[:changed] || opts[:all]
end

## which messages to operate on, and how to pick their labels. the first
## flag set wins; each group falls back to its documented default.
target = [:new, :changed, :all, :restored].find { |x| opts[x] } || :new
op = [:asis, :restore, :discard].find { |x| opts[x] } || :asis

Redwood::start
index = Redwood::Index.new

## message id => array of label symbols, read from the dump file. each
## line must look like "<message-id> (<label> <label> ...)". empty when
## --restore was not given.
restored_state =
  if opts[:restore]
    dump = {}
    $stderr.puts "Loading state dump from #{opts[:restore]}..."
    IO.foreach opts[:restore] do |l|
      l =~ /^(\S+) \((.*?)\)$/ or raise "Can't read dump line: #{l.inspect}"
      mid, labels = $1, $2
      dump[mid] = labels.split(" ").map { |x| x.intern }
    end
    $stderr.puts "Read #{dump.size} entries from dump file."
    dump
  else
    {}
  end

## message ids encountered during the scan; consulted by the deletion pass.
seen = {}

index.lock_or_die
begin
  index.load

  sources = ARGV.map do |uri|
    index.source_for uri or Trollop::die "Unknown source: #{uri}. Did you add it with sup-add first?"
  end

  sources = index.usual_sources if sources.empty?
  sources = index.sources if opts[:all_sources]

  ## anything other than --new rescans, either from the requested offset
  ## or from the beginning of each source.
  unless target == :new
    if opts[:start_at]
      sources.each { |s| s.seek_to! opts[:start_at] }
    else
      sources.each { |s| s.reset! }
    end
  end

  sources.each do |source|
    $stderr.puts "Scanning #{source}..."
    num_added = num_updated = num_scanned = num_restored = 0
    last_info_time = start_time = Time.now

    ## the block's value tells the poll manager what to do with the
    ## message: the (possibly relabelled) message to index it, nil to
    ## leave the index alone. `next` therefore skips a message.
    Redwood::PollManager.add_messages_from source do |m, offset, entry|
      num_scanned += 1
      seen[m.id] = true

      ## skip if we're operating only on changed messages, the message
      ## is in the index, and it's unchanged from what the source is
      ## reporting.
      next if target == :changed && entry && entry[:source_id].to_i == source.id && entry[:source_info].to_i == offset

      ## get the state currently in the index (nil if the message is not
      ## indexed yet)
      index_state =
        if entry
          entry[:label].split(/\s+/).map { |x| x.intern }
        else
          nil
        end

      ## skip if we're operating on restored messages, and this one
      ## ain't: either it's absent from the dump, or the dump's labels
      ## already match the index (compared order-insensitively).
      next if target == :restored && (!restored_state[m.id] || (index_state && restored_state[m.id].sort_by { |s| s.to_s } == index_state.sort_by { |s| s.to_s }))

      ## m.labels is the default source labels. tweak these according
      ## to default source state modification flags.
      m.labels -= [:inbox] if opts[:archive]
      m.labels -= [:unread] if opts[:read]
      m.labels += opts[:extra_labels].split(/\s*,\s*/).map { |x| x.intern } if opts[:extra_labels]

      ## assign message labels based on the operation we're performing
      case op
      when :asis
        m.labels = index_state if index_state
      when :restore
        ## if the entry exists on disk
        if restored_state[m.id]
          m.labels = restored_state[m.id]
          num_restored += 1
        elsif index_state
          m.labels = index_state
        end
      when :discard
        ## nothin! use default source labels
      end

      ## progress report, at most once a minute. the remaining-time
      ## estimate is infinite or NaN until some progress has been made;
      ## Float#to_time_s prints "unknown" for those.
      if Time.now - last_info_time > 60
        last_info_time = Time.now
        elapsed = last_info_time - start_time
        pctdone = source.respond_to?(:pct_done) ? source.pct_done : 100.0 * (source.cur_offset.to_f - source.start_offset).to_f / (source.end_offset - source.start_offset).to_f
        remaining = (100.0 - pctdone) * (elapsed.to_f / pctdone)
        $stderr.puts "## #{num_scanned} (#{pctdone}%) read; #{elapsed.to_time_s} elapsed; #{remaining.to_time_s} remaining"
      end

      if index_state.nil?
        puts "Adding message #{source}##{offset} with state {#{m.labels * ', '}}" if opts[:verbose]
        num_added += 1
      else
        puts "Updating message #{source}##{offset}, source #{entry[:source_id]} => #{source.id}, offset #{entry[:source_info]} => #{offset}, state {#{index_state * ', '}} => {#{m.labels * ', '}}" if opts[:verbose]
        num_updated += 1
      end

      opts[:dry_run] ? nil : m
    end

    $stderr.puts "Scanned #{num_scanned}, added #{num_added}, updated #{num_updated} messages from #{source}."
    ## num_restored > 0 implies num_scanned > 0, so the division is safe.
    $stderr.puts "Restored state on #{num_restored} (#{100.0 * num_restored / num_scanned}%) messages." if num_restored > 0
  end

  ## delete any messages in the index that claim they're from one of
  ## these sources, but that we didn't see.
  ##
  ## kinda crappy code here, because we delve directly into the Ferret
  ## API.
  ##
  ## TODO: move this to Index, i suppose.
  if target == :all || target == :changed
    $stderr.puts "Deleting missing messages from the index..."
    num_del, num_scanned = 0, 0
    sources.each do |source|
      raise "no source id for #{source}" unless source.id
      q = "+source_id:#{source.id}"
      ## only consider entries at or past the offset we started scanning
      ## from; earlier ones were never visited, so "unseen" means nothing.
      q += " +source_info: >= #{opts[:start_at]}" if opts[:start_at]
      index.index.search_each(q, :limit => :all) do |docid, score|
        num_scanned += 1
        mid = index.index[docid][:message_id]
        unless seen[mid]
          puts "Deleting #{mid}" if opts[:verbose]
          index.index.delete docid unless opts[:dry_run]
          num_del += 1
        end
      end
    end
    $stderr.puts "Deleted #{num_del} / #{num_scanned} messages"
  end

  index.save

  if opts[:optimize]
    $stderr.puts "Optimizing index..."
    optt = time { index.index.optimize unless opts[:dry_run] }
    $stderr.puts "Optimized index of size #{index.size} in #{optt}s."
  end
rescue Redwood::FatalSourceError => e
  $stderr.puts "Sorry, I couldn't communicate with a source: #{e.message}"
rescue SystemExit
  ## a deliberate exit (presumably what Trollop::die above does) is not a
  ## crash: let it through without leaving an exception log behind.
  raise
rescue Exception => e
  ## deliberately broad: whatever went wrong, record it and then re-raise
  ## so nothing is swallowed. the log carries the error itself as well as
  ## the backtrace, so it is useful on its own.
  File.open("sup-exception-log.txt", "w") do |f|
    f.puts "#{e.class}: #{e.message}"
    f.puts e.backtrace
  end
  raise
ensure
  ## always shut Sup down and release the index lock, even on failure.
  Redwood::finish
  index.unlock
end