X-Git-Url: https://git.cworth.org/git?a=blobdiff_plain;ds=sidebyside;f=lib%2Fsup%2Fmbox%2Floader.rb;h=a11bf9541e6efed049754cf85442c6cdbd714c8b;hb=4efc8adaf8c63ff054c71149de6c12fe8194ff03;hp=51af5f26742e22312979cf46ba05497c3c5d216d;hpb=d9cb9b4e7f2b1414578d5d8d56951173906e4507;p=sup diff --git a/lib/sup/mbox/loader.rb b/lib/sup/mbox/loader.rb index 51af5f2..a11bf95 100644 --- a/lib/sup/mbox/loader.rb +++ b/lib/sup/mbox/loader.rb @@ -1,116 +1,179 @@ -require 'thread' require 'rmail' +require 'uri' +require 'set' module Redwood module MBox -class Error < StandardError; end - -class Loader - attr_reader :filename - bool_reader :usual, :archived, :read, :dirty - attr_accessor :id, :labels - - ## end_offset is the last offsets within the file which we've read. - ## everything after that is considered new messages that haven't - ## been indexed. - def initialize filename, end_offset=0, usual=true, archived=false, id=nil - @filename = filename.gsub(%r(^mbox://), "") - @end_offset = end_offset - @dirty = false - @usual = usual - @archived = archived - @id = id +class Loader < Source + include SerializeLabelsNicely + yaml_properties :uri, :cur_offset, :usual, :archived, :id, :labels + + attr_reader :labels + + ## uri_or_fp is horrific. need to refactor. + def initialize uri_or_fp, start_offset=0, usual=true, archived=false, id=nil, labels=nil @mutex = Mutex.new - @f = File.open @filename - @labels = ([ - :unread, - archived ? nil : :inbox, - ] + - if File.dirname(filename) =~ /\b(var|usr|spool)\b/ - [] - else - [File.basename(filename).intern] - end).compact + @labels = Set.new((labels || []) - LabelManager::RESERVED_LABELS) + + case uri_or_fp + when String + uri = URI(Source.expand_filesystem_uri(uri_or_fp)) + raise ArgumentError, "not an mbox uri" unless uri.scheme == "mbox" + raise ArgumentError, "mbox URI ('#{uri}') cannot have a host: #{uri.host}" if uri.host + raise ArgumentError, "mbox URI must have a path component" unless uri.path + @f = File.open uri.path + @path = uri.path + else + @f = uri_or_fp + @path = uri_or_fp.path + end + + super uri_or_fp, start_offset, usual, archived, id end - def reset!; @end_offset = 0; @dirty = true; end - def == o; o.is_a?(Loader) && o.filename == filename; end - def to_s; "mbox://#{@filename}"; end + def file_path; @path end + def is_source_for? uri; super || (self.uri.is_a?(String) && (URI(Source.expand_filesystem_uri(uri)) == URI(Source.expand_filesystem_uri(self.uri)))) end - def is_source_for? s - @filename == s || self.to_s == s + def self.suggest_labels_for path + ## heuristic: use the filename as a label, unless the file + ## has a path that probably represents an inbox. + if File.dirname(path) =~ /\b(var|usr|spool)\b/ + [] + else + [File.basename(path).downcase.intern] + end + end + + def check + if (cur_offset ||= start_offset) > end_offset + raise OutOfSyncSourceError, "mbox file is smaller than last recorded message offset. Messages have probably been deleted by another client." + end end - def load_header offset=nil + def start_offset; 0; end + def end_offset; File.size @f; end + + def load_header offset header = nil @mutex.synchronize do - @f.seek offset if offset - header = MBox::read_header @f + @f.seek offset + l = @f.gets + unless MBox::is_break_line? l + raise OutOfSyncSourceError, "mismatch in mbox file offset #{offset.inspect}: #{l.inspect}." + end + header = parse_raw_email_header @f end header end def load_message offset - ret = nil @mutex.synchronize do @f.seek offset - RMail::Mailbox::MBoxReader.new(@f).each_message do |input| - return RMail::Parser.read(input) + begin + ## don't use RMail::Mailbox::MBoxReader because it doesn't properly ignore + ## "From" at the start of a message body line. + string = "" + l = @f.gets + string << l until @f.eof? || MBox::is_break_line?(l = @f.gets) + RMail::Parser.read string + rescue RMail::Parser::Error => e + raise FatalSourceError, "error parsing mbox file: #{e.message}" + end + end + end + + ## scan forward until we're at the valid start of a message + def correct_offset! + @mutex.synchronize do + @f.seek cur_offset + string = "" + until @f.eof? || MBox::is_break_line?(l = @f.gets) + string << l end + self.cur_offset += string.length end end - ## load the full header text - def load_header_text offset + def raw_header offset ret = "" @mutex.synchronize do @f.seek offset - until @f.eof? || (l = @f.gets) =~ /^$/ - ret += l + until @f.eof? || (l = @f.gets) =~ /^\r*$/ + ret << l end end ret end - def next - return nil if done? - @dirty = true - next_end_offset = @end_offset + def raw_message offset + ret = "" + each_raw_message_line(offset) { |l| ret << l } + ret + end - @mutex.synchronize do - @f.seek @end_offset + def store_message date, from_email, &block + need_blank = File.exists?(@filename) && !File.zero?(@filename) + File.open(@filename, "a") do |f| + f.puts if need_blank + f.puts "From #{from_email} #{date.utc}" + yield f + end + end - @f.gets # skip the From separator - next_end_offset = @f.tell - while(line = @f.gets) - break if line =~ BREAK_RE - next_end_offset = @f.tell + ## apparently it's a million times faster to call this directly if + ## we're just moving messages around on disk, than reading things + ## into memory with raw_message. + ## + ## i hoped never to have to move shit around on disk but + ## sup-sync-back has to do it. + def each_raw_message_line offset + @mutex.synchronize do + @f.seek offset + yield @f.gets + until @f.eof? || MBox::is_break_line?(l = @f.gets) + yield l end end - - start_offset = @end_offset - @end_offset = next_end_offset - - start_offset end - def each - until @end_offset >= File.size(@f) - n = self.next - yield(n, labels) if n + def next + returned_offset = nil + next_offset = cur_offset + + begin + @mutex.synchronize do + @f.seek cur_offset + + ## cur_offset could be at one of two places here: + + ## 1. before a \n and a mbox separator, if it was previously at + ## EOF and a new message was added; or, + ## 2. at the beginning of an mbox separator (in all other + ## cases). + + l = @f.gets or return nil + if l =~ /^\s*$/ # case 1 + returned_offset = @f.tell + @f.gets # now we're at a BREAK_RE, so skip past it + else # case 2 + returned_offset = cur_offset + ## we've already skipped past the BREAK_RE, so just go + end + + while(line = @f.gets) + break if MBox::is_break_line? line + next_offset = @f.tell + end + end + rescue SystemCallError, IOError => e + raise FatalSourceError, "Error reading #{@f.path}: #{e.message}" end - end - def each_header - each { |offset, labels| yield offset, labels, load_header(offset) } + self.cur_offset = next_offset + [returned_offset, (labels + [:unread])] end - - def done?; @end_offset >= File.size(@f); end - def total; File.size @f; end end -Redwood::register_yaml(Loader, %w(filename end_offset usual archived id)) - end end