From: William Morgan Date: Tue, 2 Jun 2009 14:39:18 +0000 (-0700) Subject: refactor iconv stuff, and normalize message body and headers X-Git-Url: https://git.cworth.org/git?a=commitdiff_plain;h=5cb77d7f09872013df43d9e96cdd4078c21e0aa1;p=sup refactor iconv stuff, and normalize message body and headers - Message body and headers are now normalized to the current encoding. This should limit the amount of unprintable characters on the screen. - No more MessageFormatError (was never being thrown anyways). - Move exception handling to Iconv.easy_decode - No more Message#convert_from, since Iconv.easy_decode does it all now. --- diff --git a/bin/sup-recover-sources b/bin/sup-recover-sources index af39b7d..d3b1424 100755 --- a/bin/sup-recover-sources +++ b/bin/sup-recover-sources @@ -72,18 +72,14 @@ ARGV.each do |fn| source_ids = {} count = 0 source.each do |offset, labels| - begin - m = Redwood::Message.new :source => source, :source_info => offset - docid, entry = index.load_entry_for_id m.id - next unless entry - #puts "# #{source} #{offset} #{entry[:source_id]}" - - source_ids[entry[:source_id]] = (source_ids[entry[:source_id]] || 0) + 1 - count += 1 - break if count == $opts[:scan_num] - rescue Redwood::MessageFormatError => e - puts "# #{e.message}" - end + m = Redwood::Message.new :source => source, :source_info => offset + docid, entry = index.load_entry_for_id m.id + next unless entry + #puts "# #{source} #{offset} #{entry[:source_id]}" + + source_ids[entry[:source_id]] = (source_ids[entry[:source_id]] || 0) + 1 + count += 1 + break if count == $opts[:scan_num] end if source_ids.size == 1 diff --git a/lib/sup/message-chunks.rb b/lib/sup/message-chunks.rb index 1bf7796..865dbf8 100644 --- a/lib/sup/message-chunks.rb +++ b/lib/sup/message-chunks.rb @@ -99,7 +99,7 @@ EOS text = case @content_type when /^text\/plain\b/ - Message.convert_from @raw_content, encoded_content.charset + Iconv.easy_decode $encoding, encoded_content.charset, @raw_content else HookManager.run "mime-decode", :content_type => content_type, :filename => lambda { write_to_disk }, diff --git a/lib/sup/message.rb b/lib/sup/message.rb index 6dd1f7d..5993729 100644 --- a/lib/sup/message.rb +++ b/lib/sup/message.rb @@ -1,10 +1,7 @@ require 'time' -require 'iconv' module Redwood -class MessageFormatError < StandardError; end - ## a Message is what's threaded. ## ## it is also where the parsing for quotes and signatures is done, but @@ -64,6 +61,13 @@ class Message end def parse_header header + ## forcibly decode these headers from and to the current encoding, + ## which serves to strip out characters that aren't displayable + ## (and which would otherwise be screwing up the display) + %w(from to subject cc bcc).each do |f| + header[f] = Iconv.easy_decode($encoding, $encoding, header[f]) if header[f] + end + @id = if header["message-id"] mid = header["message-id"] =~ /<(.+?)>/ ? $1 : header["message-id"] sanitize_message_id mid @@ -73,7 +77,7 @@ class Message #Redwood::log "faking non-existent message-id for message from #{from}: #{id}" id end - + @from = Person.from_address(if header["from"] header["from"] else @@ -204,7 +208,7 @@ class Message ## so i will keep this. parse_header @source.load_header(@source_info) message_to_chunks @source.load_message(@source_info) - rescue SourceError, SocketError, MessageFormatError => e + rescue SourceError, SocketError => e Redwood::log "problem getting messages from #{@source}: #{e.message}" ## we need force_to_top here otherwise this window will cover ## up the error message one @@ -421,24 +425,15 @@ private ## otherwise, it's body text else - body = Message.convert_from m.decode, m.charset if m.body + ## if there's no charset, use the current encoding as the charset. + ## this ensures that the body is normalized to avoid non-displayable + ## characters + body = Iconv.easy_decode($encoding, m.charset || $encoding, m.decode) if m.body text_to_chunks((body || "").normalize_whitespace.split("\n"), encrypted) end end end - def self.convert_from body, charset - begin - raise MessageFormatError, "RubyMail decode returned a null body" unless body - return body unless charset - Iconv.easy_decode($encoding, charset, body) - rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::IllegalSequence, MessageFormatError => e - Redwood::log "warning: error (#{e.class.name}) decoding message body from #{charset}: #{e.message}" - File.open(File.join(BASE_DIR,"unable-to-decode.txt"), "w") { |f| f.write body } - body - end - end - ## parse the lines of text into chunk objects. the heuristics here ## need tweaking in some nice manner. TODO: move these heuristics ## into the classes themselves. diff --git a/lib/sup/poll.rb b/lib/sup/poll.rb index fb4abb2..354bd21 100644 --- a/lib/sup/poll.rb +++ b/lib/sup/poll.rb @@ -137,33 +137,29 @@ EOS def add_messages_from source, opts={} begin return if source.done? || source.has_errors? - + source.each do |offset, labels| if source.has_errors? Redwood::log "error loading messages from #{source}: #{source.error.message}" return end - + labels.each { |l| LabelManager << l } labels = labels + (source.archived? ? [] : [:inbox]) - begin - m = Message.new :source => source, :source_info => offset, :labels => labels - m.load_from_source! - - if m.source_marked_read? - m.remove_label :unread - labels.delete :unread - end + m = Message.new :source => source, :source_info => offset, :labels => labels + m.load_from_source! - docid, entry = Index.load_entry_for_id m.id - HookManager.run "before-add-message", :message => m - m = yield(m, offset, entry) or next if block_given? - times = Index.sync_message m, false, docid, entry, opts - UpdateManager.relay self, :added, m unless entry - rescue MessageFormatError => e - Redwood::log "ignoring erroneous message at #{source}##{offset}: #{e.message}" + if m.source_marked_read? + m.remove_label :unread + labels.delete :unread end + + docid, entry = Index.load_entry_for_id m.id + HookManager.run "before-add-message", :message => m + m = yield(m, offset, entry) or next if block_given? + times = Index.sync_message m, false, docid, entry, opts + UpdateManager.relay self, :added, m unless entry end rescue SourceError => e Redwood::log "problem getting messages from #{source}: #{e.message}" diff --git a/lib/sup/util.rb b/lib/sup/util.rb index b479908..049a304 100644 --- a/lib/sup/util.rb +++ b/lib/sup/util.rb @@ -628,16 +628,21 @@ class Iconv def self.easy_decode target, charset, text return text if charset =~ /^(x-unknown|unknown[-_ ]?8bit|ascii[-_ ]?7[-_ ]?bit)$/i charset = case charset - when /UTF[-_ ]?8/i: "utf-8" - when /(iso[-_ ])?latin[-_ ]?1$/i: "ISO-8859-1" - when /iso[-_ ]?8859[-_ ]?15/i: 'ISO-8859-15' - when /unicode[-_ ]1[-_ ]1[-_ ]utf[-_]7/i: "utf-7" - else charset - end - - # Convert: - # - # Remember - Iconv.open(to, from)! - Iconv.iconv(target + "//IGNORE", charset, text + " ").join[0 .. -2] + when /UTF[-_ ]?8/i: "utf-8" + when /(iso[-_ ])?latin[-_ ]?1$/i: "ISO-8859-1" + when /iso[-_ ]?8859[-_ ]?15/i: 'ISO-8859-15' + when /unicode[-_ ]1[-_ ]1[-_ ]utf[-_]7/i: "utf-7" + else charset + end + + begin + Iconv.iconv(target + "//IGNORE", charset, text + " ").join[0 .. -2] + rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::IllegalSequence => e + Redwood::log "warning: error (#{e.class.name}) decoding text from #{charset} to #{target}: #{text[0 ... 20]}" + text + end end + + ## normalize a string to be in the current encoding ($encoding) + def self.normalize s; easy_decode $encoding, $encoding, s end end