From: wmorgan Date: Mon, 18 Jun 2007 14:26:59 +0000 (+0000) Subject: encoding and rfc2047 header support X-Git-Url: https://git.cworth.org/git?a=commitdiff_plain;h=81f806e535dd0ee10edbc114482f9a624b276e86;p=sup encoding and rfc2047 header support git-svn-id: svn://rubyforge.org/var/svn/sup/trunk@463 5c8cc53c-5e98-4d25-b20a-d8db53a31250 --- diff --git a/Manifest.txt b/Manifest.txt index 8d90c39..440f0bb 100644 --- a/Manifest.txt +++ b/Manifest.txt @@ -55,6 +55,7 @@ lib/sup/modes/thread-index-mode.rb lib/sup/modes/thread-view-mode.rb lib/sup/person.rb lib/sup/poll.rb +lib/sup/rfc2047.rb lib/sup/sent.rb lib/sup/source.rb lib/sup/suicide.rb diff --git a/Rakefile b/Rakefile index cc4a12a..0ea68f3 100644 --- a/Rakefile +++ b/Rakefile @@ -16,7 +16,7 @@ Hoe.new('sup', Redwood::VERSION) do |p| p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[2].gsub(/^\s+/, "") p.changes = p.paragraphs_of('History.txt', 0..0).join("\n\n") p.email = "wmorgan-sup@masanjin.net" - p.extra_deps = [['ferret', '>= 0.10.13'], ['ncurses', '>= 0.9.1'], ['rmail', '>= 0.17'], 'highline', 'net-ssh', ['trollop', '>= 1.5'], 'lockfile'] + p.extra_deps = [['ferret', '>= 0.10.13'], ['ncurses', '>= 0.9.1'], ['rmail', '>= 0.17'], 'highline', 'net-ssh', ['trollop', '>= 1.5'], 'lockfile', 'iconv'] end rule 'ss?.png' => 'ss?-small.png' do |t| diff --git a/lib/sup.rb b/lib/sup.rb index b7c8f02..3f305ce 100644 --- a/lib/sup.rb +++ b/lib/sup.rb @@ -47,6 +47,16 @@ module Redwood YAML_DOMAIN = "masanjin.net" YAML_DATE = "2006-10-01" +## determine encoding and character set from the locale name, which has the form language[_territory][.codeset][@modifier]; only the codeset part is used +## probably a better way to do this + $ctype = ENV["LC_CTYPE"] || ENV["LANG"] || "en-US.utf-8" + $encoding = + if $ctype =~ /\.([^@]+)/ + $1 + else + "utf-8" + end + ## record exceptions thrown in threads nicely $exception = nil def reporting_thread diff --git a/lib/sup/mbox.rb b/lib/sup/mbox.rb index 894d563..97ddcbb 100644 --- a/lib/sup/mbox.rb +++ b/lib/sup/mbox.rb @@ -1,10 +1,14 @@ require "sup/mbox/loader" require "sup/mbox/ssh-file" 
require "sup/mbox/ssh-loader" +require "sup/rfc2047" module Redwood -## some utility functions +## some utility functions. actually these are not mbox-specific at all +## and should be moved somewhere else. +## +## TODO: move functionality to somewhere better, like message.rb module MBox BREAK_RE = /^From \S+/ @@ -46,6 +50,16 @@ module MBox header[mid_field] = $1 end + header.each do |k, v| + next unless Rfc2047.is_encoded? v + header[k] = + begin + Rfc2047.decode_to $encoding, v + rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::IllegalSequence => e + Redwood::log "warning: error decoding RFC 2047 header: #{e.message}" + v + end + end header end diff --git a/lib/sup/message.rb b/lib/sup/message.rb index 711ab2e..f104127 100644 --- a/lib/sup/message.rb +++ b/lib/sup/message.rb @@ -1,5 +1,6 @@ require 'tempfile' require 'time' +require 'iconv' module Redwood @@ -268,7 +269,21 @@ private else case m.header.content_type when "text/plain", nil + charset = + if m.header.field?("content-type") && m.header.fetch("content-type") =~ /charset="?(.*?)"?(;|$)/i + $1 + end + m.body && body = m.decode or raise MessageFormatError, "For some bizarre reason, RubyMail was unable to parse this message." + + if charset + begin + body = Iconv.iconv($encoding, charset, body).join + rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::IllegalSequence => e + Redwood::log "warning: error decoding message body from #{charset}: #{e.message}" + end + end + text_to_chunks(body.normalize_whitespace.split("\n")) when /^multipart\// [] diff --git a/lib/sup/rfc2047.rb b/lib/sup/rfc2047.rb new file mode 100644 index 0000000..ab006a0 --- /dev/null +++ b/lib/sup/rfc2047.rb @@ -0,0 +1,61 @@ +## from: http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/101949 + +# $Id: rfc2047.rb,v 1.4 2003/04/18 20:55:56 sam Exp $ +# MODIFIED slightly by William Morgan +# +# An implementation of RFC 2047 decoding. 
+# +# This module depends on the iconv library by Nobuyoshi Nakada, which I've +# heard may be distributed as a standard part of Ruby 1.8. Many thanks to him +# for helping with building and using iconv. +# +# Thanks to "Josef 'Jupp' Schugt" for pointing out an error with +# stateful character sets. +# +# Copyright (c) Sam Roberts 2004 +# +# This file is distributed under the same terms as Ruby. + +require 'iconv' + +module Rfc2047 + WORD = %r{=\?([!\#$%&'*+-/0-9A-Z\\^\`a-z{|}~]+)\?([BbQq])\?([!->@-~]+)\?=} # :nodoc: 'stupid ruby-mode + WORDSEQ = %r{(#{WORD.source})\s+(?=#{WORD.source})} + + def Rfc2047.is_encoded? s; s =~ WORD end + + # Decodes a string, +from+, containing RFC 2047 encoded words into a target + # character set, +target+. See iconv_open(3) for information on the + # supported target encodings. If one of the encoded words cannot be + # converted to the target encoding, the error raised by Iconv.iconv is not rescued here and propagates to the caller. + def Rfc2047.decode_to(target, from) + from = from.gsub(WORDSEQ, '\1') + out = from.gsub(WORD) do + |word| + charset, encoding, text = $1, $2, $3 + + # B64 or QP decode, as necessary: + case encoding + when 'b', 'B' + #puts text + text = text.unpack('m*')[0] + #puts text.dump + + when 'q', 'Q' + # RFC 2047 has a variant of quoted printable where a ' ' character + # can be represented as an '_', rather than =20, so convert + # any of these that we find before doing the QP decoding. + text = text.tr("_", " ") + text = text.unpack('M*')[0] + + # Don't need an else, because no other values can be matched in a + # WORD. + end + + # Convert: + # + # Remember - Iconv.open(to, from)! + text = Iconv.iconv(target, charset, text).join + end + end +end