module Redwood
-## some utility functions. actually these are not mbox-specific at all
-## and should be moved somewhere else.
-##
-## TODO: move functionality to somewhere better, like message.rb
module MBox
- BREAK_RE = /^From \S+@\S+ /
+ ## Candidate mbox separator line: "From ", a run of non-whitespace
+ ## characters, a single space, and then the rest of the line, captured as
+ ## group 1. is_break_line? goes on to check that the captured text is a date.
+ BREAK_RE = /^From \S+ (.+)$/
- HEADER_RE = /\s*(.*?)\s*/
+
+ ## Decides whether the line l is an mbox message separator.
+ ##
+ ## Returns false if l does not match BREAK_RE. Otherwise the text captured
+ ## after the sender is handed to Time.parse: if it parses, the result is
+ ## true; if it does not, a message is logged and the result is false, so
+ ## body lines that merely start with "From " do not split a message.
+ def is_break_line? l
+ l =~ BREAK_RE or return false
+ time = $1
+ begin
+ ## hack -- make Time.parse fail when trying to substitute values from Time.now
+ Time.parse time, 0
+ true
+ ## NoMethodError is the failure provoked by the hack above (0 cannot supply
+ ## the missing date fields). ArgumentError is what Time.parse raises when
+ ## the text contains no time information at all. Both mean "not a date".
+ rescue NoMethodError, ArgumentError
+ Redwood::log "found invalid date in potential mbox split line, not splitting: #{l.inspect}"
+ false
+ end
+ end
+ module_function :is_break_line?
-
- def read_header f
- header = {}
- last = nil
-
- ## i do it in this weird way because i am trying to speed things up
- ## when scanning over large mbox files.
- while(line = f.gets)
- case line
- ## these three can occur multiple times, and we want the first one
- when /^(Delivered-To):#{HEADER_RE}$/i,
- /^(X-Original-To):#{HEADER_RE}$/i,
- /^(Envelope-To):#{HEADER_RE}$/i: header[last = $1] ||= $2
-
- when /^(From):#{HEADER_RE}$/i,
- /^(To):#{HEADER_RE}$/i,
- /^(Cc):#{HEADER_RE}$/i,
- /^(Bcc):#{HEADER_RE}$/i,
- /^(Subject):#{HEADER_RE}$/i,
- /^(Date):#{HEADER_RE}$/i,
- /^(References):#{HEADER_RE}$/i,
- /^(In-Reply-To):#{HEADER_RE}$/i,
- /^(Reply-To):#{HEADER_RE}$/i,
- /^(List-Post):#{HEADER_RE}$/i,
- /^(List-Subscribe):#{HEADER_RE}$/i,
- /^(List-Unsubscribe):#{HEADER_RE}$/i,
- /^(Status):#{HEADER_RE}$/i,
- /^(X-\S+):#{HEADER_RE}$/: header[last = $1] = $2
- when /^(Message-Id):#{HEADER_RE}$/i: header[mid_field = last = $1] = $2
-
- when /^\r*$/: break
- when /^\S+:/: last = nil # some other header we don't care about
- else
- header[last] += " " + line.chomp.gsub(/^\s+/, "") if last
- end
- end
-
- if mid_field && header[mid_field] && header[mid_field] =~ /<(.*?)>/
- header[mid_field] = $1
- end
-
- header.each do |k, v|
- next unless Rfc2047.is_encoded? v
- header[k] =
- begin
- Rfc2047.decode_to $encoding, v
- rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::IllegalSequence => e
- Redwood::log "warning: error decoding RFC 2047 header (#{e.class.name}): #{e.message}"
- v
- end
- end
- header
- end
-
- ## never actually called
- def read_body f
- body = []
- f.each_line do |l|
- break if is_break_line?(l)
- body << l.chomp
- end
- body
- end
-
- module_function :read_header, :read_body
end
end
--- /dev/null
+#!/usr/bin/ruby
+
+require 'test/unit'
+require 'sup'
+require 'stringio'
+
+include Redwood
+
+## Tests for raw e-mail header parsing (Source.parse_raw_email_header) and
+## for splitting an mbox stream into messages (MBox::Loader).
+class TestMBoxParsing < Test::Unit::TestCase
+ def setup
+ end
+
+ def teardown
+ end
+
+ ## Parsed headers are looked up by lower-case name; a header that is not in
+ ## the input comes back as nil.
+ def test_normal_headers
+ h = Source.parse_raw_email_header StringIO.new(<<EOS)
+From: Bob <bob@bob.com>
+To: Sally <sally@sally.com>
+EOS
+
+ assert_equal "Bob <bob@bob.com>", h["from"]
+ assert_equal "Sally <sally@sally.com>", h["to"]
+ assert_nil h["message-id"]
+ end
+
+ ## A continuation line is folded into the value of the header above it,
+ ## joined with a single space, and does not disturb the headers that follow.
+ def test_multiline
+ h = Source.parse_raw_email_header StringIO.new(<<EOS)
+From: Bob <bob@bob.com>
+Subject: one two three
+ four five six
+To: Sally <sally@sally.com>
+References: <seven>
+ <eight>
+Seven: Eight
+EOS
+
+ assert_equal "one two three four five six", h["subject"]
+ assert_equal "Sally <sally@sally.com>", h["to"]
+ assert_equal "<seven> <eight>", h["references"]
+ end
+
+ ## Whitespace between the colon and the value, and after the value, is not
+ ## part of the parsed value.
+ def test_ignore_spacing
+ variants = [
+ "Subject:one two three end\n",
+ "Subject: one two three end\n",
+ "Subject: one two three end \n",
+ ]
+ variants.each do |s|
+ h = Source.parse_raw_email_header StringIO.new(s)
+ assert_equal "one two three end", h["subject"]
+ end
+ end
+
+ ## Same spacing rule for Message-Id; the expected value keeps its angle
+ ## brackets.
+ def test_message_id_ignore_spacing
+ variants = [
+ "Message-Id: <one@bob.com> \n",
+ "Message-Id:<one@bob.com> \n",
+ ]
+ variants.each do |s|
+ h = Source.parse_raw_email_header StringIO.new(s)
+ assert_equal "<one@bob.com>", h["message-id"]
+ end
+ end
+
+ ## Empty input yields no message-id.
+ def test_blank_lines
+ h = Source.parse_raw_email_header StringIO.new("")
+ ## assert_nil, to match the other nil checks in this class
+ assert_nil h["message-id"]
+ end
+
+ ## A header that is present but has no value parses to the empty string,
+ ## not nil.
+ def test_empty_headers
+ variants = [
+ "Message-Id: \n",
+ "Message-Id:\n",
+ ]
+ variants.each do |s|
+ h = Source.parse_raw_email_header StringIO.new(s)
+ assert_equal "", h["message-id"]
+ end
+ end
+
+ ## The header section ends at the first blank line -- whether that line is
+ ## empty, a lone "\r", or "\r\n\r" -- so a header-looking line after it is
+ ## not parsed.
+ def test_detect_end_of_headers
+ h = Source.parse_raw_email_header StringIO.new(<<EOS)
+From: Bob <bob@bob.com>
+
+To: a dear friend
+EOS
+ assert_equal "Bob <bob@bob.com>", h["from"]
+ assert_nil h["to"]
+
+ h = Source.parse_raw_email_header StringIO.new(<<EOS)
+From: Bob <bob@bob.com>
+\r
+To: a dear friend
+EOS
+ assert_equal "Bob <bob@bob.com>", h["from"]
+ assert_nil h["to"]
+
+ h = Source.parse_raw_email_header StringIO.new(<<EOS)
+From: Bob <bob@bob.com>
+\r\n\r
+To: a dear friend
+EOS
+ assert_equal "Bob <bob@bob.com>", h["from"]
+ assert_nil h["to"]
+ end
++
++ ## Only the first line of this input is a real separator. The later body
++ ## lines that start with "From " carry no valid date, so the input must be
++ ## read as a single message at offset 0, after which the loader reports a
++ ## nil offset.
++ def test_from_line_splitting
++ l = MBox::Loader.new StringIO.new(<<EOS)
++From sup-talk-bounces@rubyforge.org Mon Apr 27 12:56:18 2009
++From: Bob <bob@bob.com>
++To: a dear friend
++
++Hello there friend. How are you?
++
++From sea to shining sea
++
++From bob@bob.com I get only spam.
++
++From bob@bob.com
++
++From bob@bob.com
++
++(that second one has spaces at the endj
++
++This is the end of the email.
++EOS
++ offset, labels = l.next
++ assert_equal 0, offset
++ offset, labels = l.next
++ assert_nil offset
++ end
++
++ ## Two separator lines with valid dates give two messages; the third call
++ ## reports a nil offset.
++ def test_more_from_line_splitting
++ l = MBox::Loader.new StringIO.new(<<EOS)
++From sup-talk-bounces@rubyforge.org Mon Apr 27 12:56:18 2009
++From: Bob <bob@bob.com>
++To: a dear friend
++
++Hello there friend. How are you?
++
++From bob@bob.com Mon Apr 27 12:56:19 2009
++From: Bob <bob@bob.com>
++To: a dear friend
++
++Hello again! Would you like to buy my products?
++EOS
++ offset, labels = l.next
++ assert_not_nil offset
++
++ offset, labels = l.next
++ assert_not_nil offset
++
++ offset, labels = l.next
++ assert_nil offset
++ end
+end