From 6af3b3fc7fa791b4e253078747cd6d146c8cdb6a Mon Sep 17 00:00:00 2001 From: William Morgan Date: Sun, 26 Apr 2009 11:44:37 -0400 Subject: [PATCH] change email header parsing. MASSIVE SPEEDUP! restructure the email header parsing code to improve speed and to improve API. MBox::read_header now returns a hash containing all headers, with keys downcased but nothing else. update Message#parse_header to accept this new format. update unit tests to reflect new format. The exciting news is that changes in how the parsing works have resulted in a 480% speedup in message parsing speed (634 message/s vs 132 messages/s on my machine), across ALL source types. hell yes. --- lib/sup/mbox.rb | 60 +++++++++++++----------------------- lib/sup/message.rb | 65 +++++++++++++++++---------------------- test/test_mbox_parsing.rb | 47 +++++++++++----------------- test/test_message.rb | 2 +- 4 files changed, 69 insertions(+), 105 deletions(-) diff --git a/lib/sup/mbox.rb b/lib/sup/mbox.rb index e9787ca..223bb7c 100644 --- a/lib/sup/mbox.rb +++ b/lib/sup/mbox.rb @@ -10,58 +10,40 @@ module Redwood ## ## TODO: move functionality to somewhere better, like message.rb module MBox - BREAK_RE = /^From \S+/ - HEADER_RE = /\s*(.*?)\s*/ + BREAK_RE = /^From \S+/ ######### TODO REMOVE ME + ## WARNING! THIS IS A SPEED-CRITICAL SECTION. Everything you do here will have + ## a significant effect on Sup's processing speed of email from ALL sources. + ## Little things like string interpolation, regexp interpolation, += vs <<, + ## all have DRAMATIC effects. BE CAREFUL WHAT YOU DO! def read_header f header = {} last = nil - ## i do it in this weird way because i am trying to speed things up - ## when scanning over large mbox files. while(line = f.gets) case line ## these three can occur multiple times, and we want the first one - when /^(Delivered-To):#{HEADER_RE}$/i, - /^(X-Original-To):#{HEADER_RE}$/i, - /^(Envelope-To):#{HEADER_RE}$/i: header[last = $1] ||= $2 - - when /^(From):#{HEADER_RE}$/i, - /^(To):#{HEADER_RE}$/i, - /^(Cc):#{HEADER_RE}$/i, - /^(Bcc):#{HEADER_RE}$/i, - /^(Subject):#{HEADER_RE}$/i, - /^(Date):#{HEADER_RE}$/i, - /^(References):#{HEADER_RE}$/i, - /^(In-Reply-To):#{HEADER_RE}$/i, - /^(Reply-To):#{HEADER_RE}$/i, - /^(List-Post):#{HEADER_RE}$/i, - /^(List-Subscribe):#{HEADER_RE}$/i, - /^(List-Unsubscribe):#{HEADER_RE}$/i, - /^(Status):#{HEADER_RE}$/i, - /^(X-\S+):#{HEADER_RE}$/: header[last = $1] = $2 - when /^(Message-Id):#{HEADER_RE}$/i: header[mid_field = last = $1] = $2 - - when /^\r*$/: break - when /^\S+:/: last = nil # some other header we don't care about + when /^(Delivered-To|X-Original-To|Envelope-To):\s*(.*?)\s*$/i; header[last = $1.downcase] ||= $2 + ## mark this guy specially. not sure why i care. + when /^([^:\s]+):\s*(.*?)\s*$/i; header[last = $1.downcase] = $2 + when /^\r*$/; break else - header[last] += " " + line.chomp.gsub(/^\s+/, "") if last + if last + header[last] << " " unless header[last].empty? + header[last] << line.strip + end end end - if mid_field && header[mid_field] && header[mid_field] =~ /<(.*?)>/ - header[mid_field] = $1 - end - - header.each do |k, v| + %w(subject from to cc bcc).each do |k| + v = header[k] or next next unless Rfc2047.is_encoded? v - header[k] = - begin - Rfc2047.decode_to $encoding, v - rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::IllegalSequence => e - Redwood::log "warning: error decoding RFC 2047 header (#{e.class.name}): #{e.message}" - v - end + header[k] = begin + Rfc2047.decode_to $encoding, v + rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::IllegalSequence => e + Redwood::log "warning: error decoding RFC 2047 header (#{e.class.name}): #{e.message}" + v + end end header end diff --git a/lib/sup/message.rb b/lib/sup/message.rb index 0ee46fb..57187b2 100644 --- a/lib/sup/message.rb +++ b/lib/sup/message.rb @@ -64,45 +64,38 @@ class Message end def parse_header header - header.keys.each { |k| header[k.downcase] = header[k] } # canonicalize - - fakeid = nil - fakename = nil - - @id = - if header["message-id"] - sanitize_message_id header["message-id"] - else - fakeid = "sup-faked-" + Digest::MD5.hexdigest(raw_header) - end + @id = if header["message-id"] + mid = header["message-id"] =~ /<(.+?)>/ ? $1 : header["message-id"] + sanitize_message_id mid + else + id = "sup-faked-" + Digest::MD5.hexdigest(raw_header) + from = header["from"] + #Redwood::log "faking non-existent message-id for message from #{from}: #{id}" + id + end - @from = - if header["from"] - Person.from_address header["from"] - else - fakename = "Sup Auto-generated Fake Sender " - Person.from_address fakename - end - - Redwood::log "faking message-id for message from #@from: #{id}" if fakeid - Redwood::log "faking from for message #@id: #{fakename}" if fakename - - date = header["date"] - @date = - case date - when Time - date - when String - begin - Time.parse date - rescue ArgumentError => e - Redwood::log "faking date header for #{@id} due to error parsing date #{header['date'].inspect}: #{e.message}" - Time.now - end - else - Redwood::log "faking date header for #{@id}" + @from = Person.from_address(if header["from"] + header["from"] + else + name = "Sup Auto-generated Fake Sender " + Redwood::log "faking non-existent sender for message #@id: #{name}" + name + end) + + @date = case(date = header["date"]) + when Time + date + when String + begin + Time.parse date + rescue ArgumentError => e + Redwood::log "faking mangled date header for #{@id} (orig #{header['date'].inspect} gave error: #{e.message})" Time.now end + else + Redwood::log "faking non-existent date header for #{@id}" + Time.now + end @subj = header.member?("subject") ? header["subject"].gsub(/\s+/, " ").gsub(/\s+$/, "") : DEFAULT_SUBJECT @to = Person.from_address_list header["to"] diff --git a/test/test_mbox_parsing.rb b/test/test_mbox_parsing.rb index 32687e5..3486f1b 100644 --- a/test/test_mbox_parsing.rb +++ b/test/test_mbox_parsing.rb @@ -19,18 +19,9 @@ From: Bob To: Sally EOS - assert_equal "Bob ", h["From"] - assert_equal "Sally ", h["To"] - assert_nil h["Message-Id"] - end - - ## this is shitty behavior in retrospect, but it's built in now. - def test_message_id_stripping - h = MBox.read_header StringIO.new("Message-Id: \n") - assert_equal "one@bob.com", h["Message-Id"] - - h = MBox.read_header StringIO.new("Message-Id: one@bob.com\n") - assert_equal "one@bob.com", h["Message-Id"] + assert_equal "Bob ", h["from"] + assert_equal "Sally ", h["to"] + assert_nil h["message-id"] end def test_multiline @@ -39,14 +30,14 @@ From: Bob Subject: one two three four five six To: Sally -References: seven - eight +References: + Seven: Eight EOS - assert_equal "one two three four five six", h["Subject"] - assert_equal "Sally ", h["To"] - assert_equal "seven eight", h["References"] + assert_equal "one two three four five six", h["subject"] + assert_equal "Sally ", h["to"] + assert_equal " ", h["references"] end def test_ignore_spacing @@ -57,26 +48,24 @@ EOS ] variants.each do |s| h = MBox.read_header StringIO.new(s) - assert_equal "one two three end", h["Subject"] + assert_equal "one two three end", h["subject"] end end def test_message_id_ignore_spacing variants = [ "Message-Id: \n", - "Message-Id: one@bob.com \n", "Message-Id: \n", - "Message-Id:one@bob.com \n", ] variants.each do |s| h = MBox.read_header StringIO.new(s) - assert_equal "one@bob.com", h["Message-Id"] + assert_equal "", h["message-id"] end end def test_blank_lines h = MBox.read_header StringIO.new("") - assert_equal nil, h["Message-Id"] + assert_equal nil, h["message-id"] end def test_empty_headers @@ -86,7 +75,7 @@ EOS ] variants.each do |s| h = MBox.read_header StringIO.new(s) - assert_equal "", h["Message-Id"] + assert_equal "", h["message-id"] end end @@ -96,23 +85,23 @@ From: Bob To: a dear friend EOS - assert_equal "Bob ", h["From"] - assert_nil h["To"] + assert_equal "Bob ", h["from"] + assert_nil h["to"] h = MBox.read_header StringIO.new(< \r To: a dear friend EOS - assert_equal "Bob ", h["From"] - assert_nil h["To"] + assert_equal "Bob ", h["from"] + assert_nil h["to"] h = MBox.read_header StringIO.new(< \r\n\r To: a dear friend EOS - assert_equal "Bob ", h["From"] - assert_nil h["To"] + assert_equal "Bob ", h["from"] + assert_nil h["to"] end end diff --git a/test/test_message.rb b/test/test_message.rb index e38ac50..0a7db45 100644 --- a/test/test_message.rb +++ b/test/test_message.rb @@ -511,7 +511,7 @@ EOS # Look at another header field whose first line was blank. list_unsubscribe = sup_message.list_unsubscribe - assert_equal(" , " + + assert_equal(", " + "", list_unsubscribe) -- 2.43.0