* Author: Carl Worth <cworth@cworth.org>
*/
+/* This indexer creates a Xapian mail index that is remarkably similar
+ * to that created by sup. The big difference, (and the thing that
+ * will keep a notmuch index from being used by sup directly), is that
+ * sup expects a serialized ruby data structure in the document's data
+ * field, but notmuch just puts the mail's filename there (trusting
+ * that the email client can get the data it needs from the filename).
+ *
+ * Note: One bug here is that sup actually merges together fields such
+ * as To, CC, Bcc etc. when finding multiple emails with the same
+ * message ID. To support something similar, notmuch should list
+ * multiple files in the data field.
+ *
+ * Other differences between sup and notmuch-index identified so far:
+ *
+ * o sup supports encrypted mime parts by prompting for a passphrase
+ * to decrypt the message. So far, notmuch doesn't support this,
+ * both because I'm too lazy to code it, and I also think doing so
+ * would present a security leak.
+ *
+ * o sup and notmuch have different heuristics for identifying (and
+ * thus ignoring) signatures. For example, sup considers a line
+ * consisting of two hyphens as a signature separator, while
+ * notmuch expects those two hyphens to be followed by a space
+ * character.
+ *
+ * o sup has been seen to split some numbers before indexing
+ * them. For example, the number 1754 in an email message was
+ * indexed by sup as separate terms 17 and 54. I couldn't find any
+ * explanation for this behavior and did not try to replicate it
+ * in notmuch.
+ */
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <time.h>
+#include <sys/time.h>
#include <iostream>
int i;
InternetAddress *address;
+ if (addresses == NULL)
+ return;
+
for (i = 0; i < internet_address_list_length (addresses); i++) {
address = internet_address_list_get_address (addresses, i);
gen_terms_address_name (term_gen, address, address_type);
int i;
InternetAddress *address;
+ if (addresses == NULL)
+ return;
+
for (i = 0; i < internet_address_list_length (addresses); i++) {
address = internet_address_list_get_address (addresses, i);
add_term_address_addr (doc, address, address_type);
{
const char *s = subject;
+ if (subject == NULL)
+ return NULL;
+
while (*s) {
while (*s && isspace (*s))
s++;
stream = g_mime_stream_mem_new_with_byte_array (byte_array);
g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);
wrapper = g_mime_part_get_content_object (GMIME_PART (part));
- g_mime_data_wrapper_write_to_stream (wrapper, stream);
+ if (wrapper)
+ g_mime_data_wrapper_write_to_stream (wrapper, stream);
g_object_unref (stream);
GIOChannel *channel;
GIOStatus gio_status;
GError *error = NULL;
+ int count;
+ struct timeval tv_start, tv_now;
if (argc < 2) {
usage (argv[0]);
channel = g_io_channel_unix_new (fileno (stdin));
+ count = 0;
+
+ gettimeofday (&tv_start, NULL);
+
while (1) {
gio_status = g_io_channel_read_line (channel, &filename,
NULL, NULL, &error);
index_file (db, term_gen, filename);
g_free (filename);
+
+ count++;
+ if (count % 250 == 0) {
+ gettimeofday (&tv_now, NULL);
+ printf ("Indexed %d messages (%g messages/second)\n",
+ count, count / ((tv_now.tv_sec - tv_start.tv_sec) +
+ (tv_now.tv_usec - tv_start.tv_usec) / 1e6));
+ }
}
} catch (const Xapian::Error &error) {