2 * Copyright © 2009 Carl Worth
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see http://www.gnu.org/licenses/ .
17 * Author: Carl Worth <cworth@cworth.org>
28 #include <gmime/gmime.h>
// Element count of a true array: total byte size divided by the size of its
// first element. NOTE(review): only meaningful for real arrays (not pointers),
// and `arr` is not parenthesised inside `arr[0]`, so pass a plain array name.
34 #define ARRAY_SIZE(arr) (sizeof (arr) / sizeof (arr[0]))
36 /* These prefix values are specifically chosen to be compatible
37 * with sup, (http://sup.rubyforge.org), written by
38 * William Morgan <wmorgan-sup@masanjin.net>, and released
39 * under the GNU GPL v2.
// Table of { name, term prefix } pairs for the normal prefixes, e.g.
// from_name maps to FN. find_prefix searches this table first.
47 prefix_t NORMAL_PREFIX[] = {
50 { "from_name", "FN" },
// Table of { name, term prefix } pairs for the boolean prefixes, e.g.
// from_email maps to FE. find_prefix searches this table after NORMAL_PREFIX.
56 prefix_t BOOLEAN_PREFIX[] = {
58 { "from_email", "FE" },
64 { "attachment_extension", "O" },
70 /* Similarly, these value numbers are also chosen to be sup
// Value numbers that index_file passes to doc.add_value for the message id,
// the thread and the date of a message, in that order.
74 NOTMUCH_VALUE_MESSAGE_ID = 0,
75 NOTMUCH_VALUE_THREAD = 1,
76 NOTMUCH_VALUE_DATE = 2
// Look up the term prefix registered for `name`.
// NORMAL_PREFIX is searched before BOOLEAN_PREFIX, and the prefix of the
// first entry whose name matches exactly (strcmp) is returned.
// NOTE(review): what is returned when `name` is in neither table is not
// visible in this section -- confirm before passing unregistered names.
80 find_prefix (const char *name)
// Pass 1: linear scan of the normal prefixes.
84 for (i = 0; i < ARRAY_SIZE (NORMAL_PREFIX); i++)
85 if (strcmp (name, NORMAL_PREFIX[i].name) == 0)
86 return NORMAL_PREFIX[i].prefix;
// Pass 2: linear scan of the boolean prefixes.
88 for (i = 0; i < ARRAY_SIZE (BOOLEAN_PREFIX); i++)
89 if (strcmp (name, BOOLEAN_PREFIX[i].name) == 0)
90 return BOOLEAN_PREFIX[i].prefix;
// Global int initialised to 0. NOTE(review): nothing in this section reads or
// writes it -- confirm its purpose against the rest of the file.
95 int TERM_COMBINED = 0;
// Build the term text for a value under the prefix registered for
// `prefix_name`. NOTE(review): the call that attaches the term to `doc` is
// not visible in this section -- presumably it follows the formatting below.
98 add_term (Xapian::Document doc,
99 const char *prefix_name,
// Resolve the symbolic prefix name to its term prefix string.
108 prefix = find_prefix (prefix_name);
// The term text is the prefix immediately followed by the value. The string
// is heap-allocated by g_strdup_printf.
// NOTE(review): confirm it is released with g_free after use.
110 term = g_strdup_printf ("%s%s", prefix, value);
// Index a piece of free text through the term generator, using the prefix
// registered for `prefix_name`.
118 gen_terms (Xapian::TermGenerator term_gen,
119 const char *prefix_name,
// Resolve the symbolic prefix name to its term prefix string.
127 prefix = find_prefix (prefix_name);
// The second argument (1) is the within-document-frequency increment applied
// to each generated term.
129 term_gen.index_text (text, 1, prefix);
// Index the display name of a single address under `prefix_name`.
133 gen_terms_address_name (Xapian::TermGenerator term_gen,
134 InternetAddress *address,
135 const char *prefix_name)
// NOTE(review): an address may have no display name, in which case this is
// presumably NULL -- confirm it is guarded before being indexed.
139 name = internet_address_get_name (address);
142 gen_terms (term_gen, prefix_name, name);
// Index the display name of every address in `addresses`.
// Each name is indexed three times: under the caller-supplied `address_type`
// prefix, under the generic "name" prefix, and under "body".
146 gen_terms_address_names (Xapian::TermGenerator term_gen,
147 InternetAddressList *addresses,
148 const char *address_type)
151 InternetAddress *address;
// The list length is re-queried on every iteration.
153 for (i = 0; i < internet_address_list_length (addresses); i++) {
154 address = internet_address_list_get_address (addresses, i);
155 gen_terms_address_name (term_gen, address, address_type);
156 gen_terms_address_name (term_gen, address, "name");
157 gen_terms_address_name (term_gen, address, "body");
// Add the mailbox address of a single InternetAddress as a term under
// `prefix_name`.
162 add_term_address_addr (Xapian::Document doc,
163 InternetAddress *address,
164 const char *prefix_name)
// NOTE(review): the address is cast to a mailbox unconditionally; presumably
// callers never pass group addresses -- confirm.
166 InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address);
169 addr = internet_address_mailbox_get_addr (mailbox);
172 add_term (doc, prefix_name, addr);
// Add address terms for every entry in `addresses`.
// Each address is added twice: under the caller-supplied `address_type`
// prefix and under the generic "email" prefix.
176 add_terms_address_addrs (Xapian::Document doc,
177 InternetAddressList *addresses,
178 const char *address_type)
181 InternetAddress *address;
// The list length is re-queried on every iteration.
183 for (i = 0; i < internet_address_list_length (addresses); i++) {
184 address = internet_address_list_get_address (addresses, i);
185 add_term_address_addr (doc, address, address_type);
186 add_term_address_addr (doc, address, "email");
// Scan the start of `subject` for whitespace and a case-insensitive "re:"
// marker. index_file indexes the pointer this returns, so presumably the
// result points past any such leading markers -- the return statement is not
// visible in this section.
// NOTE(review): `subject` is dereferenced without a visible NULL check.
191 skip_re_in_subject (const char *subject)
193 const char *s = subject;
// NOTE(review): isspace takes an int that must be representable as unsigned
// char (or EOF); passing a plain char is undefined for bytes >= 0x80 where
// char is signed. Consider casting to unsigned char.
196 while (*s && isspace (*s))
// Compare exactly three characters, ignoring case.
198 if (strncasecmp (s, "re:", 3) == 0)
207 /* Add a term for each message-id in the References header of the
// Parameters: `doc` receives the "ref" terms; `message` supplies the header.
210 add_terms_references (Xapian::Document doc,
211 GMimeMessage *message)
213 const char *refs, *end, *next;
// Raw header text. NOTE(review): presumably NULL when the message has no
// References header -- confirm it is guarded before being scanned.
216 refs = g_mime_object_get_header (GMIME_OBJECT (message), "references");
// Skip whitespace in front of the next id.
222 while (*refs && isspace (*refs))
// Advance `end` to the next whitespace character or the end of the string.
227 while (*end && !isspace (*end))
// A closing angle bracket at `end` is detected here; presumably it is then
// excluded from the id (the statement is not visible in this section).
231 if (end > refs && *end == '>')
// Copy the inclusive range from `refs` to `end` and add it as a "ref" term.
// NOTE(review): g_strndup allocates; confirm the copy is released with g_free.
234 term = g_strndup (refs, end - refs + 1);
235 add_term (doc, "ref", term);
242 /* Generate terms for the body of a message, given the filename of the
243 * message and the offset at which the headers of the message end,
244 * (and hence the body begins). */
// Reads the message file line by line starting at `body_offset` and indexes
// the lines under the "body" prefix, skipping quoted lines and signature
// separators. Errors are reported on stderr.
246 gen_terms_body (Xapian::TermGenerator term_gen,
247 const char * filename,
251 GIOStatus gio_status;
252 GError *error = NULL;
253 char *p, *body_line = NULL, *prev_line = NULL;
// Open the message file for reading; a failure is described by `error`.
255 channel = g_io_channel_new_file (filename, "r", &error);
256 if (channel == NULL) {
257 fprintf (stderr, "Error: %s\n", error->message);
// Move to the first byte after the headers.
261 gio_status = g_io_channel_seek_position (channel, body_offset,
263 if (gio_status != G_IO_STATUS_NORMAL) {
// NOTE(review): error->message is only valid if the seek call was given
// &error and set it; the trailing arguments of that call are not visible
// in this section -- confirm.
264 fprintf (stderr, "Error: %s\n", error->message);
// Fetch the next line of the body into body_line.
272 gio_status = g_io_channel_read_line (channel, &body_line,
274 if (gio_status == G_IO_STATUS_EOF)
276 if (gio_status != G_IO_STATUS_NORMAL) {
277 fprintf (stderr, "Error: %s\n", error->message);
// Zero-length lines are special-cased.
281 if (strlen (body_line) == 0)
284 /* If the line looks like it might be introducing a quote,
285 * save it until we see if the next line begins a quote. */
// p starts on the last character and walks back over trailing whitespace.
// `and` is the C++ alternative spelling of the && operator.
// NOTE(review): isspace on a plain char is undefined for bytes >= 0x80 where
// char is signed; consider casting to unsigned char.
286 p = body_line + strlen (body_line) - 1;
287 while (p > body_line and isspace (*p))
290 prev_line = body_line;
295 /* Skip quoted lines, (and previous lines that introduced them) */
296 if (body_line[0] == '>') {
304 /* Now that we're not looking at a quote we can add the prev_line */
306 gen_terms (term_gen, "body", prev_line);
311 /* Skip signatures */
312 /* XXX: Should only do this if "near" the end of the message. */
313 if (strncmp (body_line, "-- ", 3) == 0 ||
314 strncmp (body_line, "----------", 10) == 0 ||
315 strncmp (body_line, "__________", 10) == 0)
318 gen_terms (term_gen, "body", body_line);
// NOTE(review): g_io_channel_close is deprecated in GLib in favour of
// g_io_channel_shutdown; no g_io_channel_unref is visible in this section, so
// confirm the channel itself is released.
324 g_io_channel_close (channel);
// Parse the mail message stored in `filename` and add one document for it to
// `db`. The document data is the filename; terms are generated from the
// sender, recipients, subject, body and References header, plus fixed
// label/type/source terms, and the message id, thread and date are stored as
// document values.
328 index_file (Xapian::WritableDatabase db,
329 Xapian::TermGenerator term_gen,
330 const char *filename)
332 Xapian::Document doc;
336 GMimeMessage *message;
337 InternetAddressList *addresses;
341 const char *value, *from;
344 struct tm gm_time_tm;
345 char date_str[16]; /* YYYYMMDDHHMMSS + 1 for Y100k compatibility ;-) */
// Open the file and hand it to GMime: FILE -> stream -> parser -> message.
347 file = fopen (filename, "r");
349 fprintf (stderr, "Error opening %s: %s\n", filename, strerror (errno));
353 stream = g_mime_stream_file_new (file);
355 parser = g_mime_parser_new_with_stream (stream);
// NOTE(review): no NULL check on the parsed message is visible in this
// section -- confirm how unparsable files are handled.
357 message = g_mime_parser_construct_message (parser);
// Start a fresh document whose data payload is the path of the message file.
359 doc = Xapian::Document ();
361 doc.set_data (filename);
// The English stemmer is (re)installed on every call.
363 term_gen.set_stemmer (Xapian::Stem ("english"));
365 term_gen.set_document (doc);
// Sender display names.
// NOTE(review): the parsed list is presumably owned by the caller and no
// release is visible in this section -- confirm it is not leaked.
367 from = g_mime_message_get_sender (message);
368 addresses = internet_address_list_parse_string (from);
370 gen_terms_address_names (term_gen, addresses, "from_name");
// Recipient display names.
372 addresses = g_mime_message_get_all_recipients (message);
373 gen_terms_address_names (term_gen, addresses, "to_name");
// Subject, with leading reply markers skipped, indexed under both "subject"
// and "body". NOTE(review): the subject is passed on without a visible NULL
// check.
375 value = g_mime_message_get_subject (message);
376 value = skip_re_in_subject (value);
377 gen_terms (term_gen, "subject", value);
378 gen_terms (term_gen, "body", value);
// Body text, starting at the offset where the parser saw the headers end.
380 gen_terms_body (term_gen, filename,
381 g_mime_parser_get_headers_end (parser));
383 add_terms_references (doc, message);
// Sender addresses; the sender string is fetched and parsed a second time.
385 from = g_mime_message_get_sender (message);
386 addresses = internet_address_list_parse_string (from);
388 add_terms_address_addrs (doc, addresses, "from_email");
390 add_terms_address_addrs (doc,
391 g_mime_message_get_all_recipients (message),
// Message date, converted to UTC and formatted as YYYYMMDDHHMMSS.
394 g_mime_message_get_date (message, &time, NULL);
396 gmtime_r (&time, &gm_time_tm);
// strftime returns 0 when the result does not fit in date_str.
398 if (strftime (date_str, sizeof (date_str),
399 "%Y%m%d%H%M%S", &gm_time_tm) == 0) {
400 fprintf (stderr, "Internal error formatting time\n");
404 add_term (doc, "date", date_str);
// Fixed terms given to every indexed message.
406 add_term (doc, "label", "inbox");
407 add_term (doc, "label", "unread");
408 add_term (doc, "type", "mail");
409 add_term (doc, "source_id", "1");
// The message id is used both as the id term and as the thread term.
// NOTE(review): no NULL check on the message id is visible here.
411 value = g_mime_message_get_message_id (message);
412 add_term (doc, "msgid", value);
413 add_term (doc, "thread", value);
// Store the same id in the message-id and thread value slots.
415 doc.add_value (NOTMUCH_VALUE_MESSAGE_ID, value);
416 doc.add_value (NOTMUCH_VALUE_THREAD, value);
// The date is stored in sortable serialised form.
418 doc.add_value (NOTMUCH_VALUE_DATE, Xapian::sortable_serialise (time));
420 db.add_document (doc);
// Drop the GMime references taken above.
422 g_object_unref (message);
423 g_object_unref (parser);
424 g_object_unref (stream);
// Print a short usage message to stderr.
//   argv0: program name as invoked; interpolated into the first line.
// The message spelled stdin as "stdnin" and its final line had no trailing
// newline, so the shell prompt ended up on the same line; both are fixed.
428 usage (const char *argv0)
430 fprintf (stderr, "Usage: %s <path-to-xapian-database>\n", argv0);
431 fprintf (stderr, "\n");
432 fprintf (stderr, "Messages to be indexed are read from stdin as absolute filenames\n");
433 fprintf (stderr, "one file per line.\n");
// Program entry point: opens (or creates) the Xapian database named by the
// first command-line argument, then reads filenames from stdin one line at a
// time and indexes each one with index_file. Xapian exceptions are reported
// on cerr.
437 main (int argc, char **argv)
439 const char *database_path;
442 GIOStatus gio_status;
443 GError *error = NULL;
// NOTE(review): the argument-count check is not visible in this section --
// confirm argv[1] is only read when it exists.
450 database_path = argv[1];
455 Xapian::WritableDatabase db;
456 Xapian::TermGenerator term_gen;
// Open the database for writing, creating it if it does not exist yet.
458 db = Xapian::WritableDatabase (database_path,
459 Xapian::DB_CREATE_OR_OPEN);
461 term_gen = Xapian::TermGenerator ();
// Wrap the stdin file descriptor in a GIOChannel for line-based reads.
463 channel = g_io_channel_unix_new (fileno (stdin));
// NOTE(review): the line buffer is presumably allocated by the read; confirm
// it is released with g_free after each iteration.
466 gio_status = g_io_channel_read_line (channel, &filename,
468 if (gio_status == G_IO_STATUS_EOF)
470 if (gio_status != G_IO_STATUS_NORMAL) {
471 fprintf (stderr, "An error occurred reading from stdin: %s\n",
// Strip trailing whitespace, including the line terminator, in place.
476 g_strchomp (filename);
477 index_file (db, term_gen, filename);
// Inside this handler `error` names the Xapian exception and shadows the
// GError pointer declared at the top of the function.
482 } catch (const Xapian::Error &error) {
483 cerr << "A Xapian exception occurred: " << error.get_msg () << endl;