git.cworth.org Git - obsolete/notmuch-old/blob - lib/index.cc

   1 /*
   2  * Copyright © 2009 Carl Worth
   3  *
   4  * This program is free software: you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation, either version 3 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program.  If not, see http://www.gnu.org/licenses/ .
  16  *
  17  * Author: Carl Worth <cworth@cworth.org>
  18  */
  19
  20 #include "notmuch-private.h"
  21
  22 #include <gmime/gmime.h>
  23 #include <gmime/gmime-filter.h>
  24
  25 #include <xapian.h>
  26
/* Oh, how I wish that gobject didn't require so much noisy boilerplate!
 * (Though I have at least eliminated some of the stock set...) */

/* Instance and class type names for the uuencode-discarding filter.
 * The structs behind them are defined further down in this file. */
typedef struct _NotmuchFilterDiscardUuencode NotmuchFilterDiscardUuencode;
typedef struct _NotmuchFilterDiscardUuencodeClass NotmuchFilterDiscardUuencodeClass;
  31
/**
 * NotmuchFilterDiscardUuencode:
 *
 * @parent_object: parent #GMimeFilter
 * @state: State of the parser
 *
 * A filter to discard uuencoded portions of an email.
 *
 * A uuencoded portion is identified as beginning with a line
 * matching:
 *
 *      begin [0-7][0-7][0-7] .*
 *
 * After that detection, and beginning with the following line,
 * characters will be discarded as long as each line begins with M
 * and subsequent characters on the line are within the range of
 * ASCII characters from ' ' to '`'.
 *
 * This is not a perfect UUencode filter. It's possible to have a
 * message that will legitimately match that pattern, (so that some
 * legitimate content is discarded). And for most UUencoded files, the
 * final line of encoded data (the line not starting with M) will be
 * indexed.
 **/
struct _NotmuchFilterDiscardUuencode {
    GMimeFilter parent_object;
    /* Index of the current row in the state table of filter_filter(). */
    int state;
};
  61
/* Class structure for NotmuchFilterDiscardUuencode. It adds no members
 * of its own to the parent GMimeFilterClass; the virtual methods are
 * filled in by notmuch_filter_discard_uuencode_class_init(). */
struct _NotmuchFilterDiscardUuencodeClass {
    GMimeFilterClass parent_class;
};
  65
/* Constructor for the filter (not static, so visible to other files). */
GMimeFilter *notmuch_filter_discard_uuencode_new (void);

/* GObject finalizer, installed by the class initializer. */
static void notmuch_filter_discard_uuencode_finalize (GObject *object);

/* GMimeFilter virtual methods, installed by the class initializer. */
static GMimeFilter *filter_copy (GMimeFilter *filter);
static void filter_filter (GMimeFilter *filter, char *in, size_t len, size_t prespace,
                           char **out, size_t *outlen, size_t *outprespace);
static void filter_complete (GMimeFilter *filter, char *in, size_t len, size_t prespace,
                             char **out, size_t *outlen, size_t *outprespace);
static void filter_reset (GMimeFilter *filter);


/* Parent class, looked up in the class initializer so that the
 * finalizer can chain up to it. */
static GMimeFilterClass *parent_class = NULL;
  79
  80 static void
  81 notmuch_filter_discard_uuencode_class_init (NotmuchFilterDiscardUuencodeClass *klass)
  82 {
  83     GObjectClass *object_class = G_OBJECT_CLASS (klass);
  84     GMimeFilterClass *filter_class = GMIME_FILTER_CLASS (klass);
  85
  86     parent_class = (GMimeFilterClass *) g_type_class_ref (GMIME_TYPE_FILTER);
  87
  88     object_class->finalize = notmuch_filter_discard_uuencode_finalize;
  89
  90     filter_class->copy = filter_copy;
  91     filter_class->filter = filter_filter;
  92     filter_class->complete = filter_complete;
  93     filter_class->reset = filter_reset;
  94 }
  95
  96 static void
  97 notmuch_filter_discard_uuencode_finalize (GObject *object)
  98 {
  99     G_OBJECT_CLASS (parent_class)->finalize (object);
 100 }
 101
 102 static GMimeFilter *
 103 filter_copy (GMimeFilter *gmime_filter)
 104 {
 105     (void) gmime_filter;
 106     return notmuch_filter_discard_uuencode_new ();
 107 }
 108
 109 static void
 110 filter_filter (GMimeFilter *gmime_filter, char *inbuf, size_t inlen, size_t prespace,
 111                char **outbuf, size_t *outlen, size_t *outprespace)
 112 {
 113     NotmuchFilterDiscardUuencode *filter = (NotmuchFilterDiscardUuencode *) gmime_filter;
 114     register const char *inptr = inbuf;
 115     const char *inend = inbuf + inlen;
 116     char *outptr;
 117
 118     (void) prespace;
 119
 120     /* Simple, linear state-transition diagram for our filter.
 121      *
 122      * If the character being processed is within the range of [a, b]
 123      * for the current state then we transition next_if_match
 124      * state. If not, we transition to the next_if_not_match state.
 125      *
 126      * The final two states are special in that they are the states in
 127      * which we discard data. */
 128     static const struct {
 129         int state;
 130         int a;
 131         int b;
 132         int next_if_match;
 133         int next_if_not_match;
 134     } states[] = {
 135         {0,  'b',  'b',  1,  0},
 136         {1,  'e',  'e',  2,  0},
 137         {2,  'g',  'g',  3,  0},
 138         {3,  'i',  'i',  4,  0},
 139         {4,  'n',  'n',  5,  0},
 140         {5,  ' ',  ' ',  6,  0},
 141         {6,  '0',  '7',  7,  0},
 142         {7,  '0',  '7',  8,  0},
 143         {8,  '0',  '7',  9,  0},
 144         {9,  ' ',  ' ',  10, 0},
 145         {10, '\n', '\n', 11, 10},
 146         {11, 'M',  'M',  12, 0},
 147         {12, ' ',  '`',  12, 11}
 148     };
 149     int next;
 150
 151     g_mime_filter_set_size (gmime_filter, inlen, FALSE);
 152     outptr = gmime_filter->outbuf;
 153
 154     while (inptr < inend) {
 155         if (*inptr >= states[filter->state].a &&
 156             *inptr <= states[filter->state].b)
 157         {
 158             next = states[filter->state].next_if_match;
 159         }
 160         else
 161         {
 162             next = states[filter->state].next_if_not_match;
 163         }
 164
 165         if (filter->state < 11)
 166             *outptr++ = *inptr;
 167
 168         filter->state = next;
 169         inptr++;
 170     }
 171
 172     *outlen = outptr - gmime_filter->outbuf;
 173     *outprespace = gmime_filter->outpre;
 174     *outbuf = gmime_filter->outbuf;
 175 }
 176
 177 static void
 178 filter_complete (GMimeFilter *filter, char *inbuf, size_t inlen, size_t prespace,
 179                  char **outbuf, size_t *outlen, size_t *outprespace)
 180 {
 181     if (inbuf && inlen)
 182         filter_filter (filter, inbuf, inlen, prespace, outbuf, outlen, outprespace);
 183 }
 184
 185 static void
 186 filter_reset (GMimeFilter *gmime_filter)
 187 {
 188     NotmuchFilterDiscardUuencode *filter = (NotmuchFilterDiscardUuencode *) gmime_filter;
 189
 190     filter->state = 0;
 191 }
 192
 193 /**
 194  * notmuch_filter_discard_uuencode_new:
 195  *
 196  * Returns: a new #NotmuchFilterDiscardUuencode filter.
 197  **/
 198 GMimeFilter *
 199 notmuch_filter_discard_uuencode_new (void)
 200 {
 201     static GType type = 0;
 202     NotmuchFilterDiscardUuencode *filter;
 203
 204     if (!type) {
 205         static const GTypeInfo info = {
 206             sizeof (NotmuchFilterDiscardUuencodeClass),
 207             NULL, /* base_class_init */
 208             NULL, /* base_class_finalize */
 209             (GClassInitFunc) notmuch_filter_discard_uuencode_class_init,
 210             NULL, /* class_finalize */
 211             NULL, /* class_data */
 212             sizeof (NotmuchFilterDiscardUuencode),
 213             0,    /* n_preallocs */
 214             NULL, /* instance_init */
 215             NULL  /* value_table */
 216         };
 217
 218         type = g_type_register_static (GMIME_TYPE_FILTER, "NotmuchFilterDiscardUuencode", &info, (GTypeFlags) 0);
 219     }
 220
 221     filter = (NotmuchFilterDiscardUuencode *) g_object_newv (type, 0, NULL);
 222     filter->state = 0;
 223
 224     return (GMimeFilter *) filter;
 225 }
 226
 227 /* We're finally down to a single (NAME + address) email "mailbox". */
 228 static void
 229 _index_address_mailbox (notmuch_message_t *message,
 230                         const char *prefix_name,
 231                         InternetAddress *address)
 232 {
 233     InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address);
 234     const char *name, *addr;
 235     void *local = talloc_new (message);
 236
 237     name = internet_address_get_name (address);
 238     addr = internet_address_mailbox_get_addr (mailbox);
 239
 240     /* In the absence of a name, we'll strip the part before the @
 241      * from the address. */
 242     if (! name) {
 243         const char *at;
 244
 245         at = strchr (addr, '@');
 246         if (at)
 247             name = talloc_strndup (local, addr, at - addr);
 248     }
 249
 250     if (name)
 251         _notmuch_message_gen_terms (message, prefix_name, name);
 252     if (addr)
 253         _notmuch_message_gen_terms (message, prefix_name, addr);
 254
 255     talloc_free (local);
 256 }
 257
/* Forward declaration: _index_address_group() and _index_address_list()
 * call each other, so one of them must be declared before its use. */
static void
_index_address_list (notmuch_message_t *message,
                     const char *prefix_name,
                     InternetAddressList *addresses);
 262
 263 /* The outer loop over the InternetAddressList wasn't quite enough.
 264  * There can actually be a tree here where a single member of the list
 265  * is a "group" containing another list. Recurse please.
 266  */
 267 static void
 268 _index_address_group (notmuch_message_t *message,
 269                       const char *prefix_name,
 270                       InternetAddress *address)
 271 {
 272     InternetAddressGroup *group;
 273     InternetAddressList *list;
 274
 275     group = INTERNET_ADDRESS_GROUP (address);
 276     list = internet_address_group_get_members (group);
 277
 278     if (! list)
 279         return;
 280
 281     _index_address_list (message, prefix_name, list);
 282 }
 283
 284 static void
 285 _index_address_list (notmuch_message_t *message,
 286                      const char *prefix_name,
 287                      InternetAddressList *addresses)
 288 {
 289     int i;
 290     InternetAddress *address;
 291
 292     if (addresses == NULL)
 293         return;
 294
 295     for (i = 0; i < internet_address_list_length (addresses); i++) {
 296         address = internet_address_list_get_address (addresses, i);
 297         if (INTERNET_ADDRESS_IS_MAILBOX (address)) {
 298             _index_address_mailbox (message, prefix_name, address);
 299         } else if (INTERNET_ADDRESS_IS_GROUP (address)) {
 300             _index_address_group (message, prefix_name, address);
 301         } else {
 302             INTERNAL_ERROR ("GMime InternetAddress is neither a mailbox nor a group.\n");
 303         }
 304     }
 305 }
 306
 307 static const char *
 308 skip_re_in_subject (const char *subject)
 309 {
 310     const char *s = subject;
 311
 312     if (subject == NULL)
 313         return NULL;
 314
 315     while (*s) {
 316         while (*s && isspace (*s))
 317             s++;
 318         if (strncasecmp (s, "re:", 3) == 0)
 319             s += 3;
 320         else
 321             break;
 322     }
 323
 324     return s;
 325 }
 326
 327 /* Callback to generate terms for each mime part of a message. */
 328 static void
 329 _index_mime_part (notmuch_message_t *message,
 330                   GMimeObject *part)
 331 {
 332     GMimeStream *stream, *filter;
 333     GMimeFilter *discard_uuencode_filter;
 334     GMimeDataWrapper *wrapper;
 335     GByteArray *byte_array;
 336     GMimeContentDisposition *disposition;
 337     char *body;
 338
 339     if (GMIME_IS_MULTIPART (part)) {
 340         GMimeMultipart *multipart = GMIME_MULTIPART (part);
 341         int i;
 342
 343         for (i = 0; i < g_mime_multipart_get_count (multipart); i++) {
 344             if (GMIME_IS_MULTIPART_SIGNED (multipart)) {
 345                 /* Don't index the signature. */
 346                 if (i == 1)
 347                     continue;
 348                 if (i > 1)
 349                     fprintf (stderr, "Warning: Unexpected extra parts of multipart/signed. Indexing anyway.\n");
 350             }
 351             _index_mime_part (message,
 352                               g_mime_multipart_get_part (multipart, i));
 353         }
 354         return;
 355     }
 356
 357     if (GMIME_IS_MESSAGE_PART (part)) {
 358         GMimeMessage *mime_message;
 359
 360         mime_message = g_mime_message_part_get_message (GMIME_MESSAGE_PART (part));
 361
 362         _index_mime_part (message, g_mime_message_get_mime_part (mime_message));
 363
 364         return;
 365     }
 366
 367     if (! (GMIME_IS_PART (part))) {
 368         fprintf (stderr, "Warning: Not indexing unknown mime part: %s.\n",
 369                  g_type_name (G_OBJECT_TYPE (part)));
 370         return;
 371     }
 372
 373     disposition = g_mime_object_get_content_disposition (part);
 374     if (disposition &&
 375         strcmp (disposition->disposition, GMIME_DISPOSITION_ATTACHMENT) == 0)
 376     {
 377         const char *filename = g_mime_part_get_filename (GMIME_PART (part));
 378
 379         _notmuch_message_add_term (message, "tag", "attachment");
 380         _notmuch_message_gen_terms (message, "attachment", filename);
 381
 382         /* XXX: Would be nice to call out to something here to parse
 383          * the attachment into text and then index that. */
 384         return;
 385     }
 386
 387     byte_array = g_byte_array_new ();
 388
 389     stream = g_mime_stream_mem_new_with_byte_array (byte_array);
 390     g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);
 391
 392     filter = g_mime_stream_filter_new (stream);
 393     discard_uuencode_filter = notmuch_filter_discard_uuencode_new ();
 394
 395     g_mime_stream_filter_add (GMIME_STREAM_FILTER (filter),
 396                               discard_uuencode_filter);
 397
 398     wrapper = g_mime_part_get_content_object (GMIME_PART (part));
 399     if (wrapper)
 400         g_mime_data_wrapper_write_to_stream (wrapper, filter);
 401
 402     g_object_unref (stream);
 403     g_object_unref (filter);
 404     g_object_unref (discard_uuencode_filter);
 405
 406     g_byte_array_append (byte_array, (guint8 *) "\0", 1);
 407     body = (char *) g_byte_array_free (byte_array, FALSE);
 408
 409     if (body) {
 410         _notmuch_message_gen_terms (message, NULL, body);
 411
 412         free (body);
 413     }
 414 }
 415
 416 notmuch_status_t
 417 _notmuch_message_index_file (notmuch_message_t *message,
 418                              const char *filename)
 419 {
 420     GMimeStream *stream = NULL;
 421     GMimeParser *parser = NULL;
 422     GMimeMessage *mime_message = NULL;
 423     InternetAddressList *addresses;
 424     FILE *file = NULL;
 425     const char *from, *subject;
 426     notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
 427     static int initialized = 0;
 428
 429     if (! initialized) {
 430         g_mime_init (0);
 431         initialized = 1;
 432     }
 433
 434     file = fopen (filename, "r");
 435     if (! file) {
 436         fprintf (stderr, "Error opening %s: %s\n", filename, strerror (errno));
 437         ret = NOTMUCH_STATUS_FILE_ERROR;
 438         goto DONE;
 439     }
 440
 441     /* Evil GMime steals my FILE* here so I won't fclose it. */
 442     stream = g_mime_stream_file_new (file);
 443
 444     parser = g_mime_parser_new_with_stream (stream);
 445
 446     mime_message = g_mime_parser_construct_message (parser);
 447
 448     from = g_mime_message_get_sender (mime_message);
 449     addresses = internet_address_list_parse_string (from);
 450
 451     _index_address_list (message, "from", addresses);
 452
 453     addresses = g_mime_message_get_all_recipients (mime_message);
 454     _index_address_list (message, "to", addresses);
 455
 456     subject = g_mime_message_get_subject (mime_message);
 457     subject = skip_re_in_subject (subject);
 458     _notmuch_message_gen_terms (message, "subject", subject);
 459
 460     _index_mime_part (message, g_mime_message_get_mime_part (mime_message));
 461
 462   DONE:
 463     if (mime_message)
 464         g_object_unref (mime_message);
 465
 466     if (parser)
 467         g_object_unref (parser);
 468
 469     if (stream)
 470         g_object_unref (stream);
 471
 472     return ret;
 473 }