*
* And for each document ID:
*
- * All terms
- * All values
- *
- * Things not yet dumped include:
- *
- * Data associated with a document.
+ * Document data
+ * All document terms
+ * All document values
*/
#include <cstdlib>
#include <iostream>
+#include <algorithm>
#include <xapian.h>
using namespace std;
+vector<int> UNSERIALIZE;
+
+unsigned int MAX_TERMS = 0;
+
+/* Print S on stdout as a double-quoted C string literal.
+ *
+ * A backslash is written before each double quote and each backslash;
+ * newline, carriage return and tab are written as \n, \r and \t; any
+ * other byte below 0x20, and 0x7f, is written as a three-digit octal
+ * escape.  Every other byte is copied through unchanged.
+ */
+static void
+print_escaped_string (const char *s)
+{
+    printf ("\"");
+
+    for ( ; *s; s++) {
+        /* Go through unsigned char so bytes >= 0x80 are not negative. */
+        const unsigned char c = (unsigned char) *s;
+
+        switch (c) {
+        case '"':
+        case '\\':
+            printf ("\\%c", c);
+            break;
+        case '\n':
+            printf ("\\n");
+            break;
+        case '\r':
+            printf ("\\r");
+            break;
+        case '\t':
+            printf ("\\t");
+            break;
+        default:
+            /* Always three octal digits, so a digit that follows in S
+             * can never be absorbed into the escape sequence. */
+            if (c < 0x20 || c == 0x7f)
+                printf ("\\%03o", (unsigned int) c);
+            else
+                printf ("%c", c);
+            break;
+        }
+    }
+
+    printf ("\"");
+}
+
+/* Print the terms of DOC as a brace-enclosed list of escaped strings,
+ * one per line and each followed by a comma, then pad the list with
+ * empty strings until MAX_TERMS entries have been printed.
+ */
static void
print_document_terms (Xapian::Document doc)
{
- Xapian::TermIterator i;
+ Xapian::TermIterator it;
+ unsigned int i;
+
+ printf (" {\n");
+
+ /* i counts the terms printed so the padding loop below can resume from it. */
+ for (it = doc.termlist_begin (), i = 0;
+ it != doc.termlist_end ();
+ it++, i++)
+ {
+ printf (" ");
+ print_escaped_string ((*it).c_str());
+ printf (",\n");
+ }
- printf ("Terms:\n");
+ /* Emit "" for each remaining slot up to MAX_TERMS. */
+ for ( ; i < MAX_TERMS; i++)
+ printf (" \"\",\n");
- for (i = doc.termlist_begin (); i != doc.termlist_end (); i++)
- cout << "\t" << *i << endl;
+ printf (" },\n");
+}
+
+/* Return non-zero if I is an element of V, and 0 otherwise.
+ *
+ * V is taken by const reference so that the vector is not copied on
+ * every call.
+ */
+static int
+vector_int_contains (const vector<int> &v, int i)
+{
+    return find (v.begin (), v.end (), i) != v.end ();
}
+/* Print each value of DOC on its own line, followed by a comma.
+ *
+ * A value whose number appears in UNSERIALIZE is decoded with
+ * Xapian::sortable_unserialise and printed as an integer when the
+ * decoded number is unchanged by conversion to int, otherwise as a
+ * quoted "%f" string.  Every other value is printed as an escaped
+ * string.
+ */
static void
print_document_values (Xapian::Document doc)
{
Xapian::ValueIterator i;
+ int value_no, value_int;
+ double value_float;
+
+ for (i = doc.values_begin (); i != doc.values_end (); i++) {
+ value_no = i.get_valueno();
+
+ printf (" ");
+
+ if (vector_int_contains (UNSERIALIZE, value_no)) {
+ value_float = Xapian::sortable_unserialise (*i);
+ /* NOTE(review): converting a double that does not fit in int is
+ * undefined behaviour -- confirm the stored values always fit. */
+ value_int = value_float;
+ /* Equal after the round trip means the number is integral. */
+ if (value_int == value_float)
+ printf ("%d", value_int);
+ else
+ printf ("\"%f\"", value_float);
+ } else {
+ print_escaped_string ((*i).c_str ());
+ }
- printf ("Values:\n");
+ printf (",\n");
+ }
- for (i = doc.values_begin (); i != doc.values_end (); i++)
- cout << "\t" << i.get_valueno() << ": " << *i << endl;
}
static void
{
Xapian::Document doc;
- printf ("Document %u:\n", id);
+ printf ("{\n");
doc = db.get_document (id);
+ printf (" \"%s\",\n", doc.get_data ().c_str());
+
print_document_terms (doc);
print_document_values (doc);
+
+ printf ("},\n");
}
int
main (int argc, char *argv[])
{
const char *database_path;
+ int i;
if (argc < 2) {
- fprintf (stderr, "Usage: %s <path-to-xapian-database>\n",
+ fprintf (stderr, "Usage: %s <path-to-xapian-database> [value_nos...]\n",
argv[0]);
+ fprintf (stderr, "Dumps data from the given database.\n");
+ fprintf (stderr, "The values corresponding to any value numbers given on the command line\n");
+ fprintf (stderr, "will be unserialized to an before being printed.\n");
exit (1);
}
database_path = argv[1];
- try {
+ UNSERIALIZE = vector<int> ();
+
+ for (i = 2; i < argc; i++)
+ UNSERIALIZE.push_back (atoi (argv[i]));
+ try {
Xapian::Database db;
Xapian::PostingIterator i;
Xapian::docid doc_id;
db = Xapian::Database (database_path);
+
+ for (i = db.postlist_begin (""); i != db.postlist_end (""); i++) {
+ Xapian::Document doc;
+
+ doc_id = *i;
+
+ doc = db.get_document (doc_id);
+
+ if (doc.termlist_count () > MAX_TERMS)
+ MAX_TERMS = doc.termlist_count ();
+ }
+
+ printf ("#define MAX_TERMS %d\n\n", MAX_TERMS);
+
+ printf ("typedef struct {\n"
+ " char data[255];\n"
+ " char terms[MAX_TERMS][255];\n"
+ " char message_id[255];\n"
+ " char thread_id[4096];\n"
+ " time_t time;\n"
+ "} document_dump_t;\n\n");
+
+ printf ("document_dump_t dump[] = {\n");
+
for (i = db.postlist_begin (""); i != db.postlist_end (""); i++) {
doc_id = *i;
print_document (db, doc_id);
}
+ printf ("};\n");
+
} catch (const Xapian::Error &error) {
cerr << "A Xapian exception occurred: " << error.get_msg () << endl;
exit (1);