/* omindex.cc: index static documents into the omega db
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001,2005 James Aylett
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009 Olly Betts
 * Copyright 2009 Frank J Bruzzaniti
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

#include <config.h>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>
#include <map>
#include <vector>

#include <sys/types.h>
#include "safeunistd.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "safefcntl.h"
#include "safeerrno.h"
#include <time.h>

#include <xapian.h>

#include "commonhelp.h"
#include "diritor.h"
#include "hashterm.h"
#include "loadfile.h"
#include "md5wrap.h"
#include "metaxmlparse.h"
#include "myhtmlparse.h"
#include "runfilter.h"
#include "sample.h"
#include "stringutils.h"
#include "utf8convert.h"
#include "utils.h"
#include "values.h"
#include "xmlparse.h"
#include "xpsxmlparse.h"

#include "gnu_getopt.h"

#ifndef HAVE_MKDTEMP
extern char * mkdtemp(char *);
#endif

using namespace std;

#define TITLE_SIZE 128
#define SAMPLE_SIZE 512

#define PROG_NAME "omindex"
#define PROG_DESC "Index static website data via the filesystem"

static bool skip_duplicates = false;
static bool follow_symlinks = false;
static string dbpath;
static string root;
static string indexroot;
static string baseurl;
static Xapian::WritableDatabase db;
static Xapian::Stem stemmer("english");
static Xapian::TermGenerator indexer;
static vector<bool> updated;
static string tmpdir;

inline static bool
p_notalnum(unsigned int c)
{
    return !isalnum(static_cast<unsigned char>(c));
}

static string
shell_protect(const string & file)
{
    string safefile = file;
    string::size_type p = 0;
    if (!safefile.empty() && safefile[0] == '-') {
	// If the filename starts with a '-', protect it from being treated as
	// an option by prepending "./".
	safefile.insert(0, "./");
	p = 2;
    }
    while (p < safefile.size()) {
	// Don't escape some safe characters which are common in filenames.
	unsigned char ch = safefile[p];
	if (!isalnum(ch) && strchr("/._-", ch) == NULL) {
	    safefile.insert(p, "\\");
	    ++p;
	}
	++p;
    }
    return safefile;
}

static bool ensure_tmpdir() {
    if (!tmpdir.empty()) return true;

    const char * p = getenv("TMPDIR");
    if (!p) p = "/tmp";
    char * dir_template = new char[strlen(p) + 15 + 1];
    strcpy(dir_template, p);
    strcat(dir_template, "/omindex-XXXXXX");
    p = mkdtemp(dir_template);
    if (p) {
	tmpdir.assign(dir_template);
	tmpdir += '/';
    }
    delete dir_template;
    return (p != NULL);
}

static string
file_to_string(const string &file)
{
    string out;
    if (!load_file(file, out)) throw ReadError();
    return out;
}

static void
get_pdf_metainfo(const string & safefile, string &title, string &keywords)
{
    try {
	string pdfinfo = stdout_to_string("pdfinfo -enc UTF-8 " + safefile);

	string::size_type idx;

	if (strncmp(pdfinfo.c_str(), "Title:", 6) == 0) {
	    idx = 0;
	} else {
	    idx = pdfinfo.find("\nTitle:");
	}
	if (idx != string::npos) {
	    if (idx) ++idx;
	    idx = pdfinfo.find_first_not_of(' ', idx + 6);
	    string::size_type end = pdfinfo.find('\n', idx);
	    if (end != string::npos) {
		if (pdfinfo[end - 1] == '\r') --end;
		end -= idx;
	    }
	    title.assign(pdfinfo, idx, end);
	}

	if (strncmp(pdfinfo.c_str(), "Keywords:", 9) == 0) {
	    idx = 0;
	} else {
	    idx = pdfinfo.find("\nKeywords:");
	}
	if (idx != string::npos) {
	    if (idx) ++idx;
	    idx = pdfinfo.find_first_not_of(' ', idx + 9);
	    string::size_type end = pdfinfo.find('\n', idx);
	    if (end != string::npos) {
		if (pdfinfo[end - 1] == '\r') --end;
		end -= idx;
	    }
	    keywords.assign(pdfinfo, idx, end);
	}
    } catch (ReadError) {
	// It's probably best to index the document even if pdfinfo fails.
    }
}

static void
index_file(const string &url, const string &mimetype, time_t last_mod, off_t size)
{
    string file = root + url;
    string title, sample, keywords, dump;

    cout << "Indexing \"" << url << "\" as " << mimetype << " ... " << flush;

    string urlterm("U");
    urlterm += baseurl;
    urlterm += url;

    cout << "url term: " << urlterm << endl;

    if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
	urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);

    if (skip_duplicates && db.term_exists(urlterm)) {
	cout << "duplicate. Ignored." << endl;
	return;
    }

    string md5;
    if (mimetype == "text/html") {
	cout << "handling text/html" << endl;
	string text;
	try {
	    text = file_to_string(file);
	} catch (ReadError) {
	    cout << "can't read \"" << file << "\" - skipping\n";
	    return;
	}
	cout << "read file " << file << endl;
	MyHtmlParser p;
	try {
	    // Default HTML character set is latin 1, though not specifying one
	    // is deprecated these days.
	    p.parse_html(text, "iso-8859-1", false);
	} catch (const string & newcharset) {
	    cout << "reparsing HTML with charset " << newcharset << endl;
	    try {
		p.reset();
		p.parse_html(text, newcharset, true);
	    } catch (bool) {
	    }
	} catch (bool) {
	    // MyHtmlParser throws a bool to abandon parsing at </body> or when
	    // indexing is disallowed
	}
	cout << "parsed HTML" << endl;
	if (!p.indexing_allowed) {
	    cout << "indexing disallowed by meta tag - skipping\n";
	    return;
	}
	dump = p.dump;
	title = p.title;
	keywords = p.keywords;
	sample = p.sample;
	md5_string(text, md5);
	cout << "md5 calculated" << endl;
    } else if (mimetype == "text/plain") {
	try {
	    // Currently we assume that text files are UTF-8 unless they have a
	    // byte-order mark.
	    dump = file_to_string(file);
	    md5_string(dump, md5);

	    // Look for Byte-Order Mark (BOM).
	    if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
		// UTF-16 in big-endian/little-endian order - we just convert
		// it as "UTF-16" and let the conversion handle the BOM as that
		// way we avoid the copying overhead of erasing 2 bytes from
		// the start of dump.
		convert_to_utf8(dump, "UTF-16");
	    } else if (startswith(dump, "\xef\xbb\xbf")) {
		// UTF-8 with stupid Windows not-the-byte-order mark.
		dump.erase(0, 3);
	    } else {
		// FIXME: What charset is the file?  Look at contents?
	    }
	} catch (ReadError) {
	    cout << "can't read \"" << file << "\" - skipping\n";
	    return;
	}
    } else if (mimetype == "application/pdf") {
	string safefile = shell_protect(file);
	string cmd = "pdftotext -enc UTF-8 " + safefile + " -";
	try {
	    dump = stdout_to_string(cmd);
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
	get_pdf_metainfo(safefile, title, keywords);
    } else if (mimetype == "application/postscript") {
	// There simply doesn't seem to be a Unicode capable PostScript to
	// text converter (e.g. pstotext always outputs ISO-8859-1).  The only
	// solution seems to be to convert via PDF using ps2pdf and then
	// pdftotext.  This gives plausible looking UTF-8 output for some
	// Chinese PostScript files I found using Google.  It also has the
	// benefit of allowing us to extract meta information from PostScript
	// files.
	if (!ensure_tmpdir()) {
	    // FIXME: should this be fatal?  Or disable indexing postscript?
	    cout << "Couldn't create temporary directory (" << strerror(errno) << ") - skipping" << endl;
	    return;
	}
	string tmpfile = tmpdir + "/tmp.pdf";
	string safetmp = shell_protect(tmpfile);
	string cmd = "ps2pdf " + shell_protect(file) + " " + safetmp;
	try {
	    (void)stdout_to_string(cmd);
	    cmd = "pdftotext -enc UTF-8 " + safetmp + " -";
	    dump = stdout_to_string(cmd);
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping" << endl;
	    unlink(tmpfile.c_str());
	    return;
	} catch (...) {
	    unlink(tmpfile.c_str());
	    throw;
	}
	try {
	    get_pdf_metainfo(safetmp, title, keywords);
	} catch (...) {
	    unlink(tmpfile.c_str());
	    throw;
	}
	unlink(tmpfile.c_str());
    } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
	       startswith(mimetype, "application/vnd.oasis.opendocument."))
    {
	// Inspired by http://mjr.towers.org.uk/comp/sxw2text
	string safefile = shell_protect(file);
	string cmd = "unzip -p " + safefile + " content.xml";
	try {
	    XmlParser xmlparser;
	    xmlparser.parse_html(stdout_to_string(cmd));
	    dump = xmlparser.dump;
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}

	cmd = "unzip -p " + safefile + " meta.xml";
	try {
	    MetaXmlParser metaxmlparser;
	    metaxmlparser.parse_html(stdout_to_string(cmd));
	    title = metaxmlparser.title;
	    keywords = metaxmlparser.keywords;
	    sample = metaxmlparser.sample;
	} catch (ReadError) {
	    // It's probably best to index the document even if this fails.
	}
    } else if (mimetype == "application/msword") {
	string cmd = "antiword -mUTF-8.txt " + shell_protect(file);
	try {
	    dump = stdout_to_string(cmd);
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
    } else if (mimetype == "application/vnd.ms-excel") {
	string cmd = "xls2csv -q0 -dutf-8 " + shell_protect(file);
	try {
	    dump = stdout_to_string(cmd);
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
    } else if (mimetype == "application/vnd.ms-powerpoint") {
	string cmd = "catppt -dutf-8 " + shell_protect(file);
	try {
	    dump = stdout_to_string(cmd);
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
    } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.")) {
	const char * args = NULL;
	string tail(mimetype, 46);
	if (startswith(tail, "wordprocessingml.")) {
	    args = " word/document.xml";
	} else if (startswith(tail, "spreadsheetml.")) {
	    args = " xl/sharedStrings.xml";
	} else if (startswith(tail, "presentationml.")) {
	    args = " ppt/slides/slide*.xml";
	} else {
	    // Don't know how to index this type.
	    cout << "unknown Office 2007 MIME subtype - skipping\n";
	    return;
	}
	string safefile = shell_protect(file);
	string cmd = "unzip -p " + safefile + args;
	try {
	    XmlParser xmlparser;
	    xmlparser.parse_html(stdout_to_string(cmd));
	    dump = xmlparser.dump;
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
    } else if (mimetype == "application/vnd.wordperfect") {
	// Looking at the source of wpd2html and wpd2text I think both output
	// utf-8, but it's hard to be sure without sample Unicode .wpd files
	// as they don't seem to be at all well documented.
	string cmd = "wpd2text " + shell_protect(file);
	try {
	    dump = stdout_to_string(cmd);
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
    } else if (mimetype == "application/vnd.ms-works") {
	// wps2text produces UTF-8 output from the sample files I've tested.
	string cmd = "wps2text " + shell_protect(file);
	try {
	    dump = stdout_to_string(cmd);
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
    } else if (mimetype == "application/x-abiword") {
	// FIXME: Implement support for metadata.
	try {
	    XmlParser xmlparser;
	    string text = file_to_string(file);
	    xmlparser.parse_html(text);
	    dump = xmlparser.dump;
	    md5_string(text, md5);
	} catch (ReadError) {
	    cout << "can't read \"" << file << "\" - skipping\n";
	    return;
	}
    } else if (mimetype == "application/x-abiword-compressed") {
	// FIXME: Implement support for metadata.
	string cmd = "gzip -dc " + shell_protect(file);
	try {
	    XmlParser xmlparser;
	    xmlparser.parse_html(stdout_to_string(cmd));
	    dump = xmlparser.dump;
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
    } else if (mimetype == "text/rtf") {
	// The --text option unhelpfully converts all non-ASCII characters to
	// "?" so we use --html instead, which produces HTML entities.
	string cmd = "unrtf --nopict --html 2>/dev/null " + shell_protect(file);
	MyHtmlParser p;
	try {
	    // No point going looking for charset overrides as unrtf doesn't
	    // produce them.
	    p.parse_html(stdout_to_string(cmd), "iso-8859-1", true);
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	} catch (bool) {
	    // MyHtmlParser throws a bool to abandon parsing at </body> or when
	    // indexing is disallowed
	}
	if (!p.indexing_allowed) {
	    cout << "indexing disallowed by meta tag - skipping\n";
	    return;
	}
	dump = p.dump;
	title = p.title;
	keywords = p.keywords;
	sample = p.sample;
    } else if (mimetype == "text/x-perl") {
	// pod2text's output character set doesn't seem to be documented, but
	// from inspecting the source it looks like it's probably iso-8859-1.
	string cmd = "pod2text " + shell_protect(file);
	try {
	    dump = stdout_to_string(cmd);
	    convert_to_utf8(dump, "ISO-8859-1");
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
    } else if (mimetype == "application/x-dvi") {
	// FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
	// appearing as single ligatures.  For European languages, it's
	// actually better to use -e2 (ISO-8859-1) and then convert, so let's
	// do that for now until we handle Unicode "compatibility
	// decompositions".
	string cmd = "catdvi -e2 -s " + shell_protect(file);
	try {
	    dump = stdout_to_string(cmd);
	    convert_to_utf8(dump, "ISO-8859-1");
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
    } else if (mimetype == "image/vnd.djvu") {
	// Output is UTF-8 according to "man djvutxt".  Generally this seems to
	// be true, though some examples from djvu.org generate isolated byte
	// 0x95 in a context which suggests it might be intended to be a bullet
	// (as it is in CP1250).
	string cmd = "djvutxt " + shell_protect(file);
	try {
	    dump = stdout_to_string(cmd);
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
    } else if (mimetype == "application/vnd.ms-xpsdocument") {
	string safefile = shell_protect(file);
	string cmd = "unzip -p " + safefile + " Documents/1/Pages/*.fpage";
	try {
	    XpsXmlParser xpsparser;
	    dump = stdout_to_string(cmd);
	    // Look for Byte-Order Mark (BOM).
	    if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
		// UTF-16 in big-endian/little-endian order - we just convert
		// it as "UTF-16" and let the conversion handle the BOM as that
		// way we avoid the copying overhead of erasing 2 bytes from
		// the start of dump.
		convert_to_utf8(dump, "UTF-16");
	    }
	    xpsparser.parse_html(dump);
	    dump = xpsparser.dump;
	} catch (ReadError) {
	    cout << "\"" << cmd << "\" failed - skipping\n";
	    return;
	}
    } else {
	// Don't know how to index this type.
	cout << "unknown MIME type - skipping\n";
	return;
    }

    // Compute the MD5 of the file if we haven't already.
    if (md5.empty() && md5_file(file, md5) == 0) {
	cout << "failed to read file to calculate MD5 checksum - skipping\n";
	return;
    }

    // Produce a sample
    if (sample.empty()) {
	sample = generate_sample(dump, SAMPLE_SIZE);
    } else {
	sample = generate_sample(sample, SAMPLE_SIZE);
    }
    cout << "sample: " << sample << endl;

    // Put the data in the document
    Xapian::Document newdocument;
    string record = "url=" + baseurl + url + "\nsample=" + sample;
    if (!title.empty()) {
	record += "\ncaption=" + generate_sample(title, TITLE_SIZE);
    }
    record += "\ntype=" + mimetype;
    if (last_mod != (time_t)-1)
	record += "\nmodtime=" + long_to_string(last_mod);
    if (size)
	record += "\nsize=" + long_to_string(size);
    newdocument.set_data(record);

    // Index the title, document text, and keywords.
    indexer.set_document(newdocument);
    if (!title.empty()) {
	indexer.index_text(title, 2);
	indexer.increase_termpos(100);
    }
    if (!dump.empty()) {
	indexer.index_text(dump);
    }
    if (!keywords.empty()) {
	indexer.increase_termpos(100);
	indexer.index_text(keywords);
    }

    newdocument.add_term("T" + mimetype); // mimeType
    string::size_type j;
    j = find_if(baseurl.begin(), baseurl.end(), p_notalnum) - baseurl.begin();
    if (j > 0 && baseurl.substr(j, 3) == "://") {
	j += 3;
	string::size_type k = baseurl.find('/', j);
	if (k == string::npos) {
	    newdocument.add_term("P/"); // Path
	    newdocument.add_term("H" + baseurl.substr(j));
	} else {
	    newdocument.add_term("P" + baseurl.substr(k)); // Path
	    string::const_iterator l;
	    l = find(baseurl.begin() + j, baseurl.begin() + k, ':');
	    string::size_type host_len = l - baseurl.begin() - j;
	    newdocument.add_term("H" + baseurl.substr(j, host_len)); // Host
	}
    } else {
	newdocument.add_term("P" + baseurl); // Path
    }

    struct tm *tm = localtime(&last_mod);
    string date_term = "D" + date_to_string(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday);
    newdocument.add_term(date_term); // Date (YYYYMMDD)
    date_term.resize(7);
    date_term[0] = 'M';
    newdocument.add_term(date_term); // Month (YYYYMM)
    date_term.resize(5);
    date_term[0] = 'Y';
    newdocument.add_term(date_term); // Year (YYYY)

    newdocument.add_term(urlterm); // Url

    // Add last_mod as a value to allow "sort by date".
    newdocument.add_value(VALUE_LASTMOD, int_to_binary_string((uint32_t)last_mod));

    // Add MD5 as a value to allow duplicate documents to be collapsed together.
    newdocument.add_value(VALUE_MD5, md5);

    if (!skip_duplicates) {
	// If this document has already been indexed, update the existing
	// entry.
	try {
	    Xapian::docid did = db.replace_document(urlterm, newdocument);
	    if (did < updated.size()) {
		updated[did] = true;
		cout << "updated." << endl;
	    } else {
		cout << "added." << endl;
	    }
	} catch (...) {
	    // FIXME: is this ever actually needed?
	    db.add_document(newdocument);
	    cout << "added (failed re-seek for duplicate)." << endl;
	}
    } else {
	// If this were a duplicate, we'd have skipped it above.
	db.add_document(newdocument);
	cout << "added." << endl;
    }
}

static void
index_directory(size_t depth_limit, const string &dir,
		map<string, string>& mime_map)
{
    string path = root + indexroot + dir;

    cout << "[Entering directory " << dir << "]" << endl;

    DirectoryIterator d(follow_symlinks);
    try {
	d.start(path);
	while (d.next()) try {
	    string url = dir;
	    if (!url.empty() && url[url.size() - 1] != '/') url += '/';
	    url += d.leafname();
	    string file = root + indexroot + url;
	    switch (d.get_type()) {
		case DirectoryIterator::DIRECTORY:
		    if (depth_limit == 1) continue;
		    try {
			size_t new_limit = depth_limit;
			if (new_limit) --new_limit;
			index_directory(new_limit, url, mime_map);
		    } catch (...) {
			cout << "Caught unknown exception in index_directory, rethrowing" << endl;
			throw;
		    }
		    continue;
		case DirectoryIterator::REGULAR_FILE: {
		    string ext;
		    string::size_type dot = url.find_last_of('.');
		    if (dot != string::npos) ext = url.substr(dot + 1);

		    map<string,string>::iterator mt = mime_map.find(ext);
		    if (mt == mime_map.end()) {
			// If the extension isn't found, see if the lower-cased
			// version (if different) is found.
			bool changed = false;
			string::iterator i;
			for (i = ext.begin(); i != ext.end(); ++i) {
			    if (*i >= 'A' && *i <= 'Z') {
				*i = tolower(*i);
				changed = true;
			    }
			}
			if (changed) mt = mime_map.find(ext);
		    }
		    if (mt != mime_map.end()) {
			if (mt->second.empty()) {
			    cout << "Skipping file, required filter not "
				    "installed: \"" << file << "\""
				 << endl;
			    continue;
			}

			// Only check the file size if we recognise the
			// extension to avoid a call to stat()/lstat() for
			// files we can't handle when readdir() tells us the
			// file type.
			off_t size = d.get_size();
			if (size == 0) {
			    cout << "Skipping empty file: \"" << file << "\""
				 << endl;
			    continue;
			}

			// It's in our MIME map so we know how to index it.
			const string & mimetype = mt->second;
			try {
			    time_t mtime = d.get_mtime();
			    index_file(indexroot + url, mimetype, mtime, size);
			} catch (NoSuchFilter) {
			    // FIXME: we ought to ignore by mime-type not
			    // extension.
			    cout << "Filter for \"" << mimetype
				 << "\" not installed - ignoring extension \""
				 << ext << "\"" << endl;
			    mt->second = string();
			}
		    } else {
			cout << "Unknown extension: \"" << file
			     << "\" - skipping" << endl;
		    }
		    continue;
		}
		default:
		    cout << "Not a regular file \"" << file
			 << "\" - skipping" << endl;
	    }
	} catch (const std::string & error) {
	    cout << error << " - skipping" << endl;
	    continue;
	}
    } catch (const std::string & error) {
	cout << error << " - skipping directory" << endl;
	return;
    }
}

int
main(int argc, char **argv)
{
    // If overwrite is true, the database will be created anew even if it
    // already exists.
    bool overwrite = false;
    // If preserve_unupdated is false, delete any documents we don't
    // replace (if in replace duplicates mode)
    bool preserve_unupdated = false;
    size_t depth_limit = 0;

    static const struct option longopts[] = {
	{ "help",	no_argument,		NULL, 'h' },
	{ "version",	no_argument,		NULL, 'v' },
	{ "overwrite",	no_argument,		NULL, 'o' },
	{ "duplicates",	required_argument,	NULL, 'd' },
	{ "preserve-nonduplicates",	no_argument,	NULL, 'p' },
	{ "db",		required_argument,	NULL, 'D' },
	{ "url",	required_argument,	NULL, 'U' },
	{ "mime-type",	required_argument,	NULL, 'M' },
	{ "depth-limit",required_argument,	NULL, 'l' },
	{ "follow",	no_argument,		NULL, 'f' },
	{ "stemmer",	required_argument,	NULL, 's' },
	{ 0, 0, NULL, 0 }
    };

    int getopt_ret;

    map<string, string> mime_map;
    // Plain text:
    mime_map["txt"] = "text/plain";
    mime_map["text"] = "text/plain";
    // HTML:
    mime_map["html"] = "text/html";
    mime_map["htm"] = "text/html";
    mime_map["shtml"] = "text/html";
    mime_map["php"] = "text/html"; // Our HTML parser knows to ignore PHP code.
    // PDF:
    mime_map["pdf"] = "application/pdf";
    // PostScript:
    mime_map["ps"] = "application/postscript";
    mime_map["eps"] = "application/postscript";
    mime_map["ai"] = "application/postscript";
    // OpenDocument:
    // FIXME: need to find sample documents to test all of these.
    mime_map["odt"] = "application/vnd.oasis.opendocument.text";
    mime_map["ods"] = "application/vnd.oasis.opendocument.spreadsheet";
    mime_map["odp"] = "application/vnd.oasis.opendocument.presentation";
    mime_map["odg"] = "application/vnd.oasis.opendocument.graphics";
    mime_map["odc"] = "application/vnd.oasis.opendocument.chart";
    mime_map["odf"] = "application/vnd.oasis.opendocument.formula";
    mime_map["odb"] = "application/vnd.oasis.opendocument.database";
    mime_map["odi"] = "application/vnd.oasis.opendocument.image";
    mime_map["odm"] = "application/vnd.oasis.opendocument.text-master";
    mime_map["ott"] = "application/vnd.oasis.opendocument.text-template";
    mime_map["ots"] = "application/vnd.oasis.opendocument.spreadsheet-template";
    mime_map["otp"] = "application/vnd.oasis.opendocument.presentation-template";
    mime_map["otg"] = "application/vnd.oasis.opendocument.graphics-template";
    mime_map["otc"] = "application/vnd.oasis.opendocument.chart-template";
    mime_map["otf"] = "application/vnd.oasis.opendocument.formula-template";
    mime_map["oti"] = "application/vnd.oasis.opendocument.image-template";
    mime_map["oth"] = "application/vnd.oasis.opendocument.text-web";
    // OpenOffice/StarOffice documents:
    mime_map["sxc"] = "application/vnd.sun.xml.calc";
    mime_map["stc"] = "application/vnd.sun.xml.calc.template";
    mime_map["sxd"] = "application/vnd.sun.xml.draw";
    mime_map["std"] = "application/vnd.sun.xml.draw.template";
    mime_map["sxi"] = "application/vnd.sun.xml.impress";
    mime_map["sti"] = "application/vnd.sun.xml.impress.template";
    mime_map["sxm"] = "application/vnd.sun.xml.math";
    mime_map["sxw"] = "application/vnd.sun.xml.writer";
    mime_map["sxg"] = "application/vnd.sun.xml.writer.global";
    mime_map["stw"] = "application/vnd.sun.xml.writer.template";
    // MS Office 2007 formats:
    mime_map["docx"] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"; // Word 2007
    mime_map["dotx"] = "application/vnd.openxmlformats-officedocument.wordprocessingml.template"; // Word 2007 template
    mime_map["xlsx"] = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; // Excel 2007
    mime_map["xltx"] = "application/vnd.openxmlformats-officedocument.spreadsheetml.template"; // Excel 2007 template
    mime_map["pptx"] = "application/vnd.openxmlformats-officedocument.presentationml.presentation"; // PowerPoint 2007 presentation
    mime_map["ppsx"] = "application/vnd.openxmlformats-officedocument.presentationml.slideshow"; // PowerPoint 2007 slideshow
    mime_map["potx"] = "application/vnd.openxmlformats-officedocument.presentationml.template"; // PowerPoint 2007 template
    mime_map["xps"] = "application/vnd.ms-xpsdocument";
    // Some other word processor formats:
    mime_map["doc"] = "application/msword";
    mime_map["dot"] = "application/msword"; // Word template
    mime_map["wpd"] = "application/vnd.wordperfect";
    mime_map["wps"] = "application/vnd.ms-works";
    mime_map["wpt"] = "application/vnd.ms-works"; // Works template
    mime_map["abw"] = "application/x-abiword"; // AbiWord
    mime_map["zabw"] = "application/x-abiword-compressed"; // AbiWord compressed
    mime_map["rtf"] = "text/rtf";
    // Other MS formats:
    mime_map["xls"] = "application/vnd.ms-excel";
    mime_map["xlb"] = "application/vnd.ms-excel";
    mime_map["xlt"] = "application/vnd.ms-excel"; // Excel template
    mime_map["ppt"] = "application/vnd.ms-powerpoint";
    mime_map["pps"] = "application/vnd.ms-powerpoint"; // Powerpoint slideshow
    // Perl:
    mime_map["pl"] = "text/x-perl";
    mime_map["pm"] = "text/x-perl";
    mime_map["pod"] = "text/x-perl";
    // TeX DVI:
    mime_map["dvi"] = "application/x-dvi";
    // DjVu:
    mime_map["djv"] = "image/vnd.djvu";
    mime_map["djvu"] = "image/vnd.djvu";

    while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:l:s:pf", longopts, NULL)) != -1) {
	switch (getopt_ret) {
	case 'h': {
	    cout << PROG_NAME" - "PROG_DESC"\n\n"
"Usage: "PROG_NAME" [OPTIONS] --db DATABASE [BASEDIR] DIRECTORY\n\n"
"Options:\n"
"  -d, --duplicates         set duplicate handling ('ignore' or 'replace')\n"
"  -p, --preserve-nonduplicates  don't delete unupdated documents in\n"
"                                duplicate replace mode\n"
"  -D, --db                 path to database to use\n"
"  -U, --url                base url DIRECTORY represents (default: /)\n"
"  -M, --mime-type          additional MIME mapping ext:type\n"
"  -l, --depth-limit=LIMIT  set recursion limit (0 = unlimited)\n"
"  -f, --follow             follow symbolic links\n"
"      --overwrite          create the database anew (the default is to update\n"
"                           if the database already exists)" << endl;
	    print_stemmer_help("     ");
	    print_help_and_version_help("     ");
	    return 0;
	}
	case 'v':
	    print_package_info(PROG_NAME);
	    return 0;
	case 'd': // how shall we handle duplicate documents?
	    switch (optarg[0]) {
	    case 'i':
		skip_duplicates = true;
		break;
	    case 'r':
		skip_duplicates = false;
		break;
	    }
	    break;
	case 'p': // don't delete unupdated documents
	    preserve_unupdated = true;
	    break;
	case 'l': { // Set recursion limit
	    int arg = atoi(optarg);
	    if (arg < 0) arg = 0;
	    depth_limit = size_t(arg);
	    break;
	}
	case 'f': // Turn on following of symlinks
	    follow_symlinks = true;
	    break;
	case 'M': {
	    const char * s = strchr(optarg, ':');
	    if (s != NULL) {
		if (s[1]) {
		    mime_map[string(optarg, s - optarg)] = string(s + 1);
		} else {
		    // -Mtxt: removes the default mapping for .txt files.
		    mime_map.erase(string(optarg, s - optarg));
		}
	    } else {
		cerr << "Invalid MIME mapping '" << optarg << "'\n"
			"Should be of the form ext:type, eg txt:text/plain\n"
			"(or txt: to delete a default mapping)" << endl;
		return 1;
	    }
	    break;
	}
	case 'D':
	    dbpath = optarg;
	    break;
	case 'U':
	    baseurl = optarg;
	    break;
	case 'o': // --overwrite
	    overwrite = true;
	    break;
	case 's':
	    try {
		stemmer = Xapian::Stem(optarg);
	    } catch (const Xapian::Error &) {
		cerr << "Unknown stemming language '" << optarg << "'.\n";
		cerr << "Available language names are: "
		     << Xapian::Stem::get_available_languages() << endl;
		return 1;
	    }
	    break;
	case ':': // missing param
	    return 1;
	case '?': // unknown option: FIXME -> char
	    return 1;
	}
    }

    if (dbpath.empty()) {
	cerr << PROG_NAME": you must specify a database with --db.\n";
	return 1;
    }
    if (baseurl.empty()) {
	cerr << PROG_NAME": --url not specified, assuming `/'.\n";
    }
    // baseurl mustn't end '/' or you end up with the wrong URL
    // (//thing is different to /thing). We could probably make this
    // safe a different way, by ensuring that we don't put a leading '/'
    // on leafnames when scanning a directory, but this will do.
    if (!baseurl.empty() && baseurl[baseurl.length() - 1] == '/') {
	cout << "baseurl has trailing '/' ... removing ... " << endl;
	baseurl.resize(baseurl.size() - 1);
    }

    if (optind >= argc || optind + 2 < argc) {
	cerr << PROG_NAME": you must specify a directory to index.\n"
"Do this either as a single directory (corresponding to the base URL)\n"
"or two directories - the first corresponding to the base URL and the second\n"
"a subdirectory of that to index." << endl;
	return 1;
    }
    root = argv[optind];
    if (optind + 2 == argc) {
	indexroot = argv[optind + 1]; // relative to root
	if (indexroot.empty() || indexroot[0] != '/') {
	    indexroot = "/" + indexroot;
	}
    } else {
	indexroot = ""; // index the whole of root
    }

    int exitcode = 1;
    try {
	if (!overwrite) {
	    db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
	    if (!skip_duplicates) {
		// + 1 so that db.get_lastdocid() is a valid subscript.
		updated.resize(db.get_lastdocid() + 1);
	    }
	} else {
	    db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
	}

	indexer.set_stemmer(stemmer);

	index_directory(depth_limit, "/", mime_map);
	if (!skip_duplicates && !preserve_unupdated) {
	    for (Xapian::docid did = 1; did < updated.size(); ++did) {
		if (!updated[did]) {
		    try {
			db.delete_document(did);
			cout << "Deleted document #" << did << endl;
		    } catch (const Xapian::DocNotFoundError &) {
		    }
		}
	    }
	}
	db.flush();
	// cout << "\n\nNow we have " << db.get_doccount() << " documents.\n";
	exitcode = 0;
    } catch (const Xapian::Error &e) {
	cout << "Exception: " << e.get_msg() << endl;
    } catch (const string &s) {
	cout << "Exception: " << s << endl;
    } catch (const char *s) {
	cout << "Exception: " << s << endl;
    } catch (...) {
	cout << "Caught unknown exception" << endl;
    }

    // If we created a temporary directory then delete it.
    if (!tmpdir.empty()) rmdir(tmpdir.c_str());

    return exitcode;
}

