
/** @file
 * @author Enrico Zini <enrico@enricozini.org>
 * Fast full-text search
 */

/*
 * Copyright (C) 2007  Enrico Zini <enrico@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <ept/textsearch/textsearch.h>
#include <ept/textsearch/maint/path.h>
#include <ept/apt/apt.h>
#include <ept/apt/packagerecord.h>
#include <ept/debtags/debtags.h>

#include <wibble/regexp.h>
#include <cctype>

#include <algorithm>

#include <iostream>

using namespace std;
using namespace ept::apt;
using namespace ept::debtags;

namespace ept {
namespace textsearch {

// Cap on how many APT records the rebuildIfNeeded() overloads process; 0 (the
// default) disables the cap. The indexing loops stop as soon as their record
// counter becomes greater than this value. Their comments describe it as a
// way to keep test runs short.
size_t max_index = 0;

/**
 * Set up the English stemmer and, when an index timestamp is available,
 * attach the on-disk Xapian database found in Path::indexDir().
 *
 * A zero timestamp from Path::indexTimestamp() leaves m_db without any
 * database attached.
 */
TextSearch::TextSearch()
	: m_timestamp(0), m_stem("en")
{
	m_timestamp = Path::indexTimestamp();
	if (m_timestamp == 0)
		return;

	m_db.add_database(Xapian::Database(Path::indexDir()));
}

/**
 * Return a copy of @a str with every byte converted through tolower().
 *
 * @param str  the string to convert
 * @return the lowercased copy; it always has the same length as @a str
 */
std::string TextSearch::toLower(const std::string& str)
{
	std::string res;
	res.reserve(str.size());
	for (std::string::const_iterator i = str.begin(); i != str.end(); ++i)
		// tolower() is only defined for EOF and values representable as
		// unsigned char: a plain char holding a non-ASCII byte can be negative,
		// so it must be converted before the call.
		res += static_cast<char>(tolower(static_cast<unsigned char>(*i)));
	return res;
}

/// Tell whether the APT timestamp is more recent than the one of the index
/// currently known to this object.
bool TextSearch::needsRebuild(apt::Apt& apt)
{
	const time_t aptStamp = apt.timestamp();
	return m_timestamp < aptStamp;
}

/**
 * Add @a term to @a doc in lowercase and, when it differs, also in its
 * stemmed lowercase form.
 *
 * @param doc   document receiving the terms
 * @param term  raw term to normalise
 * @param pos   currently unused: terms are added without positional
 *              information and the value is left untouched
 */
void TextSearch::normalize_and_add(Xapian::Document& doc, const std::string& term, int& pos) const
{
	const string lowered = TextSearch::toLower(term);
	doc.add_term(lowered);

	const string stemmed = m_stem(lowered);
	if (stemmed != lowered)
		doc.add_term(stemmed);
}

/**
 * Rebuild the Xapian index from the APT records, if APT is newer than the
 * index.
 *
 * Each record becomes one document whose data is the package name and whose
 * terms are: the unique "pkg:<name>" term, one "T<tag>" term for every tag
 * reported by the record, the lowercased package name (plus its stem), and
 * every lowercased word of the description (plus its stem).
 *
 * When an index was already loaded (nonzero m_timestamp) documents are
 * replaced by their "pkg:" term, otherwise they are appended.
 *
 * @param apt  APT data source providing the records and the timestamp
 * @return true if the index was rebuilt, false if it was already up to date
 */
bool TextSearch::rebuildIfNeeded(apt::Apt& apt)
{
	// Check if a rebuild is needed, and keep a copy of the APT timestamp for
	// saving later
	time_t aptts = apt.timestamp();
	if (aptts <= m_timestamp)
		return false;

	// Reindex
	Xapian::WritableDatabase database(Xapian::Flint::open(Path::indexDir(), Xapian::DB_CREATE_OR_OPEN));
	PackageRecord rec;
	size_t count = 0;
	for (Apt::record_iterator i = apt.recordBegin();
			i != apt.recordEnd(); ++i, ++count)
	{
		// If we are testing, we can set a limit to how many packages we index,
		// to avoid it taking too much time.
		// NOTE(review): count starts at 0 and we stop only once it exceeds
		// max_index, so max_index + 1 records get through -- confirm this is
		// the intended meaning of the limit.
		if (max_index != 0 && count > max_index)
			break;

		rec.scan(*i);

		Xapian::Document doc;
		doc.set_data(rec.package());

		int idx = 1;
		// Unique term identifying the package, also used to replace documents
		string pkgid = "pkg:" + rec.package();
		doc.add_term(pkgid);

		// Index tags as well
		set<string> tags = rec.tag();
		for (set<string>::const_iterator ti = tags.begin();
				ti != tags.end(); ++ti)
			doc.add_term("T"+*ti);

		normalize_and_add(doc, rec.package(), idx);

		// The description is lowercased once up front, so every token only
		// needs stemming
		string desc = toLower(rec.description());
		wibble::Tokenizer tok(desc, "[A-Za-z0-9_-]+", REG_EXTENDED);
		for (wibble::Tokenizer::const_iterator w = tok.begin(); w != tok.end(); ++w)
		{
			string s = m_stem(*w);
			doc.add_term(*w);
			if (s != *w)
				doc.add_term(s);
		}

		if (m_timestamp)
			database.replace_document(pkgid, doc);
		else
			database.add_document(doc);
	}

	// Commit the pending changes now: the writable handle is still alive at
	// this point, and a reader opened or reopened before the commit would work
	// on a snapshot that lacks the documents indexed above.
	database.flush();

	if (!m_timestamp)
		m_db.add_database(Xapian::Database(Path::indexDir()));
	else
		m_db.reopen();

	m_timestamp = aptts;

	Path::setTimestamp(aptts);

	return true;
}

/**
 * Rebuild the Xapian index from the APT records, if APT is newer than the
 * index, taking the tags of each package from a Debtags database.
 *
 * Each record becomes one document whose data is the package name and whose
 * terms are: the unique "pkg:<name>" term, one "T<full tag name>" term for
 * every tag that @a debtags reports for the package, the lowercased package
 * name (plus its stem), and every lowercased word of the description (plus
 * its stem).
 *
 * When an index was already loaded (nonzero m_timestamp) documents are
 * replaced by their "pkg:" term, otherwise they are appended.
 *
 * @param apt      APT data source providing the records and the timestamp
 * @param debtags  tag database queried with the package name of each record
 * @return true if the index was rebuilt, false if it was already up to date
 */
bool TextSearch::rebuildIfNeeded(apt::Apt& apt, const debtags::Debtags& debtags)
{
	// Check if a rebuild is needed, and keep a copy of the APT timestamp for
	// saving later
	time_t aptts = apt.timestamp();
	if (aptts <= m_timestamp)
		return false;

	// Reindex
	Xapian::WritableDatabase database(Xapian::Flint::open(Path::indexDir(), Xapian::DB_CREATE_OR_OPEN));
	PackageRecord rec;
	size_t count = 0;
	for (Apt::record_iterator i = apt.recordBegin();
			i != apt.recordEnd(); ++i, ++count)
	{
		// If we are testing, we can set a limit to how many packages we index,
		// to avoid it taking too much time.
		// NOTE(review): count starts at 0 and we stop only once it exceeds
		// max_index, so max_index + 1 records get through -- confirm this is
		// the intended meaning of the limit.
		if (max_index != 0 && count > max_index)
			break;

		rec.scan(*i);

		Xapian::Document doc;
		doc.set_data(rec.package());

		int idx = 1;
		// Unique term identifying the package, also used to replace documents
		string pkgid = "pkg:" + rec.package();
		doc.add_term(pkgid);

		// Index tags as well
		set<Tag> tags = debtags.getTagsOfItem(rec.package());
		for (set<Tag>::const_iterator ti = tags.begin();
				ti != tags.end(); ++ti)
			doc.add_term("T"+ti->fullname());

		normalize_and_add(doc, rec.package(), idx);

		// The description is lowercased once up front, so every token only
		// needs stemming
		string desc = toLower(rec.description());
		wibble::Tokenizer tok(desc, "[A-Za-z0-9_-]+", REG_EXTENDED);
		for (wibble::Tokenizer::const_iterator w = tok.begin(); w != tok.end(); ++w)
		{
			string s = m_stem(*w);
			doc.add_term(*w);
			if (s != *w)
				doc.add_term(s);
		}

		if (m_timestamp)
			database.replace_document(pkgid, doc);
		else
			database.add_document(doc);
	}

	// Commit the pending changes now: the writable handle is still alive at
	// this point, and a reader opened or reopened before the commit would work
	// on a snapshot that lacks the documents indexed above.
	database.flush();

	if (!m_timestamp)
		m_db.add_database(Xapian::Database(Path::indexDir()));
	else
		m_db.reopen();

	m_timestamp = aptts;

	Path::setTimestamp(aptts);

	return true;
}

/**
 * Split @a keywords into words and hand them to the iterator-range overload
 * of makeORQuery().
 *
 * @param keywords  free-form text typed by the user
 * @return the query built by the iterator-range overload
 */
Xapian::Query TextSearch::makeORQuery(const std::string& keywords) const
{
	// A word is any run of ASCII letters, digits, underscores and dashes
	wibble::Tokenizer words(keywords, "[A-Za-z0-9_-]+", REG_EXTENDED);
	return makeORQuery(words.begin(), words.end());
}

/**
 * Build a query from @a keywords, treating the last word as possibly still
 * being typed.
 *
 * The keywords are split into words; then every term in the index that starts
 * with the last word is added to the list, and the whole list is handed to
 * the iterator-range overload of makeORQuery(). A one-letter last word is
 * dropped instead of being expanded.
 *
 * @param keywords  free-form text typed by the user
 * @return the query built by the iterator-range overload of makeORQuery()
 */
Xapian::Query TextSearch::makePartialORQuery(const std::string& keywords) const
{
	wibble::Tokenizer tok(keywords, "[A-Za-z0-9_-]+", REG_EXTENDED);
	vector<string> tokens;
	// FIXME: make the Tokenizer iterators properly iterable
	for (wibble::Tokenizer::const_iterator i = tok.begin();
			i != tok.end(); ++i)
		tokens.push_back(*i);

	// Add all the terms starting with the last token
	if (!tokens.empty())
	{
		// Take the prefix by value: a reference into 'tokens' would be
		// invalidated as soon as the vector grows while terms are appended.
		// Lowercase it as well, because the indexer lowercases every word
		// term, so a prefix containing capitals would never match a word.
		const string last = toLower(tokens.back());
		if (last.size() == 1)
			// Ignore one-letter partial terms: they make the query uselessly
			// large and slow, and it's worth just to wait for more characters
			// to come
			tokens.pop_back();
		else
			copy(m_db.allterms_begin(last), m_db.allterms_end(last), back_inserter(tokens));
	}
	return makeORQuery(tokens.begin(), tokens.end());
}

/**
 * Look up the document indexed under the "pkg:<pkgname>" term.
 *
 * @param pkgname  name of the package to look for
 * @return the id of the first document in the term's posting list, or 0 if
 *         the posting list is empty
 */
Xapian::docid TextSearch::docidByName(const std::string& pkgname) const
{
	const std::string term = "pkg:" + pkgname;
	Xapian::PostingIterator first = m_db.postlist_begin(term);
	if (first != m_db.postlist_end(term))
		return *first;
	return 0;
}

/**
 * Expand decider accepting only the terms that start with a capital 'T',
 * which is the prefix the indexer in this file puts in front of tag names.
 */
struct TagFilter : public Xapian::ExpandDecider
{
	virtual bool operator()(const std::string &term) const
	{
		return !term.empty() && term[0] == 'T';
	}
};

// Single shared instance, passed by address to Enquire::get_eset()
static TagFilter tagFilter;

/**
 * Suggest tag terms related to the query currently set on @a enq.
 *
 * @param enq  enquire object to run the query and the expansion on
 * @return up to five 'T'-prefixed terms, in the order of the expansion set
 */
vector<string> TextSearch::expand(Xapian::Enquire& enq) const
{
	// Treat the five best matches of the query as the relevant documents
	Xapian::MSet best = enq.get_mset(0, 5);
	Xapian::RSet relevant;
	for (Xapian::MSet::iterator hit = best.begin(); hit != best.end(); ++hit)
		relevant.add_document(hit);

	// Ask for at most five expansion terms, letting only tag terms through
	Xapian::ESet suggestions = enq.get_eset(5, relevant, &tagFilter);

	vector<string> terms;
	Xapian::ESetIterator s = suggestions.begin();
	while (s != suggestions.end())
	{
		terms.push_back(*s);
		++s;
	}
	return terms;
}

/**
 * Build a query matching documents that share terms with a given package.
 *
 * @param pkgname  name of the package to start from
 * @return an OR query over all the terms of the package's document, or an
 *         empty query if no document matches the "pkg:<pkgname>" term
 */
Xapian::Query TextSearch::makeRelatedQuery(const std::string& pkgname) const
{
	// Find the document indexed under the package's unique term
	Xapian::Enquire lookup(db());
	lookup.set_query(Xapian::Query("pkg:"+pkgname));
	Xapian::MSet found = lookup.get_mset(0, 1);

	Xapian::MSetIterator hit = found.begin();
	if (hit != found.end())
	{
		// OR together every term of that document
		Xapian::Document doc = hit.get_document();
		return Xapian::Query(Xapian::Query::OP_OR, doc.termlist_begin(), doc.termlist_end());
	}

	return Xapian::Query();
}

// Everything from here to the matching #endif is compiled out.
// NOTE(review): this looks like an earlier search implementation kept for
// reference. It would not build if enabled as it stands: FTSData initialises
// a 'stem' member it never declares, search() is declared to return a
// Xapian::MSet but returns a std::set, and Config, FullTextSearch and 'data'
// are not declared anywhere in the visible part of this file.
#if 0

struct FTSData
{
	Config& cfg;
	Xapian::Database database;

	FTSData(Config& cfg) : cfg(cfg), stem("en")
	{
		if (!wibble::sys::fs::access(cfg.ftsdb, F_OK))
			index();
		database.add_database(Xapian::Database(cfg.ftsdb));
	}

	Xapian::docid getDocID(const std::string& pkg)
	{
		Xapian::Enquire enquire(database);
		Xapian::Query query(pkg, 1, 1);
		enquire.set_query(query);
		Xapian::MSet matches = enquire.get_mset(0, 1);
		if (matches.empty())
			return 0;
		//	cfg.log() << "DOC " << matches.begin().get_document().get_data() << endl;
		return *matches.begin();
	}

#if 0
	Xapian::Query similarityQuery(const std::string& pkg)
	{
		using namespace std;
		tagcoll::input::Stdio in(cfg.pkgdb);
		DebDBParser parser(in);
		DebDBParser::Record rec;
		vector<string> postings;
		postings.push_back(pkg);
		cfg.log() << "SQT " << pkg << std::endl;
		while (parser.nextRecord(rec))
		{
			if (rec["Package"] != pkg)
				continue;

			wibble::Tokenizer tok(rec["Description"], "[A-Za-z0-9_-]+", REG_EXTENDED);
			for (wibble::Tokenizer::const_iterator i = tok.begin(); i != tok.end(); ++i)
			{
				cfg.log() << "SQT " << normalise(*i) << std::endl;
				postings.push_back(normalise(*i));
			}
		}
		return Xapian::Query(Xapian::Query::OP_ELITE_SET, postings.begin(), postings.end());
	}
#endif
};

Xapian::MSet TextSearch::search(const std::vector<std::string>& keys)
{
	using namespace std;
	Xapian::Enquire enquire(m_db);
	vector<string> terms;
	for (vector<string>::const_iterator i = keys.begin(); i != keys.end(); ++i)
	{
		string t = toLower(*i);
		string s = stem(t);
		terms.push_back(t);
		if (s != t)
			terms.push_back(s);
	}
	Xapian::Query query(Xapian::Query::OP_OR, terms.begin(), terms.end());
	enquire.set_query(query);
	//Xapian::MSet matches = enquire.get_mset(0, 10);
	Xapian::MSet matches = enquire.get_mset(0, 100);

	std::set<std::string> res;
	for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i)
	{
		// Cut off poor results
		if (res.size() > 10 && i.get_percent() < 60) break;
		//cfg.log() << i.get_document().get_data() << ": " << i.get_percent() << "%" << endl;
		res.insert(i.get_document().get_data());
	}
	return res;
}

std::vector<std::string> FullTextSearch::similar(const std::string& pkg)
{
	using namespace std;
	Xapian::docid id = data->getDocID(pkg);
	if (id == 0)
		return vector<string>();

	Xapian::Enquire enquire(data->database);
	Xapian::RSet rset;
	rset.add_document(id);
	Xapian::ESet eset = enquire.get_eset(7, rset);
	if (eset.empty())
		return vector<string>();
//	for (Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i)
//		cfg.log() << "EI " << *i << ": " << i.get_weight() << endl;

	Xapian::Query query(Xapian::Query::OP_OR, eset.begin(), eset.end());
	enquire.set_query(query);
	//Xapian::MSet matches = enquire.get_mset(0, 10);
	Xapian::MSet matches = enquire.get_mset(0, 100);
	std::vector<std::string> res;
	for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i)
	{
		// Cut off poor results
		if (res.size() > 20 && i.get_percent() < 60) break;
		//cfg.log() << i.get_document().get_data() << ": " << i.get_percent() << "%" << endl;
		if (i.get_document().get_data() != pkg)
			res.push_back(i.get_document().get_data());
	}
	return res;
}
#if 0
std::set<std::string> FullTextSearch::similar(const std::string& pkg)
{
	using namespace std;
	Xapian::Enquire enquire(data->database);
	enquire.set_query(data->similarityQuery(pkg));
	Xapian::MSet matches = enquire.get_mset(0, 100);
	std::set<std::string> res;
	for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i)
	{
		// Cut off poor results
		if (res.size() > 10 && i.get_percent() < 60) break;
		cfg.log() << i.get_document().get_data() << ": " << i.get_percent() << "% " << *i << endl;
		res.insert(i.get_document().get_data());
	}

	Xapian::RSet rset;
	rset.add_document(1863);
	Xapian::ESet eset = enquire.get_eset(100, rset);
	cfg.log() << "ES " << eset.size() << endl;
	for (Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i)
		cfg.log() << "EI " << *i << endl;
	return res;
}
#endif
#endif

}
}

#include <ept/debtags/debtags.tcc>

// vim:set ts=4 sw=4:
