#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cctype>

#include <unistd.h>

#include "poppler/Object.h"
#include "poppler/PDFDoc.h"
#include "poppler/Stream.h"
#include "poppler/TextOutputDev.h"
#include "poppler/GlobalParams.h"

#include "crypto++/sha.h"
#include "crypto++/hex.h"

#include "mysql++/mysql++.h"

#include <stdexcept>
#include <string>
#include <iostream>
#include <sstream>
#include <vector>
#include <map>
#include <fstream>

#include <boost/regex.hpp>

// Exception type used for failures reported by this program.
// The message passed to the constructor is available through what().
class TextifierError : public std::runtime_error
{
public:
	TextifierError(std::string msg)
		: std::runtime_error(msg)
	{
	}
};

// Abstract interface for a format-specific text extractor.
// Implementations are expected to be driven as init() -> is_supported() -> stream().
class Textifier
{
public:
	// Receives the path of the input file and a stream opened on it.
	virtual void init(const std::string& filename, std::istream* stream) = 0;

	// Returns true when this extractor can handle the input given to init().
	virtual bool is_supported() = 0;

	// Extracts the text and returns the stream that holds it.
	virtual std::stringstream& stream() = 0;

	// Virtual so that implementations can be destroyed through a Textifier*.
	virtual ~Textifier() {}
};

class TextifierDispatch
{
public:
	TextifierDispatch(const std::string& filename) : filename(filename)
	{
		inf = new std::ifstream(filename.c_str(), std::ifstream::binary);
		if (!inf->good())
			throw TextifierError("Unable to open file");
		calculate_hex_digest();
	}
	~TextifierDispatch()
	{
		inf->close();
		delete inf;
	}
	const std::string get_hex_digest() { return hex_digest; }
	void operator()();
private:
	void calculate_hex_digest()
	{
		CryptoPP::SHA sha;
		size_t block_size = sha.OptimalBlockSize();;
		byte* block = new byte[block_size];
		byte* digest = new byte[CryptoPP::SHA::DIGESTSIZE]; memset(digest, 0, CryptoPP::SHA::DIGESTSIZE);
	
		while(inf->good())
		{
			memset(block, 0, block_size);
			size_t this_block_size = inf->readsome((char*)block, block_size);
			if (!this_block_size) break;
			sha.Update(block, this_block_size);
		}
		sha.Final(digest);
		delete[] block;

		CryptoPP::HexEncoder hexenc(NULL, false);
		hexenc.Attach(new CryptoPP::StringSink(hex_digest));
		hexenc.Put(digest, CryptoPP::SHA::DIGESTSIZE);
		hexenc.MessageEnd();

		delete[] digest;
		inf->clear();
		inf->seekg(0);
	}
	std::string hex_digest;
	std::string filename;
	std::ifstream* inf;
};

/**
 * Textifier for PDF files, implemented on top of poppler.
 *
 * init() installs a fresh poppler GlobalParams object in the library-wide
 * `globalParams` pointer and opens the document by file name; the stream
 * argument is not used.
 */
class PopplerTextifier : public Textifier
{
public:
	PopplerTextifier() : pdf(NULL) { }

	void init(const std::string& filename, std::istream*)
	{
		// Drop state left by an earlier init() so repeated calls do not leak.
		release();

		// from poppler
		globalParams = new GlobalParams();

		pdf = new PDFDoc(new GooString(filename.c_str()));
	}

	~PopplerTextifier()
	{
		release();
	}

	/**
	 * Renders every page through a TextOutputDev that appends the extracted
	 * text to the internal stream, and returns that stream. Returns the
	 * stream untouched when init() has not been called.
	 */
	std::stringstream& stream()
	{
		if (!pdf)
			return out;

		// The C-style cast is kept from the original so the callback still
		// compiles if the library's TextOutputFunc typedef differs from
		// text_writer's signature.
		TextOutputDev tod((TextOutputFunc)&text_writer, static_cast<void*>(out.rdbuf()), false, true);
		pdf->displayPages(&tod, 1, pdf->getNumPages(), 72, 72, 0, false, false, false);

		return out;
	}

	/** True when a document has been opened and poppler reports it as OK. */
	bool is_supported()
	{
		return pdf != NULL && pdf->isOk();
	}
private:
	/**
	 * Output callback handed to TextOutputDev. `stream` is the std::stringbuf
	 * passed as user data in stream(); it arrives as void* so the function is
	 * called with the parameter type it was declared with.
	 */
	static void text_writer(void* stream, char* text, int len)
	{
		static_cast<std::stringbuf*>(stream)->sputn(text, len);
	}

	/** Frees the document and the GlobalParams object created by init(). */
	void release()
	{
		if (pdf)
		{
			delete pdf;
			pdf = NULL;
			delete globalParams;
			// Do not leave the library-wide pointer dangling.
			globalParams = NULL;
		}
	}

	std::stringstream out;
	PDFDoc* pdf;
};

/**
 * Very small RTF text extractor.
 *
 * It looks for every "\insrsid" control word, skips its numeric argument and
 * copies the characters that follow (minus control words) up to the next '}'.
 */
class RtfTextifier : public Textifier
{
public:
	RtfTextifier() : in(NULL) {}

	void init(const std::string&, std::istream* stream) { in = stream; }

	/** True when the input starts with the five characters "{\rtf". */
	bool is_supported()
	{
		char header[6] = {0};
		in->read(header, 5);
		// A file shorter than five bytes sets failbit/eofbit; clear it so
		// that the rewind works and the stream stays usable afterwards.
		in->clear();
		in->seekg(0);
		return strcmp(header, "{\\rtf") == 0;
	}

	/**
	 * Extracts the text into the internal stream and returns it.
	 * Every loop stops at end of input, so truncated or malformed files
	 * cannot make this spin forever.
	 */
	std::stringstream& stream()
	{
		const int eof = std::istream::traits_type::eof();
		char ch;
		// TODO definitely broken in a lot of common cases. need to either beef this up or link against a real rtf parser
		while (ffwd_past("\\insrsid"))
		{
			// Skip the numeric argument of \insrsid.
			while (isdigit(in->peek()))
				in->ignore();

			bool in_section = true;
			while (in_section && in->get(ch))
			{
				switch (ch)
				{
					case '\\':
					{
						// Skip the control word and the spaces after it.
						int next;
						while ((next = in->peek()) != eof && next != ' ')
							in->ignore();
						while (in->peek() == ' ')
							in->ignore();
						break;
					}
					case '}':
						in_section = false;
						break;
					default:
						out << ch;
				}
			}
		}
		return out;
	}
private:
	/**
	 * Advances the stream until just past the next occurrence of `search`.
	 * Returns false when the end of input is reached first.
	 *
	 * A sliding window over the last search.length() characters is compared
	 * after every read, so a match that starts inside a failed partial match
	 * (for example "\i\insrsid") is still found.
	 */
	bool ffwd_past(const std::string& search)
	{
		const size_t len = search.length();
		if (len == 0)
			return in->good();

		std::string window;
		char ch;
		while (in->get(ch))
		{
			window += ch;
			if (window.length() > len)
				window.erase(0, 1);
			if (window == search)
				return true;
		}
		return false;
	}

	std::stringstream out;
	std::istream* in;
};

/**
 * Textifier for .docx files.
 *
 * The document body is obtained by running `/usr/bin/unzip -p` on the file
 * and reading word/document.xml from the pipe; the text inside <w:t> elements
 * is concatenated into the output stream.
 */
class DocXTextifier : public Textifier
{
public:
	DocXTextifier() : in(NULL) {}

	void init(const std::string& filename, std::istream* stream) { this->filename = filename; in = stream; }

	/**
	 * True when the input starts with "PK".
	 * NOTE(review): this presumably matches any zip-style container, not
	 * only .docx files.
	 */
	bool is_supported()
	{
		char header[3] = {0};
		in->read(header, 2);
		// A file shorter than two bytes sets failbit/eofbit; clear it so
		// that the rewind works and the stream stays usable afterwards.
		in->clear();
		in->seekg(0);
		return strcmp(header, "PK") == 0;
	}

	/**
	 * Extracts the text into the internal stream and returns it. The stream
	 * is returned unchanged when the unzip process cannot be started.
	 */
	std::stringstream& stream()
	{
		// TODO - zlib doesn't work on these files, need to find something else linkable to pkunzip
		// The file name is quoted so that spaces and shell metacharacters in
		// it are passed to unzip literally instead of being interpreted.
		const std::string command = std::string("/usr/bin/unzip -p ") + shell_quote(filename) + std::string(" word/document.xml");
		FILE* xml = popen(command.c_str(), "r");

		if (!xml)
			return out;

		bool in_text_node = false;
		// int, not char: fgetc() returns EOF out of band, and a plain char
		// either never equals it (unsigned char) or confuses byte 0xFF with it.
		int ch;
		while ((ch = fgetc(xml)) != EOF)
		{
			if (ch != '<')
			{
				if (in_text_node)
					out << static_cast<char>(ch);
				continue;
			}

			// Collect everything between '<' and '>' and classify the tag.
			std::string tag;
			while ((ch = fgetc(xml)) != EOF && ch != '>')
				tag += static_cast<char>(ch);
			if (ch == EOF)
				break;

			if (tag == "/w:t")
				in_text_node = false;
			else if (is_text_open_tag(tag))
				in_text_node = true;
		}

		// Streams created by popen() must be closed with pclose().
		pclose(xml);
		return out;
	}
private:
	/**
	 * True for "<w:t>" and for "<w:t ...attributes...>" such as
	 * <w:t xml:space="preserve">, but not for self-closing tags or for other
	 * elements whose name merely starts with "w:t".
	 * `tag` is the text between '<' and '>'.
	 */
	static bool is_text_open_tag(const std::string& tag)
	{
		if (tag == "w:t")
			return true;
		return tag.length() > 3
			&& tag.compare(0, 3, "w:t") == 0
			&& isspace(static_cast<unsigned char>(tag[3]))
			&& tag[tag.length() - 1] != '/';
	}

	/**
	 * Wraps `arg` in single quotes for /bin/sh, replacing every embedded
	 * single quote with '\'' so the shell sees one literal word.
	 */
	static std::string shell_quote(const std::string& arg)
	{
		std::string quoted("'");
		for (std::string::const_iterator it = arg.begin(); it != arg.end(); ++it)
		{
			if (*it == '\'')
				quoted += "'\\''";
			else
				quoted += *it;
		}
		quoted += "'";
		return quoted;
	}

	std::stringstream out;
	std::istream* in;
	std::string filename;
};

/**
 * Fallback textifier: scans the raw bytes of the input for runs of printable
 * ASCII characters (' ' through '~') and emits the runs that are long enough.
 */
class MSWordTextifier : public Textifier
{
public:
	MSWordTextifier() : in(NULL) {}

	void init(const std::string&, std::istream* stream)
	{
		in = stream;
	}

	/* TODO but it doesn't hurt anything if it's at the end of the chain */
	bool is_supported() { return true; }

	/**
	 * Writes each printable run of at least TEXT_LEN_THRESHOLD characters
	 * (other than the exact run "bjbjN") to the output stream, one per line.
	 * Scanning stops at the first '\n' byte seen after something has been
	 * written, or at end of input. A run still pending at end of input is
	 * written as well.
	 */
	std::stringstream& stream()
	{
		std::string run;
		bool output_anything = false;
		char ch;
		// Testing the result of get() keeps `ch` from being used after a
		// failed read (it would be uninitialised for an empty input).
		while (in->get(ch))
		{
			if (ch >= ' ' && ch <= '~')
				run += ch;
			else if (flush_run(run))
				output_anything = true;

			if (output_anything && ch == '\n')
				return out;
		}
		// The input may end in the middle of a printable run.
		flush_run(run);
		return out;
	}
private:
	/**
	 * Writes `run` followed by a newline when it qualifies, then empties it.
	 * Returns true when something was written.
	 */
	bool flush_run(std::string& run)
	{
		bool written = false;
		if (run.length() >= TEXT_LEN_THRESHOLD && run != "bjbjN")
		{
			out << run << "\n";
			written = true;
		}
		run.clear();
		return written;
	}

	static const size_t TEXT_LEN_THRESHOLD = 5;
	std::istream* in;
	std::stringstream out;
};

/**
 * Key/value configuration loaded from /etc/textifier.conf.
 *
 * Each line has the form "key: value". The key is everything before the
 * first ':'; the value is the rest of the line with leading spaces and tabs
 * removed. Lines without a ':' are ignored. When a key appears more than
 * once, the last occurrence wins.
 */
class TextifierConf
{
public:
	/** @throws TextifierError if the configuration file cannot be read. */
	TextifierConf()
	{
		std::ifstream conf("/etc/textifier.conf");
		if (!conf.good())
			throw TextifierError("Could not read from /etc/textifier.conf");

		// Reading line by line handles a final line without a trailing
		// newline and keeps blank lines from leaking into the next key.
		std::string line;
		while (std::getline(conf, line))
		{
			const std::string::size_type colon = line.find(':');
			if (colon == std::string::npos)
				continue;

			// Only blanks are skipped after the colon, so "key:value" keeps
			// its first character and an empty value stays empty.
			const std::string::size_type value_start = line.find_first_not_of(" \t", colon + 1);
			const std::string value = (value_start == std::string::npos) ? std::string() : line.substr(value_start);
			options[line.substr(0, colon)] = value;
		}
	}

	/**
	 * Returns the value stored for `key`.
	 * @throws TextifierError if the key is not present.
	 */
	std::string operator[](const std::string& key) const
	{
		std::string rv;
		if (get_if_exists(key, rv))
			return rv;
		throw TextifierError("No configuration set for option " + key);
	}

	/**
	 * Copies the value for `key` into `rv` and returns true, or returns
	 * false and leaves `rv` untouched when the key is not present.
	 */
	bool get_if_exists(const std::string& key, std::string& rv) const
	{
		const std::map<std::string, std::string>::const_iterator cit = options.find(key);
		if (cit == options.end())
			return false;

		rv = cit->second;
		return true;
	}
private:
	std::map<std::string, std::string> options;
};

/**
 * Extracts the text of the input file and stores it in MySQL.
 *
 * Steps:
 *  1. Load the configuration (throws TextifierError if it is missing).
 *  2. Try the textifiers in order (DocX, RTF, PDF, MS Word fallback) and
 *     take the text from the first one that reports the file as supported.
 *  3. Strip characters outside the allowed set, and hyphens followed by
 *     whitespace.
 *  4. Connect to MySQL (through the "socket" option when set, otherwise via
 *     "host") and run the configured "insert" template query with the hex
 *     digest and the text as parameters.
 *  5. Run the optional "indexer" command.
 *
 * @throws TextifierError on missing configuration options or when the
 *         connection cannot be established.
 */
void TextifierDispatch::operator()()
{
	TextifierConf conf;

	std::string text;
	bool found_handler = false;
	{
		// The textifiers live on the stack so they are released on every
		// exit path, including exceptions thrown while extracting. The
		// enclosing scope ends their lifetime before the database work.
		DocXTextifier docx;
		RtfTextifier rtf;
		PopplerTextifier pdf;
		MSWordTextifier msword;
		Textifier* const chain[] = { &docx, &rtf, &pdf, &msword };
		const size_t chain_len = sizeof(chain) / sizeof(chain[0]);

		for (size_t idx = 0; idx < chain_len && !found_handler; ++idx)
		{
			// Give every candidate a clean stream positioned at the start,
			// whatever state the previous candidate left it in.
			inf->clear();
			inf->seekg(0);

			chain[idx]->init(filename, inf);
			if (chain[idx]->is_supported())
			{
				text = chain[idx]->stream().str();
				found_handler = true;
			}
		}
	}

	if (!found_handler)
		return;

	boost::regex bad_chars("(?:[^-a-zA-Z0-9\\(\\)\\[\\] \t\n\r&%$'\";:?.,+*/]|-[ \t\n\r]+)");
	text = boost::regex_replace(text, bad_chars, "");

	mysqlpp::Connection* conn = NULL;
	try
	{
		std::string socket;
		if (conf.get_if_exists("socket", socket))
		{
			mysqlpp::UnixDomainSocketConnection* unix_conn = new mysqlpp::UnixDomainSocketConnection();
			conn = unix_conn;
			if (!unix_conn->connect(socket.c_str(), conf["db"].c_str(), conf["user"].c_str(), conf["password"].c_str()))
				throw TextifierError("Unable to connect to MySQL using supplied credentials");
		}
		else
		{
			conn = new mysqlpp::Connection(false);
			if (!conn->connect(conf["db"].c_str(), conf["host"].c_str(), conf["user"].c_str(), conf["password"].c_str()))
				throw TextifierError("Unable to connect to MySQL using supplied credentials");
		}

		// The query object is scoped to this block so it is destroyed
		// before the connection it was created from.
		mysqlpp::Query sql = conn->query();
		sql << conf["insert"];
		sql.parse();
		mysqlpp::SQLQueryParms params;
		params += hex_digest;
		params += text;
		// NOTE(review): the result of execute() is not inspected.
		sql.execute(params);
	}
	catch (...)
	{
		// Release the connection on every failure path before rethrowing.
		delete conn;
		throw;
	}
	delete conn;

	// Best effort: the exit status of the indexer command is not checked.
	std::string indexer_bin;
	if (conf.get_if_exists("indexer", indexer_bin))
		system(indexer_bin.c_str());
}


/**
 * Entry point: textifier <filename>
 *
 * Opens the file and computes its digest, then forks. The parent prints the
 * hex digest on stdout and exits successfully; the child carries on with the
 * text extraction and storage. Errors are reported on stderr and yield
 * EXIT_FAILURE.
 */
int main(int argc, char** argv)
{
	if (argc != 2)
	{
		std::cerr << "Expected exactly one argument, the path to the file to be textified\n"
		          << argv[0] << " <filename>\n";
		return EXIT_FAILURE;
	}

	// Nothing to do when the parent process is pid 1.
	if (getppid() == 1) return 0;

	try
	{
		// Direct initialisation: the dispatcher must not be copied.
		const std::string path(argv[1]);
		TextifierDispatch disp(path);

		const pid_t pid = fork();
		if (pid < 0)
		{
			std::cerr << "Couldn't fork\n";
			return EXIT_FAILURE;
		}
		if (pid > 0)
		{
			// Parent: report the digest and leave the work to the child.
			std::cout << disp.get_hex_digest() << std::endl;
			return EXIT_SUCCESS;
		}

		// Child: extract and store the text.
		disp();
	}
	catch (const std::exception& err)
	{
		// Without this handler a failure (e.g. an unreadable file) would
		// terminate the process without any message.
		std::cerr << argv[0] << ": " << err.what() << "\n";
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}
