/*
 * Analyser.cpp
 *
 * Responsible for interacting with the Cma code and Arb.
 *
 *  Created on: Feb 15, 2010
 *      Author: breno
 *
 *  Institute of Microbiology (Technical University Munich)
 *  http://www.arb-home.de/
 */

#include "Analyser.h"

#include <ctime>
#include <iostream>
#include <iterator>

/**
 * Constructor
 */
Analyser::Analyser(GBDATA *gb_main) :
    loader(NULp),
    cma(NULp)
{

    //Definition of the alphabet
    vector<string> alphabet(0);
    alphabet.push_back("A");
    alphabet.push_back("C");
    alphabet.push_back("G");
    alphabet.push_back("T");

    loader = new AlignedSequenceLoader(gb_main);
    if (!loader->has_error()) {
        VecVecType *seqs = loader->getSequences();
        cma = new Cma(alphabet, seqs->size());
    }
}

/**
 * Destructor.
 */
Analyser::~Analyser() {
    delete loader;
    delete cma;
}

/**
 * Getter for the Cma object.
 *
 * @return the Cma object.
 */
Cma* Analyser::getCma() {
    arb_assert(!has_error());
    return cma;
}

/**
 * Returns the AlignedSequenceLoader object.
 *
 * @return the AlignedSequenceLoader object.
 */
AlignedSequenceLoader* Analyser::getLoader() {
    return loader;
}

/**
 * Saves the clusters to the DB as SAI.
 *
 * @param clusters: the cluster indices.
 * @param threshold: the clustering threshold used.
 *
 * @return an GB_ERROR if some DB transaction fails.
 */
GB_ERROR Analyser::saveSAI(GBDATA *gb_main, vector<size_t> clusters, double threshold) {
    GB_ERROR error = GB_push_transaction(gb_main);

    if (!error) {
        char *al_name = GBT_get_default_alignment(gb_main);
        if (!al_name) {
            error = GB_await_error();
        }
        else {
            vector<char> clusts = normalizeClusters(clusters);
            // build result string
            stringstream ss;
            copy(clusts.begin(), clusts.end(), ostream_iterator<char> (ss, ""));
            string result = ss.str();

            //save
            GBDATA *gb_sai = GBT_find_or_create_SAI(gb_main, getSAIname());
            GBDATA *gb_data = GBT_add_data(gb_sai, al_name, "data", GB_STRING);
            error = GB_write_string(gb_data, result.c_str());

            if (!error) {
                GBDATA       *gb_options = GBT_add_data(gb_sai, al_name, "_TYPE", GB_STRING);
                stringstream  options;
                options << "CMA_CLUSTERING: [threshold: " << threshold << "]";

                error = GB_write_string(gb_options, options.str().c_str());
            }
            free(al_name);
        }
    }
    error = GB_end_transaction(gb_main, error);

    return error;
}

/**
 * Gives clusters a reasonable name. The clustering algorithm may return
 * clusters with indices 123 even though there are only 50 clusters.
 * Here we normalise the cluster names, in this example we would have
 * only clusters from 0..50 as result.
 *
 * @param clusters: the result of the clustering algorithm.
 *
 * @return the new cluster names (note that the cluster index is a char
 *         because we have to be able to show it in the SAI, where we
 *         are allowed to use only one character for each position).
 */
vector<char> Analyser::normalizeClusters(vector<size_t> clusters) {

    vector<char> result;
    map<unsigned int, char> rename_map;
    rename_map[0] = '-';
    char classes = '0';

    for (vector<size_t>::iterator it = clusters.begin(); it != clusters.end(); ++it) {
        if (rename_map.find(*it) == rename_map.end()) {
            rename_map[*it] = classes++;
        }
        result.push_back(rename_map[*it]);
    }
    return result;

}

//--------------------------------

GB_ERROR run_rnacma(GBDATA *gb_main, bool interactive, double threshold) {
    /*! interactive main loop of RNACMA.
     * @param gb_main the database.
     * @param interactive normally true. set to false for (non-interactive) unittesting.
     * @param threshold only used if !interactive (i.e. for unittesting)
     */

    GB_ERROR error = NULp;
    try {
        Analyser a(gb_main);
        error = a.get_error();

        if (!error) {
            Cma *cma = a.getCma();

            cma->computeMutualInformationP(*(a.getLoader()->getSequences()));

            list<MITuple> mituple = cma->compute_mituples(cma->getMIp());

            cout << endl
                 << "The highest MI value was: " << mituple.begin()->MI
                 << " at position (" << mituple.begin()->pos1 << ", "
                 << mituple.begin()->pos2 << ")." << endl
                 << "(Note: pairs with MI-values down to the threshold will be joined in one cluster)" << endl;

            while (true) {
                if (interactive) {
                    cout << endl
                         << "Press Ctrl-d to quit or" << endl
                         << "choose a threshold value for the clustering process: ";


                    string input;
                    cin >> input;

                    if (input.empty()) {
                        cout << endl << "quit" << endl;
                        break;
                    }

                    threshold = strtod(input.c_str(), NULp);
                }
                cout << "Building clusters with threshold = " << threshold << endl;

                VectorXi cl = cma->computeClusters(mituple, size_t(cma->getMIp().cols()), threshold);
                vector<size_t> *clusters = new vector<size_t> (0, 0);
                AlignedSequenceLoader *l = a.getLoader();
                size_t i = 0;
                size_t j = 0;
                for (vector<size_t>::const_iterator it = l->getPositionMap().begin(); it != l->getPositionMap().end(); ++it) {
                    while (i < *it) {
                        clusters->push_back(0);
                        i++;
                    }
                    clusters->push_back(cl[j]);
                    j++;
                    i++;
                }

                GB_ERROR e = a.saveSAI(gb_main, *clusters, threshold);
                delete clusters;

                if (e) {
                    cout << "Error while saving SAI: " << e << endl;
                    throw e;
                }
                else {
                    cout << "Saved results to SAI '" << Analyser::getSAIname() << "'" << endl
                         << "(To analyse the results, open the editor and visualise the clusters using the SAI annotations)" << endl;
                }

                if (!interactive) break;
            }
        }
    }
    catch (const char *err) {
        error = GBS_global_string("caught exception: %s", err);
    }
    return error;
}

int ARB_main(int /*argc*/, char */*argv*/[]) {
    cout
        << "arb_rnacma -- correlated mutation analysis" << endl
        << "              computes clusters of correlated positions" << endl
        << "(C) 2010 The ARB-project" << endl
        << "Written 2009/2010 by Breno Faria" << endl
        << endl
        << "arb_rnacma uses the eigen C++ library (http://eigen.tuxfamily.org/)" << endl
        << "eigen is copyrighted by LGPL3" << endl
        << endl;

    int exitcode = EXIT_SUCCESS;
    {
        GB_ERROR error = NULp;
        {
            GB_shell  shell;
            GBDATA   *gb_main = GB_open(":", "rwt");

            if (!gb_main) {
                error = GB_await_error();
            }
            else {
                error = run_rnacma(gb_main, true, 0.0);
            }
            GB_close(gb_main);
        }

        if (error) {
            cout << endl << "Error in arb_rnacma: " << error << ". terminating." << endl;
            exitcode = EXIT_FAILURE;
        }
    }

    return exitcode;
}

// --------------------------------------------------------------------------------

#ifdef UNIT_TESTS
#ifndef TEST_UNIT_H
#include <test_unit.h>
#endif

void TEST_RNACMA() {
    GB_shell shell;
    {
        const char *db      = "TEST_nuc_short.arb"; // ../UNIT_TESTER/run/TEST_nuc_short.arb
        const char *aliname = "ali_16s";
        const char *sainame = "cma_clusters";

        GBDATA *gb_main;
        TEST_EXPECT_RESULT__NOERROREXPORTED(gb_main = GB_open(db, "rw"));

        const int NO_OF_MARKED = 8;
        TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED);

        {
            GB_transaction ta(gb_main);
            TEST_EXPECT_NORESULT__NOERROREXPORTED(GBT_find_SAI(gb_main, sainame)); // expect SAI to be missing
        }

        {
            GB_transaction ta(gb_main);

            struct {
                double      threshold;
                const char *expected_clusters_data;
            } data[] = {
                { 0.5,   "--------------------------0-----10-----0---------------------23--041---000--5-55-----000060----------0600005---55575-7----5000140--3----0--0--------------2-6------0--0000--000--------0000------0-00----------0------0-00000-0-" },
                { 0.666, "--------------------------0------1-----1----------------------2--13----456--7-88-----9::0;------------;004:8---888<7-<----8494-3---2----5--5----------------;----------16---199---------01---------------------9------:--::99---" },
                { 0.833, "----------------------------------------------------------------------------0--1-----2-------------------------111-0---------------------------------------------------------3---------------------------------3------------2---" },
                { 1.0,   "-------------------------------------------------------------------------------0--------------------------------00-----------------------------------------------------------1---------------------------------1----------------" },
                { 1.5,   "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" },

                // output for testdata: "The highest MI value was: 1.48 at position (77, 106)."
                { -1.0, NULp }
            };

            TEST_EXPECT_NO_ERROR(run_rnacma(gb_main, false, data[0].threshold));

            GBDATA *gb_sai = GBT_find_SAI(gb_main, sainame);
            TEST_REJECT_NULL(gb_sai);

            // test sai content:
            GBDATA *gb_sai_data = GBT_find_sequence(gb_sai, aliname);
            TEST_REJECT_NULL(gb_sai_data);

            int idx = 0;
            while (1) {
                TEST_ANNOTATE(GBS_global_string("idx=%i", idx));
                const char *sai_data = GB_read_char_pntr(gb_sai_data);
                TEST_EXPECT_EQUAL(sai_data, data[idx].expected_clusters_data);

                ++idx;
                if (!data[idx].expected_clusters_data) break; // continue as long there are results to test

                TEST_EXPECT_NO_ERROR(run_rnacma(gb_main, false, data[idx].threshold));
            }
        }

        const int SOME_THRESHOLD = 0.5;

        // fail on invalid default alignment:
        {
            GB_transaction ta(gb_main);
            TEST_EXPECT_NO_ERROR(GBT_set_default_alignment(gb_main, "ali_fantasy"));
            TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "alignment 'ali_fantasy' not found");
            ta.close("abort"); // undo change of default alignment
        }

        // want failure if a marked sequence lacks data:
        for (int fail = 1; fail<=2; ++fail) {
            TEST_ANNOTATE(GBS_global_string("fail=%i", fail));

            GBDATA *gb_species;
            {
                GB_transaction ta(gb_main);

                gb_species      = GBT_first_marked_species(gb_main);
                GBDATA *gb_data = GBT_find_sequence(gb_species, aliname);

                switch (fail) {
                    case 1: {
                        GB_topSecurityLevel permit(gb_main);
                        TEST_EXPECT_NO_ERROR(GB_delete(gb_data));
                        break;
                    }
                    case 2: {
                        GBDATA *gb_ali = GB_get_father(gb_data);
                        TEST_EXPECT_NO_ERROR(GB_delete(gb_ali));
                        break;
                    }
                    default: arb_assert(0); break;
                }

                TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "species 'LcbPont2' has no data in selected alignment 'ali_16s'");

                ta.close("abort"); // undo changes
            }
            {
                // somehow the species flag is reset by the above (aborted) transaction.
                // @@@ could be a general database bug!? undo delete is known to be problematic (see #496)
                GB_transaction ta(gb_main);
                GB_write_flag(gb_species, 1);
            }
        }
        TEST_ANNOTATE(NULp);
        TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED);

        // @@@ provoke some more errors?

        // test failure when no species marked:
        {
            GB_transaction ta(gb_main);
            GBT_mark_all(gb_main, 0);
            TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "caught exception: no (marked?) sequences");
            ta.close("abort"); // undo changes
        }

        // Note that marks are correctly restored by aborting TA
        TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED);

        GB_close(gb_main);
    }
}

#endif // UNIT_TESTS

// --------------------------------------------------------------------------------
