/*
 * AlignedSequenceLoader.cxx
 *
 *  Created on: Feb 15, 2010
 *      Author: Breno Faria
 *
 *  Institute of Microbiology (Technical University Munich)
 *  http://www.arb-home.de/
 */


#include "AlignedSequenceLoader.h"

#include <arbdbt.h>

/**
 * Loads the marked sequences aligned from Arb's DB.
 * This loader only considers letters given by the following regular
 * expression: [ACGTUacgtu]
 */
AlignedSequenceLoader::AlignedSequenceLoader(GBDATA *gb_main) :
    error(NULp),
    seqs(NULp)
{
    error = GB_push_transaction(gb_main);
    if (!error) {
        char *al_name = GBT_get_default_alignment(gb_main);
        if (!al_name) {
            error = GB_await_error();
        }
        else {
            int al_len = GBT_get_alignment_len(gb_main, al_name);

            if (al_len <= 0) {
                error = GB_await_error();
            }
            else {
                seqs    = new VecVecType(0);
                MSA_len = al_len;

                size_t occurrences[MSA_len];
                for (size_t i = 0; i < MSA_len; i++) occurrences[i] = 0;

                cout << "loading marked species: ";
                flush(cout);
                for (GBDATA *gb_species = GBT_first_marked_species(gb_main);
                     gb_species && !error;
                     gb_species = GBT_next_marked_species(gb_species))
                {
                    GBDATA *gb_data = GBT_find_sequence(gb_species, al_name);
                    if (!gb_data) {
                        error = GBS_global_string("species '%s' has no data in selected alignment '%s'",
                                                  GBT_get_name_or_description(gb_species),
                                                  al_name);
                    }
                    else { // existing alignment data for this species
                        cout << GBT_get_name_or_description(gb_species) << " ";
                        flush(cout);

                        string sequence = GB_read_char_pntr(gb_data);

                        string *seq_as_vec = new string[MSA_len];
                        int     k          = 0;
                        for (string::iterator i = sequence.begin(); i != sequence.end(); ++i) {
                            switch (*i) {
                                case 'A':
                                case 'a':
                                    seq_as_vec[k] = "A";
                                    occurrences[k] += 1;
                                    break;
                                case 'C':
                                case 'c':
                                    seq_as_vec[k] = "C";
                                    occurrences[k] += 1;
                                    break;
                                case 'G':
                                case 'g':
                                    seq_as_vec[k] = "G";
                                    occurrences[k] += 1;
                                    break;
                                case 'T':
                                case 't':
                                case 'U':
                                case 'u':
                                    seq_as_vec[k] = "T";
                                    occurrences[k] += 1;
                                    break;
                                default:
                                    seq_as_vec[k] = "-";
                                    break;
                            }
                            k++;
                        }

                        arb_assert((size_t)k == MSA_len);
                        vector<string> seq_vector(&seq_as_vec[0], &seq_as_vec[k]);
                        delete [] seq_as_vec;

                        seqs->push_back(seq_vector);
                    }
                }

                cout << "done. Total number of species: " << seqs->size() << endl;
                flush(cout);

                cleanSeqs(occurrences, MSA_len);
            }

            free(al_name);
        }
    }

    error = GB_end_transaction(gb_main, error);
}

/**
 * Returns the aligned seqs.
 *
 * @return the aligned seqs.
 */
VecVecType* AlignedSequenceLoader::getSequences() {
    arb_assert(!has_error());
    arb_assert(seqs);
    return seqs;
}

/**
 * Destructor.
 */
AlignedSequenceLoader::~AlignedSequenceLoader() {
    delete seqs;
}

/**
 * Getter for the MSA_len.
 *
 * @return the MSA length.
 */
size_t AlignedSequenceLoader::getMsaLen() {
    arb_assert(!has_error());
    return MSA_len;
}

/**
 * Getter for the position map. The position map maps positions in the clean
 * sequence to the original positions in the alignment.
 *
 * @return the position map.
 */
const vector<size_t>& AlignedSequenceLoader::getPositionMap() {
    arb_assert(!has_error());
    return position_map;
}

/**
 * This method cleans-up the empty positions of the MSA.
 *
 *
 * @param occurrences: a list gathering the number of occurrences of bases at
 *                     each position of the MSA.
 * @param len: the length of occurrences.
 */
void AlignedSequenceLoader::cleanSeqs(const size_t *occurrences, long len) {

    cout << "cleaning-up sequences of empty positions... " << endl;
    flush( cout);

    size_t num_of_bases = 0;
    for (int i = 0; i < len; i++) {
        if (occurrences[i] != 0) {
            num_of_bases++;
        }
    }

    cout << "number of non-empty positions in MSA: " << num_of_bases
         << ". Filtered out " << len - num_of_bases << " positions." << endl;

    VecVecType *clean_seqs = new VecVecType(0);

    cout << "computing position map..." << endl;
    position_map.resize(num_of_bases, 0);

    int j = 0;
    for (int i = 0; i < len; i++) {
        if (occurrences[i] != 0) {
            position_map.at(j) = i;
            j++;
        }
    }

    for (VecVecType::iterator seq = seqs->begin(); seq != seqs->end(); ++seq) {
        // for every sequence
        vector<string> sequence(num_of_bases, "");
        int jj = 0;
        for (int i = 0; i < len; ++i) {
            if (occurrences[i] != 0) {
                sequence.at(jj) = seq->at(i);
                jj++;
            }
        }
        arb_assert(sequence.size() == num_of_bases);
        clean_seqs->push_back(sequence);
    }

    delete seqs; // before overwriting it
    seqs = clean_seqs;

    cout << "clean-up done." << endl;

}

