// =============================================================== //
//                                                                 //
//   File      : Translate.cxx                                     //
//   Purpose   : Nucleotide->AA translation                        //
//                                                                 //
//   Coded by Ralf Westram (coder@reallysoft.de) in June 2006      //
//   Institute of Microbiology (Technical University Munich)       //
//   http://www.arb-home.de/                                       //
//                                                                 //
// =============================================================== //

#include "Translate.hxx"

#include <AP_pro_a_nucs.hxx>
#include <AP_codon_table.hxx>
#include <arbdbt.h>
#include <arb_global_defs.h>
#include <cctype>

#define tl_assert(cond) arb_assert(cond)

GB_ERROR translate_saveInfo(GBDATA *gb_species, int arb_transl_table, int codon_start) {
    // Stores the translation info below 'gb_species':
    // - 'transl_table' receives the EMBL table number corresponding to 'arb_transl_table'
    // - 'codon_start'  receives 'codon_start'+1 (internal value is 0..2, stored value is 1..3)
    //
    // returns: an error if one of the two writes fails ('codon_start' is not written if
    //          writing 'transl_table' failed)

    const int embl_code = TTIT_arb2embl(arb_transl_table);

    tl_assert(!(codon_start<0 || codon_start>2)); // only 0..2 are valid
    tl_assert(embl_code >= 0);

    GB_ERROR failure = GBT_write_string(gb_species, "transl_table", GBS_global_string("%i", embl_code));
    if (failure) return failure;

    return GBT_write_string(gb_species, "codon_start", GBS_global_string("%i", codon_start+1));
}

GB_ERROR translate_removeInfo(GBDATA *gb_species) {
    // Deletes the sub-entries 'transl_table' and 'codon_start' of 'gb_species' (if they exist).
    //
    // returns: the error of the first failing delete (later entries are left untouched in that case)

    static const char * const info_fields[] = { "transl_table", "codon_start" };
    const int                 field_count   = 2;

    GB_ERROR error = NULp;
    for (int f = 0; f<field_count && !error; ++f) {
        GBDATA *gb_field = GB_entry(gb_species, info_fields[f]);
        if (gb_field) error = GB_delete(gb_field);
    }
    return error;
}

GB_ERROR translate_getInfo(GBDATA *gb_item, int& arb_transl_table, int& codon_start) {
    // Reads the sub-entries 'transl_table' and 'codon_start' of 'gb_item'
    // (comment in original code states this works for species and genes).
    //
    // 'transl_table' is interpreted as EMBL table number and converted into the ARB table number.
    // 'codon_start'  has to contain 1..3 and is converted into the internal value 0..2.
    // If 'transl_table' exists, but 'codon_start' does not, 'codon_start' is written
    // to the database with the default "1".
    //
    // results:
    // 'arb_transl_table' is -1 if 'transl_table' was not found, otherwise the arb table number
    // 'codon_start'      is -1 if not found (and not defaulted), otherwise the codon_start (0..2)
    //
    // returns: an error in case of problems (the item name/description is appended to the message)
    //
    // NOTE(review): atoi() is fed the raw result of GB_read_char_pntr() - presumably that is
    // never NULL for an existing entry; confirm.

    const int NOT_FOUND = -1;

    arb_transl_table = NOT_FOUND;
    codon_start      = NOT_FOUND;

    GB_ERROR error = NULp;

    if (GBDATA *gb_table = GB_entry(gb_item, "transl_table")) {
        const int embl_nr = atoi(GB_read_char_pntr(gb_table));
        arb_transl_table  = TTIT_embl2arb(embl_nr);
        if (arb_transl_table == NOT_FOUND) { // table number is unknown
            error = GBS_global_string("Illegal (or unsupported) value (%i) in 'transl_table'", embl_nr);
        }
    }

    if (!error) {
        GBDATA *gb_start = GB_entry(gb_item, "codon_start");
        if (!gb_start) {
            if (arb_transl_table != NOT_FOUND) {
                // table is known, but no start is stored -> store and use default (=1)
                error = GBT_write_string(gb_item, "codon_start", "1");
                if (!error) codon_start = 0; // internal value is 0..2
            }
        }
        else {
            const int stored_start = atoi(GB_read_char_pntr(gb_start));
            if (stored_start >= 1 && stored_start <= 3) {
                codon_start = stored_start-1; // internal value is 0..2
            }
            else {
                error = GBS_global_string("Illegal value (%i) in 'codon_start' (allowed: 1..3)", stored_start);
            }
        }
    }

    if (!error) { // complain if only one of both values is available
        const bool have_table = arb_transl_table != NOT_FOUND;
        const bool have_start = codon_start      != NOT_FOUND;

        if      (have_start && !have_table) error = "Found 'codon_start', but 'transl_table' is missing";
        else if (have_table && !have_start) error = "Found 'transl_table', but 'codon_start' is missing";
    }

    if (!error) return NULp;

    // make clear which item caused the problem
    return GBS_global_string("%s (item='%s')", error, GBT_get_name_or_description(gb_item));
}

inline void memcpy3(char *dest, const char *source) {
    // copies exactly 3 characters (i.e. one codon) from 'source' to 'dest';
    // characters are copied front to back, no terminator is written
    for (int idx = 0; idx<3; ++idx) {
        dest[idx] = source[idx];
    }
}

int translate_nuc2aa(int arb_code_nr, char *data, size_t size, size_t pos, bool translate_all, bool create_start_codon, bool append_stop_codon, int *translatedSize) {
    // Translates the nucleotide sequence in 'data' (in place) into an amino acid sequence.
    //
    // 'arb_code_nr' selects the codon table (passed to AWT_translator)
    // 'data'        0-terminated nucleotide sequence; gets converted to uppercase ('U' -> 'T')
    //               and is then overwritten with the translated protein sequence
    // 'size'        number of nucleotides in 'data' (Note: provide data with correct size)
    // 'pos'         offset of the first codon (0..2)
    //
    // if 'translate_all' is true and 'pos' > 0, a leading 'X' is produced in the protein data
    //                                 if any non-gap is located in front of 'pos'
    //                                 (otherwise nucleotides in front of the starting pos are simply ignored)
    //
    // if 'create_start_codon' is true and the first generated codon is a start codon of the used
    //                                 code, a 'M' is inserted instead of the codon
    //                                 (a leading 'X' is kept; the 'M' replaces the codon behind it)
    // if 'append_stop_codon' is true, the stop codon is appended as '*'. This is only done, if the last
    //                                 character not already is a stop codon. (Note: provide data with correct size)
    //
    // returns:
    // - the translated protein sequence in 'data'
    // - the length of the translated protein sequence in 'translatedSize' (if 'translatedSize' is not NULL)
    // - number of stop-codons in translated sequence as result (incl. optional or appended stop-codons)

    const bool create_stop_codon = true; // could be a param; acts similar to 'create_start_codon', but for optional stop codons

    arb_assert(pos <= 2);

    // normalize sequence: uppercase + DNA alphabet
    for (char *p = data; *p;  p++) {
        char c = *p;
        if ((c>='a') && (c<='z')) c = c+'A'-'a';
        if (c=='U') c = 'T';
        *p = c;
    }

    char codonBuf[4] = { 0, 0, 0, 0 }; // holds the codon currently translated (0-terminated)

    char *dest = data;

    if (pos && translate_all) {
        for (char *p = data; p<data+pos; ++p) {
            char c = *p;
            if (!GAP::is_std_gap(c)) { // found a nucleotide
                *dest++ = 'X';
                break;
            }
        }
    }

    char *const first_aa  = dest;       // where the first translated codon will be stored (behind optional 'X')
    const bool  has_codon = pos+2<size; // false -> sequence is too short to contain a complete codon at 'pos'

    int  stops      = 0;
    char startCodon = 0;

    AWT_translator translator(arb_code_nr);

    if (create_start_codon && has_codon) { // never read behind the end of a (too) short sequence
        memcpy3(codonBuf, data+pos);
        startCodon = translator.isStartCodon(codonBuf);
    }

    // Note: writing to 'dest' never overtakes reading from 'p' (1 char written per 3 chars read)
    size_t i = pos;
    for (char *p = data+pos; i+2<size; p+=3, i+=3) {
        memcpy3(codonBuf, p);

        char aa = translator.codon2aa(codonBuf);
        if (aa == '*') ++stops;
        arb_assert(!islower(aa));

        *(dest++) = aa;
    }

    if (dest>data) { // at least 1 amino written (might be the leading 'X' only)
        if (create_start_codon && startCodon) {
            arb_assert(startCodon == 'M');
            *first_aa = startCodon; // replace the first codon (not the leading 'X')
        }

        const bool last_is_stop = dest[-1] == '*';
        if (!last_is_stop) {
            // 'codonBuf' only contains the last translated codon, if any codon has been translated
            if (create_stop_codon && has_codon && translator.isStopCodon(codonBuf)) { // correct optional stop-codon
                arb_assert(translator.CodeNr()>=20); // appears first in table 20 (=EMBL 27)
                dest[-1] = '*'; // use it (we are at EOS)
                ++stops;
            }
            else if (append_stop_codon) {
                *dest++ = '*';
                ++stops;
            }
        }
    }
    dest[0] = 0;

    if (translatedSize) *translatedSize = static_cast<int>(dest-data);

    return stops;
}

// --------------------------------------------------------------------------------

#ifdef UNIT_TESTS
#ifndef TEST_UNIT_H
#include <test_unit.h>
#endif

static arb_test::match_expectation translates_into(int arb_code_nr, const char *dna, const char *exp_transl, int exp_stops, int exp_size) {
    // Test helper: translates a copy of 'dna' using code 'arb_code_nr' and builds an expectation
    // which is fulfilled if
    // - the translated sequence equals 'exp_transl',
    // - the number of stop codons returned equals 'exp_stops' and
    // - the translated size reported equals 'exp_size'.
    //
    // Translation is always done with pos=0, translate_all=false,
    // create_start_codon=true and append_stop_codon=true.

    using namespace arb_test;

    size_t  dna_len = strlen(dna);
    // translate_nuc2aa works in place -> operate on a private copy
    char   *data    = ARB_strduplen(dna, dna_len);

    int size;
    int stops = translate_nuc2aa(arb_code_nr, data, dna_len, 0, false, true, true, &size);
    // test all 3 reading frames?

    const char *translated = data;

    expectation_group expected(that(translated).is_equal_to(exp_transl));
    expected.add(that(stops).is_equal_to(exp_stops));
    expected.add(that(size).is_equal_to(exp_size));

    // NOTE(review): 'translated' points into 'data', which is freed before the group is evaluated below -
    // presumably the expectation stores its own copy of the string; confirm.
    free(data);

    return all().ofgroup(expected);
}

// Shortcuts used by TEST_translate:
// TEST_TRANSLATION         expects that translating 'dna' with code 'nr' results in 'aa', 'stp' stop codons and size 'siz'
// TEST_TRANSLATION__WANTED same check, wrapped into TEST_EXPECTATION__WANTED instead of TEST_EXPECTATION
// e2a                      converts an EMBL table number into the ARB table number
#define TEST_TRANSLATION(nr,dna,aa,stp,siz)         TEST_EXPECTATION(translates_into(nr,dna,aa,stp,siz))
#define TEST_TRANSLATION__WANTED(nr,dna,aa,stp,siz) TEST_EXPECTATION__WANTED(translates_into(nr,dna,aa,stp,siz))
#define e2a(ec)                                     TTIT_embl2arb(ec)

void TEST_translate() {
    // Each check specifies: ARB code number (converted from the EMBL table number),
    // DNA to translate, expected protein, expected number of stop codons, expected protein size.

    TEST_TRANSLATION(e2a(1), "TTYTCN", "FS*", 1, 3); // stop-codon appended (dna does not end with stop)

    // test optional start-codons:
    TEST_TRANSLATION(e2a(2), "ATCATCTTTTAR", "MIF*", 1, 4); // only std nucs
    TEST_TRANSLATION(e2a(2), "ATYATYTTTTAR", "MIF*", 1, 4); // containing IUPAC-nucs
    TEST_TRANSLATION(e2a(2), "ATAATATARTAR", "MM**", 2, 4); // ATA->M (always, i.e. non-optional)

    // test optional stop-codons:
    // (the same codon is translated as amino acid inside the sequence, but as stop at the end)
    TEST_TRANSLATION(e2a(27), "TGATGA", "W*", 1, 2); // only std nucs
    TEST_TRANSLATION(e2a(28), "TGATGA", "W*", 1, 2);
    TEST_TRANSLATION(e2a(28), "TAGTAG", "Q*", 1, 2);
    TEST_TRANSLATION(e2a(28), "TAATAA", "Q*", 1, 2);
    TEST_TRANSLATION(e2a(28), "TARTAR", "Q*", 1, 2); // containing IUPAC-nucs

    TEST_TRANSLATION(e2a(31), "TAGTAG", "E*", 1, 2); // only std nucs
    TEST_TRANSLATION(e2a(31), "TAATAA", "E*", 1, 2);
    TEST_TRANSLATION(e2a(31), "TARTAR", "E*", 1, 2); // containing IUPAC-nucs
}

#endif // UNIT_TESTS

// --------------------------------------------------------------------------------
