// =============================================================== //
//                                                                 //
//   File      : AP_codon_table.cxx                                //
//   Purpose   : codon definitions for DNA -> AA translation       //
//                                                                 //
//   Coded by Ralf Westram (coder@reallysoft.de) in January 2010   //
//   Institute of Microbiology (Technical University Munich)       //
//   http://www.arb-home.de/                                       //
//                                                                 //
// =============================================================== //

#include "AP_codon_table.hxx"
#include "AP_pro_a_nucs.hxx"
#include "iupac.h"

#include <arb_global_defs.h>
#include <arb_str.h>

#include <cctype>

#define pn_assert(cond) arb_assert(cond)

#define EMBL_BACTERIAL_TABLE_INDEX      11
#define AWT_CODON_TABLE_MAX_NAME_LENGTH 57 // increasing this limit forces GUI re-layout (look4: AWT_get_codon_code_name)

#define VALID_PROTEIN      "ABCDEFGHIJKLMNPQRSTVWXYZ*" // all possible translations
#define VALID_PROTEIN_NO_X "ABCDEFGHIJKLMNPQRSTVWYZ*"  // same as VALID_PROTEIN w/o 'X'

// ----------------------------------------------------------------------------------------------------
//
// Info about translation codes was taken from
//     http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
// and
//     https://en.wikipedia.org/wiki/List_of_genetic_codes
//
// Whenever adding new or correcting existing code tables, please
// - check data on NCBI webpage mentioned above
// - document last update in ../../HELP_SOURCE/source/transl_table.hlp@LAST_UPDATE_FROM_WEBPAGE
//
// ----------------------------------------------------------------------------------------------------

static AWT_Codon_Code_Definition AWT_codon_def[AWT_CODON_TABLES+1] =
    {
        //   0000000000111111111122222222223333333333444444444455555555556666    codon number (0-63)
        //   0123456789012345678901234567890123456789012345678901234567890123
        //
        //  "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",  base1
        //  "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",  base2
        //  "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"   base3
        {
            " (1) Standard code",
            "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", // The first code in this table has to be 'Standard code'!
            "---M------**--*----M---------------M----------------------------",
            1 // arb:0
        },
        {
            " (2) Vertebrate mitochondrial code",
            "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
            "----------**--------------------MMMM----------**---M------------",
            2 // arb:1
        },
        {
            " (3) Yeast mitochondrial code",
            "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "----------**----------------------MM----------------------------",
            3 // arb:2
        },
        //  " (X) 6789012345678901234567890123456789012345678901234567", // max.name length (57)
        {
            " (4) Coelenterate Mitochondrial + Mycoplasma/Spiroplasma",
            "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "--MM------**-------M------------MMMM---------------M------------",
            4 // arb:3
        },
        {
            " (5) Invertebrate mitochondrial code",
            "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
            "---M------**--------------------MMMM---------------M------------",
            5 // arb:4
        },
        {
            " (6) Ciliate, Dasycladacean and Hexamita nuclear code",
            "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "--------------*--------------------M----------------------------",
            6 // arb:5
        },
        {
            " (9) Echinoderm and Flatworm mitochondrial code",
            "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
            "----------**-----------------------M---------------M------------",
            9 // arb:6
        },
        {
            "(10) Euplotid nuclear code",
            "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "----------**-----------------------M----------------------------",
            10 // arb:7
        },
        //   0000000001111111111222222222233333333334444444444555555555566666
        //   1234567890123456789012345678901234567890123456789012345678901234

        //  "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",  base1
        //  "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",  base2
        //  "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"   base3
        {
            "(11) Bacterial and Plant Plastid code",
            "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "---M------**--*----M------------MMMM---------------M------------",
            11 // arb:8
        },
        {
            "(12) Alternative Yeast nuclear code",
            "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "----------**--*----M---------------M----------------------------",
            12 // arb:9
        },
        {
            "(13) Ascidian mitochondrial code",
            "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
            "---M------**----------------------MM---------------M------------",
            13 // arb:10
        },
        {
            "(14) Alternative Flatworm mitochondrial code",
            "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
            "-----------*-----------------------M----------------------------",
            14 // arb:11
        },
        {
            "(15) Blepharisma nuclear code (deleted?)", // why is it no longer listed at NCBI?
            "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "----------*---*--------------------M----------------------------", // converted to new format manually (no source)
            15 // arb:12
        },
        {
            "(16) Chlorophycean mitochondrial code",
            "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "----------*---*--------------------M----------------------------",
            16 // arb:13
        },
        {
            "(21) Trematode mitochondrial code",
            "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
            "----------**-----------------------M---------------M------------",
            21 // arb:14
        },
        {
            "(22) Scenedesmus obliquus mitochondrial code",
            "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "------*---*---*--------------------M----------------------------",
            22 // arb:15
        },
        {
            "(23) Thraustochytrium mitochondrial code",
            "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "--*-------**--*-----------------M--M---------------M------------",
            23 // arb:16
        },
        {
            "(24) Pterobranchia Mitochondrial Code",
            "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG",
            "---M------**-------M---------------M---------------M------------",
            24 // arb:17
        },
        {
            "(25) Candidate Division SR1 and Gracilibacteria Code",
            "FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "---M------**-----------------------M---------------M------------",
            25 // arb:18
        },
        {
            "(26) Pachysolen tannophilus Nuclear Code",
            "FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "----------**--*----M---------------M----------------------------",
            26 // arb:19
        },
        {
            "(27) Karyorelict Nuclear",
            "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "--------------*--------------------M----------------------------",
            27 // arb:20
        },
        {
            "(28) Condylostoma Nuclear",
            "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "----------**--*--------------------M----------------------------",
            28 // arb:21
        },
        {
            "(29) Mesodinium Nuclear",
            "FFLLSSSSYYYYCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "--------------*--------------------M----------------------------",
            29 // arb:22
        },
        {
            "(30) Peritrich Nuclear",
            "FFLLSSSSYYEECC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "--------------*--------------------M----------------------------",
            30 // arb:23
        },
        {
            "(31) Blastocrithidia Nuclear",
            "FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
            "----------**-----------------------M----------------------------",
            31 // arb:24
        },

        { NULp, NULp, NULp, 0 } // end of table-marker
    };

// When adding new genetic code:
// - increase AP_codon_table.hxx@AWT_CODON_TABLES
// - increase .@MAX_EMBL_TRANSL_TABLE_VALUE
// - add arb-codenr to .@ALL_TABLES

#define MAX_EMBL_TRANSL_TABLE_VALUE 31 // maximum known EMBL transl_table value

// --------------------------------------------------------------------------------

int TTIT_embl2arb(int embl_code_nr) {
    // returns -1 if embl_code_nr is not known by ARB

    static bool initialized = false;
    static int  arb_code_nr_table[MAX_EMBL_TRANSL_TABLE_VALUE+1];                 // key: embl_code_nr, value: arb_code_nr or -1

    if (!initialized) {
        for (int embl = 0; embl <= MAX_EMBL_TRANSL_TABLE_VALUE; ++embl) {
            arb_code_nr_table[embl] = -1; // illegal table
        }
        for (int arb_code_nr = 0; arb_code_nr < AWT_CODON_TABLES; ++arb_code_nr) {
            int embl = AWT_codon_def[arb_code_nr].embl_feature_transl_table;

            pn_assert(embl<=MAX_EMBL_TRANSL_TABLE_VALUE); // defined embl code is above limit
            pn_assert(arb_code_nr_table[embl] == -1); // duplicate definition of EMBL table number

            arb_code_nr_table[embl] = arb_code_nr;
        }
        // should be index of 'Bacterial and Plant Plastid code'
        // (otherwise maybe AWAR_PROTEIN_TYPE_bacterial_code_index  is wrong)
        pn_assert(arb_code_nr_table[EMBL_BACTERIAL_TABLE_INDEX] == AWAR_PROTEIN_TYPE_bacterial_code_index);
        pn_assert(arb_code_nr_table[1] == 0); // Standard code has to be on index zero!
        pn_assert(arb_code_nr_table[MAX_EMBL_TRANSL_TABLE_VALUE] != -1); // arb_code_nr_table is defined too big

        initialized = true;
    }

    if (embl_code_nr<0 || embl_code_nr>MAX_EMBL_TRANSL_TABLE_VALUE) return -1;

    int arb_code_nr = arb_code_nr_table[embl_code_nr];
#ifdef DEBUG
    if (arb_code_nr != -1) {
        pn_assert(arb_code_nr >= 0 && arb_code_nr < AWT_CODON_TABLES);
        pn_assert(TTIT_arb2embl(arb_code_nr) == embl_code_nr);
    }
#endif
    return arb_code_nr;
}

int TTIT_arb2embl(int arb_code_nr) {
    pn_assert(arb_code_nr >= 0 && arb_code_nr<AWT_CODON_TABLES);
    return AWT_codon_def[arb_code_nr].embl_feature_transl_table;
}


static bool codon_tables_initialized = false;
static char definite_translation[AWT_MAX_CODONS]; // contains 0 if ambiguous, otherwise it contains the definite translation
static char *ambiguous_codons[AWT_MAX_CODONS]; // for each ambiguous codon: contains all translations (each only once)

static void addToAmbiguous(int codon_nr, char possible_translation) {
    static uint8_t length[AWT_MAX_CODONS];

    char*&   ambEntry = ambiguous_codons[codon_nr];
    uint8_t& ambLen   = length[codon_nr];

    if (!ambEntry) { // first insert
        ambEntry    = ARB_calloc<char>(AWT_MAX_CODONS+1);
        ambEntry[0] = possible_translation;
        ambLen      = 1;
    }
    else if (!strchr(ambEntry, possible_translation)) {
        ambEntry[ambLen++] = possible_translation;
    }
}

void AP_initialize_codon_tables() {
    if (codon_tables_initialized) return;

    int codon_nr;
    int code_nr;

    for (codon_nr=0; codon_nr<AWT_MAX_CODONS; codon_nr++) {
        ambiguous_codons[codon_nr] = NULp;
    }

    pn_assert(AWT_CODON_TABLES>=1);
    pn_assert(!AWT_codon_def[AWT_CODON_TABLES].aa); // Error in AWT_codon_def or AWT_CODON_CODES

    for (code_nr=0; code_nr<AWT_CODON_TABLES; code_nr++) {
        const char *translation = AWT_codon_def[code_nr].aa;
        const char *startStop   = AWT_codon_def[code_nr].startStop;

        pn_assert(strlen(AWT_codon_def[code_nr].name) <= AWT_CODON_TABLE_MAX_NAME_LENGTH); // GUI layout depends on max. name length

        for (codon_nr=0; codon_nr<AWT_MAX_CODONS; codon_nr++) {
            bool isOptionalStartStop = false;

            // check definition of 'translation' and 'startStop' is consistent:
            switch (startStop[codon_nr]) {
                case 'M': // defined as start-codon
                    pn_assert(translation[codon_nr] != '*'); // invalid def: stop AND start
                    isOptionalStartStop = translation[codon_nr] != 'M';
                    break;

                case '*': // defined as stop-codon (new def style)
                    pn_assert(translation[codon_nr] != 'M'); // invalid def: start AND stop
                    isOptionalStartStop = translation[codon_nr] != '*';
                    break;

                case '-': // neither start nor stop (new def style) not start (old def style)
                    pn_assert(translation[codon_nr] != '*'); // invalid def: stop codons have to be marked in 'Starts' definition
                    break;

                default:
                    pn_assert(0); // invalid character in startStop
                    break;
            }

            // detect definite/ambiguous translations:
            if (code_nr == 0) { // first table (no ambiguity possible yet)
                if (isOptionalStartStop) {
                    addToAmbiguous(codon_nr, translation[codon_nr]);
                    addToAmbiguous(codon_nr, startStop[codon_nr]);
                    definite_translation[codon_nr] = 0;
                }
                else {
                    definite_translation[codon_nr] = translation[codon_nr];
                }
            }
            else if (definite_translation[codon_nr]) { // is definite till now
                if (definite_translation[codon_nr] != translation[codon_nr] || isOptionalStartStop) { // we found a different translation
                    addToAmbiguous(codon_nr, definite_translation[codon_nr]);
                    addToAmbiguous(codon_nr, translation[codon_nr]);
                    if (isOptionalStartStop) addToAmbiguous(codon_nr, startStop[codon_nr]);
                    definite_translation[codon_nr] = 0;
                }
            }
            else { // is ambiguous
                addToAmbiguous(codon_nr, translation[codon_nr]);
                if (isOptionalStartStop) addToAmbiguous(codon_nr, startStop[codon_nr]);
            }
        }
    }

    codon_tables_initialized = true;
}

// return 0..3 (ok) or 4 (failure)
inline int dna2idx(char c) {
    switch (c) {
        case 'T': case 't':
        case 'U': case 'u': return 0;
        case 'C': case 'c': return 1;
        case 'A': case 'a': return 2;
        case 'G': case 'g': return 3;
    }
    return 4;
}

inline char idx2dna(int idx) {
    pn_assert(idx>=0 && idx<4);
    return "TCAG"[idx];
}

inline int calc_codon_nr(const char *dna) {
    int i1 = dna2idx(dna[0]); if (i1 == 4) return AWT_MAX_CODONS; // is not a codon
    int i2 = dna2idx(dna[1]); if (i2 == 4) return AWT_MAX_CODONS;
    int i3 = dna2idx(dna[2]); if (i3 == 4) return AWT_MAX_CODONS;

    int codon_nr = i1*16 + i2*4 + i3;
    pn_assert(codon_nr>=0 && codon_nr<=AWT_MAX_CODONS);
    return codon_nr;
}

inline void build_codon(int codon_nr, char *to_buffer) {
    pn_assert(codon_nr>=0 && codon_nr<AWT_MAX_CODONS);

    to_buffer[0] = idx2dna((codon_nr>>4)&3);
    to_buffer[1] = idx2dna((codon_nr>>2)&3);
    to_buffer[2] = idx2dna(codon_nr&3);
}

const char* AWT_get_codon_code_name(int code) {
    pn_assert(code>=0 && code<AWT_CODON_TABLES);
    return AWT_codon_def[code].name;
}

static const char *aa_3letter_name[26+1] = {
    "Ala", // A
    "Asx", // B (= D or N)
    "Cys", // C
    "Asp", // D
    "Glu", // E
    "Phe", // F
    "Gly", // G
    "His", // H
    "Ile", // I
    "Xle", // J (= I or L)
    "Lys", // K
    "Leu", // L
    "Met", // M
    "Asn", // N
    NULp,  // O
    "Pro", // P
    "Gln", // Q
    "Arg", // R
    "Ser", // S
    "Thr", // T
    NULp,  // U
    "Val", // V
    "Trp", // W
    "Xaa", // X
    "Tyr", // Y
    "Glx", // Z (= E or Q)
    NULp
};

const char *getAminoAcidAbbr(char aa) {
    if (aa=='*') return "End";
    aa = toupper(aa);
    if (aa>='A' && aa<='Z') return aa_3letter_name[aa-'A'];
    return NULp;
}

#ifdef DEBUG

inline char nextBase(char c) {
    switch (c) {
        case 'T': return 'C';
        case 'C': return 'A';
        case 'A': return 'G';
#if 0
        case 'G': return 0;
#else
        case 'G': return 'M';
        case 'M': return 'R';
        case 'R': return 'W';
        case 'W': return 'S';
        case 'S': return 'Y';
        case 'Y': return 'K';
        case 'K': return 'V';
        case 'V': return 'H';
        case 'H': return 'D';
        case 'D': return 'B';
        case 'B': return 'N';
        case 'N': return 0;
#endif
        default: pn_assert(0);
    }
    return 0;
}

void AWT_dump_codons(TranslationTableIndexType type, bool skipX) {
    // use for debugging

    const TransTables all_allowed;

    for (char c='*'; c<='Z'; c++) {
        printf("Codons for '%c': ", c);

        if (skipX && c == 'X') {
            fputs("skipped", stdout);
        }
        else {
            bool first_line = true;
            bool found      = false;
            for (char b1='T'; b1; b1=nextBase(b1)) {
                for (char b2='T'; b2; b2=nextBase(b2)) {
                    for (char b3='T'; b3; b3=nextBase(b3)) {
                        char dna[4];
                        dna[0]=b1;
                        dna[1]=b2;
                        dna[2]=b3;
                        dna[3]=0;

                        TransTables remaining;
                        if (AWT_is_codon(c, dna, all_allowed, remaining)) {
                            if (!first_line) fputs("\n                ", stdout);
                            first_line = false;
                            printf("%s (%s)", dna, remaining.to_string(type));
                            found = true;
                        }
                    }
                }
            }
            if (!found) fputs("none", stdout);
        }
        fputs("\n", stdout);
        if (c=='*') c='A'-1;
    }
}
#endif

inline char isStartOrStopCodonNr(int codon_nr, int code_nr) {
    char isStartStop = 0;
    pn_assert(code_nr >= 0 && code_nr<AWT_CODON_TABLES);

    pn_assert(codon_nr != AWT_MAX_CODONS);               // should not be called with IUPAC codons
    pn_assert(codon_nr >= 0 && codon_nr<AWT_MAX_CODONS); // (use isStartOrStopCodon, isStartCodon or isStopCodon)

    if (codon_nr != AWT_MAX_CODONS) { // 'codon' is a clean codon (it contains no iupac-codes)
        isStartStop = AWT_codon_def[code_nr].startStop[codon_nr];
        if (isStartStop == '-') {
            isStartStop = 0;
        }
    }

    arb_assert(implicated(isStartStop, isStartStop == '*' || isStartStop == 'M'));
    return isStartStop;
}

char AWT_translator::isStartOrStopCodon(const char *codon) const {
    /*! test whether 'codon' is a start- or stop-codon.
     * @param codon   three bases definining the codon
     * @return '*' for stop-codons, 'M' for start-codons, 0 otherwise
     */

    char result   = 0;
    int  codon_nr = calc_codon_nr(codon);
    if (codon_nr == AWT_MAX_CODONS) { // codon contains iupac codes (rare case -> brute force implementation ok)
        TransTables allowed;
        allowed.forbidAllBut(CodeNr());
        TransTables remaining = allowed;

        bool is_start = AWT_is_codon('M', codon, allowed, remaining, NULp);
        bool is_stop  = is_start ? false : AWT_is_codon('*', codon, allowed, remaining, NULp);

        pn_assert(!(is_start && is_stop));
        result = is_start ? 'M' : (is_stop ? '*' : 0);
    }
    else { // codon is a clean codon
        result = isStartOrStopCodonNr(calc_codon_nr(codon), code_nr);
    }
    return result;
}

inline bool protMatches(char p1, char p2) {
    /*! return true if p1 matches p2
     * @param p1 "normal" protein (neither B, Z nor J)
     * @param p2 any protein (B, Z and J ok)
     * B is a shortcut for Asp(=D) or Asn(=N)
     * J is a shortcut for Ile(=I) or Leu(=L)
     * Z is a shortcut for Glu(=E) or Gln(=Q)
     */
    pn_assert(p1 != 'B' && p1 != 'Z' && p1 != 'J');
    pn_assert(p1 == toupper(p1));
    pn_assert(p2 == toupper(p2));

    if (p1 == p2) return true;
    if (p2 == 'B') return p1 == 'D' || p1 == 'N';
    if (p2 == 'J') return p1 == 'I' || p1 == 'L';
    if (p2 == 'Z') return p1 == 'E' || p1 == 'Q';
    return false;
}
inline bool containsProtMatching(const char *pstr, char p) {
    /*! return true, if 'pstr' contains any protein that matches 'p'.
     * uses same logic as protMatches()
     */
    pn_assert(p == toupper(p));
    if (p == 'B') return strchr(pstr, 'D') || strchr(pstr, 'N');
    if (p == 'J') return strchr(pstr, 'I') || strchr(pstr, 'L');
    if (p == 'Z') return strchr(pstr, 'E') || strchr(pstr, 'Q');
    return strchr(pstr, p);
}
inline bool isGap(char c) { return GAP::is_std_gap(c); }

inline GB_ERROR neverTranslatesError(const char *dna, char protein) {
    if (!strchr(VALID_PROTEIN, protein)) {
        return GBS_global_string("'%c' is no valid amino acid", protein);
    }
    return GBS_global_string("'%c%c%c' never translates to '%c'", dna[0], dna[1], dna[2], protein);
}

bool AWT_is_codon(char protein, const char *const dna, const TransTables& allowed, TransTables& remaining, const char **fail_reason_ptr) {
    /*! test if 'dna' codes 'protein'
     * @param protein amino acid
     * @param dna three nucleotides (gaps allowed, e.g. 'A-C' can be tested vs 'X')
     * @param allowed allowed translation tables
     * @param remaining returns the remaining allowed translation tables (only if this functions returns true)
     * @param fail_reason_ptr if not NULp => store reason for failure here (or set it to NULp on success)
     * @return true if dna translates to protein
     */

    pn_assert(allowed.any());
    pn_assert(codon_tables_initialized);

    const char *fail_reason               = NULp;
    if (fail_reason_ptr) *fail_reason_ptr = NULp;

    bool is_codon        = false;
    int  codon_nr        = calc_codon_nr(dna);
    int  first_iupac_pos = -1;
    int  iupac_positions = 0;
    bool decided         = false;
    bool general_failure = false;

    protein = toupper(protein);

    if (codon_nr==AWT_MAX_CODONS) { // dna is not a clean codon (i.e. it contains iupac-codes or gaps)
        bool too_short = false;
        int  nucs_seen = 0;
        for (int iupac_pos=0; iupac_pos<3 && !too_short && !fail_reason; iupac_pos++) {
            char N = dna[iupac_pos];

            if (!N) too_short = true;
            else if (!isGap(N)) {
                nucs_seen++;
                if (!strchr("ACGTU", N)) {
                    if (first_iupac_pos==-1) first_iupac_pos = iupac_pos;
                    iupac_positions++;
                    const char *decoded_iupac = iupac::decode(N, GB_AT_DNA, 0);
                    if (!decoded_iupac[0]) { // no valid IUPAC
                        fail_reason = GBS_global_string("Invalid character '%c' in DNA", N);
                    }
                }
            }
        }

        if (!fail_reason && !nucs_seen) { // got no dna
            fail_reason = "No nucleotides left";
        }
        else if (nucs_seen<3) {
            too_short = true;
        }

        if (fail_reason) {
            decided = true; // fails for all proteins
        }
        else if (too_short) {
            decided  = true;
            if (protein == 'X') {
                is_codon = true;
            }
            else {
                char dna_copy[4];
                strncpy(dna_copy, dna, 3);
                dna_copy[3] = 0;

                fail_reason = GBS_global_string("Not enough nucleotides (got '%s')", dna_copy);
            }
        }
    }

    if (!decided) {
        if (protein == 'X') {
            TransTables  allowed_copy = allowed;
            const char  *valid_prot   = VALID_PROTEIN_NO_X;

            for (int i = 0; valid_prot[i]; ++i) {
                if (AWT_is_codon(valid_prot[i], dna, allowed_copy, remaining)) {
                    allowed_copy.forbid(remaining);
                    if (allowed_copy.none()) break;
                }
            }

            if (allowed_copy.any()) {
                is_codon  = true;
                remaining = allowed_copy;
            }
            else {
                fail_reason = neverTranslatesError(dna, protein);
            }
        }
        else if (codon_nr==AWT_MAX_CODONS) { // dna is a codon with one or more IUPAC codes
            pn_assert(iupac_positions);
            const char *decoded_iupac = iupac::decode(dna[first_iupac_pos], GB_AT_DNA, 0);
            pn_assert(decoded_iupac[0]); // already should have been catched above

            char dna_copy[4];
            memcpy(dna_copy, dna, 3);
            dna_copy[3] = 0;

            bool all_are_codons = true;
            bool one_is_codon   = false;

            TransTables allowed_copy = allowed;

            for (int i=0; decoded_iupac[i]; i++) {
                dna_copy[first_iupac_pos] = decoded_iupac[i];
                const char *subfail;
                if (!AWT_is_codon(protein, dna_copy, allowed_copy, remaining, &subfail)) {
                    all_are_codons = false;
                    if (!one_is_codon && ARB_strBeginsWith(subfail, "Not all ")) one_is_codon = true;
                    if (one_is_codon) break;
                }
                else {
                    one_is_codon = true;
                    allowed_copy = remaining;
                }
            }

            if (all_are_codons) {
                pn_assert(allowed_copy.any());
                remaining = allowed_copy;
                is_codon  = true;
            }
            else {
                remaining.forbidAll();
                dna_copy[first_iupac_pos] = dna[first_iupac_pos];
                if (one_is_codon) {
                    fail_reason = GBS_global_string("Not all IUPAC-combinations of '%s' translate to '%c'", dna_copy, protein); // careful when changing this message (see above)
                }
                else {
                    fail_reason = neverTranslatesError(dna_copy, protein);
                }
            }
        }
        else if (definite_translation[codon_nr]) { // codon has a definite translation (i.e. translates equal for all code-tables)
            char defTransl = definite_translation[codon_nr];

#if defined(ASSERTION_USED)
            bool optionalCodonExists = false;
            for (int code_nr=0; code_nr<AWT_CODON_TABLES && !optionalCodonExists; code_nr++) {
                char startStop = isStartOrStopCodonNr(codon_nr, code_nr);
                if (startStop && startStop != defTransl) { // got optional start/stop codon
                    if (allowed.is_allowed(code_nr)) {
                        pn_assert(startStop == '*' || startStop == 'M');
                        optionalCodonExists = true;
                    }
                }
            }
            pn_assert(!optionalCodonExists); // when this fails -> definite_translation[] is wrong
#endif

            int ok = protMatches(defTransl, protein);
            if (ok) {
                remaining = allowed;
                is_codon  = true;
            }
            else {
                remaining.forbidAll();
                fail_reason     = GBS_global_string("'%c%c%c' translates to '%c', not to '%c'", dna[0], dna[1], dna[2], defTransl, protein);
                general_failure = true;
            }
        }
        else if (!containsProtMatching(ambiguous_codons[codon_nr], protein)) { // codon does not translate to protein in any code-table
            remaining.forbidAll();
            fail_reason     = neverTranslatesError(dna, protein);
            general_failure = true;
        }
        else {
#if defined(ASSERTION_USED)
            bool correct_disallowed_translation = false;
#endif

            // Now codon translates to protein in at least 1 code-table!
            // Check whether protein translates in any of the allowed code-tables and forbid rest
            for (int code_nr=0; code_nr<AWT_CODON_TABLES; code_nr++) {
                bool mayTranslate = protMatches(AWT_codon_def[code_nr].aa[codon_nr], protein);
                if (!mayTranslate && (protein == '*' || protein == 'M')) {
                    char startOrStop = isStartOrStopCodonNr(codon_nr, code_nr);
                    mayTranslate     = startOrStop && protMatches(startOrStop, protein);
                }

                if (mayTranslate) { // may codon_nr translate to protein for code_nr
                    if (allowed.is_allowed(code_nr)) { // is this code allowed?
                        remaining.allow(code_nr);
                        is_codon = true;
                    }
                    else {
                        remaining.forbid(code_nr); // otherwise forbid code in future
#if defined(ASSERTION_USED)
                        correct_disallowed_translation = true;
#endif
                    }
                }
                else {
                    remaining.forbid(code_nr); // otherwise forbid code in future
                }
            }

            if (!is_codon) {
                pn_assert(correct_disallowed_translation); // should be true because otherwise we shouldn't run into this else-branch
                fail_reason = GBS_global_string("'%c%c%c' does not translate to '%c'", dna[0], dna[1], dna[2], protein);
            }
        }
    }

    if (!is_codon) {
        pn_assert(fail_reason);
        if (fail_reason_ptr) {
            if (!allowed.all() && !general_failure) {
                int one = allowed.explicit_table();
                if (one == -1) {
                    const char *left_tables = allowed.to_string(TTIT_EMBL);
                    pn_assert(left_tables[0]); // allowed should never be empty!

                    fail_reason = GBS_global_string("%s (for any of the leftover trans-tables: %s)", fail_reason, left_tables);
                }
                else {
                    int one_embl = TTIT_arb2embl(one);
                    fail_reason  = GBS_global_string("%s (for trans-table %i)", fail_reason, one_embl);
                }
            }

            *fail_reason_ptr = fail_reason; // set failure-reason if requested
        }
    }
#if defined(ASSERTION_USED)
    else {
        pn_assert(remaining.is_subset_of(allowed));
    }
#endif
    return is_codon;
}

// -------------------------------------------------------------------------------- Codon_Group

#if defined(DEBUG)
// #define DUMP_CODON_GROUP_EXPANSION
#endif

class Codon_Group {
    char codon[64]; // index is calculated with calc_codon_nr

public:
    Codon_Group(char protein, int code_nr);
    ~Codon_Group() {}

    Codon_Group& operator += (const Codon_Group& other);
    int expand(char *to_buffer) const;
};

Codon_Group::Codon_Group(char protein, int code_nr) {
    protein = toupper(protein);
    pn_assert(protein=='*' || isalpha(protein));
    pn_assert(code_nr>=0 && code_nr<AWT_CODON_TABLES);

    const char *amino_table = AWT_codon_def[code_nr].aa;
    for (int i=0; i<AWT_MAX_CODONS; i++) {
        codon[i] = amino_table[i]==protein;
    }
}

Codon_Group& Codon_Group::operator+=(const Codon_Group& other) {
    for (int i=0; i<AWT_MAX_CODONS; i++) {
        codon[i] = codon[i] || other.codon[i];
    }
    return *this;
}

inline int legal_dna_no(int i) { return i>=0 && i<4; }

inline const char *buildMixedCodon(const char *const con1, const char *const con2) {
    int mismatches = 0;
    int mismatch_index = -1;
    static char buf[4];

    for (int i=0; i<3; i++) {
        if (con1[i]!=con2[i]) {
            mismatches++;
            mismatch_index = i;
        }
        else {
            buf[i] = con1[i];
        }
    }

    if (mismatches==1) { // exactly one position differs between codons
        pn_assert(mismatch_index!=-1);
        buf[mismatch_index] = iupac::combine(con1[mismatch_index], con2[mismatch_index], GB_AT_DNA);
        buf[3]              = 0;

        if (memcmp(con1, buf, 3) == 0 ||
            memcmp(con2, buf, 3) == 0)
        {
            return NULp;
        }

#if defined(DUMP_CODON_GROUP_EXPANSION)
        printf(" buildMixedCodon('%c%c%c','%c%c%c') == '%s'\n",
               con1[0], con1[1], con1[2],
               con2[0], con2[1], con2[2],
               buf);
#endif

        return buf;
    }
    return NULp;
}

static int expandMore(const char *bufferStart, int no_of_condons, char*&to_buffer) {
    int i, j;
    const char *con1, *con2;
    int added = 0;

    for (i=0; i<no_of_condons; i++) {
        con1 = bufferStart+3*i;

        for (j=i+1; j<no_of_condons; j++) {
            con2 = bufferStart+3*j;
            const char *result = buildMixedCodon(con1, con2);
            if (result) {
                to_buffer[0] = 0;
                // do we already have this codon?
                const char *found;
                const char *startSearch = bufferStart;
                for (;;) {
                    found = strstr(startSearch, result);
                    if (!found) break;
                    int pos = (found-bufferStart);
                    if ((pos%3)==0) break; // yes already here!
                    startSearch = found+1; // was misaligned -> try behind
                }

                if (!found) {
                    memmove(to_buffer, result, 3); to_buffer+=3;
                    added++;
                }
            }
        }
    }
    return no_of_condons+added;
}

int Codon_Group::expand(char *to_buffer) const {
    int count = 0;
    int i;
    char *org_to_buffer = to_buffer;

    for (i=0; i<AWT_MAX_CODONS; i++) {
        if (codon[i]) {
            build_codon(i, to_buffer);
            to_buffer += 3;
            count++;
        }
    }

#if defined(DUMP_CODON_GROUP_EXPANSION)
    to_buffer[0] = 0;
    printf("codons = '%s'\n", org_to_buffer);
#endif

    for (;;) {
        int new_count = expandMore(org_to_buffer, count, to_buffer);
        if (new_count==count) break; // nothing expanded -> done
        count = new_count;
#if defined(DUMP_CODON_GROUP_EXPANSION)
        to_buffer[0] = 0;
        printf("codons (expandedMore) = '%s'\n", org_to_buffer);
#endif
    }

    pn_assert(count==(int(to_buffer-org_to_buffer)/3));

    return count;
}

// --------------------------------------------------------------------------------

static Codon_Group *get_Codon_Group(char protein, int code_nr) {
    pn_assert(code_nr>=0 && code_nr<AWT_CODON_TABLES);
    protein = toupper(protein);
    pn_assert(isalpha(protein) || protein=='*');
    pn_assert(codon_tables_initialized);

    Codon_Group *cgroup = NULp;

    if (protein=='B') {
        cgroup = new Codon_Group('D', code_nr);
        Codon_Group N('N', code_nr);
        *cgroup += N;
    }
    else if (protein=='Z') {
        cgroup = new Codon_Group('E', code_nr);
        Codon_Group Q('Q', code_nr);
        *cgroup += Q;
    }
    else {
        cgroup = new Codon_Group(protein, code_nr);
    }

    pn_assert(cgroup);

    return cgroup;
}

#define MAX_CODON_LIST_LENGTH (70*3)

const char *AP_get_codons(char protein, int code_nr) {
    // get a list of all codons ("xyzxyzxyz...") encoding 'protein' in case we use Codon-Code 'code_nr'
    // (includes all completely contained IUPAC-encoded codons at the end of list)
    //
    // Optional start-/stop-codons are not added
    // (i.e. a query for 'M' or '*' may report "incomplete" results)

    Codon_Group *cgroup = get_Codon_Group(protein, code_nr);

    static char buffer[MAX_CODON_LIST_LENGTH+1];
    int offset = 3*cgroup->expand(buffer);
    pn_assert(offset<MAX_CODON_LIST_LENGTH);
    buffer[offset] = 0;

    delete cgroup;

    return buffer;
}

// --------------------------------------------------------------------------------

#ifdef UNIT_TESTS
#ifndef TEST_UNIT_H
#include <test_unit.h>
#endif

static const char *startStopSummary() {
    // returns string showing summary for start/stop
    // position = codon_nr
    // content:
    //      '*' -> translates to stop-codon for at least one code
    //      'M' -> translates to start-codon for at least one code
    //      '2' -> both (not necessarily same code)
    //      '-' -> does not translate to start or stop for any code

    static char result[AWT_MAX_CODONS+1];

    for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) {
        char startStop = '-';
        for (int code = 0; code<AWT_CODON_TABLES && (startStop != '2'); ++code) {
            switch (isStartOrStopCodonNr(codon, code)) {
                case '*':
                    switch (startStop) {
                        case '*': break;
                        case '-': startStop = '*'; break;
                        case 'M': startStop = '2'; break;
                        default: pn_assert(0); break;
                    }
                    break;
                case 'M':
                    switch (startStop) {
                        case 'M': break;
                        case '-': startStop = 'M'; break;
                        case '*': startStop = '2'; break;
                        default: pn_assert(0); break;
                    }
                    break;

                case 0: break;
                default: pn_assert(0); break;
            }
        }
        result[codon] = startStop;
    }
    result[AWT_MAX_CODONS] = 0;
    return result;
}
static const char *optionality() {
    // returns string indicating whether start/stop-codon is optional
    // position = codon_nr
    // content:
    //      '-' -> only non-optional start/stop
    //      '!' -> only optional start/stop
    //      '?' -> both
    //      ' ' -> never start or stop

    static char result[AWT_MAX_CODONS+1];

    for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) {
        char optional = ' ';
        for (int code = 0; code<AWT_CODON_TABLES && (optional != '?'); ++code) {
            char startStop = isStartOrStopCodonNr(codon, code);
            if (startStop) {
                bool is_optional = AWT_codon_def[code].aa[codon] != startStop;

                switch (optional) {
                    case ' ': optional = is_optional ? '!' : '-'; break;
                    case '-': optional = is_optional ? '?' : '-'; break;
                    case '!': optional = is_optional ? '!' : '?'; break;
                    default: pn_assert(0); break;
                }
            }
        }

#if defined(ASSERTION_USED)
        bool sometimes_optional = optional == '!' || optional == '?';
        pn_assert(!sometimes_optional || !definite_translation[codon]);
#endif

        result[codon] = optional;
    }
    result[AWT_MAX_CODONS] = 0;

    return result;
}
static const char *definite() {
    static char result[AWT_MAX_CODONS+1];
    for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) {
        result[codon] = definite_translation[codon] ? definite_translation[codon] : ' ';
    }
    result[AWT_MAX_CODONS] = 0;
    return result;
}
static const char *ambig_count() {
    static char result[AWT_MAX_CODONS+1];
    for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) {
        const char *amb = ambiguous_codons[codon];
        result[codon]   = amb ? '0'+strlen(amb) : ' ';
    }
    result[AWT_MAX_CODONS] = 0;
    return result;
}

#define e2a(c) TTIT_embl2arb(c)

void TEST_codon_check() {
    AP_initialize_codon_tables();

    //                                     0000000000111111111122222222223333333333444444444455555555556666    codon number (0-63)
    //                                     0123456789012345678901234567890123456789012345678901234567890123
    //
    //                                    "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG"  base1
    //                                    "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG"  base2
    //                                    "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"  base3
    TEST_EXPECT_EQUAL(startStopSummary(), "--2M--*---**--*----M------------MMMM----------**---M------------");
    TEST_EXPECT_EQUAL(optionality     (), "  ?!  -   ??  ?    !            !!?-          --   !            ");
    TEST_EXPECT_EQUAL(definite        (), "FF  SS SYY  CC W    PPPPHHQQRRRR   MTTTTNN KSS  VVV AAAADDEEGGGG"); // optional start/stop codons shall never be definite
    TEST_EXPECT_EQUAL(ambig_count     (), "  32  2   45  4 2225            222       2   45   2            "); // number of proteins in ambiguous_codons

    TEST_EXPECT_EQUAL(getAminoAcidAbbr('*'), "End");
    TEST_EXPECT_EQUAL(getAminoAcidAbbr('C'), "Cys");
    TEST_EXPECT_EQUAL(getAminoAcidAbbr('B'), "Asx");
    TEST_EXPECT_EQUAL(getAminoAcidAbbr('b'), "Asx");
    TEST_EXPECT_EQUAL(getAminoAcidAbbr('J'), "Xle");
    TEST_EXPECT_EQUAL(getAminoAcidAbbr('O'), NULp);
    TEST_EXPECT_EQUAL(getAminoAcidAbbr('X'), "Xaa");
    TEST_EXPECT_EQUAL(getAminoAcidAbbr('x'), "Xaa");
    TEST_EXPECT_EQUAL(getAminoAcidAbbr('-'), NULp);
    TEST_EXPECT_EQUAL(getAminoAcidAbbr('='), NULp);
    TEST_EXPECT_EQUAL(getAminoAcidAbbr('7'), NULp);

    TEST_EXPECT(protMatches('V', 'V'));
    TEST_EXPECT(protMatches('N', 'B'));
    TEST_EXPECT(protMatches('E', 'Z'));
    TEST_EXPECT(!protMatches('N', 'Z'));
    TEST_EXPECT(!protMatches('V', 'Z'));

    TEST_EXPECT_EQUAL(AP_get_codons('D', 0), "GATGACGAY");
    TEST_EXPECT_EQUAL(AP_get_codons('N', 0), "AATAACAAY");
    TEST_EXPECT_EQUAL(AP_get_codons('B', 0), "AAT" "AAC" "GAT" "GAC" "AAY" "RAT" "RAC" "GAY" "RAY"); // 'B' = 'D' or 'N'

    TEST_EXPECT_EQUAL(AP_get_codons('L', 0),  "TTATTGCTTCTCCTACTG"    "TTRYTAYTGCTYCTWCTKCTMCTSCTRCTHCTBCTDCTVYTRCTN");
    TEST_EXPECT_EQUAL(AP_get_codons('L', 2),  "TTATTG"                "TTR");
    TEST_EXPECT_EQUAL(AP_get_codons('L', 9),  "TTATTGCTTCTCCTAT"      "TRYTACTYCTWCTMCTH");
    TEST_EXPECT_EQUAL(AP_get_codons('L', 13), "TTATTGTAGCTTCTCCTACTG" "TTRYTATWGYTGCTYCTWCTKCTMCTSCTRCTHCTBCTDCTVYTRCTN");
    TEST_EXPECT_EQUAL(AP_get_codons('L', 16), "TTGCTTCTCCTAC"         "TGYTGCTYCTWCTKCTMCTSCTRCTHCTBCTDCTVCTN");

    TEST_EXPECT_EQUAL(AP_get_codons('S', 0),  "TCTTCCTCATCGAGTAGC"       "TCYTCWTCKTCMTCSTCRAGYTCHTCBTCDTCVTCN");
    TEST_EXPECT_EQUAL(AP_get_codons('S', 4),  "TCTTCCTCATCGAGTAGCAGAAGG" "TCYTCWTCKTCMTCSTCRAGYAGWAGKAGMAGSAGRTCHTCBTCDTCVAGHAGBAGDAGVTCNAGN");
    TEST_EXPECT_EQUAL(AP_get_codons('S', 9),  "TCTTCCTCATCGCTGAGTAGC"    "TCYTCWTCKTCMTCSTCRAGYTCHTCBTCDTCVTCN");
    TEST_EXPECT_EQUAL(AP_get_codons('S', 15), "TCTTCCTCGAGTAGC"          "TCYTCKTCSAGYTCB");

    // stop-codons:
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 1)), "TAATAGTGA"     "TARTRA"); // the 3 standard stop codons and their IUPAC covers
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 2)), "TAATAGAGAAGG"  "TARAGR");
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 3)), "TAATAG"        "TAR");    // not TGA
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 4)), "TAATAG"        "TAR");
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 5)), "TAATAG"        "TAR");
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 9)), "TAATAG"        "TAR");
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(10)), "TAATAG"        "TAR");
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(13)), "TAATAG"        "TAR");
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(21)), "TAATAG"        "TAR");
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(15)), "TAATGA"        "TRA");    // not TAG
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(16)), "TAATGA"        "TRA");
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 6)), "TGA");                    // not TAA TAG
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(14)), "TAG");                    // not TAA TGA
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(22)), "TCATAATGA"     "TMATSATRATVA");
    TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(23)), "TTATAATAGTGA"  "TWATKATARTRATDA");

    {
        // Note: optional start/stop-codons are not added in Codon_Group,
        //       because they would introduce ambiguous mapping.

        // test optional stop-codons:
        TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(27)), "");
        TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(28)), "");
        TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(31)), "");

        // test optional start-codons:
        TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 1)), "ATG");                 // 3 (start-codons listed in table-definition)
        TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 2)), "ATAATG"        "ATR"); // 5
        TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 3)), "ATAATG"        "ATR"); // 2
        TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 4)), "ATG");                 // 8
        TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 5)), "ATAATG"        "ATR"); // 6
        TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 6)), "ATG");                 // 1
        TEST_EXPECT_EQUAL(AP_get_codons('M', e2a(11)), "ATG");                 // 7
        TEST_EXPECT_EQUAL(AP_get_codons('M', e2a(13)), "ATAATG"        "ATR"); // 4
        TEST_EXPECT_EQUAL(AP_get_codons('M', e2a(24)), "ATG");                 // 4
    }

    TEST_EXPECT_EQUAL(AP_get_codons('X', 0), ""); // @@@ wrong: TGR->X (or disallow call)

    const TransTables allowed;

    // ---------------------------
    //      test valid codons
    struct test_is_codon {
        char        protein;
        const char *codon;
        const char *tables;
    };

#define ALL_TABLES "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" // contains arb table-numbers

    test_is_codon is_codon[] = {
        { 'P', "CCC", ALL_TABLES },
        { 'P', "CCN", ALL_TABLES },
        { 'R', "CGN", ALL_TABLES },

        { 'D', "GAY", ALL_TABLES },
        { 'N', "AAY", ALL_TABLES },
        { 'B', "AAY", ALL_TABLES }, // translates to 'N', but matches B(=D|N) for realigner
        { 'B', "GAY", ALL_TABLES }, // translates to 'D', but matches B(=D|N) for realigner
        { 'B', "RAY", ALL_TABLES }, // translates to 'D' or to 'N' (i.e. only matches 'B', see failing test for 'RAY' below)
        { 'B', "RAT", ALL_TABLES },

        { 'Q', "CAR", ALL_TABLES },
        { 'E', "GAR", ALL_TABLES },
        { 'Z', "SAR", ALL_TABLES },

        { 'X', "NNN", ALL_TABLES },

        { 'L', "TTR", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15" ",17,18,19,20,21,22,23,24" }, { 'X', "TTR", "16" },
        { 'L', "YTA", "0,1"",3,4,5,6,7,8,9,10,11,12,13,14,15" ",17,18,19,20,21,22,23,24" }, { 'X', "YTA", "2,16" }, // Y=TC
        { 'L', "CTM", "0,1"",3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" }, { 'T', "CTM", "2" },    // M=AC
        { 'L', "CTN", "0,1"",3,4,5,6,7,8"",10,11,12,13,14,15,16,17,18" ",20,21,22,23,24" }, { 'T', "CTN", "2" }, { 'X', "CTN", "9,19" },
        { 'L', "CTK", "0,1"",3,4,5,6,7,8"",10,11,12,13,14,15,16,17,18" ",20,21,22,23,24" }, { 'T', "CTK", "2" }, { 'X', "CTK", "9,19" }, // K=TG

        { 'L', "TWG", "13,15" }, // W=AT
        { 'J', "TWG", "13,15" }, // translates to 'L', but matches J(=I|L) for realigner
        { 'X', "TWG", "0,1,2,3,4,5,6,7,8,9,10,11,12" ",14" ",16,17,18,19,20,21,22,23,24" }, // all but 'L<->TWG'

        { 'S', "AGY", ALL_TABLES },
        { 'S', "TCY", ALL_TABLES },
        { 'S', "TCN", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,17,18,19,20,21,22,23,24" }, // all but 15 (where 'TCA->*')
        { 'S', "AGN", "4,6,11,14" },
        { 'S', "AGR", "4,6,11,14" },

        { '*', "AGR", "1" }, // R=AG
        { 'G', "AGR", "10" },
        { 'X', "AGR", "17" },
        { 'R', "AGR", "0,2,3,5,7,8,9,12,13,15,16,18,19,20,21,22,23,24" },

        { 'G', "AGA", "10" },
        { 'S', "AGA", "4,6,11,14,17" },
        { 'R', "AGA", "0,2,3,5,7,8,9,12,13,15,16,18,19,20,21,22,23,24" },
        { '*', "AGA", "1" },

        { 'K', "AGG", "17" },

        { 'W', "TGR", "1,2,3,4,6,10,11,14,17,20,21,24" },
        { 'X', "TGR", "0,5,7,8,9,12,13,15,16,18,19,22,23" }, // all but 'W<->TGR' (e.g. code==0: TGA->* & TGG->W => TGR->X)

        { 'C', "TGW", "7" }, // W = AT
        { 'X', "TGW", "0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" }, // all but 'C<->TGW'

        { 'C', "TGT", ALL_TABLES },

        { 'C', "TGA", "7" },
        { 'G', "TGA", "18" },
        { 'W', "TGA", "1,2,3,4,6,10,11,14,17,20,21,24" },
        { '*', "TGA", "0,5,8,9,12,13,15,16,19,20,21,22,23" }, // standard stop codons
        { '*', "TAA", "0,1,2,3,4,6,7,8,9,10,12,13,14,15,16,17,18,19,21,24" },
        { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" },

        { '*', "TRA", "0,8,9,12,13,15,16,19,21" }, // R=AG
        { 'X', "TRA", "1,2,3,4,5,6,7,10,11,14,17,18,20,22,23,24" }, // all but '*<->TRA'

        { '*', "TAR", "0,1,2,3,4,6,7,8,9,10,14,16,17,18,19,21,24" },
        { 'Y', "TAR", "22" },
        { 'E', "TAR", "23,24" },
        { 'Q', "TAR", "5,20,21" },
        { 'Z', "TAR", "5,20,21,23,24" }, // Z=EQ (TAR never translates to 'E', only 'Q')
        { 'X', "TAR", "11,12,13,15" },

        { 'B', "AAW", "6,11,14" }, // W=AT
        { 'N', "AAW", "6,11,14" },
        { 'X', "AAW", "0,1,2,3,4,5,7,8,9,10,12,13,15,16,17,18,19,20,21,22,23,24" }, // all but 'B<->AAW' & 'N<->AAW'

        { 'T', "CTG", "2" },
        { 'S', "CTG", "9" },
        { 'A', "CTG", "19" },
        { 'L', "CTG", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // all but 'T<->CTG' & 'S<->CTG'
        { 'J', "CTG", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // same as for 'L'
        { 'M', "CTG", "0,3,8,9,17,19" }, // optional start-codon

        { 'T', "CTR", "2" },
        { 'X', "CTR", "9,19" },
        { 'L', "CTR", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // all but 'T<->CTR' & 'X<->CTR'

        { 'E', "KAR", "23,24" },
        // Q <->KAR fails (see below)
        { 'Z', "KAR", "5,20,21,23,24" }, // Z=E|Q
        { 'X', "KAR", "0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,22" },

        { 'G', "KGA", "18" },
        { 'X', "KGA", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20,21,22,23,24" }, // all but G<->KGA

        { 'E', "TAG", "23,24" },
        { 'Q', "TAG", "5,12,20,21" },
        { 'L', "TAG", "13,15" },
        { 'Y', "TAG", "22" },
        { 'J', "TAG", "13,15" }, // J=I|L
        { 'Z', "TAG", "5,12,20,21,23,24" }, // Z=E|Q
        { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" },

        { 'J', "WTA", "0,3,5,6,7,8,9,11,12,13,15,17,18,19,20,21,22,23,24" },

        { 'X', "A-C", ALL_TABLES },
        { 'X', ".T.", ALL_TABLES },

        // tests to protect buffer overflows in dna
        { 'X', "CG", ALL_TABLES },
        { 'X', "T",  ALL_TABLES },

        //   0000000000111111111122222222223333333333444444444455555555556666    codon number (0-63)
        //   0123456789012345678901234567890123456789012345678901234567890123
        //
        //  "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG"  base1
        //  "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG"  base2
        //  "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"  base3
        //  "--2M--*---**--*----M------------MMMM----------**---M------------"  (= startStopSummary)
        //  "  ?!  -   ??  ?    !            !!?-          --   !            "  (= optionality: !=all start/stop optional; -=no start/stop optional, ?=mixed)

        // test all start codons:
        { 'M', "TTA", "3" }, // start AND stop -> see ../ALILINK/TranslateRealign.cxx@TTA_AMBIGUITY
        { 'M', "TTG", "0,3,4,8,10,17,18" },
        { 'L', "TTG", ALL_TABLES },
        // M <->CTG already tested above
        { 'M', "ATT", "1,3,4,8,16" },
        { 'M', "ATC", "1,3,4,8" },
        { 'M', "ATA", "1,2,3,4,8,10,14" },
        { 'I', "ATA", "0,3,5,6,7,8,9,11,12,13,15,16,17,18,19,20,21,22,23,24" }, // optional for 3, 8
        { 'M', "ATG", ALL_TABLES },   // no optional start
        { 'M', "ATR", "1,2,3,4,8,10,14" }, // R = AG (code=3 -> ATA->IM ATG->M)
        { 'M', "ATM", "1,3,4,8" },    // M = AC
        { 'M', "ATS", "1,3,4,8" },    // S = CG
        { 'M', "ATY", "1,3,4,8" },    // Y = TC
        { 'M', "ATK", "1,3,4,8,16" }, // K = TG
        { 'M', "ATW", "1,3,4,8" },    // W = AT
        { 'M', "ATV", "1,3,4,8" },    // V = ACG
        { 'M', "ATB", "1,3,4,8" },    // B = TCG
        { 'M', "ATD", "1,3,4,8" },    // D = ATG

        { 'M', "ATH", "1,3,4,8" },    // H = ACT
        { 'I', "ATH", "0,3,5,6,7,8,9,11,12,13,15,16,17,18,19,20,21,22,23,24" },
        { 'X', "ATH", "2,10,14" },

        { 'M', "ATN", "1,3,4,8" },    // H = ATCG
        { 'M', "GTG", "1,3,4,6,8,10,14,16,17,18" },

        // test all stop codons:
        { '*', "AGA", "1" }, // (DUPTEST)
        { '*', "AGG", "1" },
        { '*', "TAA", "0,1,2,3,4,6,7,8,9,10,12,13,14,15,16,17,18,19,21,24" },//(DUPTEST)
        { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" }, //     (DUPTEST)
        { '*', "TCA", "15" },
        { '*', "TGA", "0,5,8,9,12,13,15,16,19,20,21,22,23" },        //        (DUPTEST)
        { '*', "TTA", "16" },

        { '*', "TWA", "16" },                                        // W = AT
        { '*', "TMA", "15" },                                        // M = AC
        { '*', "TAR", "0,1,2,3,4,6,7,8,9,10,14,16,17,18,19,21,24" }, // R = AG (DUPTEST)
        { '*', "TRA", "0,8,9,12,13,15,16,19,21" },                   // R = AG (DUPTEST)
        { '*', "AGR", "1" },                                         // R = AG (DUPTEST)

        { 0, NULp, NULp}
    };

    for (int c = 0; is_codon[c].protein; ++c) {
        const test_is_codon& C = is_codon[c];
        TEST_ANNOTATE(GBS_global_string("%c <- %s", C.protein, C.codon));

        TransTables  remaining;
        const char  *failure;
        bool         isCodon = AWT_is_codon(C.protein, C.codon, allowed, remaining, &failure);

        TEST_EXPECT_NULL(failure);
        TEST_EXPECT(isCodon);
        TEST_EXPECT_EQUAL(remaining.to_string(TTIT_ARB), C.tables);
    }

    // -----------------------------
    //      test invalid codons
    struct test_not_codon {
        char        protein;
        const char *codon;
        const char *error;
    };
    test_not_codon not_codon[] = {
        { 'P', "SYK", "Not all IUPAC-combinations of 'SYK' translate to 'P'" }, // correct (possible translations are PAL)
        { 'F', "SYK", "'SYK' never translates to 'F'" },                        // correct failure
        { 'P', "NNN", "Not all IUPAC-combinations of 'NNN' translate to 'P'" }, // correct failure
        { 'D', "RAY", "Not all IUPAC-combinations of 'RAY' translate to 'D'" }, // correct failure
        { 'E', "SAR", "Not all IUPAC-combinations of 'SAR' translate to 'E'" }, // correct failure
        { 'Q', "KAR", "Not all IUPAC-combinations of 'KAR' translate to 'Q'" }, // correct failure

        { 'S', "CYT", "'CYT' never translates to 'S'" }, // correct failure

        { 'O', "RAY", "'O' is no valid amino acid" },
        { 'U', "AAA", "'U' is no valid amino acid" },

        { 'L', "A-C", "Not enough nucleotides (got 'A-C')" }, // correct failure
        { 'V', ".T.", "Not enough nucleotides (got '.T.')" }, // correct failure
        { 'L', "...", "No nucleotides left" },
        { 'J', "...", "No nucleotides left" },

        { 'I', "ATR", "Not all IUPAC-combinations of 'ATR' translate to 'I'" }, // R = AG // ok: 'ATG' translates to 'M', not to 'I'

        { '*', "TYA", "Not all IUPAC-combinations of 'TYA' translate to '*'" }, // Y = TC; TCA(code=15) TTA(code=16) -> no code for both
        { '*', "TRR", "Not all IUPAC-combinations of 'TRR' translate to '*'" }, // R = AG (TGG does never translate to '*')
        { '*', "WGA", "Not all IUPAC-combinations of 'WGA' translate to '*'" }, // W = AT; AGA(1) TGA(other) -> no common codes
        { '*', "THA", "Not all IUPAC-combinations of 'THA' translate to '*'" }, // H = ACT; TAA(many) TCA(15) TTA(16) -> no code overlap between TCA and TTA

        { 'X', "...", "No nucleotides left" },
        { 'X', "..",  "No nucleotides left" },
        { 'X', "-",   "No nucleotides left" },
        { 'X', "",    "No nucleotides left" },

        // test invalid chars
        { 'X', "AZA", "Invalid character 'Z' in DNA" },
        { 'X', "A@A", "Invalid character '@' in DNA" },
        { 'L', "AZA", "Invalid character 'Z' in DNA" },

        // tests to protect buffer overflows in dna

        { 'A', "--", "No nucleotides left" },
        { 'L', ".",  "No nucleotides left" },
        { 'J', ".",  "No nucleotides left" },
        { 'L', "AT", "Not enough nucleotides (got 'AT')" },
        { 'L', "C",  "Not enough nucleotides (got 'C')" },
        { 'L', "",   "No nucleotides left" },

        { 0, NULp, NULp}
    };
    for (int c = 0; not_codon[c].protein; ++c) {
        const test_not_codon& C = not_codon[c];
        TEST_ANNOTATE(GBS_global_string("%c <- %s", C.protein, C.codon));

        TransTables  remaining;
        const char  *failure;
        bool         isCodon = AWT_is_codon(C.protein, C.codon, allowed, remaining, &failure);

        if (isCodon) {                                            // the test-case makes no sense in 'not_codon'
            TEST_EXPECT_EQUAL(remaining.to_string(TTIT_ARB), ""); // -> move the failing test-case up into 'is_codon'-section
        }
        else {
            TEST_EXPECT_EQUAL(failure, C.error);
        }
        TEST_EXPECT(!isCodon);
    }

    // ----------------------------------
    //      test uncombinable codons
    struct test_uncombinable_codons {
        char        protein1;
        const char *codon1;
        const char *tables;
        char        protein2;
        const char *codon2;
        const char *error;
    };
    test_uncombinable_codons uncomb_codons[] = {
        { '*', "TTA", "16",      'E', "SAR", "Not all IUPAC-combinations of 'SAR' translate to 'E' (for trans-table 23)" },
        { '*', "TTA", "16",      'X', "TRA", "'TRA' never translates to 'X' (for trans-table 23)" },
        { 'L', "TAG", "13,15",   'X', "TRA", "'TRA' never translates to 'X' (for any of the leftover trans-tables: 16,22)" },
        { 'L', "TAG", "13,15",   'Q', "TAR", "'TAR' never translates to 'Q' (for any of the leftover trans-tables: 16,22)" },
        { '*', "TTA", "16",      '*', "TCA", "'TCA' does not translate to '*' (for trans-table 23)" },
        { 'N', "AAA", "6,11,14", 'X', "AAW", "'AAW' never translates to 'X' (for any of the leftover trans-tables: 9,14,21)" },
        { 'N', "AAA", "6,11,14", 'K', "AAA", "'AAA' does not translate to 'K' (for any of the leftover trans-tables: 9,14,21)" },

        { 0, NULp, NULp, 0, NULp, NULp}
    };

    for (int c = 0; uncomb_codons[c].protein1; ++c) {
        const test_uncombinable_codons& C = uncomb_codons[c];
        TEST_ANNOTATE(GBS_global_string("%c <- %s + %c <- %s", C.protein1, C.codon1, C.protein2, C.codon2));

        TransTables remaining1;
        const char *failure;
        bool        isCodon = AWT_is_codon(C.protein1, C.codon1, allowed, remaining1, &failure);

        TEST_EXPECT(isCodon);
        TEST_EXPECT_EQUAL(remaining1.to_string(TTIT_ARB), C.tables);

        // @@@ add separate test: show protein2/codon2 return true from AWT_is_codon if not called with remaining1

        TransTables remaining2;
        isCodon = AWT_is_codon(C.protein2, C.codon2, remaining1, remaining2, &failure);
        TEST_EXPECT_EQUAL(failure, C.error);
        TEST_REJECT(isCodon);

    }
}

#endif // UNIT_TESTS

// --------------------------------------------------------------------------------
