// =============================================================== // // // // File : AP_codon_table.cxx // // Purpose : codon definitions for DNA -> AA translation // // // // Coded by Ralf Westram (coder@reallysoft.de) in January 2010 // // Institute of Microbiology (Technical University Munich) // // http://www.arb-home.de/ // // // // =============================================================== // #include "AP_codon_table.hxx" #include "AP_pro_a_nucs.hxx" #include "iupac.h" #include #include #include #define pn_assert(cond) arb_assert(cond) #define EMBL_BACTERIAL_TABLE_INDEX 11 #define AWT_CODON_TABLE_MAX_NAME_LENGTH 57 // increasing this limit forces GUI re-layout (look4: AWT_get_codon_code_name) #define VALID_PROTEIN "ABCDEFGHIJKLMNPQRSTVWXYZ*" // all possible translations #define VALID_PROTEIN_NO_X "ABCDEFGHIJKLMNPQRSTVWYZ*" // same as VALID_PROTEIN w/o 'X' // ---------------------------------------------------------------------------------------------------- // // Info about translation codes was taken from // http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi // and // https://en.wikipedia.org/wiki/List_of_genetic_codes // // Whenever adding new or correcting existing code tables, please // - check data on NCBI webpage mentioned above // - document last update in ../../HELP_SOURCE/source/transl_table.hlp@LAST_UPDATE_FROM_WEBPAGE // // ---------------------------------------------------------------------------------------------------- static AWT_Codon_Code_Definition AWT_codon_def[AWT_CODON_TABLES+1] = { // 0000000000111111111122222222223333333333444444444455555555556666 codon number (0-63) // 0123456789012345678901234567890123456789012345678901234567890123 // // "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG", base1 // "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG", base2 // "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3 { " (1) Standard code", "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", // The first code in this table has to be 'Standard code'! "---M------**--*----M---------------M----------------------------", 1 // arb:0 }, { " (2) Vertebrate mitochondrial code", "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG", "----------**--------------------MMMM----------**---M------------", 2 // arb:1 }, { " (3) Yeast mitochondrial code", "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "----------**----------------------MM----------------------------", 3 // arb:2 }, // " (X) 6789012345678901234567890123456789012345678901234567", // max.name length (57) { " (4) Coelenterate Mitochondrial + Mycoplasma/Spiroplasma", "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "--MM------**-------M------------MMMM---------------M------------", 4 // arb:3 }, { " (5) Invertebrate mitochondrial code", "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG", "---M------**--------------------MMMM---------------M------------", 5 // arb:4 }, { " (6) Ciliate, Dasycladacean and Hexamita nuclear code", "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "--------------*--------------------M----------------------------", 6 // arb:5 }, { " (9) Echinoderm and Flatworm mitochondrial code", "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", "----------**-----------------------M---------------M------------", 9 // arb:6 }, { "(10) Euplotid nuclear code", "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "----------**-----------------------M----------------------------", 10 // arb:7 }, // 0000000001111111111222222222233333333334444444444555555555566666 // 1234567890123456789012345678901234567890123456789012345678901234 // "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG", base1 // "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG", base2 // "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3 { "(11) Bacterial and Plant Plastid code", "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "---M------**--*----M------------MMMM---------------M------------", 11 // arb:8 }, { "(12) Alternative Yeast nuclear code", "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "----------**--*----M---------------M----------------------------", 12 // arb:9 }, { "(13) Ascidian mitochondrial code", "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG", "---M------**----------------------MM---------------M------------", 13 // arb:10 }, { "(14) Alternative Flatworm mitochondrial code", "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", "-----------*-----------------------M----------------------------", 14 // arb:11 }, { "(15) Blepharisma nuclear code (deleted?)", // why is it no longer listed at NCBI? "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "----------*---*--------------------M----------------------------", // converted to new format manually (no source) 15 // arb:12 }, { "(16) Chlorophycean mitochondrial code", "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "----------*---*--------------------M----------------------------", 16 // arb:13 }, { "(21) Trematode mitochondrial code", "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG", "----------**-----------------------M---------------M------------", 21 // arb:14 }, { "(22) Scenedesmus obliquus mitochondrial code", "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "------*---*---*--------------------M----------------------------", 22 // arb:15 }, { "(23) Thraustochytrium mitochondrial code", "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "--*-------**--*-----------------M--M---------------M------------", 23 // arb:16 }, { "(24) Pterobranchia Mitochondrial Code", "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG", "---M------**-------M---------------M---------------M------------", 24 // arb:17 }, { "(25) Candidate Division SR1 and Gracilibacteria Code", "FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "---M------**-----------------------M---------------M------------", 25 // arb:18 }, { "(26) Pachysolen tannophilus Nuclear Code", "FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "----------**--*----M---------------M----------------------------", 26 // arb:19 }, { "(27) Karyorelict Nuclear", "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "--------------*--------------------M----------------------------", 27 // arb:20 }, { "(28) Condylostoma Nuclear", "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "----------**--*--------------------M----------------------------", 28 // arb:21 }, { "(29) Mesodinium Nuclear", "FFLLSSSSYYYYCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "--------------*--------------------M----------------------------", 29 // arb:22 }, { "(30) Peritrich Nuclear", "FFLLSSSSYYEECC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "--------------*--------------------M----------------------------", 30 // arb:23 }, { "(31) Blastocrithidia Nuclear", "FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "----------**-----------------------M----------------------------", 31 // arb:24 }, { NULp, NULp, NULp, 0 } // end of table-marker }; // When adding new genetic code: // - increase AP_codon_table.hxx@AWT_CODON_TABLES // - increase .@MAX_EMBL_TRANSL_TABLE_VALUE // - add arb-codenr to .@ALL_TABLES #define MAX_EMBL_TRANSL_TABLE_VALUE 31 // maximum known EMBL transl_table value // -------------------------------------------------------------------------------- int TTIT_embl2arb(int embl_code_nr) { // returns -1 if embl_code_nr is not known by ARB static bool initialized = false; static int arb_code_nr_table[MAX_EMBL_TRANSL_TABLE_VALUE+1]; // key: embl_code_nr, value: arb_code_nr or -1 if (!initialized) { for (int embl = 0; embl <= MAX_EMBL_TRANSL_TABLE_VALUE; ++embl) { arb_code_nr_table[embl] = -1; // illegal table } for (int arb_code_nr = 0; arb_code_nr < AWT_CODON_TABLES; ++arb_code_nr) { int embl = AWT_codon_def[arb_code_nr].embl_feature_transl_table; pn_assert(embl<=MAX_EMBL_TRANSL_TABLE_VALUE); // defined embl code is above limit pn_assert(arb_code_nr_table[embl] == -1); // duplicate definition of EMBL table number arb_code_nr_table[embl] = arb_code_nr; } // should be index of 'Bacterial and Plant Plastid code' // (otherwise maybe AWAR_PROTEIN_TYPE_bacterial_code_index is wrong) pn_assert(arb_code_nr_table[EMBL_BACTERIAL_TABLE_INDEX] == AWAR_PROTEIN_TYPE_bacterial_code_index); pn_assert(arb_code_nr_table[1] == 0); // Standard code has to be on index zero! pn_assert(arb_code_nr_table[MAX_EMBL_TRANSL_TABLE_VALUE] != -1); // arb_code_nr_table is defined too big initialized = true; } if (embl_code_nr<0 || embl_code_nr>MAX_EMBL_TRANSL_TABLE_VALUE) return -1; int arb_code_nr = arb_code_nr_table[embl_code_nr]; #ifdef DEBUG if (arb_code_nr != -1) { pn_assert(arb_code_nr >= 0 && arb_code_nr < AWT_CODON_TABLES); pn_assert(TTIT_arb2embl(arb_code_nr) == embl_code_nr); } #endif return arb_code_nr; } int TTIT_arb2embl(int arb_code_nr) { pn_assert(arb_code_nr >= 0 && arb_code_nr(AWT_MAX_CODONS+1); ambEntry[0] = possible_translation; ambLen = 1; } else if (!strchr(ambEntry, possible_translation)) { ambEntry[ambLen++] = possible_translation; } } void AP_initialize_codon_tables() { if (codon_tables_initialized) return; int codon_nr; int code_nr; for (codon_nr=0; codon_nr=1); pn_assert(!AWT_codon_def[AWT_CODON_TABLES].aa); // Error in AWT_codon_def or AWT_CODON_CODES for (code_nr=0; code_nr=0 && idx<4); return "TCAG"[idx]; } inline int calc_codon_nr(const char *dna) { int i1 = dna2idx(dna[0]); if (i1 == 4) return AWT_MAX_CODONS; // is not a codon int i2 = dna2idx(dna[1]); if (i2 == 4) return AWT_MAX_CODONS; int i3 = dna2idx(dna[2]); if (i3 == 4) return AWT_MAX_CODONS; int codon_nr = i1*16 + i2*4 + i3; pn_assert(codon_nr>=0 && codon_nr<=AWT_MAX_CODONS); return codon_nr; } inline void build_codon(int codon_nr, char *to_buffer) { pn_assert(codon_nr>=0 && codon_nr>4)&3); to_buffer[1] = idx2dna((codon_nr>>2)&3); to_buffer[2] = idx2dna(codon_nr&3); } const char* AWT_get_codon_code_name(int code) { pn_assert(code>=0 && code='A' && aa<='Z') return aa_3letter_name[aa-'A']; return NULp; } #ifdef DEBUG inline char nextBase(char c) { switch (c) { case 'T': return 'C'; case 'C': return 'A'; case 'A': return 'G'; #if 0 case 'G': return 0; #else case 'G': return 'M'; case 'M': return 'R'; case 'R': return 'W'; case 'W': return 'S'; case 'S': return 'Y'; case 'Y': return 'K'; case 'K': return 'V'; case 'V': return 'H'; case 'H': return 'D'; case 'D': return 'B'; case 'B': return 'N'; case 'N': return 0; #endif default: pn_assert(0); } return 0; } void AWT_dump_codons(TranslationTableIndexType type, bool skipX) { // use for debugging const TransTables all_allowed; for (char c='*'; c<='Z'; c++) { printf("Codons for '%c': ", c); if (skipX && c == 'X') { fputs("skipped", stdout); } else { bool first_line = true; bool found = false; for (char b1='T'; b1; b1=nextBase(b1)) { for (char b2='T'; b2; b2=nextBase(b2)) { for (char b3='T'; b3; b3=nextBase(b3)) { char dna[4]; dna[0]=b1; dna[1]=b2; dna[2]=b3; dna[3]=0; TransTables remaining; if (AWT_is_codon(c, dna, all_allowed, remaining)) { if (!first_line) fputs("\n ", stdout); first_line = false; printf("%s (%s)", dna, remaining.to_string(type)); found = true; } } } } if (!found) fputs("none", stdout); } fputs("\n", stdout); if (c=='*') c='A'-1; } } #endif inline char isStartOrStopCodonNr(int codon_nr, int code_nr) { char isStartStop = 0; pn_assert(code_nr >= 0 && code_nr= 0 && codon_nr brute force implementation ok) TransTables allowed; allowed.forbidAllBut(CodeNr()); TransTables remaining = allowed; bool is_start = AWT_is_codon('M', codon, allowed, remaining, NULp); bool is_stop = is_start ? false : AWT_is_codon('*', codon, allowed, remaining, NULp); pn_assert(!(is_start && is_stop)); result = is_start ? 'M' : (is_stop ? '*' : 0); } else { // codon is a clean codon result = isStartOrStopCodonNr(calc_codon_nr(codon), code_nr); } return result; } inline bool protMatches(char p1, char p2) { /*! return true if p1 matches p2 * @param p1 "normal" protein (neither B, Z nor J) * @param p2 any protein (B, Z and J ok) * B is a shortcut for Asp(=D) or Asn(=N) * J is a shortcut for Ile(=I) or Leu(=L) * Z is a shortcut for Glu(=E) or Gln(=Q) */ pn_assert(p1 != 'B' && p1 != 'Z' && p1 != 'J'); pn_assert(p1 == toupper(p1)); pn_assert(p2 == toupper(p2)); if (p1 == p2) return true; if (p2 == 'B') return p1 == 'D' || p1 == 'N'; if (p2 == 'J') return p1 == 'I' || p1 == 'L'; if (p2 == 'Z') return p1 == 'E' || p1 == 'Q'; return false; } inline bool containsProtMatching(const char *pstr, char p) { /*! return true, if 'pstr' contains any protein that matches 'p'. * uses same logic as protMatches() */ pn_assert(p == toupper(p)); if (p == 'B') return strchr(pstr, 'D') || strchr(pstr, 'N'); if (p == 'J') return strchr(pstr, 'I') || strchr(pstr, 'L'); if (p == 'Z') return strchr(pstr, 'E') || strchr(pstr, 'Q'); return strchr(pstr, p); } inline bool isGap(char c) { return GAP::is_std_gap(c); } inline GB_ERROR neverTranslatesError(const char *dna, char protein) { if (!strchr(VALID_PROTEIN, protein)) { return GBS_global_string("'%c' is no valid amino acid", protein); } return GBS_global_string("'%c%c%c' never translates to '%c'", dna[0], dna[1], dna[2], protein); } bool AWT_is_codon(char protein, const char *const dna, const TransTables& allowed, TransTables& remaining, const char **fail_reason_ptr) { /*! test if 'dna' codes 'protein' * @param protein amino acid * @param dna three nucleotides (gaps allowed, e.g. 'A-C' can be tested vs 'X') * @param allowed allowed translation tables * @param remaining returns the remaining allowed translation tables (only if this functions returns true) * @param fail_reason_ptr if not NULp => store reason for failure here (or set it to NULp on success) * @return true if dna translates to protein */ pn_assert(allowed.any()); pn_assert(codon_tables_initialized); const char *fail_reason = NULp; if (fail_reason_ptr) *fail_reason_ptr = NULp; bool is_codon = false; int codon_nr = calc_codon_nr(dna); int first_iupac_pos = -1; int iupac_positions = 0; bool decided = false; bool general_failure = false; protein = toupper(protein); if (codon_nr==AWT_MAX_CODONS) { // dna is not a clean codon (i.e. it contains iupac-codes or gaps) bool too_short = false; int nucs_seen = 0; for (int iupac_pos=0; iupac_pos<3 && !too_short && !fail_reason; iupac_pos++) { char N = dna[iupac_pos]; if (!N) too_short = true; else if (!isGap(N)) { nucs_seen++; if (!strchr("ACGTU", N)) { if (first_iupac_pos==-1) first_iupac_pos = iupac_pos; iupac_positions++; const char *decoded_iupac = iupac::decode(N, GB_AT_DNA, 0); if (!decoded_iupac[0]) { // no valid IUPAC fail_reason = GBS_global_string("Invalid character '%c' in DNA", N); } } } } if (!fail_reason && !nucs_seen) { // got no dna fail_reason = "No nucleotides left"; } else if (nucs_seen<3) { too_short = true; } if (fail_reason) { decided = true; // fails for all proteins } else if (too_short) { decided = true; if (protein == 'X') { is_codon = true; } else { char dna_copy[4]; strncpy(dna_copy, dna, 3); dna_copy[3] = 0; fail_reason = GBS_global_string("Not enough nucleotides (got '%s')", dna_copy); } } } if (!decided) { if (protein == 'X') { TransTables allowed_copy = allowed; const char *valid_prot = VALID_PROTEIN_NO_X; for (int i = 0; valid_prot[i]; ++i) { if (AWT_is_codon(valid_prot[i], dna, allowed_copy, remaining)) { allowed_copy.forbid(remaining); if (allowed_copy.none()) break; } } if (allowed_copy.any()) { is_codon = true; remaining = allowed_copy; } else { fail_reason = neverTranslatesError(dna, protein); } } else if (codon_nr==AWT_MAX_CODONS) { // dna is a codon with one or more IUPAC codes pn_assert(iupac_positions); const char *decoded_iupac = iupac::decode(dna[first_iupac_pos], GB_AT_DNA, 0); pn_assert(decoded_iupac[0]); // already should have been catched above char dna_copy[4]; memcpy(dna_copy, dna, 3); dna_copy[3] = 0; bool all_are_codons = true; bool one_is_codon = false; TransTables allowed_copy = allowed; for (int i=0; decoded_iupac[i]; i++) { dna_copy[first_iupac_pos] = decoded_iupac[i]; const char *subfail; if (!AWT_is_codon(protein, dna_copy, allowed_copy, remaining, &subfail)) { all_are_codons = false; if (!one_is_codon && ARB_strBeginsWith(subfail, "Not all ")) one_is_codon = true; if (one_is_codon) break; } else { one_is_codon = true; allowed_copy = remaining; } } if (all_are_codons) { pn_assert(allowed_copy.any()); remaining = allowed_copy; is_codon = true; } else { remaining.forbidAll(); dna_copy[first_iupac_pos] = dna[first_iupac_pos]; if (one_is_codon) { fail_reason = GBS_global_string("Not all IUPAC-combinations of '%s' translate to '%c'", dna_copy, protein); // careful when changing this message (see above) } else { fail_reason = neverTranslatesError(dna_copy, protein); } } } else if (definite_translation[codon_nr]) { // codon has a definite translation (i.e. translates equal for all code-tables) char defTransl = definite_translation[codon_nr]; #if defined(ASSERTION_USED) bool optionalCodonExists = false; for (int code_nr=0; code_nr definite_translation[] is wrong #endif int ok = protMatches(defTransl, protein); if (ok) { remaining = allowed; is_codon = true; } else { remaining.forbidAll(); fail_reason = GBS_global_string("'%c%c%c' translates to '%c', not to '%c'", dna[0], dna[1], dna[2], defTransl, protein); general_failure = true; } } else if (!containsProtMatching(ambiguous_codons[codon_nr], protein)) { // codon does not translate to protein in any code-table remaining.forbidAll(); fail_reason = neverTranslatesError(dna, protein); general_failure = true; } else { #if defined(ASSERTION_USED) bool correct_disallowed_translation = false; #endif // Now codon translates to protein in at least 1 code-table! // Check whether protein translates in any of the allowed code-tables and forbid rest for (int code_nr=0; code_nr=0 && code_nr=0 && i<4; } inline const char *buildMixedCodon(const char *const con1, const char *const con2) { int mismatches = 0; int mismatch_index = -1; static char buf[4]; for (int i=0; i<3; i++) { if (con1[i]!=con2[i]) { mismatches++; mismatch_index = i; } else { buf[i] = con1[i]; } } if (mismatches==1) { // exactly one position differs between codons pn_assert(mismatch_index!=-1); buf[mismatch_index] = iupac::combine(con1[mismatch_index], con2[mismatch_index], GB_AT_DNA); buf[3] = 0; if (memcmp(con1, buf, 3) == 0 || memcmp(con2, buf, 3) == 0) { return NULp; } #if defined(DUMP_CODON_GROUP_EXPANSION) printf(" buildMixedCodon('%c%c%c','%c%c%c') == '%s'\n", con1[0], con1[1], con1[2], con2[0], con2[1], con2[2], buf); #endif return buf; } return NULp; } static int expandMore(const char *bufferStart, int no_of_condons, char*&to_buffer) { int i, j; const char *con1, *con2; int added = 0; for (i=0; i try behind } if (!found) { memmove(to_buffer, result, 3); to_buffer+=3; added++; } } } } return no_of_condons+added; } int Codon_Group::expand(char *to_buffer) const { int count = 0; int i; char *org_to_buffer = to_buffer; for (i=0; i done count = new_count; #if defined(DUMP_CODON_GROUP_EXPANSION) to_buffer[0] = 0; printf("codons (expandedMore) = '%s'\n", org_to_buffer); #endif } pn_assert(count==(int(to_buffer-org_to_buffer)/3)); return count; } // -------------------------------------------------------------------------------- static Codon_Group *get_Codon_Group(char protein, int code_nr) { pn_assert(code_nr>=0 && code_nrexpand(buffer); pn_assert(offset #endif static const char *startStopSummary() { // returns string showing summary for start/stop // position = codon_nr // content: // '*' -> translates to stop-codon for at least one code // 'M' -> translates to start-codon for at least one code // '2' -> both (not necessarily same code) // '-' -> does not translate to start or stop for any code static char result[AWT_MAX_CODONS+1]; for (int codon = 0; codon only non-optional start/stop // '!' -> only optional start/stop // '?' -> both // ' ' -> never start or stop static char result[AWT_MAX_CODONS+1]; for (int codon = 0; codonX (or disallow call) const TransTables allowed; // --------------------------- // test valid codons struct test_is_codon { char protein; const char *codon; const char *tables; }; #define ALL_TABLES "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" // contains arb table-numbers test_is_codon is_codon[] = { { 'P', "CCC", ALL_TABLES }, { 'P', "CCN", ALL_TABLES }, { 'R', "CGN", ALL_TABLES }, { 'D', "GAY", ALL_TABLES }, { 'N', "AAY", ALL_TABLES }, { 'B', "AAY", ALL_TABLES }, // translates to 'N', but matches B(=D|N) for realigner { 'B', "GAY", ALL_TABLES }, // translates to 'D', but matches B(=D|N) for realigner { 'B', "RAY", ALL_TABLES }, // translates to 'D' or to 'N' (i.e. only matches 'B', see failing test for 'RAY' below) { 'B', "RAT", ALL_TABLES }, { 'Q', "CAR", ALL_TABLES }, { 'E', "GAR", ALL_TABLES }, { 'Z', "SAR", ALL_TABLES }, { 'X', "NNN", ALL_TABLES }, { 'L', "TTR", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15" ",17,18,19,20,21,22,23,24" }, { 'X', "TTR", "16" }, { 'L', "YTA", "0,1"",3,4,5,6,7,8,9,10,11,12,13,14,15" ",17,18,19,20,21,22,23,24" }, { 'X', "YTA", "2,16" }, // Y=TC { 'L', "CTM", "0,1"",3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" }, { 'T', "CTM", "2" }, // M=AC { 'L', "CTN", "0,1"",3,4,5,6,7,8"",10,11,12,13,14,15,16,17,18" ",20,21,22,23,24" }, { 'T', "CTN", "2" }, { 'X', "CTN", "9,19" }, { 'L', "CTK", "0,1"",3,4,5,6,7,8"",10,11,12,13,14,15,16,17,18" ",20,21,22,23,24" }, { 'T', "CTK", "2" }, { 'X', "CTK", "9,19" }, // K=TG { 'L', "TWG", "13,15" }, // W=AT { 'J', "TWG", "13,15" }, // translates to 'L', but matches J(=I|L) for realigner { 'X', "TWG", "0,1,2,3,4,5,6,7,8,9,10,11,12" ",14" ",16,17,18,19,20,21,22,23,24" }, // all but 'L<->TWG' { 'S', "AGY", ALL_TABLES }, { 'S', "TCY", ALL_TABLES }, { 'S', "TCN", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,17,18,19,20,21,22,23,24" }, // all but 15 (where 'TCA->*') { 'S', "AGN", "4,6,11,14" }, { 'S', "AGR", "4,6,11,14" }, { '*', "AGR", "1" }, // R=AG { 'G', "AGR", "10" }, { 'X', "AGR", "17" }, { 'R', "AGR", "0,2,3,5,7,8,9,12,13,15,16,18,19,20,21,22,23,24" }, { 'G', "AGA", "10" }, { 'S', "AGA", "4,6,11,14,17" }, { 'R', "AGA", "0,2,3,5,7,8,9,12,13,15,16,18,19,20,21,22,23,24" }, { '*', "AGA", "1" }, { 'K', "AGG", "17" }, { 'W', "TGR", "1,2,3,4,6,10,11,14,17,20,21,24" }, { 'X', "TGR", "0,5,7,8,9,12,13,15,16,18,19,22,23" }, // all but 'W<->TGR' (e.g. code==0: TGA->* & TGG->W => TGR->X) { 'C', "TGW", "7" }, // W = AT { 'X', "TGW", "0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" }, // all but 'C<->TGW' { 'C', "TGT", ALL_TABLES }, { 'C', "TGA", "7" }, { 'G', "TGA", "18" }, { 'W', "TGA", "1,2,3,4,6,10,11,14,17,20,21,24" }, { '*', "TGA", "0,5,8,9,12,13,15,16,19,20,21,22,23" }, // standard stop codons { '*', "TAA", "0,1,2,3,4,6,7,8,9,10,12,13,14,15,16,17,18,19,21,24" }, { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" }, { '*', "TRA", "0,8,9,12,13,15,16,19,21" }, // R=AG { 'X', "TRA", "1,2,3,4,5,6,7,10,11,14,17,18,20,22,23,24" }, // all but '*<->TRA' { '*', "TAR", "0,1,2,3,4,6,7,8,9,10,14,16,17,18,19,21,24" }, { 'Y', "TAR", "22" }, { 'E', "TAR", "23,24" }, { 'Q', "TAR", "5,20,21" }, { 'Z', "TAR", "5,20,21,23,24" }, // Z=EQ (TAR never translates to 'E', only 'Q') { 'X', "TAR", "11,12,13,15" }, { 'B', "AAW", "6,11,14" }, // W=AT { 'N', "AAW", "6,11,14" }, { 'X', "AAW", "0,1,2,3,4,5,7,8,9,10,12,13,15,16,17,18,19,20,21,22,23,24" }, // all but 'B<->AAW' & 'N<->AAW' { 'T', "CTG", "2" }, { 'S', "CTG", "9" }, { 'A', "CTG", "19" }, { 'L', "CTG", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // all but 'T<->CTG' & 'S<->CTG' { 'J', "CTG", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // same as for 'L' { 'M', "CTG", "0,3,8,9,17,19" }, // optional start-codon { 'T', "CTR", "2" }, { 'X', "CTR", "9,19" }, { 'L', "CTR", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // all but 'T<->CTR' & 'X<->CTR' { 'E', "KAR", "23,24" }, // Q <->KAR fails (see below) { 'Z', "KAR", "5,20,21,23,24" }, // Z=E|Q { 'X', "KAR", "0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,22" }, { 'G', "KGA", "18" }, { 'X', "KGA", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20,21,22,23,24" }, // all but G<->KGA { 'E', "TAG", "23,24" }, { 'Q', "TAG", "5,12,20,21" }, { 'L', "TAG", "13,15" }, { 'Y', "TAG", "22" }, { 'J', "TAG", "13,15" }, // J=I|L { 'Z', "TAG", "5,12,20,21,23,24" }, // Z=E|Q { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" }, { 'J', "WTA", "0,3,5,6,7,8,9,11,12,13,15,17,18,19,20,21,22,23,24" }, { 'X', "A-C", ALL_TABLES }, { 'X', ".T.", ALL_TABLES }, // tests to protect buffer overflows in dna { 'X', "CG", ALL_TABLES }, { 'X', "T", ALL_TABLES }, // 0000000000111111111122222222223333333333444444444455555555556666 codon number (0-63) // 0123456789012345678901234567890123456789012345678901234567890123 // // "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG" base1 // "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG" base2 // "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3 // "--2M--*---**--*----M------------MMMM----------**---M------------" (= startStopSummary) // " ?! - ?? ? ! !!?- -- ! " (= optionality: !=all start/stop optional; -=no start/stop optional, ?=mixed) // test all start codons: { 'M', "TTA", "3" }, // start AND stop -> see ../ALILINK/TranslateRealign.cxx@TTA_AMBIGUITY { 'M', "TTG", "0,3,4,8,10,17,18" }, { 'L', "TTG", ALL_TABLES }, // M <->CTG already tested above { 'M', "ATT", "1,3,4,8,16" }, { 'M', "ATC", "1,3,4,8" }, { 'M', "ATA", "1,2,3,4,8,10,14" }, { 'I', "ATA", "0,3,5,6,7,8,9,11,12,13,15,16,17,18,19,20,21,22,23,24" }, // optional for 3, 8 { 'M', "ATG", ALL_TABLES }, // no optional start { 'M', "ATR", "1,2,3,4,8,10,14" }, // R = AG (code=3 -> ATA->IM ATG->M) { 'M', "ATM", "1,3,4,8" }, // M = AC { 'M', "ATS", "1,3,4,8" }, // S = CG { 'M', "ATY", "1,3,4,8" }, // Y = TC { 'M', "ATK", "1,3,4,8,16" }, // K = TG { 'M', "ATW", "1,3,4,8" }, // W = AT { 'M', "ATV", "1,3,4,8" }, // V = ACG { 'M', "ATB", "1,3,4,8" }, // B = TCG { 'M', "ATD", "1,3,4,8" }, // D = ATG { 'M', "ATH", "1,3,4,8" }, // H = ACT { 'I', "ATH", "0,3,5,6,7,8,9,11,12,13,15,16,17,18,19,20,21,22,23,24" }, { 'X', "ATH", "2,10,14" }, { 'M', "ATN", "1,3,4,8" }, // H = ATCG { 'M', "GTG", "1,3,4,6,8,10,14,16,17,18" }, // test all stop codons: { '*', "AGA", "1" }, // (DUPTEST) { '*', "AGG", "1" }, { '*', "TAA", "0,1,2,3,4,6,7,8,9,10,12,13,14,15,16,17,18,19,21,24" },//(DUPTEST) { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" }, // (DUPTEST) { '*', "TCA", "15" }, { '*', "TGA", "0,5,8,9,12,13,15,16,19,20,21,22,23" }, // (DUPTEST) { '*', "TTA", "16" }, { '*', "TWA", "16" }, // W = AT { '*', "TMA", "15" }, // M = AC { '*', "TAR", "0,1,2,3,4,6,7,8,9,10,14,16,17,18,19,21,24" }, // R = AG (DUPTEST) { '*', "TRA", "0,8,9,12,13,15,16,19,21" }, // R = AG (DUPTEST) { '*', "AGR", "1" }, // R = AG (DUPTEST) { 0, NULp, NULp} }; for (int c = 0; is_codon[c].protein; ++c) { const test_is_codon& C = is_codon[c]; TEST_ANNOTATE(GBS_global_string("%c <- %s", C.protein, C.codon)); TransTables remaining; const char *failure; bool isCodon = AWT_is_codon(C.protein, C.codon, allowed, remaining, &failure); TEST_EXPECT_NULL(failure); TEST_EXPECT(isCodon); TEST_EXPECT_EQUAL(remaining.to_string(TTIT_ARB), C.tables); } // ----------------------------- // test invalid codons struct test_not_codon { char protein; const char *codon; const char *error; }; test_not_codon not_codon[] = { { 'P', "SYK", "Not all IUPAC-combinations of 'SYK' translate to 'P'" }, // correct (possible translations are PAL) { 'F', "SYK", "'SYK' never translates to 'F'" }, // correct failure { 'P', "NNN", "Not all IUPAC-combinations of 'NNN' translate to 'P'" }, // correct failure { 'D', "RAY", "Not all IUPAC-combinations of 'RAY' translate to 'D'" }, // correct failure { 'E', "SAR", "Not all IUPAC-combinations of 'SAR' translate to 'E'" }, // correct failure { 'Q', "KAR", "Not all IUPAC-combinations of 'KAR' translate to 'Q'" }, // correct failure { 'S', "CYT", "'CYT' never translates to 'S'" }, // correct failure { 'O', "RAY", "'O' is no valid amino acid" }, { 'U', "AAA", "'U' is no valid amino acid" }, { 'L', "A-C", "Not enough nucleotides (got 'A-C')" }, // correct failure { 'V', ".T.", "Not enough nucleotides (got '.T.')" }, // correct failure { 'L', "...", "No nucleotides left" }, { 'J', "...", "No nucleotides left" }, { 'I', "ATR", "Not all IUPAC-combinations of 'ATR' translate to 'I'" }, // R = AG // ok: 'ATG' translates to 'M', not to 'I' { '*', "TYA", "Not all IUPAC-combinations of 'TYA' translate to '*'" }, // Y = TC; TCA(code=15) TTA(code=16) -> no code for both { '*', "TRR", "Not all IUPAC-combinations of 'TRR' translate to '*'" }, // R = AG (TGG does never translate to '*') { '*', "WGA", "Not all IUPAC-combinations of 'WGA' translate to '*'" }, // W = AT; AGA(1) TGA(other) -> no common codes { '*', "THA", "Not all IUPAC-combinations of 'THA' translate to '*'" }, // H = ACT; TAA(many) TCA(15) TTA(16) -> no code overlap between TCA and TTA { 'X', "...", "No nucleotides left" }, { 'X', "..", "No nucleotides left" }, { 'X', "-", "No nucleotides left" }, { 'X', "", "No nucleotides left" }, // test invalid chars { 'X', "AZA", "Invalid character 'Z' in DNA" }, { 'X', "A@A", "Invalid character '@' in DNA" }, { 'L', "AZA", "Invalid character 'Z' in DNA" }, // tests to protect buffer overflows in dna { 'A', "--", "No nucleotides left" }, { 'L', ".", "No nucleotides left" }, { 'J', ".", "No nucleotides left" }, { 'L', "AT", "Not enough nucleotides (got 'AT')" }, { 'L', "C", "Not enough nucleotides (got 'C')" }, { 'L', "", "No nucleotides left" }, { 0, NULp, NULp} }; for (int c = 0; not_codon[c].protein; ++c) { const test_not_codon& C = not_codon[c]; TEST_ANNOTATE(GBS_global_string("%c <- %s", C.protein, C.codon)); TransTables remaining; const char *failure; bool isCodon = AWT_is_codon(C.protein, C.codon, allowed, remaining, &failure); if (isCodon) { // the test-case makes no sense in 'not_codon' TEST_EXPECT_EQUAL(remaining.to_string(TTIT_ARB), ""); // -> move the failing test-case up into 'is_codon'-section } else { TEST_EXPECT_EQUAL(failure, C.error); } TEST_EXPECT(!isCodon); } // ---------------------------------- // test uncombinable codons struct test_uncombinable_codons { char protein1; const char *codon1; const char *tables; char protein2; const char *codon2; const char *error; }; test_uncombinable_codons uncomb_codons[] = { { '*', "TTA", "16", 'E', "SAR", "Not all IUPAC-combinations of 'SAR' translate to 'E' (for trans-table 23)" }, { '*', "TTA", "16", 'X', "TRA", "'TRA' never translates to 'X' (for trans-table 23)" }, { 'L', "TAG", "13,15", 'X', "TRA", "'TRA' never translates to 'X' (for any of the leftover trans-tables: 16,22)" }, { 'L', "TAG", "13,15", 'Q', "TAR", "'TAR' never translates to 'Q' (for any of the leftover trans-tables: 16,22)" }, { '*', "TTA", "16", '*', "TCA", "'TCA' does not translate to '*' (for trans-table 23)" }, { 'N', "AAA", "6,11,14", 'X', "AAW", "'AAW' never translates to 'X' (for any of the leftover trans-tables: 9,14,21)" }, { 'N', "AAA", "6,11,14", 'K', "AAA", "'AAA' does not translate to 'K' (for any of the leftover trans-tables: 9,14,21)" }, { 0, NULp, NULp, 0, NULp, NULp} }; for (int c = 0; uncomb_codons[c].protein1; ++c) { const test_uncombinable_codons& C = uncomb_codons[c]; TEST_ANNOTATE(GBS_global_string("%c <- %s + %c <- %s", C.protein1, C.codon1, C.protein2, C.codon2)); TransTables remaining1; const char *failure; bool isCodon = AWT_is_codon(C.protein1, C.codon1, allowed, remaining1, &failure); TEST_EXPECT(isCodon); TEST_EXPECT_EQUAL(remaining1.to_string(TTIT_ARB), C.tables); // @@@ add separate test: show protein2/codon2 return true from AWT_is_codon if not called with remaining1 TransTables remaining2; isCodon = AWT_is_codon(C.protein2, C.codon2, remaining1, remaining2, &failure); TEST_EXPECT_EQUAL(failure, C.error); TEST_REJECT(isCodon); } } #endif // UNIT_TESTS // --------------------------------------------------------------------------------