// =============================================================== // // // // File : ED4_protein_2nd_structure.cxx // // Purpose : // // // // Institute of Microbiology (Technical University Munich) // // http://www.arb-home.de/ // // // // =============================================================== // /*! \file ED4_protein_2nd_structure.cxx * \brief Implements the functions defined in ed4_protein_2nd_structure.hxx. * \author Markus Urban * \date 2008-02-08 * \sa Refer to ed4_protein_2nd_structure.hxx for details, please. */ #include "ed4_protein_2nd_structure.hxx" #include "ed4_class.hxx" #include "ed4_awars.hxx" #include #include #include #include "arbdbt.h" #include #include #define e4_assert(bed) arb_assert(bed) using namespace std; // -------------------------------------------------------------------------------- // exported data //! Awars for the match type; binds the #PFOLD_MATCH_TYPE to the corresponding awar name. name_value_pair pfold_match_type_awars[] = { { "Perfect_match", STRUCT_PERFECT_MATCH }, { "Good_match", STRUCT_GOOD_MATCH }, { "Medium_match", STRUCT_MEDIUM_MATCH }, { "Bad_match", STRUCT_BAD_MATCH }, { "No_match", STRUCT_NO_MATCH }, { "Unknown_match", STRUCT_UNKNOWN }, { NULp, PFOLD_MATCH_TYPE_COUNT } }; //! Symbols for the match quality (defined by #PFOLD_MATCH_TYPE) as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT in ED4_pfold_calculate_secstruct_match(). char *pfold_pair_chars[PFOLD_PAIRS] = { ARB_strdup(" "), // STRUCT_PERFECT_MATCH ARB_strdup("-"), // STRUCT_GOOD_MATCH ARB_strdup("~"), // STRUCT_MEDIUM_MATCH ARB_strdup("+"), // STRUCT_BAD_MATCH ARB_strdup("#"), // STRUCT_NO_MATCH ARB_strdup("?") // STRUCT_UNKNOWN }; //! Match pair definition (see #PFOLD_MATCH_TYPE) as used for match methods #SECSTRUCT_SECSTRUCT and #SECSTRUCT_SEQUENCE_PREDICT in ED4_pfold_calculate_secstruct_match(). char *pfold_pairs[PFOLD_PAIRS] = { ARB_strdup("HH GG II TT EE BB SS -- -. .."), // STRUCT_PERFECT_MATCH ARB_strdup("HG HI HS EB ES TS H- G- I- T- E- B- S-"), // STRUCT_GOOD_MATCH ARB_strdup("HT GT IT"), // STRUCT_MEDIUM_MATCH ARB_strdup("ET BT"), // STRUCT_BAD_MATCH ARB_strdup("EH BH EG EI"), // STRUCT_NO_MATCH ARB_strdup("") // STRUCT_UNKNOWN }; static struct pfold_mem_handler { ~pfold_mem_handler() { for (int i = 0; i * H = alpha-helix,
* E = beta-sheet,
* T = beta-turn. */ static char structure_chars[3] = { 'H', 'E', 'T' }; //! Amino acids that break a certain structure (#ALPHA_HELIX or #BETA_SHEET) as used in ED4_pfold_extend_nucleation_sites(). static const char *structure_breaker[2] = { "NYPG", "PDESGK" }; //! Amino acids that are indifferent for a certain structure (#ALPHA_HELIX or #BETA_SHEET) as used in ED4_pfold_extend_nucleation_sites(). static const char *structure_indifferent[2] = { "RTSC", "RNHA" }; //! Awars for the match method; binds the #PFOLD_MATCH_METHOD to the corresponding name that is used to create the menu in ED4_pfold_create_props_window(). static name_value_pair pfold_match_method_awars[4] = { { "Secondary Structure <-> Secondary Structure", SECSTRUCT_SECSTRUCT }, { "Secondary Structure <-> Sequence", SECSTRUCT_SEQUENCE }, { "Secondary Structure <-> Sequence (Full Prediction)", SECSTRUCT_SEQUENCE_PREDICT }, { NULp, PFOLD_MATCH_METHOD_COUNT } }; static double max_former_value[3] = { 1.42, 1.62, 156 }; //!< Maximum former value for alpha-helix, beta-sheet (in #cf_parameters) and beta-turn (in #cf_parameters_norm). static double min_former_value[3] = { 0.0, 0.0, 47 }; //!< Minimum former value for alpha-helix, beta-sheet (in #cf_parameters) and beta-turn (in #cf_parameters_norm). static double max_breaker_value[3] = { 1.21, 2.03, 0.0 }; //!< Maximum breaker value for alpha-helix, beta-sheet (in #cf_parameters) and beta-turn (no breaker values => 0). // -------------------------------------------------------------------------------- // TODO: is there a way to prevent doxygen from stripping the comments from the table? // I simply added the parameter table as verbatim environment to show the comments in // the documentation. /*! \brief Former and breaker values for alpha-helices and beta-sheets (= strands). * * \hideinitializer * \par Initial value: * \verbatim { // Helix Former Strand Former Helix Breaker Strand Breaker Amino // Value Value Value Value Acid // ----------------------------------------------------------------------- { 1.34, 0.00, 0.00, 0.00 }, // A { 0.00, 0.00, 0.00, 0.00 }, // R { 0.50, 0.00, 0.00, 1.39 }, // D { 0.00, 0.00, 1.03, 0.00 }, // N { 0.00, 1.13, 0.00, 0.00 }, // C { 1.42, 0.00, 0.00, 2.03 }, // E { 1.05, 1.05, 0.00, 0.00 }, // Q { 0.00, 0.00, 1.21, 1.00 }, // G { 0.50, 0.00, 0.00, 0.00 }, // H { 1.02, 1.52, 0.00, 0.00 }, // I { 1.14, 1.24, 0.00, 0.00 }, // L { 1.09, 0.00, 0.00, 1.01 }, // K { 1.37, 1.00, 0.00, 0.00 }, // M { 1.07, 1.31, 0.00, 0.00 }, // F { 0.00, 0.00, 1.21, 1.36 }, // P { 0.00, 0.00, 0.00, 1.00 }, // S { 0.00, 1.13, 0.00, 0.00 }, // T { 1.02, 1.30, 0.00, 0.00 }, // W { 0.00, 1.40, 1.00, 0.00 }, // Y { 1.00, 1.62, 0.00, 0.00 }}; // V \endverbatim * * The former and breaker values are used to find alpha-helix and beta-sheet * nucleation sites in ED4_pfold_find_nucleation_sites() and to resolve overlaps * in ED4_pfold_resolve_overlaps(). Addressing the array with the enums * #ALPHA_HELIX or #BETA_SHEET as second index gives the former values and * addressing it with #ALPHA_HELIX+2 or #BETA_SHEET+2 gives the breaker values. * The first index is for the amino acid. Use #char2AA to convert an amino acid * character to the corresponding index. * * \sa Refer to the definition in the source code for commented table. */ static double cf_parameters[20][4] = { /* Helix Former Strand Former Helix Breaker Strand Breaker Amino Value Value Value Value Acid ----------------------------------------------------------------------- */ { 1.34, 0.00, 0.00, 0.00 }, // A { 0.00, 0.00, 0.00, 0.00 }, // R { 0.50, 0.00, 0.00, 1.39 }, // D { 0.00, 0.00, 1.03, 0.00 }, // N { 0.00, 1.13, 0.00, 0.00 }, // C { 1.42, 0.00, 0.00, 2.03 }, // E { 1.05, 1.05, 0.00, 0.00 }, // Q { 0.00, 0.00, 1.21, 1.00 }, // G { 0.50, 0.00, 0.00, 0.00 }, // H { 1.02, 1.52, 0.00, 0.00 }, // I { 1.14, 1.24, 0.00, 0.00 }, // L { 1.09, 0.00, 0.00, 1.01 }, // K { 1.37, 1.00, 0.00, 0.00 }, // M { 1.07, 1.31, 0.00, 0.00 }, // F { 0.00, 0.00, 1.21, 1.36 }, // P { 0.00, 0.00, 0.00, 1.00 }, // S { 0.00, 1.13, 0.00, 0.00 }, // T { 1.02, 1.30, 0.00, 0.00 }, // W { 0.00, 1.40, 1.00, 0.00 }, // Y { 1.00, 1.62, 0.00, 0.00 } }; // V /*! \brief Normalized former values for alpha-helices, beta-sheets (= strands) * and beta-turns as well as beta-turn probabilities. * * \hideinitializer * \par Initial value: * \verbatim { // P(a) P(b) P(turn) f(i) f(i+1) f(i+2) f(i+3) Amino Acid // -------------------------------------------------------------------- { 142, 83, 66, 0.060, 0.076, 0.035, 0.058 }, // A { 98, 93, 95, 0.070, 0.106, 0.099, 0.085 }, // R { 101, 54, 146, 0.147, 0.110, 0.179, 0.081 }, // D { 67, 89, 156, 0.161, 0.083, 0.191, 0.091 }, // N { 70, 119, 119, 0.149, 0.050, 0.117, 0.128 }, // C { 151, 37, 74, 0.056, 0.060, 0.077, 0.064 }, // E { 111, 110, 98, 0.074, 0.098, 0.037, 0.098 }, // Q { 57, 75, 156, 0.102, 0.085, 0.190, 0.152 }, // G { 100, 87, 95, 0.140, 0.047, 0.093, 0.054 }, // H { 108, 160, 47, 0.043, 0.034, 0.013, 0.056 }, // I { 121, 130, 59, 0.061, 0.025, 0.036, 0.070 }, // L { 116, 74, 101, 0.055, 0.115, 0.072, 0.095 }, // K { 145, 105, 60, 0.068, 0.082, 0.014, 0.055 }, // M { 113, 138, 60, 0.059, 0.041, 0.065, 0.065 }, // F { 57, 55, 152, 0.102, 0.301, 0.034, 0.068 }, // P { 77, 75, 143, 0.120, 0.139, 0.125, 0.106 }, // S { 83, 119, 96, 0.086, 0.108, 0.065, 0.079 }, // T { 108, 137, 96, 0.077, 0.013, 0.064, 0.167 }, // W { 69, 147, 114, 0.082, 0.065, 0.114, 0.125 }, // Y { 106, 170, 50, 0.062, 0.048, 0.028, 0.053 }}; // V \endverbatim * * The normalized former values are used to find beta-turns in an amino acid * sequence in ED4_pfold_find_turns(). Addressing the array with the enums * #ALPHA_HELIX, #BETA_SHEET or #BETA_TURN as second index gives the former * values and addressing it with #BETA_TURN+i \f$(1 <= i <= 4)\f$ gives the * turn probabilities. The first index is for the amino acid. Use #char2AA to * convert an amino acid character to the corresponding index. * * \sa Refer to the definition in the source code for commented table. */ static double cf_parameters_norm[20][7] = { /* P(a) P(b) P(turn) f(i) f(i+1) f(i+2) f(i+3) Amino Acid -------------------------------------------------------------------- */ { 142, 83, 66, 0.060, 0.076, 0.035, 0.058 }, // A { 98, 93, 95, 0.070, 0.106, 0.099, 0.085 }, // R { 101, 54, 146, 0.147, 0.110, 0.179, 0.081 }, // D { 67, 89, 156, 0.161, 0.083, 0.191, 0.091 }, // N { 70, 119, 119, 0.149, 0.050, 0.117, 0.128 }, // C { 151, 37, 74, 0.056, 0.060, 0.077, 0.064 }, // E { 111, 110, 98, 0.074, 0.098, 0.037, 0.098 }, // Q { 57, 75, 156, 0.102, 0.085, 0.190, 0.152 }, // G { 100, 87, 95, 0.140, 0.047, 0.093, 0.054 }, // H { 108, 160, 47, 0.043, 0.034, 0.013, 0.056 }, // I { 121, 130, 59, 0.061, 0.025, 0.036, 0.070 }, // L { 116, 74, 101, 0.055, 0.115, 0.072, 0.095 }, // K { 145, 105, 60, 0.068, 0.082, 0.014, 0.055 }, // M { 113, 138, 60, 0.059, 0.041, 0.065, 0.065 }, // F { 57, 55, 152, 0.102, 0.301, 0.034, 0.068 }, // P { 77, 75, 143, 0.120, 0.139, 0.125, 0.106 }, // S { 83, 119, 96, 0.086, 0.108, 0.065, 0.079 }, // T { 108, 137, 96, 0.077, 0.013, 0.064, 0.167 }, // W { 69, 147, 114, 0.082, 0.065, 0.114, 0.125 }, // Y { 106, 170, 50, 0.062, 0.048, 0.028, 0.053 } }; // V // -------------------------------------------------------------------------------- /*! \brief Symmetric arithmetic rounding of a double value to an integer value. * * \param[in] d Value to be rounded * \return Rounded value * * Rounds a double value to an integer value using symmetric arithmetic rounding, * i.e. a number \f$x.y\f$ is rounded to \f$x\f$ if \f$y < 5\f$ and to \f$x+1\f$ * otherwise. */ inline int ED4_pfold_round_sym(double d) { return int(d + .5); } /*! \brief Initializes static variables. * * So far, this function only concerns #char2AA which gets initialized here. * See #char2AA for details on the values. It is called by * ED4_pfold_predict_structure() and ED4_pfold_calculate_secstruct_match(). * * \attention If any other prediction function is used alone before calling one * of the mentioned functions, this function has to be called first. */ static void ED4_pfold_init_statics() { // specify the characters used for amino acid one letter code if (!char2AA) { char2AA = new int [256]; for (int i = 0; i < 256; i++) { char2AA[i] = -1; } for (int i = 0; amino_acids[i]; i++) { char2AA[(unsigned char)amino_acids[i]] = i; } } } /*! \brief Finds nucleation sites that initiate the specified structure. * * \param[in] sequence Amino acid sequence * \param[out] structure Predicted secondary structure * \param[in] length Size of \a sequence and \a structure * \param[in] s Secondary structure type (either #ALPHA_HELIX or #BETA_SHEET) * * This function finds nucleation sites that initiate the specified structure * (alpha-helix or beta-sheet). A window of a fixed size is moved over the * sequence and former and breaker values (as defined by #cf_parameters) for * the amino acids in the window are summed up. If the former values in this * region reach a certain value and the breaker values do not exceed a certain * limit a nucleation site is formed, i.e. the region is assumed to be the * corresponding secondary structure. The result is stored in \a structure. */ static void ED4_pfold_find_nucleation_sites(const unsigned char *sequence, char *structure, int length, const PFOLD_STRUCTURE s) { #ifdef SHOW_PROGRESS cout << endl << "Searching for nucleation sites:" << endl; #endif e4_assert(s == ALPHA_HELIX || s == BETA_SHEET); // incorrect value for s e4_assert(char2AA); // char2AA not initialized; ED4_pfold_init_statics() failed or hasn't been called yet char *gap_chars = ED4_ROOT->aw_root->awar(ED4_AWAR_GAP_CHARS)->read_string(); // gap characters int windowSize = (s == ALPHA_HELIX ? 6 : 5); // window size used for finding nucleation sites double sumOfFormVal = 0, sumOfBreakVal = 0; // sum of former resp. breaker values in window int pos; // current position in sequence int count; // number of amino acids found in window for (int i = 0; i < ((length + 1) - windowSize); i++) { int aa = 0; // amino acid pos = i; for (count = 0; count < windowSize; count++) { // skip gaps while (pos < ((length + 1) - windowSize) && strchr(gap_chars, sequence[pos + count])) { pos++; } aa = char2AA[sequence[pos + count]]; if (aa == -1) break; // unknown character found // compute former and breaker values sumOfFormVal += cf_parameters[aa][s]; sumOfBreakVal += cf_parameters[aa][s+2]; } // assign sequence and save start and end of nucleation site for later extension if ((sumOfFormVal > (windowSize - 2)) && (sumOfBreakVal < 2)) { for (int j = i; j < (pos + count); j++) { if (char2AA[sequence[j]] != -1) structure[j] = structure_chars[s]; } } if (aa == -1) i = pos + count; // skip unknown character sumOfFormVal = 0, sumOfBreakVal = 0; } free(gap_chars); #ifdef SHOW_PROGRESS cout << structure << endl; #endif } /*! \brief Extends the found nucleation sites in both directions. * * \param[in] sequence Amino acid sequence * \param[out] structure Predicted secondary structure * \param[in] length Size of \a sequence and \a structure * \param[in] s Secondary structure type (either #ALPHA_HELIX or #BETA_SHEET) * * The function extends the nucleation sites found by ED4_pfold_find_nucleation_sites() * in both directions. Extension continues until a certain amino acid constellation * is found. The amino acid 'P' breaks an alpha-helix and 'P' as well as 'E' break * a beta-sheet. Also, two successive breakers or one breaker followed by an * indifferent amino acid (as defined by #structure_breaker and #structure_indifferent) * break the structure. The result is stored in \a structure. */ static void ED4_pfold_extend_nucleation_sites(const unsigned char *sequence, char *structure, int length, const PFOLD_STRUCTURE s) { #ifdef SHOW_PROGRESS cout << endl << "Extending nucleation sites:" << endl; #endif e4_assert(s == ALPHA_HELIX || s == BETA_SHEET); // incorrect value for s e4_assert(char2AA); // char2AA not initialized; ED4_pfold_init_statics() failed or hasn't been called yet bool break_structure = false; // break the current structure int start = 0, end = 0; // start and end of nucleation site int neighbour = 0; // neighbour of start or end char *gap_chars = ED4_ROOT->aw_root->awar(ED4_AWAR_GAP_CHARS)->read_string(); // gap characters // find nucleation sites and extend them in both directions (for whole sequence) for (int indStruct = 0; indStruct < length; indStruct++) { // search start and end of nucleated region while (indStruct < length && ((structure[indStruct] == ' ') || strchr(gap_chars, sequence[indStruct])) ) indStruct++; if (indStruct >= length) break; // get next amino acid that is not included in nucleation site start = indStruct - 1; while (indStruct < length && (structure[indStruct] != ' ' || strchr(gap_chars, sequence[indStruct]))) { indStruct++; } // get next amino acid that is not included in nucleation site end = indStruct; // extend nucleated region in both directions // left side: while (start > 1 && strchr(gap_chars, sequence[start])) { start--; // skip gaps } // break if no amino acid is found if (start >= 0) break_structure = (char2AA[sequence[start]] == -1); while (!break_structure && (start > 1) && (structure[start] == ' ')) { // break if absolute breaker (P or E) is found break_structure = (sequence[start] == 'P'); if (s == BETA_SHEET) break_structure |= (sequence[start] == 'E'); if (break_structure) break; // check for breaker at current position break_structure = strchr(structure_breaker[s], sequence[start]); neighbour = start - 1; // get neighbour while (neighbour > 0 && strchr(gap_chars, sequence[neighbour])) { neighbour--; // skip gaps } // break if out of bounds or no amino acid is found if (neighbour <= 0 || char2AA[sequence[neighbour]] == -1) { break; } // break if another breaker or indifferent amino acid is found break_structure &= strchr(structure_breaker[s], sequence[neighbour]) || strchr(structure_indifferent[s], sequence[neighbour]); if (!break_structure) { structure[start] = structure_chars[s]; } start = neighbour; // continue with neighbour } // right side: while (end < (length - 2) && strchr(gap_chars, sequence[end])) { end++; // skip gaps } // break if no amino acid is found if (end <= (length - 1)) break_structure = (char2AA[sequence[end]] == -1); while (!break_structure && (end < (length - 2))) { // break if absolute breaker (P or E) is found break_structure = (sequence[end] == 'P'); if (s == BETA_SHEET) break_structure |= (sequence[end] == 'E'); if (break_structure) break; // check for breaker at current position break_structure = strchr(structure_breaker[s], sequence[end]); neighbour = end + 1; // get neighbour while (neighbour < (length - 2) && strchr(gap_chars, sequence[neighbour])) { neighbour++; // skip gaps } // break if out of bounds or no amino acid is found if (neighbour >= (length - 1) || char2AA[sequence[neighbour]] == -1) { end = neighbour; break; } // break if another breaker or indifferent amino acid is found break_structure &= strchr(structure_breaker[s], sequence[neighbour]) || strchr(structure_indifferent[s], sequence[neighbour]); if (!break_structure) { structure[end] = structure_chars[s]; } end = neighbour; // continue with neighbour } indStruct = end; // continue with end } free(gap_chars); #ifdef SHOW_PROGRESS cout << structure << endl; #endif } /*! \brief Predicts beta-turns from the given amino acid sequence * * \param[in] sequence Amino acid sequence * \param[out] structure Predicted secondary structure * \param[in] length Size of \a sequence and \a structure * * A window of a fixed size is moved over the sequence and former values for * alpha-helices, beta-sheets and beta-turns are summed up. In addition, * beta-turn probabilities are multiplied. The values are specified in * #cf_parameters_norm. If the former values for beta-turn are greater than * the ones for alpha-helix and beta-sheet and the turn probabilities * exceed a certain limit the region is assumed to be a beta-turn. The result * is stored in \a structure. */ static void ED4_pfold_find_turns(const unsigned char *sequence, char *structure, int length) { #ifdef SHOW_PROGRESS cout << endl << "Searching for beta-turns: " << endl; #endif e4_assert(char2AA); // char2AA not initialized; ED4_pfold_init_statics() failed or hasn't been called yet char *gap_chars = ED4_ROOT->aw_root->awar(ED4_AWAR_GAP_CHARS)->read_string(); // gap characters const int windowSize = 4; // window size double P_a = 0, P_b = 0, P_turn = 0; // former values for helix, sheet and beta-turn double p_t = 1; // probability for beta-turn int pos; // position in sequence int count; // position in window int aa; // amino acid for (int i = 0; i < ((length + 1) - windowSize); i++) { pos = i; for (count = 0; count < windowSize; count++) { // skip gaps while (pos < ((length + 1) - windowSize) && strchr(gap_chars, sequence[pos + count])) { pos++; } aa = char2AA[sequence[pos + count]]; if (aa == -1) break; // unknown character found // compute former values and turn probability P_a += cf_parameters_norm[aa][0]; P_b += cf_parameters_norm[aa][1]; P_turn += cf_parameters_norm[aa][2]; p_t *= cf_parameters_norm[aa][3 + count]; } if (count != 0) { P_a /= count; P_b /= count; P_turn /= count; if ((p_t > 0.000075) && (P_turn > 100) && (P_turn > P_a) && (P_turn > P_b)) { for (int j = i; j < (pos + count); j++) { if (char2AA[sequence[j]] != -1) structure[j] = structure_chars[BETA_TURN]; } } } if (aa == -1) i = pos + count; // skip unknown character p_t = 1, P_a = 0, P_b = 0, P_turn = 0; } free(gap_chars); #ifdef SHOW_PROGRESS cout << structure << endl; #endif } /*! \brief Resolves overlaps of predicted secondary structures and creates structure summary. * * \param[in] sequence Amino acid sequence * \param[in,out] structures Predicted secondary structures (#ALPHA_HELIX, #BETA_SHEET, * #BETA_TURN and #STRUCTURE_SUMMARY, in this order) * \param[in] length Size of \a sequence and \a structures[i] * * The function takes the given predicted structures (alpha-helix, beta-sheet * and beta-turn) and searches for overlapping regions. If a beta-turn is found * the structure summary is assumed to be a beta-turn. For overlapping alpha-helices * and beta-sheets the former values are summed up for this region and the * structure summary is assumed to be the structure type with the higher former * value. The result is stored in \a structures[3] (= \a structures[#STRUCTURE_SUMMARY]). * * \attention I couldn't find a standard procedure for resolving overlaps and * there might be other (possibly better) ways to do that. */ static void ED4_pfold_resolve_overlaps(const unsigned char *sequence, char *structures[4], int length) { #ifdef SHOW_PROGRESS cout << endl << "Resolving overlaps: " << endl; #endif e4_assert(char2AA); // char2AA not initialized; ED4_pfold_init_statics() failed or hasn't been called yet int start = -1; // start of overlap int end = -1; // end of overlap double P_a = 0; // sum of former values for alpha-helix in overlapping regions double P_b = 0; // sum of former values for beta-sheet in overlapping regions PFOLD_STRUCTURE s; // structure with the highest former values char *gap_chars = ED4_ROOT->aw_root->awar(ED4_AWAR_GAP_CHARS)->read_string(); // gap characters // scan structures for overlaps for (int pos = 0; pos < length; pos++) { // if beta-turn is found at position pos -> summary is beta-turn if (structures[BETA_TURN][pos] != ' ') { structures[STRUCTURE_SUMMARY][pos] = structure_chars[BETA_TURN]; // if helix and sheet are overlapping and no beta-turn is found -> check which structure has the highest sum of former values } else if ((structures[ALPHA_HELIX][pos] != ' ') && (structures[BETA_SHEET][pos] != ' ')) { // search start and end of overlap (as long as no beta-turn is found) start = pos; end = pos; while (structures[ALPHA_HELIX][end] != ' ' && structures[BETA_SHEET][end] != ' ' && structures[BETA_TURN][end] == ' ') { end++; } // calculate P_a and P_b for overlap for (int i = start; i < end; i++) { // skip gaps while (i < end && strchr(gap_chars, sequence[i])) { i++; } int aa = char2AA[sequence[i]]; if (aa != -1) { P_a += cf_parameters[aa][ALPHA_HELIX]; P_b += cf_parameters[aa][BETA_SHEET]; } } // check which structure is more likely and set s appropriately s = (P_a > P_b) ? ALPHA_HELIX : BETA_SHEET; // set structure for overlapping region for (int i = start; i < end; i++) { structures[STRUCTURE_SUMMARY][i] = structure_chars[s]; } // set variables for next pass of loop resp. end of loop P_a = 0, P_b = 0; pos = end - 1; // if helix and sheet are not overlapping and no beta-turn is found -> set structure accordingly } else { // summary at position pos is helix resp. sheet if (structures[ALPHA_HELIX][pos] != ' ') { structures[STRUCTURE_SUMMARY][pos] = structure_chars[ALPHA_HELIX]; } else if (structures[BETA_SHEET][pos] != ' ') { structures[STRUCTURE_SUMMARY][pos] = structure_chars[BETA_SHEET]; } } } free(gap_chars); #ifdef SHOW_PROGRESS cout << structures[summary] << endl; #endif } /*! \brief Predicts protein secondary structures from the amino acid sequence. * * \param[in] sequence Amino acid sequence * \param[out] structures Predicted secondary structures (#ALPHA_HELIX, #BETA_SHEET, * #BETA_TURN and #STRUCTURE_SUMMARY, in this order) * \param[in] length Size of \a sequence and \a structures[i] * \return Error description, if an error occurred; 0 otherwise * * This function predicts the protein secondary structures from the amino acid * sequence according to the Chou-Fasman algorithm. In a first step, nucleation sites * for alpha-helices and beta-sheets are found using ED4_pfold_find_nucleation_sites(). * In a next step, the found structures are extended obeying certain rules with * ED4_pfold_extend_nucleation_sites(). Beta-turns are found with the function * ED4_pfold_find_turns(). In a final step, overlapping regions are identified and * resolved to create a structure summary with ED4_pfold_resolve_overlaps(). * The results are written to \a structures[i] and can be accessed via the enums * #ALPHA_HELIX, #BETA_SHEET, #BETA_TURN and #STRUCTURE_SUMMARY. */ static GB_ERROR ED4_pfold_predict_structure(const unsigned char *sequence, char *structures[4], int length) { #ifdef SHOW_PROGRESS cout << endl << "Predicting secondary structure for sequence:" << endl << sequence << endl; #endif GB_ERROR error = NULp; e4_assert((int)strlen((const char *)sequence) == length); // init memory ED4_pfold_init_statics(); e4_assert(char2AA); // predict structure ED4_pfold_find_nucleation_sites(sequence, structures[ALPHA_HELIX], length, ALPHA_HELIX); ED4_pfold_find_nucleation_sites(sequence, structures[BETA_SHEET], length, BETA_SHEET); ED4_pfold_extend_nucleation_sites(sequence, structures[ALPHA_HELIX], length, ALPHA_HELIX); ED4_pfold_extend_nucleation_sites(sequence, structures[BETA_SHEET], length, BETA_SHEET); ED4_pfold_find_turns(sequence, structures[BETA_TURN], length); ED4_pfold_resolve_overlaps(sequence, structures, length); return error; } GB_ERROR ED4_pfold_calculate_secstruct_match(const unsigned char *structure_sai, const unsigned char *structure_cmp, const int start, const int end, char *result_buffer, PFOLD_MATCH_METHOD match_method) { GB_ERROR error = NULp; e4_assert(structure_sai); e4_assert(structure_cmp); e4_assert(start >= 0); e4_assert(result_buffer); e4_assert(match_method >= 0 && match_method < PFOLD_MATCH_METHOD_COUNT); ED4_pfold_init_statics(); e4_assert(char2AA); e4_assert(end >= start); size_t end_minus_start = size_t(end-start); // @@@ use this size_t length = strlen((const char *)structure_sai); size_t match_end = std::min(std::min(end_minus_start, length), strlen((const char *)structure_cmp)); enum { BEND = 3, NOSTRUCT = 4 }; char *struct_chars[] = { ARB_strdup("HGI"), // helical structures (enum ALPHA_HELIX) ARB_strdup("EB"), // sheet-like structures (enum BETA_SHEET) ARB_strdup("T"), // beta-turn (enum BETA_TURN) ARB_strdup("S"), // bends (enum BEND) ARB_strdup("") // no structure (enum NOSTRUCT) }; // init awars AW_root *awr = ED4_ROOT->aw_root; char *gap_chars = awr->awar(ED4_AWAR_GAP_CHARS)->read_string(); char *pairs[PFOLD_MATCH_TYPE_COUNT] = { NULp }; char *pair_chars[PFOLD_MATCH_TYPE_COUNT] = { NULp }; char *pair_chars_2 = awr->awar(PFOLD_AWAR_SYMBOL_TEMPLATE_2)->read_string(); char awar[256]; for (int i = 0; pfold_match_type_awars[i].name; i++) { sprintf(awar, PFOLD_AWAR_PAIR_TEMPLATE, pfold_match_type_awars[i].name); pairs[i] = awr->awar(awar)->read_string(); sprintf(awar, PFOLD_AWAR_SYMBOL_TEMPLATE, pfold_match_type_awars[i].name); pair_chars[i] = awr->awar(awar)->read_string(); } int struct_start = start; int struct_end = start; size_t count = 0; int current_struct = 4; int aa = -1; double prob = 0; // TODO: move this check to callback for the corresponding field? if (strlen(pair_chars_2) != 10) { error = GB_export_error("You have to define 10 match symbols."); } if (!error) { switch (match_method) { case SECSTRUCT_SECSTRUCT: // TODO: one could try to find out, if structure_cmp is really a secondary structure and not a sequence (define awar for allowed symbols in secondary structure) for (count = 0; count < match_end; count++) { result_buffer[count] = *pair_chars[STRUCT_UNKNOWN]; for (int n_pt = 0; n_pt < PFOLD_MATCH_TYPE_COUNT; n_pt++) { int len = strlen(pairs[n_pt])-1; char *p = pairs[n_pt]; for (int j = 0; j < len; j += 3) { if ((p[j] == structure_sai[count + start] && p[j+1] == structure_cmp[count + start]) || (p[j] == structure_cmp[count + start] && p[j+1] == structure_sai[count + start])) { result_buffer[count] = *pair_chars[n_pt]; n_pt = PFOLD_MATCH_TYPE_COUNT; // stop searching the pair types break; // stop searching the pairs array } } } } // fill the remaining buffer with spaces while (count <= end_minus_start) { // LOOP_VECTORIZED[!<720] result_buffer[count] = ' '; count++; } break; case SECSTRUCT_SEQUENCE: // clear result buffer for (size_t i = 0; i <= end_minus_start; i++) result_buffer[i] = ' '; // LOOP_VECTORIZED[!<720] // skip gaps while (structure_sai[struct_start] != '\0' && structure_cmp[struct_start] != '\0' && strchr(gap_chars, structure_sai[struct_start]) && strchr(gap_chars, structure_cmp[struct_start])) { struct_start++; } if (structure_sai[struct_start] == '\0' || structure_cmp[struct_start] == '\0') break; // check structure at the first displayed position and find out where it starts for (current_struct = 0; current_struct < 4 && !strchr(struct_chars[current_struct], structure_sai[struct_start]); current_struct++) { ; } if (current_struct != BEND && current_struct != NOSTRUCT) { struct_start--; // check structure left of start while (struct_start >= 0) { // skip gaps while (struct_start > 0 && strchr(gap_chars, structure_sai[struct_start]) && strchr(gap_chars, structure_cmp[struct_start])) { struct_start--; } aa = char2AA[structure_cmp[struct_start]]; if (struct_start == 0 && aa == -1) { // nothing was found break; } else if (strchr(struct_chars[current_struct], structure_sai[struct_start]) && aa != -1) { prob += cf_former(aa, current_struct) - cf_breaker(aa, current_struct); // sum up probabilities struct_start--; count++; } else { break; } } } // parse structures struct_start = start; // skip gaps while (structure_sai[struct_start] != '\0' && structure_cmp[struct_start] != '\0' && strchr(gap_chars, structure_sai[struct_start]) && strchr(gap_chars, structure_cmp[struct_start])) { struct_start++; } if (structure_sai[struct_start] == '\0' || structure_cmp[struct_start] == '\0') break; struct_end = struct_start; while (struct_end < end) { aa = char2AA[structure_cmp[struct_end]]; if (current_struct == NOSTRUCT) { // no structure found -> move on struct_end++; } else if (aa == -1) { // structure found but no corresponding amino acid -> doesn't fit at all result_buffer[struct_end - start] = pair_chars_2[0]; struct_end++; } else if (current_struct == BEND) { // bend found -> fits perfectly everywhere result_buffer[struct_end - start] = pair_chars_2[9]; struct_end++; } else { // helix, sheet or beta-turn found -> while structure doesn't change: sum up probabilities while (structure_sai[struct_end] != '\0') { // skip gaps while (strchr(gap_chars, structure_sai[struct_end]) && strchr(gap_chars, structure_cmp[struct_end]) && structure_sai[struct_end] != '\0' && structure_cmp[struct_end] != '\0') { struct_end++; } aa = char2AA[structure_cmp[struct_end]]; if (structure_sai[struct_end] != '\0' && structure_cmp[struct_end] != '\0' && strchr(struct_chars[current_struct], structure_sai[struct_end]) && aa != -1) { prob += cf_former(aa, current_struct) - cf_breaker(aa, current_struct); // sum up probabilities struct_end++; count++; } else { break; } } if (count != 0) { // compute average and normalize probability prob /= count; prob = (prob + max_breaker_value[current_struct] - min_former_value[current_struct]) / (max_breaker_value[current_struct] + max_former_value[current_struct] - min_former_value[current_struct]); #if 0 // code w/o effect // map to match characters and store in result_buffer int prob_normalized = ED4_pfold_round_sym(prob * 9); // e4_assert(prob_normalized >= 0 && prob_normalized <= 9); // if this happens check if normalization is correct or some undefined characters mess everything up char prob_symbol = *pair_chars[STRUCT_UNKNOWN]; if (prob_normalized >= 0 && prob_normalized <= 9) { prob_symbol = pair_chars_2[prob_normalized]; } #endif } } // find next structure type if (structure_sai[struct_end] == '\0' || structure_cmp[struct_end] == '\0') { break; } else { prob = 0; count = 0; struct_start = struct_end; for (current_struct = 0; current_struct < 4 && !strchr(struct_chars[current_struct], structure_sai[struct_start]); current_struct++) { ; } } } break; case SECSTRUCT_SEQUENCE_PREDICT: // predict structures from structure_cmp and compare with structure_sai char *structures[4]; for (int i = 0; i < 4 && !error; i++) { structures[i] = new char [length + 1]; if (!structures[i]) { error = "Out of memory"; } else { for (size_t j = 0; j < length; j++) { // LOOP_VECTORIZED[!<810] structures[i][j] = ' '; } structures[i][length] = '\0'; } } if (!error) error = ED4_pfold_predict_structure(structure_cmp, structures, length); if (!error) { for (count = 0; count < match_end; count++) { result_buffer[count] = *pair_chars[STRUCT_UNKNOWN]; if (!strchr(gap_chars, structure_sai[count + start]) && strchr(gap_chars, structure_cmp[count + start])) { result_buffer[count] = *pair_chars[STRUCT_NO_MATCH]; } else if (strchr(gap_chars, structure_sai[count + start]) || (structures[ALPHA_HELIX][count + start] == ' ' && structures[BETA_SHEET][count + start] == ' ' && structures[BETA_TURN][count + start] == ' ')) { result_buffer[count] = *pair_chars[STRUCT_PERFECT_MATCH]; } else { // search for good match first // if found: stop searching // otherwise: continue searching for a less good match for (int n_pt = 0; n_pt < PFOLD_MATCH_TYPE_COUNT; n_pt++) { int len = strlen(pairs[n_pt])-1; char *p = pairs[n_pt]; for (int n_struct = 0; n_struct < 3; n_struct++) { for (int j = 0; j < len; j += 3) { if ((p[j] == structures[n_struct][count + start] && p[j+1] == structure_sai[count + start]) || (p[j] == structure_sai[count + start] && p[j+1] == structures[n_struct][count + start])) { result_buffer[count] = *pair_chars[n_pt]; n_struct = 3; // stop searching the structures n_pt = PFOLD_MATCH_TYPE_COUNT; // stop searching the pair types break; // stop searching the pairs array } } } } } } // fill the remaining buffer with spaces while (count <= end_minus_start) { // LOOP_VECTORIZED[!<720] result_buffer[count] = ' '; count++; } } // free buffer for (int i = 0; i < 4; i++) { if (structures[i]) { delete structures[i]; structures[i] = NULp; } } break; default: e4_assert(0); // function called with invalid argument for 'match_method' break; } } free(gap_chars); free(pair_chars_2); for (int i = 0; pfold_match_type_awars[i].name; i++) { free(pairs[i]); free(pair_chars[i]); } if (error) { // clear result buffer for (size_t i = 0; i <= end_minus_start; i++) result_buffer[i] = ' '; // LOOP_VECTORIZED[!<720] } return error; } GB_ERROR ED4_pfold_set_SAI(char **protstruct, GBDATA *gb_main, const char *alignment_name, long *protstruct_len) { // @@@ use references for protstruct + protstruct_len GB_ERROR error = NULp; GB_transaction ta(gb_main); AW_root *aw_root = ED4_ROOT->aw_root; char *SAI_name = aw_root->awar(PFOLD_AWAR_SELECTED_SAI)->read_string(); GBDATA *gb_protstruct = GBT_find_SAI(gb_main, SAI_name); freenull(*protstruct); if (gb_protstruct) { GBDATA *gb_data = GBT_find_sequence(gb_protstruct, alignment_name); if (gb_data) *protstruct = GB_read_string(gb_data); // @@@ NOT_ALL_SAI_HAVE_DATA } if (*protstruct) { if (protstruct_len) *protstruct_len = (long)strlen(*protstruct); } else { if (protstruct_len) protstruct_len = NULp; if (aw_root->awar(PFOLD_AWAR_ENABLE)->read_int()) { error = GBS_global_string("SAI \"%s\" does not exist.\nDisabled protein structure display!", SAI_name); aw_root->awar(PFOLD_AWAR_ENABLE)->write_int(0); } } free(SAI_name); return error; } /*! \brief Callback function to select the reference protein structure SAI and to * update the SAI option menu. * * \param[in] aww The calling window * \param[in,out] oms The SAI option menu * \param[in] set_sai Specifies if SAI should be updated * * The function is called whenever the selected SAI or the SAI filter is changed * in the "Protein Match Settings" dialog (see ED4_pfold_create_props_window()). * It can be called with \a set_sai defined to update the reference protein secondary * structure SAI in the editor via ED4_pfold_set_SAI() and to update the selection in * the SAI option menu. If \a set_sai is 0 only the option menu is updated. This is * necessary if only the SAI filter changed but not the selected SAI. */ static void ED4_pfold_select_SAI_and_update_option_menu(AW_window *aww, AW_option_menu_struct *oms, bool set_sai) { e4_assert(aww); e4_assert(oms); char *selected_sai = ED4_ROOT->aw_root->awar(PFOLD_AWAR_SELECTED_SAI)->read_string(); char *sai_filter = ED4_ROOT->aw_root->awar(PFOLD_AWAR_SAI_FILTER)->read_string(); GBDATA *gb_main = ED4_ROOT->get_gb_main(); if (set_sai) { const char *err = ED4_pfold_set_SAI(&ED4_ROOT->protstruct, gb_main, ED4_ROOT->get_alignment_name(), &ED4_ROOT->protstruct_len); if (err) aw_message(err); } aww->clear_option_menu(oms); aww->insert_default_option(selected_sai, "", selected_sai); GB_transaction ta(gb_main); for (GBDATA *sai = GBT_first_SAI(gb_main); sai; sai = GBT_next_SAI(sai)) { const char *sai_name = GBT_get_name_or_description(sai); if (strcmp(sai_name, selected_sai) != 0 && strstr(sai_name, sai_filter)) { aww->callback(makeWindowCallback(ED4_pfold_select_SAI_and_update_option_menu, oms, true)); // @@@ used as OPTIONMENU_SELECT_CB (see #559) aww->insert_option(sai_name, "", sai_name); } } free(selected_sai); free(sai_filter); aww->update_option_menu(); // ED4_expose_all_windows(); // @@@ need update here ? } static void setup_pfold_config(AWT_config_definition& cdef) { cdef.add(PFOLD_AWAR_ENABLE, "enable"); cdef.add(PFOLD_AWAR_SELECTED_SAI, "sai"); cdef.add(PFOLD_AWAR_SAI_FILTER, "saifilter"); cdef.add(PFOLD_AWAR_MATCH_METHOD, "matchmethod"); cdef.add(PFOLD_AWAR_SYMBOL_TEMPLATE_2, "matchsymbols2"); char awar[256]; for (int i = 0; pfold_match_type_awars[i].name; i++) { const char *name = pfold_match_type_awars[i].name; sprintf(awar, PFOLD_AWAR_PAIR_TEMPLATE, name); cdef.add(awar, name); sprintf(awar, PFOLD_AWAR_SYMBOL_TEMPLATE, name); cdef.add(awar, GBS_global_string("%s_symbol", name)); } } AW_window *ED4_pfold_create_props_window(AW_root *awr, const WindowCallback *refreshCallback) { AW_window_simple *aws = new AW_window_simple; aws->init(awr, "PFOLD_PROPS", "PROTEIN_MATCH_SETTINGS"); // create close button aws->at(10, 10); aws->auto_space(5, 2); aws->callback(AW_POPDOWN); aws->create_button("CLOSE", "CLOSE", "C"); // create help button aws->callback(makeHelpCallback("pfold_props.hlp")); aws->create_button("HELP", "HELP"); AWT_insert_config_manager(aws, AW_ROOT_DEFAULT, "pfold", makeConfigSetupCallback(setup_pfold_config)); aws->at_newline(); aws->label_length(27); int ex = 0, ey = 0; char awar[256]; // create toggle field for showing the protein structure match aws->label("Show protein structure match?"); aws->callback(*refreshCallback); // @@@ used as TOGGLE_CLICK_CB (see #559) aws->create_toggle(PFOLD_AWAR_ENABLE); aws->at_newline(); // create SAI option menu aws->label_length(30); aws->label("Selected Protein Structure SAI"); AW_option_menu_struct *oms_sai = aws->create_option_menu(PFOLD_AWAR_SELECTED_SAI); ED4_pfold_select_SAI_and_update_option_menu(aws, oms_sai, false); aws->at_newline(); aws->label("-> Filter SAI names for"); aws->callback(makeWindowCallback(ED4_pfold_select_SAI_and_update_option_menu, oms_sai, false)); // @@@ used as INPUTFIELD_CB (see #559) aws->create_input_field(PFOLD_AWAR_SAI_FILTER, 10); aws->at_newline(); // create match method option menu PFOLD_MATCH_METHOD match_method = (PFOLD_MATCH_METHOD) ED4_ROOT->aw_root->awar(PFOLD_AWAR_MATCH_METHOD)->read_int(); aws->label_length(12); aws->label("Match Method"); aws->create_option_menu(PFOLD_AWAR_MATCH_METHOD); for (int i = 0; const char *mm_aw = pfold_match_method_awars[i].name; i++) { aws->callback(*refreshCallback); // @@@ used as OPTIONMENU_SELECT_CB (see #559) if (match_method == pfold_match_method_awars[i].value) { aws->insert_default_option(mm_aw, "", match_method); } else { aws->insert_option(mm_aw, "", pfold_match_method_awars[i].value); } } aws->update_option_menu(); aws->at_newline(); // create match symbols and/or match types input fields // TODO: show only fields that are relevant for current match method -> bind to callback function? aws->label_length(40); aws->label("Match Symbols (Range 0-100% in steps of 10%)"); aws->callback(*refreshCallback); // @@@ used as INPUTFIELD_CB (see #559) aws->create_input_field(PFOLD_AWAR_SYMBOL_TEMPLATE_2, 10); aws->at_newline(); for (int i = 0; pfold_match_type_awars[i].name; i++) { aws->label_length(12); sprintf(awar, PFOLD_AWAR_PAIR_TEMPLATE, pfold_match_type_awars[i].name); aws->label(pfold_match_type_awars[i].name); aws->callback(*refreshCallback); // @@@ used as INPUTFIELD_CB (see #559) aws->create_input_field(awar, 30); // TODO: is it possible to disable input field for STRUCT_UNKNOWN? // if (pfold_match_type_awars[i].value == STRUCT_UNKNOWN) if (!i) aws->get_at_position(&ex, &ey); sprintf(awar, PFOLD_AWAR_SYMBOL_TEMPLATE, pfold_match_type_awars[i].name); aws->callback(*refreshCallback); // @@@ used as INPUTFIELD_CB (see #559) aws->create_input_field(awar, 3); aws->at_newline(); } aws->window_fit(); return aws; }