// ---------------------------------------------------------------------------
// Helpers that copy species / sequence data into an NA_Alignment and give
// element-wise access to NA_Sequence buffers.
//
// NOTE(review): this text appears to have lost its line breaks and every
// angle-bracket span: the include directives below name no header, the
// ARB_calloc calls carry no template argument and several loop conditions
// are fused into a single identifier. Restore from a pristine copy before
// building - TODO confirm.
//
// Arbdb_get_curelem(dataset)
//   Post-increments dataset.numelements and returns the previous value as the
//   index of the new element. When that index is 0, maxnumelements is set to
//   5 and dataset.element is set up via ARB_alloc. When the index equals
//   maxnumelements, maxnumelements is doubled and ARB_realloc is called.
//
// set_constant_fields(this_elem)
//   Stores the same values into every element: attr = DEFAULT_X_ATTR,
//   comments = ARB_strdup of a fixed placeholder text, comments_len = its
//   strlen, comments_maxlen = comments_len + 1, rmatrix and tmatrix = NULp,
//   col_lut = Default_PROColor_LKUP.
//
// AppendNA_and_free(this_elem, sequfilt)
//   Hands sequfilt together with its strlen to AppendNA for this_elem, then
//   calls freenull on the pointer, which is taken by reference (presumably
//   this frees the buffer and resets the caller pointer - confirm).
//
// InsertDatainGDE(dataset, the_species, the_names, the_sequences,
//                 numberspecies, maxalignlen, filter, compress,
//                 cutoff_stop_codon, typeinfo)
//   * Asserts contradicted(the_species, the_names) (presumably exactly one
//     of both has to be given - confirm).
//   * Without a filter a new AP_filter(maxalignlen) is allocated and used.
//     With a filter shorter than maxalignlen a warning is shown via
//     aw_message and maxalignlen is reduced to the filter length.
//   * Continues only while filter->is_invalid() reports no error.
//   * COMPRESS_ALL: a character is copied into sequfilt only if the filter
//     uses its column and GAP::is_std_gap is false for it.
//   * COMPRESS_VERTICAL_GAPS / COMPRESS_NONINFO_COLUMNS: builds a 256-entry
//     isInfo table in which the minus and dot characters are non-informative;
//     for COMPRESS_NONINFO_COLUMNS additionally N and n (GB_AT_RNA and
//     GB_AT_DNA) or X and x (GB_AT_AA). Filter positions whose column is not
//     wanted are set to the zero digit in the filter string; if anything
//     changed, allocatedFilter is deleted and replaced by an AP_filter built
//     from that string. The filter is then re-validated.
//   * Otherwise characters at used filter positions are copied through the
//     simplify table.
//   * The final filter string is written to AWAR_GDE_EXPORT_FILTER inside a
//     GB_transaction on db_access.gb_main.
//   * typeinfo: BASIC_TYPEINFO leaves elementtype at TEXT (InitNASeq is
//     called with RNA); DETAILED_TYPEINFO maps the alignment type to RNA,
//     DNA or PROTEIN and uses that value for both.
//   * One element is created per entry of the_species (fields name, author,
//     full_name and acc are copied with strcpy_truncate) or per entry of
//     the_names (only short_name is filled, the other text fields are
//     emptied). Each entry for which AWTC_name_quality is non-zero is counted
//     and reported in a single aw_message afterwards. Every loop iteration
//     calls progress.inc_and_check_user_abort(error).
//   * The tail of this function is not readable in this text.
//
// Fragment following InsertDatainGDE (function header missing in this text)
//   Allocates the_sequences with numberspecies+1 entries, gives each species
//   a buffer of maxalignlen+1 bytes filled with dots, copies at most
//   maxalignlen characters of the sequence data into it and forwards
//   everything to InsertDatainGDE.
//
// Fragment ending in return a->sequence[b-a->offset] (header missing)
//   Returns -1 if a->seqlen is 0. In the branch guarded by the (garbled)
//   range test it returns 0 for DNA and RNA, the tilde character for PROTEIN
//   and TEXT, the zero digit for MASK and the minus character otherwise.
//
// putelem(a, b, c)
//   * b >= a->offset + a->seqmaxlen: only emits a Warning, nothing is stored.
//   * b >= a->offset: stores c at a->sequence[b - a->offset].
//   * otherwise: allocates a buffer of a->seqmaxlen + a->offset - b entries,
//     fills the new leading part with a type dependent value (zero digit for
//     MASK, NUL byte for DNA and RNA, minus for PROTEIN, blank for TEXT and
//     default), copies the old content behind it, frees the old buffer,
//     enlarges seqlen and seqmaxlen by a->offset - b, sets a->offset to b and
//     stores c at index 0. The loop bounds are garbled in this text;
//     presumably they run up to a->offset and a->seqmaxlen - confirm.
// ---------------------------------------------------------------------------
#include "GDE_proto.h" #include #include #include #include #include #include #include // AISC_MKPT_PROMOTE:#ifndef GDE_EXTGLOB_H // AISC_MKPT_PROMOTE:#include "GDE_extglob.h" // AISC_MKPT_PROMOTE:#endif typedef unsigned int UINT; int Arbdb_get_curelem(NA_Alignment& dataset) { int curelem = dataset.numelements++; if (curelem == 0) { dataset.maxnumelements = 5; ARB_alloc(dataset.element, dataset.maxnumelements); } else if (curelem == dataset.maxnumelements) { dataset.maxnumelements *= 2; ARB_realloc(dataset.element, dataset.maxnumelements); } return curelem; } static void set_constant_fields(NA_Sequence *this_elem) { this_elem->attr = DEFAULT_X_ATTR; this_elem->comments = ARB_strdup("no comments"); this_elem->comments_maxlen = 1 + (this_elem->comments_len = strlen(this_elem->comments)); this_elem->rmatrix = NULp; this_elem->tmatrix = NULp; this_elem->col_lut = Default_PROColor_LKUP; } static void AppendNA_and_free(NA_Sequence *this_elem, uchar *& sequfilt) { AppendNA((NA_Base *)sequfilt, strlen((const char *)sequfilt), this_elem); freenull(sequfilt); } __ATTR__USERESULT static int InsertDatainGDE(NA_Alignment& dataset, GBDATA **the_species, unsigned char **the_names, unsigned char **the_sequences, unsigned long numberspecies, unsigned long maxalignlen, const AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo) { GBDATA *gb_species; NA_Sequence *this_elem; AP_filter *allocatedFilter = NULp; gde_assert(contradicted(the_species, the_names)); if (!filter) { allocatedFilter = new AP_filter(maxalignlen); filter = allocatedFilter; } else { size_t fl = filter->get_length(); if (fl < maxalignlen) { aw_message("Warning: Your filter is shorter than the alignment len"); maxalignlen = fl; } } GB_ERROR error = filter->is_invalid(); if (!error) { size_t *seqlen = ARB_calloc(numberspecies); // sequences may have different length { unsigned long i; for (i=0; i(numberspecies+1); GB_alignment_type alitype = GBT_get_alignment_type(dataset.gb_main, 
dataset.alignment_name); gde_assert(alitype != GB_AT_UNKNOWN); if (compress==COMPRESS_ALL) { // compress all gaps and filter positions long len = filter->get_filtered_length(); unsigned long i; for (i=0; iuse_position(col) && !GAP::is_std_gap(c)) { sequfilt[i][newcount++] = c; } } } } else { if (compress==COMPRESS_VERTICAL_GAPS || // compress vertical gaps (and '.') compress == COMPRESS_NONINFO_COLUMNS) // and additionally all columns containing no info (only N or X) { size_t i; bool isInfo[256]; for (i=0; i<256; i++) isInfo[i] = true; isInfo[UINT('-')] = false; isInfo[UINT('.')] = false; if (compress == COMPRESS_NONINFO_COLUMNS) { switch (alitype) { case GB_AT_RNA: case GB_AT_DNA: isInfo[UINT('N')] = false; isInfo[UINT('n')] = false; break; case GB_AT_AA: isInfo[UINT('X')] = false; isInfo[UINT('x')] = false; break; default: gde_assert(0); break; } } bool modified = false; char *filterString = filter->to_string(); for (i=0; iuse_position(i)) { bool wantColumn = false; for (size_t n=0; n take column break; } } } if (!wantColumn) { filterString[i] = '0'; modified = true; } } } if (modified) { size_t len = filter->get_length(); delete allocatedFilter; filter = allocatedFilter = new AP_filter(filterString, NULp, len); } free(filterString); } if (!error) error = filter->is_invalid(); if (!error) { long len = filter->get_filtered_length(); size_t i; for (i=0; iget_simplify_table(); for (size_t col=0; (coluse_position(col)) { sequfilt[i][newcount++] = simplify[c]; } } } } } free(seqlen); if (!error) { GB_transaction ta(db_access.gb_main); char *str = filter->to_string(); error = GBT_write_string(db_access.gb_main, AWAR_GDE_EXPORT_FILTER, str); free(str); } if (!error) { long number = 0; int curelem; int bad_names = 0; int elementtype = TEXT; int elementtype_init = RNA; switch (typeinfo) { case UNKNOWN_TYPEINFO: gde_assert(0); case BASIC_TYPEINFO: break; case DETAILED_TYPEINFO: switch (alitype) { case GB_AT_RNA: elementtype = RNA; break; case GB_AT_DNA: elementtype = DNA; 
break; case GB_AT_AA: elementtype = PROTEIN; break; default : gde_assert(0); break; } gde_assert(elementtype != TEXT); elementtype_init = elementtype; break; } if (!error) { arb_progress progress("Read data from DB", numberspecies); if (the_species) { for (gb_species = the_species[number]; gb_species && !error; gb_species = the_species[++number]) { curelem = Arbdb_get_curelem(dataset); this_elem = &(dataset.element[curelem]); InitNASeq(this_elem, elementtype_init); this_elem->gb_species = gb_species; #define GET_FIELD_CONTENT(fieldname,buffer,bufsize) do { \ gbd = GB_entry(gb_species, fieldname); \ if (gbd) { \ const char *val = GB_read_char_pntr(gbd); \ strcpy_truncate(buffer, val, bufsize); \ } \ else buffer[0] = 0; \ } while(0) GBDATA *gbd; GET_FIELD_CONTENT("name", this_elem->short_name, SIZE_SHORT_NAME); GET_FIELD_CONTENT("author", this_elem->authority, SIZE_AUTHORITY); GET_FIELD_CONTENT("full_name", this_elem->seq_name, SIZE_SEQ_NAME); GET_FIELD_CONTENT("acc", this_elem->id, SIZE_ID); this_elem->elementtype = elementtype; if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++; AppendNA_and_free(this_elem, sequfilt[number]); set_constant_fields(this_elem); progress.inc_and_check_user_abort(error); } } else { // use the_names unsigned char *species_name; for (species_name=the_names[number]; species_name && !error; species_name=the_names[++number]) { curelem = Arbdb_get_curelem(dataset); this_elem = &(dataset.element[curelem]); InitNASeq(this_elem, elementtype_init); this_elem->gb_species = NULp; strcpy_truncate(this_elem->short_name, (char*)species_name, SIZE_SHORT_NAME); this_elem->authority[0] = 0; this_elem->seq_name[0] = 0; this_elem->id[0] = 0; this_elem->elementtype = elementtype; if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++; AppendNA_and_free(this_elem, sequfilt[number]); set_constant_fields(this_elem); progress.inc_and_check_user_abort(error); } } } if (!error) { if (bad_names) { aw_message(GBS_global_string("Problematic names 
found: %i\n" "External program call may fail or produce invalid results.\n" "You might want to use 'Species/Synchronize IDs' and read the associated help.", bad_names)); } { unsigned long i; for (i=0; i0); char **the_sequences; ARB_calloc(the_sequences, numberspecies+1); for (long i=0; the_species[i]; i++) { ARB_alloc(the_sequences[i], maxalignlen+1); the_sequences[i][maxalignlen] = 0; memset(the_sequences[i], '.', (size_t)maxalignlen); const char *data = GB_read_char_pntr(GBT_find_sequence(the_species[i], dataset.alignment_name)); int size = strlen(data); if (size > maxalignlen) size = (int)maxalignlen; strcpy_truncate(the_sequences[i], data, size+1); } int res = InsertDatainGDE(dataset, the_species, NULp, (unsigned char **)the_sequences, numberspecies, maxalignlen, filter, compress, cutoff_stop_codon, typeinfo); for (long i=0; iseqlen == 0) return -1; if (boffset || (b>a->offset+a->seqlen)) { switch (a->elementtype) { case DNA: case RNA: return 0; case PROTEIN: case TEXT: return '~'; case MASK: return '0'; default: return '-'; } } return a->sequence[b-a->offset]; } void putelem(NA_Sequence *a, int b, NA_Base c) { if (b>=(a->offset+a->seqmaxlen)) { Warning("Putelem:insert beyond end of sequence space ignored"); } else if (b >= (a->offset)) { a->sequence[b-(a->offset)] = c; } else { NA_Base *temp = ARB_calloc(a->seqmaxlen + a->offset - b); switch (a->elementtype) { // Pad out with gap characters fron the point of insertion to the offset case MASK: for (int j=b; joffset; j++) temp[j-b] = '0'; break; case DNA: case RNA: for (int j=b; joffset; j++) temp[j-b] = '\0'; break; case PROTEIN: for (int j=b; joffset; j++) temp[j-b] = '-'; break; case TEXT: default: for (int j=b; joffset; j++) temp[j-b] = ' '; break; } for (int j=0; jseqmaxlen; j++) temp[j+a->offset-b] = a->sequence[j]; free(a->sequence); a->sequence = temp; a->seqlen += (a->offset - b); a->seqmaxlen += (a->offset - b); a->offset = b; a->sequence[0] = c; } }