// ============================================================= //
//                                                               //
//   File      : seq_export.cxx                                  //
//   Purpose   :                                                 //
//                                                               //
//   Institute of Microbiology (Technical University Munich)     //
//   http://www.arb-home.de/                                     //
//                                                               //
// ============================================================= //

#include "seqio.hxx"

#include <AP_filter.hxx>
#include <xferset.h>

#include <arbdbt.h>
#include <gb_aci.h>

#include <arb_strarray.h>
#include <arb_file.h>
#include <arb_diff.h>
#include <arb_progress.h>
#include <arb_global_defs.h>

#include <xml.hxx>

#include <unistd.h>

#define sio_assert(cond) arb_assert(cond)

using           std::string;
using namespace SEQIO;
using namespace FieldTransfer;

// ---------------------------------
//      internal export commands

// Export modes: either one of the built-in ("internal") exporters or the
// generic form-driven export.
// The leading "real formats" are matched by index against
// 'internal_export_commands' (see check_internal()), so both lists have to be kept in sync.
enum EXPORT_CMD {
    // real formats
    EXPORT_XML,               // built-in XML writer (command "xml_write")

    EXPORT_INVALID,           // result of check_internal() for unknown commands
    EXPORT_USING_FORM,        // default mode (has to be last entry in enum)
};

// Names of the built-in export commands (looked up for the value of an 'INTERNAL' entry).
// The index of an entry is used as value of the corresponding EXPORT_CMD;
// the list is terminated by NULp.
static const char *internal_export_commands[] = {
    "xml_write",
    NULp
};

// Translate the name of an internal export command into its EXPORT_CMD.
// Returns EXPORT_INVALID if 'command' is not listed in 'internal_export_commands'.
static EXPORT_CMD check_internal(const char *command) {
    EXPORT_CMD found = EXPORT_INVALID;
    int        idx   = 0;

    while (internal_export_commands[idx]) {
        const bool matches = strcmp(command, internal_export_commands[idx]) == 0;
        if (matches) found = static_cast<EXPORT_CMD>(idx); // table index == enum value
        ++idx;
    }
    return found;
}

// ----------------------
//      export_format

// Settings of one export format (filled by read_export_format()).
// All string members are either NULp or heap strings which get freed by the destructor.
struct export_format : virtual Noncopyable {
    char *system;      // value of 'SYSTEM' entry
    char *pre_format;  // value of 'PRE_FORMAT' entry
    char *suffix;      // value of 'SUFFIX' entry
    char *description; // (multiline) description of filter
    char *form;        // transformed export expression (part behind 'BEGIN')

    EXPORT_CMD export_mode;

    export_format() :
        system(NULp), pre_format(NULp), suffix(NULp), description(NULp), form(NULp),
        export_mode(EXPORT_XML)
    {}

    ~export_format() {
        char *owned[] = { system, pre_format, suffix, description, form };
        for (size_t i = 0; i < sizeof(owned)/sizeof(owned[0]); ++i) {
            free(owned[i]);
        }
    }
};

// Load the export format description 'file' into 'efo'.
//
// 'file' is used as given if it names a regular file, otherwise it is
// interpreted relative to ARBHOME.
// The header entries (everything in front of 'BEGIN') are always parsed.
// The part behind 'BEGIN' is only loaded (into efo->form) if
// 'load_complete_form' is true and no 'INTERNAL' command was used.
//
// Returns NULp on success, otherwise an error message.
static GB_ERROR read_export_format(export_format *efo, const char *file, bool load_complete_form) {
    if (!file || !file[0]) return "No export format selected";

    char *fullfile = NULp;
    if (!GB_is_regularfile(file)) {
        fullfile = nulldup(GB_path_in_ARBHOME(file)); // fallback to ARBHOME-relative specification
    }
    else {
        fullfile = strdup(GB_canonical_path(file));   // prefer files that are completely specified (full/rel path)
    }

    GB_ERROR  error = NULp;
    FILE     *in    = fopen(fullfile, "r");

    if (in) {
        efo->export_mode = EXPORT_USING_FORM; // default mode

        // parse header entries (stops behind 'BEGIN', at end of input or on first error)
        bool    seen_BEGIN = false;
        size_t  linenumber = 0;
        char   *key;
        char   *value;

        while (!error && !seen_BEGIN && read_string_pair(in, key, value, linenumber)) {
            if (strcmp(key, "SYSTEM") == 0) {
                reassign(efo->system, value);
            }
            else if (strcmp(key, "PRE_FORMAT") == 0) {
                reassign(efo->pre_format, value);
            }
            else if (strcmp(key, "SUFFIX") == 0) {
                reassign(efo->suffix, value);
            }
            else if (strcmp(key, "DESCRIPTION") == 0) {
                appendTo(efo->description, '\n', value);
            }
            else if (strcmp(key, "INTERNAL") == 0) {
                efo->export_mode = check_internal(value);
                if (efo->export_mode == EXPORT_INVALID) {
                    error = GBS_global_string("Unknown INTERNAL command '%s'", value);
                }
            }
            else if (strcmp(key, "BEGIN") == 0) {
                if (efo->export_mode == EXPORT_USING_FORM) {
                    seen_BEGIN = true;
                }
                else {
                    error = "'BEGIN' not allowed when 'INTERNAL' is used";
                }
            }
            else {
                error = GBS_global_string("Unknown command '%s'", key);
            }

            // add error location
            if (error) error = GBS_global_string("%s in line #%zu", error, linenumber);

            free(value);
            free(key);
        }

        if (!error && load_complete_form && efo->export_mode == EXPORT_USING_FORM) {
            // now 'in' points to line behind 'BEGIN'
            char *raw_form = GB_read_fp(in); // read rest of file

            // Join lines that end with \ with next line.
            // Replace ' = ' and ':' by '\=' and '\:'
            efo->form = GBS_string_eval(raw_form, "\\\\\n=:\\==\\\\\\=:*=\\*\\=*1:\\:=\\\\\\:");
            if (!efo->form) {
                error = GB_failedTo_error("evaluate part below 'BEGIN'", NULp, GB_await_error());
            }
            free(raw_form);
        }

        // some checks for incompatible commands
        if (!error) {
            const bool internal = efo->export_mode != EXPORT_USING_FORM;

            if (efo->system && !efo->pre_format) {
                error = "Missing 'PRE_FORMAT' (needed by 'SYSTEM')";
            }
            else if (efo->pre_format && !efo->system) {
                error = "Missing 'SYSTEM' (needed by 'PRE_FORMAT')";
            }
            else if (internal) {
                if (efo->system)     error = "'SYSTEM' is not allowed together with 'INTERNAL'";
                if (efo->pre_format) error = "'PRE_FORMAT' is not allowed together with 'INTERNAL'";
            }
        }

        error = GB_failedTo_error("read export format", fullfile, error);
        fclose(in);
    }
    else {
        error = GB_IO_error("reading export form", fullfile);
    }

    free(fullfile);
    return error;
}

// ----------------------------------------
// export sequence helper class

// Iterates over the species selected for export:
// all species, all marked species or one species specified by name.
class SpeciesSelector : virtual Noncopyable {
    ExportWhich  which;
    const char  *one_species; // name of species (only used for EBF_ONE)

public:
    SpeciesSelector(ExportWhich which_, const char *one_species_)
        : which(which_), one_species(one_species_)
    {}

    // returns the first selected species (or NULp if there is none)
    GBDATA *select_first(GBDATA *gb_main) const {
        if (which == EBF_ALL)    return GBT_first_species(gb_main);
        if (which == EBF_MARKED) return GBT_first_marked_species(gb_main);
        if (which == EBF_ONE)    return GBT_find_species(gb_main, one_species);
        return NULp;
    }

    // returns the selected species following 'gb_previous' (or NULp if there is none)
    GBDATA *select_next(GBDATA *gb_previous) const {
        if (which == EBF_ALL)    return GBT_next_species(gb_previous);
        if (which == EBF_MARKED) return GBT_next_marked_species(gb_previous);
        return NULp; // EBF_ONE: there is only one species
    }
};

class export_sequence_data : virtual Noncopyable { // @@@ simplify using FilteredExport?
    GBDATA *last_species_read;
    char   *seq;
    size_t  len;
    char   *error;

    GBDATA *gb_main;
    char   *ali;

    SpeciesSelector whichSpecies;

    size_t     species_count;
    AP_filter *filter;
    bool       cut_stop_codon;
    int        compress;           // 0 = no;1 = vertical gaps; 2 = all gaps;

    long    max_ali_len;                            // length of alignment
    size_t *export_column;                          // list of exported seq data positions
    size_t  columns;                                // how many columns get exported

    GBDATA *single_species;     // if set to species -> first/next only return this species (used to export to multiple files)

public:

    export_sequence_data(GBDATA *Gb_Main, ExportWhich which, const char *one_species, AP_filter* Filter, bool CutStopCodon, int Compress) :
        last_species_read(NULp),
        seq(NULp),
        len(0),
        error(NULp),
        gb_main(Gb_Main),
        whichSpecies(which, one_species),
        species_count(size_t(-1)),
        filter(Filter),
        cut_stop_codon(CutStopCodon),
        compress(Compress),
        export_column(NULp),
        columns(0),
        single_species(NULp)
    {
        sio_assert(filter);
        sio_assert(!filter->is_invalid()); // you have to pass a valid filter

        ali = GBT_get_default_alignment(gb_main);
        sio_assert(ali); // cannot occur (when no ali selected/exist -> filter would have been invalid above)

        max_ali_len = GBT_get_alignment_len(gb_main, ali);
        sio_assert(max_ali_len>0);

        if (cut_stop_codon) {
            GB_alignment_type ali_type = GBT_get_alignment_type(gb_main, ali);
            sio_assert(ali_type != GB_AT_UNKNOWN);
            if (ali_type !=  GB_AT_AA) {
                GB_warning("Cutting stop codon makes no sense - ignored");
                cut_stop_codon = false;
            }
        }

        if (max_ali_len>=0 && filter->get_length() < size_t(max_ali_len)) {
            GB_warningf("Warning: Your filter is shorter than the alignment (%zu<%li)",
                        filter->get_length(), max_ali_len);
            max_ali_len = filter->get_length();
        }
    }

    ~export_sequence_data() {
        delete [] export_column;
        delete [] seq;
        free(error);
        free(ali);
    }

    const char *getAlignment() const { return ali; }
    long getAliLen() const { return max_ali_len; }
    GBDATA *get_gb_main() const { sio_assert(gb_main); return gb_main; }

    void set_single_mode(GBDATA *gb_species) { single_species = gb_species; }
    bool in_single_mode() const { return single_species; }

    GBDATA *first_species() const { return single_species ? single_species : whichSpecies.select_first(gb_main); }
    GBDATA *next_species(GBDATA *gb_prev) const { return single_species ? NULp : whichSpecies.select_next(gb_prev); }

    const unsigned char *get_seq_data(GBDATA *gb_species, size_t& slen, GB_ERROR& error) const;
    static bool isGap(char c) { return GAP::is_std_gap(c); }

    size_t count_species() {
        sio_assert(!in_single_mode());
        if (species_count == size_t(-1)) {
            species_count = 0;
            for (GBDATA *gb_species = whichSpecies.select_first(gb_main);
                 gb_species;
                 gb_species = whichSpecies.select_next(gb_species))
            {
                species_count++;
            }
        }
        return species_count;
    }

    GB_ERROR    detectVerticalGaps();
    const char *get_export_sequence(GBDATA *gb_species, size_t& seq_len, GB_ERROR& error);
};

// Read the raw sequence data of 'gb_species' in the exported alignment.
//
// On success: returns a pointer to the data, sets 'slen' to the stored length
//             and 'err' to NULp.
// On failure (species has no data in the alignment): returns NULp, sets 'slen'
//             to 0 and 'err' to an error message.
//
// The error message is created with GBS_global_string, i.e. it is not owned
// by the caller and must not be freed. (It was formerly created with
// GBS_global_string_copy; that heap copy was never freed by any caller.)
const unsigned char *export_sequence_data::get_seq_data(GBDATA *gb_species, size_t& slen, GB_ERROR& err) const {
    GBDATA *gb_seq = GBT_find_sequence(gb_species, ali);

    if (!gb_seq) {
        err  = GBS_global_string("No data in alignment '%s' of species '%s'", ali, GBT_get_name_or_description(gb_species));
        slen = 0;
        return NULp;
    }

    const char *data = GB_read_char_pntr(gb_seq);
    slen = GB_read_count(gb_seq);
    err  = NULp;

    return (const unsigned char *)data;
}


// Determine the alignment columns that get exported ('export_column', 'columns')
// and allocate the sequence buffer 'seq'.
//
// compress == 1: all filtered columns which contain only gaps in ALL selected
//                species (vertical gaps) are removed from the exported columns.
// otherwise:     the exported columns are exactly the columns passing the filter.
//
// Positions behind the end of a (short) sequence are treated as gaps. This is
// consistent with get_export_sequence(), which exports '.' for such positions,
// and avoids reading behind the end of the sequence data.
//
// Returns NULp on success, otherwise an error message (e.g. if a species
// contains no data or the user aborted the calculation).
GB_ERROR export_sequence_data::detectVerticalGaps() {
    GB_ERROR err = NULp;

    sio_assert(!in_single_mode());

    if (compress == 1) {        // compress vertical gaps!
        // @@@ detection of vertical gaps should better be done either by AP_filter directly or by FilteredExport

        // 'gap_column' starts as list of all filtered columns. While scanning the
        // species it is reduced to those columns that contain only gaps so far.
        size_t  gap_columns = filter->get_filtered_length();
        size_t *gap_column  = new size_t[gap_columns+1];

        const size_t *filterpos_2_seqpos = filter->get_filterpos_2_seqpos();
        memcpy(gap_column, filterpos_2_seqpos, gap_columns*sizeof(*gap_column));
        gap_column[gap_columns] = max_ali_len;

        arb_progress progress("Calculating vertical gaps", count_species());

        for (GBDATA *gb_species = first_species();
             gb_species && !err;
             gb_species = next_species(gb_species))
        {
            size_t               slen;
            const unsigned char *sdata = get_seq_data(gb_species, slen, err);

            if (!err) {
                size_t j = 0;
                size_t i;
                for (i = 0; i<gap_columns; ++i) {
                    const size_t seq_pos = gap_column[i];

                    // position behind end of data counts as gap (never read behind 'slen')
                    if (seq_pos >= slen || isGap(sdata[seq_pos])) {
                        gap_column[j++] = seq_pos; // keep gap column
                    }
                    // otherwise it's overwritten
                }

                sio_assert(i >= j);
                size_t skipped_columns  = i-j;
                sio_assert(gap_columns >= skipped_columns);
                gap_columns            -= skipped_columns;
            }
            progress.inc_and_check_user_abort(err);
        }

        if (!err) {
            columns       = filter->get_filtered_length() - gap_columns;
            export_column = new size_t[columns];

            // merge: export all filtered columns that are not listed in 'gap_column'
            size_t gpos = 0;           // index into array of vertical gaps
            size_t epos = 0;           // index into array of exported columns
            size_t flen = filter->get_filtered_length();
            size_t a;
            for (a = 0; a<flen && gpos<gap_columns; ++a) {
                size_t fpos = filterpos_2_seqpos[a];
                if (fpos == gap_column[gpos]) { // only gaps here -> skip column
                    gpos++;
                }
                else { // not only gaps -> use column
                    sio_assert(fpos<gap_column[gpos]);
                    sio_assert(epos < columns); // got more columns than expected
                    export_column[epos++] = fpos;
                }
            }
            for (; a<flen; ++a) { // LOOP_VECTORIZED
                export_column[epos++] = filterpos_2_seqpos[a];
            }

            sio_assert(epos == columns);
        }

        delete [] gap_column;
    }
    else { // compress all or none (simply use filter)
        const size_t *filterpos_2_seqpos = filter->get_filterpos_2_seqpos();

        columns       = filter->get_filtered_length();
        export_column = new size_t[columns];

        memcpy(export_column, filterpos_2_seqpos, columns*sizeof(*filterpos_2_seqpos));
    }

    seq = new char[columns+1]; // +1 for terminating zero-byte

    return err;
}

// Provide the export sequence of 'gb_species' (filtered, simplified and
// compressed according to the settings of this object).
//
// On success: returns the zero-terminated sequence (internal buffer, overwritten
//             by the next call), stores its length in 'seq_len' and sets 'err' to NULp.
// On failure: returns NULp, sets 'seq_len' to 0 and 'err' to the error message
//             (owned by this object).
//
// NOTE(review): nothing visible in this file ever updates 'last_species_read'
// after construction, so the data gets re-read on each call with a non-NULp species.
const char *export_sequence_data::get_export_sequence(GBDATA *gb_species, size_t& seq_len, GB_ERROR& err) {
    if (gb_species != last_species_read) {
        freenull(error);

        // read + filter a new species
        GB_ERROR             read_error;
        const unsigned char *data = get_seq_data(gb_species, len, read_error);

        if (read_error) {
            error = strdup(read_error);
        }
        else {
            const uchar *simplify = filter->get_simplify_table();

            if (cut_stop_codon) { // ignore everything from first '*' on
                const unsigned char *stop_codon = (const unsigned char *)memchr(data, '*', len);
                if (stop_codon) len = stop_codon-data;
            }

            size_t written = 0; // number of characters stored in 'seq'

            if (compress == 2) { // compress all gaps
                for (size_t col = 0; col<columns; ++col) {
                    const size_t seq_pos = export_column[col];
                    if (seq_pos >= len) continue; // behind end of data -> nothing to export

                    const unsigned char c = data[seq_pos];
                    if (isGap(c)) continue;

                    seq[written++] = simplify[c];
                }
            }
            else { // compress vertical or compress none (simply use filter in both cases)
                for (; written<columns; ++written) {
                    const size_t seq_pos = export_column[written];
                    // positions behind end of data are exported as '.'
                    seq[written] = simplify[seq_pos<len ? data[seq_pos] : '.'];
                }
            }

            seq[written] = 0;
            len          = written;
        }
    }

    err = error;
    if (error) {
        seq_len = 0;
        return NULp;
    }

    seq_len = len;
    return seq;
}

// ----------------------------------------
// exported_sequence is hooked into ACI temporary (provides result of command 'export_sequence')
// which is the sequence filtered and compressed according to settings in the export window

// export settings of the currently running export (set and reset by export_by_format)
static export_sequence_data *esd = NULp;

// Callback installed via GB_set_export_sequence_hook:
// forwards the request to the active export_sequence_data.
static const char *exported_sequence(GBDATA *gb_species, size_t *seq_len, GB_ERROR *error) {
    sio_assert(esd);

    export_sequence_data& active = *esd;
    return active.get_export_sequence(gb_species, *seq_len, *error);
}

// Write the database entry 'gbd' (and recursively its sub-entries) as XML.
//
// 'depth' is the nesting level below the entry the export was started with
// (export_write_species starts with 0; each recursion adds 1).
//
// Entries at depth 1 whose key starts with "ali_" are never descended into:
//  - for the exported alignment an ALIGNMENT tag containing the export
//    sequence (as provided by exported_sequence) is written,
//  - all other alignments are skipped.
// All other entries get a tag named like their key; containers additionally
// get their "name" sub-entry (if any) as attribute.
//
// NOTE(review): the XML objects created here are never passed to anything;
// presumably they attach themselves to the currently open XML document when
// constructed and write their output when destroyed - confirm in xml.hxx.
// If so, the exact order/scope of these objects defines the output.
//
// Returns NULp on success, otherwise an error message.
static GB_ERROR XML_recursive(GBDATA *gbd, int depth) {
    GB_ERROR    error    = NULp;
    const char *key_name = GB_read_key_pntr(gbd);
    XML_Tag    *tag      = NULp;
    bool        descend  = true;

    if (depth == 1 && strncmp(key_name, "ali_", 4) == 0) { // hack needed if seq-quality information exists
        sio_assert(esd);
        descend = false; // do not descend into alignments
        if (strcmp(esd->getAlignment(), key_name) == 0) { // the wanted alignment

            tag = new XML_Tag("ALIGNMENT");
            tag->add_attribute("name", key_name+4); // alignment name w/o "ali_" prefix

            GBDATA     *gb_species = GB_get_father(gbd);
            size_t      len;
            const char *seq        = exported_sequence(gb_species, &len, &error);

            if (seq) { // otherwise 'error' has been set by exported_sequence
                XML_Tag dtag("data");
                { XML_Text seqText(seq); }
            }
        }
        // (other alignments: 'tag' stays NULp -> nothing is written)
    }
    else {
        tag = new XML_Tag(key_name);

        if (GB_is_container(gbd)) {
            const char *name = GBT_read_char_pntr(gbd, "name");
            if (name) tag->add_attribute("name", name);
        }
    }

    if (descend) { // only reached via the else-branch above -> 'tag' is set
        if (GB_read_type(gbd) == GB_DB) {
            // container: write all sub-entries
            for (GBDATA *gb_child = GB_child(gbd); gb_child && !error; gb_child = GB_nextChild(gb_child)) {
                const char *sub_key_name = GB_read_key_pntr(gb_child);

                if (strcmp(sub_key_name, "name") != 0) { // do not recurse for "name" (is handled above)
                    error = XML_recursive(gb_child, depth+1);
                }
            }
        }
        else {
            // terminal entry: write content as text
            char *content = GB_read_as_string(gbd);
            if (content) {
                XML_Text text(content);
                free(content);
            }
            else {
                tag->add_attribute("error", "unsavable");
            }
        }
    }

    delete tag;
    return error;
}

// Evaluate 'form' in the environment 'callEnv' and write the result to 'out'.
// Each line of the result that contains the marker "$$DELETE_LINE$$" is not written.
//
// Returns NULp on success, otherwise the error reported by the evaluation.
static GB_ERROR export_species_using_form(FILE *out, const char *form, const GBL_call_env& callEnv) { // @@@ pass preparsed command (form)
    char *evaluated = GBS_string_eval_in_env(" ", form, callEnv);
    if (!evaluated) return GB_await_error();

    char *unwritten = evaluated; // start of the part not written yet

    for (;;) {
        char *marker = GBS_find_string(unwritten, "$$DELETE_LINE$$", 0);
        if (!marker) break;

        // search backwards for the newline in front of the marked line
        // (stops at 'unwritten' if there is none)
        char *line_start = marker;
        while (line_start>unwritten && *line_start != '\n') --line_start;

        // search forwards for the end of the marked line
        char *line_end = strchr(marker, '\n');
        if (!line_end) line_end = marker + strlen(marker);

        fwrite(unwritten, 1, line_start-unwritten, out); // write everything in front of the marked line
        unwritten = line_end;
    }

    fputs(unwritten, out);
    free(evaluated);

    return NULp;
}

// Write one species to 'out', using the export mode specified by 'efo'
// (either by evaluating the form or using the built-in XML export).
// Returns NULp on success, otherwise an error message.
static GB_ERROR export_write_species(GBDATA *gb_species, FILE *out, const GBL_env& env, const export_format& efo) {
    const EXPORT_CMD mode = efo.export_mode;

    if (mode == EXPORT_USING_FORM) {
        GBL_call_env callEnv(gb_species, env);
        return export_species_using_form(out, efo.form, callEnv);
    }
    if (mode == EXPORT_XML) {
        return XML_recursive(gb_species, 0);
    }
    if (mode == EXPORT_INVALID) {
        sio_assert(0); // read_export_format never succeeds with an invalid mode
    }
    return NULp;
}

// Exports sequences specified by 'esd' (module global variable)
// to format specified by 'formname'.
//
// if 'outname' == NULp -> export species to temporary file, otherwise to 'outname'.
// Full path of generated file is returned in 'resulting_outname'
// (heap copy; set to NULp if an error occurs - the file gets removed in that case).
//
// If the format specifies 'PRE_FORMAT', the data is first exported (recursively)
// into a temporary file using that format, which then gets converted by
// executing the 'SYSTEM' command of the format.
//
// If 'ruleset' is set, each species is exported via a temporary clone
// created by that ruleset.
//
// Returns NULp on success, otherwise an error message.
static GB_ERROR export_format_single(const char *db_name, const char *formname, const char *outname, char **resulting_outname, RuleSetPtr ruleset) {
    static int export_depth     = 0; // recursion depth (PRE_FORMAT causes recursion)
    export_depth++;

    *resulting_outname = NULp;

    export_format efo;
    GB_ERROR      error = read_export_format(&efo, formname, true);

    if (!error) {
        if (!outname) {                             // if no 'outname' is given -> export to temporary file
            char *unique_outname = GB_unique_filename("exported", efo.suffix);
            *resulting_outname   = GB_create_tempfile(unique_outname);
            free(unique_outname);

            if (!*resulting_outname) error = GB_await_error();
        }
        else *resulting_outname = strdup(outname);
    }

    sio_assert(error || *resulting_outname);

    if (!error) {
        if (efo.pre_format) {
            // Export data using format 'pre_format'.
            // Afterwards convert to wanted format using 'system'.

            sio_assert(efo.system);

            char *intermediate_export;
            error = export_format_single(db_name, efo.pre_format, NULp, &intermediate_export, ruleset);
            if (!error) {
                sio_assert(GB_is_privatefile(intermediate_export, false));

                GB_informationf("Converting to %s", efo.suffix);

                // insert names of input- and output-file into the command
                char *srt = GBS_global_string_copy("$<=%s:$>=%s", intermediate_export, *resulting_outname);
                char *sys = GBS_string_eval(efo.system, srt);

                if (!sys) {
                    // GBS_string_eval failed: fetch its error instead of
                    // executing a NULp command and leaving the error pending
                    error = GB_await_error();
                }
                else {
                    GB_informationf("exec '%s'", efo.system);
                    error = GBK_system(sys);
                }

                GB_unlink_or_warn(intermediate_export, &error); // intermediate file is always removed

                free(sys);
                free(srt);
            }
            free(intermediate_export);
        }
        else {
            FILE *out       = fopen(*resulting_outname, "wt");
            if (!out) error = GB_IO_error("writing", *resulting_outname);
            else {
                XML_Document *xml = NULp;

                // count species (needed for progress)
                long allCount   = 0;
                for (GBDATA *gb_species = esd->first_species();
                     gb_species && !error;
                     gb_species = esd->next_species(gb_species))
                {
                    allCount++;
                }

                arb_progress progress(allCount);
                progress.auto_subtitles("Saving species");

                if (efo.export_mode == EXPORT_XML) {
                    xml = new XML_Document("ARB_SEQ_EXPORT", "arb_seq_export.dtd", out);
                    {
                        xml->add_attribute("database", db_name);
                    }
                    xml->add_attribute("export_date", ARB_date_string());
                    {
                        XML_Comment rem("There is a basic version of ARB_seq_export.dtd in $ARBHOME/lib/dtd\n"
                                        "but you might need to expand it by yourself,\n"
                                        "because the ARB-database may contain any kind of fields.");
                    }
                }

                GBL_env env(esd->get_gb_main(), NULp);

                for (GBDATA *gb_species = esd->first_species();
                     gb_species && !error;
                     gb_species = esd->next_species(gb_species))
                {
                    if (ruleset.isSet()) {
                        GB_topSecurityLevel unsecured(env.get_gb_main()); // needed to clone species (overwrites name .. in temporary clone)
                        ItemClonedByRuleSet clone(gb_species, CLONE_ITEM_SPECIES, ruleset, RENAME_ITEM_WHILE_TEMP_CLONE_EXISTS, NULp, NULp);
                        if (clone.has_error()) {
                            error = clone.get_error();
                        }
                        else {
                            GB_previousSecurityLevel user(unsecured); // run export itself with normal security
                            error = export_write_species(clone.get_clone(), out, env, efo);
                        }
                    }
                    else {
                        error = export_write_species(gb_species, out, env, efo);
                    }
                    progress.inc_and_check_user_abort(error);
                }

                delete xml; // has to happen before 'out' gets closed
                fclose(out);
            }
        }
    }

    if (error) {
        // do not leave incomplete output behind
        if (*resulting_outname) {
            GB_unlink_or_warn(*resulting_outname, NULp);
            freenull(*resulting_outname);
        }
    }

    export_depth--;

    return error;
}

// Export the species selected by 'esd' using format 'formname'.
//
// multiple == false: all species are exported into the one file 'outname'.
// multiple == true:  each species is exported into its own file. The file names
//                    are built from the name part of 'outname', the species name
//                    and the suffix of 'outname'.
//
// 'resulting_outname' receives the name of the generated file (heap copy).
// In multiple mode it is the name set by the first species, replaced whenever
// a later result sorts before it (strcmp).
//
// Returns NULp on success, otherwise an error message.
static GB_ERROR export_format_multiple(const char* dbname, const char *formname, const char *outname, bool multiple, char **resulting_outname, RuleSetPtr ruleset) {
    if (!multiple) {
        arb_progress progress("Exporting data");
        return export_format_single(dbname, formname, outname, resulting_outname, ruleset);
    }

    GB_ERROR error = NULp;
    {
        char *path, *name, *suffix;
        GB_split_full_path(outname, &path, NULp, &name, &suffix);
        *resulting_outname = NULp;

        arb_progress progress("Exporting data", esd->count_species());

        GBDATA *gb_species = esd->first_species();
        while (gb_species && !error) {
            const char *species_name = GBT_read_char_pntr(gb_species, "name");

            if (species_name) {
                const char *fname = GB_append_suffix(GBS_global_string("%s_%s", name, species_name), suffix);
                progress.subtitle(fname);

                char *oname = strdup(GB_concat_path(path, fname));
                char *res_oname;

                esd->set_single_mode(gb_species); // means: only export 'gb_species'
                error = export_format_single(dbname, formname, oname, &res_oname, ruleset);
                esd->set_single_mode(NULp);

                const bool not_set_yet = !*resulting_outname;
                if (not_set_yet ||
                    (res_oname && strcmp(*resulting_outname, res_oname)>0)) // or smaller than set one
                {
                    reassign(*resulting_outname, res_oname);
                }

                free(res_oname);
                free(oname);
            }
            else {
                error = "Can't export unnamed species";
            }

            progress.inc_and_check_user_abort(error);
            gb_species = esd->next_species(gb_species);
        }

        free(suffix);
        free(name);
        free(path);
    }
    return error;
}

namespace SEQIO {

    // Export sequence data of the selected species ('which', 'one_species')
    // using the export format 'formname'.
    //
    // 'filter', 'cut_stop_codon' and 'compress' specify how sequence data gets exported.
    // 'field_transfer_set' optionally names a ruleset which is loaded and
    // applied to each species (NULp or empty string = none).
    // 'outname' and 'multiple' specify the output file(s);
    // the name of the generated file is stored in 'real_outname' (if the export gets that far).
    //
    // Returns NULp on success, otherwise an error message.
    GB_ERROR export_by_format(GBDATA *gb_main, ExportWhich which, const char *one_species,
                              AP_filter *filter, int cut_stop_codon, int compress,
                              const char *dbname, const char *formname, const char *field_transfer_set,
                              const char *outname, int multiple, char **real_outname)
    {
        sio_assert(!GB_have_error());

        // empty 'field_transfer_set' is handled like NULp
        const bool have_fts = field_transfer_set && field_transfer_set[0];
        if (!have_fts) field_transfer_set = NULp;

        GB_ERROR   error = filter->is_invalid();
        RuleSetPtr ruleset;

        if (!error && field_transfer_set) { // if specified load ruleset:
            ErrorOrRuleSetPtr loaded = RuleSet::loadFrom(field_transfer_set);

            if (loaded.hasError()) {
                ARB_ERROR lerror = loaded.getError();
                error            = lerror.deliver();
            }
            else {
                ruleset = loaded.getValue();
            }
        }

        if (!error) {
            esd = new export_sequence_data(gb_main, which, one_species, filter, cut_stop_codon, compress);
            sio_assert(esd->getAliLen()>0);

            // provide export sequence for the time of the export
            GB_set_export_sequence_hook(exported_sequence);

            error = esd->detectVerticalGaps();
            if (!error) {
                error = export_format_multiple(dbname, formname, outname, multiple, real_outname, ruleset);
                if (error) {
                    // error is member of export_sequence_data -> copy to static buffer
                    error = GBS_static_string(error);
                }
            }

            GB_set_export_sequence_hook(NULp);
        }

        delete esd;
        esd = NULp;

        sio_assert(!GB_have_error());
        return error;
    }

    // Read the header of the export format 'eft_formname' and hand its
    // suffix and description (where defined) over to 'info'.
    // Returns NULp on success, otherwise an error message ('info' is not modified then).
    GB_ERROR get_exportFormat_information(const char *eft_formname, ExportFormatInfo& info) {
        export_format efs;

        GB_ERROR error = read_export_format(&efs, eft_formname, false);
        if (error) return error;

        // pointers handed over to 'info' are reset here,
        // so the destructor of 'efs' does not free them
        if (efs.suffix) {
            info.suffix = efs.suffix;
            efs.suffix  = NULp;
        }
        if (efs.description) {
            info.description = efs.description;
            efs.description  = NULp;
        }

        return NULp;
    }

    char *get_exportFormat_evalForm(const char *eft_formname, GB_ERROR& error) {
        // load copy of form that gets evaluated during export.
        export_format efs;
        error = read_export_format(&efs, eft_formname, true);
        if (!error && efs.form) {
            if (efs.pre_format) {
                sio_assert(strcmp(efs.form, "*=") == 0); // caused by eval in read_export_format?
                return get_exportFormat_evalForm(efs.pre_format, error);
            }

            sio_assert(efs.pre_format == NULp);
            return ARB_strdup(efs.form);
        }
        // failed to load form

        sio_assert(efs.form == NULp);
        sio_assert(efs.pre_format == NULp);
        if (!error) {
            if (efs.export_mode != EXPORT_USING_FORM) {
                if (efs.export_mode == EXPORT_XML) {
                    error = "exports all fields";
                }
                else {
                    error = "unsupported filter type";
                }
            }
            else {
                error = "no form loaded";
            }
        }

        sio_assert(error);
        if (error) {
            char *nameOnly = NULp;
            GB_split_full_path(eft_formname, NULp, &nameOnly, NULp, NULp);

            const char *shownName = nameOnly ? nameOnly : eft_formname;
            error                 = GBS_global_string("%s (%s)", error, shownName);

            free(nameOnly);
        }
        return NULp;
    }

};

// --------------------------------------------------------------------------------

#ifdef UNIT_TESTS
#include <test_unit.h>

// uncomment to auto-update exported files
// (needed once after changing database or export formats)
// #define TEST_AUTO_UPDATE
#define TEST_AUTO_UPDATE_ONLY_MISSING // do auto-update only if file is missing

// Exports one marked species ("MetMazei") of the test database using each
// export format (*.eft) found in the export directory of ARBLIB and compares
// the generated file with the expected result stored in "impexp/".
// In addition each format is read once without and once with loading the form.
//
// If TEST_AUTO_UPDATE is defined, the expected results get (re-)created from
// the generated files:
//  - together with TEST_AUTO_UPDATE_ONLY_MISSING only missing expected files
//    get created (existing ones are compared),
//  - otherwise all expected files get overwritten.
void TEST_sequence_export() {
    GB_shell              shell;
    arb_suppress_progress silence;

    GBDATA   *gb_main    = GB_open("TEST_loadsave.arb", "r");
    char     *export_dir = nulldup(GB_path_in_ARBLIB("export"));
    StrArray  eft;
    GBS_read_dir(eft, export_dir, "*.eft");

    AP_filter *filter = NULp;
    {
        GB_transaction ta(gb_main);

        char *ali = GBT_get_default_alignment(gb_main);
        TEST_REJECT_NULL(ali);

        long alilen = GBT_get_alignment_len(gb_main, ali); // signed, otherwise the check below can't detect negative results
        TEST_REJECT(alilen<=0);

        filter = new AP_filter(size_t(alilen));

        // mark only the species that gets exported
        GBT_mark_all(gb_main, 0);
        GBDATA *gb_species = GBT_find_species(gb_main, "MetMazei");
        TEST_REJECT_NULL(gb_species);

        GB_write_flag(gb_species, 1); // mark
        free(ali);
    }
    for (int e = 0; eft[e]; ++e) {
        for (int complete = 0; complete <= 1; ++complete) {
            const char *name = strrchr(eft[e], '/');
            TEST_REJECT_NULL(name);
            name++;

            TEST_ANNOTATE(name);

            {
                export_format efo;
                TEST_EXPECT_NO_ERROR(read_export_format(&efo, eft[e], complete));
                if (strcmp(name, "fasta_wacc.eft") == 0) { // test description of one filter
                    TEST_EXPECT_EQUAL(efo.description,
                                      "Exports sequences to fasta-format.\n"
                                      "Header exported as: >ID SEQLENGTH bp SEQTYPE ACC");
                }
            }

            if (complete) {
                const char *outname      = "impexp/exported";
                char       *used_outname = NULp;

                {
                    GB_transaction ta(gb_main);
                    TEST_EXPECT_NO_ERROR(export_by_format(gb_main, EBF_MARKED, NULp,
                                                          filter, 0, 0,
                                                          "DBname", eft[e], NULp, // @@@ currently only tests export w/o FTS (pass FTS for some formats? or separately)
                                                          outname, 0, &used_outname));
                }

                char *expected = GBS_global_string_copy("impexp/%s.exported", name);

                // Note: the 'else' of the ONLY_MISSING case has to be followed by the
                // copy-block (otherwise it would capture the unlink below and missing
                // files would never get created).
#if defined(TEST_AUTO_UPDATE)
#if defined(TEST_AUTO_UPDATE_ONLY_MISSING)
                if (GB_is_regularfile(expected)) {
                    TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(outname, expected, 0);
                }
                else
#endif
                {
                    TEST_COPY_FILE(outname, expected);
                }
#else
                TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(outname, expected, 0);
                // see ../../UNIT_TESTER/run/impexp
#endif // TEST_AUTO_UPDATE
                TEST_EXPECT_ZERO_OR_SHOW_ERRNO(unlink(outname));

                free(expected);
                free(used_outname);
            }
        }
    }

    delete filter;
    free(export_dir);
    GB_close(gb_main);
}

#endif // UNIT_TESTS
