// =============================================================== //
//                                                                 //
//   File      : TreeWrite.cxx                                     //
//   Purpose   :                                                   //
//                                                                 //
//   Institute of Microbiology (Technical University Munich)       //
//   http://www.arb-home.de/                                       //
//                                                                 //
// =============================================================== //

#include <TreeWrite.h>
#include <TreeNode.h>
#include <arb_strbuf.h>
#include <arb_file.h>
#include <xml.hxx>

using namespace std;

#define tree_assert(cond) arb_assert(cond)

inline void replace_by_underscore(char *str, const char *toReplace) {
    char *unwanted = strpbrk(str, toReplace);
    while (unwanted) {
        unwanted[0] = '_';
        unwanted    = strpbrk(unwanted+1, toReplace);
    }
}

inline bool isQuoteChar(char c) { return c == '"' || c == '\''; }
inline bool whole_label_quoted(const char *label, size_t length) { return isQuoteChar(label[0]) && label[0] == label[length-1]; }


const char *Node_ID_Labeler::speciesLabel(GBDATA *, GBDATA *, TreeNode *leafNode, const char *) const {
    return leafNode->name;
}
const char *Node_ID_Labeler::groupLabel(GBDATA *, GBDATA *, TreeNode *innerNode, const char *) const {
    return innerNode->name;
}

inline char first_non_ascii_char(const char *label) {
    for (int i = 0; label[i]; ++i) {
        if (label[i]<0) { // non-ASCII chars are negative (because char is signed)
            return label[i];
        }
    }
    return 0;
}

static GB_ERROR export_tree_label(const char *label, FILE *out, LabelQuoting qmode) {
    // writes a label into the Newick file
    // label is quoted if necessary
    // label may be an internal_node_label, a leaf_label or a root_label
    GB_ERROR error = NULp;

    tree_assert(label);

    bool accept_non_ascii = qmode&LABEL_ACCEPT_NON_ASCII;
    char non_ASCII        = accept_non_ascii ? 0 : first_non_ascii_char(label);

    if (non_ASCII) {
        error = GBS_global_string("non-ASCII character ('%c'=%i) detected in label '%s'",
                                  non_ASCII, int(safeCharIndex(non_ASCII)), label);
    }
    else {
        const char *disallowed_chars = " \t\'\"()[]:;,"; // '(' is first problem_char
        const char *problem_chars    = disallowed_chars+4;
        tree_assert(problem_chars[0] == '(');

        bool need_quotes  = strpbrk(label, disallowed_chars);
        char used_quote   = 0;
        bool force_quotes = (qmode & LABEL_FORCE_QUOTES);

        if (force_quotes || need_quotes) {
            if      (qmode&LABEL_SINGLE_QUOTES) used_quote = '\'';
            else if (qmode&LABEL_DOUBLE_QUOTES) used_quote = '\"';
        }

        char *fixed_label;
        {
            size_t label_length = strlen(label);
            fixed_label         = ARB_strduplen(label, label_length);

            if (whole_label_quoted(fixed_label, label_length)) {
                // if whole label is quoted -> remove quotes
                memmove(fixed_label, fixed_label+1, label_length-2);
                fixed_label[label_length-2] = 0;
            }
        }

        if (used_quote) {
            // replace all problematic characters if requested
            bool force_replace = (qmode & LABEL_FORCE_REPLACE);
            if (force_replace) replace_by_underscore(fixed_label, problem_chars);

            // replace used quote by an '_' if it appears inside label
            char used_quote_as_string[] = { used_quote, 0 };
            replace_by_underscore(fixed_label, used_quote_as_string);

            if (!force_quotes) {
                need_quotes = strpbrk(fixed_label, disallowed_chars);
                if (!need_quotes) used_quote = 0; // @@@ fails if both quote-types are used in one name
            }
        }
        else {
            // unquoted label - always replace all problematic characters by '_'
            replace_by_underscore(fixed_label, disallowed_chars);
        }

        if (used_quote) fputc(used_quote, out);
        fputs(fixed_label, out);
        if (used_quote) fputc(used_quote, out);

        free(fixed_label);
    }

    return error;
}



// documentation of the Newick Format is in ../../SOURCE_TOOLS/docs/newick_doc.html

inline void indentTo(int indent, FILE *out) {
    for (int i = 0; i < indent; i++) {
        putc(' ', out);
        putc(' ', out);
    }
}

static GB_ERROR export_tree_node_print(GBDATA *gb_main, FILE *out, TreeNode *tree, const char *tree_name,
                                       bool pretty, int indent,
                                       const TreeLabeler& labeler, bool save_branchlengths,
                                       bool save_bootstraps, bool save_groupnames, LabelQuoting qmode)
{
    GB_ERROR error = NULp;

    if (pretty) indentTo(indent, out);

    if (tree->is_leaf()) {
        const char *label = labeler.speciesLabel(gb_main, tree->gb_node, tree, tree_name);
        error             = export_tree_label(label, out, qmode);
    }
    else {
        if (pretty) fputs("(\n", out);
        else        putc('(', out);

        error = export_tree_node_print(gb_main, out, tree->get_leftson(), tree_name, pretty, indent+1, labeler, save_branchlengths, save_bootstraps, save_groupnames, qmode);
        if (!error) {
            if (save_branchlengths) fprintf(out, ":%.5f", tree->leftlen);
            fputs(",\n", out);

            error = export_tree_node_print(gb_main, out, tree->get_rightson(), tree_name, pretty, indent+1, labeler, save_branchlengths, save_bootstraps, save_groupnames, qmode);
        }
        if (!error) {
            if (save_branchlengths) fprintf(out, ":%.5f", tree->rightlen);
            fputc('\n', out);

            if (pretty) indentTo(indent, out);
            fputc(')', out);

            char *bootstrap = NULp;
            if (save_bootstraps) {
                double value;
                switch (tree->parse_bootstrap(value)) {
                    case REMARK_BOOTSTRAP: bootstrap = GBS_global_string_copy("%i", int(value+0.5)); break;
                    case REMARK_OTHER:     bootstrap = strdup(tree->get_remark()); break;
                    case REMARK_NONE:      break;
                }
            }

            const char *label = NULp;
            {
                bool useGroupLabel = save_groupnames && tree->has_group_info();
                if (useGroupLabel) {
                    const char *group = labeler.groupLabel(gb_main, tree->gb_node, tree, tree_name);
                    if (tree->keelTarget()) {
                        error = GBS_global_string("contains a keeled group named '%s'\n"
                                                  "(to export a tree, you have to correct all keeled groups)", group);
                    }
                    else {
                        if (bootstrap) label = GBS_global_string("%s:%s", bootstrap, group);
                        else           label = group;
                    }
                }
                else if (bootstrap) label = bootstrap;
            }

            if (label) {
                arb_assert(!error);
                error = export_tree_label(label, out, qmode);
            }

            free(bootstrap);
        }
    }

    return error;
}

inline string buildNodeIdentifier(const string& parent_id, int& son_counter) {
    ++son_counter;
    if (parent_id.empty()) return GBS_global_string("n_%i", son_counter);
    return GBS_global_string("%s.%i", parent_id.c_str(), son_counter);
}

static const char *export_tree_node_print_xml(GBDATA *gb_main, TreeNode *tree, double my_length, const char *tree_name,
                                              const TreeLabeler& labeler, bool skip_folded, const string& parent_id, int& parent_son_counter) {
    const char *error = NULp;

    if (tree->is_leaf()) {
        XML_Tag item_tag("ITEM");

        item_tag.add_attribute("name",     buildNodeIdentifier(parent_id, parent_son_counter));
        item_tag.add_attribute("itemname", labeler.speciesLabel(gb_main, tree->gb_node, tree, tree_name));
        item_tag.add_attribute("length",   GBS_global_string("%.5f", my_length));
    }
    else {
        char *bootstrap = NULp;
        {
            double value;
            switch (tree->parse_bootstrap(value)) {
                case REMARK_BOOTSTRAP: bootstrap = GBS_global_string_copy("%i", int(value+0.5)); break;
                case REMARK_OTHER:     break; // @@@ other branch-remarks are currently not saved into xml format
                case REMARK_NONE:      break;
            }
        }

        bool  folded    = false;
        char *groupname = NULp;
        if (tree->name) {
            const char *buf = labeler.speciesLabel(gb_main, tree->gb_node, tree, tree_name);
            tree_assert(buf);
            groupname = strdup(buf);

            GBDATA *gb_grouped = GB_entry(tree->gb_node, "grouped");
            if (gb_grouped) {
                folded = GB_read_byte(gb_grouped);
            }
        }

        if (my_length || bootstrap || groupname) {
            bool hide_this_group = skip_folded && folded; // hide folded groups only if skip_folded is true

            XML_Tag branch_tag(hide_this_group ? "FOLDED_GROUP" : "BRANCH");
            string  my_id = buildNodeIdentifier(parent_id, parent_son_counter);

            branch_tag.add_attribute("name", my_id);

            if (my_length) {
                branch_tag.add_attribute("length", GBS_global_string("%.5f", my_length));
            }
            if (bootstrap) {
                branch_tag.add_attribute("bootstrap", bootstrap);
                freenull(bootstrap);
            }
            if (groupname) {
                branch_tag.add_attribute("groupname", groupname);
                freenull(groupname);
                if (folded) branch_tag.add_attribute("folded", "1");
            }
            else {
                tree_assert(!folded);
            }

            if (hide_this_group) {
                branch_tag.add_attribute("items_in_group", GBT_count_leafs(tree));
            }
            else {
                int my_son_counter = 0;
                if (!error) error  = export_tree_node_print_xml(gb_main, tree->get_leftson(),  tree->leftlen,  tree_name, labeler, skip_folded, my_id, my_son_counter);
                if (!error) error  = export_tree_node_print_xml(gb_main, tree->get_rightson(), tree->rightlen, tree_name, labeler, skip_folded, my_id, my_son_counter);
            }
        }
        else {
            if (!error) error = export_tree_node_print_xml(gb_main, tree->get_leftson(),  tree->leftlen,  tree_name, labeler, skip_folded, parent_id, parent_son_counter);
            if (!error) error = export_tree_node_print_xml(gb_main, tree->get_rightson(), tree->rightlen, tree_name, labeler, skip_folded, parent_id, parent_son_counter);
        }
    }

    return error;
}

GB_ERROR TREE_write_XML(GBDATA *gb_main, const char *db_name, const char *tree_name, const TreeLabeler& labeler, bool skip_folded, const char *path) {
    GB_ERROR  error  = NULp;
    FILE     *output = fopen(path, "w");

    if (!output) error = GB_export_errorf("file '%s' could not be opened for writing", path);
    else {
        GB_transaction ta(gb_main);

        TreeNode *tree   = GBT_read_tree(gb_main, tree_name, new SimpleRoot);
        if (!tree) error = GB_await_error();
        else {
            error = GBT_link_tree(tree, gb_main, true, NULp, NULp);

            if (!error) {
                GBDATA *tree_cont   = GBT_find_tree(gb_main, tree_name);
                GBDATA *tree_remark = GB_entry(tree_cont, "remark");

                XML_Document xml_doc("ARB_TREE", "arb_tree.dtd", output);

                xml_doc.add_attribute("database", db_name);
                xml_doc.add_attribute("treename", tree_name);
                xml_doc.add_attribute("export_date", ARB_date_string());

                if (tree_remark) {
                    char *remark = GB_read_string(tree_remark);
                    XML_Tag  remark_tag("COMMENT");
                    XML_Text remark_text(remark);
                    free(remark);
                }

                int my_son_counter = 0;
                error              = export_tree_node_print_xml(gb_main, tree, 0.0, tree_name, labeler, skip_folded, "", my_son_counter);
            }
        }
        fclose(output);
    }

    return error;
}

static char *complete_newick_comment(const char *comment) {
    // ensure that all '[' in 'comment' are closed by corresponding ']' by inserting additional brackets

    int           openBrackets = 0;
    GBS_strstruct out(strlen(comment)*1.1);

    for (int o = 0; comment[o]; ++o) {
        switch (comment[o]) {
            case '[':
                openBrackets++;
                break;
            case ']':
                if (openBrackets == 0) { // no brackets opened
                    out.put('['); // insert one to enforce balancing
                }
                else {
                    openBrackets--;
                }
                break;

            default:
                break;
        }
        out.put(comment[o]);
    }

    while (openBrackets>0) { // close all opened brackets
        out.put(']');
        openBrackets--;
    }

    tree_assert(openBrackets == 0);

    return out.release();
}

GB_ERROR TREE_write_Newick(GBDATA *gb_main, const char *tree_name, const TreeLabeler& labeler, bool save_branchlengths, bool save_bootstraps, bool save_groupnames, bool pretty, LabelQuoting quoteMode, const char *path) {
    // userland newick exporter
    // see also: testcode newick exporter in ../../ARBDB/adtree.cxx@NEWICK_EXPORTER

    GB_ERROR  error  = NULp;
    FILE     *output = fopen(path, "w");

    if (!output) error = GBS_global_string("file '%s' could not be opened for writing", path);
    else {
        GB_transaction ta(gb_main);

        TreeNode *tree   = GBT_read_tree(gb_main, tree_name, new SimpleRoot);
        if (!tree) error = GB_await_error();
        else {
            error = GBT_link_tree(tree, gb_main, true, NULp, NULp);

            if (!error) {
                char   *remark      = NULp;
                GBDATA *tree_cont   = GBT_find_tree(gb_main, tree_name);
                GBDATA *tree_remark = GB_entry(tree_cont, "remark");

                if (tree_remark) {
                    remark = GB_read_string(tree_remark);
                }
                {
                    const char *saved_to = GBS_global_string("%s saved to %s", tree_name, path);
                    freeset(remark, GBS_log_action_to(remark, saved_to, true));
                }

                if (remark) {
                    char *wellformed = complete_newick_comment(remark);

                    tree_assert(wellformed);

                    fputc('[', output); fputs(wellformed, output); fputs("]\n", output);
                    free(wellformed);
                }
                free(remark);
                if (!error) {
                    error = export_tree_node_print(gb_main, output, tree, tree_name, pretty, 0, labeler, save_branchlengths, save_bootstraps, save_groupnames, quoteMode);
                }
            }

            destroy(tree);
        }

        fprintf(output, ";\n");
        fclose(output);

        if (error) {
            GB_unlink_or_warn(path, &error);
        }
    }

    return error;
}

// --------------------------------------------------------------------------------

static void export_tree_node_print_remove(char *str) {
    int i = 0;
    while (char c = str[i]) {
        if (c == '\'' || c == '\"') str[i] = '.';
        i++;
    }
}

static void export_tree_rek(TreeNode *tree, FILE *out, bool export_branchlens, bool dquot) {
    if (tree->is_leaf()) {
        export_tree_node_print_remove(tree->name);
        fprintf(out,
                dquot ? " \"%s\" " : " '%s' ",
                tree->name);
    }
    else {
        fputc('(', out);
        export_tree_rek(tree->get_leftson(),  out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f,", tree->leftlen);
        export_tree_rek(tree->get_rightson(), out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f",  tree->rightlen);
        fputc(')', out);

        if (tree->name) {
            export_tree_node_print_remove(tree->name);
            fprintf(out,
                    dquot ? "\"%s\"" : "'%s'",
                    tree->name);
        }
    }
}

// @@@ maybe replace TREE_export_tree by TREE_write_Newick (need some additional parameters: no comment, trifurcation)

GB_ERROR TREE_export_tree(GBDATA *, FILE *out, TreeNode *tree, bool triple_root, bool export_branchlens, bool dquot) {
    if (triple_root) {
        TreeNode *one, *two, *three;
        if (tree->is_leaf()) {
            return GB_export_error("Tree is too small, minimum 3 nodes");
        }
        if (tree->leftson->is_leaf() && tree->rightson->is_leaf()) {
            return GB_export_error("Tree is too small, minimum 3 nodes");
        }
        if (tree->leftson->is_leaf()) {
            one   = tree->get_leftson();
            two   = tree->get_rightson()->get_leftson();
            three = tree->get_rightson()->get_rightson();
        }
        else {
            one   = tree->get_leftson()->get_leftson();
            two   = tree->get_leftson()->get_rightson();
            three = tree->get_rightson();
        }
        fputc('(', out);
        export_tree_rek(one,   out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0); fputc(',', out);
        export_tree_rek(two,   out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0); fputc(',', out);
        export_tree_rek(three, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0);
        fputc(')', out);
    }
    else {
        export_tree_rek(tree, out, export_branchlens, dquot);
    }
    return NULp;
}


