// =============================================================== //
//                                                                 //
//   File      : NT_tree_cmp.cxx                                   //
//   Purpose   :                                                   //
//                                                                 //
//   Institute of Microbiology (Technical University Munich)       //
//   http://www.arb-home.de/                                       //
//                                                                 //
// =============================================================== //

#include "NT_species_set.h"
#include <aw_msg.hxx>
#include <gb_aci.h>
#include <gb_aci_impl.h>
#include <arb_progress.h>
#include <arb_defs.h>
#include <arb_msg_nospam.h>
#include <string>
#include <climits>

using namespace std;

SpecSetRegistry::SpecSetRegistry(long nspecies_, arb_progress *progress_, const GroupMatchScorer& scorer_) :
    species_counter(0),
    nspecies(nspecies_),
    nsets(0),
    sets(ARB_calloc<RSpecSet*>(max_nsets())),
    scorer(scorer_),
    progress(progress_),
    species_hash(GBS_create_hash(nspecies, GB_IGNORE_CASE))
{
    for (int i=0; i<256; i++) {
        int j = i;
        int count = 0;
        while (j) {             // count bits
            if (j&1) count ++;
            j = j>>1;
        }
        set_bits[i] = count;
    }
    tmp_bitstring = allocate_bitstring();
}

SpecSetRegistry::~SpecSetRegistry() {
    for (int i=0; i<nsets; i++) delete sets[i];
    free(tmp_bitstring);
    free(sets);
    GBS_free_hash(species_hash);
}

void SpecSetRegistry::add(const char *species_name) {
    if (GBS_read_hash(species_hash, species_name)) {
        aw_message(GBS_global_string("Warning: Species '%s' is found more than once in tree", species_name));
    }
    else {
        GBS_write_hash(species_hash, species_name, ++species_counter);
        nt_assert(species_counter<=nspecies); // more species added than planned
        nt_assert(species_counter<=nspecies);
    }
}

void SpecSetRegistry::add(RSpecSet *rset) {
    nt_assert(rset->size()>1); // do NOT register leafs (only inner nodes allowed)
    nt_assert(nsets<max_nsets());
    sets[nsets++] = rset;
}

void SpecSetRegistry::dump_bitstring(const char *tag, unsigned char *bs) {
    fprintf(stderr, "%s: ", tag);
    const long bbytes = bitstring_bytes();
    for (long i = 0; i<bbytes; ++i) {
        int val = bs[i];
        for (int b = 0; b<8; ++b) {
            fputc(val&128 ? 'X' : '-', stderr);
            val<<=1;
        }
    }
    fputc('\n', stderr);
}

GroupPenalty GroupMatchScorer::calcPenalty(long removed, long added, long commonSpecies) {
    // - can be calculated from removed/added/commonSpecies
    // - shall be reported (result param?)

    if (commonSpecies) {
        long   oldGroupSize  = removed+commonSpecies;
        long   newGroupSize  = added+commonSpecies;
        double ingroupRatio  = 1.0 - removed/double(oldGroupSize); // = percent of old members in new group
        double outgroupRatio = added/double(newGroupSize);         // = percent of non-group-members in new group

        if (insideLimits(ingroupRatio, outgroupRatio)) {
            double penalty =
                // abs. penalty:
                removed*ingroupPep +
                added*outgroupPep +
                // rel. penalty:
                (1-ingroupRatio)*ingroupInvRelPen +
                outgroupRatio*outgroupRelPen;

            return GroupPenalty(penalty, ingroupRatio, outgroupRatio, oldGroupSize);
        }
    }
    return GroupPenalty::NoMatch();
}

GroupPenalty GroupMatchScorer::matchGroups(const TSpecSet& sourceSet, const RSpecSet& targetSet, long commonSpecies, long overallSpecies) {
    /*! calculate score of group match (tests normal and keeled insertion, reports better)
     * @param sourceSet       tested set (= named group in source tree (if moving groups))
     * @param targetSet       registered set (= subtree of target tree (if moving groups))
     * @param commonSpecies   number of species common in sourceSet and targetSet
     * @param overallSpecies  number of registered species (in SpecSetRegistry)
     * @return better GroupPenalty
     */

    long removed = sourceSet.get_known_members() - commonSpecies;
    long added   = targetSet.get_known_members() - commonSpecies;

    GroupPenalty match = calcPenalty(removed, added, commonSpecies);
    nt_assert(implicated(match.doesMatch(), (removed+added)<overallSpecies));

    long targetSize_keeled    = overallSpecies - targetSet.get_known_members();
    long commonSpecies_keeled = sourceSet.get_known_members() - commonSpecies;
    long removed_keeled       = commonSpecies;
    long added_keeled         = targetSize_keeled - commonSpecies_keeled;

    GroupPenalty match_keeled  = calcPenalty(removed_keeled, added_keeled, commonSpecies_keeled);
    match_keeled.mark_as_keeled();
    if (match_keeled.doesMatch()) { // do not add to NoMatch!
        match_keeled.addPenalty(keelPenalty);
    }
    nt_assert(implicated(match_keeled.doesMatch(), (removed_keeled+added_keeled)<overallSpecies));

    if (match_keeled.betterThan(match)) {
        // @@@ if this happens, the match semantic is inverted: if moving a group -> group should be keeled (by caller)!
        // related to #512 + #451
        return match_keeled;
    }

    return match;
}

double GroupMatchScorer::calcUnknownMembersPenalty(const TSpecSet& sourceSet) const {
    return sourceSet.get_unknown_members() * unfoundPep; // penalty for unknown members in source tree
}

GB_ERROR GroupMatchScorer::check_validity() const {
    if (ingroupPep == 0.0 && ingroupInvRelPen == 0.0) {
        return "one ingroup penalty has to be different from zero";
    }
    if (outgroupPep == 0.0 && outgroupRelPen == 0.0) {
        return "one outgroup penalty has to be different from zero";
    }

    if (ingroupPep<0.0 || outgroupPep<0.0 || ingroupInvRelPen<0.0 || outgroupRelPen<0.0) {
        return "invalid negative in/outgroup penalty";
    }

    if (!ingroup.isValid() || !outgroup.isValid()) {
        return "invalid limits";
    }

    return NULp;
}

RSpecSet *SpecSetRegistry::search_best_match(const TSpecSet *tset, GroupPenalty& min_penalty) {
    // returns best match for 'tset' (against all registered sets).
    //         NULp if no match found.
    // sets 'min_penalty' to penalty found for returned set.

    RSpecSet *bset = NULp;
    min_penalty    = GroupPenalty::NoMatch();

    const long bbytes = bitstring_bytes();
    const long blongs = bitstring_longs();

    unsigned char * const tbs    = tset->bitstring;
    long * const          tbl    = (long*)tbs;
    long * const          tmp_bl = (long*)tmp_bitstring;

    for (long i=nsets-1; i>=0; i--) {
        long same = 0; // amount of known members present in both sets

        RSpecSet *rset = sets[i];

        unsigned char * const rbs = rset->bitstring;
        long * const          rbl = (long*)rbs;

        for (long j = 0; j<blongs; ++j) { // LOOP_VECTORIZED
            tmp_bl[j] = tbl[j] & rbl[j];
        }
        for (long j = 0; j<bbytes; ++j) { // (Note: failed to optimize: gcc wont vectorize double-indexed access)
            same += set_bits[tmp_bitstring[j]];
        }

        GroupPenalty penalty = scorer.matchGroups(*tset, *rset, same, nspecies);
        if (penalty.betterThan(min_penalty)) {
            min_penalty = penalty;
            bset        = rset;
        }
        progress->inc();
    }

    return bset;
}

void GroupPenalty::registerUnfound(const GroupMatchScorer& scorer, const TSpecSet& tset) {
    addPenalty(scorer.calcUnknownMembersPenalty(tset));
    nt_assert(unknown == -1); // registerUnfound shall be called exactly once!
    unknown = tset.get_unknown_members();
}

double SpecSetRegistry::search_and_remember_best_match_and_log_errors(const TSpecSet *tset, FILE *log) {
    /*! search for best matching registered subtree.
     * @param tset    tested set
     * @param log     if != NULp -> log move information (not allowed when "comparing topo"!).
     * @return the penalty for best matching subtree (plus a small penalty for unregistered species)
     *         or -1 if best matching subtree is "too bad" (i.e. if no matching subtree was found).
     *
     * if return value >= 0 -> stores best_match & best_node (inside best match for 'tset' found in other tree)
     *
     * When moving node info,     'tset' represents a (named!) subtree of the source tree.
     * When comparing topologies, 'tset' represents a subtree of the destination tree (trees are swapped in .@NTREE_move_tree_info)
     */

    GroupPenalty  match;
    RSpecSet     *bset = search_best_match(tset, match);

    if (!bset) {
        // no good match found.
        // atm this happens for small groups (e.g. with 2 species) + a special placement of these species in tree.
        // in the future 'finding no match' will get normal behavior.
        return -1;
    }

    AP_tree *earlierFoundGroup = bset->matchedNode(); // group previously considered for same destination node
    AP_tree *currentGroup      = tset->set_node;

    match.registerUnfound(scorer, *tset);
    if (match.betterThan(bset->bestMatch())) {
        nt_assert(!bset->bestMatch().isPerfectMatch()); // shall never supersede optimally placed group!
        if (earlierFoundGroup && log) {
            nt_assert(earlierFoundGroup->name && currentGroup->name);
            fprintf(log, "Group '%s' skipped (got superseded by group '%s'; same best target positions)\n",
                    earlierFoundGroup->name,
                    currentGroup->name);
        }

        bset->storeBetterMatch(match, currentGroup);
    }
    else {
        nt_assert(!match.isPerfectMatch()); // found two groups with optimal placement. should be impossible by design! can be caused by invalid penalties
        if (log) {
            nt_assert(earlierFoundGroup->name && currentGroup->name);
            fprintf(log, "Group '%s' skipped (does NOT supersede group '%s'; same best target positions)\n",
                    currentGroup->name,
                    earlierFoundGroup->name);
        }
    }

    return match.get_penalty();
}

void SpecSet::init(AP_tree *nodei, const SpecSetRegistry& ssr) {
    bitstring     = ssr.allocate_bitstring();
    known_members = 0;
    set_node      = nodei;
}

SpecSet::SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) {
    init(nodei, ssr);

    long species_index = ssr.get_species_index(species_name);
    if (species_index) {
        nt_assert(species_index>0);
        --species_index; // [1..N] -> [0..N-1]
        nt_assert((species_index/8) < ssr.bitstring_bytes());
        bitstring[species_index/8] |= 1 << (species_index % 8);
        known_members               = 1;
    }
}

SpecSet::SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const SpecSet *l, const SpecSet *r) {
    init(nodei, ssr);

    const long *lbs = (const long *)l->bitstring;
    const long *rbs = (const long *)r->bitstring;
    long       *dbs = (long *)bitstring;

    for (long j = ssr.bitstring_longs()-1; j>=0; j--) { // LOOP_VECTORIZED=2
        dbs[j] = lbs[j] | rbs[j];
    }
    known_members = l->known_members + r->known_members;
}

SpecSet::~SpecSet() {
    free(bitstring);
}

RSpecSet::RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) :
    SpecSet(nodei, ssr, species_name),
    best_node(NULp)
{}
RSpecSet::RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const RSpecSet *l, const RSpecSet *r) :
    SpecSet(nodei, ssr, l, r),
    best_node(NULp)
{}
TSpecSet::TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) :
    SpecSet(nodei, ssr, species_name),
    unfound_species_count(1-known_members)
{
}
TSpecSet::TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const TSpecSet *l, const TSpecSet *r) :
    SpecSet(nodei, ssr, l, r),
    unfound_species_count(l->unfound_species_count + r->unfound_species_count)
{
}

RSpecSet *SpecSetRegistry::registerTree(AP_tree *node) {
    RSpecSet *ss;
    // Warning: confusing resource handling:
    // - leafs are returned "NOT owned by anybody"
    // - inner nodes are added to and owned by this->sets

    if (node->is_leaf()) {
        this->add(node->name);
        ss = new RSpecSet(node, *this, node->name);
        nt_assert(ss->is_leaf_set());
    }
    else {
        RSpecSet *ls = registerTree(node->get_leftson());
        RSpecSet *rs = registerTree(node->get_rightson());

        ss = new RSpecSet(node, *this, ls, rs);
        this->add(ss);
        nt_assert(!ss->is_leaf_set());

        if (ls->is_leaf_set()) delete ls;
        if (rs->is_leaf_set()) delete rs;
    }
    return ss;
}

TSpecSet *SpecSetRegistry::find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info) {
    /* Go through all nodes of subtree 'node' (part of the source tree if copying info)
     * and search for the best matching registered node (in this->sets; which belong to dest tree if copying info).
     *
     * [Note: If comparing topologies, source- and dest-trees were exchanged by caller.]
     *
     * Named groups (if copying info)  resp. all nodes (if comparing topologies)
     * get compared versus all registered nodes and the best matching node is detected.
     * The compared node is stored inside the best matching node.
     */

    nt_assert(contradicted(log, compare_node_info));

    TSpecSet *ss = NULp;
    if (node->is_leaf()) {
        ss = new TSpecSet(node, *this, node->name);
    }
    else {
        TSpecSet *ls =      find_best_matches_info(node->get_leftson(),  log, compare_node_info);
        TSpecSet *rs = ls ? find_best_matches_info(node->get_rightson(), log, compare_node_info) : NULp;

        if (rs) {
            ss = new TSpecSet(node, *this, ls, rs);
            if (compare_node_info) {
                nt_assert(!log); // does not work / not allowed

                double penalty  = search_and_remember_best_match_and_log_errors(ss, log);
                int    ipenalty = int(penalty);

                // the #-sign is important (otherwise TREE_write_Newick will not work correctly; interference with bootstrap values; refer to #787 + #767)
                if      (ipenalty>0) node->use_as_remark(GBS_global_string_copy("# %i", ipenalty));
                else if (ipenalty<0) node->set_remark("# no match");
                else                 node->remove_remark();
            }
            else {
                if (node->name) {
                    double penalty = search_and_remember_best_match_and_log_errors(ss, log);
                    if (penalty<0) {
                        nt_assert(log);
                        fprintf(log, "Group '%s' doesn't fit to any destination subtree.\n", node->name);
                    }
                }
            }
        }
        delete rs;
        delete ls;
    }

    if (ss && progress->aborted()) {
        delete ss;
        ss = NULp; // abort recursion after user-abort
    }
    return ss;
}

// -----------------------------
//      local ACI extension

class GroupXfer_callenv : public GBL_call_env {
    RefPtr<const char>  name;
    const GroupPenalty& match;

    int oldGroupSize;
    int newGroupSize;

public:
    GroupXfer_callenv(const GBL_env& env_, const char *name_, const GroupPenalty& match_, int oldGroupSize_, int newGroupSize_) :
        GBL_call_env(NULp, env_),
        name(name_),
        match(match_),
        oldGroupSize(oldGroupSize_),
        newGroupSize(newGroupSize_)
    {}

    const char *get_group_name() const { return name; }
    const GroupPenalty& get_match() const { return match; }
    int get_old_group_size() const { return oldGroupSize; }
    int get_new_group_size() const { return newGroupSize; }
};

inline const GroupXfer_callenv& custom_gx_env(GBL_command_arguments *args) { return DOWNCAST_REFERENCE(const GroupXfer_callenv, args->get_callEnv()); }
inline const GroupPenalty& custom_match(GBL_command_arguments *args) { return custom_gx_env(args).get_match(); }

using namespace GBL_IMPL;

#define IMPL_FORMATVALUE_CMD(args,fmt,value)    \
    COMMAND_DROPS_INPUT_STREAMS(args);          \
    GB_ERROR error = check_no_parameter(args);  \
    if (!error) FORMAT_2_OUT(args, fmt, value); \
    return error

static GB_ERROR gxl_groupname(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%s", custom_gx_env(args).get_group_name()); }
static GB_ERROR gxl_penalty(GBL_command_arguments *args)   { IMPL_FORMATVALUE_CMD(args, "%f", custom_match(args).get_penalty()); }
static GB_ERROR gxl_ingroup(GBL_command_arguments *args)   { IMPL_FORMATVALUE_CMD(args, "%.1f%%", custom_match(args).get_ingroup_ratio()*100.0); }
static GB_ERROR gxl_outgroup(GBL_command_arguments *args)  { IMPL_FORMATVALUE_CMD(args, "%.1f%%", custom_match(args).get_outgroup_ratio()*100.0); }
static GB_ERROR gxl_oldsize(GBL_command_arguments *args)   { IMPL_FORMATVALUE_CMD(args, "%i", custom_gx_env(args).get_old_group_size()); }
static GB_ERROR gxl_newsize(GBL_command_arguments *args)   { IMPL_FORMATVALUE_CMD(args, "%i", custom_gx_env(args).get_new_group_size()); }
static GB_ERROR gxl_unknown(GBL_command_arguments *args)   { IMPL_FORMATVALUE_CMD(args, "%i", custom_match(args).get_unknown()); }
static GB_ERROR gxl_keeled(GBL_command_arguments *args)    { IMPL_FORMATVALUE_CMD(args, "%i", int(custom_match(args).shouldHaveBeenKeeled())); }

static GBL_command_definition groupXfer_command_table[] = {
    { "groupname", gxl_groupname },
    { "penalty",   gxl_penalty },
    { "ingroup",   gxl_ingroup },
    { "outgroup",  gxl_outgroup },
    { "oldsize",   gxl_oldsize },
    { "newsize",   gxl_newsize },
    { "unknown",   gxl_unknown },
    { "keeled",    gxl_keeled },
    { NULp, NULp }
};

static const GBL_command_lookup_table& get_GroupXfer_customized_ACI_commands() {
    static GBL_custom_command_lookup_table clt(groupXfer_command_table,
                                               ARRAY_ELEMS(groupXfer_command_table)-1,
                                               ACI_get_standard_commands());
    return clt;
}



GB_ERROR SpecSetRegistry::write_node_information(FILE *log, bool delete_old_nodes, GroupsToTransfer what, const char *aci) {
    GB_ERROR error = NULp;
    MessageSpamFilter suppress("problematic group names");

    if (log) fputs("\nDetailed group changes:\n\n", log);

    GBL_env env(NULp, // have no item here
                NULp, // use no treename
                get_GroupXfer_customized_ACI_commands());

    for (long j=this->nsets-1; j>=0 && !error; j--) {
        RSpecSet * const cset = this->sets[j];

        char    *old_group_name  = NULp; // !=NULp -> old node has been deleted
        AP_tree *sourceNode      = cset->matchedNode();
        AP_tree *targetNode      = cset->set_node;
        bool     insert_new_node = sourceNode && sourceNode->name;

        if (what == XFER_GROUPS_WITH_MARKED && insert_new_node) {
            if (!targetNode->contains_marked_species()) insert_new_node = false; // @@@ this tests the wrong tree (the target tree)! (#792)
        }

        if (targetNode->gb_node && (delete_old_nodes || insert_new_node)) { // There is already a node, delete old
            if (!targetNode->name) {
                GBDATA *gb_name = GB_entry(targetNode->gb_node, "group_name");
                if (gb_name) {
                    targetNode->name = GB_read_string(gb_name);
                }
                else {
                    targetNode->name = ARB_strdup("<gb_node w/o name>"); // @@@ happens whenever any other node info is present (e.g. linewidth or angles; node gets deleted when delete_old_nodes==true); #793
                }
            }

            old_group_name = ARB_strdup(targetNode->name); // store old_group_name to rename new inserted node

            error = GB_delete(targetNode->gb_node);
            if (!error) {
                targetNode->gb_node = NULp;
                freenull(targetNode->name);
            }
        }

        if (!error) {
            if (insert_new_node) {
                targetNode->gb_node = GB_create_container(targetNode->get_tree_root()->get_gb_tree(), "node");
                error               = GB_copy_dropProtectMarksAndTempstate(targetNode->gb_node, sourceNode->gb_node); // @@@ this copies more info than group-name (e.g. linewidth or angles; unwanted!); #793
                // @@@ need a function which copies/moves/deletes group-related info (between two nodes). currently affected fields: 'group_name', 'grouped', 'keeled'; #793

                if (!error) {
                    GBDATA *gb_name = GB_entry(targetNode->gb_node, "group_name");
                    nt_assert(gb_name);
                    if (gb_name) {
                        const GroupPenalty& match = cset->bestMatch();

                        int oldGroupsize = match.get_groupsize();
                        int newGroupsize = cset->size();

                        char *new_group_name = NULp; // new wanted groupname
                        {
                            char *source_group_name = GB_read_string(gb_name); // existing groupname (in group-container copied from source tree)
                            if (aci) {
                                GroupXfer_callenv callEnv(env, source_group_name, match, oldGroupsize, newGroupsize);
                                new_group_name = GB_command_interpreter_in_env("", aci, callEnv);
                                if (!new_group_name) error = GB_await_error();
                            }
                            else {
                                reassign(new_group_name, source_group_name);
                            }
                            free(source_group_name);
                        }

                        if (!error) {
                            nt_assert(new_group_name);
                            if (old_group_name) { // overwrite existing group
                                if (!delete_old_nodes) {
                                    if (strcmp(old_group_name, new_group_name) != 0) { // old and new name differ
                                        char *combined_name = GBS_global_string_copy("%s [was: %s]", new_group_name, old_group_name);
                                        if (log) fprintf(log, "Destination group '%s' overwritten by '%s'", old_group_name, combined_name);
                                        reassign(new_group_name, combined_name);
                                    }
                                    else { // name matches existing
                                        if (log) fprintf(log, "Group '%s' remains unchanged", old_group_name);
                                    }
                                }
                                else {
                                    if (log) {
                                        if (strcmp(old_group_name, new_group_name) == 0) {
                                            fprintf(log, "Group '%s' remains unchanged", old_group_name);
                                        }
                                        else {
                                            fprintf(log, "Destination group '%s' overwritten by '%s'", old_group_name, new_group_name);
                                        }
                                    }
                                }
                            }
                            else { // no group at destination -> insert
                                if (log) fprintf(log, "Group '%s' inserted", new_group_name);
                            }
                        }

                        if (!error) error = GBT_write_group_name(gb_name, new_group_name, true); // always write wanted target name

                        if (log) {
                            if (error) {
                                fprintf(log, " (Failed! Reason: %s)\n", error);
                            }
                            else if (match.isPerfectMatch()) {
                                fputc('\n', log);
                            }
                            else {
                                fprintf(log, " (not placed optimal; penalty=%f; in=%.1f%%/%i; out=%.1f%%/%i%s)\n",
                                        match.get_penalty(),
                                        match.get_ingroup_ratio() *100.0, oldGroupsize,
                                        match.get_outgroup_ratio()*100.0, newGroupsize,
                                        match.shouldHaveBeenKeeled() ? "; WARNING: matched inverse set, i.e. rest of tree!" : "");
                            }
                        }

                        free(new_group_name);
                    }
                }
            }
            else {
                nt_assert(implicated(old_group_name, delete_old_nodes));
                if (old_group_name && log) {
                    fprintf(log, "Destination group '%s' removed\n", old_group_name);
                }
            }
        }
        free(old_group_name);
    }
    return error;
}

void SpecSetRegistry::finish(GB_ERROR& error) {
    if (!error) error = progress->error_if_aborted();
    progress->done();
}

GB_ERROR NTREE_move_tree_info(GBDATA *gb_main, const char *tree_source, const char *tree_dest, const char *log_file, GroupTransferMode mode, GroupsToTransfer what, const GroupMatchScorer& scorer, const char *aci) {
    GB_ERROR  error = NULp;
    FILE     *log   = NULp;

    if (!contradicted(mode == COMPARE_TOPOLOGY, log_file)) {
        GBK_terminatef("invalid use of function NTREE_move_tree_info: mode=%i log_file=%s\n", int(mode), log_file);
    }

    if (aci && !aci[0]) aci = NULp; // handle "empty ACI" like "no ACI"

    if (mode == COMPARE_TOPOLOGY) {
        // info is written into 'tree_source' in .@find_best_matches_info
        // (but we want to modify destination tree - like 'mode node info' does)
        std::swap(tree_source, tree_dest);
    }

    if (log_file) {
        nt_assert(mode == REMOVE_EXISTING_GROUPS || mode == KEEP_OLD_NAMES);
        log = fopen(log_file, "w");
        if (log) {
            const char *transferredGroups = what == XFER_GROUPS_WITH_MARKED ? "groups containing marked" : "all";
            const char *overwriteMode     = NULp;

            switch (mode) {
                case COMPARE_TOPOLOGY: nt_assert(0); break;
                case REMOVE_EXISTING_GROUPS: overwriteMode = "remove existing groups"; break;
                case KEEP_OLD_NAMES:         overwriteMode = "keep old names";         break;
            }

            nt_assert(transferredGroups && overwriteMode);

            fprintf(log, // start of log
                    "LOGFILE: transfer group information\n"
                    "\n"
                    "     Source tree '%s'\n"
                    "Destination tree '%s'\n"
                    "\n"
                    "transferred groups: %s\n"
                    "    overwrite mode: %s\n"
                    "\n",
                    tree_source,
                    tree_dest,
                    transferredGroups,
                    overwriteMode);
        }
        else {
            error = GB_IO_error("writing", log_file);
        }
    }

    GB_begin_transaction(gb_main);

    AP_tree_root  rsource(new AliView(gb_main), NULp, false, NULp);
    AP_tree_root  rdest  (new AliView(gb_main), NULp, false, NULp);
    AP_tree_root& rsave = (mode == COMPARE_TOPOLOGY) ? rsource : rdest;
    arb_progress progress(mode == COMPARE_TOPOLOGY ? "Compare topologies" : (mode == REMOVE_EXISTING_GROUPS ? "Copy group information" : "Add group information"));

    if (!error) error = scorer.check_validity();
    if (!error) error = rsource.loadFromDB(tree_source);
    if (!error) error = rsource.linkToDB(NULp, NULp);
    if (!error) {
        error             = rdest.loadFromDB(tree_dest);
        if (!error) error = rdest.linkToDB(NULp, NULp);
        if (!error) {
            AP_tree *source = rsource.get_root_node();
            AP_tree *dest   = rdest.get_root_node();

            int dest_leafs  = dest->count_leafs();
            int dest_nodes  = leafs_2_nodes(dest_leafs, ROOTED);
            int dest_inodes = dest_nodes-dest_leafs; // inner nodes

            int source_leafs  = source->count_leafs();
            int source_nodes  = leafs_2_nodes(source_leafs, ROOTED);
            int source_inodes = source_nodes-source_leafs; // inner nodes

            size_t compared_nodes = mode == COMPARE_TOPOLOGY ? source_inodes : source->count_clades();
            size_t progress_steps = (compared_nodes + 2) * dest_inodes; // 2 for searching both sides of root

            arb_progress compare_progress(progress_steps);
            compare_progress.subtitle("Register topology");

            {
                SpecSetRegistry ssr(dest_leafs, &compare_progress, scorer);

                ssr.registerTree(dest);

                if (source_leafs < 3) error = GB_export_errorf("tree '%s' has less than 3 species", tree_source);
                else {
                    compare_progress.subtitle("Match subtrees");

                    TSpecSet *root_tsetl =              ssr.find_best_matches_info(source->get_leftson(),  log, mode == COMPARE_TOPOLOGY);
                    TSpecSet *root_tsetr = root_tsetl ? ssr.find_best_matches_info(source->get_rightson(), log, mode == COMPARE_TOPOLOGY) : NULp;

                    if (root_tsetr) {
                        if (mode != COMPARE_TOPOLOGY) {
                            compare_progress.subtitle("Write group information");
                            error = ssr.write_node_information(log, mode == REMOVE_EXISTING_GROUPS, what, aci);
                        }

                        if (!error) {
                            GroupPenalty dummy;

                            ssr.setScorer(GroupMatchScorer()); // otherwise may find no matches
                            RSpecSet *new_root_rsetl = ssr.search_best_match(root_tsetl, dummy);
                            RSpecSet *new_root_rsetr = ssr.search_best_match(root_tsetr, dummy);

                            nt_assert(new_root_rsetl && new_root_rsetr); // should always report matches (even really bad ones!)

                            AP_tree *new_rootl = new_root_rsetl->set_node;
                            AP_tree *new_rootr = new_root_rsetr->set_node;

                            // naive attempt to set root of target tree
                            // @@@ setting root wont work very well..
                            // - if these two nodes are NOT adjacent (esp. if 'new_rootr' is less "good" than 'new_rootl')
                            // - probably modifies topology even if not necessary; see ad_trees.cxx@NAIVE_ROOTING
                            new_rootl->set_root();
                            new_rootr->set_root();

                            compare_progress.subtitle("Save trees");

                            AP_tree *saved_root = (mode == COMPARE_TOPOLOGY) ? source : new_rootr->get_root_node();
                            error = GBT_overwrite_tree(rsave.get_gb_tree(), saved_root);

                            if (!error) {
                                char *entry;
                                if (mode == COMPARE_TOPOLOGY) {
                                    entry = GBS_global_string_copy("Compared topology with %s", tree_dest); // Note: the modified tree is 'tree_source'!
                                }
                                else {
                                    const char *copiedOrAdded = mode == REMOVE_EXISTING_GROUPS ? "Copied" : "Added";

                                    entry = GBS_global_string_copy("%s node info %sfrom %s",
                                                                   copiedOrAdded,
                                                                   what == XFER_GROUPS_WITH_MARKED ? "of marked " : "",
                                                                   tree_source);
                                }
                                GBT_log_to_tree_remark(rsave.get_gb_tree(), entry, true);
                                free(entry);
                            }
                        }
                    }

                    delete root_tsetl;
                    delete root_tsetr;
                }

                ssr.finish(error);
            }
            if (error) compare_progress.done();
        }
    }

    if (log) {
        if (error) fprintf(log, "\nError: %s\n", error);       // write error to log as well
        else fputs("[done]\n", log);
        fclose(log);
    }

    return GB_end_transaction(gb_main, error);
}

// --------------------------------------------------------------------------------

#ifdef UNIT_TESTS
#ifndef TEST_UNIT_H
#include <test_unit.h>
#endif

void TEST_species_sets() {
    // SpecSetRegistry ssr(0, NULp); // does fatal error - fine

    GroupMatchScorer defaultScorer;
    {
        SpecSetRegistry s1(1, NULp, defaultScorer);
        TEST_EXPECT_EQUAL(s1.bitstring_bytes(), 1);
        TEST_EXPECT_EQUAL(s1.bitstring_longs(), 1);

        s1.add("one");
        // s1.add("too much"); // fails assertion; fine
        TSpecSet t1(NULp, s1, "one");
    }
    {
        SpecSetRegistry s8(8, NULp, defaultScorer);
        TEST_EXPECT_EQUAL(s8.bitstring_bytes(), 1);
        TEST_EXPECT_EQUAL(s8.bitstring_longs(), 1);

        for (int i = 1; i<=8; ++i) {
            s8.add(GBS_global_string("%i", i));
        }
        // s8.add("too much"); // fails assertion; fine
        for (int i = 1; i<=8; ++i) {
            TSpecSet t(NULp, s8, GBS_global_string("%i", i));
        }
    }
    {
        SpecSetRegistry s9(9, NULp, defaultScorer);
        TEST_EXPECT_EQUAL(s9.bitstring_bytes(), 2);
        TEST_EXPECT_EQUAL(s9.bitstring_longs(), 1);

        for (int i = 1; i<=9; ++i) {
            s9.add(GBS_global_string("%i", i));
        }
        // s9.add("too much"); // fails assertion; fine
        for (int i = 1; i<=9; ++i) {
            TSpecSet t(NULp, s9, GBS_global_string("%i", i));
        }
    }

    {
        SpecSetRegistry s64(64, NULp, defaultScorer);
        TEST_EXPECT_EQUAL(s64.bitstring_bytes(), 8);
        TEST_EXPECT_EQUAL(s64.bitstring_longs(), 1);

        for (int i = 1; i<=64; ++i) {
            s64.add(GBS_global_string("%i", i));
        }
        // s64.add("too much"); // fails assertion; fine
        for (int i = 1; i<=64; ++i) {
            TSpecSet t(NULp, s64, GBS_global_string("%i", i));
        }
    }
    {
        SpecSetRegistry s65(65, NULp, defaultScorer);
        TEST_EXPECT_EQUAL(s65.bitstring_bytes(), 9);
        TEST_EXPECT_EQUAL(s65.bitstring_longs(), 2);

        for (int i = 1; i<=65; ++i) {
            s65.add(GBS_global_string("%i", i));
        }
        // s65.add("too much"); // fails assertion; fine
        for (int i = 1; i<=65; ++i) {
            TSpecSet t(NULp, s65, GBS_global_string("%i", i));
        }
    }
}

#endif // UNIT_TESTS

// --------------------------------------------------------------------------------
