// =============================================================== //
//                                                                 //
//   File      : NT_species_set.h                                  //
//   Purpose   :                                                   //
//                                                                 //
//   Institute of Microbiology (Technical University Munich)       //
//   http://www.arb-home.de/                                       //
//                                                                 //
// =============================================================== //

#ifndef NT_SPECIES_SET_H
#define NT_SPECIES_SET_H

#ifndef NT_TREE_CMP_H
#include "NT_tree_cmp.h"
#endif
#ifndef ARBTOOLS_H
#include <arbtools.h>
#endif
#ifndef AP_TREE_HXX
#include <AP_Tree.hxx>
#endif

class RSpecSet;
class TSpecSet;
class arb_progress;

// @@@ improve compare logic:
// - species sets (and bitstrings) should only contain species that occur in both trees
// - species that occur only in RSpecSet-tree shall be stored in RSpecSet (like done in TSpecSet::unfound_species_count)
// - a small penalty shall be assigned (as done for TSpecSet)

class SpecSetRegistry : virtual Noncopyable {
    long species_counter; // number of species added to hash
    long nspecies;
    long nsets; // number of RSpecSet added to 'sets'

    RSpecSet **sets;
    int        set_bits[256];

    GroupMatchScorer  scorer;
    arb_progress     *progress;
    GB_HASH          *species_hash; // contains [1..N]
    unsigned char    *tmp_bitstring;

    int max_nsets() const { return leafs_2_innerNodes(nspecies, ROOTED); }

    void dump_bitstring(const char *tag, unsigned char *bs);

    void add(const char *species_name); // max nspecies
    void add(RSpecSet *rset);           // max 2 * nspecies

    double search_and_remember_best_match_and_log_errors(const TSpecSet *tset, FILE *log);

#if defined(UNIT_TESTS)
    friend void TEST_species_sets();
#endif

public:
    SpecSetRegistry(long nspecies_, arb_progress *progress_, const GroupMatchScorer& scorer_);
    ~SpecSetRegistry();
    void finish(GB_ERROR& error); // call before destruction to retrieve errors

    long bitstring_bytes() const { return (nspecies-1)/8 + 1; }
    long bitstring_longs() const { return (bitstring_bytes()-1)/sizeof(long) + 1; }

    unsigned char *allocate_bitstring() const { return ARB_calloc<unsigned char>(bitstring_longs()*sizeof(long)); }

    long get_species_index(const char *species_name) const { return GBS_read_hash(species_hash, species_name); }
    RSpecSet *registerTree(AP_tree *node);

    RSpecSet *search_best_match(const TSpecSet *tset, GroupPenalty& min_penalty);
    TSpecSet *find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info);
    GB_ERROR  write_node_information(FILE *log, bool delete_old_nodes, GroupsToTransfer what, const char *aci);

    void setScorer(const GroupMatchScorer& newScorer) { scorer = newScorer; }
};


class SpecSet : virtual Noncopyable {
protected:
    // SpecSet should only be used by derived classes

    int known_members; // number of registered members

    void init(AP_tree *nodei, const SpecSetRegistry& ssr);

    SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name);           // create from species..
    SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const SpecSet *l, const SpecSet *r); // ..or from two subsets
    ~SpecSet();

public:
    // @@@ make member private
    unsigned char *bitstring;
    AP_tree       *set_node; // node in tree (from which SpecSet was initialized)

    bool is_leaf_set() const { return set_node && set_node->is_leaf(); } // @@@ might be wrong for zombies
    int get_known_members() const { return known_members; }
};

class RSpecSet : public SpecSet { // derived from Noncopyable
    // set registered in SpecSetRegistry
    AP_tree      *best_node;  // node in other tree
    GroupPenalty  best_match; // result of matching 'this' versus TSpecSet of best_node

public:
    RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name);             // create from species..
    RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const RSpecSet *l, const RSpecSet *r); // ..or from two subsets

    void storeBetterMatch(const GroupPenalty& match, AP_tree *matched_node) {
        // if 'this' was detected as best match for any TSpecSet of other (not registered) tree,
        // -> store match in best_match + node of TSpecSet in best_node:

        nt_assert(!best_match.betterThan(match)); // avoid overwriting with worse match

        best_match = match;
        best_node  = matched_node;
    }

    int size() const { return known_members; } // only contains known members by definition
    const GroupPenalty& bestMatch() const { return best_match; }
    AP_tree* matchedNode() const { return best_node; }
};

class TSpecSet : public SpecSet { // derived from Noncopyable
    // set tested against sets in registry

    int unfound_species_count; // species missing in SpecSetRegistry
public:
    TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name);             // create from species..
    TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const TSpecSet *l, const TSpecSet *r); // ..or from two subsets

    int size() const { return known_members + unfound_species_count; }
    int get_unknown_members() const { return unfound_species_count; }
};

#else
#error NT_species_set.h included twice
#endif // NT_SPECIES_SET_H
