// ============================================================= //
//                                                               //
//   File      : group_search.h                                  //
//   Purpose   : public interface of group search                //
//                                                               //
//   Coded by Ralf Westram (coder@reallysoft.de) in April 2017   //
//   http://www.arb-home.de/                                     //
//                                                               //
// ============================================================= //

#ifndef GROUP_SEARCH_H
#define GROUP_SEARCH_H

#ifndef LAZY_H
#include <lazy.h>
#endif
#ifndef QUERY_EXPR_H
#include <query_expr.h>
#endif
#ifndef ARBDB_BASE_H
#include <arbdb_base.h>
#endif
#ifndef ARBTOOLS_H
#include <arbtools.h>
#endif
#ifndef ARB_ERROR_H
#include <arb_error.h>
#endif
#ifndef SMARTPTR_H
#include <smartptr.h>
#endif
#ifndef CB_H
#include <cb.h>
#endif
#ifndef _GLIBCXX_VECTOR
#include <vector>
#endif
#ifndef _GLIBCXX_SET
#include <set>
#endif
#ifndef _GLIBCXX_LIST
#include <list>
#endif
#ifndef _GLIBCXX_STRING
#include <string>
#endif

#define gs_assert(cond) arb_assert(cond)

class GroupSearchCommon;

enum GroupFoldingMode { // bit mask
    GFM_EXPAND        = 1,  // expand groups (otherwise collapse)
    GFM_TOGGLE        = 2,  // toggle state of groups (if set -> bit0 is ignored)
    GFM_RECURSE       = 4,  // apply to parents                            (use with GFM_EXPAND)
    GFM_PARENTS_ONLY  = 8,  // only apply to parents, not to listed groups (use with GFM_EXPAND)
    GFM_COLLAPSE_REST = 16, // collapse all other groups in each tree

    // convenience defs
    GFM_COLLAPSE           = 0,
    GFM_EXPANDREC          = GFM_EXPAND|GFM_RECURSE,
    GFM_EXPANDPARENTS      = GFM_EXPAND|GFM_RECURSE|GFM_PARENTS_ONLY,
    GFM_EXPANDREC_COLLREST = GFM_EXPANDREC|GFM_COLLAPSE_REST,
};

enum GroupMarkMode { // same values as ../TREEDISP/TreeCallbacks.cxx@MARK_MODE_LOWER_BITS
    GMM_UNMARK = 0,
    GMM_MARK   = 1,
    GMM_INVERT = 2,
};

enum CollectMode { INTERSECT, UNITE };

// ----------------------------------------
//      query expression related types

enum CriterionOperator { // @@@ rename
    CO_AND,
    CO_OR,
    CO_IGNORE,
};
enum CriterionMatch { // @@@ rename
    CM_MATCH,
    CM_MISMATCH,
};
enum CriterionType { // @@@ rename
    CT_NAME,          // groupname (STRING)
    CT_PARENT_DIRECT, // groupname of direct parent (STRING)
    CT_PARENT_ANY,    // groupname of any parent (STRING)
    CT_PARENT_ALL,    // groupname of all parents (STRING)
    CT_NESTING_LEVEL, // group-nesting-level (INT)
    CT_FOLDED,        // folded (INT)
    CT_SIZE,          // size of group = number of species (INT)
    CT_MARKED,        // number of marked species (INT)
    CT_MARKED_PC,     // percentage of marked species (FLOAT)
    CT_ZOMBIES,       // number of zombies (INT)
    CT_AID,           // average ingroup distance
    CT_KEELED,        // keeled state (INT)
};

// ------------------------
//      search results

struct ColumnWidths {
    // Stores widths (or maxvalues for integers) of all
    // FoundGroup-attributes that might get displayed in result list.

    int name;    // max. width of name
    int reason;  // max. width of hit_reason

    int max_nesting;    // max. value for nesting
    int max_size;       // max. value for size
    int max_marked;     // max. value for marked
    int max_marked_pc;  // max. value for marked%
    int max_cluster_id; // max. value of cluster id
    int max_aid;        // max. value in front of dot

    bool seen_keeled; // any keeled group found?

    ColumnWidths() :
        name(0),
        reason(0),
        max_nesting(0),
        max_size(0),
        max_marked(0),
        max_marked_pc(0),
        max_cluster_id(0),
        seen_keeled(false)
    {}

    static int max2width(const int& i) { // @@@ a bit expensive (better use specialized type for 'max_nesting' and other maximas)
        // calculate column width from maximum
        // (Note: fails if negative values are involved)
        return strlen(GBS_global_string("%i", i));
    }
    static int percent(int part, int all) {
        return int(part*100.0/all+0.5);
    }

    void track(int wName, int wReason, int nesting, int size, int marked, int clusID, double aid, bool keeled);
};

class QueriedGroups;

class FoundGroup {
    // stores/exports info about a group (as found by group search)
    // - usable w/o tree loaded
    // - (able to) know details (like nesting, number of marked species, ...)

    RefPtr<GBDATA> gb_group;

    ARB_ERROR set_folded(bool folded);
    ARB_ERROR set_overlap_folded(bool folded);
    int get_name_length() const;

    GBDATA *get_overlap_group() const {
        gs_assert(keeled.has_value()); // means: pointer 'gb_overlap_group' is valid
        return gb_overlap_group;
    }

protected:
    // optional values (needed to be "informed")
    // - set manually by derivate ('Candidate' in group_search.cxx@inform_group)
    // - knows_details() returns true if everything is properly set
    std::string   hit_reason;
    Lazy<int, -1> nesting;
    Lazy<int, -1> size;
    Lazy<int, -1> marked;

    LazyFloat<double> aid;              // average ingroup distance

    Lazy<int, -1>  keeled;
    RefPtr<GBDATA> gb_overlap_group;    // valid if 'keeled.has_value()'; NULp -> no overlapping group

    int clusterID;

public:
    explicit FoundGroup(GBDATA *gb_group_) :
        gb_group(gb_group_),
        gb_overlap_group(NULp),
        clusterID(0)
    {}

    GBDATA *get_pointer() const { return gb_group; }
    GBDATA *get_tree_data() const;

    const char *get_name() const;
    const char *get_tree_name() const;
    int get_tree_order() const;
    const std::string& get_hit_reason() const { return hit_reason; }

    void set_cluster_id(int id) { gs_assert(id>0); clusterID = id; }
    void forget_cluster_id() { clusterID = 0; }
    int get_cluster_id() const { return clusterID; }

    bool overlap_is_folded() const;
    bool is_folded() const;

    bool knows_details() const { // returns true if group is "informed"
        return
            !hit_reason.empty() &&
            nesting.has_value() &&
            size.has_value()    &&
            marked.has_value()  &&
            keeled.has_value()  &&
            aid.has_value();
    }
    void track_max_widths(ColumnWidths& widths) const;

    // getters for "informed" groups:
    int get_nesting()   const { gs_assert(knows_details()); return nesting; }
    int get_size()      const { gs_assert(knows_details()); return size; }
    int get_marked()    const { gs_assert(knows_details()); return marked; }
    int get_marked_pc() const { gs_assert(knows_details()); return ColumnWidths::percent(marked, size); }
    int get_keeled()    const { gs_assert(knows_details()); return keeled; }
    double get_aid()    const { gs_assert(knows_details()); return aid; }

    bool operator == (const FoundGroup& other) const {
        return gb_group == other.gb_group;
    }

    // modify DB:
    GB_ERROR  delete_from_DB();
    ARB_ERROR rename_by_ACI(const char *acisrt, const QueriedGroups& results, int hit_idx);
    ARB_ERROR change_folding(GroupFoldingMode mode);
};

class Candidate; // a derivate of FoundGroup
class GroupSearch;

typedef std::set<GBDATA*>                   GBDATAset;
typedef std::set<std::string>               SpeciesNames;
typedef std::vector<FoundGroup>             FoundGroupContainer;
typedef FoundGroupContainer::const_iterator FoundGroupCIter;
typedef FoundGroupContainer::iterator       FoundGroupIter;

enum GroupSortCriterion {
    GSC_NONE,      // special criterion (not allowed in SortCriteria)
    GSC_REVERSE,   // special criterion: modifies previous criterion; may occur multiple times
    GSC_NAME,
    GSC_TREENAME,
    GSC_TREEORDER, // as defined in tree-admin
    GSC_HIT_REASON,
    GSC_NESTING,
    GSC_SIZE,
    GSC_MARKED,
    GSC_MARKED_PC,
    GSC_CLUSTER,
    GSC_AID, // average ingroup distance
    GSC_KEELED,
};

typedef std::list<GroupSortCriterion> SortCriteria;

class QueriedGroups {
    FoundGroupContainer        found;
    RefPtr<const SortCriteria> sorted_by;

    mutable SmartPtr<ColumnWidths> widths;
    void invalidate_widths() const { widths.setNull(); }

public:
    QueriedGroups() : sorted_by(NULp) {}

    size_t size() const { return found.size(); }
    bool empty() const { return size() == 0; }

    void add_informed_group(const FoundGroup& group) {
        gs_assert(group.knows_details());
        found.push_back(group);
        invalidate_widths();
    }
    void add_candidate(const GroupSearch& group_search, Candidate& cand, const std::string& hit_reason);

    FoundGroupCIter begin() const { return found.begin(); }
    FoundGroupIter  begin()       { return found.begin(); }
    FoundGroupCIter end()   const { return found.end(); }
    FoundGroupIter  end()         { return found.end(); }

    const FoundGroup& operator[](size_t idx) const { gs_assert(idx<size()); return found.at(idx); }
    FoundGroup&       operator[](size_t idx)       { gs_assert(idx<size()); return found.at(idx); }

    bool erase_deleted(GroupSearchCommon *common);
    bool contains_changed(GroupSearchCommon *common) const;

    const ColumnWidths& get_column_widths() const;
    const char *get_group_display(const FoundGroup& g, bool show_tree_name) const;

    void sort_by(const SortCriteria& by);

    void remove_hit(size_t idx);
};

// --------------------------
//      DupCriteria

enum DupNameCriterionType {
    DNC_WHOLENAME, // match of whole groupname
    DNC_WORDWISE,  // wordwise match

    // maybe add something more fuzzy?
};
enum DupTreeCriterionType {
    DLC_SAME_TREE, // duplicates need to be in same tree
    DLC_DIFF_TREE, // duplicates need to be in different trees (but: cluster will be extended with groups from same tree)
    DLC_ANYWHERE,  // no restriction
};

typedef std::set<std::string> WordSet;

class DupCriteria;

// ---------------------
//      GroupSearch

DECLARE_CBTYPE_FVV_AND_BUILDERS(GroupSearchCallback, void, class GroupSearch*);   // generates makeGroupSearchCallback

typedef std::set<std::string> TreeNameSet;

enum GroupSearchMode {       // bit mask
    GSM_FORGET_EXISTING = 1, // otherwise keep existing results
    GSM_ADD             = 2, // otherwise remove
    GSM_MISMATCH        = 4, // otherwise target matching

    // convenience defs
    GSM_FIND   = GSM_FORGET_EXISTING|GSM_ADD,
    GSM_KEEP   = 0,
    GSM_REMOVE = GSM_KEEP|GSM_MISMATCH, // has to be XORed
    GSM_MATCH  = 0,
};

class GroupSearch : virtual Noncopyable {
    GBDATA                  *gb_main;
    SmartPtr<QueryExpr>      query_expr;
    TreeNameSet              trees_to_search;
    SmartPtr<QueriedGroups>  found;
    GroupSearchCallback      redisplay_cb;

    SortCriteria order;
    bool         sortedByOrder;

    SmartPtr<DupCriteria> dups;

    static GroupSearchCommon *common;

    void sort_results();
    GB_ERROR clusterDuplicates();

#if defined(UNIT_TESTS)
public:
#endif
    ARB_ERROR collectSpecies(const QueriedGroups& groups, CollectMode cmode, SpeciesNames& species);
public:
    GroupSearch(GBDATA *gb_main_, const GroupSearchCallback& redisplay_results_cb);
    ~GroupSearch();

    GBDATA *get_gb_main() const { return gb_main; }

    // -------------------------
    //      define search:

    void addQueryExpression(CriterionOperator op, CriterionType type, CriterionMatch mtype, const char *expression);
    void forgetQExpressions();

    void addSortCriterion(GroupSortCriterion gsc);
    void forgetSortCriteria() {
        order.clear();
        sortedByOrder = false;
    }

    void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, DupTreeCriterionType ttype, int min_cluster_size);
    void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const WordSet& ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size);
    void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const char    *ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size);
    void forgetDupCriteria();

    void setSearchRange(const TreeNameSet& trees_to_search_) { // empty -> search all trees
        trees_to_search = trees_to_search_;
    }

    // ----------------
    //      search

    void perform_search(GroupSearchMode mode);

    GBDATA *get_parent_group(GBDATA *gb_group) const;
    int calc_nesting_level(GBDATA *gb_group) const;

    // ------------------------
    //      handle results:

    const QueriedGroups& get_results();
    bool has_results() const { return found.isSet() && !found->empty(); }
    void remove_hit(size_t idx) { found->remove_hit(idx); }
    void forget_results() { found = new QueriedGroups; }

    // ----------------------------
    //      actions on results:

    GB_ERROR delete_group(size_t idx);
    GB_ERROR delete_found_groups();

    ARB_ERROR rename_group(size_t idx, const char *acisrt);
    ARB_ERROR rename_found_groups(const char *acisrt);

    ARB_ERROR fold_group(size_t idx, GroupFoldingMode mode);
    ARB_ERROR fold_found_groups(GroupFoldingMode mode);

    ARB_ERROR set_marks_in_group(size_t idx, GroupMarkMode mode);
    ARB_ERROR set_marks_in_found_groups(GroupMarkMode mode, CollectMode cmode);

#if defined(UNIT_TESTS)
    // inspectors:
    const SortCriteria& inspect_order() const { return order; }
    static GroupSearchCommon *get_common() { return common; }
#endif

    // internal (dont use):
    void refresh_results_after_DBchanges();
};

char *GS_calc_resulting_groupname(GBDATA *gb_main, const QueriedGroups& queried, int hit_idx, const char *input_name, const char *acisrt, ARB_ERROR& error);


#else
#error group_search.h included twice
#endif // GROUP_SEARCH_H
