// =============================================================== //
//                                                                 //
//   File      : admatch.cxx                                       //
//   Purpose   : functions related to string match/replace         //
//                                                                 //
//   ReCoded for POSIX ERE                                         //
//            by Ralf Westram (coder@reallysoft.de) in April 2009  //
//   Institute of Microbiology (Technical University Munich)       //
//   http://www.arb-home.de/                                       //
//                                                                 //
// =============================================================== //

#include "gb_local.h"

#include "gb_aci_impl.h"

#include <arb_strbuf.h>
#include <arb_match.h>

#include <cctype>

using namespace GBL_IMPL;

// ------------------------
//      string matcher

enum string_matcher_type {
    SM_INVALID = -1,
    SM_ANY     = 0,             // matches any string
    SM_WILDCARDED,              // match with wildcards (GBS_string_matches)
    SM_REGEXPR,                 // match using regexpr
};

struct GBS_string_matcher {
    string_matcher_type  type;
    GB_CASE              case_flag;
    char                *wildexpr;
    GBS_regex           *regexpr;
};

GBS_string_matcher *GBS_compile_matcher(const char *search_expr, GB_CASE case_flag) {
    /* returns a valid string matcher (to be used with GBS_string_matches_regexp)
     * or NULp (in which case an error was exported)
     */

    GBS_string_matcher *matcher = ARB_alloc<GBS_string_matcher>(1);
    GB_ERROR            error   = NULp;

    matcher->type      = SM_INVALID;
    matcher->case_flag = case_flag;
    matcher->wildexpr  = NULp;
    matcher->regexpr   = NULp;

    if (search_expr[0] == '/') {
        const char *end = strchr(search_expr, 0)-1;
        if (end>search_expr && end[0] == '/') {
            GB_CASE     expr_attached_case;
            const char *unwrapped_expr = GBS_unwrap_regexpr(search_expr, &expr_attached_case, &error);

            if (unwrapped_expr) {
                if (expr_attached_case != GB_MIND_CASE) error = "format '/../i' not allowed here";
                else {
                    matcher->regexpr = GBS_compile_regexpr(unwrapped_expr, case_flag, &error);
                    if (matcher->regexpr) {
                        matcher->type = SM_REGEXPR;
                    }
                }
            }
        }
    }

    if (!matcher->regexpr && !error) {
        if (strcmp(search_expr, "*") == 0) {
            matcher->type = SM_ANY;
        }
        else {
            matcher->type     = SM_WILDCARDED;
            matcher->wildexpr = ARB_strdup(search_expr);
        }
    }

    if (matcher->type == SM_INVALID) {
        error = GBS_global_string("Failed to create GBS_string_matcher from '%s'", search_expr);
    }

    if (error) {
        GBS_free_matcher(matcher);
        matcher = NULp;
        GB_export_error(error);
    }
    return matcher;
}

void GBS_free_matcher(GBS_string_matcher *matcher) {
    free(matcher->wildexpr);
    if (matcher->regexpr) GBS_free_regexpr(matcher->regexpr);
    free(matcher);
}

// -------------------------
//      wildcard search

GB_CSTR GBS_find_string(GB_CSTR cont, GB_CSTR substr, int match_mode) {
    /* search a substring in another string
     * match_mode == 0     -> exact match
     * match_mode == 1     -> a==A
     * match_mode == 2     -> a==a && a==?
     * match_mode == else  -> a==A && a==?
     */
    const char *p1, *p2;
    char        b;

    switch (match_mode) {

        case 0: // exact match
            for (p1 = cont, p2 = substr; *p1;) {
                if (!(b = *p2)) {
                    return (char *)cont;
                }
                else {
                    if (b == *p1) {
                        p1++;
                        p2++;
                    }
                    else {
                        p2 = substr;
                        p1 = (++cont);
                    }
                }
            }
            if (!*p2)   return (char *)cont;
            break;

        case 1: // a==A
            for (p1 = cont, p2 = substr; *p1;) {
                if (!(b = *p2)) {
                    return (char *)cont;
                }
                else {
                    if (toupper(*p1) == toupper(b)) {
                        p1++;
                        p2++;
                    }
                    else {
                        p2 = substr;
                        p1 = (++cont);
                    }
                }
            }
            if (!*p2) return (char *)cont;
            break;
        case 2: // a==a && a==?
            for (p1 = cont, p2 = substr; *p1;) {
                if (!(b = *p2)) {
                    return (char *)cont;
                }
                else {
                    if (b == *p1 || (b=='?')) {
                        p1++;
                        p2++;
                    }
                    else {
                        p2 = substr;
                        p1 = (++cont);
                    }
                }
            }
            if (!*p2) return (char *)cont;
            break;

        default: // a==A && a==?
            for (p1 = cont, p2 = substr; *p1;) {
                if (!(b = *p2)) {
                    return (char *)cont;
                }
                else {
                    if (toupper(*p1) == toupper(b) || (b=='?')) {
                        p1++;
                        p2++;
                    }
                    else {
                        p2 = substr;
                        p1 = (++cont);
                    }
                }
            }
            if (!*p2) return (char *)cont;
            break;
    }
    return NULp;
}

bool GBS_string_matches(const char *str, const char *expr, GB_CASE case_sens) {
    /* Wildcards in 'expr' string:
     *      ?   one character
     *      *   several characters
     *
     * if 'case_sens' == GB_IGNORE_CASE -> change all letters to uppercase
     *
     * returns true if strings are equal, false otherwise
     */

    const char *ps = str;
    const char *pe = expr;

    while (1) {
        char s = *ps;
        char e = *pe;

        if (e == '*') {
            if (!pe[1]) { // '*' at end of expression
                break; // always match (even "nothing")
            }

            const char *nextStar = ARB_strchrnul(pe+1, '*');
            int         len      = nextStar-pe-1; // part after '*' (and before EOS or next '*')
            if (!nextStar[0]) { // no 2nd '*' found
                // -> tail of string (if there is any) has to match
                int psl = strlen(ps); // length of tail
                if (psl<len) {    // str-tail shorter than expr-tail
                    return false; // -> match impossible
                }

                ps += psl-len; // skip over characters expected to match the '*' (=goto str-tail)
                ++pe;          // goto expr-tail
            }
            else { // found 2nd '*' -> search for string part between stars
                {
                    char *part = ARB_strpartdup(pe+1, nextStar-1);
                    ps         = GBS_find_string(ps, part, 2+(case_sens == GB_IGNORE_CASE)); // match with '?' wildcard
                    free(part);
                }

                if (!ps) {
                    return false;
                }
                ps += len;
                pe = nextStar;
            }
            continue;
        }

        if (!s) {
            return !e;
        }
        if (s != e) {
            if (e != '?') {
                if (!e) {
                    return !s;
                }
                if (case_sens == GB_IGNORE_CASE) {
                    s = toupper(s);
                    e = toupper(e);
                    if (s != e) {
                        return false;
                    }
                }
                else {
                    return false;
                }
            }
        }
        ps++;
        pe++;
    }
    return true;
}

bool GBS_string_matches_regexp(const char *str, const GBS_string_matcher *expr) {
    /* Wildcard or regular expression match
     * Returns true if match
     *
     * Use GBS_compile_matcher() and GBS_free_matcher() to maintain 'expr'
     */
    bool matches = false;

    switch (expr->type) {
        case SM_ANY: {
            matches = true;
            break;
        }
        case SM_WILDCARDED: {
            matches = GBS_string_matches(str, expr->wildexpr, expr->case_flag);
            break;
        }
        case SM_REGEXPR: {
            matches = GBS_regmatch_compiled(str, expr->regexpr, NULp);
            break;
        }
        case SM_INVALID: {
            gb_assert(0);
            break;
        }
    }

    return matches;
}

// -----------------------------------
//      Search replace tool (SRT)

#define GBS_SET   ((char)1)
#define GBS_SEP   ((char)2)
#define GBS_MWILD ((char)3)
#define GBS_WILD  ((char)4)

__ATTR__USERESULT static GB_ERROR gbs_build_replace_string(GBS_strstruct& out,
                                                           char *replaceBy, // will be modified!
                                                           const char       *sWildcards, long sWildMax,
                                                           const char*const *mWildcards, long mWildMax,
                                                           const GBL_call_env& callEnv)
{
    int sWildAuto = 0; // count plain occurrences of '?' in replace string (ie. w/o number behind)
    int mWildAuto = 0; // same for '*'

    GBDATA *gb_container = callEnv.get_item_ref();

    char *p = replaceBy;
    char  c;
    while ((c=*(p++))) {
        switch (c) {
            case GBS_MWILD:
            case GBS_WILD: {
                char d = *(p++);
                if (d=='(') { // "*(..)" expressions
                    char *closingParen = search_matching_parenthesis(p);

                    if (!closingParen) {
                        return GBS_global_string("Unbalanced parenthesis in '%s'", p-1);
                    }

                    // found reference: "*(gbd)"
                    int separator = 0;
                    *closingParen = 0;
                    char *psym = strpbrk(p, "#|:");
                    if (psym) {
                        separator = *psym;
                        *psym = 0;
                    }

                    GBDATA *gb_entry = NULp;
                    if (*p) { // key was specified
                        if (!gb_container) {
                            return GBS_global_string("can't read key '%s' (called w/o database item)", p);
                        }
                        if (!GB_is_container(gb_container)) {
                            if (ARB_strBeginsWith(p, "../")) { // redirect search via parent
                                p += 3;
                                gb_container = GB_get_father(gb_container);
                            }
                            else {
                                return GBS_global_string("can't read key '%s' (DB item is no container)", p);
                            }
                        }

                        gb_entry = GB_search(gb_container, p, GB_FIND);
                        callEnv.track_field_access(p);

                        if (!gb_entry && GB_have_error()) {
                            return GB_await_error();
                        }
                    }
                    else {
                        gb_entry = gb_container;
                    }

                    if (psym) *psym = separator;

                    char *entry = (gb_entry && gb_entry != gb_container)
                        ? GB_read_as_string(gb_entry)
                        : ARB_strdup("");

                    if (entry) {
                        char *h;
                        switch (separator) {
                            case ':':
                                h = GBS_string_eval_in_env(entry, psym+1, callEnv);
                                if (!h) {
                                    free(entry);
                                    return GB_await_error();
                                }

                                out.cat(h);
                                free(h);
                                break;

                            case '|':
                                h = GB_command_interpreter_in_env(entry, psym+1, callEnv);
                                if (!h) {
                                    free(entry);
                                    return GB_await_error();
                                }

                                out.cat(h);
                                free(h);
                                break;

                            case '#':
                                if (!entry[0]) { // missing field or empty content
                                    out.cat(psym+1);
                                    break;
                                }
                                // fall-through
                            default:
                                out.cat(entry);
                                break;
                        }
                        free(entry);
                    }
                    *closingParen = ')';
                    p = closingParen+1;
                }
                else {
                    int  wildcard_num       = d - '1';
                    bool followed_by_number = wildcard_num>=0 && wildcard_num<=9; // @@@ in fact this will also accept ':'

                    if (c == GBS_WILD) {
                        if (!followed_by_number) { // char behind wildcard is not in [1-9]
                            --p; // "put back" that character
                            wildcard_num = sWildAuto++;
                        }
                        if (wildcard_num>=sWildMax) {
                            out.put('?');
                        }
                        else {
                            out.put(sWildcards[wildcard_num]);
                        }
                    }
                    else {
                        if (!followed_by_number) { // char behind wildcard is not in [1-9]
                            --p; // "put back" that character
                            wildcard_num = mWildAuto++;
                        }
                        if (wildcard_num>=mWildMax) {
                            out.put('*');
                        }
                        else {
                            out.cat(mWildcards[wildcard_num]);
                        }
                    }
                }
                break;
            }
            default:
                out.put(c);
                break;
        }
    }
    return NULp;
}

static char *gbs_compress_command(const char *com) {
    /* Prepare SRT.
     *
     * Replaces all
     *   '=' by GBS_SET
     *   ':' by GBS_SEP
     *   '?' by GBS_WILD if followed by a number or '?'
     *   '*' by GBS_MWILD  or '('
     * \ is the escape character
     */

    char       *result = ARB_strdup(com);
    char       *d      = result;
    const char *s      = result;
    char ch;

    while ((ch = *(s++))) {
        switch (ch) {
            case '=':   *(d++) = GBS_SET; break;
            case ':':   *(d++) = GBS_SEP; break;
            case '?':   *(d++) = GBS_WILD; break;
            case '*':   *(d++) = GBS_MWILD; break;
            case '\\':
                ch = *(s++); if (!ch) { s--; break; };
                switch (ch) {
                    case 'n':   *(d++) = '\n'; break;
                    case 't':   *(d++) = '\t'; break;
                    case '0':   *(d++) = '\0'; break;
                    default:    *(d++) = ch; break;
                }
                break;

            default: *(d++) = ch; break;
        }
    }
    *d = 0;
    return result;
}

// AISC_MKPT_PROMOTE: class GBL_call_env;

char *GBS_string_eval_in_env(const char *insource, const char *icommand, const GBL_call_env& callEnv) {
     /* GBS_string_eval_in_env replaces substrings in source (implements SRT)
      * Syntax: command = "oliver=olli:peter=peti"
      *
      * Returns a heapcopy of result of replacement.
      *
      *         * is a wildcard for any number of characters
      *         ? is a wildcard for exactly one character
      *
      * To reference the parts matched by wildcards on the left side of the '=' use '?' and '*',
      * to reference in a particular order use
      *         *1 to reference to the first occurrence of *
      *         *2 ----------"-------- second ------"-------
      *         ...
      *         *9 ----------"-------- ninth -------"-------
      *
      * If the first and last characters of the search string are no '*' wildcards,
      * then the replace is repeated as many times as possible.
      *
      * '\' is the escape character: e.g. \n is newline; '\\' is '\'; '\=' is '='; ....
      *
      * If the passed GBL_call_env refers to a database entry (which has to be of type GB_DB, i.e. has to be a container),
      * fields of that container may be inserted using
      *
      *         *(arb_field)          is the value of the containers child entry 'arb_field'
      *         *(arb_field#string)   value of the child entry 'arb_field' or 'string' (if that entry does not exist)
      *         *(arb_field\:SRT)     runs SRT recursively on the value of the child entry 'arb_field'
      *         *([arb_field]|ACI)    runs the ACI command interpreter on the value of the child entry 'arb_field' (or on an empty string)
      *
      * If an error occurs it returns NULp - in this case the error-message gets exported!
      *
      * Notes:
      * - global interpreter (SRT+ACI+REG) is provided by GB_command_interpreter_in_env()
      * - REG is provided by GBS_regreplace(), GBS_regmatch() and GBS_regmatch_compiled()
      * - ACI is only provided via GB_command_interpreter_in_env()
      */
    if (!icommand || !icommand[0]) {
        return ARB_strdup(insource);
    }

    if (traceACI) {
        print_trace(GBS_global_string("SR: in='%s' cmd='%s':\n", insource, icommand));
    }
    LocallyModify<int> inc(traceIndent, traceIndent+1);

    char *command = gbs_compress_command(icommand);

    // copy insource (to allow to modify it)
    size_t        inlen = strlen(insource);
    GBS_strstruct in(inlen+1);
    in.ncat(insource, inlen);

    GBS_strstruct out(inlen+500);

    GB_ERROR  error = NULp;
    char     *next_subcmd;
    for (char *subcmd = command; subcmd; subcmd = next_subcmd) { // loop over sub-commands
        // search next subcommand (=pos behind next colon):
        next_subcmd = strchr(subcmd, GBS_SEP);
        if (next_subcmd) *(next_subcmd++) = 0;

        if (!subcmd[0]) continue; // empty subcommand -> do nothing

        // search for replace string:
        char *replaceBy = strchr(subcmd+1, GBS_SET);
        if (!replaceBy) {
            error = GBS_global_string("SRT ERROR: no '=' found in command '%s' (position > %zi)", icommand, subcmd-command+1);
            break;
        }
        *(replaceBy++) = 0;

        GB_CSTR not_yet_copied = in.get_data(); // point into 'in' string (to not-yet-copied part)
        out.erase();

        if (in.empty() && subcmd[0] == GBS_MWILD && subcmd[1] == 0) {
            // plain '*' shall also match an empty input string -> handle manually here
            const char *empty = "";
            error             = gbs_build_replace_string(out, replaceBy, NULp, 0, &empty, 1, callEnv);
        }
        else {
            char  sWildcard[40];  // character  which matched vs one '?'
            char *mWildcard[10];  // substrings which matched vs one '*'
            long  sWildSeen = 0;  // number of '?' seen (on left side on subcommand)
            long  mWildSeen = 0;  // number of '*' seen (on left side on subcommand)

            bool match_failed = false;
            for (GB_CSTR source = not_yet_copied; *source; ) { // loop over string
                gb_assert(!match_failed);

                char    *search      = subcmd;
                GB_CSTR  start_match = NULp; // start of string that matches a wildcard (none yet)

                char     c;
                while (!match_failed && (c = *(search++))) { // match expression vs. string
                    switch (c) {
                        case GBS_MWILD: {
                            if (!start_match) start_match = source;

                            char *start_of_wildcard = search;
                            if (!(c = *(search++))) {       // last character is a '*' wildcard -> expression matched
                                mWildcard[mWildSeen++] = ARB_strdup(source);
                                source                     = strchr(source, 0); // jump to EOS
                                --search;
                                break; // (effectively does exit while-loop)
                            }
                            // @@@ 'c' read in above if-condition is ignored if non-zero (got tests)

                            while ((c=*(search++)) && c!=GBS_MWILD && c!=GBS_WILD) ; // search the next wildcardstring

                            search--; // back one character
                            *search = 0;

                            char    what_wild_card = c;
                            GB_CSTR p              = GBS_find_string(source, start_of_wildcard, 0);

                            if (!p) match_failed = true; // string behind wildcard does not appear in input -> no match
                            else {
                                mWildcard[mWildSeen++] = ARB_strpartdup(source, p-1);
                                source = p + strlen(start_of_wildcard);
                                *search = what_wild_card;
                            }
                            break;
                        }
                        case GBS_WILD:
                            if (!source[0]) match_failed = true; // '?' does not match "nothing" -> no match
                            else {
                                if (!start_match) start_match = source;
                                sWildcard[sWildSeen++]        = *(source++);
                            }
                            break;

                        default:
                            if (start_match) {
                                if (c != *(source++)) match_failed = true; // mismatch after '?' or after last '*'
                            }
                            else {
                                char *buf1 = search-1;

                                while ((c=*(search++)) && c != GBS_MWILD && c!=GBS_WILD) ;   // search the next wildcardstring

                                search--;                                                    // back one character
                                *search = 0;

                                char    what_wild_card = c;
                                GB_CSTR p              = GBS_find_string(source, buf1, 0);
                                if (!p) {
                                    // string infrontof wildcard (or EOS) not found -> no match
                                    match_failed = true;
                                }
                                else  {
                                    start_match = p;
                                    source      = p + strlen(buf1);
                                    *search     = what_wild_card;
                                }

                            }
                            break;
                    }
                }

                if (!match_failed) {
                    /* now we got
                     *
                     * in:                 GBS_strstruct containing entire input string
                     * source:             pointer to end of match   (inside 'in')
                     * start_match:        pointer to start of match (inside 'in')
                     * not_yet_copied:     pointer to the not-copied part of the input string
                     * replaceBy:          the replace string
                     */

                    // now look for the replace string
                    out.ncat(not_yet_copied, start_match-not_yet_copied);                                                                // concat part before the match
                    error          = gbs_build_replace_string(out, replaceBy, sWildcard, sWildSeen, mWildcard, mWildSeen, callEnv);      // execute SRT command
                    not_yet_copied = source;
                }

                for (long i = 0; i < mWildSeen; i++) {
                    freenull(mWildcard[i]);
                }
                sWildSeen = 0;
                mWildSeen = 0;

                if (error || match_failed) break;
            }
        }

        // Note: reached when left side expression didn't match input string
        // (also reached when done with current sub-expression)
        if (error) break;

        out.cat(not_yet_copied); // cat the rest of the input

        if (traceACI) {
            print_trace(GBS_global_string("'%s' -> '%s'\n", in.get_data(), out.get_data()));
        }

        in.swap_content(out);
    }
    free(command);
    if (error) {
        GB_export_error(error);
        return NULp;
    }
    return in.release();
}

char *GBS_string_eval(const char *insource, const char *icommand) {
    GBL_env      env(NULp, NULp);
    GBL_call_env callEnv(NULp, env);

    return GBS_string_eval_in_env(insource, icommand, callEnv);
}

// --------------------------------------------------------------------------------

#ifdef UNIT_TESTS
#ifndef TEST_UNIT_H
#include <test_unit.h>
#endif

#include <arb_strarray.h>

static char *tokenMatchResults(const char *expr, GB_CASE caseDef, const char *tokenStr) {
    ConstStrArray token;
    GBT_split_string(token,tokenStr, ';');

    GBS_string_matcher *matcher = GBS_compile_matcher(expr, caseDef);
    for (int t = 0; token[t]; ++t) {
        bool matched = GBS_string_matches_regexp(token[t], matcher);
        token.replace(t, matched ? "1" : "0");
    }
    GBS_free_matcher(matcher);
    return GBT_join_strings(token, 0);
}


#define TEST_MATCH_TOKENS(expr,caseDef,tokenStr,expected) do{           \
        char *results = tokenMatchResults(expr, caseDef, tokenStr);     \
        TEST_EXPECT_EQUAL(results, expected);                           \
        free(results);                                                  \
    }while(0)

#define TEST_MATCH_TOKENS__BROKEN(expr,caseDef,tokenStr,expected,got) do{       \
        char *results = tokenMatchResults(expr, caseDef, tokenStr);             \
        TEST_EXPECT_EQUAL__BROKEN(results, expected, got);                      \
        free(results);                                                          \
    }while(0)

void TEST_matcher() {
    TEST_MATCH_TOKENS("???",  GB_MIND_CASE, "ab;abc;abcd", "010");  // only matches 2nd string
    TEST_MATCH_TOKENS("???*", GB_MIND_CASE, "ab;abc;abcd", "011");  // match at least 3 characters
    TEST_MATCH_TOKENS("?*",   GB_MIND_CASE, ";a;ab;abc",   "0111"); // match at least 1 character

    TEST_MATCH_TOKENS("a*c", GB_MIND_CASE,   "ca;ab;abc;abC;ABC;abcd", "001000");
    TEST_MATCH_TOKENS("a*c", GB_IGNORE_CASE, "ca;ab;abc;abC;ABC;abcd", "001110");

    TEST_MATCH_TOKENS("a*c*a",   GB_MIND_CASE, "aca;aaacccaaa;a--c--a;acac;acaca;aba", "111010");
    TEST_MATCH_TOKENS("a*c?d*c", GB_MIND_CASE, "acxdc;a--cxd--c;acxdcxdcxdc;acdcdcdc", "1110");

    TEST_MATCH_TOKENS("???*.c",    GB_MIND_CASE, "ab.c;abc.c;abcd.c",                   "011");
    TEST_MATCH_TOKENS("?b.*.c",    GB_MIND_CASE, "ab.c;ab..c;ab.x.c",                   "011");
    TEST_MATCH_TOKENS("rere*erer", GB_MIND_CASE, "rerer;rererer;rerererer;rererererer", "0011");
}

TEST_PUBLISH(TEST_matcher);

#endif // UNIT_TESTS

// --------------------------------------------------------------------------------

