#!/usr/bin/perl
# =============================================================== #
#                                                                 #
#   File      : import_from_table.pl                              #
#   Purpose   : import data from comma- or tab-separated tables   #
#                                                                 #
#   Coded by Ralf Westram (coder@reallysoft.de) in January 2011   #
#   Institute of Microbiology (Technical University Munich)       #
#   http://www.arb-home.de/                                       #
#                                                                 #
# =============================================================== #

use strict;
use warnings;

BEGIN {
  if (not exists $ENV{'ARBHOME'}) { die "Environment variable \$ARBHOME has to be defined"; }
  my $arbhome = $ENV{'ARBHOME'};
  push @INC, "$arbhome/lib";
  push @INC, "$arbhome/PERL_SCRIPTS/lib";
  1;
}

use ARB;
use tools;

sub usage() {
  print(
        "Usage: perl import_from_table.pl --match CF --write CF [options] datafile [database outdb]\n".
        "\n".
        "Imports one column from the calc-sheet 'datafile' into an ARB database.\n".
        "\n".
        "'datafile'     should be a list of tab-separated values.\n".
        "'database'     if a name is specified, the modified DB will be saved as 'outdb'.\n".
        "               Otherwise the database running in ARB will be modified.\n".
        "\n".
        "--match CF     CF:=column,field\n".
        "               Define a 'column' in the 'datafile' and a species-'field' in the database.\n".
        "               For each row the content of the 'column' has to match the content of the\b".
        "               'field' for exactly one species in the 'database'.\n".
        "               Useful fields are 'acc' and 'name'.\n".
        "--write CF     CF:=column,field\n".
        "               For each row in 'datafile' write the content of 'column' into the\n".
        "               'field' of the species matched via --match\n".
        "--mode MM      MM:=the match mode\n".
        "               Use '=' for plain string comparison (traditional behavior)\n".
        "               Also supports wordwise comparison - see http://help.arb-home.de/agde_import_calc.html\n".
        "\n".
        "Available 'options':\n".
        "--csv            expect 'datafile' is a list of comma-separated values (default: TAB-separated)\n".
        "--overwrite      overwrite 'field' specified via --write (default: abort if 'field' exists)\n".
        "--skip-unknown   silently skip rows that don't match any species (default: abort if no match found)\n".
        "--skip-empty     silently skip rows where match column is empty (default: error if occurs multiple)\n".
        "--marked-only    only write to marked species (default: all species)\n".
        "--mark           mark species to which field has been imported (unmarks rest)\n".
        "--as-integer     use INTEGER database-type for field (default: STRING)\n"
       );

}

sub max($$) { my ($a,$b) = @_; return $a<$b ? $b : $a; }

sub parse_CF($$) {
  my ($switch,$CF) = @_;
  my ($column,$field);
  eval {
    if ($CF =~ /^([^,]+),(.*)$/o) {
      ($column,$field) = ($1,$2);
      my $int_column = int($column);
      if ($int_column<1) { die "'$column' is not a valid column\n"; }
      my $error = ARB::check_key($field);
      if (defined $error) { die "'$field' is not a valid DB field name\n"; }
    }
    else { die "',' expected in '$CF'\n"; }
  };
  if ($@) { die "in '$switch $CF': $@\n"; }
  return ($column,$field);
}

# ---------------------------------------- [customized column parser]

my $reg_plain_normal  = undef;
my $reg_plain_lastCol = undef;
my $reg_quoted_normal  = undef;
my $reg_quoted_lastCol = undef;

sub set_separator($) {
  my ($sep) = @_;

  my $plain_rex  = "^([^$sep\"\r\n]*)"; # plain, probably empty column
  my $quoted_rex = "^\"((\"\"|[^\"]*)*)\""; # quoted column (may contain "" inside quotes)

  $reg_plain_lastCol  = qr/$plain_rex\n/;
  $reg_quoted_lastCol = qr/$quoted_rex\n/;

  $reg_plain_normal  = qr/$plain_rex$sep/;
  $reg_quoted_normal = qr/$quoted_rex$sep/;
}

sub parse_column(\$) {
  my ($line_r) = @_;
  my ($quoted,$lastCol) = (-1,-1);

  if    ($$line_r =~ $reg_plain_normal)   { $quoted = 0; $lastCol = 0; }
  elsif ($$line_r =~ $reg_quoted_normal)  { $quoted = 1; $lastCol = 0; }
  elsif ($$line_r =~ $reg_plain_lastCol)  { $quoted = 0; $lastCol = 1; }
  elsif ($$line_r =~ $reg_quoted_lastCol) { $quoted = 1; $lastCol = 1; }

  if ($quoted == -1) {
    return (undef, undef, undef);
  }

  my $col = $1;
  $$line_r = $';
  if ($quoted == 1) {
    if ($col =~ /\"/) {
      $col =~ s/\"\"/\"/og;
    }
  }
  return ($col, $quoted, $lastCol);
}

# ---------------------------------------- [custom CLI flags]

my ($matchcolumn,$matchfield);
my ($writecolumn,$writefield);

my $skip_unknown = 0;
my $skip_empty = 0;
my $overwrite = 0;
my $marked_only = 0;
my $mark = 0;
my $help_requested = 0;
my $int_type = 0;

# The next variables either
# - are undef if field/cell shall not get splitted, or
# - contain regexpr to be used with split().
my $reg_split_field = undef;
my $reg_split_cell = undef;
my $matchmode = undef;
my $ignore_dup_words = 0;

# ---------------------------------------- [custom CLI flags end]

sub main() {
  my $datafile;
  my $database     = ':';
  my $database_out = undef;

  my @no_option = ();

  set_separator("\t");

  eval {
    while (scalar(@ARGV)>0) {
      my $arg = shift @ARGV;
      if    ($arg eq '--match') { ($matchcolumn,$matchfield) = parse_CF($arg, shift @ARGV); }
      elsif ($arg eq '--write') { ($writecolumn,$writefield) = parse_CF($arg, shift @ARGV); }
      elsif ($arg eq '--mode') { $matchmode = shift @ARGV; }
      elsif ($arg eq '--csv') { set_separator(','); }
      elsif ($arg eq '--overwrite') { $overwrite = 1; }
      elsif ($arg eq '--skip-unknown') { $skip_unknown = 1; }
      elsif ($arg eq '--skip-empty') { $skip_empty = 1; }
      elsif ($arg eq '--marked-only') { $marked_only = 1; }
      elsif ($arg eq '--mark') { $mark = 1; }
      elsif ($arg eq '--as-integer') { $int_type = 1; }
      elsif ($arg eq '--help') { $help_requested = 1; return; } # only returns from eval!
      else { push @no_option, $arg; }
    }

    foreach (@no_option) {
      if (/^--/) {
        die "Unknown switch '$_'\n";
      }
    }

    $datafile = shift @no_option;
    if (not defined $datafile) { die "Missing argument 'datafile'\n"; }

    if (scalar(@no_option)) {
      $database     = shift @no_option;
      $database_out = shift @no_option;
      if (not defined $database_out) { die "Missing argument 'outdb'\n"; }
    }

    if (scalar(@no_option)) { die "Unexpected arguments: ".join(',', @no_option)."\n"; }

    if (not defined $matchcolumn) { die "Mandatory option '--match CF' missing\n"; }
    if (not defined $writecolumn) { die "Mandatory option '--write CF' missing\n"; }

    if (not defined $matchmode) { $matchmode = '='; }
    if ($matchmode ne '=') {
      eval {
        my $mm = $matchmode;
        while ($mm ne '') {
          if ($mm =~ /^([cf])([wd=])/o) {
            my ($target, $mode) = ($1, $2);
            $mm = $';
            my $target_reg = $target eq 'c' ? \$reg_split_cell : \$reg_split_field;
            if ($mode eq '=') {
              $$target_reg = undef;
            }
            else {
              if ($mm =~ /^(.)/o) {
                my $sep = $1;
                $mm = $';
                my $regexpr = qr/$sep/;
                $$target_reg = $regexpr;
                if ($mode eq 'd') {
                  if ($target ne 'c') { die "invalid use of '$mode' after '$target'\n"; }
                  $ignore_dup_words = 1;
                }
              }
              else {
                die "expected a separator char behind '$target$mode'\n";
              }
            }
          }
          else {
            die "unexpected content seen at '$mm'\n";
          }
        }
      };
      if ($@) {
        chomp $@;
        die "could not handle matchmode '$matchmode' (Reason: $@)";
      }
    }
  };

  if ($@) {
      die "\nError: $@(use --help to show usage)\n ";
  }
  if ($help_requested) {
    usage();
  }
  else {
    work($datafile, $database, $database_out);
  }
}

sub trim($) {
  my ($str) = @_;
  $str =~ s/^[\s]+//go;
  $str =~ s/[\s]+$//go;
  return $str;
}

sub split_wordwise($$) {
  my ($value, $reg_split) = @_;
  my @splitted = split $reg_split, $value;
  my %words = map {
    my $word = trim($_);
    if ($word eq '') { ; }
    else { $word => 1; }
  } @splitted;
  return keys %words;
}

sub work($$$) {
  my ($datafile, $database, $database_out) = @_;

  my $inform_ARB = 0; # [previously was defined globally]
  my $gb_main = ARB::open($database, "rw");
  if ($database eq ':') {
    if ($gb_main) { $inform_ARB = 1; }
    else { expectError('db connect (no running ARB)'); }
  }
  else {
    $gb_main || expectError('db connect (wrong \'database\' specified?)');
  }

  my %write_table = (); # key=matchvalue, value=writevalue
  my %source_line = (); # key=matchvalue, value=source-linenumber (corrected by joined_lines)
  my %cell_word   = (); # key=word from matchvalue (trimmed), value=matchvalue

  my $joined_lines = 0;
  my $no_content = 0;

  eval {
    if (not -f $datafile) { die "No such file '$datafile'\n"; }
    open(TABLE,'<'.$datafile) || die "can't open '$datafile' (Reason: $!)\n";

    eval {
      my $min_elems = max($matchcolumn,$writecolumn);
      my $line;
      my $current_line; # line number

      while (defined($line=<TABLE>)) {
        eval {
          my @row = ();

          my $done = 0;
          while ($done == 0) {
            my ($column, $wasQuoted, $wasLastCol) = parse_column($line);
            # print "column='$column' wasQuoted=$wasQuoted wasLastCol=$wasLastCol\n";
            if (defined $column) {
              push @row, $column;

              if ($wasLastCol==1 and $line =~ /^[\r\n]*$/o) { # only LF + CR or nothing left -> done with line
                $done = 1;
              }
            }
            else {                # test whether line contains quoted LF -> join next line
              if ($line =~ /\"/o) {
                my $nextLine = <TABLE>;
                if (not defined $nextLine) {
                  die "reached EOF while attempting to append multiline (quoted entry seems to contain LF/CR)\n";
                }
                $line .= $nextLine;
                $joined_lines++;
                # print "(detected quoted LF/CR -> appended next line)\n";
              }
              else {
                die "cannot interpret rest of line: '$line'\n";
              }
            }
          }

          my $relems = scalar(@row);
          if ($relems<$min_elems) {
            die "need at least $min_elems columns per table-line\n".
              "(seen only $relems column. Maybe wrong separator chosen?)\n";
          }

          my $matchvalue = $row[$matchcolumn-1];
          my $writevalue = $row[$writecolumn-1];

          if ($matchvalue eq '' and $skip_empty) {
            # skip rows with empty match cell, if requested via option!
            $no_content++;
            return; # from eval block
          }

          if (exists $write_table{$matchvalue}) {
            die "duplicated value '$matchvalue' in column $matchcolumn (first seen in row ".$source_line{$matchvalue}.")\n";
          }

          $current_line = $.-$joined_lines;
          $write_table{$matchvalue} = $writevalue;
          $source_line{$matchvalue} = $current_line;

          if (defined $reg_split_cell) {
            my @words = split_wordwise($matchvalue, $reg_split_cell);
            foreach my $word (@words) {
              if (exists $cell_word{$word}) {
                my $first_occurred_line = $source_line{$cell_word{$word}};
                my $dup_msg = "duplicated word '$word' (first seen in row $first_occurred_line)";

                if ($ignore_dup_words) {
                  if ($cell_word{$word} ne '') {
                    print "Warning: line $current_line: ignoring $dup_msg\n";
                  }
                }
                else {
                  die "Error: $dup_msg\n";
                }
              }
              else {
                $cell_word{$word} = $matchvalue;
              }
            }
          }
        };
        if ($@) { die "$@ (in row $current_line of '$datafile')\n"; }
      }

      # match and write to species
      dieOnError(ARB::begin_transaction($gb_main), 'begin_transaction');

      my $report = '';

      eval {
        my $ambiguous_hits  = 0;
        my %written         = (); # key=matchvalue, value: 1=written, 2=skipped cause not marked (but did match)

        for (my $gb_species = BIO::first_species($gb_main);
             $gb_species;
             $gb_species = BIO::next_species($gb_species)) {
          eval {
            my $species_value = BIO::read_as_string($gb_species, $matchfield);
            my $wanted_mark = 0;
            if ($species_value) {
              my $matched_value = undef;

              if (defined $reg_split_field) {
                my @words = split_wordwise($species_value, $reg_split_field);
                my %matched = (); # key=matchvalue, value=by which word
                foreach my $word (@words) {
                  my $mv = undef;
                  if ($reg_split_cell) {
                    $mv = $cell_word{$species_value};
                  }
                  else {
                    if (exists $write_table{$species_value}) {
                      $mv = $species_value;
                    }
                  }
                  if (defined $mv) {
                    $matched{$mv} = $word;
                  }
                }
                my @matched = keys %matched;
                my $matched = scalar(@matched);
                if ($matched>0) {
                  if ($matched==1) {
                    $matched_value = $matched[0];
                  }
                  else {
                    my @ambig_words = values %matched;
                    my @quoted_words = map { $_ => "'$_'"; } @ambig_words;
                    my $ambig_words = join ", ", @quoted_words;
                    die "words in field '$matchfield' hit multiple table rows ($ambig_words)";
                  }
                }
              }
              else {
                if (defined $reg_split_cell) {
                  $matched_value = $cell_word{$species_value};
                }
                else {
                  if (exists $write_table{$species_value}) {
                    $matched_value = $species_value;
                  }
                }
              }

              if (defined $matched_value) { # found table entry matching current species
                if ($marked_only==1 and ARB::read_flag($gb_species)==0) {
                  $written{$matched_value} = 2;
                }
                else {
                  my $existing_entry = BIO::read_as_string($gb_species, $writefield);
                  if ($existing_entry and not $overwrite) {
                    die "already has an existing field '$writefield'.\n".
                      "Use --overwrite to allow replacement.\n";
                  }
                  my $error = undef;
                  if ($int_type==1) {
                    $error = BIO::write_int($gb_species, $writefield, int($write_table{$matched_value}));
                  }
                  else {
                    $error = BIO::write_string($gb_species, $writefield, $write_table{$matched_value});
                  }
                  if ($error) { die $error; }
                  $wanted_mark = 1;
                  my $prev_written = $written{$matched_value};
                  if (defined $prev_written) {
                    $ambiguous_hits++;
                  }
                  $written{$matched_value} = 1;
                }
              }
            }
            if ($mark==1) {
              my $error = ARB::write_flag($gb_species,$wanted_mark);
              if ($error) { die $error; }
            }
          };
          if ($@) {
            my $name = BIO::get_name_or_description($gb_species);
            die "species '$name': $@";
          }
        }
        my $not_found  = 0;
        my $not_marked = 0;
        {
          my %missing = ();
          my $what = $skip_unknown ? 'Warning' : 'Error';
          foreach (keys %write_table) {
            my $wr = $written{$_};
            if (not defined $wr) {
              $missing{$_} = 1;
              $not_found++;
            }
            elsif ($wr==2) { # was not marked
              $not_marked++;
            }
          }

          if ($not_found>0) {
            my $shown = 0;
            my $maxShown = 30;
            my $show_all = 0;
            if ($not_found<50 or $skip_unknown==0) {
              $show_all = 1;
            }

          SHOWN: foreach (sort { $source_line{$a} <=> $source_line{$b}; } keys %missing) {
              print "$what: Found no matching species for row ".$source_line{$_}." ($matchfield='$_')\n";
              $shown++;
              if ($show_all==0 and $shown>$maxShown) {
                print "$what: (suppressing rest of $not_found messages)\n";
                print "Hint: to list all unmatched rows, do NOT allow to skip them using --skip-unknown\n";
                last SHOWN;
              }
            }
          }
        }
        if ($not_found>0 and $skip_unknown==0) {
          die "Failed to find $not_found species - aborting.\n".
            "(Note: use --skip-unknown to allow unknown references)\n";
        }
        $report = "Entries imported: ".(scalar(keys %written)-$not_marked)."\n";
        if ($ambiguous_hits>0) { $report .= "Ambiguous hits: $ambiguous_hits\n"; }
        if ($no_content>0) { $report .= "Skipped rows with empty match-cell: $no_content\n"; }
        if ($not_found>0) { $report .= "Unmatched (skipped) entries: $not_found\n"; }
        if ($not_marked>0) { $report .= "Entries not imported because species were not marked: $not_marked\n"; }

        print "\n".$report;
      };
      if ($@) {
        ARB::abort_transaction($gb_main);
        die $@;
      }
      ARB::commit_transaction($gb_main);
      if ($database ne ':') {                                   # database has been loaded
        print "Saving modified database to '$database_out'\n";
        my $error = ARB::save_as($gb_main, $database_out, "b");
        if ($error) { die $error; }
      }
      ARB::close($gb_main);

      if ($inform_ARB==1) {
        $report =~ s/\n$//;
        `arb_message "$report"`;
      }
    };
    if ($@) {
      close(TABLE);
      die $@;
    }
  };
  if ($@) {
    ARB::close($gb_main);
    die $@;
  }
}

# call main()

eval {
  set_inGlobalEvalState(1);
  main();
};
set_inGlobalEvalState(0);
if ($@) {
  die $@; # this die message calls arb_message (see ARB.pm)
  exit(-1);
}
exit(0);
