#!/usr/bin/perl
# ========================================================= #
#                                                           #
#   File      : clone_taxonomy.pl                           #
#   Purpose   : clone taxonomy entries                      #
#                                                           #
#   Coded by Ralf Westram (coder@reallysoft.de) in Jul 20   #
#   http://www.arb-home.de/                                 #
#                                                           #
# ========================================================= #

use strict;
use warnings;

# ---------------------------------
#      you may configure here:

my $tax_field      = 'tax_ltp';
my $fullname_field = 'fullname_ltp';

my $show_correct_entries = 0; # list genera with correct and unique tax-entries? (previously was default behavior)
my $check_invalid_inconsistencies = 1; # set to 0 to hide invalid entries from inconsistency check

# ---------------------------------

BEGIN {
  if (not exists $ENV{'ARBHOME'}) { die "Environment variable \$ARBHOME has to be defined"; }
  my $arbhome = $ENV{'ARBHOME'};
  push @INC, "$arbhome/lib";
  push @INC, "$arbhome/PERL_SCRIPTS/lib";
  1;
}

use ARB;
use tools;

sub is_unclassified($) {
  # tax entry will be overwritten, when this return 1
  my ($tax_value) = @_;
  if ($tax_value) {
    if (($tax_value =~ /unclassified/io) and not ($tax_value =~ /;/o)) { 1; }
    elsif ($tax_value eq '') { 1; }
    else { 0; }
  }
  else { 1; }
}
sub is_valid_tax4genus($$) {
  my ($tax,$genus) = @_;

  if (not ($tax =~ /;$genus$/)) { 0; }
  elsif ($tax =~ /^(.*;)*\s*(unclassified|undefined)\s*(;.*)*$/io) { 0; } # comment line to accept ";Unclassified;" inside tax
  else { 1; }
}

my %tax_seen  = (); # key=tax (full or prefix); value=occur count
my %tax_level = (); # key=tax (full or prefix); value=level (=number of ;)
my %tax_invalid = (); # key=tax (full only); value=1 -> invalid entry (not 6 tax levels)

sub register_tax($$);
sub register_tax($$) {
  my ($tax,$level) = @_; # level==0 -> initial call

  if (exists $tax_seen{$tax}) {
    $tax_seen{$tax}++;
  }
  else {
    $tax_seen{$tax} = 1;
    my $semi = $tax;
    $semi =~ s/[^;]//go;
    $tax_level{$tax} = length($semi);
    if ($level==0) {
      if ($tax_level{$tax} != 5) { # 5 semicolons == 6 tax levels
        $tax_invalid{$tax} = 1;
        $tax = ''; # do not register prefixes
      }
    }
  }
  if ($tax =~ /;[^;]+$/o) {
    register_tax($`,$level+1);
  }
}

sub log_tax_inconsistencies() {
  # log tax entries with an invalid number of levels:
  my @sorted_invalid = sort keys %tax_invalid;
  foreach my $tax (@sorted_invalid) {
    my $lev = $tax_level{$tax}+1;
    print "Warning: '$tax' has $lev hierarchy levels\n";
  }

  # search inconsistencies in tax hierarchy
  my @valid_taxa = map {
    if ($check_invalid_inconsistencies) { $_; }
    elsif (exists $tax_invalid{$_}) { ; }
    else { $_; }
  } keys %tax_seen;

  my %tax_ending_in = (); # key = suffix of tax (last part); value = array containing tax
  foreach my $tax (@valid_taxa) {
    my $suffix;
    if ($tax =~ /;([^;]+)$/o) {
      $suffix = $1;
    }
    else {
      $suffix = $tax;
    }
    my $end_r = $tax_ending_in{$suffix};
    if (defined $end_r) {
      push @$end_r, $tax;
    }
    else {
      $tax_ending_in{$suffix} = [ $tax ];
    }
  }

  my @sorted_suffixes = sort keys %tax_ending_in;
  foreach my $suf (@sorted_suffixes) {
    my $end_r = $tax_ending_in{$suf};
    my %unique_taxa = map { $_ => 1; } @$end_r;
    my @unique_taxa = keys %unique_taxa;
    my $count = scalar(@unique_taxa);
    $count>0 || die;
    if ($count>1) {
      print "Warning: taxonomy part '$suf' used multiple times:\n";
      foreach my $tax (sort @$end_r) {
        print "- '$tax'\n";
      }
    }
  }
}

sub dump_tax_statistic() {
  my @sorted_taxa = sort keys %tax_seen;
  my %level_count = (); # key=level; value=sum of 'tax_seen' of taxa on 'level'
  my $max_level = 0;

  foreach my $tax (@sorted_taxa) {
    my $seen = $tax_seen{$tax};
    my $level = $tax_level{$tax};

    if (not exists $level_count{$level}) {
      $level_count{$level} = $seen;
    }
    else {
      $level_count{$level} += $seen;
    }
    if ($level>$max_level) {
      $max_level = $level;
    }
  }

  for (my $level=0; $level<=($max_level+1); $level++) {
    my $lcount = $level_count{$level};
    if ((not defined $lcount) or (not $lcount)) {
      print "Found no taxonomy entries with level $level\n";
    }
    else {
      print "Taxonomy entries with level $level:\n";
      foreach my $tax (@sorted_taxa) {
        if ($tax_level{$tax}==$level) {
          my $pcount = $lcount;
          if ($tax =~ /;[^;]+$/o) {
            $pcount = $tax_seen{$`};
          }
          my $seen = $tax_seen{$tax};
          my $pc = $seen * 100.0 / $pcount;

          print sprintf("%5i=%5.1f%%: %s\n", $seen, $pc, $tax);
        }
      }
    }
  }
}

my %match_tax = (); # key=genus, value=matching tax entry
my %diff_tax  = (); # key=genus, value=matching, but different tax entry
my %bad_tax   = (); # key=genus, value=non-matching tax entry
my %seen_target = (); # key=genus, value=1 -> seen target entry (undefined tax + marked(if requested))
my %mod_targets = (); # key=genus, value=count of modified targets

sub init_genus($) {
  my ($genus) = @_;
  if (not defined $seen_target{$genus}) {
    $match_tax{$genus} = undef;
    $diff_tax{$genus} = undef;
    $bad_tax{$genus} = undef;
    $seen_target{$genus} = 0;
    $mod_targets{$genus} = 0;
  }
}

my $reg_genus = qr/^([^\s]+)\s/o;

sub clone_tax($$) {
  my ($markedOnly,$detailed_statistic) = @_;

  my $gb_main = ARB::open(":","r");
  $gb_main || expectError('db connect (no running ARB?)');

  dieOnError(ARB::begin_transaction($gb_main), 'begin_transaction');

  my $no_fullname = 0;
  my $has_fullname = 0;
  my $genus_detected = 0;

  # scan tax and fullnames:
  for (my $gb_species = BIO::first_species($gb_main);
       $gb_species;
       $gb_species = BIO::next_species($gb_species)) {

    my $fullname = BIO::read_string($gb_species, $fullname_field);
    if ($fullname) {
      $has_fullname++;
      if ($fullname =~ $reg_genus) {
        my $genus = $1;
        init_genus($genus);

        $genus_detected++;

        my $tax = BIO::read_string($gb_species, $tax_field);
        if ($tax) {
          $tax =~ s/[;\s]*$//ig; # silently ignore ';' and spaces at end of tax-entry. Also will clone w/o these.
        }
        my $unclassified = is_unclassified($tax);
        if ($unclassified) {
          my $is_target = $markedOnly ? ARB::read_flag($gb_species) : 1;
          if ($is_target) { $seen_target{$genus} = 1; }
        }
        else {
          if (is_valid_tax4genus($tax,$genus)) {
            if (defined $match_tax{$genus}) {
              if ($match_tax{$genus} ne $tax) {
                if (not defined $diff_tax{$genus}) {      # only store first different tax
                  $diff_tax{$genus} = $tax;
                }
              }
            }
            else {
              $match_tax{$genus} = $tax;
            }
          }
          else {
            if (not defined $bad_tax{$genus}) {
              $bad_tax{$genus} = $tax;
            }
          }

          if ($markedOnly==0 || ARB::read_flag($gb_species)==1) {
            register_tax($tax,0);
          }
        }
      }
      else {
        my $name = BIO::read_string($gb_species, "name");
        print "Warning: no genus detected in $fullname_field '$fullname' (species=$name)\n";
      }
    }
    else {
      $no_fullname++;
    }
  }

  print "species w/o  $fullname_field: $no_fullname\n";
  print "species with $fullname_field: $has_fullname\n";
  print "species with genus detected: $genus_detected\n";

  my @genera = sort keys %seen_target;

  print "different genera detected: ".scalar(@genera)."\n";

  for (my $pass=1; $pass<=4; ++$pass) {
    foreach my $genus (@genera) {
      if (defined $bad_tax{$genus}) {
        if ($pass==1) {
          print "Warning: seen invalid $tax_field content for '$genus': ".$bad_tax{$genus}."\n";
        }
      }
      elsif (defined $diff_tax{$genus}) {
        if ($pass==2) {
          print "Warning: $tax_field entries differ for '$genus':\n";
          print "  '".$match_tax{$genus}."'\n";
          print "  '".$diff_tax{$genus}."'\n";
        }
      }
      elsif (not defined $match_tax{$genus}) {
        if ($pass==3) {
          print "Warning: No valid $tax_field entry seen for '$genus'\n";
        }
      }
      elsif ($seen_target{$genus}==0) {
        if (($pass==4) and ($show_correct_entries!=0)) {
          print "All".($markedOnly ? " marked " : " ")."entries for '$genus' contain correct $tax_field content: ".$match_tax{$genus}."\n";
        }
      }
    }
  }

  for (my $gb_species = BIO::first_species($gb_main);
       $gb_species;
       $gb_species = BIO::next_species($gb_species)) {

    my $fullname = BIO::read_string($gb_species, $fullname_field);
    if ($fullname) {
      $has_fullname++;
      if ($fullname =~ $reg_genus) {
        my $genus = $1;
        if ((defined $match_tax{$genus}) and not (defined $diff_tax{$genus} or defined $bad_tax{$genus})) {
          if ($seen_target{$genus}==1) {
            my $tax = BIO::read_string($gb_species, $tax_field);
            if ($tax) {
              $tax =~ s/[;\s]*$//ig; # silently ignore ';' and spaces at end of tax-entry. Also will clone w/o these.
            }
            my $unclassified = is_unclassified($tax);
            if ($unclassified) {
              my $is_target = $markedOnly ? ARB::read_flag($gb_species) : 1;
              if ($is_target) {
                my $error = BIO::write_string($gb_species, $tax_field, $match_tax{$genus});
                if ($error) { die $error; }
                $mod_targets{$genus}++;
              }
            }
          }
        }
      }
    }
  }

  my $mod_sum = 0;
  my $mod_genera = 0;
  foreach my $genus (@genera) {
    my $mod = $mod_targets{$genus};
    if ($mod > 0) {
      print "Changed $mod entries of '$genus' into: ".$match_tax{$genus}."\n";
      $mod_sum += $mod;
      $mod_genera++;
    }
  }

  log_tax_inconsistencies();

  print "Overall: changed $mod_sum entries for $mod_genera genera.\n";

  if ($detailed_statistic==1) { dump_tax_statistic(); }

  ARB::commit_transaction($gb_main);
  ARB::close($gb_main);
}

sub main() {
  my $args = scalar(@ARGV);
  if ($args==0) {
    print "Usage: clone_taxonomy.pl [--marked|--all] [--statistic]\n";
    print "\n";
    print "Tries to clone the field '$tax_field' from existing entries.\n";
    print "\n";
    print "Operates on the database currently running in ARB\n";
    print "(call this script via \"Tools/Start XTERM\").\n";
    print "\n";
    print "Uses the first word of field '$fullname_field' to create\n";
    print "separate clusters (for separate genera). Each cluster is expected\n";
    print "to use only ONE common value in field '$tax_field'.\n";
    print "That value has to consist of multiple words separated by ';'. The last\n";
    print "word has to match the first word from field '$fullname_field'.\n";
    print "Missing entries (or value \"unclassified\" or similar) are accepted and will\n";
    print "be replaced by the detected common value for each genus.\n";
    print "\n";
    print "If --statistic is specified, additional statistic about taxonomy is dumped.\n";
    print "\n";
    print "This script logs many warnings about inconsistencies in '$tax_field'.\n";
  }
  else {
    my $arg = shift @ARGV;

    my $markedOnly         = 0;
    my $detailed_statistic = 0;

    while (defined $arg) {
      if ($arg eq '--marked') { $markedOnly = 1; }
      elsif ($arg eq '--all') { $markedOnly = 0; }
      elsif ($arg eq '--statistic') { $detailed_statistic = 1; }
      else { die "unknown argument '$arg'"; }
      $arg = shift @ARGV;
    }
    clone_tax($markedOnly,$detailed_statistic);
  }
}

main();
