#!/usr/bin/perl
# ========================================================= #
#                                                           #
#   File      : findNewContent.pl                           #
#   Purpose   : mark species by field content               #
#                                                           #
#   Coded by Ralf Westram (coder@reallysoft.de) in Dec 22   #
#   http://www.arb-home.de/                                 #
#                                                           #
# ========================================================= #

use strict;
use warnings;

BEGIN {
  if (not exists $ENV{'ARBHOME'}) { die "Environment variable \$ARBHOME has to be defined"; }
  my $arbhome = $ENV{'ARBHOME'};
  push @INC, "$arbhome/lib";
  push @INC, "$arbhome/PERL_SCRIPTS/lib";
  1;
}

use ARB;
use tools;

sub findNewContent($) {
  my ($field) = @_;

  my $gb_main = ARB::open(":","r");
  $gb_main || expectError('db connect (no running ARB?)');

  dieOnError(ARB::begin_transaction($gb_main), 'begin_transaction');

  my %marked_content   = ();
  my %unmarked_content = ();

  my $markedCount = 0; # counts previously marked species
  my $allCount    = 0; # count all species

  for (my $gb_species = BIO::first_species($gb_main);
       $gb_species;
       $gb_species = BIO::next_species($gb_species))
    {
      ++$allCount;

      my $content = BIO::read_string($gb_species, $field);
      if (not $content) { $content = ''; }

      my $marked = ARB::read_flag($gb_species);
      if ($marked==1) {
        ++$markedCount;
        $marked_content{$content} = 1;
      }
      else {
        $unmarked_content{$content} = 1;
      }
    }

  my $marked_content   = scalar(keys %marked_content);
  my $unmarked_content = scalar(keys %unmarked_content);

  my $notmarkedCount = $allCount-$markedCount; # species which were unmarked

  if ($markedCount==0 or $notmarkedCount==0) {
    print "Error: a subset of all species has to be marked (marked=$markedCount, unmarked=$notmarkedCount)\n";
  }
  else {
    print "Different content detected in species subsets:\n";
    print sprintf('- unmarked:%6i (in%6i species; %5.2f ~use)'."\n", $unmarked_content, $notmarkedCount, $notmarkedCount/$unmarked_content);
    print sprintf('- marked:  %6i (in%6i species; %5.2f ~use)'."\n", $marked_content,   $markedCount,    $markedCount/$marked_content);

    my $unmarkedCount = 0; # count species where mark has been removed

    for (my $gb_species = BIO::first_species($gb_main);
         $gb_species;
         $gb_species = BIO::next_species($gb_species)) {
      my $marked = ARB::read_flag($gb_species);
      if ($marked==1) {
        my $content = BIO::read_string($gb_species, $field);
        if (not $content) { $content = ''; }

        if (exists $unmarked_content{$content}) { # already seen in old subset
          ARB::write_flag($gb_species, 0); # => unmark species
          ++$unmarkedCount;
        }
      }
    }

    print "Unmarked $unmarkedCount of previously $markedCount marked species.\n";
    my $remain = $markedCount - $unmarkedCount;
    print "Species with new content remained marked: $remain\n";
  }

  ARB::commit_transaction($gb_main);
  ARB::close($gb_main);
}

sub show_usage() {
  print "Purpose: Find new content in field of a subset of species.\n";
  print "Usage: findNewContent.pl <field>\n";
  print "\n";
  print "What this script does:\n";
  print "- detect contents of <field> in subsets of marked and unmarked species.\n";
  print "- unmark all species which have <field> content already seen in at least one unmarked species.\n";
  print "\n";
  print "Missing fields will be treated as if field had an empty string as content.\n";
  print "=> when you specify a non-existing field, this script will simply unmark all species.\n";
}

sub main() {
  my $args = scalar(@ARGV);
  if ($args!=1) {
    show_usage();
  }
  else {
    my $field = $ARGV[0];
    findNewContent($field);
  }
}

main();
