#!/usr/bin/perl
# ========================================================= #
#                                                           #
#   File      : databaseReport.pl                           #
#   Purpose   : create generalized database report          #
#                                                           #
#   Coded by Ralf Westram (coder@reallysoft.de) in Jul 25   #
#   http://www.arb-home.de/                                 #
#                                                           #
# ========================================================= #
#
# Script to analyze and summarize ARB database contents.
#
# Generates structured reports about:
# - Field usage and protection levels
# - Species counts and marking statistics
# - Alignments, trees, SAIs, and selection areas
#
# Intended for diagnostic and audit purposes in ARB workflows.
#
# ========================================================= #

use strict;
use warnings;

BEGIN {
  if (not exists $ENV{'ARBHOME'}) { die "Environment variable \$ARBHOME has to be defined"; }
  my $arbhome = $ENV{'ARBHOME'};
  push @INC, "$arbhome/lib";
  push @INC, "$arbhome/PERL_SCRIPTS/lib";
  1;
}

use ARB;
use tools;

# utility functions to return the max/min of two values:
sub max($$) {
  my ($a, $b) = @_;
  $a<$b ? $b : $a;
}
sub min($$) {
  my ($a, $b) = @_;
  $a<$b ? $a : $b;
}

sub report_stamped($$) {
  # Prints a timestamped message to the output stream.
  my ($OUT, $message) = @_;

  my $stamp = scalar(localtime);
  print $OUT "$stamp | $message\n";
}
sub start_section($$$) {
  # Begins a new report section with optional underlining.
  my ($OUT, $section, $underlined) = @_;
  print $OUT "\n$section:\n";
  print $OUT '=' x (length($section)+1)."\n" if $underlined;
  print $OUT "\n";
}

# functions to read values of required or optional database entries:
sub read_required_entry($$$$$) {
  my ($parent, $key, $item_desc, $key_desc, $reader) = @_;
  my $gb_entry = ARB::entry($parent, $key);
  die "$item_desc lacks '$key'" unless $gb_entry;
  my $value = $reader->($gb_entry);
  expectError("read $key_desc of $item_desc") unless defined $value;
  return $value;
}
sub read_required_string_entry($$$$) {
  return read_required_entry($_[0], $_[1], $_[2], $_[3], \&ARB::read_string);
}
sub read_required_int_entry($$$$) {
  return read_required_entry($_[0], $_[1], $_[2], $_[3], \&ARB::read_int);
}
sub read_optional_string_entry($$) {
  my ($parent, $key) = @_;
  my $gb_entry = ARB::entry($parent, $key);
  return undef unless $gb_entry;

  my $value = ARB::read_string($gb_entry);
  expectError("read value of '$key'") unless defined $value;
  return $value;
}


sub scanFieldsRecursive($);
sub scanFieldsRecursive($) {
  # Recursively scans all subnodes of the current item.
  # Collects paths to leaf fields and their associated protection levels.
  my ($gb_item) = @_;
  my %childs = ();
  for (my $gb_child = ARB::child($gb_item); $gb_child; $gb_child = ARB::nextChild($gb_child)) {
    my $key = ARB::read_key($gb_child);

    my %sub = scanFieldsRecursive($gb_child);
    if (%sub) {
      foreach (keys %sub) {
        $childs{$key.'/'.$_} = $sub{$_};
      }
    }
    else {
      my $prot = ARB::read_security_write($gb_child);
      $childs{$key} = $prot;
    }
  }
  return %childs; # key=recursive field, value=protection
}

sub report_fields_by_name($\%\%\%\%\%) {
  # Outputs all collected fields sorted by name.
  # Includes information about data type, protection level and
  # the database browsers visibility status (hidden/unknown).
  my ($OUT, $fields_r, $hidden_r, $type_r, $protMin_r, $protMax_r) = @_;

  my $typeCodes = "-bcif-B-CIFlSS-%"; # copied from GB_type_2_char [c++]

  start_section($OUT, "Fields by name", 0);
  print $OUT "    TYPE            NAME\n\n";
  my @fieldsByName = sort { lc($a) cmp lc($b); } keys %$fields_r;
  foreach (@fieldsByName) {
    my $prot = undef;
    if (not defined $$protMin_r{$_}) { $prot .= '-'; }
    elsif ($$protMin_r{$_} == $$protMax_r{$_}) { $prot .= sprintf("%i", $$protMin_r{$_}); }
    else { $prot .= sprintf("%i-%i", $$protMin_r{$_}, $$protMax_r{$_}); }

    my $typeChar = undef;
    my $note = '';
    if (not defined $$type_r{$_}) {
      $typeChar = '?';
      $note .= '[unknown]';
    }
    else {
      die "expected known type for '$_'" if not defined $$type_r{$_};
      my $type = $$type_r{$_};
      $typeChar = substr($typeCodes, $type, 1);

      if (defined $$hidden_r{$_}) {
        if ($$hidden_r{$_}) {
          $note .= '[hidden]';
        }
      }
    }
    print $OUT sprintf("    %s%-3s %-10s %s\n", $typeChar, $prot, $note, $_);
  }
}

sub report_fields_by_frequency($\%$) {
  # Outputs all collected fields sorted by descending frequency.
  # Useful to identify commonly or rarely used fields.
  my ($OUT, $fields_r, $speciesCount) = @_;

  my $frequencyDigits = max($speciesCount =~ tr/0-9//, 9);

  start_section($OUT, "Fields by frequency", 0);
  my @fieldsByFrequency = sort {
    $fields_r->{$b} <=> $fields_r->{$a} || lc($a) cmp lc($b);
  } keys %{$fields_r};

  print $OUT sprintf("%*s %8s  %s\n\n", $frequencyDigits, "COUNT", "FREQ%", "NAME");
  foreach (@fieldsByFrequency) {
    my $count = $fields_r->{$_};
    my $percentage = int($count * 10000 / $speciesCount) / 100;
    print $OUT sprintf("%*i  %6.2f%%  %s\n", $frequencyDigits, $count, $percentage, $_);
  }
}

# --------------------------------------------------------------------------------
# globals collected by collectOccurringFields().
# used by reportSpeciesFields() and reportAlignments().

my $fieldsWereCollected = 0;

my $speciesCount  = 0;
my $markedSpecies = 0;

my %fields  = (); # parsed from species (key=fieldname, value=count)
my %protMin = (); # key=fieldname, value=min.protection
my %protMax = (); # key=fieldname, value=max.protection

# --------------------------------------------------------------------------------

sub collectOccurringFields($) {
  # Traverses the entire database and collects statistics on field usage and protection.
  # (this function must be called before any report_* function)
  my ($gb_main) = @_;

  # scan all species and collect occurring fields:
  for (my $gb_species = BIO::first_species($gb_main);
       $gb_species;
       $gb_species = BIO::next_species($gb_species)) {
    $speciesCount++;
    my $marked = ARB::read_flag($gb_species);
    if ($marked==1) {
      $markedSpecies++;
    }
    my %childs = scanFieldsRecursive($gb_species);
    foreach (keys %childs) {
      my $seen = $fields{$_};
      my $protect = $childs{$_};
      if (defined $seen) {
        $fields{$_} = $seen+1;
        $protMax{$_} = max($protMax{$_}, $protect);
        $protMin{$_} = min($protMin{$_}, $protect);
      }
      else {
        $fields{$_} = 1;
        $protMin{$_} = $protect;
        $protMax{$_} = $protect;
      }
    }
  }

  $fieldsWereCollected = 1;
}

sub reportSpeciesFields($$) {
  # Scans all known fields from /presets/key_data (=refreshed fields).
  # Considers all occurring fields (scanned by collectOccurringFields).
  #
  # Prints an overview about species & fields, and two field lists, one sorted
  # alphabetically, the other by frequency.
  my ($gb_main, $OUT) = @_;

  die "collectOccurringFields not called" if $fieldsWereCollected==0;

  my @keys   = (); # keys parsed from /presets/key_data
  my %hidden = (); # key=fieldname, value=1 if hidden
  my %type   = (); # key=fieldname, value=type number

  # search known keys (aka refreshed fields):
  my $gb_keys = ARB::search($gb_main, '/presets/key_data', 'NONE');
  die 'failed to find key-data' if not $gb_keys;
  for (my $gb_key = ARB::entry($gb_keys, 'key'); $gb_key; $gb_key = ARB::nextEntry($gb_key)) {
    my $name = read_required_string_entry($gb_key, 'key_name', 'key', 'name');
    my $type = read_required_int_entry($gb_key, 'key_type', 'key', 'type');

    # types are defined in ../../ARBDB/arbdb.h@GB_TYPES

    if ($type != 15) { # ignore containers
      my $hidden = read_required_int_entry($gb_key, 'key_hidden', 'key', 'hidden-flag');

      push @keys, $name;
      $hidden{$name} = $hidden;
      $type{$name} = $type;

      # add fields which are present in key-data but not occurring in fields with zero counter.
      if (not defined $fields{$name}) {
        $fields{$name} = 0;
      }
    }
  }

  start_section($OUT, "Species information", 1);

  my $fieldsCount = scalar(keys %fields);

  print $OUT "    Species: $speciesCount\n";
  print $OUT "    Marked:  $markedSpecies\n";
  print $OUT "    Fields:  $fieldsCount\n";

  report_fields_by_name($OUT, %fields, %hidden, %type, %protMin, %protMax);
  report_fields_by_frequency($OUT, %fields, $speciesCount);
}

sub reportAlignments($$) {
  # Outputs information about all known alignments defined in /presets/alignment.
  # For each alignment, shows name, type, length and usage in the DB.
  my ($gb_main, $OUT) = @_;

  die "collectOccurringFields not called" if $fieldsWereCollected==0;

  start_section($OUT, "Alignment information", 1);

  my $gb_presets = ARB::search($gb_main, '/presets', 'NONE');
  die 'failed to find alignment-data' if not $gb_presets;

  my @alignment = ();
  my %type = (); # key=name, value=type
  my %length = (); # key=name, value=length

  my $maxOccur = 0;
  my $maxLength = 0;

  for (my $gb_ali = ARB::entry($gb_presets, 'alignment'); $gb_ali; $gb_ali = ARB::nextEntry($gb_ali)) { # read all alignments
    my $name   = read_required_string_entry($gb_ali, 'alignment_name', 'alignment', 'name');
    my $type   = read_required_string_entry($gb_ali, 'alignment_type', 'alignment', 'type');
    my $length = read_required_int_entry($gb_ali, 'alignment_len', 'alignment', 'length');

    push @alignment, $name;
    $type{$name} = $type;
    $length{$name} = $length;

    $maxLength = max($maxLength, $length);
  }

  # extract occurring alignments from fields:
  my %seen = ();
  foreach (keys %fields) {
    if (/^(ali_[^\/]*)\/data$/o) {
      my $occur = $fields{$_};
      $seen{$1} = $occur;
      $maxOccur = max($maxOccur, $occur);
    }
  }

  my @all_ali = keys %seen;
  push @all_ali, @alignment;
  {
    my %all_ali = map { $_ => 1; } @all_ali;
    @all_ali = sort keys %all_ali;
  }

  my $OCCUR  = "SPECIES";
  my $LENGTH = "LENGTH";

  my $occurDigits = max(($maxOccur =~ tr/0-9//), length($OCCUR));
  my $lengthDigits = max(($maxLength =~ tr/0-9//), length($LENGTH));

  print $OUT sprintf("    %*s %8s  %4s  %*s  %s\n\n",
                     $occurDigits, $OCCUR, "FREQ%",
                     "TYPE",
                     $lengthDigits, $LENGTH,
                     "NAME");

  foreach (@all_ali) {
    my $occur = $seen{$_};
    if (not defined $occur) { $occur = 0; }
    my $percentage = int($occur * 10000 / $speciesCount) / 100;

    print $OUT sprintf("    %*s  %6.2f%%  %4s  %*s  %s\n",
                       $occurDigits, $occur, $percentage,
                       $type{$_},
                       $lengthDigits, $length{$_},
                       $_);
  }
}

sub countGroups($) {
  # Counts the number of taxonomic groups defined in the passed tree.
  my ($gb_tree) = @_;

  my $count = 0;
  for (my $gb_node = ARB::entry($gb_tree, 'node'); $gb_node; $gb_node = ARB::nextEntry($gb_node)) { # read all nodes (=groups)
    $count++;
  }
  return $count;
}

sub reportTrees($$) {
  # Lists all trees.
  # Reports how many species and groups are contained in each tree.
  my ($gb_main, $OUT) = @_;

  start_section($OUT, "Tree information", 1);

  my $gb_trees = ARB::search($gb_main, '/tree_data', 'NONE');
  die 'failed to find tree-data' if not $gb_trees;

  my @trees = ();
  my %nodeCount = ();
  my %groupCount = ();
  my $maxNodeCount = 0;
  my $maxGroupCount = 0;

  for (my $gb_sub = ARB::child($gb_trees); $gb_sub; $gb_sub = ARB::nextChild($gb_sub)) { # read all childs
    my $name = ARB::read_key($gb_sub); # the tree name is "stored" in the key name :-(
    expectError('read name of tree') if not $name;

    my $isTree = substr($name, 0, 5) eq 'tree_';
    die "entry is no tree: '$name'" if not $isTree;

    push @trees, $name;

    my $nodeCount = read_required_int_entry($gb_sub, 'nnodes', 'tree', 'nodeCount');
    my $groupCount = countGroups($gb_sub);

    $nodeCount{$name} = $nodeCount;
    $groupCount{$name} = $groupCount;

    $maxNodeCount = max($maxNodeCount, $nodeCount);
    $maxGroupCount = max($maxGroupCount, $groupCount);
  }

  my $nodeCountDigits = max(($maxNodeCount =~ tr/0-9//), 3);
  my $groupCountDigits = max(($maxGroupCount =~ tr/0-9//), 4);

  print $OUT sprintf("   %*s %*s  %s\n\n",
                     $nodeCountDigits+1, "LEAF",
                     $groupCountDigits+1, "GROUP",
                     "NAME");

  foreach (sort @trees) {
    my $nodeCount = $nodeCount{$_};
    my $groupCount = $groupCount{$_};
    print $OUT sprintf("    %*i  %*i  %s\n",
                       $nodeCountDigits, $nodeCount,
                       $groupCountDigits, $groupCount,
                       $_);
  }
}

sub reportSAIs($$) {
  # Reports all SAIs (sequence associated information) in the database.
  # Each SAI includes group, name and all alignments for which it has associated data. 
  my ($gb_main, $OUT) = @_;

  start_section($OUT, "SAI information", 1);

  my $gb_SAIs = ARB::search($gb_main, '/extended_data', 'NONE');
  die 'failed to find SAI-data' if not $gb_SAIs;

  my @SAIs = ();
  my %group = (); # key=sainame, value=groupname
  my %alis = ();  # key=sainame, value=ali-names

  my $maxGroupLen = 0;
  my $maxNameLen = 0;

  for (my $gb_SAI = ARB::entry($gb_SAIs, 'extended'); $gb_SAI; $gb_SAI = ARB::nextEntry($gb_SAI)) {
    my $name = read_required_string_entry($gb_SAI, 'name', 'SAI', 'name');
    push @SAIs, $name;
    $maxNameLen = max($maxNameLen, length($name));

    my $groupname = read_optional_string_entry($gb_SAI, 'sai_group');
    if (defined $groupname) {
      $group{$name} = $groupname;
      $maxGroupLen = max($maxGroupLen, length($groupname));
    }

    my @ali = ();
    for (my $gb_sub = ARB::child($gb_SAI); $gb_sub; $gb_sub = ARB::nextChild($gb_sub)) {
      my $key = ARB::read_key($gb_sub);
      expectError('entry w/o key') if not $key;
      if ($key =~ /^ali_/o) {
        push @ali, $';
      }
    }
    $alis{$name} = join ' ', sort @ali;
  }

  if ($maxGroupLen>0) {
    $maxGroupLen += 4; # 2 for brackets + 2 spaces
  }

  my @sortedSAIs = sort {
    (defined $group{$b} <=> defined $group{$a})
      ||
      (defined $group{$a} # safe to compare only if both are defined
       ? (lc($group{$a}) cmp lc($group{$b})
          ||
          $group{$a} cmp $group{$b})
       : 0)
      ||
      lc($a) cmp lc($b);
  } @SAIs;

  print $OUT sprintf("    %-*s%-*s    %s\n\n",
                     int($maxGroupLen), $maxGroupLen ? "GROUP" : "",
                     int($maxNameLen), "NAME",
                     "ALIGNMENTS");
  foreach (@sortedSAIs) {
    my $groupname = $group{$_};
    if (defined $groupname) { $groupname = '['.$groupname.']'; }
    else { $groupname = ''; }

    print $OUT sprintf("    %-*s%-*s    %s\n",
                       int($maxGroupLen), $groupname,
                       int($maxNameLen), $_,
                       $alis{$_});
  }
}

sub parse_area_data($) {
  # Extracts and parses an area-part of a species selection.
  # Counts the number of contained species, SAIs and groups.
  # see also similar code in ../../ARBDB/ad_config.cxx@PARSE_SELECTION

  my ($area) = @_;
  my ($species, $sai, $groups) = (0, 0, 0);

  my $sep = chr(1);
  my @items = split $sep, $area;

  die "area string is expected to start with separator"
    if $items[0] ne '';
  shift @items; # drop first

  foreach (@items) {
    my $type = substr($_, 0, 1);
    if ($type eq 'L') { $species++; }
    elsif ($type eq 'S') { $sai++; }
    elsif ($type eq 'F' or $type eq 'G') { $groups++; }
    elsif ($type ne 'E') {
      print "Warning: Invalid type $type in token '$_' (while parsing selection)\n";
    }
  }

  return ($species, $sai, $groups);
}

sub parse_config_data($$) {
  # Parses and sums up data from both configuration areas.
  my ($top_area, $middle_area) = @_;
  my ($tspecies, $tsai, $tgroups) = parse_area_data($top_area);
  my ($mspecies, $msai, $mgroups) = parse_area_data($middle_area);
  return ($tspecies+$mspecies, $tsai+$msai, $tgroups+$mgroups);
}

sub reportSelections($$) {
  # Reports information about species selections.
  # For each entry, shows name and number of selected species, SAIs and groups.
  my ($gb_main, $OUT) = @_;

  start_section($OUT, "Selection information", 1);

  my $gb_selections = ARB::search($gb_main, '/configuration_data', 'NONE');
  die 'failed to find configuration_data' if not $gb_selections;

  my @speciesSelections = ();

  my $maxSpecies = 0;
  my $maxSai     = 0;
  my $maxGroups  = 0;

  my %species = (); # key=configname, value=speciescount
  my %sai     = (); # key=configname, value=saicount
  my %groups  = (); # key=configname, value=groupscount

  for (my $gb_sel = ARB::entry($gb_selections, 'configuration'); $gb_sel; $gb_sel = ARB::nextEntry($gb_sel)) {
    my $name = read_required_string_entry($gb_sel, 'name', 'speciesSelection', 'name');
    push @speciesSelections, $name;

    my $top_area    = read_optional_string_entry($gb_sel, 'top_area')    // ""; # if result is undef => fallback to empty string
    my $middle_area = read_optional_string_entry($gb_sel, 'middle_area') // "";

    my ($species, $sai, $groups) = parse_config_data($top_area, $middle_area);

    $species{$name} = $species;
    $sai{$name}     = $sai;
    $groups{$name}  = $groups;

    $maxSpecies = max($maxSpecies, $species);
    $maxSai     = max($maxSai, $sai);
    $maxGroups  = max($maxGroups, $groups);
  }

  my @sortedSelections = sort {
    (($b eq 'default_configuration') <=> ($a eq 'default_configuration'))
    ||
    lc($a) cmp lc($b)
  } @speciesSelections;

  my $SPECIES = "SPEC";
  my $SAI     = "SAI";
  my $GROUPS  = "GRP";

  my $speciesDigits = max(($maxSpecies =~ tr/0-9//), length($SPECIES));
  my $saiDigits     = max(($maxSai     =~ tr/0-9//), length($SAI));
  my $groupsDigits  = max(($maxGroups  =~ tr/0-9//), length($GROUPS));

  print $OUT sprintf("    %*s  %*s  %*s  %s\n\n",
                     $speciesDigits, $SPECIES,
                     $saiDigits, $SAI,
                     $groupsDigits, $GROUPS,
                     "NAME");
  foreach (@sortedSelections) {
    print $OUT sprintf("    %*s  %*s  %*s  %s\n",
                       $speciesDigits, $species{$_},
                       $saiDigits,     $sai{$_},
                       $groupsDigits,  $groups{$_},
                       $_);
  }
}

sub generateReport($$) {
  # Main function that orchestrates report generation.
  # Opens the database, calls data collection, and generates all report sections.
  my ($dbFile, $reportFile) = @_;

  my $gb_main = ARB::open($dbFile,"r");
  $gb_main || expectError('db connect ('.(($dbFile eq ':') ? 'no running ARB?' : "could not open database '$dbFile'").')');
  dieOnError(ARB::begin_transaction($gb_main), 'begin_transaction');

  my $dbname;
  if ($dbFile eq ':') {
    $dbname = 'database running in arb';
  }
  else {
    if ($dbFile =~ /[^\/]*$/o) { $dbname = $&; }
    else { $dbname = $dbFile; }
  }

  open(REPORT, '>'.$reportFile) || die "Failed to write to $reportFile (Reason: $!)";
  report_stamped(\*REPORT, "analysing database '$dbname'");

  collectOccurringFields($gb_main);

  reportTrees($gb_main, \*REPORT);
  reportSAIs($gb_main, \*REPORT);
  reportAlignments($gb_main, \*REPORT);
  reportSelections($gb_main, \*REPORT);
  reportSpeciesFields($gb_main, \*REPORT);

  print REPORT "\n";
  report_stamped(\*REPORT, "end of report");
  close(REPORT);

  ARB::commit_transaction($gb_main);
  ARB::close($gb_main);
}

sub show_usage() {
  # Prints command line usage information.
  print <<"USAGE";
Purpose:
  Generate a structured and comparable report from an ARB database.

Usage:
  databaseReport.pl <database> <report>

Arguments:
  <database>     Path to the ARB database to analyze.
                 Use ':' to select the currently running database (much slower).
  <report>       Name of the report to create.

Description:
  This script analyzes selected contents of an ARB database and writes a
  standardized report. Such reports are suitable for visual inspection and
  comparison across different databases using diff-viewers like meld or kompare.

  The generated report includes:
    - Field usage, types and protection levels.
    - Species counts, marks and selections.
    - Alignments, trees and SAIs.

Example:
  ./databaseReport.pl my_db.arb my_report.txt

Note:
  Using ':' to access the currently running database is supported, but significantly slower.
  Prefer using a specific database path when possible.
USAGE
}

sub main() {
  # Handles argument parsing and output file handling.
  my $args = scalar(@ARGV);
  if ($args!=2) {
    show_usage();
  }
  else {
    my $dbFile = $ARGV[0];
    my $reportFile = $ARGV[1];

    generateReport($dbFile, $reportFile);
  }
}

main();