#!perl
use strict;
use Data::Dumper;
use Carp;

#
# This is a SAS Component
#

=head1 roles_to_proteins


roles_to_proteins can be used to extract the set of proteins (designated by MD5 values)
that currently are believed to implement a given role.  Note that the proteins
may be multifunctional, meaning that they may be implementing other roles, as well.


Example:

    roles_to_proteins [arguments] < input > output

The standard input should be a tab-separated table (i.e., each line
is a tab-separated set of fields).  Normally, the last field in each
line would contain the identifier. If another column contains the identifier
use

    -c N

where N is the column (from 1) that contains the role.

This is a pipe command. The input is taken from the standard input, and the
output is to the standard output.

=head2 Documentation for underlying call

This script is a wrapper for the CDMI-API call roles_to_proteins. It is documented as follows:

  $return = $obj->roles_to_proteins($roles)

=over 4

=item Parameter and return types

=begin html

<pre>
$roles is a roles
$return is a reference to a hash where the key is a role and the value is a proteins
roles is a reference to a list where each element is a role
role is a string
proteins is a reference to a list where each element is a protein
protein is a string

</pre>

=end html

=begin text

$roles is a roles
$return is a reference to a hash where the key is a role and the value is a proteins
roles is a reference to a list where each element is a role
role is a string
proteins is a reference to a list where each element is a protein
protein is a string


=end text

=back

=head2 Command-Line Options

=over 4

=item -c Column

This is used only if the column containing the role is not the last column.

=item -i InputFile    [ use InputFile, rather than stdin ]

=back

=head2 Output Format

The standard output is a tab-delimited file. It consists of the input
file with extra columns added. For each input line there can be multiple output lines, one per protein associated with the role.
One extra column is added to the output: the protein (an MD5 value) associated with the role.

Input lines that cannot be extended are written to stderr.

=cut


# Usage summary printed when the client/option setup fails.  Lists both
# options registered below (-c and -i).
my $usage = "usage: roles_to_proteins [-c column] [-i input_file] < input > output";

use Bio::KBase::CDMI::CDMIClient;
use Bio::KBase::Utilities::ScriptThing;

# -c N : 1-based column holding the role (when unset, passed through as undef
#        to GetBatch; per the POD the last column is then used).
my $column;

# -i F : read input from file F instead of standard input.
my $input_file;

# Build the CDMI client; the option specs are handed to new_for_script so it
# can fill in $column and $input_file.
# NOTE(review): presumably new_for_script parses @ARGV with Getopt::Long-style
# specs and returns a false value on failure -- confirm against CDMIClient.
my $kbO = Bio::KBase::CDMI::CDMIClient->new_for_script('c=i' => \$column,
				      'i=s' => \$input_file);
if (! $kbO)
{
    # Terminate the message with a newline and exit non-zero so that callers
    # (shell pipelines, make, etc.) can detect that setup failed.
    print STDERR $usage, "\n";
    exit 1;
}

# Select the input handle: the file named by -i when one was given,
# otherwise standard input.
my $ih;
# Test for a defined, non-empty name rather than plain truthiness, so that a
# file literally named "0" is still honoured.  An empty name still falls
# back to STDIN, as before.
if (defined($input_file) && length($input_file))
{
    open $ih, "<", $input_file or die "Cannot open input file $input_file: $!";
}
else
{
    $ih = \*STDIN;
}

# Main loop: read the input in batches, look up the proteins for every role
# in the batch with a single service call, and write each input line back
# out once per protein with the protein appended as an extra column.
#
# Lines that cannot be extended are written to STDERR, as promised in the
# POD above.  This covers roles missing from the result as well as roles
# whose protein list is empty.
#
# NOTE(review): each tuple returned by GetBatch is used here as
# [identifier, original line]; the meaning of the literal 1 argument is not
# visible in this file -- confirm against ScriptThing::GetBatch.
while (my @tuples = Bio::KBase::Utilities::ScriptThing::GetBatch($ih, 1, $column)) {
    my @roles = map { $_->[0] } @tuples;
    my $proteins_for = $kbO->roles_to_proteins(\@roles);

    for my $tuple (@tuples) {
        my ($role, $line) = @$tuple;
        my $proteins = $proteins_for->{$role};

        if (! defined($proteins))
        {
            # The service returned nothing for this role.
            print STDERR $line,"\n";
        }
        elsif (ref($proteins) eq 'ARRAY')
        {
            if (! @$proteins)
            {
                # An empty list yields no output rows; report the line on
                # STDERR instead of silently dropping it.
                print STDERR $line,"\n";
                next;
            }

            # One output row per protein implementing the role.
            for my $protein (@$proteins)
            {
                print "$line\t$protein\n";
            }
        }
        else
        {
            # Scalar result: append it as a single extra column.
            print "$line\t$proteins\n";
        }
    }
}
