#!/usr/bin/perl

# documentation at end of file

use strict;
use Getopt::Long;
use Pod::Usage;
use File::Basename qw(fileparse);
use Bio::ToolBox::legacy_helper qw(
	load_data_file
	open_data_file
	write_data_file
	open_to_write_fh
);
use Bio::ToolBox::utility;
my $VERSION =  '1.30';

print "\n This script will concatenate two or more data files\n\n";


### Quick help
# An empty command line leaves nothing to do: show only the SYNOPSIS
# section of the POD and exit with a non-zero status.
if (not @ARGV) {
	pod2usage(
		'-exitval' => 1,
		'-verbose' => 0,
	);
}



### Get command line options and initialize values
# $outfile and $gz are consulted again further down the script; $help and
# $print_version only trigger an informational exit in this section.
my ($outfile, $gz, $help, $print_version);

# Command line options
unless (
	GetOptions(
		'out=s'     => \$outfile,       # name of the output data file
		'gz!'       => \$gz,            # compress the output (negatable)
		'help'      => \$help,          # request help
		'version'   => \$print_version, # print the version
	)
) {
	die " unrecognized option(s)!! please refer to the help documentation\n\n";
}

# Print help: the entire POD is displayed, then the script exits
pod2usage( '-verbose' => 2, '-exitval' => 1 ) if $help;

# Print version
if ($print_version) {
	print " Biotoolbox script join_data_file.pl, version $VERSION\n\n";
	exit;
}




### Check for required values
# Joining only makes sense with at least two input files
if (scalar(@ARGV) < 2) {
	die "  OOPS! Two or more data files must be given!\n use $0 --help\n";
}




### Load first file
# The first file is taken off the argument list and loaded with
# load_data_file; whatever remains in @ARGV is appended later on.
my $first_file = shift @ARGV;
my $first_data = load_data_file($first_file);
unless ($first_data) {
	die "Unable to open first file '$first_file'!\n";
}




### Prepare output file name
# The name comes from --out when given. Otherwise it is rebuilt from the
# first file's basename when that looks like split_data_file.pl output,
# and as a last resort the user is asked for it.
unless ($outfile) {
	# if the file was a set generated by split_data_file.pl
	# then it may use the # as a demarcation symbol for the basename and split
	# value
	# look for it and regenerate the original basename
	if ($first_data->{'basename'} =~ /^(.+)\#\w+$/) {
		$outfile = $1;
	}
	else {
		# ask the user for input, what else to do?
		print " please enter the output file name   ";
		$outfile = <STDIN>;
		# STDIN may be closed or at end-of-file, in which case we get undef
		chomp $outfile if defined $outfile;
	}
}

# refuse to continue without a usable name rather than passing an empty
# file name on to the writer
unless (defined $outfile and length $outfile) {
	die " no output file name was provided!\n";
}

# check extension
# an explicit --gz or --nogz always wins; otherwise infer compression from
# the output name, then from the first input file
unless (defined $gz) {
	if ($outfile =~ /\.gz$/) {
		$gz = 1;
	}
	elsif ($first_file =~ /\.gz$/) {
		# first input file is compressed, so keep it that way
		# note: the first file was already shifted off @ARGV, so $ARGV[0]
		# would be the second input file here
		$gz = 1;
	}
	# otherwise, keep it undefined
}




### Begin writing file
# The first file is written back out under the output name. The running
# line count starts from the first file's 'last_row' value, read before
# the write.
my $line_count  = $first_data->{'last_row'};
my $new_outfile = write_data_file(
	'filename' => $outfile,
	'gz'       => $gz,
	'data'     => $first_data,
) or die " unable to write output file!\n";

{
	# report progress for the first file
	my $merged = format_with_commas($line_count);
	print " Joining file '$first_file'...   $merged data lines merged\n";
}

# the table itself is not needed any more, only the metadata, so release it
delete $first_data->{'data_table'};





### Now write the remaining files
# Each remaining file in @ARGV is checked against the first file
# (extension, column count, first and last column names) and its lines
# are then copied verbatim onto the end of the output file.

# Reopen the new file for appended writing
my $out_fh = open_to_write_fh($new_outfile, $gz, 1) or
	die " unable to reopen output file '$new_outfile' for appending!\n";

# Column names of the first file, read from its per-column metadata
# ($first_data->{INDEX}->{name}). The same source is used for both the
# header check and the mismatch report so the two cannot disagree.
# NOTE(review): whether the loaded structure also keeps a 'column_names'
# array is not visible here -- confirm against Bio::ToolBox::legacy_helper.
my $last_index  = $first_data->{number_columns} - 1;
my @first_names = map { $first_data->{$_}->{name} } (0 .. $last_index);

foreach my $file (@ARGV) {
	
	print " Joining file '$file'...";
	
	# open the file
	my ($file_fh, $file_data_ref) = open_data_file($file);
	unless ($file_fh) {
		die "\n Unable to open file '$file'! Unable to proceed!\n";
	}
	
	# check that file extension matches
	unless ($file_data_ref->{extension} eq $first_data->{extension}) {
		die "\n File extensions of input files do not match! Questionable joining!\n" . 
			" Compare $file_data_ref->{extension} with $first_data->{extension}\n";
	}
	
	# check for equal number of columns
	unless (
		$file_data_ref->{number_columns} == 
			$first_data->{number_columns} 
	) {
		die "\n Number of file columns don't match! Unable to proceed!\n";
	}
	
	# check first and last column names
	# a mismatch is only a warning; the file is still appended
	my $names = $file_data_ref->{column_names};
	unless (
		$names->[0]  eq $first_names[0]
		and
		$names->[-1] eq $first_names[-1]
	) {
		print "\n   WARNING! Column header names don't match!! ";
		for my $i (0 .. $last_index) {
			# list every column whose name differs from the first file
			next if $names->[$i] eq $first_names[$i];
			print "compare index $i, '" . $names->[$i] . "' with '" . 
				$first_names[$i] . "'\n";
		}
	}
	
	# continue writing the file
	# every line read from the handle is appended unchanged and counted
	while (my $line = $file_fh->getline) {
		print {$out_fh} $line;
		$line_count++;
	}
	$file_fh->close;
	$file_fh = undef;
	print "   " . format_with_commas($line_count) . " data lines merged\n";
}




### Finish
# buffered write errors only surface at close, so do not report success
# unless the close itself succeeded
$out_fh->close or
	die " unable to finish writing output file '$new_outfile'!\n";
print " Wrote combined file '$new_outfile'\n";




__END__

=head1 NAME

join_data_file.pl

A script to join two or more data files and concatenate rows.

=head1 SYNOPSIS

join_data_file.pl [--options] <file1> <file2> ...
  
  Options:
  --out <filename>
  --gz
  --version
  --help

=head1 OPTIONS

The command line flags and descriptions:

=over 4

=item --out <filename>

Provide the name of the output file. If the input files were 
split using 'split_data_file.pl', then the original base name 
may be reconstituted. Otherwise, the user will be asked for 
an output file name.

=item --gz

Indicate whether the output file should be compressed 
with gzip; use --nogz to explicitly disable compression. 
Default behavior is to preserve the compression 
status of the first input file.

=item --version

Print the version number.

=item --help

Display the POD documentation

=back

=head1 DESCRIPTION

This program will join two or more data files, essentially concatenating
the files but intelligently dealing with the metadata and column headers. 
Checks are made to ensure that the number of columns in the subsequent files 
match the first file.

The program will not merge datasets from multiple files; see 
the program 'merge_datasets.pl' for that.

This program is intended as the complement to 'split_data_file.pl'.

=head1 AUTHOR

 Timothy J. Parnell, PhD
 Howard Hughes Medical Institute
 Dept of Oncological Sciences
 Huntsman Cancer Institute
 University of Utah
 Salt Lake City, UT, 84112

This package is free software; you can redistribute it and/or modify
it under the terms of the Artistic License 2.0.  
