#!/usr/bin/env perl

### trinucleotide_genome.pl #######################################################################
# Reads the reference genome in 3-nucleotide windows and stores the information
# - read the reference genome by 3 nucleotides windows
# - create hash table of genome locations by trinucleotide context
# - makes a directory for each of the sequences provided in the genome fasta, or for the chromosome specified
# - write the chromosome and middle nucleotide location to trinucleotide file
# - uses callable bases bed file to filter trinucleotides so only callable ones are hashed
# - zips files with gzip

### HISTORY #######################################################################################
# Version		Date		Developer		Comments
# 0.01			2015-05-20	lliu     		Initial code development.
#
### INCLUDES ######################################################################################
use BoutrosLab::Base;
use warnings;
use strict;
use Carp;
use Getopt::Long;
use Pod::Usage;
use File::Basename;
use File::Path qw(make_path);   # makes directories given paths
use FindBin qw($Bin);           # finds bin where this method was invoked
use Path::Class;
use Data::Dumper;
use List::MoreUtils qw( each_array );
use POSIX qw(strftime);
use Bio::SeqIO;                 # reads reference genome fasta file
use Bio::SeqIO::fasta;
use IO::Compress::Gzip qw(gzip $GzipError);

### COMMAND LINE DEFAULT ARGUMENTS ################################################################
# list of arguments and default values go here as hash key/value pairs
# Only trinucleotide_directory has no usable default and is therefore required: main() rejects
# any option whose value is still undef after parsing. Every other option uses the string 'None'
# (or 'Y'/'N') as its "not given" value, which is what main() compares against.
our %opts = (
	genome                           => 'None',        # the reference genome file, in fasta format; 'None' = no genome (matches the documented default)
	trinucleotide_directory          => undef,         # required: directory holding one sub-directory per sequence
	cb_bed_dir						 => 'None',        # directory of <sequence>_collapsed.bed callable bases files
	filtered_dir					 => 'None',        # output directory for the filter-only mode
	chromosome 						 => 'None',        # restrict processing to this sequence name
	zip_files 						 => 'Y',           # anything other than 'N' gzips the written files
	only_zip						 => 'N'            # 'Y' = only gzip existing files and stop
	);

$opts {'scripts'} = {
	};

### MAIN CALLER ###################################################################################
# the value returned by main() becomes the exit status of the script
my $result = main();
exit($result);

### FUNCTIONS #####################################################################################

### main ##########################################################################################
# Description:
# 		Main subroutine for program. Parses the command line and then runs exactly one mode:
# 		  1. only_zip eq 'Y'  : gzip the existing trinucleotide files and stop
# 		  2. genome ne 'None' : hash every (selected) sequence of the genome and write one file
# 		                        per trinucleotide context into <trinucleotide_directory>/<sequence>/
# 		  3. otherwise, when both cb_bed_dir and filtered_dir are given: filter the existing
# 		                        trinucleotide files with the bed files into filtered_dir
# Input Variables:
# 		%opts = command line arguments
# Output Variables:
# 		exit status for the script: 0 on success, 1 when a required directory does not exist
sub main {
	GetOptions(
		\%opts,
		"help|?",
		"man",
		"genome|g:s" => \$opts{'genome'},
		"trinucleotide_directory|td=s" => \$opts{'trinucleotide_directory'},
		"cb_bed_dir|cbd:s" => \$opts{'cb_bed_dir'},
		"filtered_dir|fd:s" => \$opts{'filtered_dir'},
		"chromosome|c:s" => \$opts{'chromosome'},
		"zip_files:s" => \$opts{'zip_files'},
		"only_zip:s" => \$opts{'only_zip'}
		) or pod2usage(64);

	if ($opts{'help'}) { pod2usage(1) };
	if ($opts{'man'}) { pod2usage(-exitstatus => 0, -verbose => 2) };

	### check for undef arguments in %opts ###
	# sorted so that the first missing argument reported is always the same one
	foreach my $arg (sort keys %opts) {
		if (!defined $opts{$arg}) {
			print("ERROR: Missing argument $arg\n");
			pod2usage(128);		# prints out SYNOPSIS
			}
		}

	### ensure valid workspace ###
	if (! -d "$opts{'trinucleotide_directory'}") {
		print("ERROR: trinucleotide directory at $opts{'trinucleotide_directory'} does not exist\n");
		return 1;	# non-zero so that calling pipelines can detect the failure
		}

	my $trinucleotide_dir = dir("$opts{'trinucleotide_directory'}");
	my $chrom = $opts{'chromosome'};
	my $cb_bed_dir = dir("$opts{'cb_bed_dir'}");

	#zip the txt trinucleotide files in the directory
	if ($opts{'only_zip'} eq 'Y') {
		zip_files($trinucleotide_dir, $chrom);
		return 0;
		}

	#if genome is given then process the entire genome for trinucleotide information
	if ($opts{'genome'} ne 'None') {
		say "INFO: genome given. Generating trinucleotide files";
		my $genome = Bio::SeqIO->new(-file => "<$opts{'genome'}",
									-format => "largefasta");

		while (my $seq = $genome->next_seq) {

			my $primary_id = $seq->primary_id;

			#skip every sequence except the requested one (if one was requested)
			next unless ($chrom eq 'None' || $chrom eq "$primary_id");

			#if bed directory is given then only the regions in the bed file are hashed
			my $cb_bed = 'None';
			if ($cb_bed_dir ne 'None') {
				$cb_bed = "$cb_bed_dir/${primary_id}_collapsed.bed";
				}

			my ($trinucleotides) = hash_genome($seq, $trinucleotide_dir, $cb_bed);
			my $contexts = $trinucleotides->{"$primary_id"};
			my $write_dir = $contexts->{"dir"};

			#explicit dereference: keys on a hash reference is a syntax error since perl 5.24
			foreach my $trinucleotide (sort keys %{$contexts}) {
				next if ($trinucleotide eq 'dir');	#the output directory entry, not a context

				my $written = _write_trinucleotide_file($write_dir, $trinucleotide, $contexts->{"$trinucleotide"});

				if ($opts{'zip_files'} ne 'N') {
					_gzip_file($written);
					}
				}
			}
		return 0;
		}

	#if bed directory is given but no genome is given then filter trinucleotides
	#TAKES MUCH LONGER THAN JUST HASHING THE GENOME WITH BED FILE, NOT RECOMMENDED
	if ($cb_bed_dir ne 'None' && $opts{'filtered_dir'} ne 'None') {
		say "INFO: No genome given. Filtering trinucleotides with bed file";

		if (! -d "$opts{'filtered_dir'}") {
			print("ERROR: filter trinucleotide directory at $opts{'filtered_dir'} does not exist\n");
			return 1;
			}

		my $filtered_dir = dir("$opts{'filtered_dir'}");
		my @chromosomes = ();

		if ($chrom eq 'None') {
			#find all chromosome folders in the trinucleotide directory
			opendir(my $dir_handle, "$trinucleotide_dir") or croak "ERROR: unable to open trinucleotide directory at : $trinucleotide_dir";
			while (defined(my $folder = readdir($dir_handle))) {
				next unless (-d "$trinucleotide_dir/$folder");
				next if $folder eq '.' or $folder eq '..';
				push (@chromosomes, $folder);
				}
			closedir($dir_handle);
			@chromosomes = sort @chromosomes;
			}
		else {
			@chromosomes = ("$chrom");
			}

		foreach my $chromosome (@chromosomes) {
			say "INFO: filtering for $chromosome";
			my $chr_dir = $trinucleotide_dir->subdir($chromosome);
			my $fil_dir = $filtered_dir->subdir($chromosome);
			my $cb_bed = "$cb_bed_dir/${chromosome}_collapsed.bed";

			#only the existence of the bed file is checked here
			if (-e $cb_bed) {
				make_path("$fil_dir");
				}
			else {
				say "WARNING: bed file $cb_bed does not exist for $chromosome, filtering skipped, link made";
				#make symbolic link to original trinucleotides folder
				#list form so that the paths are never interpreted by a shell
				if (system('ln', '-s', "$chr_dir", "$filtered_dir") != 0) {
					carp "WARNING: unable to link $chr_dir into $filtered_dir";
					}
				next;
				}

			my ($filtered) = filter_locations($chr_dir, $cb_bed, $chromosome, $fil_dir);

			foreach my $trinucleotide (sort keys %{$filtered}) {
				next if ($trinucleotide eq 'dir');

				my $written = _write_trinucleotide_file($fil_dir, $trinucleotide, $filtered->{"$trinucleotide"});

				if ($opts{'zip_files'} ne 'N') {
					say "INFO: $trinucleotide file has been zipped." if _gzip_file($written);
					}
				}
			}
		return 0;
		}

	say "WARNING: no genome given and no cb_bed_dir/filtered_dir pair given, nothing to do";
	return 0;
	} #main

### _write_trinucleotide_file #####################################################################
# Description:
# 		Write the locations of one trinucleotide context to <directory>/<trinucleotide>.txt,
# 		one location per line with its fields joined by tabs
# Input Variables:
#		$directory = the directory to write into
#		$trinucleotide = the trinucleotide context, used as the file name
#		$locations = array reference of array references (the fields of each location)
# Output Variables:
# 		$path = the path of the file written
sub _write_trinucleotide_file {
	my ($directory, $trinucleotide, $locations) = @_;

	my $path = "$directory/$trinucleotide.txt";
	open (my $output_file, '>', $path) or croak "ERROR: unable to open $trinucleotide file at: $directory";
	foreach my $location (@{$locations}) {
		print {$output_file} join("\t", @{$location}), "\n";
		}
	#buffered write errors (e.g. disk full) only surface at close
	close($output_file) or croak "ERROR: unable to finish writing $trinucleotide file at: $directory";

	return $path;
	}

### _gzip_file ####################################################################################
# Description:
# 		Compress one file in place with the external gzip program
# Input Variables:
#		$path = the file to compress
# Output Variables:
# 		1 if gzip exited successfully, 0 (after a warning) otherwise
sub _gzip_file {
	my ($path) = @_;

	#program and arguments are separate list elements: system() then bypasses the shell
	return 1 if (system('gzip', '-9', $path) == 0);

	carp "WARNING: gzip failed for $path";
	return 0;
	}

### hash_genome ####################################################################################
# Description:
# 		Walk one sequence of the reference genome in 3 nucleotide windows and collect, for each of
# 		the 32 trinucleotide contexts with a C or T in the middle, every location it occurs at.
# 		Windows whose middle base is A, G or R are reverse complemented and reported on the '-' strand.
# 		Windows containing N, or any context outside the 32 listed ones, are ignored.
#		Makes the output directory <trinucleotide_dir>/<sequence id> for the sequence.
# Input Variables:
#		$seq = the sequence of the current chromosome given by genome
#		       (must provide the primary_id, length and subseq methods)
#		$trinucleotide_dir = the directory to generate the trinucleotide files
#		                     (must provide a subdir method)
#		$cb_bed = the callable bases bed file for the current chromosome; when it is 'None' or
#		          does not exist the whole sequence is hashed
# Output Variables:
# 		$trinucleotides = hash reference: {sequence id}->{context} = [[sequence id, position, strand], ...]
# 		                  where position is the middle base as passed to subseq, plus
# 		                  {sequence id}->{dir} = the output directory of the sequence

sub hash_genome {
	my ($seq, $trinucleotide_dir, $cb_bed) = @_;

	my $trinucleotides = {};
	my @nucleotides = ('A', 'T', 'C', 'G');

	my $primary_id = $seq->primary_id;
	my $length = $seq->length;
	say "INFO: ID:\t$primary_id\tLength:\t$length";

	#list all 32 trinucleotide contexts
	foreach my $first (@nucleotides) {
		foreach my $middle ('C', 'T') {
			foreach my $last (@nucleotides) {
				$trinucleotides->{"$primary_id"}->{"$first$middle$last"} = [];
				}
			}
		}

	my $seq_dir = $trinucleotide_dir->subdir($primary_id);
	$trinucleotides->{"$primary_id"}->{"dir"} = $seq_dir;
	make_path("$seq_dir");

	#Read in the bed file for ranges, or use all of the sequence length.
	#A middle position needs a neighbour on both sides, so it always lies within 2..length-1.
	my @ranges;
	if ($cb_bed ne 'None' && -e $cb_bed) {
		open (my $bed_in, '<', $cb_bed) or croak "ERROR: unable to open bed file at : $cb_bed";
		while (my $line = <$bed_in>) {
			$line =~ s/\W+$//;
			my @bed_loc = split(/\t/, $line);
			next if (@bed_loc < 3);					#blank or incomplete line
			next if ($primary_id ne $bed_loc[0]);	#skip line if chromosome of bed line does not match with current chromosome
			next if ($bed_loc[1] < 0 || $bed_loc[2] > $length);

			#bed file includes position at chromSTART and excludes position at chromEND
			my $loc_start = $bed_loc[1] + 1;
			my $loc_end = $bed_loc[2];

			#clamp to the positions that can be the middle of a window; previously a line with
			#chromSTART 0 and chromEND 2 gave the range 1..2 and requested a window starting at 0
			$loc_start = 2 if ($loc_start < 2);
			$loc_end = $length - 1 if ($loc_end > $length - 1);

			push (@ranges, [$loc_start, $loc_end]) if ($loc_end >= $loc_start);
			}
		close($bed_in);
		say "INFO: bed file $cb_bed read in for hashing genome within callable bases ranges";
		}
	else {
		push (@ranges, [2, $length-1]);
		}

	foreach my $range (@ranges) {
		my ($range_start, $range_end) = @{$range};
		foreach my $i ($range_start..$range_end) {
			my $trinucleotide = uc($seq->subseq($i-1, $i+1));
			my $strand = '+';
			next if ($trinucleotide =~ m/N/); #by default avoids picking mutation in location with N

			my $middle = substr($trinucleotide, 1, 1);
			if ($middle =~ m/A|G|R/) {
				#report the context from the opposite strand so that the middle base is C or T
				$trinucleotide =~ tr/ATCGRY/TAGCYR/;
				$trinucleotide = reverse($trinucleotide);
				$strand = '-';
				}

			#in case special contexts with other than ATCG exist, ignored
			if (exists $trinucleotides->{"$primary_id"}->{"$trinucleotide"}) {
				push (@{$trinucleotides->{"$primary_id"}->{"$trinucleotide"}}, [$primary_id, $i, $strand]);
				}
			}
		}

	return ($trinucleotides);
	}

### filter_locations ####################################################################################
# Description:
# 		Read the 32 trinucleotide files of one chromosome (<context>.txt.gz, or <context>.txt when
# 		no gzipped file exists) and keep only the locations that fall inside the bed file ranges.
# 		The position in the second column of a trinucleotide line is reduced by one before it is
# 		compared with the bed range, which includes chromSTART and excludes chromEND.
# 		Croaks when a trinucleotide file is missing or the bed file cannot be opened.
# Input Variables:
#		$chr_dir = the directory of the trinucleotide files of a chromosome
#		$cb_bed = the callable bases bed file
#		$chromosome = the chromosome of the current directory and bed file
#		$fil_dir = the directory to place the filtered trinucleotide files (not used here, the
#		           caller writes the files)
# Output Variables:
# 		$filtered = hash reference: {context} = [[fields of a kept line], ...]

sub filter_locations {
	my ($chr_dir, $cb_bed, $chromosome, $fil_dir) = @_;

	my $filtered = {};
	my @nucleotides = ('A', 'T', 'C', 'G');
	my @trinucleotides = ();
	my $out_sum = 0;	#number of locations filtered out
	my $count = 0;		#number of locations read

	#list all 32 trinucleotide contexts
	foreach my $first (@nucleotides) {
		foreach my $middle ('C', 'T') {
			foreach my $last (@nucleotides) {
				$filtered->{"$first$middle$last"} = [];
				push (@trinucleotides, "$first$middle$last");
				}
			}
		}

	#read in bed file locations
	my @bed_locations;
	open (my $bed_in, '<', $cb_bed) or croak "ERROR: unable to open bed file at : $cb_bed";
	while (my $line = <$bed_in>) {
		$line =~ s/\W+$//;
		my @bed_loc = split(/\t/, $line);
		next if (@bed_loc < 3);					#blank or incomplete line
		next if ($chromosome ne $bed_loc[0]);	#skip line if chromosome of bed line does not match with current chromosome
		next if ($bed_loc[2] <= $bed_loc[1]);	#empty range, no position can fall inside it
		push (@bed_locations, [$bed_loc[1], $bed_loc[2]]);
		}
	close($bed_in);
	say "INFO: bed file $cb_bed read in";

	#sorted, non-overlapping ranges allow a binary search per location instead of a full scan
	my $callable = _merge_intervals(\@bed_locations);

	foreach my $trinucleotide (@trinucleotides) {
		my $location_in;
		if (-e "$chr_dir/$trinucleotide.txt.gz") {
			(open $location_in, '-|', 'gzip', '-dc', "$chr_dir/$trinucleotide.txt.gz") or croak "ERROR: unable to open/decompress $trinucleotide.txt.gz file at: $chr_dir";
			}
		elsif (-e "$chr_dir/$trinucleotide.txt") {
			(open $location_in, "<", "$chr_dir/$trinucleotide.txt") or croak "ERROR: unable to open/decompress $trinucleotide.txt file at: $chr_dir";
			}
		else {
			croak "ERROR: trinucleotide file for $chromosome $trinucleotide does not exist";
			}

		while (my $line = <$location_in>) {
			$count += 1;
			#only the line ending is removed: the last column is the strand ('+' or '-'),
			#which a \W+ pattern would strip as well
			$line =~ s/[\r\n]+\z//;
			my @tri_loc = split(/\t/, $line);

			if (@tri_loc >= 2 && _position_is_callable($callable, $tri_loc[1] - 1)) {
				#one array reference per location, the caller joins its fields into a line
				push (@{$filtered->{"$trinucleotide"}}, [@tri_loc]);
				}
			else {
				$out_sum += 1;
				}

			print "Trinucleotide:\t$trinucleotide\tread:\t$count\tfiltered:\t$out_sum\n" if ($count % 1000000 == 0);
			}

		#for the gzip pipe a failed close means that gzip reported an error
		close($location_in) or carp "WARNING: problem while reading the $trinucleotide file at: $chr_dir";
		}

	#no division by zero when nothing was read
	my $percent = sprintf("%.2f", $count > 0 ? ($out_sum/$count)*100 : 0);
	print "INFO: $chromosome\tread:\t$count\tfiltered:\t$out_sum\tpercent:\t${percent}%\n";
	return ($filtered);
	}

### _merge_intervals ####################################################################################
# Description:
# 		Sort [start, end) ranges by start and merge the overlapping or touching ones
# Input Variables:
#		$intervals = array reference of [start, end] pairs with end > start
# Output Variables:
# 		array reference of sorted, non-overlapping [start, end] pairs covering the same positions

sub _merge_intervals {
	my ($intervals) = @_;

	my @merged;
	foreach my $interval (sort { $a->[0] <=> $b->[0] } @{$intervals}) {
		if (@merged && $interval->[0] <= $merged[-1]->[1]) {
			$merged[-1]->[1] = $interval->[1] if ($interval->[1] > $merged[-1]->[1]);
			}
		else {
			push (@merged, [$interval->[0], $interval->[1]]);
			}
		}

	return \@merged;
	}

### _position_is_callable ###############################################################################
# Description:
# 		Binary search for the range containing a position
# Input Variables:
#		$merged = array reference of sorted, non-overlapping [start, end] pairs
#		$pos = the position to look up
# Output Variables:
# 		1 if start <= $pos < end for one of the ranges, 0 otherwise

sub _position_is_callable {
	my ($merged, $pos) = @_;

	my $low = 0;
	my $high = $#{$merged};
	while ($low <= $high) {
		my $mid = int(($low + $high) / 2);
		if ($pos < $merged->[$mid]->[0]) {
			$high = $mid - 1;
			}
		elsif ($pos >= $merged->[$mid]->[1]) {
			$low = $mid + 1;
			}
		else {
			return 1;
			}
		}

	return 0;
	}

### zip_files ####################################################################################
# Description:
# 		Gzip (-9) the 32 <context>.txt trinucleotide location files of one chromosome directory,
# 		or of every directory found under the trinucleotide directory when $chrom is 'None'.
# 		A file that does not exist, or that gzip fails on, only produces a warning.
# Input Variables:
#		$trinucleotide_dir = the directory of the trinucleotide files (overall directory),
#		                     must provide a subdir method
#		$chrom = the chromosome directory to zip, or 'None' for all of them
# Output Variables:
# 		None

sub zip_files {

	my ($trinucleotide_dir, $chrom) = @_;

	my @chromosomes;
	my @trinucleotides;
	my @nucleotides = ('A', 'T', 'C', 'G');

	if ($chrom eq 'None') {
		#find all of the chromosome directories under trinucleotide directory
		opendir(my $dir_handle, "$trinucleotide_dir") or croak "ERROR: unable to open trinucleotide directory at : $trinucleotide_dir";
		@chromosomes = sort grep {
			$_ ne '.' && $_ ne '..' && -d "$trinucleotide_dir/$_"
			} readdir($dir_handle);
		closedir($dir_handle);
		}
	else {
		@chromosomes = ("$chrom");
		}

	#list all 32 trinucleotide contexts
	foreach my $first (@nucleotides) {
		foreach my $middle ('C', 'T') {
			foreach my $last (@nucleotides) {
				push (@trinucleotides, "$first$middle$last");
				}
			}
		}

	foreach my $chromosome (@chromosomes) {
		say "INFO: zipping files in $chromosome directory under $trinucleotide_dir";
		my $chr_dir = $trinucleotide_dir->subdir($chromosome);

		foreach my $trinucleotide (@trinucleotides) {
			my $file = "$chr_dir/$trinucleotide.txt";

			if (! -e $file) {
				carp "WARNING: $file does not exist, not zipped";
				next;
				}

			#list form: the path is handed to gzip as is, no shell is involved
			if (system('gzip', '-9', $file) != 0) {
				carp "WARNING: gzip failed for $file";
				}
			}
		}

	return;
	}


__END__


=head1 NAME

trinucleotide_genome.pl

=head1 SYNOPSIS

B<trinucleotide_genome.pl> [options] [file ...]

	Options:
	--help				brief help message
	--man				full documentation
	--genome			Genome FASTA in which to parse trinucleotide information (default = "None")
	--trinucleotide_directory	location to generate the trinucleotide information files (Required)
	--cb_bed_dir		location to the bed files where callable bases information can be found (default = "None")
	--filtered_dir		location to generate the filtered trinucleotide files (default = "None")
	--chromosome 		chromosome or name of fasta sequence specified to act upon only (default = "None")
	--zip_files 		whether to zip the trinucleotide files generated (default: Y)
	--only_zip			whether to only zip files in the directory and run nothing else (default: N)

=head1 OPTIONS

=over 8

=item B<--help>

Print a brief help message and exit.

=item B<--man>

Print the manual page.

=item B<--genome>

Genome FASTA in which to parse trinucleotide information (default = "None").

=item B<--trinucleotide_directory>

Location to generate the trinucleotide information files.

=item B<--cb_bed_dir>

Location to the bed files where callable bases information can be found (default = "None").

=item B<--filtered_dir>

Location to generate the filtered trinucleotide files (default = "None").

=item B<--chromosome>

Chromosome or name of fasta sequence specified to act upon only (default = "None").

=item B<--zip_files>

Whether to zip the trinucleotide files generated (default: Y).

=item B<--only_zip>

Whether to only zip files in the directory and run nothing else (default: N).

=back

=head1 DESCRIPTION

B<trinucleotide_genome.pl> reads the reference genome in 3-nucleotide windows and stores the information

 - read the reference genome by 3 nucleotides windows
 - create hash table of genome locations by trinucleotide context
 - makes a directory for each of the sequences provided in the genome fasta, or for the chromosome specified
 - write the chromosome and middle nucleotide location to trinucleotide file
 - uses callable bases bed file to filter trinucleotides so only callable ones are hashed
 - zips files with gzip through system call

=head1 EXAMPLE

Typical usage:

	trinucleotide_genome.pl --genome /path/to/your/sample.fasta --trinucleotide_directory /directory/to/save/files

	trinucleotide_genome.pl --genome /path/to/your/sample.fasta --trinucleotide_directory /directory/to/save/files --chromosome ${chrom} --cb_bed_dir /directory/to/your/beds

=head1 AUTHOR

Lydia Liu

Boutros Lab
The Ontario Institute for Cancer Research

=head1 ACKNOWLEDGEMENTS

Paul Boutros, PhD, PI -- Boutros Lab

Srinivasan Sivanandan -- Boutros Lab

=head1 SEE ALSO

generate_signature.pl --man

BAMsurgeon.pl --man

=cut
