#!/usr/bin/env perl

##################################################################################################################################
#                                                                                                                                #
# Gene Expression Omnibus (GEO): Cancer Prognostic Datasets Retriever                                                            #
# ~    ~          ~              ~      ~          ~        ~                                                                    #        
#                                                                                                                                # 
#                                                                                                                                #
# geoCancerPrognosticDatasetsRetriever, version 1.02                                                                             #
# -----------------------------------------------------                                                                          #
#                                                                                                                                #
# Last Update: 30/11/21                                                                                                          #
#                                                                                                                                #
# Author:    Abbas Alameer <abbas.alameer@ku.edu.kw>,                                                                            #
#          Bioinformatics and Molecular Modelling Group,                                                                         #
#                      Kuwait University                                                                                         #
#                                                                                                                                #
# Please email queries, suggestions, and possible bug information to the above author.                                           #
#                                                                                                                                #
# Brief Description:                                                                                                             #
# ------------------                                                                                                             #
#                                                                                                                                #
# Gene Expression Omnibus (GEO) Cancer Prognostic Datasets Retriever is a Bioinformatics tool for cancer prognostic dataset      #
# retrieval from the GEO database. It requires a GeoDatasets input file listing all GSE dataset entries for a specific cancer    #
# (ex. Bladder cancer), obtained as a download from the GEO database. This Bioinformatics tool functions by applying two         #
# heuristic filters to examine individual GSE dataset entries listed in a GEO DataSets input file. The Prognostic text filter    #
# flags for prognostic keywords (ex. “prognosis” or “survival”) used by clinical science researchers and present in the          #
# title/abstract entries of a GSE dataset. If found, this tool retrieves those flagged datasets. Next, the second filter         #
# (Prognostic Signature filter) filters these datasets further by applying prognostic signature pattern matching                 #
# (Perl regular expression signatures) to identify if the GSE dataset is a likely prognostic dataset.                            #
#                                                                                                                                #
#                                                                                                                                #
# The prerequisite for running this program in a UNIX or Linux environment is:                                                   #
# ----------------------------------------------------------------------------                                                   #
#                                                                                                                                #
# 1. cURL: If using an Ubuntu-based system, the program will assist the user in installing curl, otherwise manual installation   #
#          is required.                                                                                                          #
#                                                                                                                                #
# Program Usage:                                                                                                                 #
# --------------                                                                                                                 #
#                                                                                                                                #
# geoCancerPrognosticDatasetsRetriever [-h] -d CANCER_TYPE [-p PLATFORMS_CODES] [-f DIRECTORY_PATH] [-k]                         #
#                                                                                                                                #
##################################################################################################################################



#import standard Perl modules
use strict;
use warnings;
use Term::ANSIColor;
use Getopt::Std;
use LWP::Simple;
use File::Basename;
use File::HomeDir;


#variables
#All of the following are file-level lexicals that the subroutines below read and write directly.

#command line handling
my %options 	          = (); #hash for storing command line switches and arguments
#name of the GEO DataSets input file of the current run (set when a run is started or resumed)
my $input_file;
#name of the reformatted copy of the input file (title and abstract merged onto one line)
my $formatted_input_file  = "formatted-input.dat";
#upper-cased, dash-joined form of the -d argument (e.g. "BLADDER-CANCER")
my $cancer_type;
#first and second word of the -d argument
my ($query_term_1, $query_term_2);
#name of the results file ("<cancer type>.out")
my $output_file;
#GPL platform code(s) of interest and the regex alternation of their numbers
my $platform_gpl;
my $regex_platform;
#working variables shared by the subroutines below
my $line;
#state flags used by main() while it walks through the lines of one dataset entry
my $flag;
my $prog_flag;
my $wget_flag;
my $human_flag;
#running number of the datasets reported by main()
my $i                     = 0;
#counters updated by main()
my $keyword_count         = 0;
my $more_count            = 0;
my $ftp_keyword_count     = 0;
my $extra_count           = 0;
#total printed at the end of main(); it is not updated in the code visible here - presumably
#incremented by the prognostic signature filter (TODO confirm)
my $filter2_count         = 0;
#holds the 'Organism_line' of the entry being processed by main()
my %simple_hash           = ();
#"<home directory>/geoCancerPrognosticDatasetsRetriever_files"
my $prog_path;
#time stamp of this run, produced by date_time()
my $current_date_time     = date_time();
#directory that receives the downloaded .soft files of a run
my $run_dir;
#dataset names listed at the end of main(); filled elsewhere (not visible here - TODO confirm where)
my @GEO_list              = ();
#"User input command: ..." line echoed at the end of a run
my $input_command_line;
#working directories ("~/geoCancerPrognosticDatasetsRetriever_files" and its sub-folders)
my $general_dir;
my $temp_subdir;
my $data_subdir;
my $results_subdir;
#home directory of the current user, as reported by File::HomeDir
my $home_dir              = File::HomeDir -> my_home;
#NOTE(review): not assigned in the code visible here - possibly unused; confirm before removing
my $absolute_path;
my $file_cleanup_switch   = 1; #set switch to true (i.e. defaults to file cleanup, unless user disables switch at CLI)
#command line arguments joined into one string (used to build $input_command_line)
my $argv_line;



#run start-up (prints the program banner).
start_up();

#perform initial checks (script installed, LWP::Protocol::https available, curl available).
initial_checks();

#check for input switches/arguments. This call also starts or resumes a run, which sets
#$input_file and $output_file used by the two calls below.
input_parameters_check();

#format the input file (merges each title line with the line that follows it).
format_input($input_file, $formatted_input_file);

#run main processing events of geo_CPDR and output results.
main($formatted_input_file, $output_file);





###################################################
#                                                 #
#             SUBROUTINES BELOW                   #
#             -----------------                   #
#                                                 #
###################################################

############################ SUBROUTINE 1 #######################################################
#Pre-flight checks performed before a run is started:
#  1. the script itself must be found in the $PATH (otherwise the program exits),
#  2. the CPAN module LWP::Protocol::https must load (otherwise cpanm and then the
#     module are installed),
#  3. the curl binary must be found in the $PATH (installed through apt on
#     Ubuntu/Ubuntu-based systems; on any other system the user is only told to install it).
#Takes no arguments; its return value is not meaningful.
sub initial_checks {

	#every coloured message is wrapped in the same colour/reset pair.
	my $in_red   = sub { print color("red"), @_, color("reset"); };
	my $in_green = sub { print color("green"), @_, color("reset"); };

	#check 1 - "which" prints nothing when the script is not installed.
	my $script_location = `which geoCancerPrognosticDatasetsRetriever`;

	if (!$script_location) {

		$in_red->("geoCancerPrognosticDatasetsRetriever is not installed on this system...\n");
		$in_red->("See \"README\" for installation instructions.\n");
		exit;
	}

	#check 2 - try to load the module; $@ is set when loading fails.
	my $https_module = "LWP::Protocol::https";

	eval "use $https_module";

	if ($@) {

		$in_red->("CPAN module: \"$https_module\" not found...\n");
		$in_green->("Preparing one time installation of $https_module....\nInstalling cpanm....\n");
		#cpanm is installed first because it makes installing other modules easier.
		system("cpan App::cpanminus");
		$in_green->("done\n");
		$in_green->("Installing $https_module....\n");
		system("cpanm $https_module");
		$in_green->("done\n");
	}

	#check 3 - nothing more to do when curl is already available.
	my $curl_location = `which curl`;
	return if $curl_location;

	#"uname -a" is used to recognise an Ubuntu/Ubuntu-based system.
	my $kernel_info = `uname -a`;

	if ($kernel_info =~ /.+ubuntu.+/ig) {

		$in_red->("curl binary was not found: follow onscreen instructions/input your password for its installation...\n\n");
		system("sudo apt -y install curl"); #install curl
		print "done\n";
	}

	else {

		$in_red->("curl binary was not found: install it on your system.\n");
	}
}
############################ SUBROUTINE 2 #######################################################
#get the current date and time.
#Takes no arguments. Builds the run's time stamp from the local time, stores it in the
#file-level $current_date_time and returns it.
#Format: "YYYY-MM-DD_hHHMM" with every field zero-padded, so that time-stamped file names
#keep a fixed width and sort correctly when compared as strings.
sub date_time {

	#only minutes, hours, day of month, month and year are needed from localtime().
	my ($min, $hour, $mday, $mon, $yr) = (localtime())[1 .. 5];

	#localtime() counts months from 0 and years from 1900.
	#A literal "0" used to be glued in front of the month (giving e.g. "2021-011" in November)
	#and the day was not padded at all; sprintf pads every field consistently.
	$current_date_time = sprintf("%04d-%02d-%02d_h%02d%02d", $yr + 1900, $mon + 1, $mday, $hour, $min);

	return $current_date_time;
}
############################ SUBROUTINE 3 #######################################################
#Print the program banner (name, version, author, license) in yellow at start-up.
#Takes no arguments; its only effect is the text written to the selected output handle.
sub start_up {

	#the box is kept in a non-interpolating here-document, so "@" needs no escaping.
	my $banner = <<'END_OF_BANNER';
#######################################################################
#                                                                     #
#           GEO Cancer Prognostic Datasets Retriever v1.02            #
#           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~            #
#                                                                     #
#              Author: Abbas Alameer, Kuwait University               #
#                         abbas.alameer@ku.edu.kw                     #
#                                                                     #
#                                                                     #
#                       Developed in March/November 2021              #
#                     and released under GPLv2 license                #
#                                                                     #
#######################################################################
END_OF_BANNER

	#a short indented line precedes the box and three line breaks follow it
	#(one of them is the line break that ends the here-document).
	print color("yellow"), "  \n", $banner, "\n\n", color("reset");
}
############################ SUBROUTINE 4 #######################################################
#This subroutine checks all command line input switches and arguments (including optional ones).
#It warns user if mandatory command line input switches and arguments are missing.
#
#Switches (parsed with getopts into %options):
#  -h  print the help text and exit
#  -d  cancer type as two words (e.g. "bladder cancer") - mandatory
#  -p  GPL platform code(s), separated by spaces (defaults to GPL570)
#  -f  directory in which the "results/" folder is created
#  -k  keep the downloaded .soft files at the end of the run
#
#Takes no arguments. It fills in the file-level variables $argv_line, $input_command_line,
#$general_dir, $temp_subdir, $data_subdir, $prog_path, $file_cleanup_switch, $query_term_1,
#$query_term_2, $cancer_type, $results_subdir and $platform_gpl, creates the working
#directories and finally calls mini() to start (or resume) a run.
sub input_parameters_check {

	my $error_message = "Error: The following argument is missing: CANCER_TYPE\n";
	my $help_message1 = "Usage: geoCancerPrognosticDatasetsRetriever -d \"CANCER_TYPE\"";
	my $help_message2 = "Mandatory arguments:
  CANCER_TYPE           type of the cancer as query search term

Optional arguments:
  -p                    list of GPL platform codes
  -f                    user-specified absolute path to save results files
  -k                    option to keep temporary files
  -h                    show help message and exit\n";

	#get command line parameters from @ARGV and append them all in string. Used for later output at the end of a run.
	#Switches are echoed as they are, every other argument is put in double quotes.
	#The test is anchored so that a value which merely contains "-d", "-f", "-p" or "-k"
	#(e.g. "non-diffuse ...") is still quoted.
	foreach my $element (@ARGV) {

		if ($element =~ m/\A-[dfpkh]+\z/) {

			$argv_line .= "$element ";

		} else {

			$argv_line .= "\"$element\" ";
		}
	}

	if ($argv_line) {

		$input_command_line = "User input command: geoCancerPrognosticDatasetsRetriever $argv_line";
	}

	#parse command line switches and their arguments into a hash.
	getopts("hd:p:kf:", \%options);

	#create main run directories - ignore if already present ("~" is expanded by the shell).
	$general_dir = "~/geoCancerPrognosticDatasetsRetriever_files";
	$temp_subdir = "~/geoCancerPrognosticDatasetsRetriever_files/temp/";
	$data_subdir = "~/geoCancerPrognosticDatasetsRetriever_files/data/";
	system("mkdir -p $general_dir $temp_subdir $data_subdir");
	$prog_path = $home_dir . "/geoCancerPrognosticDatasetsRetriever_files";

	#if user specifies that all results files should be kept, change $file_cleanup_switch to false.
	$file_cleanup_switch = 0 if defined $options{k};

	#Check for help switch and, if present, output help text.
	if ($options{h}) {

		print color ("green"), "$help_message1", color("reset");
		print color ("green"), "\n$help_message2\n", color("reset");
		exit;
	}

	elsif ($options{d}) {

		print color ("green"), "Checking input parameters...\n", color("reset");

		my $temp_filename = "$options{d}";
		($query_term_1, $query_term_2) = split ( / /, $temp_filename );

		#enforce a user to use full term for cancer in -d argument (i.e. "bladder cancer", not "bladder")
		if (!$query_term_1 or !$query_term_2) {

			print color ("red"), "Input argument error: Use full name for cancer in -d argument...\nFor example: \"bladder cancer\", not \"bladder\". The script will terminate here.\n", color("reset");
			exit;

		} else {

			#add dash in cancer type query search term.
			$cancer_type = uc ( join ( '-', $query_term_1, $query_term_2 ) );
		}

		#check for optional user-specified results directory and create it
		#or default to standard result directory
		if (exists $options{f}) {

			my $user_path = $options{f};

			#the argument has to start and end with a forward slash and must not contain white space.
			if ($user_path =~ m/^\/\S+\/$/) {

				chomp($user_path);
				$results_subdir = "$user_path". "results/";

			} else {

				print color ("red"), "Input argument error: The directory name should include forward slashes...\nFor example: -f \"/temp_directory/\". The script will terminate here.\n", color("reset");
				exit;
			}

		} else {

			print color ("green"), "No user-provided temporary file path through the -f argument is present...\nSaving files to: \"~/geoCancerPrognosticDatasetsRetriever_files/\" folder...\n", color("reset");

			$results_subdir = "/geoCancerPrognosticDatasetsRetriever_files/results/";
		}

		#after checking if the CLI's "-f" option is present or not, create results directory's path.
		#NOTE(review): the home directory is also put in front of a user-supplied -f path, so the
		#results always end up below the home directory although the help text speaks of an
		#absolute path - confirm that this is intended.
		$results_subdir = $home_dir . $results_subdir;
		system("mkdir -p $results_subdir");

		#check if user-defined GPL platforms are specified.
		if (defined $options{p}) {

			$platform_gpl = uc($options{p});

			#error checks done in case a user specifies "-p" flag but includes missing or wrong inputs
			#and if that is the case, the default platform will be used.
			if ($platform_gpl !~ m/^GPL|^\d+/) {

				$platform_gpl = "GPL570";
				print color ("green"), "No user-provided input platform through the -p argument is present...\nDefault platform: GPL570 will be used...\n", color("reset");
			}

		} else {

			#if a user does not specify any platform, default platform is set to "GPL570"
			$platform_gpl = "GPL570";
			print color ("green"), "No user-provided input platform through the -p argument is present...\nDefault platform: GPL570 will be used...\n", color("reset");
		}

		#after all checks above are completed - run mini() to initiate a run or check/restart aborted runs.
		mini();
	}

	#neither -h nor a usable -d argument was given.
	else {

		print color ("green"), "$help_message1\n$help_message2\n", color("reset");
		print color ("red"), $error_message, color("reset");
		exit;
	}
}

#Start a new run, or offer to resume an interrupted one.
#A run counts as interrupted when its "<CANCER-TYPE>_GEO-files" directory still exists in the
#results directory. Takes no arguments; works on the file-level variables set by
#input_parameters_check() and sets $run_dir, $input_file, $output_file and $regex_platform
#(the last three through new_run() when a new run is started).
sub mini {

	my $cancer = "$query_term_1";

	#newest previously downloaded input file of this cancer type: the names end in a time
	#stamp, so a reverse string sort is used to pick it.
	my @files = glob("$prog_path/data/$cancer\_cancer_GEO_*.txt");
	my ($newest_file) = sort {$b cmp $a} @files;
	my $restart_input_file;
	$restart_input_file = basename($newest_file) if defined $newest_file;

	$run_dir = "$results_subdir" . "$cancer_type\_GEO-files";

	#no "interrupted" run directory was found. Start a new run.
	unless (-e "$run_dir") {

		new_run($query_term_1);
		return;
	}

	#An old run directory was found, prompt the user with choices to make.
	print color ("red"), "$cancer_type\_GEO-files directory exists...This run was not completed\n", color("reset");
	my $text = "";

	timed_response( sub {

		print color ("red"), "Do you want to resume an interrupted execution [r], or start a new one [n]? (r/n)\nDefault selection will be [n] after 10 seconds...\n", color("reset");
		$text = <STDIN>;

	}, 10);

	#STDIN may be closed (end of file): treat that like an empty answer.
	$text = "" unless defined $text;
	chomp($text);

	if ($text eq "r" and defined $restart_input_file) {

		print color ("green"), "Resuming analysis using input file: $restart_input_file\n", color("reset");
		$platform_gpl   = uc($platform_gpl);
		#strip the "GPL" prefixes and join the remaining numbers into a regex alternation.
		my $regex1      = join( '', ( split(/GPL/, $platform_gpl) ) );
		$regex_platform = join( '|', ( split(/ /, $regex1) ) );
		$input_file     = $restart_input_file;
		$output_file    = "$cancer_type.out";
	}

	#this is when the user selects "n", or types nothing/ or 10 seconds elapse -> defaults to "n".
	#It is also used when a resume was requested but there is no downloaded input file to resume from.
	else {

		if ($text eq "r") {

			print color ("red"), "No previously downloaded input file was found for \"$cancer\" cancer: the run cannot be resumed.\n", color("reset");
		}

		print color ("green"), "Starting new analysis...\n", color("reset");
		system ("rm -r $run_dir"); #remove old results output directory
		new_run($query_term_1);
	}
}

#Run a code reference, but give up after a number of seconds.
#  $_[0]  code reference to run
#  $_[1]  time limit in seconds
#Returns 1 when the code finished in time, otherwise an undefined value.
sub timed_response {

	my ($f, $sec) = @_;

	my $finished = eval {

		#the alarm handler leaves the eval block by dying.
		local $SIG{ALRM} = sub { die };
		alarm($sec);
		$f->();
		alarm(0);
		1;
	};

	#make sure no alarm is left pending when the code died for a reason other than the timeout.
	alarm(0);

	return $finished;
}

#Start a new run: download the input file, create the run directory and prepare the
#platform regex and the name of the results file.
#  $_[0]  first word of the cancer type (only used in the progress message)
sub new_run {

	my $cancer      = $_[0];
	print color ("green"), "Downloading input file for \"$cancer\" cancer from GeoDatasets...", color("reset");
	$input_file     = download_geo_input($options{d});
	print color ("green"), "done\n", color("reset");
	system ("mkdir $run_dir"); #create results output directory
	#strip the "GPL" prefixes and join the remaining numbers into a regex alternation.
	my $regex1      = join( '', ( split(/GPL/, $platform_gpl) ) );
	$regex_platform = join( '|', ( split(/ /, $regex1) ) );
	$output_file    = "$cancer_type.out";

	#Check for the presence of the input file.
	unless (-e "$prog_path/data/$input_file") {

		print color ("red"), "Input file: $input_file was not found.\n", color("reset");
		exit;
	}
}
############################ SUBROUTINE 5 #######################################################
# The following code was reused from the NCBI's NBK25501 reference textbook.
# See: https://www.ncbi.nlm.nih.gov/books/NBK25501/
# It was adapted in this subroutine with additional modifications.
#
# Download the GEO DataSets ("gds") records of a search term as text and save them in
# "$prog_path/data/<first word of the term>_cancer_GEO_<time stamp>.txt".
#   $_[0]  search term (the -d argument, e.g. "bladder cancer")
# Returns the name (without directory) of the file that was written.
# The run is aborted (error message and exit) when either request to the NCBI fails;
# the subroutine dies when the file cannot be written.
sub download_geo_input {

	my $query    = $_[0];
	my ($cancer) = split(/ /, $query);
	my $geo_db   = 'gds';
	my $base     = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
	my $failure  = "\nThe download from GeoDatasets was not successful...\nA GeoDatasets timeout error was detected: current run aborted...\nPlease restart the run...\n";

	#percent-encode everything except unreserved URL characters, so that blanks, "&", "#" etc.
	#in the search term cannot break the query string.
	(my $encoded_query = $query) =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;

	#step 1: esearch stores the hits on the history server and answers with WebEnv/QueryKey.
	my $url    = $base . "esearch.fcgi?db=$geo_db&term=$encoded_query&usehistory=y";
	my $output = get($url);

	#plain declarations followed by list assignments: "my ... if ..." has undefined behaviour.
	my ($web, $key);

	if (defined $output) {

		($web) = $output =~ /<WebEnv>(\S+)<\/WebEnv>/;
		($key) = $output =~ /<QueryKey>(\d+)<\/QueryKey>/;
	}

	#without both values the efetch request below cannot work: abort the run right away.
	unless (defined $web and defined $key) {

		print color ("red"), $failure, color("reset");
		exit; #abort current run
	}

	#step 2: assemble the efetch URL
	$url  = $base . "efetch.fcgi?db=$geo_db&query_key=$key&WebEnv=$web";
	$url .= "&rettype=abstract&retmode=text";

	my $data = get($url);

	#Check for a GeoDatasets timeout error and abort run, if found.
	if (!$data) {

		print color ("red"), $failure, color("reset");
		exit; #abort current run
	}

	#add date & time to current input file download
	my $geo_datasets_file = "$cancer\_cancer_GEO_$current_date_time.txt";

	#a lexical handle is used so that no other open handle of the program can be disturbed.
	open(my $geo_fh, '>:utf8', "$prog_path/data/$geo_datasets_file") or die "Cannot open file for writing the GDS input:$!\n";

	print {$geo_fh} "$data";

	#buffered write errors (e.g. a full disk) only show up when the file is closed.
	close($geo_fh) or die "Cannot finish writing the GDS input:$!\n";

	return $geo_datasets_file;
}
############################ SUBROUTINE 6 #######################################################
#This subroutine performs minor formatting of a GEO input file to merge the title and abstract
#lines together to prevent the regex lines from missing potential keyword hits in the 'title'
#line.
#  $_[0]  name of the raw input file    (read from   "$prog_path/data/")
#  $_[1]  name of the reformatted file  (written to  "$prog_path/data/")
#A title line is a line that starts with "<number>. "; it is joined, without any separator,
#to the line that follows it. Every other line is copied unchanged.
#Dies when one of the two files cannot be opened or the reformatted file cannot be completed.
sub format_input {

	my ($raw_input, $out_file) = @_;
	my $pending_title = "";

	#the file named in the message is the one this subroutine was asked to format.
	print color ("green"), "Formatting Input: $raw_input...", color("reset");

	open (my $raw_fh, '<', "$prog_path/data/$raw_input") or die "Cannot open file for reformatting: $raw_input. $!.\n";
	open (my $formatted_fh, '>', "$prog_path/data/$out_file") or die "Cannot open file for writing reformatted data: $out_file $!\n";

	while (my $raw_line = <$raw_fh>) {

		#title line: keep it back (without its line break) until the next line is read.
		if ($raw_line =~ m/^\d+\.\s+/) {

			$pending_title = $raw_line;
			chomp($pending_title);
		}

		#any other line (the abstract comes directly after the title): write it out,
		#prefixed by the title that was kept back, if there is one.
		else {

			print {$formatted_fh} $pending_title . $raw_line;
			$pending_title = ""; #reinitialize variable for next entry.
		}
	}

	close ($raw_fh);
	#buffered write errors only show up when the file is closed.
	close ($formatted_fh) or die "Cannot finish writing reformatted data: $out_file $!\n";

	print color ("green"), "done\n", color("reset");
}
############################ SUBROUTINE 7 #######################################################
#This subroutine runs the main processing steps, while running other subroutines to continue the
#processing pipeline.
#  $_[0]  name of the formatted input file (read from "$prog_path/data/")
#  $_[1]  name of the results file (created in $results_subdir)
#
#The formatted input file is read line by line. For every dataset entry the flags $prog_flag
#(prognostic keyword seen in title/abstract), $wget_flag (abstract is cut off with "more..."),
#$human_flag (organism is Homo sapiens) and $flag (the following lines of the entry are of
#interest) are collected; the FTP line of an entry then decides whether its SOFT file is
#downloaded and passed on to the filters.
#At the end the results file gets a time stamp in its name, the summary is printed, the run
#directory is renamed with the time stamp and - unless -k was given - deleted.
sub main {

	my $main_formatted_input_file  = $_[0];
	my $main_output_file           = $_[1];

	print color ("green"), "Analyzing Input: $main_formatted_input_file file...\n", color("reset");

	#open input file
	open (my $input_fh, '<', "$prog_path/data/$main_formatted_input_file") or die "Cannot open file: $main_formatted_input_file $!\n";

	#open output file. FH2 stays a bareword handle on purpose: soft_file_abstract_check()
	#writes to the same handle.
	open (FH2, '>', "$results_subdir$main_output_file") or die "Cannot open file for writing data: $!\n";

	#make sure the organism line can always be printed, even before the first one was seen.
	$simple_hash{'Organism_line'} = "" unless defined $simple_hash{'Organism_line'};

	while (my $line = <$input_fh>) {

		#A numbered title line starts a new dataset entry: forget the organism of the previous
		#entry. Without this the Homo sapiens flag and organism line of an earlier entry would
		#still be set for a later entry, because they are otherwise only cleared while $flag is on.
		if ($line =~ m/^\d+\.\s+/) {

			$human_flag = 0;
			$simple_hash{'Organism_line'} = "";
		}

		if ($line =~ m/.*(prognosis|prognostic|prognostically|prognosticator|survival|survive|survives|survived|surviving).*/i) {

			$flag = 1;
			$prog_flag = 1;
			next;
		}
		#this conditional activates when the above keywords are not found and only "more..." is found.
		#Note this conditional implicitly doesn't get executed if both the desired keyword and "more..." are found.
		elsif ($line =~ m/.+(more\.\.\.)/i) {

			$flag = 1;
			$wget_flag = 1;
			next;
		}

		elsif ($line =~ m/(^Organism:\s+Homo\s+sapiens.*)/i) {

			$simple_hash{'Organism_line'} = "$1";
			$human_flag = 1;
			next;
		}

		elsif ($line =~ m/^Type:.+/) { next; }

		#platform line of one of the requested GPL platforms.
		elsif ($line =~ m/.*Platform.?:\s+GPL($regex_platform)\s+.+/) {

			$i++;
			$flag = 1;
			print "$i.\n$line$simple_hash{'Organism_line'}\n";
			print FH2 "$i.\n$line$simple_hash{'Organism_line'}\n";

			if ($wget_flag) {

				print color ("green"), "Found 'more...': Checking abstract further...\n", color("reset");
				print FH2 "Found 'more...': Checking abstract further...\n";
			}

			next;
		}

		#this regex will probably get alot of unwanted entries, but you can determine if they are legitimate
		#if their samples are written in the output file for one of the 3 GPL570/97/96 platforms.
		elsif ($line =~ m/.*related\s+Platform.?.+/) {

			$i++;
			#Keep flag off to prevent particular GSE datasets - with unlisted platform data - from being processed.
			#This action possibly reduces the FP rate.
			$flag = 0;
			print "$i.\n$line$simple_hash{'Organism_line'}\n";
			print FH2 "$i.\n$line$simple_hash{'Organism_line'}\n";
			next;
		}

		if ($flag) {

					      #FTP line main                              #GSE/GDS_code
			if ($line =~ m/^FTP.+(ftp:\/\/ftp.ncbi.nlm.nih.gov\/geo\/.+\/.+\/)(.+)\//) {

				my $ftp_line1 = $1;
				my $gse_code = $2;
				my $ftp_command = "$ftp_line1" . "$gse_code" . "/";
				my $link = "$ftp_command" . "soft/$gse_code" . "_family.soft.gz";

				print $line;
				print FH2 $line;

				#Check if GDS file is found, then move to next line. Only GSE soft files are desired.
				if ($gse_code =~ m/GDS.*/i) {

					next;
				}

				if ($prog_flag && $human_flag) {

					print color ("yellow"), "Prognostic Text: <Prognostic keywords found>\n", color("reset");
					print FH2 "Prognostic Text filter: <Prognostic keywords found>\n";

					$prog_flag = 0;
					$human_flag = 0;
					$keyword_count++;

					my $unzip_file = download_soft_file($link, $gse_code); #download soft file and store filename in variable $unzip_file.
					my %local_hash = prognostic_signature_finder($unzip_file);

					if (exists ($local_hash{$unzip_file})) { print FH2 "$local_hash{$unzip_file}\n"; }
				}

				elsif ($wget_flag && $human_flag) {

					$more_count++;

					my %local_hash;
					($ftp_keyword_count, %local_hash) = soft_file_abstract_check($link, $gse_code);

					my $current_file = $gse_code . "_family.soft";
					if (exists ($local_hash{$current_file})) { print FH2 "$local_hash{$current_file}\n"; }
					if ($ftp_keyword_count) { $keyword_count++; $extra_count++; } #update keyword count, if a keyword was found in a soft file.

					$wget_flag = 0;
					$human_flag = 0;
				}

				next;
			}

			elsif ($line =~ m/^Series.+/) {
				print "\n";
				next;
			}

			elsif ($line =~ m/^Sample.+/) {
				print "\n";
				next;
			}

			#any other line ends the part of the entry that is of interest: reset all flags.
			else {

				$flag = 0;
				$prog_flag = 0;
				$wget_flag = 0;
				$human_flag = 0;
				next;
			}
		}
	}

	close($input_fh);

	print color ("green"), "Analysis complete.\n", color("reset");
	print color ("green"), "$input_command_line\n\n", color("reset");

	system ("rm $prog_path\/data\/$main_formatted_input_file");

	#put the time stamp between the name and the extension of the results file.
	my ($output_stem, $output_extension) = split (/\./, $main_output_file);
	my $main_output_file_timestamped = $output_stem . "_" . $current_date_time . "." . $output_extension;

	#FH2 is still open here; the summary printed below is written through the open handle
	#to the file that was just renamed.
	system("mv $results_subdir$main_output_file $results_subdir$main_output_file_timestamped");

	print color ("green"), "=========================================================================================\n", color("reset");
	print FH2 "==========================================================================================\n";
	print color ("green"), "Check results file: $results_subdir", color("reset");
	print color ("blue"), "$main_output_file_timestamped\n", color("reset");
	#the colour is reset after every message, so that the terminal is not left green at the end of a run.
	print color ("green"), "Total prognostic datasets found: $filter2_count\n", color("reset");
	print FH2 "Total prognostic datasets found:\t$filter2_count\n";

	foreach my $index (0 .. $#GEO_list) {

		#only the part of the list element before the first underscore is shown.
		my ($GSE_dataset) = split(/_/, "$GEO_list[$index]");
		my $position = $index + 1;
		print color ("green"), "[$position] $GSE_dataset\n", color("reset");
		print FH2 "[$position] $GSE_dataset\n";
	}

	#buffered write errors only show up when the file is closed.
	close(FH2) or warn "Could not finish writing the results file: $!\n";

	#check if output file is empty and if it is, then defined GPL series were not present in the user's input file. Alert user.
	is_file_empty($main_output_file_timestamped);

	#append date and time stamp to current run_dir in the results directory
	system("mv $run_dir $run_dir\_$current_date_time");

	#check $file_cleanup_switch status: if "1" ==> delete .soft files directory.
	#This is only done at the end of a run here to allow for restarts of prematurely terminated runs.
	if ($file_cleanup_switch) {

		#remove the temp run files, keeping only the .out file
		system("rm -r $run_dir\_$current_date_time");
	}

}
############################ SUBROUTINE 8 #######################################################
#This subroutine checks if the output file is empty. If it is, then defined GPL series were not
#present in user's input file.
#  $_[0]  name of the results file inside $results_subdir
#A results file counts as empty when it has no content at all or when its first line is
#already the "=====" rule that precedes the summary; the user is alerted in that case.
#A file that cannot be opened is reported and otherwise ignored.
sub is_file_empty {

	my $results_file = "$results_subdir$_[0]";
	my $check_file;

	unless (open ($check_file, '<', $results_file)) {

		print color ("red"), "Cannot open results file for checking: $results_file $!\n", color("reset");
		return;
	}

	#only the first line is needed.
	my $first_line = <$check_file>;
	close ($check_file);

	if (!defined $first_line or $first_line =~ m/^=+/) {

		print color ("red"), "No GPL series \"$platform_gpl\" were found in $input_file\n", color("reset");
	}

	return;
}
############################ SUBROUTINE 9 #######################################################
#This subroutine is called by other subroutines when a SOFT file download
#is needed for further analysis.
#  $_[0]  URL of the "<GSE id>_family.soft.gz" file
#  $_[1]  GSE id of the dataset
#The archive is downloaded with curl into "$prog_path/temp/", unpacked with gunzip and the
#unpacked .soft file(s) are moved into $run_dir. Nothing is downloaded when the unpacked
#file is already present in $run_dir.
#Returns the name of the unpacked file ("<GSE id>_family.soft") in every case; when the
#download or the unpacking fails an error is printed and the file will not exist.
sub download_soft_file {

	my ($dsf_wget_file, $dsf_gse_id) = @_;
	my $dsf_zip_file   = $dsf_gse_id . "_family.soft.gz";
	my $dsf_unzip_file = $dsf_gse_id . "_family.soft";
	my $dsf_zip_path   = "$prog_path/temp/$dsf_zip_file";

	#Check for the presence of a ".gz file" for the current GSE dataset.
	#If found, this means there is a potential incomplete/or corrupted download.
	#Delete file to restart download.
	if (-e $dsf_zip_path) {

		print color ("red"), "\"$dsf_zip_file\" zip file exists\n", color("reset");
		print color ("red"), "Deleting corrupted file...", color("reset");
		system("rm", $dsf_zip_path);
		print color ("red"), "done\n", color("reset");
	}

	print color ("green"), "Downloading $dsf_gse_id soft file...\n", color("reset");

	#the unpacked file of an earlier (resumed) run is re-used.
	if (-e "$run_dir/$dsf_unzip_file") {

		print color ("red"), "\"$dsf_unzip_file\" unzipped file exists\n", color("reset");
		return $dsf_unzip_file;
	}

	#The URL was parsed from the input file, so curl is started without a shell (list form
	#of system) and is told where to store the archive ("-o") instead of relying on "cd".
	if (system("curl", "-C", "-", "-o", $dsf_zip_path, $dsf_wget_file) != 0) {

		print color ("red"), "Download of \"$dsf_zip_file\" failed (curl status: $?)\n", color("reset");
		return $dsf_unzip_file;
	}

	print color ("green"), "...done\n", color("reset");
	print color ("green"), "Unzipping file...", color("reset"); #unzip file

	if (system("gunzip", $dsf_zip_path) != 0) {

		print color ("red"), "\nUnzipping of \"$dsf_zip_file\" failed (gunzip status: $?)\n", color("reset");
		return $dsf_unzip_file;
	}

	#the shell is needed here for the "*.soft" wildcard.
	system ("mv $prog_path/temp/*.soft $run_dir");

	print color ("green"), "done\n", color("reset");

	return $dsf_unzip_file;
}
############################ SUBROUTINE 10 #######################################################
#This subroutine checks the GSE entries' full abstract for prognostic keywords. If the input
#file's abstract is incomplete, "more..." is found. It calls the download_soft_file() to download
#the .soft file and then checks for prognostic keywords. If keywords are detected, it calls the
#prognostic_signature_finder() to check for prognostic signatures in the same .soft file
#  $_[0]  URL of the "<GSE id>_family.soft.gz" file
#  $_[1]  GSE id of the dataset
#Returns (1, <result hash of prognostic_signature_finder()>) when a keyword was found in a
#"!Series_summary" line, otherwise 0. The outcome is also printed to the terminal and to
#the results file (handle FH2, which the caller has to have opened).
#Dies when the .soft file cannot be opened.
sub soft_file_abstract_check {

	my ($wget_file, $gse_id) = @_;
	my $keyword_found = 0;
	my $unzip_file    = download_soft_file($wget_file, $gse_id); #download soft file and store filename in variable $unzip_file.

	#open soft file and search for prognostic keywords in all GSE entry abstracts.
	open (my $soft_fh, '<', "$run_dir/$unzip_file") or die "Cannot open file: $unzip_file $!\n";

	#a named loop variable is used so that the global $_ of the callers is left alone.
	while (my $soft_line = <$soft_fh>) {

		#the keyword may be the very last word of the summary line.
		if ($soft_line =~ m/^!Series_summary.+(prognosis|prognostic|prognostically|prognosticator|survival|survive|survives|survived|surviving)/i) {

			#one hit is enough: the rest of the file does not need to be read.
			$keyword_found = 1;
			last;
		}
	}

	close ($soft_fh);

	unless ($keyword_found) {

		print color ("yellow"), "Prognostic Text filter: <No prognostic keywords found>\n", color("reset");
		print FH2 "Prognostic Text filter: <No prognostic keywords found>\n";
		return 0;
	}

	print color ("yellow"), "Prognostic Text filter: <Prognostic keywords found>\n", color("reset");
	print FH2 "Prognostic Text filter: <Prognostic keywords found>\n";
	my %local_hash = prognostic_signature_finder($unzip_file);

	#returns count (i.e. 1 => minimum no. of occurrence of keyword), which will be added to count value from main script.
	#and returns a copy of the result of prognostic_signature_finder().
	return (1, %local_hash);
}
############################ SUBROUTINE 11 #######################################################
#This subroutine uses regular expression analysis to detect prognostic signature patterns. 
#The regexes are based on over 50 parsed signatures used for different cancer types.
sub prognostic_signature_finder {

	#Argument:
	#  $_[0] - name of the .soft file; it is opened as "$run_dir/<name>".
	#Returns:
	#  a single-pair hash (file name => outcome message), either the "signature found" or the
	#  "No data found" message.
	#Side effects:
	#  prints the outcome (and the first matching line) to the terminal, pushes the file name onto
	#  @GEO_list when a signature is found, and adds 0 or 1 to $filter2_count.
	my ($soft_file)           = @_;
	my %hash_signature        = ();
	my $regex_tail            = '(:|=)\s*(alive.*|no\sdeath|dea(d|th.*)|deceased|NED|DOD|DOC|0\s|1\s|no|yes|.*patient|died.*|alive|surviv(al|ed)|living|long|short|Y|N|NED|DOD|AWD|Exitus)';
	my $regex_keyword_type_A  = '(.*stat?us.*|.*(dea(d|th)|.*alive).*|.*\srecur\s.*|.*Die.*)' . $regex_tail;
	my $regex_keyword_type_B  = '(dss.?event\s?\(.+\)|dfs.*\w+|drfs.*|e\.dmfs.*|e\.rfs.*|e?\.?os.*)' . $regex_tail;
	my $regex_keyword_type_C  = '((Overall)?\s?survival.*|(overall)?.?event.*|outcome.*|prognosis.*|comort.*|evolution.*)' . $regex_tail;
	my $regex_keyword_type_Ca = '(Overall)?\s?survival\s.*(:|=)\s*(\d*)';
	my $outcome_1             = "Prognostic Signature filter: <Prognostic signature found>\n";
	my $outcome_2             = "Prognostic Signature filter: <No data found>\n";
	my $i                     = 0;
	
	#regexes are evaluated according to their predominance from A - C (then Ca); every pattern
	#triggers the same action, so they are compiled once and tried in order.
	my @signature_patterns = map { qr/^!Sample_characteristics_ch1 = $_/i }
		($regex_keyword_type_A, $regex_keyword_type_B, $regex_keyword_type_C, $regex_keyword_type_Ca);
	
	#open SOFT file and read its contents
	open (my $key_fh, '<', "$run_dir/$soft_file") or die "Cannot open $soft_file: $!";
	
	SOFT_LINE: while (my $soft_line = <$key_fh>) {
	
		for my $pattern (@signature_patterns) {
			
			next unless $soft_line =~ $pattern;
			
			print color ("yellow"), "$outcome_1", color("reset"); 
			print color ("green"), "$soft_line", color("reset");
			$hash_signature{$soft_file} = $outcome_1;
			$i++;
			
			#the first matching line settles the result for this file
			last SOFT_LINE;
		}
	}
	
	#close the handle before returning (previously the close sat after the return and never ran)
	close ($key_fh);

	if ($i) {
		
		#store current dataset in an array
		push (@GEO_list, $soft_file);
	}
	
	else {
		
		#Alert user if no prognostic signature was found in the current GSE .soft file
		print color ("yellow"), "$outcome_2", color("reset");
		$hash_signature{$soft_file} = $outcome_2; 
	}
	
	#count total flagged datasets found by the Prognostic Signature filter
	$filter2_count += $i;
	
	return %hash_signature;
}

#Terminate the program with exit status 0 (success); everything below this point is POD documentation.
exit 0;

=pod 

=encoding utf8

=head1 NAME

geoCancerPrognosticDatasetsRetriever - GEO Cancer Prognostic Datasets Retriever is a bioinformatics tool for cancer prognostic dataset retrieval from the GEO website.

=head1 SYNOPSIS

    Usage: geoCancerPrognosticDatasetsRetriever -h -d "CANCER_TYPE" -p "PLATFORMS_CODES" -f "DIRECTORY_PATH" -k 

An example basic command using "bladder cancer" as a query: 

    $ geoCancerPrognosticDatasetsRetriever -d "bladder cancer"

When using the basic command, the input and output files of geoCancerPrognosticDatasetsRetriever will be found in the `~/geoCancerPrognosticDatasetsRetriever_files/data/` and `~/geoCancerPrognosticDatasetsRetriever_files/results/` directories, respectively.

=head1 DESCRIPTION

Gene Expression Omnibus (GEO) Cancer Prognostic Datasets Retriever is a bioinformatics tool for cancer prognostic dataset retrieval from the GEO database. It requires a GeoDatasets input file listing all GSE dataset entries for a specific cancer (for example, bladder cancer), obtained as a download from the GEO database. This bioinformatics tool functions by applying two heuristic filters to examine individual GSE dataset entries listed in a GEO DataSets input file. The Prognostic Text filter flags for prognostic keywords (ex. "prognosis" or "survival") used by clinical scientists and present in the title/abstract entries of a GSE dataset. If found, this tool retrieves those flagged datasets. Next, the second filter (Prognostic Signature filter) filters these datasets further by applying prognostic signature pattern matching (Perl regular expression signatures) to identify if the GSE dataset is a likely prognostic dataset.

=head1 DEPENDENCIES

=over

=item strict

=item warnings

=item Term::ANSIColor

=item Getopt::Std

=item LWP::Simple

=item File::Basename

=item File::HomeDir

=item App::cpanminus

=item Net::SSLeay

=back

=head1 INSTALLATION

geoCancerPrognosticDatasetsRetriever can be used on any Linux, macOS, or Windows machine. On the Windows operating system you will need to install the Windows Subsystem for Linux (WSL) compatibility layer (L<The WSL Installation Page|https://docs.microsoft.com/en-us/windows/wsl/install/>). Once WSL is launched, the user can follow the geoCancerPrognosticDatasetsRetriever installation instructions described below.

By default, Perl is installed on all Linux or macOS operating systems. Likewise, cURL is installed on all macOS versions. cURL may not be installed on Linux and would need to be manually installed through a Linux distribution’s software centre. It will be installed automatically on Linux Ubuntu by geoCancerPrognosticDatasetsRetriever.

Manual install:

    $  perl Makefile.PL
    $  make
    $  make install

On Linux Ubuntu, you might need to run the last command as a superuser
(`sudo make install`) and you will need to manually install (if not
already installed in your Perl 5 configuration) the following packages:

libfile-homedir-perl

    $  sudo apt-get install -y libfile-homedir-perl

cpanminus

    $  sudo apt -y install cpanminus

LWP::Simple

    $  perl -MCPAN -e 'install "LWP::Simple"'

libnet-ssleay-perl

    $  sudo apt-get install -y libnet-ssleay-perl

CPAN install:

    $  cpanm App::geoCancerPrognosticDatasetsRetriever

To uninstall:

    $  cpanm --uninstall App::geoCancerPrognosticDatasetsRetriever
    
=head1 DATA FILE

The required input file is a GEO DataSets file obtainable as a download from GEO DataSets, upon querying for any particular cancer (for example, bladder cancer) in geoCancerPrognosticDatasetsRetriever.

=head1 EXECUTION INSTRUCTIONS

The basic usage for running geoCancerPrognosticDatasetsRetriever is:

    $  geoCancerPrognosticDatasetsRetriever -d "CANCER_TYPE"

An example basic usage command using "bladder cancer" as a query: 

    $  geoCancerPrognosticDatasetsRetriever -d "bladder cancer"

With the basic usage command, the mandatory -d (download) flag is used to download and then retrieve bladder cancer prognostic dataset(s) associated with the GPL570 platform code (default selection). When using this command, the input and output files of geoCancerPrognosticDatasetsRetriever will be found in the `~/geoCancerPrognosticDatasetsRetriever_files/data/` and `~/geoCancerPrognosticDatasetsRetriever_files/results/` directories, respectively.

For specialized options, allowing more fine-grained user control, the following options are made available:

-p <list of GPL platform codes>

A list of GPL platform codes may be specified prior to execution, for expanding prognostic datasets retrieval for a particular cancer (e.g. bladder cancer). For example:

    $  geoCancerPrognosticDatasetsRetriever -d "bladder cancer" -p "GPL570 GPL97 GPL96"

-f <user-specified absolute path to save results files>

A user-specified absolute path to save results files (overriding the default results directory) may be specified prior to execution. For example:

    $  geoCancerPrognosticDatasetsRetriever -d "bladder cancer" -p "GPL570 GPL97 GPL96" -f "/Bladder_cancer_files/"

With this command, the input files will be found in the same directory as a basic usage run's input files (`~/geoCancerPrognosticDatasetsRetriever_files/data/`). The output files will be found in the user-specified directory (for example, "/Bladder_cancer_files/"), created in the user's home directory.

-k <option to keep temporary files>

This option allows a user to keep large temporary/output files instead of them
being removed by default. For example:

    $  geoCancerPrognosticDatasetsRetriever -d "bladder cancer" -p "GPL570 GPL97 GPL96" -f "/Bladder_cancer_files/" -k

=head1 HELP

Help information can be read by typing the following command: 

    $ geoCancerPrognosticDatasetsRetriever -h

This command will print the following instructions:

Usage: geoCancerPrognosticDatasetsRetriever -h

Mandatory arguments:

    CANCER_TYPE           type of the cancer as query search term

    Optional arguments:
    -h                    show help message and exit
    -p                    list of GPL platform codes
    -f                    user-specified absolute path to save results files
    -k                    option to keep temporary files
    -h                    show help message and exit

=head1 AUTHOR

Abbas Alameer (Bioinformatics and Molecular Modelling Group, Kuwait University), in collaboration with Davide Chicco (University of Toronto)

For information, please contact Abbas Alameer at abbas.alameer(AT)ku.edu.kw

=head1 COPYRIGHT AND LICENSE

Copyright 2021 by Abbas Alameer, Kuwait University

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2 (GPLv2).

=cut
