#!/usr/local/bin/perl
#
# $Id: findphone,v 1.1 1993/08/31 20:45:41 johans Exp $
#
# findphone
#
#   Performs a context sensitive search of phonemes in a given
#   database of label waveforms.
#
#
# Usage:
#	 findphone dbname [options] search_pattern
#
# Options:
#
#	-lola <file>	sets the lola output file name.
#			Defaults to <dbname>.$LolaExtention
#	
#	-from <file>	sets the 'from' file name.
#			Defaults to from.$LolaExtention
#
#	-wave <file>	sets the waveform output file name.
#			Defaults to <dbname>.$WaveExtention
#
#	-pause <number>	sets the pause in milliseconds between
#			the found phonemes. Defaults to 60 ms.
#
#	-pre		saves the pre-phoneme.
#
#	-post		saves the post-phoneme.
#
#	-count <number>	saves <number> randomly chosen phonemes.
#			Defaults to saving all found phonemes.
#
#	-nowave		tells findphone not to create
#			the wave file.
#
#	-seq <number>	is used together with -count.  This causes
#			findphone to save the next -count
#			number of phonemes after <number>.
#			
#
# Search Pattern:
#
#	Format:  <pre-phonemes> , <phonemes> , <post-phonemes>
#
#	<pre-phonemes>, <phonemes>, and <post-phonemes> are
#	whitespace-separated lists of phonemes. <pre-phonemes>
#	and <post-phonemes> can also be '-', which matches all phonemes.
#
# Example:
#
#	findphone say_lname -lola z.lola -wave z.adc -pause 50 -,z,-
#	findphone say_lname -pause 50 aa , z,-
#	findphone say_lname -lola z.lola aa ie ,ey z , ix
#

# Directory searched by ParseConfig for "<dbname>.conf".
$ConfDir = "/projects/cslu/speech/findphone/configs";

# NOTE: the bare "$Name;" / "@Name;" statements below do nothing at run
# time.  They only list the script's globals, which therefore start out
# undefined (ParseConfig relies on that for its "already set" checks).

# Set by ParseConfig from the database configuration file.
$BaseDir;
$IndexDir;
$WaveExtention;
$LolaExtention;
$LolaPoints;		# in seconds
$SampleRate;		# in Hz

# "all", or the number of found phonemes to keep (-count / "count").
$NumberOfWaveforms = "all";
# Start index given with -seq; only meaningful together with -count.
$SequentsNumber;

$PauseLength = 60;	# 60 millisecond pause between phonemes
# The next three are derived from $PauseLength, $LolaPoints and
# $SampleRate by Init.
$PauseSamples;
$SamplesPerFrame;
$FramesPerPause;

# Output file names; Init installs defaults, options/commands override.
$FromOutputFile;
$LolaOutputFile;
$WaveOutputFile;

$DatabaseName;

# Boolean switches (1 = on) tested by CreateLolaFile/CreateWaveFile and
# the drivers.
$SavePrePhoneme = 0;
$SavePostPhoneme = 0;
$CreateWaveFile = 1;

# Directory lists collected from WaveformDir:/LolaDir: configuration lines.
@WaveformDirs;
@LolaDirs;

# Search results: one "<phoneme> <index line>" string per match
# (filled by SearchIndexs).
@PhonemeFoundList;

#
# CommandTable
#
# Command table for the interactive CLI: maps the first word typed at
# the "findphone> " prompt to the name of the sub that InteractiveDriver
# calls with the remaining words as arguments.  "?" and "h" are both
# aliases for the command listing (PrintCmd).
#
%CommandTable = (
	"quit", "QuitCmd",
	"database", "DbnameCmd",
	"show", "ShowCmd",
	"lola", "LolaCmd",
	"from", "FromCmd",
	"wave", "WaveCmd",
	"pre", "PreCmd",
	"post", "PostCmd",
	"pause", "PauseCmd",
	"count", "CountCmd",
	"help", "HelpCmd",
	"?", "PrintCmd", 
	"h", "PrintCmd", 
	"search", "SearchCmd"
);

#
# HelpTable
#
# Help strings for the interactive commands, keyed by the same command
# words as %CommandTable.  HelpCmd prints "<command> - <help string>".
#
%HelpTable = (
	"quit", "exit findphone",
	"database", "sets the current database name",
	"show", "display the current parameter settings",
	"lola", "sets the name of the lola output file",
	"from", "sets the name of the from output file",
	"wave", "sets the name of the waveform output file",
	"pre", "toggles the saving of the pre-phoneme waveform",
	"post", "toggles the saving of the post-phoneme waveform",
	"count", "sets the number of waveforms to save",
	"search", "perform a phoneme search",
	"pause", "sets the pause in msec between the saved waveforms",
	"help", "prints help information",
	"?", "list commands",
	"h", "list commands",
);


#
# ExtractCmd
#
#  Maps $WaveExtention's to the extractor program.  CreateWaveFile
#  looks up the configured waveform extension here and pipes one
#  "<file> <first> <last>" line per found phoneme into that program.
#
%ExtractCmd = (
	"adc", "wavextract",
	"wav", "wavextract"
);

# Indexed by the numeric file id; holds the file name read from
# $IndexDir/ID.map by ReadFileMap.  (The bare statement is a no-op.)
@IdMap;


#
# StripWhite( string )
#
#   Returns a copy of the string with all trailing whitespace
#   (blanks, tabs, newlines, ...) removed.  Leading and embedded
#   whitespace is left alone.
#
sub StripWhite {
	local( $text ) = @_;

	# One substitution instead of chopping a character at a time.
	$text =~ s/\s+$//;

	$text;
}

#
# ParseConfig( dbname )
#
#   Reads $ConfDir/<dbname>.conf and fills in the configuration globals:
#
#	WaveformDir:        pushed onto @WaveformDirs (may repeat)
#	LolaDir:            pushed onto @LolaDirs     (may repeat)
#	BaseDir:            $BaseDir
#	IndexDir:           $IndexDir
#	WaveformExtention:  $WaveExtention
#	LolaExtention:      $LolaExtention
#	LolaPoints:         $LolaPoints
#	SampleRate:         $SampleRate
#
#   Keys are matched case-insensitively; values have trailing whitespace
#   removed.  Blank lines and lines starting with '#' are skipped.
#   A single-valued key given twice in one file is fatal (exit 1).
#   Dies if the file cannot be opened.
#
#   Every call starts from a clean slate, so a second database can be
#   loaded in interactive mode without tripping the "already set" checks
#   or accumulating directory entries from the previous database.
#
sub ParseConfig {
	local( $dbname ) = @_;
	local( $conffile, $line, $key, $value );

	$conffile = $ConfDir . "/" . $dbname . ".conf";

	open( CONF, $conffile ) || die "findphone: could not open configuration file for database \'$dbname\' ($conffile: $!)";

	# Forget the settings of any previously loaded database.
	undef( $BaseDir );
	undef( $IndexDir );
	undef( $WaveExtention );
	undef( $LolaExtention );
	undef( $LolaPoints );
	undef( $SampleRate );
	@WaveformDirs = ();
	@LolaDirs = ();

	# A private line variable is used so the caller's $_ is not clobbered.
	while( $line = <CONF> ) {

		$line =~ s/\n$//;

		# blank (or whitespace-only) lines and comments
		next if $line =~ /^\s*$/ || $line =~ /^#/;

		if( $line !~ /^([A-Za-z]+):\s*(.*)/ ) {
			print STDERR "Unknown configuration line: \'$line\'\n";
			next;
		}

		( $key, $value ) = ( $1, $2 );
		$key =~ tr/A-Z/a-z/;
		$value =~ s/\s+$//;

		if( $key eq "waveformdir" ) {

			push( @WaveformDirs, $value );

		} elsif( $key eq "loladir" ) {

			push( @LolaDirs, $value );

		} elsif( $key eq "basedir" ) {

			&ParseConfigAbort( "BaseDir already set!!" ) if defined( $BaseDir );
			$BaseDir = $value;

		} elsif( $key eq "indexdir" ) {

			&ParseConfigAbort( "IndexDir already set!!" ) if defined( $IndexDir );
			$IndexDir = $value;

		} elsif( $key eq "waveformextention" ) {

			&ParseConfigAbort( "WaveformExtention already set!" ) if defined( $WaveExtention );
			$WaveExtention = $value;

		} elsif( $key eq "lolaextention" ) {

			&ParseConfigAbort( "LolaExtention already set!" ) if defined( $LolaExtention );
			$LolaExtention = $value;

		} elsif( $key eq "lolapoints" ) {

			&ParseConfigAbort( "LolaPoints already set!" ) if defined( $LolaPoints );
			$LolaPoints = $value;

		} elsif( $key eq "samplerate" ) {

			&ParseConfigAbort( "SampleRate already set!" ) if defined( $SampleRate );
			$SampleRate = $value;

		} else {

			print STDERR "Unknown configuration line: \'$line\'\n";

		}
	}

	close( CONF );
}

# Private helper of ParseConfig: prints the message on STDERR and exits 1.
sub ParseConfigAbort {
	local( $msg ) = @_;

	print STDERR $msg, "\n";
	exit( 1 );
}

#
# ParseOptions()
#
#   Consumes leading options from @ARGV and stores them in the option
#   globals ($LolaOutputFile, $FromOutputFile, $WaveOutputFile,
#   $PauseLength, $SavePrePhoneme, $SavePostPhoneme, $NumberOfWaveforms,
#   $SequentsNumber, $CreateWaveFile).
#
#   Parsing stops, leaving the rest of @ARGV untouched, at the first
#   word that does not start with '-' or that begins the search pattern:
#   a lone "-" (wildcard pre-phoneme given as its own word) or a word
#   starting with "-," / "- ," (e.g. "-,z,-").
#
#   An unknown option, or an option missing its value, is reported on
#   STDERR and the program exits with status 1.
#
sub ParseOptions {
	local( $arg );

	while( @ARGV ) {

		$arg = $ARGV[0];

		last if $arg !~ /^-/;

		# '-' as the pre-phoneme wildcard starts the search pattern
		last if $arg eq "-" || $arg =~ /^-\s*,/;

		shift( @ARGV );

		if( $arg eq "-lola" ) {

			$LolaOutputFile = &ParseOptionValue( $arg );

		} elsif( $arg eq "-from" ) {

			$FromOutputFile = &ParseOptionValue( $arg );

		} elsif( $arg eq "-wave" ) {

			$WaveOutputFile = &ParseOptionValue( $arg );

		} elsif( $arg eq "-pause" ) {

			$PauseLength = &ParseOptionValue( $arg );

		} elsif( $arg eq "-count" ) {

			$NumberOfWaveforms = &ParseOptionValue( $arg );

		} elsif( $arg eq "-seq" ) {

			$SequentsNumber = &ParseOptionValue( $arg );

		} elsif( $arg eq "-pre" ) {

			$SavePrePhoneme = 1;

		} elsif( $arg eq "-post" ) {

			$SavePostPhoneme = 1;

		} elsif( $arg eq "-nowave" ) {

			$CreateWaveFile = 0;

		} else {

			# plain print: the argument must not be used as a printf format
			print STDERR "findphone: bad arg \'$arg\'\n";
			exit( 1 );
		}
	}
}

# Private helper of ParseOptions: removes and returns the value that
# follows option $opt in @ARGV, or exits with status 1 if there is none.
sub ParseOptionValue {
	local( $opt ) = @_;

	if( $#ARGV < 0 ) {
		print STDERR "findphone: option \'$opt\' requires a value\n";
		exit( 1 );
	}

	shift( @ARGV );
}


#
# ParsePhonemes( words... )
#
#   Joins the words of a search pattern into one string in which every
#   comma is surrounded by exactly one blank and whitespace runs are
#   collapsed ("aa ie ,ey z , ix" -> "aa ie , ey z , ix"), then hands
#   that string to SetPhonemes.
#
sub ParsePhonemes {
	local( @args ) = @_;
	local( $tmplist );

	$tmplist = join( " ", @args );
	$tmplist =~ s/\s*,\s*/ , /g;
	$tmplist =~ s/\s+/ /g;

	# &SUB(LIST) instead of the Perl 4 "do SUB(LIST)" form, which
	# current perls reject as a syntax error.
	&SetPhonemes( $tmplist );

}

#
# ReadFileMap()
#
#   Loads $IndexDir/ID.map into @IdMap.  Each line holds a numeric file
#   id and a file name separated by whitespace; $IdMap[id] is set to the
#   file name.  Blank or malformed lines (no file name, non-numeric id)
#   are ignored.  @IdMap is emptied first so that entries of a
#   previously loaded database cannot leak into the new one.
#   Dies if the map file cannot be opened.
#
sub ReadFileMap {
	local( $line, $id, $file );

	open( ID_MAP, "$IndexDir/ID.map" ) || die "could not open map file $IndexDir/ID.map: $!";

	@IdMap = ();

	while( $line = <ID_MAP> ) {

		# split(' ') tolerates tabs and runs of blanks between the fields
		( $id, $file ) = split( ' ', $line );

		next unless defined( $file ) && $id =~ /^\d+$/;

		$IdMap[$id] = $file;
	}

	close( ID_MAP );
}
#
# SetPhonemes( pattern )
#
#   Splits a search pattern "<pre> , <phonemes> , <post>" at the commas
#   and sets the search globals:
#
#	$PrePhonemes  "(p1|p2|...)" alternation of the pre-phonemes,
#	              or "(-)" for the wildcard
#	@Phonemes     the list of phonemes to look up
#	$SucPhonemes  "(s1|s2|...)" alternation of the post-phonemes,
#	              or "(-)" for the wildcard
#
#   Whitespace around each field is ignored, so both "aa , z , -" and
#   the compact "aa,z,-" are accepted.  A pattern that does not have
#   exactly three fields, or has a malformed field, is reported on
#   STDERR and the program exits with status 1.
#
sub SetPhonemes {
	local( $str ) = @_;
	local( @list, $i );

	@list = split( /,/ , $str );

	# Count the fields before touching them: modifying $list[2] of a
	# shorter list would silently create the missing elements.
	if( $#list != 2 ) {
		print STDERR "$0: bad phoneme search pattern \'$str\'\n";
		exit( 1 );
	}

	# Trim instead of blindly chopping one character, which ate the
	# last letter of a field that had no trailing blank.
	foreach $i ( 0, 1, 2 ) {
		$list[$i] =~ s/^\s+//;
		$list[$i] =~ s/\s+$//;
	}

	if( ( $list[0] !~ /^\w+( \w+)*/ ) && ( $list[0] ne '-' )
	    && ($list[0] !~ /#h/) && ($list[0] !~ /h#/) ) {
		print STDERR "$0: bad phoneme search pattern (left) \'$str\'\n";
		exit( 1 );
	}

	if( ($list[1] !~ /^\w+( \w+)*/ ) && ($list[1] !~ /h#/) && ($list[1] !~ /#h/)) {
		print STDERR "$0: bad phoneme search pattern (center) \'$str\'\n";
		print STDERR "list[1] is <$list[1]>\n";
		exit( 1 );
	}

	if( ( $list[2] !~ /^\w+( \w+)*/ ) && ( $list[2] ne '-' )
	    && ($list[2] !~ /#h/) && ($list[2] !~ /h#/) ) {
		print STDERR "$0: bad phoneme search pattern (right) \'$str\'\n";
		exit( 1 );
	}

	# Build the alternations from the individual words so that repeated
	# blanks cannot produce an empty (match-everything) alternative.
	$PrePhonemes = '(' . join( '|', split( ' ', $list[0] ) ) . ')';
	$SucPhonemes = '(' . join( '|', split( ' ', $list[2] ) ) . ')';

	@Phonemes = split( ' ', $list[1] );

}

#
# SearchDBFile( file, pre_pattern, post_pattern )
#
#   Returns the lines of the index file whose pre- and post-phoneme
#   fields match the two patterns.  Index lines are expected to look
#   like "<n> <n> <n> <pre> <n> <post> <n>".  A pattern equal to "(-)"
#   is the wildcard and matches anything; any other pattern is used as
#   a regular expression as given (normally "(p1|p2|...)").
#   Returns the empty list if the file cannot be opened.
#
sub SearchDBFile {
	local( $file, $pat1, $pat2 ) = @_;
	local( $line, $pattern, @res );

	$pat1 = ".+" if $pat1 eq "(-)";
	$pat2 = ".+" if $pat2 eq "(-)";

	$pattern = "^\\d+ \\d+ \\d+ $pat1 \\d+ $pat2 \\d+";

	open( INDEX, $file ) || return( () );

	# No /o on the match: the pattern differs from one search to the
	# next, and /o would keep using the first pattern ever compiled.
	while( $line = <INDEX> ) {
		push( @res, $line ) if $line =~ /$pattern/;
	}

	close( INDEX );

	@res;
}

#
# SearchIndexs()
#
#   Searches the index file "$IndexDir/<phoneme>.index" of every phoneme
#   in @Phonemes for lines matching $PrePhonemes / $SucPhonemes and
#   stores the matches in @PhonemeFoundList as "<phoneme> <index line>".
#   Phonemes without a (non-empty) index file are skipped with a message
#   on STDERR.
#
#   The result list is emptied first: every search starts afresh, so
#   repeated interactive searches do not pile up earlier results.
#
sub SearchIndexs {
	local( $p, $k, $index, @tmp );

	@PhonemeFoundList = ();

	foreach $p ( @Phonemes ) {

		$index = "$IndexDir/$p.index";

		if( ! -s $index ) {
			print STDERR "$0: no index for phoneme \'$p\', skipping\n";
			next;
		}

		@tmp = &SearchDBFile( $index, $PrePhonemes, $SucPhonemes );

		foreach $k ( @tmp ) {
			push( @PhonemeFoundList, "$p $k" );
		}
	}
}


#
# CreateLolaFile()
#
#   Writes two label files describing the phonemes in @PhonemeFoundList
#   as they are laid out back to back in the output waveform:
#
#	$LolaOutputFile  "<first> <last> <label>" for each saved
#	                 pre-phoneme (if $SavePrePhoneme), phoneme,
#	                 post-phoneme (if $SavePostPhoneme) and the pause
#	                 of $FramesPerPause frames that follows
#	$FromOutputFile  "<first> <last> <source file> <source frame>"
#	                 for each phoneme
#
#   Both files start with a "MillisecondsPerFrame:" header.  Entries of
#   @PhonemeFoundList are "<ph> <id> <start> <stop> <pre_ph> <pre_start>
#   <post_ph> <post_stop>" with times in milliseconds.
#   Dies if an output file cannot be opened.
#
sub CreateLolaFile {
	local( $cur, $l );
	local( $ph, $id, $start, $stop, $p_ph, $p_start );
	local( $s_ph, $s_stop );
	local( $framerate );

	$cur = 0;

	open( LOLA, ">$LolaOutputFile" ) || die "findphone: could not open $LolaOutputFile: $!";
	open( FROM, ">$FromOutputFile" ) || die "findphone: could not open $FromOutputFile: $!";

	# Add headers to both lola files
	$framerate = $LolaPoints * 1000;	# milliseconds per frame
	print LOLA "MillisecondsPerFrame: $framerate\n";
	print LOLA "END OF HEADER\n";
	print FROM "MillisecondsPerFrame: $framerate\n";
	print FROM "END OF HEADER\n";

	foreach $l ( @PhonemeFoundList ) {
		( $ph, $id, $start, $stop, $p_ph, $p_start,
			$s_ph, $s_stop ) = split( /\s/, $l );

		# Change from milliseconds to frames.  Dividing by the frame
		# length in milliseconds (plus a tiny guard) avoids the
		# floating point error of ms / 1000 / seconds, which made
		# int() land one frame short (e.g. 290 ms at 10 ms/frame
		# gave 28 instead of 29).

		$start = int( $start / $framerate + 1e-9 );
		$stop = int( $stop / $framerate + 1e-9 );
		$p_start = int( $p_start / $framerate + 1e-9 );
		$s_stop = int( $s_stop / $framerate + 1e-9 );

		print "Adding to the lolafile $l\n";
		if( $SavePrePhoneme == 1 ) {
			print LOLA "$cur";
			$cur += ( $start - $p_start );
			print LOLA " $cur $p_ph\n";
		}

		print LOLA "$cur";
		print FROM "$cur";
		$cur += ( $stop - $start );
		print LOLA " $cur $ph\n";
		print FROM " $cur $BaseDir/$IdMap[$id] $start\n";

		if( $SavePostPhoneme == 1 ) {
			print LOLA "$cur";
			$cur += ( $s_stop - $stop );
			print LOLA " $cur $s_ph\n";
		}

		print LOLA "$cur";
		$cur += $FramesPerPause;
		print LOLA " $cur pause\n";

	}

	# Buffered write errors (e.g. disk full) only show up at close.
	close( LOLA ) || print STDERR "findphone: error writing $LolaOutputFile: $!\n";
	close( FROM ) || print STDERR "findphone: error writing $FromOutputFile: $!\n";
}

#
# CreateWaveFile()
#
#   Pipes one line "<source file> <first sample> <last sample>" per
#   entry of @PhonemeFoundList into the extractor program registered in
#   %ExtractCmd for $WaveExtention, started as
#
#	<extractor> $WaveOutputFile $PauseLength
#
#   The sample range covers the phoneme, extended to the start of the
#   pre-phoneme if $SavePrePhoneme is set and to the end of the
#   post-phoneme if $SavePostPhoneme is set.  Does nothing when nothing
#   was found.  Exits with status 2 when no extractor is known for the
#   waveform extension.
#
#   NOTE(review): the command line is handed to the shell unquoted, so
#   an output file name containing blanks or shell metacharacters will
#   not work as intended.
#
sub CreateWaveFile {
	local( $k, $cmd );
	local( $ph, $id, $start, $stop, $p_ph, $p_start );
	local( $s_ph, $s_stop );
	local( $ps, $ss );

	($#PhonemeFoundList == -1) && return;

	if( !defined( $ExtractCmd{$WaveExtention} ) ) {
		print STDERR "findphone: don't know how to extract waveform from an \'$WaveExtention\' file.\n";
		exit( 2 );
	}
	$cmd = $ExtractCmd{$WaveExtention};

	open( EXTRACTOR, "| $cmd $WaveOutputFile $PauseLength" ) ||
		die "could not start $cmd: $!\n";

	foreach $k ( @PhonemeFoundList ) {

		( $ph, $id, $start, $stop, $p_ph, $p_start,
			$s_ph, $s_stop ) = split( /\s/, $k );

		# Change from milliseconds to frames

		$start = $start / 1000 / $LolaPoints;
		$stop = $stop / 1000 / $LolaPoints;
		$p_start = $p_start / 1000 / $LolaPoints;
		$s_stop = $s_stop / 1000 / $LolaPoints;

		# ... and from frames to samples
		$ps = ( $SavePrePhoneme == 1 ? $p_start : $start ) * $SamplesPerFrame;
		$ss = ( $SavePostPhoneme == 1 ? $s_stop : $stop ) * $SamplesPerFrame;

		print EXTRACTOR "$BaseDir/$IdMap[$id] $ps $ss\n";

	}

	# A piped open succeeds even if the program cannot be run; its
	# failure (or non-zero exit status) is only reported by close.
	if( !close( EXTRACTOR ) ) {
		print STDERR "findphone: $cmd failed (exit status $?), $WaveOutputFile may be incomplete\n";
	}

}

#
# PickWaveforms( number )
#
#   Reduces @PhonemeFoundList to <number> entries chosen at random
#   (without repetition, in the order drawn).  The list is left
#   untouched when it holds <number> entries or fewer.
#
sub PickWaveforms {
	local( $number ) = @_;
	local( $i, @list );

	# Seed once per run; reseeding with time() on every call would
	# repeat the same picks for calls made within the same second.
	srand( time ^ $$ ) unless $PickWaveformsSeeded++;

	# $#list is the last index, i.e. one less than the element count.
	if( $number >= $#PhonemeFoundList + 1 ) {
		return;
	}

	while( $number-- > 0 ) {

		# rand(N) yields 0 <= x < N, so N must be the element count
		# for the last entry to be selectable.
		$i = int( rand( $#PhonemeFoundList + 1 ) );

		push( @list, splice( @PhonemeFoundList, $i, 1 ) );
	}

	@PhonemeFoundList = @list;
}

#
# SaveWaveforms( start, count )
#
#   Reduces @PhonemeFoundList to the <count> consecutive entries that
#   begin at index <start>.  A window running past the end of the list
#   is cut off at the last entry; a window starting past the end leaves
#   the list empty.
#
sub SaveWaveforms {
	local( $first, $howmany ) = @_;
	local( $lastidx, @window );

	$lastidx = $#PhonemeFoundList;

	# window lies entirely behind the list
	if( $first > $lastidx ) {
		@PhonemeFoundList = ();
		return;
	}

	# window sticks out at the end: shorten it to what is there
	$howmany = $lastidx - $first + 1 if ( $first + ( $howmany - 1 ) ) > $lastidx;

	@window = splice( @PhonemeFoundList, $first, $howmany );
	@PhonemeFoundList = @window;
}

#
# Init()
#
#   Derives the frame/sample bookkeeping from the loaded configuration
#   and installs the default output file names:
#
#	$FramesPerPause   whole frames that fit into $PauseLength
#	$PauseLength      rounded down to a whole number of frames (msec)
#	$PauseSamples     samples per pause
#	$SamplesPerFrame  samples per frame
#	$LolaOutputFile   <dbname>.<LolaExtention>
#	$WaveOutputFile   <dbname>.<WaveExtention>
#	$FromOutputFile   from.<LolaExtention>
#
#   Must be called after ParseConfig.  Note that it overwrites any
#   output file names set earlier.  Dies with an explanatory message if
#   the configuration did not provide a positive LolaPoints value
#   (previously this surfaced as "Illegal division by zero").
#
sub Init {
	local( $ml, $perms );

	if( !defined( $LolaPoints ) || $LolaPoints <= 0 ) {
		die "findphone: LolaPoints missing or not positive in configuration of database \'$DatabaseName\'\n";
	}

	$ml = $LolaPoints * 1000;		# milliseconds per frame
	$perms = int( $SampleRate / 1000 );	# whole samples per millisecond

	$FramesPerPause = int( $PauseLength / $ml );
	$PauseLength = int( $FramesPerPause * $ml );

	$PauseSamples = $perms * $PauseLength;
	$SamplesPerFrame = $ml * $perms;

	$LolaOutputFile = $DatabaseName . "." . $LolaExtention;
	$WaveOutputFile = $DatabaseName . "." . $WaveExtention;
	$FromOutputFile = "from." . $LolaExtention;

}

#
# CommandLineDriver()
#
#   Runs one search as described by @ARGV:
#
#	<database> [options] <search pattern>
#
#   Loads the database configuration, parses the options, searches the
#   phoneme indexes, narrows the result according to -count / -seq and
#   writes the waveform (unless -nowave) and lola/from files.
#
sub CommandLineDriver {
	local( $lola, $wave, $from );

	$DatabaseName = shift( @ARGV );

	&ParseConfig( $DatabaseName );
	&ParseOptions();

	# Init derives the pause frame count from $PauseLength and installs
	# the default output names, so it has to run after the options are
	# known (otherwise -pause never reaches the lola file).  Names given
	# on the command line are put back afterwards.
	( $lola, $wave, $from ) =
		( $LolaOutputFile, $WaveOutputFile, $FromOutputFile );
	&Init();
	$LolaOutputFile = $lola if defined( $lola );
	$WaveOutputFile = $wave if defined( $wave );
	$FromOutputFile = $from if defined( $from );

	&ReadFileMap();

#
# Collect Phonemes to look for
#
	&ParsePhonemes( @ARGV );
	@ARGV = ();

	&SearchIndexs();

	if( defined( $SequentsNumber ) ) {

		if( $NumberOfWaveforms eq "all" ) {
			print STDERR "findphone: must use -count with -seq\n";
			exit( 1 );
		}

		&SaveWaveforms( $SequentsNumber, $NumberOfWaveforms );

	} elsif( $NumberOfWaveforms ne "all" ) {

		&PickWaveforms( $NumberOfWaveforms );
	}

	if( $CreateWaveFile == 1 ) {
		&CreateWaveFile();
	}

	&CreateLolaFile();
}

#
# "quit" command: leaves findphone with exit status 0.
# Any arguments are ignored.
#
sub QuitCmd {
	exit( 0 );
}

# Prints the synopsis of the "quit" command.
sub QuitUsage {
	print( "usage: quit\n" );
}

#
# "database <name>" command: makes <name> the current database by
# loading its configuration (ParseConfig), recomputing the derived
# settings and default output names (Init) and reading its file map
# (ReadFileMap).  Prints the usage line unless exactly one argument
# is given.
#
sub DbnameCmd {
	local( @args ) = @_;

	if( $#args != 0 ) {
		&DbnameUsage();
		return;
	}

	$DatabaseName = $args[0];

	&ParseConfig( $DatabaseName );
	&Init();
	&ReadFileMap();

}

# Prints the synopsis of the "database" command (the command word in
# %CommandTable is "database", not "dbname").
sub DbnameUsage {
	print "usage: database <database_name>\n";
}

#
# "from <file>" command: sets the name of the 'from' output file.
# Prints the usage line unless exactly one argument is given.
#
sub FromCmd {
	local( @args ) = @_;

	if( $#args != 0 ) {
		&FromUsage();
		return;
	}

	$FromOutputFile = $args[0];

}

# Prints the synopsis of the "from" command.
sub FromUsage {
	print "usage: from <file>\n";
}

#
# "lola <file>" command: sets the name of the lola output file.
# Prints the usage line unless exactly one argument is given.
#
sub LolaCmd {
	local( @args ) = @_;

	if( $#args != 0 ) {
		&LolaUsage();
		return;
	}

	$LolaOutputFile = $args[0];

}

# Prints the synopsis of the "lola" command.
sub LolaUsage {
	print "usage: lola <file>\n";
}

#
# "wave <file>" command: sets the name of the waveform output file.
# Prints the usage line unless exactly one argument is given.
#
sub WaveCmd {
	local( @args ) = @_;

	if( $#args != 0 ) {
		&WaveUsage();
		return;
	}

	$WaveOutputFile = $args[0];

}

# Prints the synopsis of the "wave" command.
sub WaveUsage {
	print "usage: wave <file>\n";
}

#
# "pause <time>" command: sets the pause (in milliseconds) inserted
# between saved phonemes.  Prints the usage line unless exactly one
# non-negative number is given.
#
# When a database is already loaded ($LolaPoints known), the values
# derived from the pause length are recomputed the same way Init does:
# $FramesPerPause, $PauseLength rounded down to whole frames, and
# $PauseSamples.  Without this the lola file kept using the pause that
# was in effect when the database was loaded.
#
sub PauseCmd {
	local( @args ) = @_;
	local( $ml );

	if( $#args != 0 || $args[0] !~ /^\d+(\.\d+)?$/ ) {
		&PauseUsage();
		return;
	}

	$PauseLength = $args[0];

	if( defined( $LolaPoints ) && $LolaPoints > 0 ) {
		$ml = $LolaPoints * 1000;	# milliseconds per frame
		$FramesPerPause = int( $PauseLength / $ml );
		$PauseLength = int( $FramesPerPause * $ml );
		$PauseSamples = int( $SampleRate / 1000 ) * $PauseLength;
	}

}

# Prints the synopsis of the "pause" command.
sub PauseUsage {
	print "usage: pause <time>\n";
}

#
# "pre" command: toggles $SavePrePhoneme, i.e. whether the phoneme
# preceding each match is saved as well, and reports the new state.
# Takes no arguments; prints the usage line if any are given.
#
sub PreCmd {
	local( @args ) = @_;

	if( $#args >= 0 ) {
		&PreUsage();
		return;
	}

	if( $SavePrePhoneme == 1 ) {

		$SavePrePhoneme = 0;
		print "pre-phoneme waveform not saved\n";

	} else {

		$SavePrePhoneme = 1;
		print "saving pre-phoneme waveform\n";

	}
}

# Prints the synopsis of the "pre" command.
sub PreUsage {
	print "usage: pre\n";
}

#
# "post" command: toggles $SavePostPhoneme, i.e. whether the phoneme
# following each match is saved as well, and reports the new state.
# Takes no arguments; prints the usage line if any are given.
#
sub PostCmd {
	local( @args ) = @_;

	if( $#args >= 0 ) {
		&PostUsage();
		return;
	}

	if( $SavePostPhoneme == 1 ) {

		$SavePostPhoneme = 0;
		print "post-phoneme waveform not saved\n";

	} else {

		$SavePostPhoneme = 1;
		print "saving post-phoneme waveform\n";

	}
}

# Prints the synopsis of the "post" command.
sub PostUsage {
	print "usage: post\n";
}

#
# "count <number>|all" command: sets $NumberOfWaveforms, the number of
# found phonemes to keep ("all" keeps every one).  Prints the usage
# line for a wrong argument count or a value that is neither "all" nor
# a non-negative integer.
#
sub CountCmd {
	local( @args ) = @_;

	if( $#args != 0 ) {
		&CountUsage();
		return;
	}

	if( $args[0] eq "all" || $args[0] =~ /^\d+$/ ) {

		$NumberOfWaveforms = $args[0];

	} else {
		&CountUsage();
	}
}

# Prints the synopsis of the "count" command.
sub CountUsage {
	print "usage: count <number>|all\n";
}

#
# "help [<command>...]" command: prints "<command> - <help text>" from
# %HelpTable for each command named, or for every command of
# %CommandTable (in alphabetical order) when none is named.  A word
# without a help entry is reported as having no help instead of
# printing an empty description.
#
sub HelpCmd {
	local( @args ) = @_;
	local( $k );

	if( $#args < 0 ) {
		@args = sort keys %CommandTable;
	}

	foreach $k ( @args ) {
		if( defined( $HelpTable{$k} ) ) {
			print "$k - $HelpTable{$k}\n";
		} else {
			print "$k - no help available\n";
		}
	}

}

#
# "?" / "h" command: lists all command words of %CommandTable in
# alphabetical order, right-aligned in columns one character wider
# than the longest command, with as many columns as fit into 60
# characters.
#
# The column width is kept in a local variable.  It used to leak into
# a global (the local list named "$maxlin" by mistake), which made the
# columns one character wider on every call.
#
sub PrintCmd {
	local( $name, $maxlen, $format, $col, $percol );
	local( @list );

	@list = sort keys %CommandTable;

	$maxlen = 0;
	foreach $name ( @list ) {
		$maxlen = length( $name ) if length( $name ) > $maxlen;
	}

	$maxlen++;
	$format = "%" . $maxlen . "s";
	$percol = int( 60 / $maxlen );

	$col = 0;
	foreach $name ( @list ) {
		printf( $format, $name );
		if( ++$col >= $percol ) {
			print "\n";
			$col = 0;
		}
	}

	# finish a partially filled last row
	print "\n" if $col > 0;
}

#
# "search <pattern>" command: runs a phoneme search in the current
# database and writes the output files.  The pattern words are handed
# to ParsePhonemes; matches are collected by SearchIndexs, optionally
# thinned out to $NumberOfWaveforms random picks, and written by
# CreateWaveFile (only while $CreateWaveFile is set, as in command
# line mode) and CreateLolaFile.
#
# Progress messages are written unbuffered; the previous autoflush
# setting of STDOUT is restored afterwards instead of being forced off.
#
sub SearchCmd {
	local( @args ) = @_;
	local( $oldflush );

	if( $#args < 0 ) {
		&SearchUsage();
		return;
	}

	if( !defined( $DatabaseName ) ) {
		print "database not set\n";
		return;
	}

	&ParsePhonemes( @args );

	select( STDOUT );
	$oldflush = $|;
	$| = 1;

	print "search in progress ...";
	&SearchIndexs();
	print " done.\n";

	if( $NumberOfWaveforms ne "all" ) {
		print "picking $NumberOfWaveforms to save ...";
		&PickWaveforms( $NumberOfWaveforms );
		print " done.\n";
	}

	print "writing output ...";
	if( $CreateWaveFile == 1 ) {
		&CreateWaveFile();
	}
	&CreateLolaFile();
	print " done.\n";

	$| = $oldflush;

}

# Prints the synopsis of the "search" command.
sub SearchUsage {
	print "usage: search <pattern>\n";
}

#
# "show [<parameter>...]" command: prints the current value of each
# named parameter, or of all parameters when none is named.  Known
# parameters are database, lola, from, wave, pause, count, pre and
# post; anything else is reported as unknown.
#
sub ShowCmd {
	local( @args ) = @_;
	local( $k );

	if( $#args < 0 ) {
		@args = ( "database", "lola", "from", "wave", "pause",
			  "count", "pre", "post" );
	}

	foreach $k ( @args ) {

		if( $k eq "database" ) {

			if( defined( $DatabaseName ) ) {
				print "database: $DatabaseName\n";
			} else {
				print "database: not set\n";
			}

		} elsif( $k eq "lola" ) {

			if( defined( $LolaOutputFile ) ) {
				print "lola: $LolaOutputFile\n";
			} else {
				print "lola: not set\n";
			}

		} elsif( $k eq "from" ) {

			# the 'from' file can be set with the "from" command,
			# so it has to be showable as well
			if( defined( $FromOutputFile ) ) {
				print "from: $FromOutputFile\n";
			} else {
				print "from: not set\n";
			}

		} elsif( $k eq "wave" ) {

			if( defined( $WaveOutputFile ) ) {
				print "wave: $WaveOutputFile\n";
			} else {
				print "wave: not set\n";
			}

		} elsif( $k eq "pause" ) {

			if( defined( $PauseLength ) ) {
				print "pause: $PauseLength\n";
			} else {
				print "pause: not set\n";
			}

		} elsif( $k eq "count" ) {

			if( $NumberOfWaveforms eq "all" ) {
				print "count: saving all waveforms found\n";
			} else {
				print "count: $NumberOfWaveforms\n";
			}

		} elsif( $k eq "pre" ) {

			if( $SavePrePhoneme == 1 ) {
				print "pre: saving pre-phonemes\n";
			} else {
				print "pre: pre-phonemes not saved\n";
			}

		} elsif( $k eq "post" ) {

			if( $SavePostPhoneme == 1 ) {
				print "post: saving post-phonemes\n";
			} else {
				print "post: post-phonemes not saved\n";
			}

		} else {
			print "$k: unknown parameter\n";
		}
	}
}

# Prints the synopsis of the "show" command.
sub ShowUsage {
	print "usage: show [<parameter>]\n";
}

#
# InteractiveDriver()
#
#   Interactive command loop: prompts with "findphone> ", reads a line
#   from STDIN, splits it into words and calls the sub registered for
#   the first word in %CommandTable with the remaining words as
#   arguments.  Empty lines just prompt again; unknown commands are
#   reported.  Returns at end of input.
#
sub InteractiveDriver {
	local( $cmd, $word, @line );

	select( STDOUT );
	$| = 1;

	print "findphone> ";

	while( <STDIN> ) {

		# Splitting on whitespace already discards the line ending;
		# chopping first would eat a real character of a final line
		# that has no newline.
		@line = split( ' ' );

		if( $#line >= 0 ) {

			$word = shift( @line );

			if( defined( $CommandTable{$word} ) ) {
				$cmd = $CommandTable{$word};
				print "\n";
				# call the sub whose name is stored in $cmd
				&$cmd( @line );
				print "\n";
			} else {
				print "\n$word: unknown command.\n\n";
			}
		}

		print "findphone> ";
	}
}

#
#
# Main
#
#

# Without arguments there is nothing to do: explain the two ways of
# running findphone and fail.
if( $#ARGV < 0 ) {
	print STDERR "usage:\n";
	print STDERR "       findphone -i\n";
	print STDERR "       findphone <database> [options] <search string>\n";

	exit( 1 );
}

# "-i" selects the interactive command loop; anything else is taken
# as "<database> [options] <search string>".
if( $ARGV[0] eq "-i" ) {
	&InteractiveDriver();
} else {
	&CommandLineDriver();
}
