#!/usr/local/bin/perl
#
# $Id: buildindex,v 1.1 1993/08/31 20:45:35 johans Exp $
#
#
# buildindex
#
#	Builds the index files used by findphone.
#
# Usage:
#	buildindex dbname
#
#	dbname is the name of the database for which the index is being made.
#
#	Before an index can be made for a database, a configuration file
#	describing the database must be made.  The configuration file must
#	have the name: <dbname>.conf .
#
#	The configuration file consists of one option per line.
#	The following options *must* be set for every database:
#
#		WaveformDir
#		LolaDir
#		BaseDir
#		IndexDir
#		WaveformExtention
#		LolaExtention
#		LolaPoints
#		SampleRate
#
# Options:
#
#	BaseDir - Directory which WaveformDir and LolaDir are relative to.
#
#	WaveformDir - Directory in which waveform files are found.
#		      This directory is relative to BaseDir.
#		      Can have more than one.
#
#	LolaDir - Directory in which lola files are found.
#		  This directory is relative to BaseDir.
#		  Can have more than one.
#
#	IndexDir - Directory in which the index files will be placed.
#		   Only *one* database's indexes can be in a directory.
#
#	WaveformExtention - Extension found on the waveform files (adc,wav).
#
#	LolaExtention - Extension found on lola files (ptlola,phn).
#
#	LolaPoints - The time in seconds which 1 'lola point'
#		     represents.
#
#	SampleRate - Sample rate of waveform files in Hz.
#
#
# Example config file:
#
#	BaseDir: /projects/cslu/speech/phonedb
#	IndexDir: /projects/cslu/speech/index/say_lname
#	WaveformExtention: adc
#	LolaExtention: ptlola
#	LolaPoints: 0.003
#	SampleRate: 8000
#	LolaDir: say_lname_handlabels
#	WaveformDir: say_lname
#
#
# Unbuffer the currently selected output handle so progress messages
# show up as soon as they are printed.
$| = 1;

# Directory that holds the <dbname>.conf files read by ParseConfig.
$ConfDir = "/projects/cslu/speech/findphone/configs";

# Single-valued settings, filled in by ParseConfig.  The bare statements
# below assign nothing; they only list the globals in one place.
$BaseDir;
$IndexDir;
$WaveExtention;
$LolaExtention;
$LolaPoints;
$SampleRate;

# One entry per WaveformDir / LolaDir line of the configuration file.
@WaveformDirs;
@LolaDirs;

# Filled by LookForWaveform: lola file name => waveform path relative to BaseDir.
%WaveformFilesList;

# Next file id to hand out; GetFileId increments it after each use.
$CurrentId = 1;

#
# StripWhite( str )
#
#	Returns a copy of str with every trailing whitespace character
#	(spaces, tabs, newlines) removed.  Leading and embedded whitespace
#	is left untouched.
#
sub StripWhite {
	local( $text ) = @_;

	# One anchored substitution removes the whole trailing run at once.
	$text =~ s/\s+$//;

	$text;
}

#
# ParseConfig( dbname )
#
#	Reads $ConfDir/<dbname>.conf and stores its settings in the globals
#	$BaseDir, $IndexDir, $WaveExtention, $LolaExtention, $LolaPoints,
#	$SampleRate, @WaveformDirs and @LolaDirs.
#
#	Option names are matched case-insensitively at the start of a line.
#	Blank (or whitespace-only) lines and lines starting with '#' are
#	ignored.  WaveformDir and LolaDir may be given more than once; every
#	other option may be given only once, a second occurrence prints a
#	message on STDERR and exits with status 1.  Unrecognised lines are
#	reported on STDERR and otherwise ignored.
#
#	Dies if the configuration file cannot be opened.
#
sub ParseConfig {
	local( $dbname ) = @_;
	local( $conffile, $path );

	$conffile = $ConfDir . "/" . $dbname . ".conf";

	open( CONF, $conffile ) || die "buildindex: $conffile: $!\n";

	while( <CONF> ) {

		# Skip empty / whitespace-only lines and comments.
		if( /^\s*$/ || /^#/ ) {
			next;
		}

		# Subs are called with the &NAME(...) form throughout; the old
		# "do NAME(...)" form is a syntax error on perl 5.20 and later.

		if( /^WaveformDir:\s*([^\n]*)/i ) {

			$path = &StripWhite( $1 );
			push( @WaveformDirs, $path );

		} elsif( /^LolaDir:\s*([^\n]*)/i ) {

			$path = &StripWhite( $1 );
			push( @LolaDirs, $path );

		} elsif( /^BaseDir:\s*([^\n]*)/i ) {

			if( defined( $BaseDir ) ) {
				print STDERR "BaseDir already set!!\n";
				exit( 1 );
			}

			$BaseDir = &StripWhite( $1 );

		} elsif( /^IndexDir:\s*([^\n]*)/i ) {

			if( defined( $IndexDir ) ) {
				print STDERR "IndexDir already set!!\n";
				exit( 1 );
			}

			$IndexDir = &StripWhite( $1 );

		} elsif( /^WaveformExtention:\s*([^\n]*)/i ) {

			if( defined( $WaveExtention ) ) {
				print STDERR "Wave extention already set!!\n";
				exit( 1 );
			}

			$WaveExtention = &StripWhite( $1 );

		} elsif( /^LolaExtention:\s*([^\n]*)/i ) {

			if( defined( $LolaExtention ) ) {
				print STDERR "Lola extention already set!!\n";
				exit( 1 );
			}

			$LolaExtention = &StripWhite( $1 );

		} elsif( /^LolaPoints:\s*([^\n]*)/i ) {

			if( defined( $LolaPoints ) ) {
				print STDERR "LolaPoints already set!\n";
				exit( 1 );
			}

			$LolaPoints = &StripWhite( $1 );

		} elsif( /^SampleRate:\s*([^\n]*)/i ) {

			if( defined( $SampleRate ) ) {
				print STDERR "SampleRate already set!\n";
				exit( 1 );
			}

			$SampleRate = &StripWhite( $1 );

		} else {

			# Remove only a real newline: a plain chop would eat the
			# last character of a final line that has no newline.
			s/\n$//;
			print STDERR "Unknown configuration line: \'$_\'\n";

		}

	}

	close( CONF );
}

#
# LookForWaveform( dir )
#
#	Walks $BaseDir/dir recursively and records every entry whose name
#	ends in $WaveExtention in %WaveformFilesList.  The key is the entry
#	name with the waveform extension replaced by $LolaExtention, the
#	value is the entry's path relative to $BaseDir.
#
#	Sub-directories whose name starts with '.' are not descended into.
#	Dies if a directory cannot be opened.
#
sub LookForWaveform {
	local( $dir ) = @_;
	local( $entry, $key, @entries );

	print "looking in $dir\n";

	# Read the complete listing and close the handle before recursing:
	# the bareword handle WAVE_DIR is shared by every level of recursion.
	opendir( WAVE_DIR, "$BaseDir/$dir" ) || die "buildindex: $BaseDir/$dir: $!\n";
	@entries = readdir( WAVE_DIR );
	closedir( WAVE_DIR );

	foreach $entry ( @entries ) {

		# Descend into ordinary (non-dot) sub-directories.
		if( $entry !~ /^\./ && -d "$BaseDir/$dir/$entry" ) {
			&LookForWaveform( "$dir/$entry" );
			next;
		}

		# Everything else is recorded only if it carries the waveform extension.
		next unless $entry =~ /$WaveExtention$/;

		( $key = $entry ) =~ s/$WaveExtention$/$LolaExtention/;
		$WaveformFilesList{$key} = "$dir/$entry";
	}
}

#
# CreateWaveformFileList()
#
#	Runs LookForWaveform on every directory listed in @WaveformDirs,
#	printing a progress message before and after the scan.
#
sub CreateWaveformFileList {
	local( $wavedir );

	print "Creating waveform file list ...\n";
	foreach $wavedir ( @WaveformDirs ) { &LookForWaveform( $wavedir ); }
	print "done.\n";
}

#
# FindWaveformFile( path, lola )
#
#	Finds the waveform file that belongs to the lola file `lola' found
#	in directory `path' (relative to $BaseDir) and returns a freshly
#	assigned file id for it (see GetFileId).
#
#	The waveform is looked for first next to the lola file, i.e. in the
#	same directory with $LolaExtention replaced by $WaveExtention, and
#	then in %WaveformFilesList.  If neither exists a message is printed
#	and an undefined value is returned.
#
sub FindWaveformFile {
	local( $path, $lola ) = @_;
	local( $wave );

	( $wave = $lola ) =~ s/$LolaExtention$/$WaveExtention/;

	# A waveform sitting beside the lola file wins.
	return &GetFileId( "$path/$wave" )
		if -f "$BaseDir/$path/$wave";

	# Otherwise fall back on the list collected from the waveform directories.
	return &GetFileId( $WaveformFilesList{$lola} )
		if defined( $WaveformFilesList{$lola} );

	print "Could not find a waveform file for $lola\n";
	undef;
}

#
# GetFileId( file )
#
#	Assigns the next free id ($CurrentId) to `file', appends the line
#	"<id> <file>" to $IndexDir/ID.map, advances $CurrentId and returns
#	the id that was assigned.
#
#	Dies if ID.map cannot be opened for appending.
#
sub GetFileId {
	local( $file ) = @_;
	local( $assigned ) = $CurrentId;

	open( IDMAP, ">>$IndexDir/ID.map" ) || die "buildindex: $IndexDir/ID.map: $!\n";
	print IDMAP "$assigned $file\n";
	close( IDMAP );

	# The counter moves on only once the mapping has been written.
	$CurrentId++;

	$assigned;
}

#
# DigestLola( path, file )
#
#	Indexes the lola (label) file $BaseDir/path/file.  For every label
#	one line is appended to $IndexDir/<phoneme>.index:
#
#	    <id> <start> <stop> <left ph> <left start> <right ph> <right stop>
#
#	<id> is the id of the matching waveform file (see FindWaveformFile).
#	Times are the values read from the file multiplied by
#	1000 * $LolaPoints.  The first label has no left neighbour and the
#	last label has no right neighbour; the missing context is faked
#	with "#" as phoneme and start-1 (left) or stop+1 (right) as time.
#
#	The first two lines of the file are treated as a header and skipped.
#	Blank lines are ignored; lines with fewer than three fields are
#	reported on STDERR and skipped.  If no waveform file is found
#	(FindWaveformFile has already printed a message) nothing is indexed,
#	because entries without an id would be unusable.
#
#	Dies if the lola file or an index file cannot be opened.
#
sub DigestLola {
	local( $path, $file ) = @_;
	local( $id, $line, $i, $last );
	local( $t_start, $t_stop, $t_ph );
	local( @starts, @stops, @phones );
	local( $c_start, $c_stop, $c_ph );
	local( $left, $right );

	open( LOLA, "$BaseDir/$path/$file" ) || die "buildindex: $BaseDir/$path/$file: $!\n";

	# Eat the first two lines (the header)
	$line = <LOLA>;
	$line = <LOLA>;

	# Id of the waveform file the labels refer to.
	$id = &FindWaveformFile( $path, $file );
	if( !defined( $id ) ) {
		close( LOLA );
		return;
	}

	# Read every label first.  Having them all at hand lets the first and
	# last label (and files with only one or two labels) be handled by the
	# same code as the labels in the middle.
	while( defined( $line = <LOLA> ) ) {

		next if $line =~ /^\s*$/;

		( $t_start, $t_stop, $t_ph ) = split( " ", $line );
		if( !defined( $t_ph ) ) {
			$line =~ s/\s+$//;
			print STDERR "buildindex: $path/$file: skipping malformed label line: '$line'\n";
			next;
		}

		# Convert lola points to milliseconds.
		push( @starts, $t_start * 1000 * $LolaPoints );
		push( @stops,  $t_stop * 1000 * $LolaPoints );
		push( @phones, $t_ph );
	}
	close( LOLA );

	$last = $#phones;
	for( $i = 0; $i <= $last; $i++ ) {

		( $c_start, $c_stop, $c_ph ) = ( $starts[$i], $stops[$i], $phones[$i] );

		# Left context: previous label, or a fake one for the first label.
		if( $i == 0 ) {
			$left = "# " . ( $c_start - 1 );
		} else {
			$left = $phones[$i - 1] . " " . $starts[$i - 1];
		}

		# Right context: next label, or a fake one for the last label.
		# The double space and the trailing space of the fake context are
		# what this program has always written; they are kept so that
		# existing index files and new ones look the same.
		if( $i == $last ) {
			$right = "#  " . ( $c_stop + 1 ) . " ";
		} else {
			$right = $phones[$i + 1] . " " . $stops[$i + 1];
		}

		open( INDEX, ">>$IndexDir/$c_ph.index" ) || die "buildindex: $IndexDir/$c_ph.index: $!\n";
		print INDEX "$id $c_start $c_stop $left $right\n";
		close( INDEX );
	}
}

#
# MakeIndexDir()
#
#	Makes sure $IndexDir is usable as the home of a new index.
#
#	If $IndexDir does not exist it is created with mode 0755.  If it
#	exists it must be an empty directory: when it holds anything other
#	than "." and ".." a message is printed on STDERR and the program
#	exits with status 1.
#
#	Dies if $IndexDir exists but cannot be read as a directory, or if it
#	cannot be created.
#
sub MakeIndexDir {
	local( @entries );

	if( -e $IndexDir ) {

		# An unchecked opendir would make a plain file or an unreadable
		# directory look like a "not empty" directory.
		opendir( IDIR, $IndexDir ) || die "buildindex: $IndexDir: $!\n";
		# Look at the real entries instead of counting on readdir
		# returning exactly "." and "..".
		@entries = grep( $_ ne "." && $_ ne "..", readdir( IDIR ) );
		closedir( IDIR );

		if( @entries ) {
			print STDERR "buildindex: index directory $IndexDir is not empty!\n";
			print STDERR "only one database index per-directory\n";
			exit( 1 );
		}
	} else {

		print "creating index direcotry.\n";

		mkdir( $IndexDir, 0755 ) || 
			die "buildindex: could not create $IndexDir: $!";
	}

}

#
# ProcessDir( dir )
#
#	Walks $BaseDir/dir recursively and runs DigestLola on every entry
#	whose name ends in $LolaExtention.  Sub-directories whose name
#	starts with '.' are not descended into.
#
#	Dies if a directory cannot be opened.
#
sub ProcessDir {
	local($d) = @_;
	local(@files,$f);

	print "Lola dir: $d\n";

	# Read the whole listing and close the handle before recursing: the
	# bareword handle LOLA_DIR is shared by every level of recursion.
	opendir( LOLA_DIR, "$BaseDir/$d" ) || die "buildindex: $BaseDir/$d: $!\n";
	@files = readdir( LOLA_DIR );
	closedir( LOLA_DIR );

	foreach $f ( @files ) {
		if( -d "$BaseDir/$d/$f" && !( $f =~ /^\./ ) ) {
			&ProcessDir( "$d/$f" );
		} elsif( $f =~ /$LolaExtention$/ ) {
			print "indexing $d/$f\n";
			# Called as &NAME(...): the old "do NAME(...)" form is a
			# syntax error on perl 5.20 and later.
			&DigestLola( $d, $f );
		}
	}
}


#
# Main program: exactly one argument, the database name.
#
if( $#ARGV != 0 ) {
	print STDERR "usage: buildindex <database_name>\n";
	exit( 1 );
}

# Subs are called as &NAME(...): the old "do NAME(...)" form is a syntax
# error on perl 5.20 and later.
&ParseConfig( $ARGV[0] );
&MakeIndexDir();
&CreateWaveformFileList();

# Index every lola directory named in the configuration file.
foreach $d ( @LolaDirs ) {
	&ProcessDir($d);
}



