#!/usr/cs/bin/perl -w

# use perl                                  -*- mode: Perl; -*-
eval 'exec perl -S $0 "$@"'
  if $running_under_some_shell;

use vars qw($running_under_some_shell);         # no whining!

require 5.003;

# grepmail version 3.1

# Grepmail searches a normal or gzipped mailbox for a given regular expression
# and returns those emails that match the query. It also supports piped input,
# and searches constrained by date. 

# Do a pod2text on this file to get full documentation, or pod2man to get
# man pages.

# Written by David Coppit (coppit@cs.virginia.edu,
#  http://www.cs.virginia.edu/~dwc3q/index.html)

# Please send me any modifications you make. (for the better, that is. :)

# This code is distributed under the GNU General Public License (GPL). See
# http://www.opensource.org/gpl-license.html and http://www.opensource.org/.

# Version History (major changes only)
# 3.1 Added -m, which prints the folder name in which the email was found as
#   an "X-Mailfolder" addition to the header (by Ulli Horlacher
#   <framstag@moep.bb.bawue.de>). Improved error checking on flags. Changed
#   "zcat" to "gunzip -c" to help with backwards compatibility with older
#   versions of gzip (thanks to Eugene Kim <eekim@eekim.com>).
# 3.0 -h and -b can be used together. Rewrote the ProcessMailFile to run 2 to
#   3 times faster, and use less memory. Correctly diagnoses directories as
#   such (by Gerald Pfeifer <pfeifer@dbai.tuwien.ac.at>).
# 2.1 Added -l,-r, and -e, as suggested by Reinhard Max <max@suse.de>. Now
#   uses about 1/3 the memory, and is a little faster.
# 2.0 Added POD documentation at the end of the script (thanks, Jeffrey
#   Haemer <jsh@boulder.qms.com>). -h for headers only -b for body only
# 1.9 "Ignore empty files" by Gerald Pfeifer <pfeifer@dbai.tuwien.ac.at>.
#   Emails without dates are now automatically output no matter what the
#   date specification is. (Better safe than sorry!)
# 1.7 Sped up by Andrew Johnson. It no longer looks for dates unless
#   the email matches the search string.
# 1.6 removed use of Compress::Zlib because it was 30% slower, complicated the
#   code, and because any user with gzip'd mail has zcat...
# 1.5 Andrew Johnson <ajohnson@gpu.srv.ualberta.ca> fixed a couple of bugs.
# 1.4 Incorporated conditional loading of the date module, use of
#   compress::Zlib instead of shelling out to gunzip, as well as some bug
#   fixes, as submitted by Andrew Johnson <ajohnson@gpu.srv.ualberta.ca>
#   (Many thanks!). Also restructured the code a bit.
# 1.3 Made it pipeable so you can do:
#   grepmail <pattern> file | grepmail <pattern>
# 1.1 Support for dates.
# 1.0 Initial version, with -v -i, and gzip support

# Notes:
# It turns out that -h, -b, and -v have some nasty feature interaction. Here's
# a table of how matching should occur for each combination of flags:
#
#  B, H,!V
#  Match if body and header matches
#  B,!H,!V
#  Match if body matches -- don't care about header
# !B, H,!V
#  Match if header matches -- don't care about body
# -V strictly inverts each of the above cases.

#  The best way to think about this is using Venn diagrams. (Especially when
#  trying to figure out whether the header uniquely determines whether the
#  email matches.

use strict;
use FileHandle;
use Getopt::Std;
use Carp;

sub usage
{
<<EOF;
usage: grepmail [[-e] <expr>] [-vihblrm] [-d \"datespec\"] <files...>

-h Search must match header
-b Search must match body
-l Output the names of files having an email matching the expression
-r Output the names of the files and the number of emails matching the
   expression
-m Append "X-Mailfolder: <folder>" to all headers to indicate in which folder
   the match occurred
-i Ignore case in the search expression
-v Output emails that don't match the expression
-e Explicitely name expr (when searching for strings beginning with "-")

Date specifications must be of the form of:
a date like "today", "1st thursday in June 1992", "05/10/93",
  "12:30 Dec 12th 1880", "8:00pm december tenth",
OR "before", "after", or "since", followed by a date as defined above,
OR "between <date> and <date>", where <date> is defined as above.

Files can be ASCII or gzip'd ASCII. You can also pipe gzip'd or normal ASCII to
grepmail.
EOF
}

#-------------------------------------------------------------------------------

my (%opts, $pattern, $unzipMethod);

BEGIN
{
  $pattern = "";

  # Print usage error if no arguments given
  print usage and exit if ($#ARGV < 0);

  if ($ARGV[0] !~ /^-/)
  {
    $pattern = shift @ARGV;
  }

  getopt("ed",\%opts);

# For debugging
#foreach my $i (keys %opts)
#{
#  print "$i: $opts{$i}\n";
#}

  if ($opts{e})
  {
    print "You specified two search patterns.\n" and exit if ($pattern ne "");
    
    $pattern = $opts{e};
  }
  elsif ($pattern eq "")
  {
    # The only time you can't specify the pattern is when -d is being used.
    # This should catch people who do "grepmail -h" thinking it's help.
    print usage and exit if !$opts{d};

    $pattern = ".";
  }

  if ($opts{d})
  {
    unless (eval "require Date::Manip")
    {
      print "You specified -d, but do not have Date::Manip. Get it from CPAN.\n";
      exit;
    }

    import Date::Manip;
  }
}

#-------------------------------------------------------------------------------

# Make the pattern insensitive if we need to
$pattern = "(?i)$pattern" if ($opts{i});

my ($daterestriction, $date1, $date2);

if ($opts{d})
{
  ($daterestriction,$date1,$date2) = &ProcessDate($opts{d});
}
else
{
  $daterestriction = "none";
}

# For debugging
#print "PATTERN: $pattern\n";
#print "FILES: @ARGV\n";
#exit;

# If the user provided input files...
if (@ARGV)
{
  # For each input file...
  my $file;
  foreach $file (@ARGV)
  {
    # First of all, silently ignore empty files...
    next if -z $file;

    # ...and also ignore directories.
    warn "** Skipping directory: '$file' **\n" and next if -d $file;
  
    # If it's not a gzipped file
    if ($file !~ /\.(gz|Z)$/)
    {
      warn "** Skipping binary file: '$file' **\n" and next if -B $file;
      my $fileHandle = new FileHandle;
      $fileHandle->open($file) || die "Can't open $file.\n";
      ProcessMailFile($fileHandle,$file);
      $fileHandle->close();
    }
    # If it is a gzipped file
    else
    {
      my $tempFile = "/tmp/$$";
      `gunzip -c $file > $tempFile`;
      my $fileHandle = new FileHandle;
      $fileHandle->open($tempFile)
          || die "Can't open temporary file used to decompress the file $file.\n";
      ProcessMailFile($fileHandle,$file);
      $fileHandle->close();
      unlink $tempFile;
    }
    
  }
}
# Using STDIN
else
{ 
  my $fileHandle = new FileHandle;
  $fileHandle->open("<&STDIN") || die "Can't dup STDIN $!";

  # If it looks binary, try to unzip it.
  if (-B $fileHandle)
  {
    binmode $fileHandle;
    my $tempFile = "/tmp/$$";
    open(TMP,"|gunzip -c >$tempFile") || die "Can't create $tempFile $!";
    binmode TMP;
    print TMP while <$fileHandle>;
    close TMP;
    $fileHandle->close()||die "Error writing $tempFile $!";

    $fileHandle->open($tempFile) || die "Can't open $tempFile $!";
    ProcessMailFile($fileHandle,"Gzip'd standard input");
    $fileHandle->close();
    unlink $tempFile;      
  }
  # Otherwise save it directly
  else
  {
    my $tempFile = "/tmp/$$";
    open(TMP,">$tempFile") || die "Can't create $tempFile $!";
    print TMP while <$fileHandle>;
    close TMP;
    $fileHandle->close()||die "Error writing $tempFile $!";

    $fileHandle->open($tempFile) || die "Can't open $tempFile $!";
    ProcessMailFile($fileHandle,"Standard input");
    $fileHandle->close();
  }
}

#-------------------------------------------------------------------------------

sub ProcessMailFile ($$)
{
my $fileHandle = shift @_;
my $fileName = shift @_;
my ($numberOfMatches,$start,$matchesBody,$matchesHeader,$paragraph);

$numberOfMatches = 0;

# Read whole paragraphs
$/ = "\n\n";

# This is the main loop. It's executed once for each email
while (!eof($fileHandle))
{
  # Read the header
  $start = tell $fileHandle;
  $paragraph = <$fileHandle>;

  # Save the header for later when we check the date.
  my $header = $paragraph;

  my ($matchesHeader,$matchesBody); 

  # See if the header matches the pattern
  $matchesHeader = ($paragraph =~ /$pattern/om);

  # At this point, we might know enough to print the email.
  if (
      ($opts{h} && $opts{b} && $opts{v} && !$matchesHeader) ||
      ($opts{h} && !$opts{b} && $opts{v} && !$matchesHeader) ||
      (!$opts{h} && !$opts{b} && !$opts{v} && $matchesHeader)
     )
  {
    # Skip to the next email if the date is wrong.
    if (!&CheckDate(\$header))
    {
      SkipToNextEmail($fileHandle);
      next;
    }

    if ($opts{l})
    {
      print "$fileName\n";;

      # We can return since we found at least one email that matches.
      return;
    }
    elsif ($opts{r})
    {
      $numberOfMatches++;
      SkipToNextEmail($fileHandle);
    }
    else
    {
      PrintEmail($fileHandle,$fileName,$start);
    }

    next;
  }

  # We might have enough information to abort early
  if (
      ($opts{h} && $opts{b} && !$opts{v} && !$matchesHeader) ||
      ($opts{h} && !$opts{b} && !$opts{v} && !$matchesHeader) ||
      ($opts{h} && !$opts{b} && $opts{v} && !$matchesHeader) ||
      (!$opts{h} && !$opts{b} && $opts{v} && $matchesHeader)
     )
  {
    do
    {
      $paragraph = <$fileHandle>;
    }
    while (!eof && ($paragraph !~ /^\n*From .*\d{4}/)) ;

    # Back up if we went too far
    seek ($fileHandle, -(length $paragraph), 1) if (!eof);

    next;
  }

  # Now search the body for the pattern
  do
  {
    $paragraph = <$fileHandle>;
  }
  while (!eof && ($paragraph !~ /^\n*From .*\d{4}/) &&
        ($paragraph !~ /$pattern/om));

  if (eof)
  {
    $matchesBody = 0;
  }
  elsif ($paragraph =~ /^\n*From .*\d{4}/)
  {
    seek ($fileHandle, -(length $paragraph), 1);
    $matchesBody = 0;
  }
  else
  {
    $matchesBody = ($paragraph =~ /$pattern/om);
  }

  my $isMatch = (
                 ($opts{b} && $opts{h} && $matchesBody && $matchesHeader) ||
                 ($opts{b} && !$opts{h} && $matchesBody) ||
                 (!$opts{b} && $opts{h} && $matchesHeader) ||
                 (!$opts{b} && !$opts{h} && ($matchesBody || $matchesHeader))
                );

  $isMatch = !$isMatch if $opts{v};

  # If the match occurred in the right place...
  if ($isMatch)
  {
    # Skip to the next email if the date is wrong.
    if (!&CheckDate(\$header))
    {
      SkipToNextEmail($fileHandle);
      next;
    }

    if ($opts{l})
    {
      print "$fileName\n";

      # We can return since we found at least one email that matches.
      return;
    }
    elsif ($opts{r})
    {
      $numberOfMatches++;
      SkipToNextEmail($fileHandle);
    }
    else
    {
      PrintEmail($fileHandle,$fileName,$start);
    }
  }
  else
  {
    # It doesn't match the pattern
    SkipToNextEmail($fileHandle);
  }

}

 print "$fileName: $numberOfMatches\n" if ($opts{r});
}

#-------------------------------------------------------------------------------

sub SkipToNextEmail($)
{
  my $fileHandle = shift;
  my $paragraph;

  do
  {
    $paragraph = <$fileHandle>;
  }
  while (!eof && ($paragraph !~ /^\n*From .*\d{4}/)) ;

  # Back up if we went too far
  seek ($fileHandle, -(length $paragraph), 1) if (!eof);
}

#-------------------------------------------------------------------------------

sub PrintEmail($$$)
{
  my $fileHandle = shift;
  my $fileName = shift;
  my $start = shift;
  my $paragraph;

  seek ($fileHandle, $start, 0);
  $paragraph = <$fileHandle>;

  # Print the mailfolder in the headers if -m was given
  if ($opts{"m"})
  {
    chop $paragraph;
    $paragraph .= "X-Mailfolder: $fileName\n\n";
  }

  do
  {
    print $paragraph;
    $paragraph = <$fileHandle>;
  }
  while (!eof && ($paragraph !~ /^\n*From .*\d{4}/)) ;

  # Back up if we went too far
  seek ($fileHandle, -(length $paragraph), 1) if (!eof);
}

#-------------------------------------------------------------------------------
    
sub CheckDate($)
{
my $emailref = shift;
my ($emailDate, $isInDate);
$emailDate = "";
$isInDate = 0;

if ($opts{d})
{
  # The email might not have a date. In this case, print it out anyway.
  if ($$emailref =~ /^Date:\s*(\S*\s*\S*\s*\S*\s*\S*\s*\S*)/m)
  {
    $emailDate = &ParseDate($1);
    $isInDate = &IsInDate($emailDate,$daterestriction,$date1,$date2);
  }
  else
  {
    $isInDate = 1;
  }
}
else
{
  $isInDate = 1;
}

return $isInDate;

}

#-------------------------------------------------------------------------------

# Figure out what kind of date restriction they want, and what the dates in
# question are.
sub ProcessDate($)
{
my ($daterestriction, $date1, $date2);

if(!defined($_[0]))
{
  return ("none","","");
}

my $datestring = $_[0];

if ($datestring =~ /^before (.*)/)
{
  $daterestriction = "before";
  $date1 = &ParseDate($1);
  $date2 = "";

  if (!$date1)
  {
    die "\"$1\" is not a valid date\n";
  }
}
elsif ($datestring =~ /^(after |since )(.*)/)
{
  $daterestriction = "after";
  $date1 = &ParseDate($2);
  $date2 = "";

  if (!$date1)
  {
    die "\"$2\" is not a valid date\n";
  }
}
elsif ($datestring =~ /^between (.*) and (.*)/)
{
  $daterestriction = "between";
  $date1 = &ParseDate($1);
  $date2 = &ParseDate($2);

  if (!$date1)
  {
    die "\"$1\" is not a valid date\n";
  }
  if (!$date2)
  {
    die "\"$2\" is not a valid date\n";
  }

  # Swap the dates if the user gave them backwards.
  if ($date1 gt $date2)
  {
    my $temp;
    $temp = $date1;
    $date1 = $date2;
    $date2 = $temp;
  }

}
elsif ($date1 = &ParseDate($datestring))
{
  $daterestriction = "on"
}
else
{
  die "Invalid date specification. Use \"$0 -h\" for help\n";
}

return ($daterestriction,$date1,$date2);

}

#-------------------------------------------------------------------------------

sub IsInDate($$$$)
{
my ($emailDate,$daterestriction,$date1,$date2);
$emailDate = shift @_;
$daterestriction = shift @_;
$date1 = shift @_;
$date2 = shift @_;

# Here we do the date checking.
if ($daterestriction eq "none")
{
  return 1;
}
else
{
  if ($daterestriction eq "before")
  {
    if ($emailDate lt $date1)
    {
      return 1;
    }
    else
    {
      return 0;
    }
  }
  elsif ($daterestriction eq "after")
  {
    if ($emailDate gt $date1)
    {
      return 1;
    }
    else
    {
      return 0;
    }
  }
  elsif ($daterestriction eq "on")
  {
    if (&UnixDate($emailDate,"%m %d %Y") eq &UnixDate($date1,"%m %d %Y"))
    {
      return 1;
    }
    else
    {
      return 0;
    }
  }
  elsif ($daterestriction eq "between")
  {
    if (($emailDate gt $date1) && ($emailDate lt $date2))
    {
      return 1;
    }
    else
    {
      return 0;
    }
  }
}

}

#-------------------------------------------------------------------------------

=head1 NAME

grepmail - search mailboxes for mail matching a regular expression

=head1 SYNOPSIS

  grepmail [-e <regex>] [-vihblrm] [-d "datespec"] [mailbox ...]

=head1 DESCRIPTION

=over 2

I<grepmail> looks for mail messages containing a pattern, and prints the
resulting messages on standard out.

By default I<grepmail> looks in both header and body for the specified pattern.

When redirected to a file, the result is another mailbox, which can, in turn,
be handled by standard User Agents, such as I<elm>, or even used as input for
another instance of I<grepmail>.

The pattern is optional if -d is used, and must precede all flags unless it is
specified using -e.

=back

=head1 OPTIONS AND ARGUMENTS

Many of the options and arguments are analogous to those of grep.

=over 8

=item B<pattern>

The pattern to search for in the mail message.  May be any Perl regular
expression, but should be quoted on the command line to protect against
globbing (shell expansion). To search for more than one pattern, use the form
"(pattern1|pattern2|...)".

=item B<mailbox>

Mailboxes must be traditional, UNIX C</bin/mail> mailbox format.  The
mailboxes may be zipped by gzip, in which case gunzip must be installed on the
system.  If no mailbox is specified, takes input from stdin, which can be
gzip'd or not.

=item B<-b>

Places the assertion that the pattern must match in the body of the email.

=item B<-h>

Places the assertion that the pattern must match in the header of the email.

=item B<-i>

Make the search case-insensitive (by analogy to I<grep -i>).

=item B<-v>

Invert the sense of the search, (by analogy to I<grep -v>). Note that this
affects only -h and -b, not -d. This results in the set of emails printed
being the complement of those that would be printed without the -v switch.

=item B<-l>

Output the names of files having an email matching the expression, (by analogy
to I<grep -l>).

=item B<-r>

Generate a report of the names of the files containing emails matching the
expression, along with a count of the number of matching emails.

=item B<-m>

Append "X-Mailfolder: <folder>" to all email headers, indicating which folder
contained the matched email.

=item B<-e>

Explicitely specify the search pattern. This is useful for specifying patterns
that begin with "-", which would otherwise be interpreted as a flag.

=item B<-d>

Date specifications must be of the form of:
  - a date like "today", "1st thursday in June 1992", "05/10/93", "12:30 Dec 12th 1880", "8:00pm december tenth",
  - OR "before", "after", or "since", followed by a date as defined above,
  - OR "between <date> and <date>", where <date> is defined as above.

=back

=head1 EXAMPLES

Get all email that you mailed yesterday

  grepmail -d yesterday sent-mail

Get all email that you mailed before the first thursday in June 1998 that
pertains to research:

  grepmail research -d "before 1st thursday in June 1992" sent-mail

Get all email you received since 8/20/98 that wasn't about research or your
job, ignoring case:

  grepmail "(research|job)" -i -d "since 8/20/98" -v saved-mail

Get all email about mime but not about Netscape. Constrain the search to match
the body, since most headers contain the text "mime":

  grepmail mime -b saved-mail | grepmail Netscape -v

Print a list of all mailboxes containing a message from Rodney. Constrain the
search to the headers, since quoted emails may match the pattern:

  grepmail "^From.*Rodney" -h -l saved-mail*

Find all emails with the text "Pilot" in both the header and the body:

  grepmail "Pilot" -h -b saved-mail*

Print a count of the number of messages about grepmail in all saved-mail
mailboxes:

  grepmail grepmail -b -r saved-mail*

=head1 AUTHOR

  David Coppit, <coppit@cs.virginia.edu>,
  http://www.cs.virginia.edu/~dwc3q/index.thml

=head1 SEE ALSO

elm(1), mail(1), grep(1), perl(1), printmail(1), Mail::Internet(3)
Crocker,  D.  H., Standard for the
Format of Arpa Internet Text Messages, RFC822.

=cut

