#!/usr/bin/perl
# Copyright 2000-5 Vlado Keselj www.cs.dal.ca/~vlado

sub help { print STDERR <<"#EOT" }
# Report new material on a web page, version $VERSION
#
# Uses diff, lynx or wget, sendmail (if option -e is used)
#
# Usage: report-new.pl [switches] URL
#  -h    Print help and exit.
#  -v    Print version of the program and exit.
#  -e email Sends output, if not empty, to email.
#  -d method Dumping method.  The default is lynx_txt; other option is
#        wget.
#EOT

use strict;
use POSIX qw(strftime);

use vars qw( $VERSION );
$VERSION = sprintf "%d.%d", q$Revision: 1.7 $ =~ /(\d+)/g;

use Getopt::Std;
use vars qw($opt_v $opt_h $opt_e $opt_d);
getopts("hve:d:");

if ($opt_v) { print "$VERSION\n"; exit; }
elsif ($opt_h || !@ARGV) { &help(); exit; }

my $dump = \&dump_lynx_txt;
if ($opt_d ne '') {
    if   ($opt_d eq 'lynx_txt') { $dump = \&dump_lynx_txt }
    elsif($opt_d eq 'wget'    ) { $dump = \&dump_wget     }
    else { &help(); print STDERR "error: '-d $opt_d'"; exit -1; }
}

($#ARGV==0 && $ARGV[0]=~/^http:\/\//) ||
    die "Format: report-new.pl http://...\n";

my ($urlbase, $url);
$urlbase = $url = shift;   # E.g.: http://www.cs.dal.ca/~vlado/srcperl
if ( $url =~ m.//[^/]*/. )
{ $urlbase = $`.$& }	   # E.g.: http://www.cs.dal.ca/

my $urlId = &encode_w1($url);
my $timestamp = strftime("%Y-%m-%d-%T", localtime(time));

if (! -d 'tmp')
{ mkdir 'tmp', 0700 or die "can't mkdir tmp: $!" }
if (! -d 'report-new.pl.d')
{ mkdir 'report-new.pl.d', 0700 or die "can't mkdir report-new.pl.d: $!" }

my $TmpBase  = "tmp/$urlId-$timestamp";
my $TmpFile1 = "$TmpBase-1";
my $lastFile = "report-new.pl.d/$urlId.last";
-e $lastFile or putfile($lastFile,'');

# First step: grab the page
&$dump($url, $TmpFile1);
my $material = getfile($TmpFile1);

$material = `diff $TmpFile1 $lastFile 2>&1`;
$material =~ s/^[^<].*\n//mg;
$material =~ s/^< //mg;

if ($material) {
    if ($opt_e) {
        my $out;
        open($out, "|sendmail -t") or die;
	print $out "To: $opt_e\n".
	     "Subject: [report-new.pl] $url\n\n$material";
	close($out);
    }
    else { print $material }
}
unlink($lastFile);
rename($TmpFile1, $lastFile);

sub putfile($@) {
    my $f = shift;
    local *F;
    open(F, ">$f") or die "putfile:cannot open $f:$!";
    print F '' unless @_;
    while (@_) { print F shift(@_) }
    close(F)
}

sub encode_w1( $ ) {
    local $_ = shift;
    s/[\W_]/'_'.uc unpack("H2",$&)/ge;
    return $_;
}

sub dump_lynx_txt {
    my $url = shift;
    my $file = shift;
    local *F;
    open(F,"|lynx -dump -nolist -") or die "lynx error:$!";
    print F $url;
    close(F);
}

sub dump_wget {
    my $url = shift;
    my $file = shift;
    system('wget', '--quiet', '-O', $file, $url);
}

sub getfile($) {
    my $f = shift;
    local *F;
    open(F, "<$f") or die "getfile:cannot open $f:$!";
    my @r = <F>;
    close(F);
    return wantarray ? @r : join ('', @r);
}

__END__
=head1 NAME

report-new.pl - Report new material on a web page

=head1 SYNOPIS

  report-new.pl [switches] URL

=head1 DESCRIPTION

Reports new material on a web page.  Typically used as a cron job with
the -e option.

  -h    Print help and exit.
  -v    Print version of the program and exit.
  -e email Sends output, if not empty, to email.
        justify [input files]
  -d method Dumping method.  The default is lynx_txt; other option is
        wget.

=head1 PREREQUISITES

POSIX qw(strftime);
uses diff, lynx or wget, sendmail (if option -e is used).

=head1 SCRIPT CATEGORIES

Web

=head1 README

Reports new material on a web page.

=head1 SEE ALSO

Scripts:
wget

=head1 THANKS

I would like to thank Peet Moris for bug reports and comments.

=head1 COPYRIGHT

Copyright 2000-5 Vlado Keselj F<http://www.cs.dal.ca/~vlado>

This script is provided "as is" without expressed or implied warranty.
This is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

The latest version can be found at F<http://www.cs.dal.ca/~vlado/srcperl/>.

=cut
