#!/usr/bin/perl
#
# Copyright 1997-1999 Sami Itkonen <si@iki.fi>
#
# Distributed under the GNU General Public License
#
# Sample validator script for HTML::Validator

=head1 NAME

validate - HTML validator

=head1 SYNOPSIS

 validate [-v] [-c catalog] [-n nsgmls] [-d doctype] <URL|file> [URL|file] ...

=head1 DESCRIPTION

 The I<validate> script validates an HTML document. The document can
 be either a local file or a URL.

 Although useful as such, the script was written to demonstrate the usage
 of the HTML::Validator module.

 Documents supplied from the standard input will also be validated.

=head1 OPTIONS

=over 3

=item -v

Verbose operation. Output will be produced even if the document is valid. In
addition, the document type will be printed.

=item -c catalog

The catalog to use instead of the default one.

=item -n nsgmls

The I<nsgmls> executable to use instead of the default one.

=item -d doctype

Use this document type instead of the one declared in the file.

=back

=head1 SEE ALSO

  L<HTML::Validator>, L<LWP>, L<nsgmls>

=head1 AUTHOR

Sami Itkonen <si@iki.fi>

=head1 COPYRIGHT

 Copyright (C) 1997-1999 Sami Itkonen <si@iki.fi>

 HTML::Validator is distributed under the GNU General Public License.

=cut

require 5.004;
require "getopts.pl";

use HTML::Validator;
use Carp;
use FileHandle;

# Name this script was started as, without any leading directory part; it is
# used in the usage text and in the -l listing.
my $ME = substr($0, rindex ($0, '/') + 1);

# Parse the command line switches into the $opt_* globals.  Getopts reports
# an unknown switch or a missing switch argument through a false return
# value; remember it so that the usage text can be shown instead of carrying
# on with a command line that was only half understood.
my $opts_ok = Getopts('vhlsn:c:e:d:pf');

# -e <n> limits the number of errors shown; 10 unless given.  This has to be
# set before Usage is called because the usage text mentions the default.
my $maxerr = defined $opt_e ? $opt_e : 10;

&Usage if $opt_h || !$opts_ok;

#
# -l: print every document type identifier found in the validator's dtdmap
# table, together with the value it maps to, then exit without validating.
if (defined $opt_l) {
    my $list = HTML::Validator->new;
    my $map = $list->{dtdmap};
    print "$ME  1998,1999 Sami Itkonen\n\n";
    print "List of recognized document types (for -d switch):\n\n";
    # Sorted so that the listing comes out in the same order on every run;
    # the loop variable is lexical so that no package global is created.
    foreach my $htmlstr (sort keys %{$map}) {
       printf "%-15s %s\n",$htmlstr,${$map}{$htmlstr};
    }
    print "\n";
    exit 0;
}

# Check the standard input.  With neither a file/URL argument nor pending
# input there is nothing to validate, so show the usage text (Usage dies and
# therefore does not return).  The number of arguments is tested rather than
# the truth of the first one, so that a file named "0" is not mistaken for
# "no arguments".

my $stdin = &CheckSTDIN();
Usage() if !@ARGV && !$stdin;

#-----------------------------------------------------------------------------
# If we have a file from stdin, we check that first
#
# With -f the standard input is a list of file names/URLs, one per line; they
# are queued in front of the command line arguments, in the order given.
# Without -f the standard input is the document itself: it is copied to a
# temporary file, validated under the name 'STDIN', and the file is removed.
#-----------------------------------------------------------------------------

if ($stdin) {
  my $fh = FileHandle->new;
  if (!open($fh,"-")) {
    carp("Can't open stdin");
  }
  elsif ($opt_f) {
    my @names;
    while (defined(my $name = <$fh>)) {
      chomp $name;
      # An empty name cannot be validated, and would end the argument loop
      # early if it were queued.
      next unless length $name;
      push(@names,$name);
    }
    # One unshift of the whole list keeps the names in input order.
    unshift(@ARGV,@names);
  }
  else {
    my $TMPFILE = "/tmp/html.validate.$$";
    my $fo = FileHandle->new;

    # The name is predictable, so insist on creating the file ourselves:
    # O_EXCL makes the open fail instead of following a symlink or
    # overwriting a file somebody else put there.
    require Fcntl;
    my $mode = Fcntl::O_WRONLY() | Fcntl::O_CREAT() | Fcntl::O_EXCL();

    if (sysopen($fo,$TMPFILE,$mode,0600)) {
      # Slurp the whole document; local restores $/ on leaving the block.
      my $data = do { local $/; <$fh> };
      $data = '' unless defined $data;

      # Only validate a copy that is known to be complete; buffered write
      # errors may not show up before close.
      my $written = print $fo $data;
      my $closed = close($fo);
      if ($written && $closed) {
        &CheckFile('STDIN',$TMPFILE);
      }
      else {
        carp("Can't write to file '$TMPFILE'");
      }
      unlink($TMPFILE);
    }
    else {
      carp("Can't write to file '$TMPFILE'");
    }
  }
}

#-----------------------------------------------------------------------------
# go through the arguments
#
# Every remaining argument is both the name shown in the report and the
# file/URL handed to the validator.  The loop runs until @ARGV is empty, so
# an argument such as "0" does not end it early; empty arguments are skipped.
#-----------------------------------------------------------------------------

while (@ARGV) {
  my $url = shift @ARGV;
  next unless defined $url && length $url;
  CheckFile($url,$url);
}

exit;

#-----------------------------------------------------------------------------
# Validate the file
#
# Arguments:
#   $url  - name used for the document in the printed report
#   $file - what is handed to HTML::Validator->new
#
# The command line switches are read from the $opt_* globals and the error
# limit from $maxerr.  The report is printed to standard output.
#-----------------------------------------------------------------------------

sub CheckFile {
  my ($url,$file) = @_;
  my $doc = HTML::Validator->new($file);

  $doc->{subst_URL} = 1 if $opt_s;
  $doc->{maxerr} = $maxerr;

  # -d: use the given document type
  if (defined $opt_d) {
    $doc->doctype($opt_d);
  }

  if ($opt_v) {
    print "Document type for '$url':\n",$doc->doctype
      , " (" , $doc->{dtdfile} ,")\n\n";
  }

  $doc->{nsgmls} = $opt_n if $opt_n;
  $doc->{catalog} = $opt_c if $opt_c;
  $doc->{parser} = \&my_parser if $opt_p;

  $doc->validate;

#  print $doc->{raw_output},"\n";
  if (!defined $doc->is_valid) {
    # No verdict at all: just pass on the validator's message.
    print "File '$url': ",$doc->message,"\n";
  }
  elsif ($doc->is_valid) {
    # Valid documents are only mentioned in verbose mode.
    print "File '$url': ",$doc->message(-1),"\n" if $opt_v;
  }
  else {
    # Invalid: print every message, then every error.
    print "In file '$url':\n\n";
    while (my $msg = $doc->message) {
      print " ",$msg,"\n";
    }
#      print "\n";

    while (my $error = $doc->errors) {
      print $error,"\n";
    }
  }

  # Blank line after each report (valid documents: verbose mode only).
  print "\n" if $opt_v || !$doc->is_valid;

}

#-----------------------------------------------------------------------------
# Tell whether the standard input has something for us to read.  STDIN is
# polled with a zero-timeout select(), so the call never blocks (the method
# comes straight from the perl faq).
#
# Returns 1 when select() flags STDIN as readable, 0 otherwise.
#-----------------------------------------------------------------------------

sub CheckSTDIN {

  # Bit vector with only the STDIN bit set; select() rewrites it in place,
  # leaving set just the handles that are ready.
  my $ready_bits = '';
  vec($ready_bits, fileno(STDIN), 1) = 1;
  select($ready_bits,undef,undef,0);

  # unpack("b") extracts the first bit of the vector as "0" or "1".
  my $first_bit = unpack("b", $ready_bits);

  return $first_bit eq 0 ? 0 : 1;
}

#-----------------------------------------------------------------------------
# Custom parser to override the module default (installed with -p)
#
# Argument: the parser output as one string.  It is read line by line,
# expecting colon-separated records of the form
#
#   <program>:<file>:<line>:<position>:<message>
#
# Records whose file field contains "sgml-lib" are ignored and blank lines
# are skipped.  A non-blank line that does not have this shape is reported
# verbatim, so that unexpected output still counts as an error.
#
# Returns undef when nothing was collected, otherwise a reference to an
# array of error strings.
#-----------------------------------------------------------------------------

sub my_parser {
    my ($output) = @_;
    my @errors;

    # split also delivers a final line that has no trailing newline
    foreach my $line (split /\n/, $output) {
      next unless $line =~ /\S/;

      my ($prog,$file,$li,$pos,$error) =
        $line =~ /^(.*?):(.*?):(.*?):(.*?):(.*)/;

      # Not a five-field record: none of the captures is defined, so there
      # is no line or position to quote; report the raw line instead.
      if (!defined $file) {
        push (@errors,"$line\n");
        next;
      }

      next if $file =~ /sgml-lib/;
      $error =~ s/^E://;
      push (@errors,"Error in line $li, position $pos:\n$error\n");
    }
    return @errors ? \@errors : undef;
}

#-----------------------------------------------------------------------------
# Usage
#
# Ends the program by die-ing with the help text, so the text goes to
# standard error and this sub never returns.  The text interpolates the
# script name ($ME) and the current error limit ($maxerr).
#-----------------------------------------------------------------------------

sub Usage {
  die <<EOF;
$ME  1998,1999 Sami Itkonen

 Usage: $ME [-hvlpsf] [-e maxerr] [-d doctype] [-c catalog] [-n nsgmls]
        <URL|file> [URL|file] ...

Checks the validity of html files, both locally and via http

 Options: 

    -h           show this help text
    -v           verbose mode
    -p           use a custom parser in this script
                 (for demonstration purposes only)
    -s           substitute URLs in doctypes
    -e <maxerr>  maximum number of errors to show (default $maxerr)
    -f           read the filenames to check from stdin
    -d <doctype> use a different document type
    -c <catalog> substitute <catalog> for the default catalog file
    -n <nsgmls>  substitute <nsgmls> for nsgmls
    -l           list HTML identifiers

EOF
}

