#!/usr/bin/perl -wT
#
# W3C Link Checker
# by Hugo Haas <hugo@w3.org>
# (c) 1999-2009 World Wide Web Consortium
# based on Renaud Bruyeron's checklink.pl
#
# $Id: checklink,v 4.154 2009/03/28 13:50:42 ville Exp $
#
# This program is licensed under the W3C(r) Software License:
#       http://www.w3.org/Consortium/Legal/copyright-software
#
# The documentation is at:
#       http://validator.w3.org/docs/checklink.html
#
# See the CVSweb interface at:
#       http://dev.w3.org/cvsweb/perl/modules/W3C/LinkChecker/
#
# An online version is available at:
#       http://validator.w3.org/checklink
#
# Comments and suggestions should be sent to the www-validator mailing list:
#       www-validator@w3.org (with 'checklink' in the subject)
#       http://lists.w3.org/Archives/Public/www-validator/ (archives)

use strict;

# Get rid of potentially unsafe and unneeded environment variables.
delete(@ENV{qw(IFS CDPATH ENV BASH_ENV)});
$ENV{PATH} = ''; # undef would output warnings with Perl 5.6.1's Cwd.pm.

# ...but we want PERL5LIB honored even in taint mode, see perlsec, perl5lib,
# http://www.mail-archive.com/cpan-testers-discuss%40perl.org/msg01064.html
BEGIN {
  # undefinedness and "v-string in use/require non-portable" warnings with
  # perl5lib 1.02 and perl 5.10.0, rt.cpan.org #43446, #43447
  local $^W = 0;
  require perl5lib;
}

# -----------------------------------------------------------------------------

package W3C::UserAgent;

use LWP::RobotUA   1.19 qw();
use LWP::UserAgent      qw();

# if 0, ignore robots exclusion (useful for testing)
use constant USE_ROBOT_UA => 1;

if (USE_ROBOT_UA) {
  @W3C::UserAgent::ISA = qw(LWP::RobotUA);
} else {
  @W3C::UserAgent::ISA = qw(LWP::UserAgent);
}

sub new
{
  my $proto = shift;
  my $class = ref($proto) || $proto;
  my ($name, $from, $rules) = @_;

  # For security/privacy reasons, if $from was not given, do not send it.
  # Cheat by defining something for the constructor, and resetting it later.
  my $from_ok = $from;
  $from ||= 'www-validator@w3.org';

  my $self;
  if (USE_ROBOT_UA) {
    $self = $class->SUPER::new($name, $from, $rules);
  } else {
    my %cnf;
    @cnf{qw(agent from)} = ($name, $from);
    $self = LWP::UserAgent->new(%cnf);
    $self = bless $self, $class;
  }

  $self->from(undef) unless $from_ok;

  $self->env_proxy();

  $self->allow_private_ips(1);

  # TODO: bug 29
  $self->protocols_forbidden([qw(mailto javascript)]);

  return $self;
}

sub allow_private_ips
{
  my $self = shift;
  if (@_) {
    $self->{Checklink_allow_private_ips} = shift;
    if (!$self->{Checklink_allow_private_ips}) {
      # Pull in dependencies
      require Net::IP;
      require Socket;
      require Net::hostent;
    }
  }
  return $self->{Checklink_allow_private_ips};
}

sub redirect_progress_callback
{
  my $self = shift;
  $self->{Checklink_redirect_callback} = shift if @_;
  return $self->{Checklink_redirect_callback};
}

sub simple_request
{
  my $self = shift;

  my $response = $self->ip_disallowed($_[0]->uri());

  # RFC 2616, section 15.1.3
  $_[0]->remove_header("Referer")
    if ($_[0]->referer() &&
        (secure_scheme($_[0]->referer()) && !secure_scheme($_[0]->uri())));

  $response ||= do {
    local $SIG{__WARN__} = sub { # Suppress some warnings, rt.cpan.org #18902
      warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/);
    };
    # @@@ Why not just $self->SUPER::simple_request?
    $self->W3C::UserAgent::SUPER::simple_request(@_);
  };

  if (! defined($self->{FirstResponse})) {
    $self->{FirstResponse} = $response->code();
    $self->{FirstMessage} = $response->message() || '(no message)';
  }

  return $response;
}

sub redirect_ok
{
  my ($self, $request, $response) = @_;

  if (my $callback = $self->redirect_progress_callback()) {
    # @@@ TODO: when an LWP internal robots.txt request gets redirected, this
    # will a bit confusingly fire for it too.  Would need a robust way to
    # determine whether the request is such a LWP "internal robots.txt" one.
    &$callback($request->method(), $request->uri());
  }

  return 0 unless $self->SUPER::redirect_ok($request, $response);

  if (my $res = $self->ip_disallowed($request->uri())) {
    $response->previous($response->clone());
    $response->request($request);
    $response->code($res->code());
    $response->message($res->message());
    return 0;
  }

  return 1;
}

#
# Checks whether we're allowed to retrieve the document based on its IP
# address.  Takes an URI object and returns a HTTP::Response containing the
# appropriate status and error message if the IP was disallowed, 0
# otherwise.  URIs without hostname or IP address are always allowed,
# including schemes where those make no sense (eg. data:, often javascript:).
#
sub ip_disallowed
{
  my ($self, $uri) = @_;
  return 0 if $self->allow_private_ips(); # Short-circuit

  my $hostname = undef;
  eval { $hostname = $uri->host() }; # Not all URIs implement host()...
  return 0 unless $hostname;

  my $addr = my $iptype = my $resp = undef;
  if (my $host = Net::hostent::gethostbyname($hostname)) {
    $addr = Socket::inet_ntoa($host->addr()) if $host->addr();
    if ($addr && (my $ip = Net::IP->new($addr))) {
      $iptype = $ip->iptype();
    }
  }
  if ($iptype && $iptype ne 'PUBLIC') {
    $resp = HTTP::Response->new(403,
    'Checking non-public IP address disallowed by link checker configuration');
    $resp->header('Client-Warning', 'Internal response');
  }
  return $resp;
}

sub secure_scheme
{
  my $uri = shift or return 0;
  $uri = URI->new($uri) unless ref($uri);
  return ($uri->scheme() =~ /^(?:file|https|ldaps|sips|snews|ssh)$/i);
}

# -----------------------------------------------------------------------------

package W3C::LinkChecker;

use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
            $DocType $Head $Accept $ContentTypes %Cfg);

use HTML::Entities       qw();
use HTML::Parser    3.20 qw(); # >= 3.20 for "line" argspec identifier
use HTTP::Request        qw();
use HTTP::Response  1.50 qw(); # >= 1.50 for decoded_content()
use Time::HiRes          qw();
use URI             1.31 qw(); # >= 1.31 for sip: abs/rel
use URI::Escape          qw();
# @@@ Needs also W3C::UserAgent but can't use() it here.

use constant RC_ROBOTS_TXT          => -1;
use constant RC_DNS_ERROR           => -2;
use constant RC_IP_DISALLOWED       => -3;
use constant RC_PROTOCOL_DISALLOWED => -4;

use constant LINE_UNKNOWN  => -1;

use constant MP2 =>
  (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2);

# Tag=>attribute mapping of things we treat as links.
# Note: base/@href and meta/@http-equiv get special treatment, see start()
# for details.
use constant LINK_ATTRS => {
    a          => ['href'],
    area       => ['href'],
    audio      => ['src'],
    blockquote => ['cite'],
    body       => ['background'],
    del        => ['cite'],
    embed      => ['href', 'pluginspage', 'pluginurl', 'src'], # proprietary
    # form/@action not checked (side effects)
    frame      => ['longdesc', 'src'],
    iframe     => ['longdesc', 'src'],
    img        => ['longdesc', 'src'],
    ins        => ['cite'],
    # input/@action not checked (side effects)
    input      => ['src'],
    link       => ['href'],
    object     => ['data'],
    q          => ['cite'],
    script     => ['src'],
    source     => ['src'],
    video      => ['src'],
};

# Tag=>attribute mapping of things we treat as space separated lists of links.
use constant LINK_LIST_ATTRS => {
    a          => ['ping'],
    area       => ['ping'],
    head       => ['profile'],
};

# TBD/TODO:
# - applet/@archive, @code?
# - bgsound/@src?
# - object/@classid?
# - object/@archive?
# - isindex/@action?
# - layer/@background,@src?
# - ilayer/@background?
# - table,tr,td,th/@background?
# - xmp/@href?

@W3C::LinkChecker::ISA =  qw(HTML::Parser);

BEGIN
{
  # Version info
  $PACKAGE     = 'W3C Link Checker';
  $PROGRAM     = 'W3C-checklink';
  $VERSION     = '4.5';
  $REVISION    = sprintf('version %s (c) 1999-2009 W3C', $VERSION);
  my ($cvsver) = q$Revision: 4.154 $ =~ /(\d+[\d\.]*\.\d+)/;
  $AGENT       = sprintf('%s/%s [%s] %s',
                         $PROGRAM, $VERSION, $cvsver,
                         (W3C::UserAgent::USE_ROBOT_UA
                           ? LWP::RobotUA->_agent()
                           : LWP::UserAgent->_agent()));

  # Pull in mod_perl modules if applicable.
  eval {
    local $SIG{__DIE__};
    require Apache2::RequestUtil;
  } if MP2();

  my @content_types = qw(
    text/html
    application/xhtml+xml;q=0.9
    application/vnd.wap.xhtml+xml;q=0.6
  );
  $Accept = join(', ', @content_types, '*/*;q=0.5');
  my $re = join('|', map { s/;.*// ; quotemeta } @content_types);
  $ContentTypes = qr{\b(?:$re)\b}io;

  #
  # Read configuration.  If the W3C_CHECKLINK_CFG environment variable has
  # been set or the default contains a non-empty file, read it.  Otherwise,
  # skip silently.
  #
  my $defaultconfig = '/etc/w3c/checklink.conf';
  if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) {

    require Config::General;
    Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy

    my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig;
    eval {
      my %config_opts =
        ( -ConfigFile        => $conffile,
          -SplitPolicy       => 'equalsign',
          -AllowMultiOptions => 'no',
        );
      %Cfg = Config::General->new(%config_opts)->getall();
    };
    if ($@) {
      die <<".EOF.";
Failed to read configuration from '$conffile':
$@
.EOF.
    }
  }
  $Cfg{Markup_Validator_URI} ||=
    'http://validator.w3.org/check?uri=%s';
  $Cfg{CSS_Validator_URI} ||=
    'http://jigsaw.w3.org/css-validator/validator?uri=%s';
  $Cfg{Doc_URI} ||=
    'http://validator.w3.org/docs/checklink.html';

  # Untaint config params that are used as the format argument to (s)printf(),
  # Perl 5.10 does not want to see that in taint mode.
  ($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/);
  ($Cfg{CSS_Validator_URI})    = ($Cfg{CSS_Validator_URI}    =~ /^(.*)$/);

  $DocType = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
  my $css_url = URI->new_abs('linkchecker.css', $Cfg{Doc_URI});
  $Head = sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url);
<meta http-equiv="Content-Script-Type" content="text/javascript" />
<meta name="generator" content="%s" />
<link rel="stylesheet" type="text/css" href="%s" />
<script type="text/javascript">
function show_progress(progress_text, progress_head_id, progress_pre_id) {
var progressHead = document.getElementById(progress_head_id);
var txt = document.createTextNode(progress_text);
oChild=progressHead.childNodes[1];
oNewChild=document.createElement("span");
oNewChild.appendChild(txt);
progressHead.replaceChild(oNewChild, oChild);
scroll_bottom(progress_pre_id);
};
function scroll_bottom(progress_pre_id) {
var progressPre = document.getElementById(progress_pre_id);
progressPre.scrollTop = progressPre.scrollHeight;
}</script>
EOF

  # Trusted environment variables that need laundering in taint mode.
  foreach (qw(NNTPSERVER NEWSHOST)) {
    ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_};
  }

  # Use passive FTP by default, see Net::FTP(3).
  $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
}

# Autoflush
$| = 1;

# Different options specified by the user
my $cmdline = ! ($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/);
my %Opts =
  ( Command_Line      => $cmdline,
    Quiet             => 0,
    Summary_Only      => 0,
    Verbose           => 0,
    Progress          => 0,
    HTML              => 0,
    Timeout           => 30,
    Redirects         => 1,
    Dir_Redirects     => 1,
    Accept_Language   => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE},
    No_Referer        => 0,
    Hide_Same_Realm   => 0,
    Depth             => 0,    # < 0 means unlimited recursion.
    Sleep_Time        => 1,
    Max_Documents     => 150,  # For the online version.
    User              => undef,
    Password          => undef,
    Base_Locations    => [],
    Exclude           => undef,
    Exclude_Docs      => [],
    Masquerade        => 0,
    Masquerade_From   => '',
    Masquerade_To     => '',
    Trusted           => $Cfg{Trusted},
    Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ?
                         $Cfg{Allow_Private_IPs} : $cmdline,
  );
undef $cmdline;

# Global variables
# What URI's did we process? (used for recursive mode)
my %processed;
# Result of the HTTP query
my %results;
# List of redirects
my %redirects;
# Count of the number of documents checked
my $doc_count = 0;

# Time stamp
my $timestamp = &get_timestamp();
# Per-document header; undefined if already printed.  See print_doc_header().
my $doc_header;

&parse_arguments() if $Opts{Command_Line};

# Precompile/error-check regular expressions.
if (defined($Opts{Exclude})) {
  eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; };
  &usage(1, "Error in exclude regexp: $@") if $@;
}
for my $i (0 .. $#{$Opts{Exclude_Docs}}) {
  eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; };
  &usage(1, "Error in exclude-docs regexp: $@") if $@;
}
if (defined($Opts{Trusted})) {
  eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; };
  &usage(1, "Error in trusted domains regexp: $@") if $@;
}

my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address
# @@@ make number of keep-alive connections customizable
$ua->conn_cache({ total_capacity => 1}); # 1 keep-alive connection
if ($ua->can('delay')) {
  $ua->delay($Opts{Sleep_Time}/60);
}
$ua->timeout($Opts{Timeout});
eval {
  $ua->allow_private_ips($Opts{Allow_Private_IPs});
};
if ($@) {
  die <<".EOF.";
Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and
Net::hostent modules:
$@
.EOF.
}

if ($Opts{Command_Line}) {

  require Text::Wrap;
  Text::Wrap->import('wrap');

  require URI::file;

  &usage(1) unless scalar(@ARGV);

  $Opts{_Self_URI} = 'http://validator.w3.org/checklink'; # For HTML output

  &ask_password() if ($Opts{User} && !$Opts{Password});

  if (!$Opts{Summary_Only}) {
    printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML};
  } else {
    $Opts{Verbose} = 0;
    $Opts{Progress} = 0;
  }

  # Populate data for print_form()
  my %params = (
                summary            => $Opts{Summary_Only},
                hide_redirects     => !$Opts{Redirects},
                hide_type          => $Opts{Dir_Redirects} ? 'dir' : 'all',
                no_accept_language => !(defined($Opts{Accept_Language}) &&
                                        $Opts{Accept_Language} eq 'auto'),
                no_referer         => $Opts{No_Referer},
                recursive          => ($Opts{Depth} != 0),
                depth              => $Opts{Depth},
               );

  my $check_num = 1;
  my @bases = @{$Opts{Base_Locations}};
  foreach my $uri (@ARGV) {
    # Reset base locations so that previous URI's given on the command line
    # won't affect the recursion scope for this URI (see check_uri())
    @{$Opts{Base_Locations}} = @bases;
    # Transform the parameter into a URI
    $uri = &urize($uri);
    $params{uri} = $uri;
    &check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1);
    $check_num++;
  }
  undef $check_num;

  if ($Opts{HTML}) {
    &html_footer();
  } elsif (($doc_count > 0) && !$Opts{Summary_Only}) {
    printf("\n%s\n", &global_stats());
  }

} else {

  require CGI;
  require CGI::Carp;
  CGI::Carp->import(qw(fatalsToBrowser));
  require CGI::Cookie;

  # file: URIs are not allowed in CGI mode
  my $forbidden = $ua->protocols_forbidden() || [];
  push(@$forbidden, 'file');
  $ua->protocols_forbidden($forbidden);

  my $query = new CGI;
  # Set a few parameters in CGI mode
  $Opts{Verbose}   = 0;
  $Opts{Progress}  = 0;
  $Opts{HTML}      = 1;
  $Opts{_Self_URI} = $query->url(-relative => 1);

  # Backwards compatibility
  my $uri = undef;
  if ($uri = $query->param('url')) {
    $query->param('uri', $uri) unless $query->param('uri');
    $query->delete('url');
  }
  $uri = $query->param('uri');

  if (! $uri) {
    &html_header('', 1); # Set cookie only from results page.
    my %cookies = CGI::Cookie->fetch();
    &print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1);
    &html_footer();
    exit;
  }

  # Backwards compatibility
  if ($query->param('hide_dir_redirects')) {
    $query->param('hide_redirects', 'on');
    $query->param('hide_type', 'dir');
    $query->delete('hide_dir_redirects');
  }

  $Opts{Summary_Only} = 1 if $query->param('summary');

  if ($query->param('hide_redirects')) {
    $Opts{Dir_Redirects} = 0;
    if (my $type = $query->param('hide_type')) {
      $Opts{Redirects} = 0 if ($type ne 'dir');
    } else {
      $Opts{Redirects} = 0;
    }
  }

  $Opts{Accept_Language} = undef if $query->param('no_accept_language');
  $Opts{No_Referer} = $query->param('no_referer');

  $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0);
  if (my $depth = $query->param('depth')) {
    # @@@ Ignore invalid depth silently for now.
    $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/);
  }

  # Save, clear or leave cookie as is.
  my $cookie = undef;
  if (my $action = $query->param('cookie')) {
    if ($action eq 'clear') {
      # Clear the cookie.
      $cookie = CGI::Cookie->new(-name => $PROGRAM);
      $cookie->value({ clear => 1 });
      $cookie->expires('-1M');
    } elsif ($action eq 'set') {
      # Set the options.
      $cookie = CGI::Cookie->new(-name => $PROGRAM);
      my %options = $query->Vars();
      delete($options{$_}) for qw(url uri check cookie); # Non-persistent.
      $cookie->value(\%options);
    }
  }
  if (!$cookie) {
    my %cookies = CGI::Cookie->fetch();
    $cookie = $cookies{$PROGRAM};
  }
  # Always refresh cookie expiration time.
  $cookie->expires('+1M') if ($cookie && !$cookie->expires());

  # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
  # If we're under mod_perl, there is a way around it...
  eval {
    local $SIG{__DIE__};
    my $auth = Apache2::RequestUtil->request()->headers_in()->{Authorization};
    $ENV{HTTP_AUTHORIZATION} = $auth if $auth;
  } if (MP2() && !$ENV{HTTP_AUTHORIZATION});

  $uri =~ s/^\s+//g;
  if ($uri !~ m/:/) {
    if ($uri =~ m|^//|) {
      $uri = 'http:'.$uri;
    } else {
      $uri = 'http://'.$uri;
    }
  }

  &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie);
  undef $query; # Not needed any more.
  &html_footer();
}

###############################################################################

################################
# Command line and usage stuff #
################################

sub parse_arguments ()
{

  require Getopt::Long;
  Getopt::Long->require_version(2.17);
  Getopt::Long->import('GetOptions');
  Getopt::Long::Configure('bundling', 'no_ignore_case');
  my $masq = '';
  my @locs = ();

  GetOptions('help|h|?'        => sub { usage(0) },
             'q|quiet'         => sub { $Opts{Quiet} = 1;
                                        $Opts{Summary_Only} = 1;
                                      },
             's|summary'       => \$Opts{Summary_Only},
             'b|broken'        => sub { $Opts{Redirects} = 0;
                                        $Opts{Dir_Redirects} = 0;
                                      },
             'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; },
             'v|verbose'       => \$Opts{Verbose},
             'i|indicator'     => \$Opts{Progress},
             'H|html'          => \$Opts{HTML},
             'r|recursive'     => sub { $Opts{Depth} = -1
                                          if $Opts{Depth} == 0; },
             'l|location=s'    => \@locs,
             'X|exclude=s'     => \$Opts{Exclude},
             'exclude-docs=s@' => \@{$Opts{Exclude_Docs}},
             'u|user=s'        => \$Opts{User},
             'p|password=s'    => \$Opts{Password},
             't|timeout=i'     => \$Opts{Timeout},
             'S|sleep=i'       => \$Opts{Sleep_Time},
             'L|languages=s'   => \$Opts{Accept_Language},
             'R|no-referer'    => \$Opts{No_Referer},
             'D|depth=i'       => sub { $Opts{Depth} = $_[1]
                                          unless $_[1] == 0; },
             'd|domain=s'      => \$Opts{Trusted},
             'masquerade=s'    => \$masq,
             'hide-same-realm' => \$Opts{Hide_Same_Realm},
             'V|version'       => \&version,
            )
    || usage(1);

  if ($masq) {
    $Opts{Masquerade} = 1;
    my @masq = split(/\s+/, $masq);
    if (scalar(@masq) != 2 ||
        !defined($masq[0]) || $masq[0] !~ /\S/ ||
        !defined($masq[1]) || $masq[1] !~ /\S/) {
      usage(1, "Error: --masquerade takes two whitespace separated URIs.");
    } else {
      $Opts{Masquerade_From} = $masq[0];
      $Opts{Masquerade_To}   = $masq[1];
    }
  }

  if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') {
    $Opts{Accept_Language} = &guess_language();
  }

  if (($Opts{Sleep_Time} || 0) < 1) {
    warn("*** Warning: minimum allowed sleep time is 1 second, resetting.\n");
    $Opts{Sleep_Time} = 1;
  }

  push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs);

  $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs);

  return;
}

sub version ()
{
  print "$PACKAGE $REVISION\n";
  exit 0;
}

sub usage ()
{
  my ($exitval, $msg) = @_;
  $exitval = 0 unless defined($exitval);
  $msg ||= ''; $msg =~ s/[\r\n]*$/\n\n/ if $msg;

  die($msg) unless $Opts{Command_Line};

  my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only';

  select(STDERR) if $exitval;
  print "$msg$PACKAGE $REVISION

Usage: checklink <options> <uris>
Options:
 -s, --summary              Result summary only.
 -b, --broken               Show only the broken links, not the redirects.
 -e, --directory            Hide directory redirects, for example
                            http://www.w3.org/TR -> http://www.w3.org/TR/
 -r, --recursive            Check the documents linked from the first one.
 -D, --depth N              Check the documents linked from the first one to
                            depth N (implies --recursive).
 -l, --location URI         Scope of the documents checked in recursive mode
                            (implies --recursive).  Can be specified multiple
                            times.  If not specified, the default eg. for
                            http://www.w3.org/TR/html4/Overview.html
                            would be http://www.w3.org/TR/html4/
 -X, --exclude REGEXP       Do not check links whose full, canonical URIs
                            match REGEXP; also limits recursion the same way
                            as --exclude-docs with the same regexp would.
 --exclude-docs REGEXP      In recursive mode, do not check links in documents
                            whose full, canonical URIs match REGEXP.  This
                            option may be specified multiple times.
 -L, --languages LANGS      Accept-Language header to send.  The special value
                            'auto' causes autodetection from the environment.
 -R, --no-referer           Do not send the Referer HTTP header.
 -q, --quiet                No output if no errors are found (implies -s).
 -v, --verbose              Verbose mode.
 -i, --indicator            Show percentage of lines processed while parsing.
 -u, --user USERNAME        Specify a username for authentication.
 -p, --password PASSWORD    Specify a password.
 --hide-same-realm          Hide 401's that are in the same realm as the
                            document checked.
 -S, --sleep SECS           Sleep SECS seconds between requests to each server
                            (default and minimum: 1 second).
 -t, --timeout SECS         Timeout for requests in seconds (default: 30).
 -d, --domain DOMAIN        Regular expression describing the domain to which
                            authentication information will be sent
                            (default: $trust).
 --masquerade \"BASE1 BASE2\" Masquerade base URI BASE1 as BASE2.  See the
                            manual page for more information.
 -H, --html                 HTML output.
 -?, -h, --help             Show this message and exit.
 -V, --version              Output version information and exit.

See \"perldoc LWP\" for information about proxy server support,
\"perldoc Net::FTP\" for information about various environment variables
affecting FTP connections and \"perldoc Net::NNTP\" for setting a default
NNTP server for news: URIs.

The W3C_CHECKLINK_CFG environment variable can be used to set the
configuration file to use.  See details in the full manual page, it can
be displayed with: perldoc checklink

More documentation at: $Cfg{Doc_URI}
Please send bug reports and comments to the www-validator mailing list:
  www-validator\@w3.org (with 'checklink' in the subject)
  Archives are at: http://lists.w3.org/Archives/Public/www-validator/
";
  exit $exitval;
}

sub ask_password ()
{
  eval {
    local $SIG{__DIE__};
    require Term::ReadKey;
    Term::ReadKey->require_version(2.00);
    Term::ReadKey->import(qw(ReadMode));
  };
  if ($@) {
    warn('Warning: Term::ReadKey 2.00 or newer not available, ' .
         "password input disabled.\n");
    return;
  }
  printf(STDERR 'Enter the password for user %s: ', $Opts{User});
  ReadMode('noecho',  *STDIN);
  chomp($Opts{Password} = <STDIN>);
  ReadMode('restore', *STDIN);
  print(STDERR "ok.\n");
  return;
}

###############################################################################

###########################################################################
# Guess an Accept-Language header based on the $LANG environment variable #
###########################################################################

sub guess_language ()
{
  my $lang = $ENV{LANG} or return;

  $lang =~ s/[\.@].*$//; # en_US.UTF-8, fi_FI@euro...

  return 'en' if ($lang eq 'C' || $lang eq 'POSIX');

  my $res = undef;
  eval {
    require Locale::Language;
    if (my $tmp = Locale::Language::language2code($lang)) {
      $lang = $tmp;
    }
    if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) {
      if (Locale::Language::code2language($l)) {
        $res = $l;
        if ($c) {
          require Locale::Country;
          $res .= "-$c" if Locale::Country::code2country($c);
        }
      }
    }
  };
  return $res;
}

###########################################
# Transform foo into file://localhost/foo #
###########################################

sub urize ($)
{
  my $u = URI->new_abs(URI::Escape::uri_unescape($_[0]), URI::file->cwd());
  return $u->as_string();
}

########################################
# Check for broken links in a resource #
########################################

sub check_uri (\%$$$$;$$)
{
  my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_;
  $is_start ||= ($check_num == 1);

  if ($Opts{HTML}) {
    &html_header($uri, 0, $cookie) if ($check_num == 1);
    &print_form($params, $cookie, $check_num) if $is_start;
  }

  my $start = $Opts{Summary_Only} ? 0 : &get_timestamp();

  # Get and parse the document
  my $response = &get_document('GET', $uri, $doc_count, \%redirects, $referer);

  # Can we check the resource? If not, we exit here...
  return if defined($response->{Stop});

  if ($is_start) { # Starting point of a new check, eg. from the command line
    # Use the first URI as the recursion base unless specified otherwise.
    push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical())
      unless @{$Opts{Base_Locations}};
  } else {
    # Before fetching the document, we don't know if we'll be within the
    # recursion scope or not (think redirects).
    if (!&in_recursion_scope($response->{absolute_uri})) {
      hprintf("Not in recursion scope: %s\n")
        if ($Opts{Verbose});
      $response->content("");
      return;
    }
  }

  # Define the document header, and perhaps print it.
  # (It might still be defined if the previous document had no errors;
  # just redefine it in that case.)

  if ($check_num != 1) {
    if ($Opts{HTML}) {
      $doc_header = "\n<hr />\n";
    } else {
      $doc_header = "\n" . ('-' x 40) . "\n";
    }
  }

  my $absolute_uri = $response->{absolute_uri}->as_string();

  if ($Opts{HTML}) {
    $doc_header .= ("<h2>\nProcessing\t"
                    . &show_url($absolute_uri)
                    . "\n</h2>\n\n");
  } else {
    $doc_header .= "\nProcessing\t$absolute_uri\n\n";
  }

  if (! $Opts{Quiet}) {
    print_doc_header();
  }

  # We are checking a new document
  $doc_count++;

  my $result_anchor = 'results'.$doc_count;

  if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) {
    my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
    my $acclang = $Opts{Accept_Language} || '(not sent)';
    my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
    printf(<<'EOF', $Accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s);

Settings used:
- Accept: %s
- Accept-Language: %s
- Referer: %s
- Sleeping %d second%s between requests to each server
EOF
    printf("- Excluding links matching %s\n", $Opts{Exclude})
      if defined($Opts{Exclude});
    printf("- Excluding links in documents whose URIs match %s\n",
      join(', ', @{$Opts{Exclude_Docs}})) if @{$Opts{Exclude_Docs}};
  }

  if ($Opts{HTML}) {
    if (! $Opts{Summary_Only}) {
      my $accept = &encode($Accept);
      my $acclang = &encode($Opts{Accept_Language} || '(not sent)');
      my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
      my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
      printf(<<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s);
<div class="settings">
Settings used:
 <ul>
  <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.1">Accept</a></tt>: %s</li>
  <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4">Accept-Language</a></tt>: %s</li>
  <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36">Referer</a></tt>: %s</li>
  <li>Sleeping %d second%s between requests to each server</li>
 </ul>
</div>
EOF
      printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n",
             $result_anchor);
      my $esc_uri = URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9.");
      printf("<p>For reliable link checking results, check
<a href=\"%s\">HTML validity</a> first.  See also
<a href=\"%s\">CSS validity</a>.</p>
<p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n",
             &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)),
             &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)),
             &encode($Opts{_Self_URI}));

      printf ('<h3 class="status_progress" id="status_progress%s">Status: <span></span></h3>', $result_anchor);
      printf("<pre class=\"progress\" id=\"pre_progress%s\">\n", $result_anchor);
    }
  }

  if ($Opts{Summary_Only} && !$Opts{Quiet}) {
    print '<p>' if $Opts{HTML};
    print 'This may take some time';
    print "... (<a href=\"$Cfg{Doc_URI}#wait\">why?</a>)</p>" if $Opts{HTML};
    print " if the document has many links to check.\n" unless $Opts{HTML};
  }
  # Record that we have processed this resource
  $processed{$absolute_uri} = 1;
  # Parse the document
  my $p = &parse_document($uri, $absolute_uri, $response, 1, ($depth != 0));
  my $base = URI->new($p->{base});

  # Check anchors
  ###############

  print "Checking anchors...\n" unless $Opts{Summary_Only};

  my %errors;
  while (my ($anchor, $lines) = each(%{$p->{Anchors}})) {
    if (!length($anchor)) {
      # Empty IDREF's are not allowed
      $errors{$anchor} = 1;
    } else {
      my $times = 0;
      $times += $_ for values(%$lines);
      # They should appear only once
      $errors{$anchor} = 1 if ($times > 1);
    }
  }
  print " done.\n" unless $Opts{Summary_Only};

  # Check links
  #############

  &hprintf("Recording all the links found: %d\n", scalar (keys %{$p->{Links}}))
    if ($Opts{Verbose});
  my %links;
  # Record all the links found
  while (my ($link, $lines) = each(%{$p->{Links}})) {
    my $link_uri = URI->new($link);
    my $abs_link_uri = URI->new_abs($link_uri, $base);

    if ($Opts{Masquerade}) {
      if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) {
        print_doc_header();
        printf("processing %s in base %s\n",
               $abs_link_uri, $Opts{Masquerade_To});
        my $nlink = $abs_link_uri;
        $nlink =~
          s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|;
        $abs_link_uri = URI->new($nlink);
      }
    }

    my $canon_uri = URI->new($abs_link_uri->canonical());
    my $fragment = $canon_uri->fragment(undef);
    if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) {
      foreach my $line_num (keys(%$lines)) {
        if (!defined($fragment) || !length($fragment)) {
          # Document without fragment
          $links{$canon_uri}{location}{$line_num} = 1;
        } else {
          # Resource with a fragment
          $links{$canon_uri}{fragments}{$fragment}{$line_num} = 1;
        }
      }
    }
  }

  # Build the list of broken URI's
  &hprintf("Checking %d links to build list of broken URI's\n", scalar (keys %links))
    if ($Opts{Verbose});
  my %broken;
  while (my ($u, $ulinks) = each(%links)) {

    if ($Opts{Summary_Only}) {
      # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896
      print ' ' if ($Opts{HTML} && !$Opts{Command_Line});
    } else {
      &hprintf("\nChecking link %s\n", $u);
      printf('<script type="text/javascript">show_progress("Checking link %s", "status_progress%s", "pre_progress%s");</script>' , &encode($u), $result_anchor, $result_anchor)
        if (!$Opts{Command_Line} && $Opts{HTML} && !$Opts{Summary_Only});
    }

    # Check that a link is valid
    &check_validity($uri, $u,
                    ($depth != 0 && &in_recursion_scope($u)),
                    \%links, \%redirects);
    &hprintf("\tReturn code: %s\n", $results{$u}{location}{code})
      if ($Opts{Verbose});
    if ($results{$u}{location}{success}) {

      # Even though it was not broken, we might want to display it
      # on the results page (e.g. because it required authentication)
      $broken{$u}{location} = 1 if ($results{$u}{location}{display} >= 400);

      # List the broken fragments
      while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) {
        if ($Opts{Verbose}) {
          my @line_nums = sort { $a <=> $b } keys(%$lines);
          &hprintf("\t\t%s %s - Line%s: %s\n",
                   $fragment,
                   ($results{$u}{fragments}{$fragment}) ? 'OK' : 'Not found',
                   (scalar(@line_nums) > 1) ? 's' : '',
                   join(', ', @line_nums)
                  );
        }
        # A broken fragment?
        if ($results{$u}{fragments}{$fragment} == 0) {
          $broken{$u}{fragments}{$fragment} += 2;
        }
      }
    } elsif (!($Opts{Quiet} && &informational($results{$u}{location}{code}))) {
      # Couldn't find the document
      $broken{$u}{location} = 1;
      # All the fragments associated are hence broken
      foreach my $fragment (keys %{$ulinks->{fragments}}) {
        $broken{$u}{fragments}{$fragment}++;
      }
    }
  }
  &hprintf("\nProcessed in %ss.\n", &time_diff($start, &get_timestamp()))
   unless $Opts{Summary_Only};
   printf('<script type="text/javascript">show_progress("Done. Document processed in %ss.\n", "status_progress%s", "pre_progress%s");</script>' , &time_diff($start, &get_timestamp()), $result_anchor, $result_anchor)
   if ($Opts{HTML} && !$Opts{Summary_Only});

  # Display results
  if ($Opts{HTML} && !$Opts{Summary_Only}) {
    print("</pre>\n");
    printf("<h2><a name=\"%s\">Results</a></h2>\n", $result_anchor);
  }
  print "\n" unless $Opts{Quiet};

  &links_summary(\%links, \%results, \%broken, \%redirects);
  &anchors_summary($p->{Anchors}, \%errors);

  # Do we want to process other documents?
  if ($depth != 0) {

    foreach my $u (keys %links) {

      next unless $results{$u}{location}{success};  # Broken link?

      next unless &in_recursion_scope($u);

      # Do we understand its content type?
      next unless ($results{$u}{location}{type} =~ $ContentTypes);

      # Have we already processed this URI?
      next if &already_processed($u, $uri);

      # Do the job
      print "\n" unless $Opts{Quiet};
      if ($Opts{HTML}) {
        if (!$Opts{Command_Line}) {
          if ($doc_count == $Opts{Max_Documents}) {
            print("<hr />\n<p><strong>Maximum number of documents ($Opts{Max_Documents}) reached!</strong></p>\n");
          }
          if ($doc_count >= $Opts{Max_Documents}) {
            $doc_count++;
            print("<p>Not checking <strong>$u</strong></p>\n");
            $processed{$u} = 1;
            next;
          }
        }
      }

      # This is an inherently recursive algorithm, so Perl's warning is not
      # helpful.  You may wish to comment this out when debugging, though.
      no warnings 'recursion';

      if ($depth < 0) {
        &check_uri($params, $u, 0, -1, $cookie, $uri);
      } else {
        &check_uri($params, $u, 0, $depth-1, $cookie, $uri);
      }
    }
  }
  return;
}

#######################################
# Get and parse a resource to process #
#######################################

sub get_document ($$$;\%$)
{
  my ($method, $uri, $in_recursion, $redirects, $referer) = @_;
  # $method contains the HTTP method the use (GET or HEAD)
  # $uri contains the identifier of the resource
  # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least
  #                        the second resource checked)
  # $redirects is a pointer to the hash containing the map of the redirects
  # $referer is the URI of the referring document

  # Get the resource
  my $response;
  if (defined($results{$uri}{response})
      && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
    $response = $results{$uri}{response};
  } else {
    $response = &get_uri($method, $uri, $referer);
    &record_results($uri, $method, $response);
    &record_redirects($redirects, $response);
  }
  if (! $response->is_success()) {
    if (! $in_recursion) {
      # Is it too late to request authentication?
      if ($response->code() == 401) {
        &authentication($response);
      } else {
        # TODO: style this message to make it stand out
        print "<p>" if $Opts{HTML};
        &hprintf("\nError: %d %s\n",
                 $response->code(), $response->message() || '(no message)');
        print "</p>\n" if $Opts{HTML};
      }
    }
    $response->{Stop} = 1;
    $response->content("");
    return($response);
  }

  # What is the URI of the resource that we are processing by the way?
  my $base_uri = $response->base();
  my $request_uri = URI->new($response->request->url);
  $response->{absolute_uri} = $request_uri->abs($base_uri);

  # Can we parse the document?
  my $failed_reason;
  my $ct = $response->header('Content-Type');
  if (!$ct || $ct !~ $ContentTypes) {
    $failed_reason = "Content-Type for <$request_uri> is " .
      (defined($ct) ? "'$ct'" : 'undefined');
  } else {
    # Pre-decode Content-Encoding.
    # @@@TODO: maybe also decode charsets?
    my $docref = $response->decoded_content(ref => 1, charset => 'none');
    if (defined($docref)) {
      $response->content_ref($docref);
      # Remove Content-Encoding so it won't be decoded again later.
      $response->remove_header('Content-Encoding')
    } else {
      my $ce = $response->header('Content-Encoding');
      $ce = defined($ce) ? "'$ce'" : 'undefined';
      $ct = defined($ct) ? "'$ct'" : 'undefined';
      $failed_reason = "Error decoding document at <$request_uri>, Content-Type $ct, Content-Encoding $ce: '$@'";
    }
  }
  if ($failed_reason) {
    # No, there is a problem...
    if (! $in_recursion) {
      # TODO: style this message to make it stand out
      print "<p>" if $Opts{HTML};
      &hprintf("Can't check links: %s.\n", $failed_reason);
      print "</p>\n" if $Opts{HTML};
    }
    $response->{Stop} = 1;
    $response->content("");
  }

  # Ok, return the information
  return($response);
}

#########################################################
# Check whether a URI is within the scope of recursion. #
#########################################################

sub in_recursion_scope ($)
{
  my ($uri) = @_;
  return 0 unless $uri;

  my $candidate = URI->new($uri)->canonical();

  return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude});

  for my $excluded_doc (@{$Opts{Exclude_Docs}}) {
    return 0 if ($candidate =~ $excluded_doc);
  }

  foreach my $base (@{$Opts{Base_Locations}}) {
    my $rel = $candidate->rel($base);
    next if ($candidate eq $rel);   # Relative path not possible?
    next if ($rel =~ m|^(\.\.)?/|); # Relative path upwards?
    return 1;
  }

  return 0; # We always have at least one base location, but none matched.
}

##################################################
# Check whether a URI has already been processed #
##################################################

sub already_processed ($$)
{
  my ($uri, $referer) = @_;
  # Don't be verbose for that part...
  my $summary_value = $Opts{Summary_Only};
  $Opts{Summary_Only} = 1;
  # Do a GET: if it fails, we stop, if not, the results are cached
  my $response = &get_document('GET', $uri, 1, undef, $referer);
  # ... but just for that part
  $Opts{Summary_Only} = $summary_value;
  # Can we process the resource?
  return -1 if defined($response->{Stop});
  # Have we already processed it?
  return  1 if defined($processed{$response->{absolute_uri}->as_string()});
  # It's not processed yet and it is processable: return 0
  return  0;
}

############################
# Get the content of a URI #
############################

sub get_uri ($$;$$\%$$$$)
{
  # Here we have a lot of extra parameters in order not to lose information
  # if the function is called several times (401's)
  my ($method, $uri, $referer, $start, $redirects, $code, $realm, $message,
      $auth) = @_;

  # $method contains the method used
  # $uri contains the target of the request
  # $referer is the URI of the referring document
  # $start is a timestamp (not defined the first time the function is called)
  # $redirects is a map of redirects
  # $code is the first HTTP return code
  # $realm is the realm of the request
  # $message is the HTTP message received
  # $auth equals 1 if we want to send out authentication information

  # For timing purposes
  $start = &get_timestamp() unless defined($start);

  # Prepare the query

  # Do we want printouts of progress?
  my $verbose_progress =
    ! ($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}));

  &hprintf("%s %s ", $method, $uri) if $verbose_progress;

  my $request = new HTTP::Request($method, $uri);

  $request->header('Accept-Language' => $Opts{Accept_Language})
    if $Opts{Accept_Language};
  $request->header('Accept', $Accept);
  # accept_decodable() was added in LWP 5.814
  $request->accept_decodable() if $request->can('accept_decodable');

  # Are we providing authentication info?
  if ($auth && $request->url()->host() =~ $Opts{Trusted}) {
    if (defined($ENV{HTTP_AUTHORIZATION})) {
      $request->header(Authorization => $ENV{HTTP_AUTHORIZATION});
    } elsif (defined($Opts{User}) && defined($Opts{Password})) {
      $request->authorization_basic($Opts{User}, $Opts{Password});
    }
  }

  # Tell the user agent if we want progress reports for redirects or not.
  $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); })
    if $verbose_progress;

  # Set referer
  $request->referer($referer) if (!$Opts{No_Referer} && $referer);

  # Telling caches in the middle we want a fresh copy (Bug 4998)
  $request->header(Cache_Control => "max-age=0");

  # Do the query
  my $response = $ua->request($request);

  # Get the results
  # Record the very first response
  if (! defined($code)) {
    ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)});
  }
  # Authentication requested?
  if ($response->code() == 401 &&
      !defined($auth) &&
      (defined($ENV{HTTP_AUTHORIZATION})
       || (defined($Opts{User}) && defined($Opts{Password})))) {

    # Set host as trusted domain unless we already have one.
    if (!$Opts{Trusted}) {
      my $re = sprintf('^%s$', quotemeta($response->base()->host()));
      $Opts{Trusted} = qr/$re/io;
    }

    # Deal with authentication and avoid loops
    if (!defined($realm) &&
        $response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) {
      $realm = $1;
    }

    print "\n" if $verbose_progress;
    return &get_uri($method, $response->request()->url(), $referer,
                    $start, $redirects, $code, $realm, $message, 1);
  }
  # @@@ subtract robot delay from the "fetched in" time?
  &hprintf(" fetched in %ss\n",
           &time_diff($start, &get_timestamp())) if $verbose_progress;

  $response->{Realm} = $realm if defined($realm);

  return $response;
}

#########################################
# Record the results of an HTTP request #
#########################################

sub record_results ($$$)
{
  my ($uri, $method, $response) = @_;
  $results{$uri}{response} = $response;
  $results{$uri}{method} = $method;
  $results{$uri}{location}{code} = $response->code();
  $results{$uri}{location}{code} = RC_ROBOTS_TXT()
    if ($results{$uri}{location}{code} == 403 &&
        $response->message() =~ /Forbidden by robots\.txt/);
  $results{$uri}{location}{code} = RC_IP_DISALLOWED()
    if ($results{$uri}{location}{code} == 403 &&
        $response->message() =~ /non-public IP/);
  $results{$uri}{location}{code} = RC_DNS_ERROR()
    if ($results{$uri}{location}{code} == 500 &&
        $response->message() =~ /Bad hostname '[^\']*'/);
  $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED()
    if ($results{$uri}{location}{code} == 500 &&
        $response->message() =~ /Access to '[^\']*' URIs has been disabled/);
  $results{$uri}{location}{type} = $response->header('Content-type');
  $results{$uri}{location}{display} = $results{$uri}{location}{code};
  # Rewind, check for the original code and message.
  for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) {
    $results{$uri}{location}{orig} = $tmp->code();
    $results{$uri}{location}{orig_message} = $tmp->message() || '(no message)';
  }
  $results{$uri}{location}{success} = $response->is_success();
  # Stores the authentication information
  if (defined($response->{Realm})) {
    $results{$uri}{location}{realm} = $response->{Realm};
    $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm};
  }
  # What type of broken link is it? (stored in {record} - the {display}
  #              information is just for visual use only)
  if (($results{$uri}{location}{display} == 401)
      && ($results{$uri}{location}{code} == 404)) {
    $results{$uri}{location}{record} = 404;
  } else {
    $results{$uri}{location}{record} = $results{$uri}{location}{display};
  }
  # Did it fail?
  $results{$uri}{location}{message} = $response->message() || '(no message)';
  if (! $results{$uri}{location}{success}) {
    &hprintf("Error: %d %s\n",
             $results{$uri}{location}{code},
             $results{$uri}{location}{message})
      if ($Opts{Verbose});
  }
  return;
}

####################
# Parse a document #
####################

sub parse_document ($$$$$)
{
  my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_;

  print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n")
    if $Opts{Verbose};

  my $p;

  if (defined($results{$uri}{parsing})) {
    # We have already done the job. Woohoo!
    $p->{base} = $results{$uri}{parsing}{base};
    $p->{Anchors} = $results{$uri}{parsing}{Anchors};
    $p->{Links} = $results{$uri}{parsing}{Links};
    return $p;
  }

  my $start;
  $p = W3C::LinkChecker->new();
  $p->{base} = $base_uri;
  if (! $Opts{Summary_Only}) {
    $start = &get_timestamp();
    print("Parsing...\n");
  }

  # Content-Encoding etc already decoded in get_document().
  my $docref = $response->content_ref();

  # Count lines beforehand if needed for progress indicator.  In all cases,
  # the actual final number of lines processed shown is populated by our
  # end_document handler.
  $p->{Total} = ($$docref =~ tr/\n//) if $Opts{Progress};

  # We only look for anchors if we are not interested in the links
  # obviously, or if we are running a recursive checking because we
  # might need this information later
  $p->{only_anchors} = !($links || $rec_needs_links);

  # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
  # Processing instructions are not parsed by process, but in this case
  # it should be. It's expensive, it's horrible, but it's the easiest way
  # for right now.
  $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors};

  $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);

  $p->parse($$docref)->eof();
  $response->content("");

  if (! $Opts{Summary_Only}) {
    my $stop = &get_timestamp();
    print "\r" if $Opts{Progress};
    &hprintf(" done (%d lines in %ss).\n",
             $p->{Total}, &time_diff($start, $stop));
  }

  # Save the results before exiting
  $results{$uri}{parsing}{base} = $p->{base};
  $results{$uri}{parsing}{Anchors} = $p->{Anchors};
  $results{$uri}{parsing}{Links} = $p->{Links};

  return $p;
}

####################################
# Constructor for W3C::LinkChecker #
####################################

sub new
{
  my $p = HTML::Parser::new(@_, api_version => 3);
  eval { local $SIG{__DIE__}; $p->utf8_mode(1); };

  # Start tags
  $p->handler(start => 'start', 'self, tagname, attr, text, line');
  # Declarations
  $p->handler(declaration =>
              sub {
                my $self = shift;
                $self->declaration(substr($_[0], 2, -1));
              }, 'self, text, line');
  # Other stuff
  if ($Opts{Progress}) {
    $p->handler(default => 'parse_progress', 'self, line');
    $p->{last_percentage} = 0;
  }
  $p->handler(end_document => 'end_document', 'self, line');
  # Check <a [..] name="...">?
  $p->{check_name} = 1;
  # Check <[..] id="..">?
  $p->{check_id} = 1;
  # Don't interpret comment loosely
  $p->strict_comment(1);

  return $p;
}

#################################################
# Record or return  the doctype of the document #
#################################################

sub doctype
{
  my ($self, $dc) = @_;
  return $self->{doctype} unless $dc;
  $_ = $self->{doctype} = $dc;

  # What to look for depending on the doctype

  # Check for <a name="...">?
  $self->{check_name} = 0
    if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %;

  # Check for <* id="...">?
  $self->{check_id} = 0
    if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%);

  # Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...)
  $self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%);

  return;
}

###################################
# Print parse progress indication #
###################################

sub parse_progress
{
  my ($self, $line) = @_;
  return unless defined($line) && $line > 0 && $self->{Total} > 0;

  my $percentage = int($line/$self->{Total}*100);
  if ($percentage != $self->{last_percentage}) {
    printf("\r%4d%%", $percentage);
    $self->{last_percentage} = $percentage;
  }

  return;
}

#############################
# Extraction of the anchors #
#############################

sub get_anchor
{
  my ($self, $tag, $attr) = @_;

  my $anchor = $self->{check_id} ? $attr->{id} : undef;
  if ($self->{check_name} && ($tag eq 'a')) {
    # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
    # Force an error if it's not the case (or if id's and name's values
    #                                      are different)
    # If id is defined, name if defined must have the same value
    $anchor ||= $attr->{name};
  }

  return $anchor;
}

#############################
# W3C::LinkChecker handlers #
#############################

sub add_link
{
  my ($self, $uri, $base, $line) = @_;
  if (defined($uri)) {
    # Remove repeated slashes after the . or .. in relative links, to avoid
    # duplicated checking or infinite recursion.
    $uri =~ s|^(\.\.?/)/+|$1|o;
    $uri = URI->new_abs($uri, $base) if defined($base);
    $self->{Links}{$uri}{$line}++;
  }
  return;
}

sub start
{
  my ($self, $tag, $attr, $text, $line) = @_;
  $line = LINE_UNKNOWN() unless defined($line);

  # Anchors
  my $anchor = $self->get_anchor($tag, $attr);
  $self->{Anchors}{$anchor}{$line}++ if defined($anchor);

  # Links
  if (!$self->{only_anchors}) {

    my $tag_local_base = undef;

    # Special case: base/@href
    # TODO: This should go away as soon as LWP::Protocol::collect() invokes
    #       HTML::HeadParser (thus taking care of it in $response->base()
    #       transparently) for application/xhtml+xml and
    #       application/vnd.wap.xhtml+xml documents
    #       --> it does in LWP >= 5.810
    if ($tag eq 'base') {
      # Treat <base> (without href) or <base href=""> as if it didn't exist.
      if (defined($attr->{href}) && length($attr->{href})) {
        $self->{base} = $attr->{href};
      }
      # Note: base/@href intentionally not treated as a dereferenceable link:
      # http://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi
    }
    # Special case: meta[@http-equiv=Refresh]/@content
    elsif ($tag eq 'meta') {
      if ($attr->{'http-equiv'} && lc($attr->{'http-equiv'}) eq 'refresh') {
        my $content = $attr->{content};
        if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) {
          $self->add_link($1, undef, $line);
        }
      }
    }
    # Special case: tags that have "local base"
    elsif ($tag eq 'applet' || $tag eq 'object') {
      if (my $codebase = $attr->{codebase}) {
        # TODO: HTML 4 spec says applet/@codebase may only point to subdirs of
        # the directory containing the current document.  Should we do
        # something about that?
        $tag_local_base = URI->new_abs($codebase, $self->{base});
      }
    }

    # Link attributes:
    if (my $link_attrs = LINK_ATTRS()->{$tag}) {
      for my $la (@$link_attrs) {
        $self->add_link($attr->{$la}, $tag_local_base, $line);
      }
    }

    # List of links attributes:
    if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) {
      for my $la (@$link_attrs) {
        if (defined(my $value = $attr->{$la})) {
          for my $link (split(/\s+/, $value)) {
            $self->add_link($link, $tag_local_base, $line);
          }
        }
      }
    }
  }

  $self->parse_progress($line) if $Opts{Progress};

  return;
}

sub declaration
{
  my ($self, $text, $line) = @_;
  $line = LINE_UNKNOWN() unless defined($line);

  # Extract the doctype
  my @declaration = split(/\s+/, $text, 4);
  if (($#declaration >= 3) &&
      ($declaration[0] eq 'DOCTYPE') &&
      (lc($declaration[1]) eq 'html')) {
    # Parse the doctype declaration
    if ($text =~ m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i) {
      # Store the doctype
      $self->doctype($1) if $1;
      # If there is a link to the DTD, record it
      $self->add_link($3, undef, $line) if (!$self->{only_anchors} && $3);
    }
  }

  $self->text($text) unless $self->{only_anchors};

  return;
}

sub end_document
{
  my ($self, $line) = @_;
  $self->{Total} = $line;
  return;
}

################################
# Check the validity of a link #
################################

sub check_validity ($$$\%\%)
{
  my ($referer, $uri, $want_links, $links, $redirects) = @_;
  # $referer is the URI of the document checked
  # $uri is the URI of the target that we are verifying
  # $want_links is true if we're interested in links in the target doc
  # $links is a hash of the links in the documents checked
  # $redirects is a map of the redirects encountered

  # Get the document with the appropriate method
  # Only use GET if there are fragments. HEAD is enough if it's not the
  # case.
  my @fragments = keys %{$links->{$uri}{fragments}};
  my $method = scalar(@fragments) ? 'GET' : 'HEAD';

  my $response;
  my $being_processed = 0;
  if ((! defined($results{$uri}))
      || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
    $being_processed = 1;
    $response = &get_uri($method, $uri, $referer);
    # Get the information back from get_uri()
    &record_results($uri, $method, $response);
    # Record the redirects
    &record_redirects($redirects, $response);
  }

  # We got the response of the HTTP request. Stop here if it was a HEAD.
  return if ($method eq 'HEAD');

  # There are fragments. Parse the document.
  my $p;
  if ($being_processed) {
    # Can we really parse the document?
    if (!defined($results{$uri}{location}{type}) ||
        $results{$uri}{location}{type} !~ $ContentTypes)
    {
      &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
               $uri, $results{$uri}{location}{type})
        if ($Opts{Verbose});
      $response->content("");
      return;
    }
    # Do it then
    $p = &parse_document($uri, $response->base(), $response, 0, $want_links);
  } else {
    # We already had the information
    $p->{Anchors} = $results{$uri}{parsing}{Anchors};
  }
  # Check that the fragments exist
  foreach my $fragment (keys %{$links->{$uri}{fragments}}) {
    if (defined($p->{Anchors}{$fragment})
        || &escape_match($fragment, $p->{Anchors})) {
      $results{$uri}{fragments}{$fragment} = 1;
    } else {
      $results{$uri}{fragments}{$fragment} = 0;
    }
  }
  return;
}

sub escape_match ($\%)
{
  my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
  foreach my $b (keys %$hash) {
    return 1 if ($a eq URI::Escape::uri_unescape($b));
  }
  return 0;
}

##########################
# Ask for authentication #
##########################

sub authentication ($)
{
  my ($response) = @_;

  my $realm = '';
  if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) {
    $realm = $1;
  }

  if ($Opts{Command_Line}) {
    printf STDERR <<EOF, $response->request()->url(), $realm;

Authentication is required for %s.
The realm is "%s".
Use the -u and -p options to specify a username and password and the -d option
to specify trusted domains.
EOF
  } else {

    printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\nConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n", $response->www_authenticate());

    printf("%s
<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
<head>
<title>W3C Link Checker: 401 Authorization Required</title>
%s</head>
<body>", $DocType, $Head);
    &banner(': 401 Authorization Required');
    printf("<p>
  You need \"%s\" access to <a href=\"%s\">%s</a> to perform link checking.<br />
", &encode($realm), (&encode($response->request()->url())) x 2);

    if ($Opts{Trusted}) {
      printf <<EOF, &encode($Opts{Trusted});
  This service has been configured to send authentication only to hostnames
  matching the regular expression <code>%s</code>
EOF
    }

    print "</p>\n";
  }
  return;
}

##################
# Get statistics #
##################

sub get_timestamp ()
{
  return pack('LL', Time::HiRes::gettimeofday());
}

sub time_diff ($$)
{
  my @start = unpack('LL', $_[0]);
  my @stop = unpack('LL', $_[1]);
  for ($start[1], $stop[1]) {
    $_ /= 1_000_000;
  }
  return(sprintf("%.2f", ($stop[0]+$stop[1])-($start[0]+$start[1])));
}

########################
# Handle the redirects #
########################

# Record the redirects in a hash
sub record_redirects (\%$)
{
  my ($redirects, $response) = @_;
  for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) {
    $redirects->{$prev->request()->url()} = $response->request()->url();
  }
  return;
}

# Determine if a request is redirected
sub is_redirected ($%)
{
  my ($uri, %redirects) = @_;
  return(defined($redirects{$uri}));
}

# Get a list of redirects for a URI
sub get_redirects ($%)
{
  my ($uri, %redirects) = @_;
  my @history = ($uri);
  my %seen = ($uri => 1); # for tracking redirect loops
  my $loop = 0;
  while ($redirects{$uri}) {
    $uri = $redirects{$uri};
    push(@history, $uri);
    if ($seen{$uri}) {
      $loop = 1;
      last;
    } else {
      $seen{$uri}++;
    }
  }
  return ($loop, @history);
}

####################################################
# Tool for sorting the unique elements of an array #
####################################################

sub sort_unique (@)
{
  my %saw;
  @saw{@_} = ();
  return (sort { $a <=> $b } keys %saw);
}

#####################
# Print the results #
#####################

sub line_number ($)
{
  my $line = shift;
  return $line if ($line >= 0);
  return "(N/A)";
}

sub http_rc ($)
{
  my $rc = shift;
  return $rc if ($rc >= 0);
  return "(N/A)";
}

# returns true if the given code is informational
sub informational ($)
{
  my $rc = shift;
  return $rc == RC_ROBOTS_TXT() || $rc == RC_IP_DISALLOWED() ||
    $rc == RC_PROTOCOL_DISALLOWED();
}

sub anchors_summary (\%\%)
{
  my ($anchors, $errors) = @_;

  # Number of anchors found.
  my $n = scalar(keys(%$anchors));
  if (! $Opts{Quiet}) {
    if ($Opts{HTML}) {
      print("<h3>Anchors</h3>\n<p>");
    } else {
      print("Anchors\n\n");
    }
    &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's');
    print("</p>\n") if $Opts{HTML};
  }
  # List of the duplicates, if any.
  my @errors = keys %{$errors};
  if (! scalar(@errors)) {
    print("<p>Valid anchors!</p>\n") if (! $Opts{Quiet} && $Opts{HTML} && $n);
    return;
  }
  undef $n;

  print_doc_header();
  print('<p>') if $Opts{HTML};
  print('List of duplicate and empty anchors');
  print <<EOF if $Opts{HTML};
</p>
<table class="report" border="1" summary="List of duplicate and empty anchors.">
<thead>
<tr>
<th>Anchor</th>
<th>Lines</th>
</tr>
</thead>
<tbody>
EOF
  print("\n");

  foreach my $anchor (@errors) {
    my $format;
    my @unique = &sort_unique(map { line_number($_) }
                              keys %{$anchors->{$anchor}});
    if ($Opts{HTML}) {
      $format = "<tr><td class=\"broken\">%s</td><td>%s</td></tr>\n";
    } else {
      my $s = (scalar(@unique) > 1) ? 's' : '';
      $format = "\t%s\tLine$s: %s\n";
    }
    printf($format,
           &encode(length($anchor) ? $anchor : 'Empty anchor'),
           join(', ', @unique));
  }

  print("</tbody>\n</table>\n") if $Opts{HTML};

  return;
}

sub show_link_report (\%\%\%\%\@;$\%)
{
  my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;

  print("\n<dl class=\"report\">") if $Opts{HTML};
  print("\n") if (! $Opts{Quiet});

  # Process each URL
  my ($c, $previous_c);
  foreach my $u (@$urls) {
    my @fragments = keys %{$broken->{$u}{fragments}};
    # Did we get a redirect?
    my $redirected = &is_redirected($u, %$redirects);
    # List of lines
    my @total_lines;
    push(@total_lines, keys(%{$links->{$u}{location}}));
    foreach my $f (@fragments) {
      push(@total_lines, keys(%{$links->{$u}{fragments}{$f}}))
        unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()}));
    }

    my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects);
    my $currloc = $results->{$u}{location};

    # Error type
    $c = &code_shown($u, $results);
    # What to do
    my $whattodo;
    my $redirect_too;
    if ($todo) {
      if ($u =~ m/^javascript:/) {
        if ($Opts{HTML}) {
          $whattodo =
'You must change this link: people using a browser without JavaScript support
will <em>not</em> be able to follow this link. See the
<a href="http://www.w3.org/TR/WAI-WEBCONTENT/#tech-scripts">Web Content
Accessibility Guidelines on the use of scripting on the Web</a> and the
<a href="http://www.w3.org/TR/WCAG10-HTML-TECHS/#directly-accessible-scripts">techniques
on how to solve this</a>.';
        } else {
          $whattodo = 'Change this link: people using a browser without JavaScript support will not be able to follow this link.';
        }
      } elsif ($c == RC_ROBOTS_TXT()) {
        $whattodo = 'The link was not checked due to robots exclusion ' .
          'rules. Check the link manually.';
      } elsif ($redirect_loop) {
        $whattodo =
          'Retrieving the URI results in a redirect loop, that should be ' .
          'fixed.  Examine the redirect sequence to see where the loop ' .
          'occurs.';
      } else {
        $whattodo = $todo->{$c};
      }
    } elsif (defined($redirects{$u})) {
      # Redirects
      if (($u.'/') eq $redirects{$u}) {
        $whattodo = 'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.';
      }
      elsif  (($c eq 307) || ($c eq 302)) {
        $whattodo = 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.';
      }
      elsif ($c eq 301) {
        $whattodo = 'This is a permanent redirect. The link should be updated.';
      }
    }

    my @unique = &sort_unique(map { line_number($_) } @total_lines);
    my $lines_list = join(', ', @unique);
    my $s = (scalar(@unique) > 1) ? 's' : '';
    undef @unique;

    my @http_codes = ($currloc->{code});
    unshift(@http_codes, $currloc->{orig}) if $currloc->{orig};
    @http_codes = map { http_rc($_) } @http_codes;

    if ($Opts{HTML}) {
      # Style stuff
      my $idref = '';
      if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
        $idref = ' id="d'.$doc_count.'code_'.$c.'"';
        $previous_c = $c;
      }
      # Main info
      for (@redirects_urls) {
        $_ = &show_url($_);
      }
      # HTTP message
      my $http_message;
      if ($currloc->{message}) {
        $http_message = &encode($currloc->{message});
        if ($c == 404 || $c == 500) {
          $http_message = '<span class="broken">'.
            $http_message.'</span>';
        }
      }
      my $redirmsg =
        $redirect_loop ? ' <em>redirect loop detected</em>' : '';
      printf("
<dt%s>%s <span class='msg_loc'>Line%s: %s</span> %s</dt>
<dd class='responsecode'><strong>Status</strong>: %s %s %s</dd>
<dd class='message_explanation'><p>%s %s</p></dd>\n",
             # Anchor for return codes
             $idref,
             # Color
             &status_icon($c),
             $s,
             # List of lines
             $lines_list,
             # List of redirects
             $redirected ? join(' redirected to ', @redirects_urls) . $redirmsg : &show_url($u),
             # Realm
             defined($currloc->{realm})
             ? sprintf('Realm: %s<br />', &encode($currloc->{realm})) : '',
             # HTTP original message
             # defined($currloc->{orig_message})
             # ? &encode($currloc->{orig_message}).
             # ' <span title="redirected to">-&gt;</span> '
             # : '',

             # Response code chain
             join(' <span class="redirected_to" title="redirected to">-&gt;</span> ',
                  map { &encode($_) } @http_codes),             
             # HTTP final message
             $http_message, 
             # What to do
             $whattodo,
             # Redirect too?
             $redirect_too ?
             sprintf(' <span %s>%s</span>', &bgcolor(301), $redirect_too) : '',
             );
      if ($#fragments >= 0) {
        printf("<dd>Broken fragments: <ul>\n");
      }
    } else {
      my $redirmsg = $redirect_loop ? ' redirect loop detected' : '';
      printf("\n%s\t%s\n  Code: %s %s\n%s\n",
             # List of redirects
             $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u,
             # List of lines
             $lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '',
             # Response code chain
             join(' -> ', @http_codes),
             # HTTP message
             $currloc->{message} || '',
             # What to do
             wrap(' To do: ', '        ', $whattodo));
      if ($#fragments >= 0) {
        if ($currloc->{code} == 200) {
          print("The following fragments need to be fixed:\n");
        } else {
          print("Fragments:\n");
        }
      }
    }
    # Fragments
    foreach my $f (@fragments) {
      my @unique_lines = &sort_unique(keys %{$links->{$u}{fragments}{$f}});
      my $plural = (scalar(@unique_lines) > 1) ? 's' : '';
      my $unique_lines = join(', ', @unique_lines);
      if ($Opts{HTML}) {
        printf("<li>%s<em>#%s</em> (line%s %s)</li>\n",
               &encode($u), &encode($f), $plural, $unique_lines);
      } else {
        printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines);
      }
    }

    print("</ul></dd>\n") if ($Opts{HTML} && scalar(@fragments));
  }

  # End of the table
  print("</dl>\n") if $Opts{HTML};

  return;
}

sub code_shown ($$)
{
  my ($u, $results) = @_;

  if ($results->{$u}{location}{record} == 200) {
    return $results->{$u}{location}{orig} || $results->{$u}{location}{record};
  } else {
    return $results->{$u}{location}{record};
  }
}

sub links_summary (\%\%\%\%)
{
  # Advices to fix the problems

  my %todo = ( 200 => 'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).',
               300 => 'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.',
               301 => 'This is a permanent redirect. The link should be updated to point to the more recent URI.',
               302 => 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
               303 => 'This rare status code points to a "See Other" resource. There is generally nothing to be done.',
               307 => 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
               400 => 'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.',
               401 => "The link is not public and the actual resource is only available behind authentication. If not already done, you could specify it.",
               403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
               404 => 'The link is broken. Double-check that you have not made any typo, or mistake in copy-pasting. If the link points to a resource that no longer exists, you may want to remove or fix the link.',
               405 => 'The server does not allow HTTP HEAD requests, which prevents the Link Checker to check the link automatically.  Check the link manually.',
               406 => "The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.",
               407 => 'The link is a proxy, but requires Authentication.',
               408 => 'The request timed out.',
               410 => 'The resource is gone. You should remove this link.',
               415 => 'The media type is not supported.',
               500 => 'This is a server side problem. Check the URI.',
               501 => 'Could not check this link: method not implemented or scheme not supported.',
               503 => 'The server cannot service the request, for some unknown reason.',
               # Non-HTTP codes:
               RC_ROBOTS_TXT() => sprintf('The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.',
                                          $Opts{HTML} ? ('<a href="http://www.robotstxt.org/wc/exclusion.html#robotstxt">', '</a>', "<a href=\"$Cfg{Doc_URI}#bot\">", '</a>') : ('') x 4),
               RC_DNS_ERROR() => 'The hostname could not be resolved. Check the link for typos.',
               RC_IP_DISALLOWED() => sprintf('The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.',
                                             $Opts{HTML} ? ('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') : ('') x 2),
               RC_PROTOCOL_DISALLOWED() => 'Accessing links with this URI scheme has been disabled in link checker.',
             );
  my %priority = ( 410 => 1,
                   404 => 2,
                   403 => 5,
                   200 => 10,
                   300 => 15,
                   401 => 20
                 );

  my ($links, $results, $broken, $redirects) = @_;

  # List of the broken links
  my @urls = keys %{$broken};
  my @dir_redirect_urls = ();
  if ($Opts{Redirects}) {
    # Add the redirected URI's to the report
    for my $l (keys %$redirects) {
      next unless (defined($results->{$l})
                   && defined($links->{$l})
                   && !defined($broken->{$l}));
      # Check whether we have a "directory redirect"
      # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/
      my ($redirect_loop, @redirects) = get_redirects($l, %$redirects);
      if ($#redirects == 1) {
        push(@dir_redirect_urls, $l);
        next;
      }
      push(@urls, $l);
    }
  }

  # Broken links and redirects
  if ($#urls < 0) {
    if (! $Opts{Quiet}) {
      print_doc_header();
      if ($Opts{HTML}) {
        print "<h3>Links</h3>\n<p>Valid links!</p>\n";
      } else {
        print "\nValid links.\n";
      }
    }
  } else {
    print_doc_header();
    print('<h3>') if $Opts{HTML};
    print("\nList of broken links and other issues");
    #print(' and redirects') if $Opts{Redirects};

    # Sort the URI's by HTTP Code
    my %code_summary;
    my @idx;
    foreach my $u (@urls) {
      if (defined($results->{$u}{location}{record}))  {
        my $c = &code_shown($u, $results);
        $code_summary{$c}++;
        push(@idx, $c);
      }
    }
    my @sorted = @urls[
                       sort {
                         defined($priority{$idx[$a]}) ?
                           defined($priority{$idx[$b]}) ?
                             $priority{$idx[$a]}
                               <=> $priority{$idx[$b]} :
                                 -1 :
                                   defined($priority{$idx[$b]}) ?
                                     1 :
                                         $idx[$a] <=> $idx[$b]
                                       } 0 .. $#idx
                      ];
    @urls = @sorted;
    undef(@sorted); undef(@idx);

    if ($Opts{HTML}) {
      # Print a summary
      print <<EOF;
</h3>
<p><em>There are issues with the URLs listed below. The table summarizes the
issues and suggested actions by HTTP response status code.</em></p>
<table class="report" border="1" summary="List of issues and suggested actions.">
<thead>
<tr>
<th>Code</th>
<th>Occurrences</th>
<th>What to do</th>
</tr>
</thead>
<tbody>
EOF
      foreach my $code (sort(keys(%code_summary))) {
        printf('<tr%s>', &bgcolor($code));
        printf('<td><a href="#d%scode_%s">%s</a></td>',
               $doc_count, $code, http_rc($code));
        printf('<td>%s</td>', $code_summary{$code});
        printf('<td>%s</td>', $todo{$code});
        print "</tr>\n";
      }
      print "</tbody>\n</table>\n";
    } else {
      print(':');
    }
    &show_link_report($links, $results, $broken, $redirects,
                      \@urls, 1, \%todo);
  }

  # Show directory redirects
  if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) {
    print_doc_header();
    print('<h3>') if $Opts{HTML};
    print("\nList of redirects");
    print("</h3>\n<p>The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.</p>") if $Opts{HTML};
    &show_link_report($links, $results, $broken, $redirects,
                      \@dir_redirect_urls);
  }

  return;
}

###############################################################################

################
# Global stats #
################

sub global_stats ()
{
  my $stop = &get_timestamp();
  my $n_docs =
    ($doc_count <= $Opts{Max_Documents}) ? $doc_count : $Opts{Max_Documents};
  return sprintf('Checked %d document%s in %s seconds.',
                 $n_docs,
                 ($n_docs == 1) ? '' : 's',
                 &time_diff($timestamp, $stop));
}

##################
# HTML interface #
##################

sub html_header ($;$$)
{
  my ($uri, $doform, $cookie) = @_;

  my $title = defined($uri) ? $uri : '';
  $title = ': ' . $title if ($title =~ /\S/);

  my $headers = '';
  if (! $Opts{Command_Line}) {
    $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $doform;
    $headers .= "Content-Type: text/html; charset=utf-8\n";
    $headers .= "Set-Cookie: $cookie\n" if $cookie;

    # mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same
    # print() statement as the last header
    $headers .= "Content-Language: en\n\n";
  }

  my $script = my $onload = '';
  if ($doform) {
    $script = <<'EOF';
<script type="text/javascript">
function uriOk(num)
{
  if (document.getElementById) {
    var u = document.getElementById('uri_' + num);
    var ok = false;
    if (u.value.length > 0) {
      if (u.value.search) {
        ok = (u.value.search(/\S/) != -1);
      } else {
        ok = true;
      }
    }
    if (! ok) u.focus();
    return ok;
  }
  return true;
}
</script>
EOF
   $onload = ' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"';
  }

  print $headers, $DocType, "
<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
<head>
<title>W3C Link Checker", &encode($title), "</title>
", $Head, $script, "</head>
<body", $onload, '>';
  &banner($title);
  return;
}

sub banner ($)
{
  my $tagline ="Check links and anchors in Web pages or full Web sites";

  printf(<<'EOF', URI->new_abs("../images/no_w3c.png", $Cfg{Doc_URI}), $tagline);
<div id="banner"><h1 id="title"><a href="http://www.w3.org/" title="W3C"><img alt="W3C" id="logo" src="%s" /></a>
<a href="checklink"><span>Link Checker</span></a></h1>
<p id="tagline">%s</p></div>
<div id="main">
EOF
  return;
}

sub status_icon($)
{
  my ($code) = @_;
  my $icon_type;
  my $r = HTTP::Response->new($code);
  if ($r->is_success()) {
    $icon_type = 'error'; # if is success but reported, it's because of broken frags => error
  } elsif (&informational($code)) {
    $icon_type = 'info';
  } elsif ($code == 300) {
    $icon_type = 'info';
  } elsif ($code == 401) {
    $icon_type = 'error';
  } elsif ($r->is_redirect()) {
    $icon_type = 'warning';
  } elsif ($r->is_error()) {
    $icon_type = 'error';
  } else {
    $icon_type = 'error';
  }
  return sprintf('<span class="err_type"><img src="%s" alt="%s" /></span>',
                 URI->new_abs("../images/info_icons/$icon_type.png",
                              $Cfg{Doc_URI}),
                 $icon_type);
}

sub bgcolor ($)
{
  my ($code) = @_;
  my $class;
  my $r = HTTP::Response->new($code);
  if ($r->is_success()) {
    return '';
  } elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) {
    $class = 'dubious';
  } elsif ($code == 300) {
    $class = 'multiple';
  } elsif ($code == 401) {
    $class = 'unauthorized';
  } elsif ($r->is_redirect()) {
    $class = 'redirect';
  } elsif ($r->is_error()) {
    $class = 'broken';
  } else {
    $class = 'broken';
  }
  return(' class="'.$class.'"');
}

sub show_url ($;$)
{
  my ($url, $fragment) = @_;
  if (defined($fragment)) {
    my $u = URI->new($url);
    $u->fragment($fragment);
    $url = $u->as_string();
  }
  $url = &encode($url);
  return sprintf('<a href="%s">%s</a>',
                 $url, defined($fragment) ? &encode($fragment) : $url);
}

sub html_footer ()
{
  printf("<p>%s</p>\n", &global_stats()) if ($doc_count > 0 && !$Opts{Quiet});
  if (! $doc_count) {
    print <<'EOF';
  <div class="intro">
      <p>This Link Checker looks for issues in links, anchors and referenced objects in a Web page, or recursively on a whole Web site.
      For best results, it is recommended to first ensure that the documents checked use <a href="http://validator.w3.org/">Valid (X)HTML Markup</a>. The Link Checker is part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and Quality Web tools</a>.</p>
  </div>
EOF
  }
  printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION);
</div><!-- main -->
<ul class="navbar" id="menu">
  <li><a href="%s" accesskey="3" title="Documentation for this Link Checker Service">Docs</a></li>
  <li><a href="http://search.cpan.org/dist/W3C-LinkChecker/" accesskey="2" title="Download the source / Install this service">Download</a></li>
  <li><a href="%s#csb" title="feedback: comments, suggestions and bugs" accesskey="4">Feedback</a></li>
  <li><a href="http://validator.w3.org/" title="Validate your markup with the W3C Markup Validation Service">Validator</a></li>
</ul>
<div>
<address>
%s<br /> %s
</address>
</div>
</body>
</html>
EOF
  return;
}

sub print_form (\%$$)
{
  my ($params, $cookie, $check_num) = @_;

  # Split params on \0, see CGI's docs on Vars()
  while (my ($key, $value) = each(%$params)) {
    if ($value) {
      my @vals = split(/\0/, $value, 2);
      $params->{$key} = $vals[0];
    }
  }

  # Override undefined values from the cookie, if we got one.
  my $valid_cookie = 0;
  if ($cookie) {
    my %cookie_values = $cookie->value();
    if (!$cookie_values{clear}) { # XXX no easy way to check if cookie expired?
      $valid_cookie = 1;
      while (my ($key, $value) = each(%cookie_values)) {
        $params->{$key} = $value unless defined($params->{$key});
      }
    }
  }

  my $chk = ' checked="checked"';
  $params->{hide_type} = 'all' unless $params->{hide_type};

  my $requested_uri = &encode($params->{uri}        || '');
  my $sum = $params->{summary}                ? $chk : '';
  my $red = $params->{hide_redirects}         ? $chk : '';
  my $all = ($params->{hide_type} ne 'dir')   ? $chk : '';
  my $dir = $all                              ? ''   : $chk;
  my $acc = $params->{no_accept_language}     ? $chk : '';
  my $ref = $params->{no_referer}             ? $chk : '';
  my $rec = $params->{recursive}              ? $chk : '';
  my $dep = &encode($params->{depth}                || '');

  my $cookie_options = '';
  if ($valid_cookie) {
    $cookie_options = "
    <label for=\"cookie1_$check_num\"><input type=\"radio\" id=\"cookie1_$check_num\" name=\"cookie\" value=\"nochanges\" checked=\"checked\" /> Don't modify saved options</label>
    <label for=\"cookie2_$check_num\"><input type=\"radio\" id=\"cookie2_$check_num\" name=\"cookie\" value=\"set\" /> Save these options</label>
    <label for=\"cookie3_$check_num\"><input type=\"radio\" id=\"cookie3_$check_num\" name=\"cookie\" value=\"clear\" /> Clear saved options</label>";
  } else {
    $cookie_options = "
    <label for=\"cookie_$check_num\"><input type=\"checkbox\" id=\"cookie_$check_num\" name=\"cookie\" value=\"set\" /> Save options in a <a href=\"http://www.w3.org/Protocols/rfc2109/rfc2109\">cookie</a></label>";
  }

  print "<form action=\"", $Opts{_Self_URI}, "\" method=\"get\" onsubmit=\"return uriOk($check_num)\">
<p><label for=\"uri_$check_num\">Enter the address (<a href=\"http://www.w3.org/Addressing/\">URL</a>)
of a document that you would like to check:</label></p>
<p><input type=\"text\" size=\"50\" id=\"uri_$check_num\" name=\"uri\" value=\"",$requested_uri,"\" /></p>
<fieldset id=\"extra_opt_uri_$check_num\" class=\"moreoptions\">
	<legend class=\"toggletext\">More Options</legend>
	<div class=\"options\">
  <p>
    <label for=\"summary_$check_num\"><input type=\"checkbox\" id=\"summary_$check_num\" name=\"summary\" value=\"on\"", $sum, " /> Summary only</label>
    <br />
    <label for=\"hide_redirects_$check_num\"><input type=\"checkbox\" id=\"hide_redirects_$check_num\" name=\"hide_redirects\" value=\"on\"", $red, " /> Hide <a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3\">redirects</a>:</label>
    <label for=\"hide_type_all_$check_num\"><input type=\"radio\" id=\"hide_type_all_$check_num\" name=\"hide_type\" value=\"all\"", $all, " /> all</label>
    <label for=\"hide_type_dir_$check_num\"><input type=\"radio\" id=\"hide_type_dir_$check_num\" name=\"hide_type\" value=\"dir\"", $dir, " /> for directories only</label>
    <br />
    <label for=\"no_accept_language_$check_num\"><input type=\"checkbox\" id=\"no_accept_language_$check_num\" name=\"no_accept_language\" value=\"on\"", $acc, " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> header</label>
    <br />
    <label for=\"no_referer_$check_num\"><input type=\"checkbox\" id=\"no_referer_$check_num\" name=\"no_referer\" value=\"on\"", $ref, " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36\">Referer</a></tt> header</label>
    <br />
    <label title=\"Check linked documents recursively (maximum: ", $Opts{Max_Documents}, " documents)\" for=\"recursive_$check_num\"><input type=\"checkbox\" id=\"recursive_$check_num\" name=\"recursive\" value=\"on\"", $rec, " /> Check linked documents recursively</label>,
    <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth_$check_num\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth_$check_num\" name=\"depth\" value=\"", $dep, "\" /></label>
    <br /><br />", $cookie_options, "
  </p>
  </div>
</fieldset>
<p class=\"submit_button\"><input type=\"submit\" name=\"check\" value=\"Check\" /></p>
</form>
<div class=\"intro\" id=\"don_program\"></div>  
<script type=\"text/javascript\" src=\"http://www.w3.org/QA/Tools/don_prog.js\"></script>  
";
  return;
}

sub encode (@)
{
  return $Opts{HTML} ? HTML::Entities::encode(@_) : @_;
}

sub hprintf (@)
{
  print_doc_header();
  if (! $Opts{HTML}) {
    printf(@_);
  } else {
    print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1]));
  }
  return;
}

# Print the document header, if it hasn't been printed already.
# This is invoked before most other output operations, in order
# to enable quiet processing that doesn't clutter the output with
# "Processing..." messages when nothing else will be reported.
sub print_doc_header ()
{
  if (defined($doc_header)) {
    print $doc_header;
    undef($doc_header);
  }
}


# Local Variables:
# mode: perl
# indent-tabs-mode: nil
# tab-width: 2
# perl-indent-level: 2
# End:
# ex: ts=2 sw=2 et
