#! /usr/bin/perl -w
# {ɦW: chklinks
# {: Ȫ٨̺sˬd{
# {@: ̺ imacat <imacat@mail.imacat.idv.tw>
# Z: 2003-04-12
# vr: vҦ (c) 2003 ̺

use strict;

use Getopt::Long qw(GetOptions);
use Net::HTTP qw();
use Socket qw(inet_aton inet_ntoa);
use URI qw();

use vars qw($THIS_FILE $VERSION);
$0 =~ /[^\/]*$/;
$THIS_FILE = $&;
$VERSION = "2.0.0";

use vars qw(%DNS %SOCKS %NORECON_REP %URIS %REFS);
use vars qw(@LOCALS $LTARGET %LFOUND $STARTDIR @REMOTES %RFOUND);
use vars qw($TOTAL $SUCCESS);
%DNS = qw();
%SOCKS = qw();
%NORECON_REP = qw();
%URIS = qw();
%REFS = qw();

use vars qw($RECURSIVE $PARENT $NOREMOTE @EXCLUDES $VERBOSE);
$RECURSIVE = 0;
$PARENT = 0;
$NOREMOTE = 0;
@EXCLUDES = qw();
$VERBOSE = 1;

use vars qw($UASTR $VERSTR $SHORTHELP $HELPMSG);
$UASTR = "$THIS_FILE/$VERSION";
$SHORTHELP = "Try `$THIS_FILE --help' for more information.";
$VERSTR = "$& v$VERSION by imacat <imacat\@mail.imacat.idv.tw>";
$HELPMSG = << "EOT";
Usage: chklinks.pl [options] url
Check links on the specific website.

  -1,--onelevel      Check the links on this page and stops. (default)
  -r,--recursive     Recursively check through all the links on this host.
  -b,--below         Only check the links below this directory. (default)
  -p,--parent        Trace back to the parent directories.
  -l,--local         Only check the links on this same host.
  -s,--span          Check the links to other hosts (without recursion).
                     (default)
  -e,--exclude path  Exclude this path.  Check for their existence but not
                     check the links on it, just like they are on another
                     host.  Multiple --exclude are OK.
  -i,--include path  Include this path.  An opposite of --exclude that can
                     cancel its effect.  The latter has a higher priority.
  -d,--debug         Display debug messages.  Multiple --debug to debug more.
  -q,--quiet         Disable debug messages.  An opposite that can cancel the
                     effect of --debug.
  -h,--help          Display this help.
  -v,--version       Display version number.
  url                The URL of the website to check for.

EOT

use vars qw(%STATDESC);
%STATDESC = (
    200 => "OK",
    301 => "Moved Permanently",
    302 => "Found",
    303 => "See Other",
    307 => "Temporary Redirect",
    400 => "Bad Request",
    401 => "Unauthorized",
    403 => "Forbidden",
    404 => "Not Found",
    410 => "Gone",
    500 => "Internal Server Error",
    503 => "Service Unavailable",
);

main();
exit 0;


# main: Main program
sub main {
    local ($_, %_);
    my (@starts);
    
    @starts = parse_args();
    check_links($_) foreach @starts;
    
    print STDERR "Done.  " . (time - $^T) . " seconds elapsed.\n"
        if $VERBOSE > 0;
    return;
}

# parse_args: Parse the arguments
sub parse_args {
    local $_;
    my (@starts);
    
    # Get the arguments oѼ
    eval {
        local $SIG{__WARN__} = sub { die $_[0]; };
        Getopt::Long::Configure(qw(no_auto_abbrev bundling));
        GetOptions( "onelevel|1"=>sub { $RECURSIVE = 0; },
                    "recursive|r"=>sub { $RECURSIVE = 1; },
                    "below|b"=>sub { $PARENT = 0; },
                    "parent|p"=>sub { $PARENT = 1; },
                    "local|l"=>sub { $NOREMOTE = 1; },
                    "span|s"=>sub { $NOREMOTE = 0; },
                    "exclude|e=s"=>sub { unshift @EXCLUDES, "-" . $_[1]; },
                    "include|i=s"=>sub { unshift @EXCLUDES, "+" . $_[1]; },
                    "debug|d"=>sub { $VERBOSE++; },
                    "quiet|q"=>sub { $VERBOSE-- if $VERBOSE > 0; },
                    "help|h"=>sub { print $HELPMSG; exit 0; },
                    "version|v"=>sub { print "$VERSTR\n"; exit 0; });
    };
    die "$THIS_FILE: ERROR: $@" if $@ ne "";
    @_ = @ARGV;
    
    # Show progress
    $| = 1 if $VERBOSE > 2;
    
    @starts = qw();
    foreach (@_) {
        my ($uri, $canon);
        $uri = new URI($_);
        if (!defined $uri->scheme || $uri->scheme ne "http") {
            print STDERR "$THIS_FILE: WARNING: $_: Skipping non-HTTP URI\n"
                if $VERBOSE > 0;
            next;
        }
        if (!defined $uri->host || $uri->host eq "") {
            print STDERR "$THIS_FILE: WARNING: $_: Skipping hostless URI\n"
                if $VERBOSE > 0;
            next;
        }
        if ($uri->path eq "") {
            print STDERR "$THIS_FILE: WARNING: $_: URI not canonical\n"
                if $VERBOSE > 0;
            $uri->path("/");
        }
        $uri->fragment(undef);
        $canon = $uri->canonical;
        if (exists $URIS{$canon}) {
            print STDERR "$THIS_FILE: WARNING: Skipping duplicated URI $_\n"
                if $VERBOSE > 0;
            next;
        }
        $URIS{$canon} = $uri;
        push @starts, $canon;
    }
    
    die "$THIS_FILE: ERROR: Please specify the URL to check\n"
        if scalar(@starts) == 0;
    
    return @starts;
}

# check_links: Check links from an URL
sub check_links {
    local ($_, %_);
    my ($start, $page, $doc);
    $start = $_[0];
    
    @LOCALS = qw($start);
    %LFOUND = ($start => 1);
    $LTARGET = $URIS{$start}->host;
    $LTARGET .= ":" . $URIS{$start}->port if $URIS{$start}->port != 80;
    @REMOTES = qw();
    ($TOTAL, $SUCCESS) = (0, 0);
    $STARTDIR = $URIS{$start}->path;
    $STARTDIR =~ s/[^\/]+$//;
    
    $page = $start;
    ($page, $doc) = fetch_http_redir($page);
    return if !defined $doc;
    get_links($doc, $page);
    while (scalar(@LOCALS) > 0 || scalar(@REMOTES) > 0) {
        # Always process locals first
        if (scalar(@LOCALS) > 0) {
            @LOCALS = sort @LOCALS;
            $page = $LOCALS[0];
            ($page, $doc) = fetch_http_redir($page);
            # Skip the failed request
            next if !defined $doc;
            # Skip if non-recursive
            next if !$RECURSIVE;
            # Skip if it is excluded
            next if is_excluded($page);
            get_links($doc, $page);
        
        # Then the remote
        } else {
            @REMOTES = sort @REMOTES;
            $page = $REMOTES[0];
            fetch_http_redir($page);
            # We will never parse the remote documents
        }
    }
    
    print "$start: $TOTAL checked, $SUCCESS success\n"
        if $VERBOSE > 0;
    return;
}

# fetch_http: Fetch an HTTP document, and deal with redirection
sub fetch_http_redir {
    local ($_, %_);
    my ($s, $doc, $r);
    $s = $_[0];
    
    while (1) {
        ($doc, $r) = fetch_http($s);
        # No more redirection
        return ($s, $doc) if !defined $r;
        # Redirection target was scheduled before
        return ($s, undef) if exists $LFOUND{$r} || exists $RFOUND{$r};
        # Record the referer
        if (!exists $REFS{$r}) {
            $REFS{$r} = $s;
            $REFS{$r} = $REFS{$s} . " -> " . $REFS{$r} if exists $REFS{$s};
        }
        # Process the next
        $s = $r;
    }
}

# fetch_http: Fetch an HTTP document
sub fetch_http {
    local ($_, %_);
    my ($s, $uri, $ip, $target, $retry, $srep, $msg);
    my (%r, $code, $mess, $buf, $body, $c);
    $s = $_[0];
    $uri = $URIS{$s};
    
    # Add the counter
    $TOTAL++;
    # Remove it from the list
    $_ = $uri->host;
    $_ .= ":" . $uri->port if $uri->port != 80;
    if ($_ eq $LTARGET) {
        shift @LOCALS;
    } else {
        shift @REMOTES;
    }
    # Set the string to report
    $srep = $s;
    $srep .= " ($REFS{$s})" if exists $REFS{$s};
    
    # Resolve the hostname
    if (!exists $DNS{$uri->host}) {
        print STDERR "Resolving " . $uri->host . "... " if $VERBOSE > 2;
        # Success
        if (defined($_ = inet_aton $uri->host)) {
            $ip = inet_ntoa $_;
            $DNS{$uri->host} = $ip;
            print STDERR "$ip\n" if $VERBOSE > 2;
        
        # Failed
        } else {
            print STDERR "failed\n" if $VERBOSE > 2;
            $msg = $srep;
            $msg .= " Failed resolving " . $uri->host if $VERBOSE > 0;
            print "$msg\n";
            return;
        }
    
    # Resolved before
    } else {
        $ip = $DNS{$uri->host};
    }
    
    $target = $ip;
    $target .= ":" . $uri->port if $uri->port != 80;
    
    # Not connected yet
    if (!exists $SOCKS{$target}) {
        return if !defined($SOCKS{$target} = new_http($ip, $uri->port, $s));
    
    # Connected, but undefined (probably for disconnection)
    } elsif (!defined $SOCKS{$target}) {
        return if !defined($SOCKS{$target} = re_http($ip, $uri->port, $s));
    }
    
    print STDERR "Requesting $srep... " if $VERBOSE > 2;
    $retry = 1;
    TRY_REQUEST: while (1) {
        # Send the request
        print STDERR "let's see\n" if $VERBOSE > 3;
        print STDERR " Sending request... " if $VERBOSE > 3;
        %_ = (  "Host"          => $uri->host,
                "User-Agent"    => $UASTR);
        if (exists $REFS{$s}) {
            $_ = $REFS{$s};
            # Get the last part
            s/^.* -> //s;
            $_{"Referer"} = $_;
        }
        eval {
            $_ = $SOCKS{$target}->write_request(GET=>$uri->path_query, %_);
        };
        # Failed
        if ($@ ne "") {
            print STDERR "failed\n" if $VERBOSE > 2;
            print STDERR "$THIS_FILE: WARNING: " . $uri->path_query . "$@"
                if $VERBOSE > 2;
            # Disconnect it
            undef $SOCKS{$target};
            # No more retry
            if ($retry == 0) {
                $NORECON_REP{$target} = 1;
                return;
            }
            # Try to reconnect
            $retry--;
            return if !defined($SOCKS{$target} = re_http($ip, $uri->port, $s));
            print STDERR "Requesting $srep... " if $VERBOSE > 2;
            redo TRY_REQUEST;
        }
        print STDERR "OK\n" if $VERBOSE > 3;
        
        # Read the response headers
        print STDERR " Reading response headers... " if $VERBOSE > 3;
        eval {
            ($code, $mess, %r) = $SOCKS{$target}->read_response_headers();
        };
        # Failed
        if ($@ ne "") {
            print STDERR "failed\n" if $VERBOSE > 2;
            print STDERR "$THIS_FILE: WARNING: $@" if $VERBOSE > 2;
            # Disconnect it
            undef $SOCKS{$target};
            # No more retry
            if ($retry == 0) {
                $NORECON_REP{$target} = 1;
                return;
            }
            # Try to reconnect
            $retry--;
            return if !defined($SOCKS{$target} = re_http($ip, $uri->port, $s));
            print STDERR "Requesting $srep... " if $VERBOSE > 2;
            redo TRY_REQUEST;
        }
        print STDERR "OK\n" if $VERBOSE > 3;
        
        # Read the body
        print STDERR " Reading response body... " if $VERBOSE > 3;
        print STDERR "let's see\n" if $VERBOSE > 4;
        $body = "";
        $c = 1;
        while (1) {
            print STDERR "  Reading response body #$c... " if $VERBOSE > 4;
            eval {
                $_ = $SOCKS{$target}->read_entity_body($buf, 65535);
            };
            # Bad HTTP chunks
            if ($@ ne "") {
                print STDERR "failed\n" if $VERBOSE > 2;
                print STDERR "$THIS_FILE: WARNING: $@" if $VERBOSE > 2;
                undef $SOCKS{$target};
                # Do not try to reconnect in the future
                $NORECON_REP{$target} = 1;
                return;
            }
            # Read error
            if (!defined $_) {
                print STDERR "failed\n" if $VERBOSE > 2;
                print STDERR "$THIS_FILE: WARNING: $@" if $VERBOSE > 2;
                # Disconnect it
                undef $SOCKS{$target};
                # No more retry
                if ($retry == 0) {
                    $NORECON_REP{$target} = 1;
                    return;
                }
                # Try to reconnect
                $retry--;
                return if !defined($SOCKS{$target} = re_http($ip, $uri->port, $s));
                print STDERR "Requesting $srep... " if $VERBOSE > 2;
                redo TRY_REQUEST;
            }
            # EOF
            if ($_ == 0) {
                print STDERR "finished\n" if $VERBOSE > 4;
                last;
            }
            # Add this part of the body
            print STDERR "$_ bytes\n" if $VERBOSE > 4;
            $body .= $buf;
            $c++;
        }
        print STDERR "  Totally " if $VERBOSE > 4;
        print STDERR "" . length($body) . " bytes\n" if $VERBOSE > 3;
        print STDERR " Requesting $srep... " if $VERBOSE > 3;
        last;
    }
    print STDERR "OK\n" if $VERBOSE > 2;
    
    # Good response
    if ($code == 200) {
        $SUCCESS++;
        # Only return the body if text/html
        return $body if exists $r{"Content-Type"}
                        && $r{"Content-Type"} =~ /^text\/html\b/;
        return;
    
    # Redirected response
    } elsif (grep($code == $_, (301, 302, 303, 307)) > 0) {
        my ($ruri, $rtarget);
        # Redirect without location is an error
        if (!exists $r{"Location"}) {
            print STDERR "$srep $code $STATDESC{$code}: Redirect without location\n";
            return;
        }
        
        $ruri = new URI($r{"Location"});
        
        # Skip non-HTTP location
        return if defined $ruri->scheme && $ruri->scheme ne "http";
        
        # Deal with non-canonical location
        # Non-absolute location should be warned
        if (!defined $ruri->scheme) {
            print STDERR "$THIS_FILE: WARNING: $s: $code $STATDESC{$code}: Location not canonical: "
                    . $r{"Location"} . "\n"
                if $VERBOSE > 1;
            $ruri = $ruri->abs($uri);
        }
        if (!defined $ruri->host || $ruri->host eq "") {
            print STDERR "$THIS_FILE: WARNING: $s: $code $STATDESC{$code}: Location not canonical: "
                    . $r{"Location"} . "\n"
                if $VERBOSE > 1;
            $ruri->host($uri->host);
        }
        if ($ruri->path eq "") {
            print STDERR "$THIS_FILE: WARNING: $s: $code $STATDESC{$code}: Location not canonical: "
                    . $r{"Location"} . "\n"
                if $VERBOSE > 1;
            $ruri->path("/");
        }
        $ruri->fragment(undef);
        
        # Save it
        $URIS{$ruri->canonical} = $ruri if !exists $URIS{$ruri->canonical};
        
        # 301 should be treated as an error
        if ($code == 301) {
            $msg = $srep;
            $msg .= " $code $STATDESC{$code}: " . $ruri->canonical
                if $VERBOSE > 0;
            print "$msg\n";
            return (undef, $ruri->canonical);
        }
        
        $SUCCESS++;
        # Warn when not quiet
        print "$srep $code $STATDESC{$code}: " . $ruri->canonical . "\n"
                if $VERBOSE > 1;
        return (undef, $ruri->canonical);
    
    # Bad response that we can go on
    } elsif (grep($code == $_, (400, 403, 404, 410)) > 0) {
        $msg = $srep;
        $msg .= " $code $STATDESC{$code}" if $VERBOSE > 0;
        print "$msg\n";
        return;
    
    # Bad response that we should disconnect
    } elsif ($code == 500) {
        $msg = $srep;
        $msg .= " $code $STATDESC{$code}" if $VERBOSE > 0;
        print "$msg\n";
        undef $SOCKS{$target};
        return;
    
    # Bad response that we should disconnect and never retry nor report
    } elsif ($code == 503) {
        $msg = $srep;
        $msg .= " $code $STATDESC{$code}" if $VERBOSE > 0;
        print "$msg\n";
        undef $SOCKS{$target};
        # No more try and no more report
        $NORECON_REP{$target} = 0;
        return;
    
    # Unable to process
    } else {
        # Unable to process
        print "$srep $code $mess: Unable to process this response\n";
        return;
    }
    
    return;
}

# re_http: re-establish an HTTP connection
sub re_http {
    local ($_, %_);
    my ($ip, $port, $s, $target, $HTTP, $srep, $msg);
    ($ip, $port, $s) = @_;
    
    # Set the string to report
    $srep = $s;
    $srep .= " ($REFS{$s})" if exists $REFS{$s};
    
    $target = $ip;
    $target .= ":" . $port if $port != 80;
    
    # Don not try to reconnected
    if (exists $NORECON_REP{$target}) {
        # Report it
        if ($NORECON_REP{$target}) {
            $msg = $srep;
            $msg .= " Failed connecting $target" if $VERBOSE > 0;
            print "$msg\n";
        }
        return;
    
    # Try to reconnect
    } else {
        $HTTP = new_http($ip, $port, $s);
    }
    
    return $HTTP;
}

# new_http: initialize an HTTP connection
sub new_http {
    local ($_, %_);
    my ($ip, $port, $s, %opts, $target, $HTTP, $srep, $msg);
    ($ip, $port, $s) = @_;
    
    # Set the string to report
    $srep = $s;
    $srep .= " ($REFS{$s})" if exists $REFS{$s};
    
    $target = $ip;
    %opts = (   Host        => $ip,
                KeepAlive   => 300);
    if ($port != 80) {
        $opts{"PeerPort"} = $port;
        $target .= ":" . $port;
    }
    print STDERR "Connecting to $target... " if $VERBOSE > 2;
    # Success
    if (defined($HTTP = new Net::HTTP(%opts))) {
        print STDERR "success\n" if $VERBOSE > 2;
    
    # Failed
    } else {
        print STDERR "failed\n" if $VERBOSE > 2;
        $msg = $srep;
        $msg .= " Failed connecting $target" if $VERBOSE > 0;
        print "$msg\n";
        # Don not try to reconnect, but still report in the future
        $NORECON_REP{$target} = 1;
    }
    return $HTTP;
}

# get_links: Parse the document and get the links inside
sub get_links {
    local ($_, %_);
    my ($html, $base, $tag);
    ($html, $base) = @_;
    
    # Join the lines
    $html =~ s/\r\n/\n/g;
    $html =~ s/\n/ /g;
    
    # Get rid of the scripts and the styles
    $html =~ s/(<style\s+[^<>"']*(?:(["']).*?\1[^<>"']*)*>).*?(<\/style>)/$1$2/sig;
    $html =~ s/(<script\s+[^<>"']*(?:(["']).*?\1[^<>"']*)*>).*?(<\/script>)/$1$2/sig;
    
    # Collecting anchors
    while ($html =~ s/(<a\s+[^<>"']*(?:(["']).*?\2[^<>"']*)*>)//is) {
        $tag = $1;
        proc_link_val($1, $base)
            while $tag =~ s/(?<=\s)href=([^\s<>"']*(?:(["']).*?\2[^\s<>"']*)*)(?=[\s>])//is;
    }
    
    # Collecting images
    while ($html =~ s/(<img\s+[^<>"']*(?:(["']).*?\2[^<>"']*)*>)//is) {
        $tag = $1;
        proc_link_val($1, $base)
            while $tag =~ s/(?<=\s)src=([^\s<>"']*(?:(["']).*?\2[^\s<>"']*)*)(?=[\s>])//is;
    }
    
    # Collecting links
    while ($html =~ s/(<link\s+[^<>"']*(?:(["']).*?\2[^<>"']*)*>)//is) {
        $tag = $1;
        proc_link_val($1, $base)
            while $tag =~ s/(?<=\s)href=([^\s<>"']*(?:(["']).*?\2[^\s<>"']*)*)(?=[\s>])//is;
    }
    
    # Collecting external scripts
    while ($html =~ s/(<script\s+[^<>"']*(?:(["']).*?\2[^<>"']*)*>)//is) {
        $tag = $1;
        proc_link_val($1, $base)
            while $tag =~ s/(?<=\s)src=([^\s<>"']*(?:(["']).*?\2[^\s<>"']*)*)(?=[\s>])//is;
    }
    
    # Collecting forms
    while ($html =~ s/(<form\s+[^<>"']*(?:(["']).*?\2[^<>"']*)*>)//is) {
        $tag = $1;
        proc_link_val($1, $base)
            while $tag =~ s/(?<=\s)action=([^\s<>"']*(?:(["']).*?\2[^\s<>"']*)*)(?=[\s>])//is;
    }
    
    return;
}

# Process a found link value
sub proc_link_val {
    local ($_, %_);
    my ($base, $target);
    ($_, $base) = @_;
    
    # Remove the quotation
    s/(["'])(.*?)\1/$2/sg;
    
    # Unescape the HTML character entities
    s/&lt;/</g;
    s/&gt;/>/g;
    s/&quot;/"/g;
    s/&amp;/&/g;
    
    $_ = new URI($_);
    
    # Canonize it
    $_ = $_->abs($URIS{$base}) if !defined $_->scheme;
    # Skip non-HTTP schemes
    return if $_->scheme ne "http";
    $_->host($URIS{$base}->host) if !defined $_->host || $_->host eq "";
    $_->path("/") if $_->path eq "";
    $_->fragment(undef);
    
    # Save it
    $URIS{$_->canonical} = $_ if !exists $URIS{$_->canonical};
    $REFS{$_->canonical} = $base if !exists $REFS{$_->canonical};
    
    $target = $_->host;
    $target .= ":" . $_->port if $_->port != 80;
    # Local
    if ($target eq $LTARGET) {
        # Skip found URLs
        return if exists $LFOUND{$_->canonical};
        # Skip parent directories
        return if !$PARENT && $_->path !~ /^$STARTDIR/;
        push @LOCALS, $_->canonical;
        $LFOUND{$_->canonical} = 1;
    
    # Remote
    } else {
        # No remote
        return if $NOREMOTE;
        # Skip found URLs
        return if exists $RFOUND{$_->canonical};
        push @REMOTES, $_->canonical;
        $RFOUND{$_->canonical} = 1;
    }
    
    return;
}

# If this page is excluded
sub is_excluded {
    local ($_, %_);
    my ($page, $path, $excpath, $match);
    $page = $_[0];
    
    foreach (@EXCLUDES) {
        $path = substr $_, 1;
        # Directory match
        if (/\/$/) {
            $match = ($URIS{$page}->path =~ /^$path/);
        # File or directory match
        } else {
            $match = ($URIS{$page}->path =~ /^$path(?![^\/?])/);
        }
        # minus (-) means "exclude"
        return /^-/ if $match;
    }
    
    return 0;
}

__END__
