#!/usr/bin/perl
use strict;
use warnings;

=head1 NAME

grepurl - print links in HTML

=head1 SYNOPSIS

	grepurl [-bdv] [-e extension[,extension] [-E extension[,extension]
		[-h host[,host]] [-H host[,host]] [-p regex] [-P regex]
		[-s scheme[,scheme]] [-s scheme[,scheme]] [-u URL]
	
=head1 DESCRIPTION

The grepurl program searches through the URL specified in the -u
switch and prints the URLs that satisfies the given set of options.
It applies the options roughly in order of which part of the URL
the option affects (scheme, host, path, extension).

So far, grepurl expects to search through HTML, although I want to
add other content types, especially plain text, RSS feeds, and so on.

=head1 OPTIONS

=over 4

=item -a

arrange (sort) links in ascending order

=item -A

arrange (sort) links in descending order

=item -b

turn relative URLs into absolute ones

=item -d

turn on debugging output

=item -e EXTENSION 

select links with these extensions (comma separated)

=item -E EXTENSION

exclude links with these extensions (comma separated)

=item -h HOST 

select links with these hosts (comma separated)

=item -H HOST

exclude links with these hosts (comma separated)

=item -p REGEX 

select only paths that match this Perl regex

=item -P REGEX

exclude paths that match this Perl regex

=item -r REGEX 

select only URLs that match this Perl regex (applies to entire URL)

=item -R REGEX

exclude URLs that match this Perl regex (applies to entire URL)

=item -s SCHEME 

select only these schemes (comma separated)

=item -S SCHEME 

exclude these schemes (comma separated)

=item -t FILE

extract URLs from plain text file (not implemented)

=item -u URL

extract URLs from URL (may be file://), expects HTML
 
=item -v 

turn on verbose output

=item -1 

print found URLs only once (print a unique list)

=back

=head2 Examples

=over 4

=item Print all the links

	grepurl -u http://www.example.com/

=item Print all the links, and resolve relative URLs

	grepurl -b -u http://www.example.com/

=item Print links with the edxtension .jpg

	grepurl -e jpg -u http://www.example.com/

=item Print links with the edxtension .jpg and .jpeg

	grepurl -e jpg,jpeg -u http://www.example.com/

=item Do not print links with the extension .cfm or .asp

	grepurl -E cfm,asp -u http://www.example.com/

=item Print only links to www.panix.com

	grepurl -h www.panix.com -u http://www.example.com/

=item Print only links to www.panix.com or www.perl.com

	grepurl -h www.panix.com,www.perl.com -u http://www.example.com/

=item Do not print links to www.microsoft.com

	grepurl -H www.microsoft.com -u http://www.example.com/

=item Print links with "perl" in the path

	grepurl -p perl -u http://www.example.com

=item Print links with "perl" or "pearl" in the path

	grepurl -p "pea?rl" -u http://www.example.com

=item Print links with "fred" or "barney" in the path

	grepurl -p "fred|barney" -u http://www.example.com

=item Do not print links with "SCO" in the path

	grepurl -P SCO -u http://www.example.com

=item Do not print links whose path matches "Micro.*"

	grepurl -P "Micro.*" -u http://www.example.com

=item Do not print links whose URL matches "Micro.*" anywhere

	grepurl -R "Micro.*" -u http://www.example.com

=item Print only web links

	grepurl -s http -u http://www.example.com/

=item Print ftp and gopher links

	grepurl -s ftp,gopher -u http://www.example.com/

=item Exclude ftp and gopher links

	grepurl -S ftp,gopher -u http://www.example.com/

=item Arrange the links in an ascending sort

	grepurl -a -u http://www.example.com/

=item Arrange the links in an descending sort

	grepurl -A -u http://www.example.com/

=item Arrange the links in an descending sort, and print unique URLs

	grepurl -A -1 -u http://www.example.com/

=back

=head1 TO DO

=over 4

=item Operate over an entire directory or website

=back

=head1 SEE ALSO

urifind by darren chamberlain E<lt>darren@cpan.orgE<gt>

=head1 SOURCE AVAILABILITY

This source is in Github

	https://github.com/briandfoy/grepurl

=head1 AUTHOR

brian d foy, C<< <bdfoy@cpan.org> >>

=head1 COPYRIGHT

Copyright 2004-2013, brian d foy, All rights reserved.

You may use this program under the same terms as Perl itself.

=cut

use File::Basename;
use FindBin;
use Getopt::Std;
use HTML::SimpleLinkExtor;
use LWP::Simple;
use URI;

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
my $Version = '1.01';

unless( @ARGV ) {
	print "$FindBin::Script $Version\n";
	exit;
	}
	
my %opts;
getopts( 'bdv1' . 'aAiIjJ' . 'e:E:h:H:p:P:s:S:t:u:', \%opts );

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
my $Debug         = $opts{d} || $ENV{GREPURL_DEBUG}   || 0;
my $Verbose       = $opts{v} || $ENV{GREPURL_VERBOSE} || 0;
my $Either        = $Debug   || $Verbose              || 0;

my $Hosts         = uncommify( $opts{h} );
my $No_hosts      = uncommify( $opts{H} );

my $Schemes       = uncommify( $opts{'s'} );
my $No_schemes    = uncommify( $opts{S} );

my $Extensions    = uncommify( $opts{e} );
my $No_extensions = uncommify( $opts{E} );

my $Path          = regex( $opts{p} );
my $No_path       = regex( $opts{P} );

my $Regex         = regex( $opts{r} );
my $No_regex      = regex( $opts{R} );

debug_summary() if $Debug;

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
my $text = get_text();
	print "$$text\n" if $Debug;
die "There is no text!\n" unless( defined $$text && length $$text > 0 );
my $urls = get_urls( $text );

my $Base = $opts{u};

@$urls = do {	
	if( defined $opts{b} ) {
		print "Base url is $Base\n" if $Debug;
		map { URI->new_abs( $_, $Base )->canonical } @$urls;
		}
	else {
		map { URI->new( $_, $Base )->canonical } @$urls;
		}
	};

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Filters
#
# To select things, only pass through those elements
#
# To not select things, pass through anything that does not match
@$urls = map { 
	my $s = $_->can( 'scheme' ) ? $_->scheme : undef;
	defined $s ?
		exists $Schemes->{$s} ? $_ : ()
		:
		()
	} @$urls if defined $opts{'s'};

@$urls = map { 
	my $s = $_->can( 'scheme' ) ? $_->scheme : undef;
	defined $s ?
		exists $No_schemes->{$s} ? () : $_ 
		:
		$_
	} @$urls if defined $opts{S};

@$urls = map {
	my $h = $_->can( 'host' ) ? $_->host : undef;
	defined $h ? 
		exists $Hosts->{ $h } ? $_ : () 
		:
		()
	} @$urls if defined $opts{h};

@$urls = map {
	my $h = $_->can( 'host' ) ? $_->host : undef;
	defined $h ? 
		exists $No_hosts->{ $h } ? () : $_ 
		:
		$_
	} @$urls if defined $opts{H};

@$urls = map { 
	my $p       = $_->path; 
	my( $file ) = basename( $p );
	my( $e )    = $file =~ /\.([^.]+)$/;
	$e ||= '';
	exists $Extensions->{$e} ? $_ : () 
	} @$urls if defined $opts{e};

@$urls = map { 
	my $p       = $_->path; 
	my( $file ) = basename( $p );
	my( $e )    = $file =~ /\.([^.]+)$/;
	$e ||= '';
	exists $No_extensions->{$e} ? () : $_ 
	} @$urls if defined $opts{E};

@$urls = map { 
	my $p = $_->path; $p =~ m/$Path/ ? $_ : () 
	} @$urls if defined $opts{p};

@$urls = map { 
	my $p = $_->path; $p =~ m/$No_path/ ? () : $_ 
	} @$urls if defined $opts{P};

@$urls = map { 
	my $u = $_->abs; $u =~ m/$Regex/ ? $_ : () 
	} @$urls if defined $opts{r};

@$urls = map { 
	my $u = $_->abs; $u =~ m/$No_regex/ ? () : $_ 
	} @$urls if defined $opts{R};

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Unique

@$urls = do { my %u = map { $_, 1 } @$urls; keys %u } if defined $opts{1};

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Sort

@$urls = sort { $a cmp $b } @$urls if defined $opts{a};

@$urls = sort { $b cmp $a } @$urls if defined $opts{A};


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Sort
$" = "\n";
print "@$urls\n";

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
	
sub get_urls {	
	&extract_from_html;
	}
	
sub extract_from_html {
	my $text = shift;
	
	require HTML::SimpleLinkExtor;
	
	my $extor = HTML::SimpleLinkExtor->new();

	$extor->parse( $$text );	
	
	my @links = $extor->links;
	print "Found " . @links . " links\n" if $Debug;
	
	\@links;
	}
	
sub get_text {
	if( defined $opts{u} ) {
		my $url = URI->new( $opts{u} );
		die "Bad url [$opts{u}]!" unless ref $url;
		read_from_url( $url )
		}
	elsif( defined $opts{t} ) {
		my $file = $opts{t};
		die "Could not read file [$file]!" unless -r $file;
		read_from_text_file( $file );
		}
	elsif( -t STDIN ) {
		read_from_stdin();
		}
	else {
		return;
		}
	}
	
sub read_from_url {
	print "Reading from url\n" if $Either;
	my $url = shift;

	my $data = LWP::Simple::get( $url );
	
	\$data;
	}
	
sub read_from_text {
	print "Reading from file\n" if $Either;
	my $file = shift;
	
	my $data = do { local $/; open my($fh), $file; <$fh> };

	\$data;
	}
	
sub read_from_stdin {
	print "Reading from standard input\n" if $Either;

	my $data = do { local $/; <STDIN> };

	\$data;
	}
	
sub regex {
	my $option = shift;
	
	return unless defined $option;
	
	my $regex = eval { qr/$option/ };
	
	$@ =~ s/at $FindBin::Script line \d+.*//;

	die "$FindBin::Script: $@" if $@;
	
	$regex;
	}
	
sub uncommify {
	my $option = shift;
	
	return {} unless defined $option;
		
	return { map { $_, 1 } split m/,/, $option };
	}
	
sub debug_summary {
	no warnings;
	
	local $" = "\n\t";
	
	print <<"DEBUG";
Version:       $Version
Verbose:       $Verbose
Debug:         $Debug
Ascending:     $opts{a}
Descending:    $opts{A}
Unique:        $opts{1}
Image:         $opts{i}
Image(-):      $opts{I}
Javascript:    $opts{j}
Javascript(-): $opts{j}
Hosts:         $opts{h}
	@{ [ keys %$Hosts ] }
Hosts(-):      $opts{H}
	@{ [ keys %$No_hosts ] }
Path:          $opts{p}
	$Path
Path(-):       $opts{P}
	$No_path
Regex:         $opts{r}
	$Regex
Regex(-):      $opts{R}
	$No_regex
Scheme:        $opts{s}
	@{ [ keys %$Schemes ] }
Scheme(-):     $opts{S}
	@{ [ keys %$No_schemes ] }
DEBUG
	}
