# $Id: SimpleLinkExtor.pm 2225 2007-03-19 05:22:52Z comdog $
package HTML::SimpleLinkExtor;
use strict;

use subs qw();
use vars qw($VERSION @ISA %AUTO_METHODS $AUTOLOAD $DEBUG);

use AutoLoader;
use Carp qw(carp);
use HTML::LinkExtor;
use URI;

$VERSION = 1.17;
#sprintf "%d.%02d", q$Revision: 2225 $ =~ m/ (\d+) \. (\d+) /xg;

$DEBUG   = 0;

@ISA = qw(HTML::LinkExtor);

%AUTO_METHODS = qw(
    background attribute
	href	attribute
	src		attribute

	a		tag
	area	tag
	base    tag
	body    tag
	img		tag
	frame	tag
	iframe  tag

	script	tag
	);

sub new
	{
	my $class = shift;
	my $base  = shift;

	my $self = new HTML::LinkExtor;
	bless $self, $class;

	$self->{'_SimpleLinkExtor_base'} = $base;
	$self->_init_links;
	
	return $self;
	}

sub DESTROY { 1 };

sub add_attributes
	{
	my $self = shift;
	my $attr = lc shift;
	
	$AUTO_METHODS{ $attr } = 'attribute';
	}
	
sub add_tags
	{
	my $self = shift;
	my $tag  = lc shift;
	
	$AUTO_METHODS{ $tag } = 'tag';
	}

sub remove_attributes
	{
	my $self = shift;
	my $attr = lc shift;
	
	delete $AUTO_METHODS{ $attr };
	}
	
sub remove_tags
	{
	my $self = shift;
	my $tag  = lc shift;
	
	delete $AUTO_METHODS{ $tag };
	}

sub attribute_list
	{
	my $self = shift;
	
	grep { $AUTO_METHODS{ $_ } eq 'attribute' } keys %AUTO_METHODS;
	}
	
sub tag_list
	{
	my $self = shift;
	
	grep { $AUTO_METHODS{ $_ } eq 'tag' } keys %AUTO_METHODS;
	}

sub clear_links { $_[0]->_init_links }

sub links
	{
	my $self = shift;

	return map { $$_[2] } $self->_link_refs;
	}

sub frames { ( $_[0]->frame, $_[0]->iframe ) }

sub AUTOLOAD
	{
	my $self = shift;
	my $method = $AUTOLOAD;

	$method =~ s/.*:://;
	print STDERR "AUTOLOAD: method is $method\n" if $DEBUG;

	unless( exists $AUTO_METHODS{$method} )
		{
		carp __PACKAGE__ . ": method $method unknown";
		return;
		}

	print STDERR "AUTOLOAD: calling _extract\n" if $DEBUG;
	$self->_extract( $method );
	}

sub _init_links
	{
	my $self  = shift;
	my $links = shift;
		
	do { 
		delete $self->{'_SimpleLinkExtor_links'};
		return
		} unless UNIVERSAL::isa( $links, 'ARRAY' );
	
	$self->{'_SimpleLinkExtor_links'} = $links;
	
	$self;
	}

sub _link_refs
	{
	my $self = shift;

	my @link_refs;
	# XXX: this is a bad way to do this. I should check if the
	# value is a reference. If I want to reset the links, for
	# instance, I can't just set it to [] because it then goes
	# through this branch. In _init_links I have to use a delete
	# which I really don't like. I don't have time to rewrite this
	# right now though --brian, 20050816
	if( ref $self->{'_SimpleLinkExtor_links'} )
		{
		@link_refs = @{$self->{'_SimpleLinkExtor_links'}};
		}
	else
		{
		@link_refs = $self->SUPER::links();
		$self->_init_links( \@link_refs );
		}

	# defined() so that an empty string means "do not resolve"
	unless( defined $self->{'_SimpleLinkExtor_base'} )
		{
		my $count = -1;
		my $found =  0;
		foreach my $link ( @link_refs )
			{
			$count++;
			next unless $link->[0] eq 'base' and $link->[1] eq 'href';
			$found = 1;
			$self->{'_SimpleLinkExtor_base'} = $link->[-1];
			last;
			}

		#remove the BASE HREF link - Good idea, bad idea?
		#splice @link_refs, $count, 1, () if $found;
		}

	$self->_add_base(\@link_refs) if $self->{'_SimpleLinkExtor_base'};

	print "_link_refs: there are $#link_refs + 1 links\n" if $DEBUG;
	return @link_refs;
	}

sub _extract
	{
	my $self      = shift;
	my $method    = shift;

	my $position  = $AUTO_METHODS{$method} eq 'tag' ? 0 : 1;
	print "_extract: Position is $position\n" if $DEBUG;

	my @links = map  { $$_[2] }
	            grep { $_->[$position] eq $method }
	            $self->_link_refs;

	print "_extract: There are $#links + 1 links\n" if $DEBUG;
	return @links;
	}

sub _add_base
	{
	my $self      = shift;
	my $array_ref = shift;

	my $base      = $self->{'_SimpleLinkExtor_base'};
	next unless $base;
	
	foreach my $tuple ( @$array_ref )
		{
		foreach my $index ( 1 .. $#$tuple )
			{
			next unless exists $AUTO_METHODS{ $tuple->[$index] };
			
			my $url = URI->new( $tuple->[$index + 1] );
			next unless ref $url;
			$tuple->[$index + 1] = $url->abs($base);
			}
		}
	}

sub parse_url
	{
	my $self = shift;
	my $url  = shift;
	
	require LWP::Simple;
	
	my $data = LWP::Simple::get( $url );
	
	return unless defined $data;
	
	$self->parse( $data );
	}
	
1;
__END__
=head1 NAME

HTML::SimpleLinkExtor - Extract links from HTML

=head1 SYNOPSIS

	use HTML::SimpleLinkExtor;

	my $extor = HTML::SimpleLinkExtor->new();
	$extor->parse_file($filename);
	#--or--
	$extor->parse($html);

	$extor->parse_file($other_file); # get more links

	$extor->clear_links; # reset the link list
	
	#extract all of the links
	@all_links   = $extor->links;

	#extract the img links
	@img_srcs    = $extor->img;

	#extract the frame links
	@frame_srcs  = $extor->frame;

	#extract the hrefs
	@area_hrefs  = $extor->area;
	@a_hrefs     = $extor->a;
	@base_hrefs  = $extor->base;
	@hrefs       = $extor->href;

	#extract the body background link
	@body_bg     = $extor->body;
	@background  = $extor->background;

=head1 DESCRIPTION

This is a simple HTML link extractor designed for the person who does
not want to deal with the intricacies of C<HTML::Parser> or the
de-referencing needed to get links out of C<HTML::LinkExtor>.

You can extract all the links or some of the links (based on the HTML
tag name or attribute name). If a E<lt>BASE HREFE<gt> tag is found,
all of the relative URLs will be resolved according to that reference.

This module is simply a subclass around C<HTML::LinkExtor>, so it can
only parse what that module can handle.  Invalid HTML or XHTML may
cause problems.

If you parse multiple files, the link list grows and contains the
aggregate list of links for all of the files parsed. If you want to
reset the link list between files, use the clear_links method.

=head2 Class Methods

=over

=item $extor = HTML::SimpleLinkExtor->new()

Create the link extractor object.

=item $extor = HTML::SimpleLinkExtor->new('')
=item $extor = HTML::SimpleLinkExtor->new($base)

Create the link extractor object and resolve the relative URLs
accoridng to the supplied base URL. The supplied base URL overrides
any other base URL found in the HTML.


Create the link extractor object and do not resolve relative
links.

=item HTML::SimpleLinkExtor->add_tags( TAG [, TAG ] )

C<HTML::SimpleLinkExtor> keeps an internal list of HTML tags (such as
'a' and 'img') that have URLs as values. If you run into another tag
that this module doesn't handle, please send it to me and I'll add it.
Until then you can add that tag to the internal list. This affects
the entire class, including previously created objects.

=item HTML::SimpleLinkExtor->add_attributes( ATTR [, ATTR] )

C<HTML::SimpleLinkExtor> keeps an internal list of HTML tag attributes
(such as 'href' and 'src') that have URLs as values. If you run into
another attribute that this module doesn't handle, please send it to
me and I'll add it. Until then you can add that attribute to the
internal list. This affects the entire class, including previously
created objects.

=item HTML::SimpleLinkExtor->remove_tags( TAG [, TAG ] )

Take tags out of the internal list that C<HTML::SimpleLinkExtor> uses
to extract URLs. This affects the entire class, including previously
created objects.

=item HTML::SimpleLinkExtor->remove_attributes( ATTR [, ATTR] )

Takes attributes out of the internal list that
C<HTML::SimpleLinkExtor> uses to extract URLs. This affects the entire
class, including previously created objects.

=item HTML::SimpleLinkExtor->attribute_list

Returns a list of the attributes C<HTML::SimpleLinkExtor> pays
attention to.

=item HTML::SimpleLinkExtor->tag_list

Returns a list of the tags C<HTML::SimpleLinkExtor> pays attention to.

=back

=head2 Object methods

=over 4

=item $extor->parse_file( $filename )

Parse the file for links.

=item $extor->parse_url( $url )

Fetch URL and parse its content for links.

=item $extor->parse( $data )

Parse the HTML in C<$data>.

=item $extor->clear_links

Clear the link list. This way, you can use the same parser for
another file.

=item $extor->links

Return a list of the links.

=item $extor->img

Return a list of the links from all the SRC attributes of the
IMG.

=item $extor->frame

Return a list of all the links from all the SRC attributes of
the FRAME.

=item $extor->iframe

Return a list of all the links from all the SRC attributes of
the IFRAME.

=item $extor->frames

Returns the combined list from frame and iframe.

=item $extor->src

Return a list of the links from all the SRC attributes of any
tag.

=item $extor->a

Return a list of the links from all the HREF attributes of the
A tags.

=item $extor->area

Return a list of the links from all the HREF attributes of the
AREA tags.

=item $extor->base

Return a list of the links from all the HREF attributes of the
BASE tags.  There should only be one.

=item $extor->href

Return a list of the links from all the HREF attributes of any
tag.

=item $extor->body, $extor->background

Return the link from the BODY tag's BACKGROUND attribute.

=item $extor->script

Return the link from the SCRIPT tag's SRC attribute

=back

=head1 TO DO

This module doesn't handle all of the HTML tags that might
have links.  If someone wants those, I'll add them, or you
can edit %AUTO_METHODS in the source.

=head1 CREDITS

Will Crain who identified a problem with IMG links that had
a USEMAP attribute.

=head1 SOURCE AVAILABILITY

This source is part of a SourceForge project which always has the
latest sources in CVS, as well as all of the previous releases.

	http://sourceforge.net/projects/brian-d-foy/

If, for some reason, I disappear from the world, one of the other
members of the project can shepherd this module appropriately.

=head1 AUTHORS

brian d foy, C<< <bdfoy@cpan.org> >>

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2004-2007 brian d foy.  All rights reserved.

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut

1;
