# HTML::PullParser is a subclass of HTML::Parser (see @ISA below).
package HTML::PullParser;
# Load the parent class at run time; nothing is imported from it.
require HTML::Parser;
# These package globals are assigned before "use strict" takes effect,
# which is why they carry no "our" declaration.
@ISA=qw(HTML::Parser);
$VERSION = "3.57";
use strict;
# Empty import list: Carp's functions are always called fully qualified
# (Carp::croak) in this file.
use Carp ();
# Constructor.
#
#   $p = HTML::PullParser->new(file => $file, %options)
#   $p = HTML::PullParser->new(doc  => $doc,  %options)
#
# Recognised options:
#   start, end, text, declaration, comment, process, default
#       Argspec for the corresponding event.  At least one must be given.
#   file    A file name, a glob, or any reference (used as a handle as-is).
#   doc     The document text, or a reference to it.
# Every remaining option is passed to the parent constructor together with
# api_version => 3.
#
# Croaks when no event argspec is given, when both 'doc' and 'file' are
# given, or when neither is.  Returns the new object, or nothing (undef in
# scalar context) when a named file cannot be opened.
sub new {
    my ($class, %cnf) = @_;

    # Move the per-event argspecs out of the option hash.
    my %argspec;
    foreach my $event (qw(start end text declaration comment process default)) {
        my $spec = delete $cnf{$event};
        $argspec{$event} = $spec if defined $spec;
    }
    if (!%argspec) {
        Carp::croak("Info not collected for any events");
    }

    # Exactly one input source has to be supplied.
    my $file      = delete $cnf{file};
    my $doc       = delete $cnf{doc};
    my $have_file = defined $file;
    my $have_doc  = defined $doc;
    if ($have_file && $have_doc) {
        Carp::croak("Can't parse from both 'doc' and 'file' at the same time");
    }
    if (!$have_file && !$have_doc) {
        Carp::croak("No 'doc' or 'file' given to parse from");
    }

    # Create object
    $cnf{api_version} = 3;
    my $self = $class->SUPER::new(%cnf);

    # All requested events share one accumulator array; get_token() and
    # unget_token() work on that same array.
    my $accum = [];
    $self->{pullparser_accum} = $accum;
    foreach my $event (keys %argspec) {
        $self->SUPER::handler($event => $accum, $argspec{$event});
    }

    if ($have_doc) {
        # Always keep a reference to the text, plus the read offset.
        $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
        $self->{pullparser_str_pos} = 0;
        return $self;
    }

    # Anything that is neither a reference nor a glob is taken to be a
    # file name and opened for reading.
    my $is_handle = ref($file) || ref(\$file) eq "GLOB";
    if (!$is_handle) {
        require IO::File;
        $file = IO::File->new($file, "r") or return;
    }
    $self->{pullparser_file} = $file;

    return $self;
}
# Override of the inherited handler() method.  The constructor installs
# its handlers through SUPER::handler; calling handler() on a pull parser
# always croaks, whatever the arguments.
sub handler {
    my $message = "Can't set handlers for HTML::PullParser";
    Carp::croak($message);
}
# Return the next token from the accumulator array set up in new(), or
# undef once the input is exhausted and no tokens remain.
#
# While the accumulator is empty and end of input has not been reached,
# more input is fed to parse() in chunks: from the file handle if one is
# attached, otherwise from the referenced string.  When a source runs dry,
# eof() is called, the pullparser_eof flag is set and the source is
# dropped from the object.
#
# Croaks if more input is needed but the object has neither a file handle
# nor a string to read from.
sub get_token
{
    my $self = shift;

    # Amount of input handed to parse() per iteration, for both sources.
    my $chunk_size = 512;

    while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
        if (my $f = $self->{pullparser_file}) {
            # must try to parse more from the file
            my $buf;
            if (read($f, $buf, $chunk_size)) {
                $self->parse($buf);
            }
            else {
                # read() gave 0 (end of file) or undef (error); both are
                # treated as end of input.
                $self->eof;
                $self->{pullparser_eof}++;
                delete $self->{pullparser_file};
            }
        }
        elsif (my $sref = $self->{pullparser_str_ref}) {
            # must try to parse more from the scalar
            my $pos = $self->{pullparser_str_pos};
            my $chunk = substr($$sref, $pos, $chunk_size);
            $self->parse($chunk);
            $pos += length($chunk);
            if ($pos < length($$sref)) {
                $self->{pullparser_str_pos} = $pos;
            }
            else {
                # Whole string consumed: finish parsing and forget it.
                $self->eof;
                $self->{pullparser_eof}++;
                delete $self->{pullparser_str_ref};
                delete $self->{pullparser_str_pos};
            }
        }
        else {
            # Not at EOF, yet there is nothing to read from.  Say so
            # instead of dying with a bare "Died at ...".
            Carp::croak("HTML::PullParser object has no 'doc' or 'file' input to parse from");
        }
    }

    return shift @{$self->{pullparser_accum}};
}
# Push the given tokens back onto the front of the accumulator array, in
# the order supplied, so the first one is what get_token() returns next.
# Returns the parser object itself.
sub unget_token {
    my ($self, @tokens) = @_;
    unshift @{ $self->{pullparser_accum} }, @tokens;
    return $self;
}
1;
__END__
=head1 NAME

HTML::PullParser - Alternative HTML::Parser interface


=head1 SYNOPSIS

 use HTML::PullParser;

 $p = HTML::PullParser->new(file => "index.html",
                            start => 'event, tagname, @attr',
                            end   => 'event, tagname',
                            ignore_elements => [qw(script style)],
                           ) || die "Can't open: $!";
 while (my $token = $p->get_token) {
     #...do something with $token
 }


=head1 DESCRIPTION

HTML::PullParser is an alternative interface to the HTML::Parser class.
It basically turns HTML::Parser inside out.  You associate a file
(or any IO::Handle object or string) with the parser at construction time and
then repeatedly call C<< $parser->get_token >> to obtain the tags and text
found in the parsed document.

The following methods are provided:

=over 4
=item $p = HTML::PullParser->new( file => $file, %options )
=item $p = HTML::PullParser->new( doc => \$doc, %options )
A C