#
# Copyright (C) 1999 Ken MacLeod
# XML::Parser::PerlSAX is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# $Id: PerlSAX.pm,v 1.7 1999/12/22 21:15:00 kmacleod Exp $
#
use strict;
package XML::Parser::PerlSAX;
use XML::Parser;
use UNIVERSAL;
use vars qw{ $VERSION $name_re };
# will be substituted by make-rel script
$VERSION = "0.08";
# FIXME I doubt this is a correct Perl RE for productions [4] and
# [5] in the XML 1.0 specification, especially considering Unicode chars
$name_re = '[A-Za-z_:][A-Za-z0-9._:-]*';
sub new {
my $type = shift;
my $self = (@_ == 1) ? shift : { @_ };
return bless $self, $type;
}
sub parse {
my $self = shift;
die "XML::Parser::PerlSAX: parser instance ($self) already parsing\n"
if (defined $self->{ParseOptions});
# If there's one arg and it has no ref, it's a string
my $args;
if (scalar (@_) == 1 && !ref($_[0])) {
$args = { Source => { String => shift } };
} else {
$args = (scalar (@_) == 1) ? shift : { @_ };
}
my $parse_options = { %$self, %$args };
$self->{ParseOptions} = $parse_options;
# ensure that we have at least one source
if (!defined $parse_options->{Source}
|| !(defined $parse_options->{Source}{String}
|| defined $parse_options->{Source}{ByteStream}
|| defined $parse_options->{Source}{SystemId})) {
die "XML::Parser::PerlSAX: no source defined for parse\n";
}
# assign default Handler to any undefined handlers
if (defined $parse_options->{Handler}) {
$parse_options->{DocumentHandler} = $parse_options->{Handler}
if (!defined $parse_options->{DocumentHandler});
$parse_options->{DTDHandler} = $parse_options->{Handler}
if (!defined $parse_options->{DTDHandler});
$parse_options->{EntityResolver} = $parse_options->{Handler}
if (!defined $parse_options->{EntityResolver});
}
my @handlers;
if (defined $parse_options->{DocumentHandler}) {
# cache DocumentHandler in self for callbacks
$self->{DocumentHandler} = $parse_options->{DocumentHandler};
my $doc_h = $parse_options->{DocumentHandler};
push (@handlers, Init => sub { $self->_handle_init(@_) } )
if (UNIVERSAL::can($doc_h, 'start_document'));
push (@handlers, Final => sub { $self->_handle_final(@_) } )
if (UNIVERSAL::can($doc_h, 'end_document'));
push (@handlers, Start => sub { $self->_handle_start(@_) } )
if (UNIVERSAL::can($doc_h, 'start_element'));
push (@handlers, End => sub { $self->_handle_end(@_) } )
if (UNIVERSAL::can($doc_h, 'end_element'));
push (@handlers, Char => sub { $self->_handle_char(@_) } )
if (UNIVERSAL::can($doc_h, 'characters'));
push (@handlers, Proc => sub { $self->_handle_proc(@_) } )
if (UNIVERSAL::can($doc_h, 'processing_instruction'));
push (@handlers, Comment => sub { $self->_handle_comment(@_) } )
if (UNIVERSAL::can($doc_h, 'comment'));
push (@handlers, CdataStart => sub { $self->_handle_cdatastart(@_) } )
if (UNIVERSAL::can($doc_h, 'start_cdata'));
push (@handlers, CdataEnd => sub { $self->_handle_cdataend(@_) } )
if (UNIVERSAL::can($doc_h, 'end_cdata'));
if (UNIVERSAL::can($doc_h, 'entity_reference')) {
push (@handlers, Default => sub { $self->_handle_default(@_) } );
$self->{UseEntRefs} = 1;
}
}
if (defined $parse_options->{DTDHandler}) {
# cache DTDHandler in self for callbacks
$self->{DTDHandler} = $parse_options->{DTDHandler};
my $dtd_h = $parse_options->{DTDHandler};
push (@handlers, Notation => sub { $self->_handle_notation(@_) } )
if (UNIVERSAL::can($dtd_h, 'notation_decl'));
push (@handlers, Unparsed => sub { $self->_handle_unparsed(@_) } )
if (UNIVERSAL::can($dtd_h, 'unparsed_entity_decl'));
push (@handlers, Entity => sub { $self->_handle_entity(@_) } )
if ($self->{UseEntRefs}
|| UNIVERSAL::can($dtd_h, 'entity_decl'));
push (@handlers, Element => sub { $self->_handle_element(@_) } )
if (UNIVERSAL::can($dtd_h, 'element_decl'));
push (@handlers, Attlist => sub { $self->_handle_attlist(@_) } )
if (UNIVERSAL::can($dtd_h, 'attlist_decl'));
push (@handlers, Doctype => sub { $self->_handle_doctype(@_) } )
if (UNIVERSAL::can($dtd_h, 'doctype_decl'));
push (@handlers, XMLDecl => sub { $self->_handle_xmldecl(@_) } )
if (UNIVERSAL::can($dtd_h, 'xml_decl'));
}
if (defined $parse_options->{EntityResolver}) {
# cache EntityResolver in self for callbacks
$self->{EntityResolver} = $parse_options->{EntityResolver};
my $er = $parse_options->{EntityResolver};
push (@handlers, ExternEnt => sub { $self->_handle_extern_ent(@_) } )
if (UNIVERSAL::can($er, 'resolve_entity'));
}
my @xml_parser_options;
if ($self->{UseEntRefs}) {
@xml_parser_options = ( NoExpand => 1,
Handlers => { @handlers } );
} else {
@xml_parser_options = ( Handlers => { @handlers } );
}
push (@xml_parser_options,
ProtocolEncoding => $self->{ParseOptions}{Source}{Encoding})
if (defined $self->{ParseOptions}{Source}{Encoding});
my $parser = new XML::Parser(@xml_parser_options);
my $result;
if (defined $self->{ParseOptions}{Source}{ByteStream}) {
$result = $parser->parse($self->{ParseOptions}{Source}{ByteStream});
} elsif (defined $self->{ParseOptions}{Source}{String}) {
$result = $parser->parse($self->{ParseOptions}{Source}{String});
} elsif (defined $self->{ParseOptions}{Source}{SystemId}) {
$result = $parser->parsefile($self->{ParseOptions}{Source}{SystemId});
}
# clean up parser instance
delete $self->{ParseOptions};
delete $self->{DocumentHandler};
delete $self->{DTDHandler};
delete $self->{EntityResolver};
delete $self->{Expat};
return $result;
}
sub location {
my $self = shift;
my $expat = $self->{Expat};
my @properties = ( ColumnNumber => $expat->current_column,
LineNumber => $expat->current_line,
BytePosition => $expat->current_byte,
Base => $expat->base );
# FIXME these locations change while parsing external entities
push (@properties, PublicId => $self->{Source}{PublicId})
if (defined $self->{Source}{PublicId});
push (@properties, SystemId => $self->{Source}{SystemId})
if (defined $self->{Source}{SystemId});
return { @properties };
}
###
### DocumentHandler methods
###
sub _handle_init {
my $self = shift;
my $expat = shift;
$self->{Expat} = $expat;
if ($self->{DocumentHandler}->can('set_document_locator')) {
$self->{DocumentHandler}->set_document_locator( { Locator => $self } );
}
$self->{DocumentHandler}->start_document( { } );
}
sub _handle_final {
my $self = shift;
delete $self->{UseEntRefs};
delete $self->{EntRefs};
return $self->{DocumentHandler}->end_document( { } );
}
sub _handle_start {
my $self = shift;
my $expat = shift;
my $element = shift;
my @properties;
if ($self->{ParseOptions}{UseAttributeOrder}) {
# Capture order and defined() status for attributes
my $ii;
my $order = [];
for ($ii = 0; $ii < $#_; $ii += 2) {
push @$order, $_[$ii];
}
push @properties, 'AttributeOrder', $order;
# Divide by two because XML::Parser counts both attribute name
# and value within it's index
push @properties, 'Defaulted', ($expat->specified_attr() / 2);
}
$self->{DocumentHandler}->start_element( { Name => $element,
Attributes => { @_ },
@properties } );
}
sub _handle_end {
my $self = shift;
my $expat = shift;
my $element = shift;
$self->{DocumentHandler}->end_element( { Name => $element } );
}
sub _handle_char {
my $self = shift;
my $expat = shift;
my $string = shift;
$self->{DocumentHandler}->characters( { Data => $string } );
}
sub _handle_proc {
my $self = shift;
my $expat = shift;
my $target = shift;
my $data = shift;
$self->{DocumentHandler}->processing_instruction( { Target => $target,
Data => $data } );
}
sub _handle_comment {
my $self = shift;
my $expat = shift;
my $data = shift;
$self->{DocumentHandler}->comment( { Data => $data } );
}
sub _handle_cdatastart {
my $self = shift;
my $expat = shift;
$self->{DocumentHandler}->start_cdata( { } );
}
sub _handle_cdataend {
my $self = shift;
my $expat = shift;
$self->{DocumentHandler}->end_cdata( { } );
}
# Default receives all characters that aren't handled by some other
# handler, this means a lot of stuff goes through here. All we're
# looking for are `&NAME;' entity reference sequences
sub _handle_default {
my $self = shift;
my $expat = shift;
my $string = shift;
if ($string =~ /^&($name_re);$/) {
my $ent_ref = $self->{EntRefs}{$1};
if (!defined $ent_ref) {
$ent_ref = { Name => $1 };
}
$self->{DocumentHandler}->entity_reference($ent_ref);
}
}
###
### DTDHandler methods
###
sub _handle_notation {
my $self = shift;
my $expat = shift;
my $notation = shift;
my $base = shift;
my $sysid = shift;
my $pubid = shift;
my @properties = (Name => $notation);
push (@properties, Base => $base)
if (defined $base);
push (@properties, SystemId => $sysid)
if (defined $sysid);
push (@properties, PublicId => $pubid)
if (defined $pubid);
$self->{DTDHandler}->notation_decl( { @properties } );
}
sub _handle_unparsed {
my $self = shift;
my $expat = shift;
my $entity = shift;
my $base = shift;
my $sysid = shift;
my $pubid = shift;
my @properties = (Name => $entity, SystemId => $sysid);
push (@properties, Base => $base)
if (defined $base);
push (@properties, PublicId => $pubid)
if (defined $pubid);
$self->{DTDHandler}->unparsed_entity_decl( { @properties } );
}
sub _handle_entity {
my $self = shift;
my $expat = shift;
my $name = shift;
my $val = shift;
my $sysid = shift;
my $pubid = shift;
my $ndata = shift;
my @properties = (Name => $name);
push (@properties, Value => $val)
if (defined $val);
push (@properties, PublicId => $pubid)
if (defined $pubid);
push (@properties, SystemId => $sysid)
if (defined $sysid);
push (@properties, Notation => $ndata)
if (defined $ndata);
my $properties = { @properties };
if ($self->{UseEntRefs}) {
$self->{EntRefs}{$name} = $properties;
}
if ($self->{DTDHandler}->can('entity_decl')) {
$self->{DTDHandler}->entity_decl( $properties );
}
}
sub _handle_element {
my $self = shift;
my $expat = shift;
my $name = shift;
my $model = shift;
$self->{DTDHandler}->element_decl( { Name => $name,
Model => $model } );
}
sub _handle_attlist {
my $self = shift;
my $expat = shift;
my $elname = shift;
my $attname = shift;
my $type = shift;
my $default = shift;
my $fixed = shift;
$self->{DTDHandler}->attlist_decl( { ElementName => $elname,
AttributeName => $attname,
Type => $type,
Default => $default,
Fixed => $fixed } );
}
sub _handle_doctype {
my $self = shift;
my $expat = shift;
my $name = shift;
my $sysid = shift;
my $pubid = shift;
my $internal = shift;
my @properties = (Name => $name);
push (@properties, SystemId => $sysid)
if (defined $sysid);
push (@properties, PublicId => $pubid)
if (defined $pubid);
push (@properties, Internal => $internal)
if (defined $internal);
$self->{DTDHandler}->doctype_decl( { @properties } );
}
sub _handle_xmldecl {
my $self = shift;
my $expat = shift;
my $version = shift;
my $encoding = shift;
my $standalone = shift;
my @properties = (Version => $version);
push (@properties, Encoding => $encoding)
if (defined $encoding);
push (@properties, Standalone => $standalone)
if (defined $standalone);
$self->{DTDHandler}->xml_decl( { @properties } );
}
###
### EntityResolver methods
###
sub _handle_extern_ent {
my $self = shift;
my $expat = shift;
my $base = shift;
my $sysid = shift;
my $pubid = shift;
my @properties = (SystemId => $sysid);
push (@properties, Base => $base)
if (defined $base);
push (@properties, PublicId => $pubid)
if (defined $pubid);
my $result = $self->{EntityResolver}->resolve_entity( { @properties } );
if (UNIVERSAL::isa($result, 'HASH')) {
if ($result->{ByteStream}) {
return $result->{ByteStream};
} elsif ($result->{String}) {
return $result->{String};
} elsif ($result->{SystemId}) {
# FIXME must be able to resolve SystemIds, XML::Parser's
# default can :-(
die "PerlSAX: automatic opening of SystemIds from \`resolve_entity' not implemented, contact the author\n";
} else {
# FIXME
die "PerlSAX: invalid source returned from \`resolve_entity'\n";
}
}
return undef;
}
1;
__END__
=head1 NAME
XML::Parser::PerlSAX - Perl SAX parser using XML::Parser
=head1 SYNOPSIS
use XML::Parser::PerlSAX;
$parser = XML::Parser::PerlSAX->new( [OPTIONS] );
$result = $parser->parse( [OPTIONS] );
$result = $parser->parse($string);
=head1 DESCRIPTION
C