#!/usr/bin/perl -w
eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
if 0; # not running under some shell
use strict;
use XML::Twig;
use Getopt::Long;
use Pod::Usage;
use File::Temp qw{tempfile};
# Defaults applied when the corresponding option is missing or empty.
my $DEFAULT_SC = 'aspell -c';   # the temp file name gets appended to this command
my $DEFAULT_PP = 'indented';
my $DEFAULT_EXT= '.bak';
my $VERSION="0.02";

my ( $spellchecker, $ext, $attributes, $exclude_elements,
     $include_elements, $pretty_print, $version, $help, $man);

# Multi-word options are accepted with either '_' or '-' as the separator:
# the error message below (and common usage) spells them with dashes, while
# the original option names use underscores. The first name listed is the
# primary one, the others are aliases.
# NOTE(review): --attributes is parsed but not used anywhere in this script.
GetOptions( 'spellchecker=s'                       => \$spellchecker,
            'backup-extension|backup_extension=s'  => \$ext,
            'attributes'                           => \$attributes,
            'exclude_elements|exclude-elements=s'  => \$exclude_elements,
            'include_elements|include-elements=s'  => \$include_elements,
            'pretty_print|pretty-print:s'          => \$pretty_print,
            'version'                              => \$version,
            'help'                                 => \$help,
            'man'                                  => \$man,
          ) or pod2usage(-verbose => 1, -exitval => -1);

pod2usage( -verbose => 1, -exitval => 0) if $help;
pod2usage( -verbose => 2, -exitval => 0) if $man;
if( $version) { print "$0 version $VERSION\n"; exit;}

# option processing
$spellchecker ||= $DEFAULT_SC;
$ext          ||= $DEFAULT_EXT;

if( $exclude_elements && $include_elements)
  { die "cannot use both --exclude-elements and --include-elements\n"; }

# the pretty_print value is optional: when the option is given without
# a (true) value, fall back to the default style
if( defined $pretty_print and !$pretty_print)
  { $pretty_print= $DEFAULT_PP; }

# options passed to every XML::Twig object created in the main loop
my %twig_options;

# The element lists are whitespace-separated. split ' ' (unlike split /\s+/)
# ignores leading whitespace, so no handler gets registered for an empty name.
# Each listed element gets a start tag handler that marks it, the mark is
# looked up later from the text nodes through inherit_att.
if( $exclude_elements)
  { my @exclude_elts = split ' ', $exclude_elements;
    my %start_tag_handlers= map { ( $_ => \&exclude_elt) } @exclude_elts;
    $twig_options{start_tag_handlers}= \%start_tag_handlers;
  }

if( $include_elements)
  { my @include_elts = split ' ', $include_elements;
    my %start_tag_handlers= map { ( $_ => \&include_elt) } @include_elts;
    $twig_options{start_tag_handlers}= \%start_tag_handlers;
  }

$twig_options{pretty_print}= $pretty_print if( $pretty_print);
# Spellcheck, in place, each file given on the command line:
#   1. parse the file and write every selected text node to a temp file,
#      one "id:text" line per node (see process_text),
#   2. run the spellchecker command on the temp file,
#   3. read the temp file back and store each line's text in its node,
#   4. rename the original file to "$file$ext" and write the updated
#      document under the original name.
# Dies on the first failure; in that case the temp file is left in place.
foreach my $file (@ARGV)
  {
    my $id= 0;
    my $id2elt= {};   # id => '#SC' wrapper element created by process_text

    my $t= XML::Twig->new( keep_encoding => 1, %twig_options,);
    $t->parsefile( $file);

    # created only once the file has parsed, so a parse error
    # does not leave an empty temp file behind
    my( $tmp_fh, $tmp_file)= tempfile( "xml_spellcheck_XXXX",
                                       SUFFIX => '.txt'
                                     );

    foreach my $elt ($t->descendants( '#TEXT'))
      {
        # without include/exclude lists all the text is checked, otherwise
        # only text within an included (or outside of any excluded) element
        if(    (!$include_elements and !$exclude_elements)
            or ($include_elements and $elt->inherit_att( '#include'))
            or ($exclude_elements and !$elt->inherit_att( '#exclude'))
          )
          { $id++;
            process_text( $t, $elt, $id, $id2elt, $tmp_fh);
          }
      }

    # write errors on a buffered handle may only show up at close time
    close( $tmp_fh) or die "cannot write temp file $tmp_file: $!";

    # the command goes through the shell on purpose: $spellchecker
    # includes its own arguments (as in the default 'aspell -c')
    system( "$spellchecker $tmp_file") == 0
      or die "$spellchecker $tmp_file failed: $?";

    open( $tmp_fh, '<', $tmp_file) or die "cannot open temp file $tmp_file: $!";
    while( my $line= <$tmp_fh>)
      { chomp $line;
        my( $line_id, $text)= split /:/, $line, 2;

        # delete, so that a duplicated line is caught too
        # instead of reaching an already erased wrapper
        my $wrap= defined $line_id ? delete $id2elt->{$line_id} : undef;
        unless( $wrap and defined $text)
          { die "unexpected line $. in temp file $tmp_file: '$line'\n"; }

        $text=~ s{<\\n>}{\n}g;   # restore the newlines encoded by process_text

        my $text_elt= $wrap->first_child or die "internal error 100\n";
        if( $text_elt->gi eq '#PCDATA')
          { $text_elt->set_pcdata( $text); }
        elsif( $text_elt->gi eq '#CDATA')
          { $text_elt->set_cdata( $text); }
        else
          { die "internal error 101\n"; }

        $wrap->erase;
      }
    close $tmp_fh;

    # tempfile() called in list context does not remove the file by itself
    unlink( $tmp_file) or warn "cannot remove temp file $tmp_file: $!\n";

    rename( $file, "$file$ext") or die "cannot save backup file $file$ext: $!";

    # 3 argument open: the file name comes from the command line and must
    # never be interpreted as part of the open mode
    open( my $out_fh, '>', $file) or die "cannot save spell checked file $file: $!";
    $t->print( $out_fh);
    close( $out_fh) or die "cannot save spell checked file $file: $!";
  }
# Handler registered in start_tag_handlers for the elements listed in
# the include_elements option: sets the private '#include' attribute on
# the element found in $_ (presumably the element being opened, as set
# by XML::Twig when calling handlers - confirm against the XML::Twig docs).
# Returns whatever set_att returns.
sub include_elt
  { my $elt= $_;
    return $elt->set_att( '#include', 1);
  }
# Handler registered in start_tag_handlers for the elements listed in
# the exclude_elements option: sets the private '#exclude' attribute on
# the element found in $_ (presumably the element being opened, as set
# by XML::Twig when calling handlers - confirm against the XML::Twig docs).
# Returns whatever set_att returns.
sub exclude_elt
  { my $elt= $_;
    return $elt->set_att( '#exclude', 1);
  }
# Prepare one text element for spellchecking.
#   $t       the twig (not used here)
#   $elt     the text element to check
#   $id      the id under which the text is written
#   $id2elt  hash ref, receives $id => the new '#SC' wrapper around $elt
#   $tmp_fh  handle of the file passed to the spellchecker
# The text is written as a single "id:text" line: each newline in the
# text is replaced by the 4 characters <\n> so the line stays in one piece.
# Returns the result of the print.
sub process_text
  { my( $t, $elt, $id, $id2elt, $tmp_fh)= @_;

    # the wrapper is how the element is found back once the text is checked
    $id2elt->{$id}= $elt->wrap_in( '#SC');

    ( my $one_line= $elt->text)=~ s{\n}{<\\n>}g;

    return print {$tmp_fh} $id . ':' . $one_line . "\n";
  }
__END__
=head1 NAME
xml_spellcheck - spellcheck XML files
=head1 SYNOPSIS
xml_spellcheck [options] <files>