#!/usr/bin/perl -w
eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
if 0; # not running under some shell
# $Id: /xmltwig/trunk/tools/xml_grep/xml_grep 27 2007-08-30T08:07:25.079327Z mrodrigu $
use strict;
use Getopt::Long;
use Pod::Usage;
use XML::Twig;
my $VERSION="0.7";
# options (all used globally in the script)
my( $help, $man, @roots, @paths, $files, $count, $nb_results, $encoding, @exclude,
$wrap, $nowrap, $descr, $group, $pretty_print, $version, $text_only, $date,
$html, $verbose, $strict
);
# used to check if the wrapping tags need to be output
my $results = 0;
my $file_results = 0;
# first process the case where the user provides only
# an xpath expression and a list of files
if( @ARGV && ($ARGV[0] !~ m{^-}) )
{ splice( @ARGV, 0, 0, '--group_by_file', 'file', '--pretty_print', 'indented', '--cond'); }
GetOptions( 'help' => \$help,
'man' => \$man,
'Version' => \$version,
'exclude|v=s' => \@exclude,
'root=s' => \@roots,
'cond=s' => \@paths,
'files' => \$files,
'count' => \$count,
'nb_results=i' => \$nb_results,
'encoding=s' => \$encoding,
'wrap:s' => \$wrap,
'nowrap' => \$nowrap,
'descr:s' => \$descr,
'group_by_file:s' => \$group,
'pretty_print:s' => \$pretty_print,
'text_only' => \$text_only,
'date!' => \$date,
'strict' => \$strict,
'html' => \$html,
'verbose' => \$verbose,
) or pod2usage(2);
pod2usage(1) if $help;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;
if( $version) { warn "$0 version $VERSION\n"; exit; }
binmode STDOUT, ':utf8';
# case where options are given, but no root or path, assume the
# first arg is a path
if( !@roots and !@paths and !@exclude and @ARGV) { @paths= shift @ARGV; }
unless( @roots or @paths or @exclude or $files) { pod2usage(1); exit; }
if( ($files or $count) and !@paths) { pod2usage(1); exit; }
if( ($files or $count) and (@roots or $encoding or defined( $wrap)
or defined( $group) or defined( $pretty_print)))
{ pod2usage(1); exit; }
if( $files and !@ARGV) { pod2usage(1); exit; }
if( !$files and !$count and @paths and !@roots) { @roots= @paths; @paths=(); }
$date=1 unless( defined $date);
# defaults for optional arguments to options
$group = 'file' if( defined $group and !$group);
$pretty_print = 'indented' if( defined $pretty_print and !$pretty_print);
if( $nowrap) { $wrap=''; } elsif( !defined( $wrap) and (@roots or @paths)) { $wrap= 'xml_grep'; }
if( !defined( $descr) and (@roots or @paths))
{ if( $date)
{ $date= localtime();
$descr = qq{version="$VERSION" date="$date"}
}
else
{ $descr = qq{version="$VERSION"}; }
}
# some globals
my $current_file;
my $count_file = 0;
my $count_total = 0;
# will be used to create the twig
my %options;
if( $count)
{ my $twig_roots={};
my $twig_root= sub { $count_file++; $_[0]->purge; };
foreach my $path (@paths)
{ $twig_roots->{$path}= $twig_root; }
$options{twig_roots}= $twig_roots;
}
elsif( @exclude)
{ # general options
$nowrap=1;
# twig options
$options{twig_print_outside_roots} = 1;
my $root_handlers={};
foreach my $exclude (@exclude)
{ $root_handlers->{$exclude}= sub { }; }
$options{twig_roots}= $root_handlers;
}
else
{ create_regular_handlers( \%options, \@roots, \@paths);
}
$options{pretty_print} = $pretty_print if( $pretty_print);
$options{output_encoding} = $encoding if( $encoding);
my $t= create_twig( %options);
if( @ARGV)
{ foreach my $file (@ARGV)
{ $current_file= $file;
if( $verbose) { warn "parsing '$file'\n"; }
my $ok= $html && ($current_file=~ m{^(http|ftp|file)://}) ? $t->safe_parseurl_html( $file)
: ($current_file=~ m{^(http|ftp|file)://}) ? $t->safe_parseurl( $file)
: $html ? $t->safe_parsefile_html( $file)
: $t->safe_parsefile( $file);
if( !$ok)
{ if( $@ =~ m{XMLGREP: FOUND})
{ # in files mode
print $current_file, "\n";
$nb_results--;
exit unless( $nb_results);
}
elsif( $@ =~ m{^XMLGREP: NB_RESULT_REACHED})
{ print file_result_end() if( $group && $file_results);
print result_end() if( $results);
exit;
}
else
{ $@ ||= 'unknown cause';
if( $strict) { die $@; }
warn $@;
if( !$count) { print "\n"; }
}
}
if( $count)
{ print "$current_file: $count_file\n";
$count_total += $count_file;
$count_file=0;
}
elsif( @roots) { print file_result_end() if( $file_results); }
elsif( $count) { print "$count_total matches\n"; }
}
if( $count) { print "total: $count_total\n"; }
print result_end() if( $results);
}
else
{ $file_results=0;
my $ok= $t->safe_parse( \*STDIN);
if( !$ok and ( $@ !~ m{^XMLGREP: NB_RESULT_REACHED}))
{ if( !$strict) { warn $@; } else { die $@; } }
if( $count) { print "$count_total matches\n"; }
else { print result_end(); }
}
sub create_regular_handlers
{ my( $options, $roots, $paths)= @_;
if( @$roots)
{ my $root_handlers={};
my $root_handler= twig_roots_handler( @$paths);
foreach my $root (@$roots)
{ $root_handlers->{$root}= $root_handler; }
$options->{twig_roots}= $root_handlers;
}
if( @$paths)
{ my $twig_handlers={};
my $twig_handler= twig_handlers();
foreach my $path (@$paths)
{ $twig_handlers->{$path}= $twig_handler; }
$options->{twig_handlers}= $twig_handlers;
}
}
sub create_twig
{ my( %options)= @_;
my $twig;
eval { $twig= XML::Twig->new( %options) };
if( $@)
{ # see if we are in the case where the only condition uses string() or regexp
if( ($@=~ m{^(regexp|string\(\)) condition not supported on twig_roots option})
&& $options{twig_roots} && !$options{twig_handlers}
&& ( keys %{$options{twig_roots}} == 1)
)
{ # in this case add the proper twig_roots option
my $cond= (keys %{$options{twig_roots}})[0];
(my $root= $cond)=~ s{\[[^\]]*\]$}{};
#warn "cond: '$cond' - root: '$root'\n";
delete $options{twig_roots};
delete $options{twig_handlers};
@paths= ($cond);
@roots= ($root);
create_regular_handlers( \%options, \@roots, \@paths);
return create_twig( %options);
}
elsif( $@=~ m{^wrong condition: unrecognized expression in handler: '(.*?)'})
{ die "error in filter condition '$1'\n"; }
else
{ die "error: $@"; }
}
return $twig;
}
sub twig_roots_handler
{ my( @paths)= @_;
return sub
{ my( $t, $root)= @_;
if( !@paths or $_->att( '#print'))
{ print result_start() unless( $results);
print file_result_start() unless( !$group or $file_results);
if( $text_only)
{ print $root->text, "\n"; }
else
{ $root->print; }
$nb_results--;
unless( $nb_results) { $@= "XMLGREP: NB_RESULT_REACHED"; die; }
}
$t->purge;
1;
};
}
sub twig_handlers
{ if( $files)
{ return sub { $@="XMLGREP: FOUND"; die; }; }
else
{ return sub { my( $t, $hit)= @_;
foreach my $elt ( $hit->ancestors_or_self)
{ $elt->set_att( '#print' => 1); }
1;
};
}
}
sub result_start
{ $results=1;
return if( $text_only);
my $enc_decl= $encoding ? qq{encoding="$encoding" } : '';
return $wrap ? qq{\n<$wrap $descr>\n}
: '';
}
sub result_end
{ my $result;
return if( $text_only);
if( !$group) { $result= "\n"; }
$result .= qq{$wrap>\n} if( $wrap);
return $result;
}
sub file_result_start
{ $file_results=1;
return if( $text_only);
my $result;
$result= qq{<$group filename="$current_file">};
if( !$pretty_print)
{ $result.= "\n"; }
return $result;
}
sub file_result_end
{ $file_results=0;
return '' if( $text_only);
return qq{\n$group>\n};
}
__END__
=head1 NAME
xml_grep - grep XML files looking for specific elements
=head1 SYNOPSYS
xml_grep [options]