#!/usr/bin/perl
eval 'exec /usr/bin/perl -S $0 ${1+"$@"}'
if $running_under_some_shell;
#!./perl
BEGIN {
# @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
# with $ENV{PERL_CORE} set
# In case we need it in future...
require Config; import Config;
}
use strict;
use warnings;
use Getopt::Std;
use Config;
my @orig_ARGV = @ARGV;
our $VERSION = do { my @r = (q$Revision: 2.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# These may get re-ordered.
# RAW is a do_now as inserted by &enter
# AGG is an aggreagated do_now, as built up by &process
use constant {
RAW_NEXT => 0,
RAW_IN_LEN => 1,
RAW_OUT_BYTES => 2,
RAW_FALLBACK => 3,
AGG_MIN_IN => 0,
AGG_MAX_IN => 1,
AGG_OUT_BYTES => 2,
AGG_NEXT => 3,
AGG_IN_LEN => 4,
AGG_OUT_LEN => 5,
AGG_FALLBACK => 6,
};
# (See the algorithm in encengine.c - we're building structures for it)
# There are two sorts of structures.
# "do_now" (an array, two variants of what needs storing) is whatever we need
# to do now we've read an input byte.
# It's housed in a "do_next" (which is how we got to it), and in turn points
# to a "do_next" which contains all the "do_now"s for the next input byte.
# There will be a "do_next" which is the start state.
# For a single byte encoding it's the only "do_next" - each "do_now" points
# back to it, and each "do_now" will cause bytes. There is no state.
# For a multi-byte encoding where all characters in the input are the same
# length, then there will be a tree of "do_now"->"do_next"->"do_now"
# branching out from the start state, one step for each input byte.
# The leaf "do_now"s will all be at the same distance from the start state,
# only the leaf "do_now"s cause output bytes, and they in turn point back to
# the start state.
# For an encoding where there are varaible length input byte sequences, you
# will encounter a leaf "do_now" sooner for the shorter input sequences, but
# as before the leaves will point back to the start state.
# The system will cope with escape encodings (imagine them as a mostly
# self-contained tree for each escape state, and cross links between trees
# at the state-switching characters) but so far no input format defines these.
# The system will also cope with having output "leaves" in the middle of
# the bifurcating branches, not just at the extremities, but again no
# input format does this yet.
# There are two variants of the "do_now" structure. The first, smaller variant
# is generated by &enter as the input file is read. There is one structure
# for each input byte. Say we are mapping a single byte encoding to a
# single byte encoding, with "ABCD" going "abcd". There will be
# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
# &process then walks the tree, building aggregate "do_now" structres for
# adjacent bytes where possible. The aggregate is for a contiguous range of
# bytes which each produce the same length of output, each move to the
# same next state, and each have the same fallback flag.
# So our 4 RAW "do_now"s above become replaced by a single structure
# containing:
# ["A", "D", "abcd", 1, ...]
# ie, for an input byte $_ in "A".."D", output 1 byte, found as
# substr ("abcd", (ord $_ - ord "A") * 1, 1)
# which maps very nicely into pointer arithmetic in C for encengine.c
sub encode_U
{
# UTF-8 encode long hand - only covers part of perl's range
## my $uv = shift;
# chr() works in native space so convert value from table
# into that space before using chr().
my $ch = chr(utf8::unicode_to_native($_[0]));
# Now get core perl to encode that the way it likes.
utf8::encode($ch);
return $ch;
}
sub encode_S
{
# encode single byte
## my ($ch,$page) = @_; return chr($ch);
return chr $_[0];
}
sub encode_D
{
# encode double byte MS byte first
## my ($ch,$page) = @_; return chr($page).chr($ch);
return chr ($_[1]) . chr $_[0];
}
sub encode_M
{
# encode Multi-byte - single for 0..255 otherwise double
## my ($ch,$page) = @_;
## return &encode_D if $page;
## return &encode_S;
return chr ($_[1]) . chr $_[0] if $_[1];
return chr $_[0];
}
my %encode_types = (U => \&encode_U,
S => \&encode_S,
D => \&encode_D,
M => \&encode_M,
);
# Win32 does not expand globs on command line
eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
my %opt;
# I think these are:
# -Q to disable the duplicate codepoint test
# -S make mapping errors fatal
# -q to remove comments written to output files
# -O to enable the (brute force) substring optimiser
# -o