# $Id: encoding.pm,v 2.8 2009/02/15 17:44:13 dankogai Exp $
package encoding;
our $VERSION = '2.6_01';
use Encode;
use strict;
use warnings;
sub DEBUG () { 0 }
BEGIN {
if ( ord("A") == 193 ) {
require Carp;
Carp::croak("encoding: pragma does not support EBCDIC platforms");
}
}
our $HAS_PERLIO = 0;
eval { require PerlIO::encoding };
unless ($@) {
$HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
}
sub _exception {
my $name = shift;
$] > 5.008 and return 0; # 5.8.1 or higher then no
my %utfs = map { $_ => 1 }
qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
UTF-32 UTF-32BE UTF-32LE);
$utfs{$name} or return 0; # UTFs or no
require Config;
Config->import();
our %Config;
return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no
}
sub in_locale { $^H & ( $locale::hint_bits || 0 ) }
sub _get_locale_encoding {
my $locale_encoding;
# I18N::Langinfo isn't available everywhere
eval {
require I18N::Langinfo;
I18N::Langinfo->import(qw(langinfo CODESET));
$locale_encoding = langinfo( CODESET() );
};
my $country_language;
no warnings 'uninitialized';
if ( (not $locale_encoding) && in_locale() ) {
if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
( $country_language, $locale_encoding ) = ( $1, $2 );
}
elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
( $country_language, $locale_encoding ) = ( $1, $2 );
}
# LANGUAGE affects only LC_MESSAGES only on glibc
}
elsif ( not $locale_encoding ) {
if ( $ENV{LC_ALL} =~ /\butf-?8\b/i
|| $ENV{LANG} =~ /\butf-?8\b/i )
{
$locale_encoding = 'utf8';
}
# Could do more heuristics based on the country and language
# parts of LC_ALL and LANG (the parts before the dot (if any)),
# since we have Locale::Country and Locale::Language available.
# TODO: get a database of Language -> Encoding mappings
# (the Estonian database at http://www.eki.ee/letter/
# would be excellent!) --jhi
}
if ( defined $locale_encoding
&& lc($locale_encoding) eq 'euc'
&& defined $country_language )
{
if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
$locale_encoding = 'euc-jp';
}
elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
$locale_encoding = 'euc-kr';
}
elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
$locale_encoding = 'euc-cn';
}
elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
$locale_encoding = 'euc-tw';
}
else {
require Carp;
Carp::croak(
"encoding: Locale encoding '$locale_encoding' too ambiguous"
);
}
}
return $locale_encoding;
}
sub import {
my $class = shift;
my $name = shift;
if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm
my $caller = caller();
{
no strict 'refs';
*{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
}
return;
}
$name = _get_locale_encoding() if $name eq ':locale';
my %arg = @_;
$name = $ENV{PERL_ENCODING} unless defined $name;
my $enc = find_encoding($name);
unless ( defined $enc ) {
require Carp;
Carp::croak("encoding: Unknown encoding '$name'");
}
$name = $enc->name; # canonize
unless ( $arg{Filter} ) {
DEBUG and warn "_exception($name) = ", _exception($name);
_exception($name) or ${^ENCODING} = $enc;
$HAS_PERLIO or return 1;
}
else {
defined( ${^ENCODING} ) and undef ${^ENCODING};
# implicitly 'use utf8'
require utf8; # to fetch $utf8::hint_bits;
$^H |= $utf8::hint_bits;
eval {
require Filter::Util::Call;
Filter::Util::Call->import;
filter_add(
sub {
my $status = filter_read();
if ( $status > 0 ) {
$_ = $enc->decode( $_, 1 );
DEBUG and warn $_;
}
$status;
}
);
};
$@ eq '' and DEBUG and warn "Filter installed";
}
defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
for my $h (qw(STDIN STDOUT)) {
if ( $arg{$h} ) {
unless ( defined find_encoding( $arg{$h} ) ) {
require Carp;
Carp::croak(
"encoding: Unknown encoding for $h, '$arg{$h}'");
}
eval { binmode( $h, ":raw :encoding($arg{$h})" ) };
}
else {
unless ( exists $arg{$h} ) {
eval {
no warnings 'uninitialized';
binmode( $h, ":raw :encoding($name)" );
};
}
}
if ($@) {
require Carp;
Carp::croak($@);
}
}
return 1; # I doubt if we need it, though
}
sub unimport {
no warnings;
undef ${^ENCODING};
if ($HAS_PERLIO) {
binmode( STDIN, ":raw" );
binmode( STDOUT, ":raw" );
}
else {
binmode(STDIN);
binmode(STDOUT);
}
if ( $INC{"Filter/Util/Call.pm"} ) {
eval { filter_del() };
}
}
1;
__END__
=pod
=head1 NAME
encoding - allows you to write your script in non-ascii or non-utf8
=head1 SYNOPSIS
use encoding "greek"; # Perl like Greek to you?
use encoding "euc-jp"; # Jperl!
# or you can even do this if your shell supports your native encoding
perl -Mencoding=latin2 -e'...' # Feeling centrally European?
perl -Mencoding=euc-kr -e'...' # Or Korean?
# more control
# A simple euc-cn => utf-8 converter
use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
# "no encoding;" supported (but not scoped!)
no encoding;
# an alternate way, Filter
use encoding "euc-jp", Filter=>1;
# now you can use kanji identifiers -- in euc-jp!
# switch on locale -
# note that this probably means that unless you have a complete control
# over the environments the application is ever going to be run, you should
# NOT use the feature of encoding pragma allowing you to write your script
# in any recognized encoding because changing locale settings will wreck
# the script; you can of course still use the other features of the pragma.
use encoding ':locale';
=head1 ABSTRACT
Let's start with a bit of history: Perl 5.6.0 introduced Unicode
support. You could apply C and regexes even to complex CJK
characters -- so long as the script was written in UTF-8. But back
then, text editors that supported UTF-8 were still rare and many users
instead chose to write scripts in legacy encodings, giving up a whole
new feature of Perl 5.6.
Rewind to the future: starting from perl 5.8.0 with the B
pragma, you can write your script in any encoding you like (so long
as the C module supports it) and still enjoy Unicode support.
This pragma achieves that by doing the following:
=over
=item *
Internally converts all literals (C) from
the encoding specified to utf8. In Perl 5.8.1 and later, literals in
C
and C pseudo-filehandle are also converted.
=item *
Changing PerlIO layers of C and C to the encoding
specified.
=back
=head2 Literal Conversions
You can write code in EUC-JP as follows:
my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
#<-char-><-char-> # 4 octets
s/\bCamel\b/$Rakuda/;
And with C