#!/usr/bin/perl -w
eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
if 0; # not running under some shell
=head1 NAME
lwp-rget - Retrieve web documents recursively
=head1 SYNOPSIS
lwp-rget [--verbose] [--auth=USER:PASS] [--depth=N] [--hier] [--iis]
[--keepext=mime/type[,mime/type]] [--limit=N] [--nospace]
[--prefix=URL] [--referer=URL] [--sleep=N] [--tolower] links
if ($type ne 'img' and $url->as_string !~ /^\Q$PREFIX/o) {
print STDERR "*outsider*\n" if $VERBOSE;
return $url->as_string;
}
# Translate URL to lowercase if $TOLOWER defined
$plain_url = to_lower($plain_url) if (defined $TOLOWER);
# If we already have it, then there is nothing to be done
my $seen = $seen{$plain_url->as_string};
if ($seen) {
my $frag = $url->frag;
$seen .= "#$frag" if defined($frag);
$seen = protect_frag_spaces($seen);
print STDERR "$seen (again)\n" if $VERBOSE;
return $seen;
}
# Too much or too deep
if ($depth > $MAX_DEPTH and $type ne 'img') {
print STDERR "*too deep*\n" if $VERBOSE;
return $url;
}
if ($no_docs > $MAX_DOCS) {
print STDERR "*too many*\n" if $VERBOSE;
return $url;
}
# Fetch document
$no_docs++;
sleep($SLEEP) if $SLEEP;
my $req = HTTP::Request->new(GET => $url);
# See: http://ftp.sunet.se/pub/NT/mirror-microsoft/kb/Q163/7/74.TXT
$req->header ('Accept', '*/*') if (defined $IIS); # GIF/JPG from IIS 2.0
$req->authorization_basic(split (/:/, $AUTH)) if (defined $AUTH);
if ($referer && !$SUPPRESS_REFERER) {
if ($req->uri->scheme eq 'http') {
# RFC 2616, section 15.1.3
$referer = url($referer) unless ref($referer);
undef $referer if ($referer->scheme || '') eq 'https';
}
$req->referer($referer) if $referer;
}
my $res = $ua->request($req);
# Check outcome
if ($res->is_success) {
my $doc = $res->content;
my $ct = $res->content_type;
my $name = find_name($res->request->uri, $ct);
print STDERR "$name\n" unless $QUIET;
$seen{$plain_url->as_string} = $name;
# If the file is HTML, then we look for internal links
if ($ct eq "text/html") {
# Save an unprosessed version of the HTML document. This
# both reserves the name used, and it also ensures that we
# don't loose everything if this program is killed before
# we finish.
save($name, $doc);
my $base = $res->base;
# Follow and substitute links...
$doc =~
s/
(
<(img|a|body|area|frame|td)\b # some interesting tag
[^>]+ # still inside tag (not strictly correct)
\b(?:src|href|background) # some link attribute
\s*=\s* # =
)
(?: # scope of OR-ing
(")([^"]*)" | # value in double quotes OR
(')([^']*)' | # value in single quotes OR
([^\s>]+) # quoteless value
)
/
new_link($1, lc($2), $3||$5, HTML::Entities::decode($4||$6||$7),
$base, $name, "$url", $depth+1)
/giex;
# XXX
# The regular expression above is not strictly correct.
# It is not really possible to parse HTML with a single
# regular expression, but it is faster. Tags that might
# confuse us include:
#
#
#
}
save($name, $doc);
return $name;
}
else {
print STDERR $res->code . " " . $res->message . "\n" if $VERBOSE;
$seen{$plain_url->as_string} = $url->as_string;
return $url->as_string;
}
}
sub new_link
{
my($pre, $type, $quote, $url, $base, $localbase, $referer, $depth) = @_;
$url = protect_frag_spaces($url);
$url = fetch(url($url, $base)->abs, $type, $referer, $depth);
$url = url("file:$url", "file:$localbase")->rel
unless $url =~ /^[.+\-\w]+:/;
$url = unprotect_frag_spaces($url);
return $pre . $quote . $url . $quote;
}
sub protect_frag_spaces
{
my ($url) = @_;
$url = $url->as_string if (ref($url));
if ($url =~ m/^([^#]*#)(.+)$/)
{
my ($base, $frag) = ($1, $2);
$frag =~ s/ /%20/g;
$url = $base . $frag;
}
return $url;
}
sub unprotect_frag_spaces
{
my ($url) = @_;
$url = $url->as_string if (ref($url));
if ($url =~ m/^([^#]*#)(.+)$/)
{
my ($base, $frag) = ($1, $2);
$frag =~ s/%20/ /g;
$url = $base . $frag;
}
return $url;
}
sub fix_backslashes
{
my ($url) = @_;
my ($base, $frag);
$url = $url->as_string if (ref($url));
if ($url =~ m/([^#]+)(#.*)/)
{
($base, $frag) = ($1, $2);
}
else
{
$base = $url;
$frag = "";
}
$base =~ tr/\\/\//;
$base =~ s/%5[cC]/\//g; # URL-encoded back slash is %5C
return $base . $frag;
}
sub to_lower
{
my ($url) = @_;
my $was_object = 0;
if (ref($url))
{
$url = $url->as_string;
$was_object = 1;
}
if ($url =~ m/([^#]+)(#.*)/)
{
$url = lc($1) . $2;
}
else
{
$url = lc($url);
}
if ($was_object == 1)
{
return url($url);
}
else
{
return $url;
}
}
sub translate_spaces
{
my ($url) = @_;
my ($base, $frag);
$url = $url->as_string if (ref($url));
if ($url =~ m/([^#]+)(#.*)/)
{
($base, $frag) = ($1, $2);
}
else
{
$base = $url;
$frag = "";
}
$base =~ s/^ *//; # Remove initial spaces from base
$base =~ s/ *$//; # Remove trailing spaces from base
$base =~ tr/ /_/;
$base =~ s/%20/_/g; # URL-encoded space is %20
return $base . $frag;
}
sub mkdirp
{
my($directory, $mode) = @_;
my @dirs = split(/\//, $directory);
my $path = shift(@dirs); # build it as we go
my $result = 1; # assume it will work
unless (-d $path) {
$result &&= mkdir($path, $mode);
}
foreach (@dirs) {
$path .= "/$_";
if ( ! -d $path) {
$result &&= mkdir($path, $mode);
}
}
return $result;
}
sub find_name
{
my($url, $type) = @_;
#print "find_name($url, $type)\n";
# Translate spaces in URL to underscores (_) if $NOSPACE defined
$url = translate_spaces($url) if (defined $NOSPACE);
# Translate URL to lowercase if $TOLOWER defined
$url = to_lower($url) if (defined $TOLOWER);
$url = url($url) unless ref($url);
my $path = $url->path;
# trim path until only the basename is left
$path =~ s|(.*/)||;
my $dirname = ".$1";
if (!$HIER) {
$dirname = "";
}
elsif (! -d $dirname) {
mkdirp($dirname, 0775);
}
my $extra = ""; # something to make the name unique
my $suffix;
if ($KEEPEXT{lc($type)}) {
$suffix = ($path =~ m/\.(.*)/) ? $1 : "";
}
else {
$suffix = media_suffix($type);
}
$path =~ s|\..*||; # trim suffix
$path = "index" unless length $path;
while (1) {
# Construct a new file name
my $file = $dirname . $path . $extra;
$file .= ".$suffix" if $suffix;
# Check if it is unique
return $file unless -f $file;
# Try something extra
unless ($extra) {
$extra = "001";
next;
}
$extra++;
}
}
sub save
{
my $name = shift;
#print "save($name,...)\n";
open(FILE, ">$name") || die "Can't save $name: $!";
binmode FILE;
print FILE $_[0];
close(FILE);
}
sub usage
{
print <<""; exit 1;
Usage: $progname [options]