#!/usr/sco/bin/perl
#!/usr/local/bin/perl
#!/usr/local/bin/proxyperl
#
# w3get - point it at a http: url and it recursively retrieves href's
# and img src's starting from that page
#
# Version 0.1 by Brooks Cutter (bcutter@paradyne.com) 2/5/94
#
# Usage: w3get [-d] [-v] url [url ...]
#
# where each url is fully qualified (like http://host/file.html),
# for example the Mosaic What's New page:
# http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/Docs/whats-new.html
#
# -d prints debugging information.
# -v is verbose (prints a message for each url it descends)
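#
# Example:
#   w3get -v http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/Docs/whats-new.html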
#
#
# I wrote this program a month ago in preparation for a presentation
# on Mosaic and the World Wide Web. I had a Sun there and wanted to
# display parts of the web without using a slow PPP connection.
# I haven't done anything with it since then (except today, to document
# it and clean it up), so I don't intend to develop it further until
# a need arises. Feel free to hack this up and pass it around.
# (and please pass me a copy...)
#
#
# If you are at an AT&T site behind the proxy gateway, you will need
# my version of proxyperl. Email me for more info, then set $att_proxy below to 1.
$att_proxy = 0;
# Set $pdn to 1 to use the Paradyne automounter perl library paths (see @INC below).
$pdn = 0;
# This string is prepended to the rewritten url's.
# It could also be a 'file://...' or 'ftp://...', etc...
# It is just a marker, so the pages can be post-processed later;
# the original host/path is kept inside the rewritten url.
$redirect = 'HOPS_LOCAL';
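# For example, with $redirect = 'HOPS_LOCAL' the &localize routine turns
#   <a href="http://host/dir/page.html">
# into
#   <a href="HOPS_LOCAL/host/dir/page.html">
# while relative and non-http links keep their original target.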
# directory where I can write my output to
$outdir = "$ENV{'HOME'}/tmp/html_get";
mkdir($outdir, 0755) unless(-d $outdir);
die "$0: \n(like http://www.cis.ohio-state.edu:80/hypertext/faq/usenet/tcl-faq/part1/faq.html)\n" unless (@ARGV);
unshift(@INC,"/pdn/appl/perl/lib","/pdn/appl/perl/lib/sys") if ($pdn);
require 'url.pl';
require 'getopts.pl';
&Getopts('dv');
$'ipcdebug = 1 if (($opt_d) && ($att_proxy));
#$version = "HTTP 1.0";
push(@todo, @ARGV);
FOREVER: while (1) {
#last unless(@todo);
print '@todo = ',scalar @todo," ($todo[0])\n" if ($opt_d);
unless(@todo) {
last unless(@remote);
@todo = @remote; @remote = ();
}
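# (@todo holds links on the same host as the page they came from,
# @remote holds everything else; @remote is only drained once @todo
# is empty, so same-host pages are fetched before straying off-site.)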
$node_url = shift(@todo);
$seen{$node_url} = 1; # So I only descend each url once...
print "Checking url $node_url\n" if ($opt_v);
unless ($node_url =~ m!^http://!) {
warn "Argument must be fully qualified url (ie: http://host/file.html):\n$node_url\n";
next;
}
# If a page has already been pulled down, it shouldn't be retrieved
# again - but it still needs to be opened and its hyperlinks parsed
# (and retrieved if necessary). Right now everything is pulled down
# whether we already have it or not.
#
# ($url_fn,$url_dir) = &url2fndir($url);
# next if (-e "$outdir/$url_fn");
#
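# A minimal sketch of that idea (commented out and untested; it assumes
# &url2fndir below and would need the &url'get call underneath to be
# guarded with "unless ($page)"):
# ($url_fn,$url_dir) = &url2fndir($node_url);
# if ($url_fn && -e "$outdir/$url_fn") {
#     open(CACHE, "<$outdir/$url_fn"); $page = join('', <CACHE>); close(CACHE);
# }
#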
if ($page = &url'get($node_url,$version)) {
if ($node_url =~ /html/i) {
$page = &url'abs($node_url,$page);
# This should be combined into the one above, but it was a quick kludge
# (like this program)
$page = &url'img_abs($node_url,$page);
}
} else { warn "$!\n"; next; }
$node_host = '';
if ($node_url =~ m|^http://([^/]+)/?.*$|) {
$node_host = $1;
}
# I should really get the type from HTTP/1.0 headers...
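# One rough alternative (just a sketch - url.pl isn't assumed to expose
# the response headers here) is to sniff the body instead of the suffix,
# so pages served without a .html name still get parsed:
#   $looks_html = ($node_url =~ /html$/i) || ($page =~ /<(html|head|title)\b/i);
# The simple suffix test below is what is actually used.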
if ($node_url =~ /html$/i) {
@links = &parse_html($page);
@http = &extract_http(@links);
for (@http) {
s/#.*//; # Strip the #fragment (skip-to) part
next if ($seen{$_});
next if (/htbin/); # skip htbin scripts
next if (/cgi.*bin/); # skip cgi-bin scripts
next if (/\?/); # Skip urls with query arguments...
#next unless (/paradyne.com/); # If you don't want to stray from a domain
if (($node_host) && (m!^http://$node_host!)) {
# This link is on the same host as the page - do local ones first
push(@todo, $_);
} else {
push(@remote, $_);
}
$seen{$_} = 1;
}
@links2 = &localize(@links); # Should use pointers
&save_url($node_url, @links2);
} else {
&save_url($node_url, $page);
}
next;
}
exit;
sub save_url {
local($url) = shift(@_);
local($url_fn, $url_dir) = &url2fndir($url);
return unless($url_fn);
if ($url_dir) {
if ((-e "$outdir/$url_dir") && (!-d "$outdir/$url_dir")) {
# The url was previously referenced as http://host/directory and was
# saved as a plain file; now a directory index has been generated, so
# move the old file aside and reinstate it as .../index.html.
system("mv $outdir/$url_dir $outdir/$url_dir.index");
system("mkdir -p $outdir/$url_dir");
system("mv $outdir/$url_dir.index $outdir/$url_dir/index.html");
} elsif (!-e "$outdir/$url_dir") { system("mkdir -p $outdir/$url_dir"); }
}
print STDERR "Writing $url to $url_fn\n";
if (-e "$outdir/$url_fn") {
print STDERR "--->>> HEY, $url_fn already exists!\n";
return;
}
unless (open(OUT, ">$outdir/$url_fn")) {
warn "$0: can't write $outdir/$url_fn: $!\n";
return;
}
print OUT @_;
close(OUT);
}
sub url2fndir {
local($url) = shift(@_);
return($cache_url_fn{$url},$cache_url_dir{$url})
if (($cache_url_fn{$url}) && ($cache_url_dir{$url}));
local($url_fn,$url_dir);
if ($url =~ m!^http://(.+)$!) {
$url_fn = $1;
$url_fn =~ tr/~/_/; # turn ~ (user dirs) into _ so the path is filesystem-safe
@url_dir = split(/\//, $url_fn); pop(@url_dir);
$url_dir = join('/',@url_dir);
$cache_url_fn{$url} = $url_fn;
$cache_url_dir{$url} = $url_dir;
return($url_fn,$url_dir);
}
return('');
}
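# For example, &url2fndir('http://host/dir/page.html') returns
# ('host/dir/page.html', 'host/dir') - the file and the directory
# (both relative to $outdir) that the page is saved under.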
sub extract_http {
local($url);
local($_,@return);
for (@_) {
if ((/^<(a\s+href)=([^>]+)>$/i) || (/^<(img\s+.*src)=([^>]+)>$/i)) {
$cmd = $1;
$url = $2;
$url =~ tr/'"//d; # Delete quotes
if (($url =~ /^http:/) || ($url =~ m!^[/a-zA-Z0-9]!)) {
push(@return, $url);
}
next;
}
}
return(@return);
}
sub localize {
local($_,@return);
local(@r);
for (@_) {
unless ((/^<(a\s+href)=([^>]+)>$/i) || (/^<(img\s+.*src)=([^>]+)>$/i)) {
push(@r, $_); # not a href/img tag - pass it through untouched
next;
}
$cmd = $1;
$url = $2;
$url =~ tr/'"//d; # Delete quotes
#print "localize found url $url\n";
if ($url =~ m!^http://(.+)$!) {
push(@r, "<$cmd=\"$redirect/$1\">");
} else {
push(@r, "<$cmd=\"$url\">");
}
}
return(@r);
}
sub parse_html {
local(@data) = ();
local($save, $_, $lt, $gt, $data);
NEXTLINE: for (split(/\n/,$_[0])) {
$save .= $_;
# Wait until the buffer holds at least one complete <...> tag
if ((($lt = index($save,'<')) == -1) || (index($save,'>',$lt) == -1))
{ next; }
while (($lt = index($save, '<')) >= $[) {
# This is the data *BEFORE* the '<'
if ($lt) { # only when the buffer doesn't start with '<'
$data = substr($save, 0, $lt);
push(@data, $data);
$save = substr($save, $lt);
}
$gt = index($save, '>');
if ($gt == -1) {
# The tag continues on the next input line - keep it and read more
next NEXTLINE;
}
# This is the data *INSIDE* the <>
$data = substr($save, 0, ($gt+1));
push(@data, $data);
$save = substr($save, ($gt+1));
}
}
push(@data, $save) if (length($save));
return(@data);
}
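# For example, &parse_html("<title>Hi</title> there\n") returns
# ('<title>', 'Hi', '</title>', ' there') - tags and the text between
# them as separate list elements, which extract_http/localize then scan.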
# EOF