#!/usr/sco/bin/perl
#!/usr/local/bin/perl
#!/usr/local/bin/proxyperl
#
# w3get - point it at a http: url and it recursively retrieves href's
# and img src's starting from that page
#
# Version 0.1 by Brooks Cutter (bcutter@paradyne.com) 2/5/94
#
# Usage: w3get [-d] [-v] <fully qualified url>
#
# where fully qualified url is like http://host/file.html
# like the Mosaic What's new page:
# http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/Docs/whats-new.html
#
# -d prints debugging information.
# -v is verbose (prints a message for each url it descends)
#
#
# I wrote this program a month ago in preparation for a presentation
# on Mosaic and the World Wide Web.  I had a Sun there and wanted to
# display parts of the web without using a slow PPP connection.
# I haven't done anything with it since then (except today to document
# it and clean it up) so I don't intend to develop it further until
# a need arises.  Feel free to hack this up and pass it around.
# (and pass me a copy please...)
#
#
# If you are an AT&T site behind the proxy gateway, you will need
# my version of proxyperl.  Email me for more info, and then set below to 1.
$att_proxy = 0;

# Uses Paradyne Automounter setup..
$pdn = 0;

# This string is prepended to the rewritten urls.
# It could also be a 'file://...' or 'ftp://...', etc...
# It marks rewritten links so they can be post-processed later, since the
# original URL (host/path) is left on the page after the marker.
$redirect = 'HOPS_LOCAL';

# directory where I can write my output to
$outdir = "$ENV{'HOME'}/tmp/html_get";
mkdir($outdir, 0755) unless(-d $outdir);

die "$0: <fully qualified url>\n(like http://www.cis.ohio-state.edu:80/hypertext/faq/usenet/tcl-faq/part1/faq.html)\n"
    unless (@ARGV);

unshift(@INC,"/pdn/appl/perl/lib","/pdn/appl/perl/lib/sys") if ($pdn);

require 'url.pl';
require 'getopts.pl';

&Getopts('dv');
$'ipcdebug = 1 if (($opt_d) && ($att_proxy));
#$version = "HTTP 1.0";

# @todo holds urls on the current host, @remote holds urls that point
# off-host; local urls are fetched first.
push(@todo, @ARGV);

FOREVER:
while (1) {
    #last unless(@todo);
    print '@todo = ', scalar(@todo), " ($todo[0])\n" if ($opt_d);
    unless (@todo) {
        last unless (@remote);
        @todo   = @remote;
        @remote = ();
    }
    $node_url = shift(@todo);
    $seen{$node_url} = 1;       # So I only descend each url once...
    print "Checking url $node_url\n" if ($opt_v);
    unless ($node_url =~ m!^http://!) {
        warn "Argument must be fully qualified url (ie: http://host/file.html):\n$node_url\n";
        next;
    }

    # If it's already pulled the page down, it shouldn't retrieve it
    # again - but it needs to open it, parse the hyperlinks and then
    # retrieve those if necessary.  Right now it pulls everything down
    # whether it has it or not.
    #
    # ($url_fn,$url_dir) = &url2fndir($url);
    # next if (-e "$outdir/$url_fn");
    #
    if ($page = &url'get($node_url,$version)) {
        if ($node_url =~ /html/i) {
            $page = &url'abs($node_url,$page);
            # This should be combined into the one above, but it was a
            # quick kludge (like this program)
            $page = &url'img_abs($node_url,$page);
        }
    } else {
        warn "$!\n";
        next;
    }

    $node_host = '';
    if ($node_url =~ m|^http://([^/]+)/?.*$|) {
        $node_host = $1;
    }

    # I should really get the type from HTTP/1.0 headers...
    if ($node_url =~ /html$/i) {
        @links = &parse_html($page);
        @http  = &extract_http(@links);
        for (@http) {
            s/#.*//;                    # Delete skipto marks
            next if ($seen{$_});
            next if (/htbin/);          # skip htbin
            next if (/cgi.*bin/);       # skip cgi-bin
            next if (/\?/);             # Skip argument urls...
            #next unless (/paradyne.com/);  # If you don't want to stray from a domain
            if (($node_host) && (m!^http://$node_host!)) {
                # Do local ones first
                push(@todo, $_);
            } else {
                push(@remote, $_);
            }
            $seen{$_} = 1;
        }
        @links2 = &localize(@links);    # Should use pointers
        &save_url($node_url, @links2);
    } else {
        &save_url($node_url, $page);
    }
    next;
}
exit;

# save_url - write the (possibly localized) page under $outdir, creating
# intermediate directories as needed.
sub save_url {
    local($url) = shift(@_);
    local($url_fn, $url_dir) = &url2fndir($url);
    return unless ($url_fn);
    if ($url_dir) {
        if ((-e "$outdir/$url_dir") && (!-d "$outdir/$url_dir")) {
            # url was previously referenced like:
            # http://host/directory - and thought it was a file when a
            # directory index was generated.  So move it to index.html...
            system("mv $outdir/$url_dir $outdir/$url_dir.index");
            system("mkdir -p $outdir/$url_dir");
            system("mv $outdir/$url_dir.index $outdir/$url_dir/index.html");
        } elsif (!-e "$outdir/$url_dir") {
            system("mkdir -p $outdir/$url_dir");
        }
    }
    print STDERR "Writing $url to $url_fn\n";
    if (-e "$outdir/$url_fn") {
        print STDERR "--->>> HEY, $url_fn already exists!\n";
        return;
    }
    unless (open(OUT, ">$outdir/$url_fn")) {
        warn "Can't write $outdir/$url_fn: $!\n";
        return;
    }
    print OUT @_;
    close(OUT);
}

# url2fndir - map an http url to a relative output filename and the
# directory that file lives in (results are cached).
sub url2fndir {
    local($url) = shift(@_);
    return($cache_url_fn{$url},$cache_url_dir{$url})
        if (($cache_url_fn{$url}) && ($cache_url_dir{$url}));
    local($url_fn,$url_dir);
    if ($url =~ m!^http://(.+)$!) {
        $url_fn = $1;
        $url_fn =~ tr/~/_/;             # turn '~user' paths into plain filenames
        @url_dir = split(/\//, $url_fn);
        pop(@url_dir);
        $url_dir = join('/',@url_dir);
        $cache_url_fn{$url}  = $url_fn;
        $cache_url_dir{$url} = $url_dir;
        return($url_fn,$url_dir);
    }
    return('');
}

# extract_http - pull the href/src urls out of the <a ...> and <img ...>
# tags returned by parse_html.
sub extract_http {
    local($url);
    local($_,@return);
    for (@_) {
        next unless ((/^<(a\s+.*href)=([^>]+)>$/i) ||
                     (/^<(img\s+.*src)=([^>]+)>$/i));
        $cmd = $1;
        $url = $2;
        $url =~ tr/'"//d;               # Delete quotes
        if (($url =~ /^http:/) || ($url =~ m!^[/a-zA-Z0-9]!)) {
            push(@return, $url);
        }
    }
    return(@return);
}

# localize - rewrite href/src urls so the saved page points at the local
# copies ($redirect is prepended); everything else is passed through.
sub localize {
    local($_,@return);
    local(@r);
    for (@_) {
        unless ((/^<(a\s+.*href)=([^>]+)>$/i) ||
                (/^<(img\s+.*src)=([^>]+)>$/i)) {
            push(@r, $_);               # not a link - pass it through as is
            next;
        }
        $cmd = $1;
        $url = $2;
        $url =~ tr/'"//d;               # Delete quotes
        #print "localize found url $url\n";
        if ($url =~ m!^http://(.+)$!) {
            push(@r, "<$cmd=\"$redirect/$1\">");
        } else {
            push(@r, "<$cmd=\"$url\">");
        }
    }
    return(@r);
}

# parse_html - split raw html into a list of plain-text chunks and
# complete '<...>' tags (one element per tag).
sub parse_html {
    local(@data) = ();
    local($save, $_, $lt, $gt, $data);
    NEXTLINE:
    for (split(/\r/,$_[0])) {
        $save .= $_;
        # Wait until at least one complete '<...>' pair is buffered
        if ((($lt = index($save,'<')) == -1) ||
            (index($save,'>',$lt) == -1)) {
            next;
        }
        while (($lt = index($save, '<')) >= $[) {
            # This is the data *BEFORE* the '<'
            if ($lt) {
                push(@data, substr($save, 0, $lt));
            }
            $gt = index($save, '>', $lt);
            if ($gt == -1) {
                # Tag isn't finished yet - keep it buffered for the next line
                $save = substr($save, $lt);
                next NEXTLINE;
            }
            # This is the data *INSIDE* the <>
            $data = substr($save, $lt, ($gt-$lt+1));
            push(@data, $data);
            $save = substr($save, ($gt+1));
        }
    }
    push(@data, $save);
    return(@data);
}
# EOF
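
# A quick worked example of the output layout (the host and path here are
# hypothetical, not from the original documentation):
#
#   w3get -v http://host.example.com/docs/page.html
#
# url2fndir() maps that url to
#   $url_fn  = "host.example.com/docs/page.html"
#   $url_dir = "host.example.com/docs"
# so the page is written to $HOME/tmp/html_get/host.example.com/docs/page.html,
# and localize() rewrites its href/src urls to "HOPS_LOCAL/host/path" form so
# they can be post-processed into local references later.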