OS/2 Shareware BBS: 35 Internet

home *** CD-ROM | disk | FTP | other *** search

/ OS/2 Shareware BBS: 35 Internet / 35-Internet.zip / swish131.zip / swishspider.pl < prev next >

Wrap

Perl Script | 1998-10-21 | 2KB | 76 lines

#!/usr/local/bin/perl5.005 use LWP::UserAgent; use LWP::RobotUA; use HTTP::Request; use HTTP::Status; use HTML::Parse; if (scalar(@ARGV) != 2) { print STDERR "Usage: SwishSpider localpath url\n"; exit(1); } my $ua = new LWP::UserAgent; $ua->agent( "SwishSpider" ); $ua->from( "ron\@ckm.ucsf.edu" ); my $localpath = shift; my $url = shift; my $request = new HTTP::Request( "GET", $url ); my $response = $ua->simple_request( $request ); # # Write out important meta-data. This includes the HTTP code. Depending on the # code, we write out other data. Redirects have the location printed, everything # else gets the content-type. # open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" ); print RESP $response->code() . "\n"; if( $response->code() == RC_OK ) { print RESP $response->header( "content-type" ) . "\n"; } elsif( $response->is_redirect() ) { print RESP $response->header( "location" ) . "\n"; } close( RESP ); # # Write out the actual data assuming the retrieval was succesful. Also, if # we have actual data and it's of type text/html, write out all the links it # refers to # if( $response->code() == RC_OK ) { my $contents = $response->content(); open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" ); print CONTENTS $contents; close( CONTENTS ); if( $response->header("content-type") eq "text/html" ) { open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" ); my $html = HTML::Parse::parse_html( $contents ); foreach ( @{$html->extract_links( qw(a) )} ) { my $link = new URI::URL( $$_[0], $url )->abs(); # # Remove fragments # $link =~ s/(.*)#.*/$1/; # # Remove ../ This is important because the abs() function # can leave these in and cause never ending loops. # $link =~ s/\.\.\///g; print LINKS "$link\n"; } close( LINKS ); } }