home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 35 Internet
/
35-Internet.zip
/
swish131.zip
/
swishspider.pl
< prev
next >
Wrap
Perl Script
|
1998-10-21
|
2KB
|
76 lines
#!/usr/local/bin/perl5.005
use LWP::UserAgent;
use LWP::RobotUA;
use HTTP::Request;
use HTTP::Status;
use HTML::Parse;
use URI::URL;
#
# Command line: exactly two arguments are required. The first is the path
# prefix used for every output file, the second is the URL to retrieve.
#
unless( @ARGV == 2 ) {
    print STDERR "Usage: SwishSpider localpath url\n";
    exit(1);
}
my $localpath = shift( @ARGV );
my $url = shift( @ARGV );

#
# Set up the user agent, announcing the spider's name and a contact address
# to the servers it talks to.
#
my $ua = LWP::UserAgent->new();
$ua->agent( "SwishSpider" );
$ua->from( "ron\@ckm.ucsf.edu" );

#
# Issue one GET for the URL. simple_request() is used (rather than request())
# so that a redirect comes back as a response of its own; the code further
# down records the redirect target instead of following it.
#
my $request = HTTP::Request->new( "GET", $url );
my $response = $ua->simple_request( $request );
#
# Write out important meta-data to "$localpath.response". The first line is
# always the HTTP status code. A second line is written in two cases only:
# a 200 (OK) response gets its content-type header, and a redirect gets its
# location header. Any other status produces the code line alone.
#
# $localpath comes straight from the command line, so the file is opened with
# the three-argument form of open: the mode is fixed to '>' and no character
# in the path (a leading '>' or '&', for instance) can be read as part of it.
#
my $response_file = "$localpath.response";
open( my $resp_fh, '>', $response_file )
    or die( "Could not open response file $response_file: $!\n" );

print {$resp_fh} $response->code() . "\n";
if( $response->code() == RC_OK ) {
    # A server may omit the header; still emit the (empty) second line.
    my $content_type = $response->header( "content-type" );
    $content_type = "" unless defined $content_type;
    print {$resp_fh} "$content_type\n";
} elsif( $response->is_redirect() ) {
    my $location = $response->header( "location" );
    $location = "" unless defined $location;
    print {$resp_fh} "$location\n";
}

# Buffered write errors (disk full, etc.) only surface at close time.
close( $resp_fh )
    or die( "Could not write response file $response_file: $!\n" );
#
# Write out the actual data assuming the retrieval was successful (HTTP 200).
# The response body goes to "$localpath.contents". Also, if the document's
# media type is text/html, every link found in its <a> elements is written to
# "$localpath.links", one absolute URL per line.
#
if( $response->code() == RC_OK ) {
    my $contents = $response->content();

    my $contents_file = "$localpath.contents";
    open( my $contents_fh, '>', $contents_file )
        or die( "Could not open contents file $contents_file: $!\n" );
    # The body is not necessarily text; binmode stops platforms that
    # translate line endings from altering the bytes on the way to disk.
    binmode( $contents_fh );
    print {$contents_fh} $contents;
    close( $contents_fh )
        or die( "Could not write contents file $contents_file: $!\n" );

    #
    # content_type() returns just the lower-cased media type, without any
    # parameters. Comparing the raw header instead would miss the very common
    # "text/html; charset=..." form and silently skip link extraction.
    #
    my $media_type = $response->content_type();
    if( defined( $media_type ) && $media_type eq "text/html" ) {
        my $links_file = "$localpath.links";
        open( my $links_fh, '>', $links_file )
            or die( "Could not open links file $links_file: $!\n" );

        my $html = HTML::Parse::parse_html( $contents );
        # Each entry from extract_links() is an array ref whose first element
        # is the link text as it appears in the document.
        foreach my $found ( @{ $html->extract_links( qw(a) ) } ) {
            # Resolve the link against the URL that was fetched.
            my $link = URI::URL->new( $found->[0], $url )->abs()->as_string();

            #
            # Remove fragments. The fragment starts at the FIRST '#', so
            # everything from there on is dropped.
            #
            $link =~ s/#.*//s;

            #
            # Remove ../  This is important because the abs() function
            # can leave these in and cause never ending loops.
            #
            $link =~ s{\.\./}{}g;

            print {$links_fh} "$link\n";
        }

        close( $links_fh )
            or die( "Could not write links file $links_file: $!\n" );
    }
}