home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
ftp.f-secure.com
/
2014.06.ftp.f-secure.com.tar
/
ftp.f-secure.com
/
support
/
hotfix
/
fsis
/
IS-SpamControl.fsfix
/
iufssc
/
lib
/
FS
/
PhishBL
/
ParseURL.pm
< prev
Wrap
Text File
|
2006-11-29
|
5KB
|
178 lines
#!/usr/bin/perl
#
# $Id: ParseURL.pm 2957 2006-02-02 15:11:02Z kankri $
use strict;
use warnings;
package FS::PhishBL::ParseURL;
=head1 NAME
FS::PhishBL::ParseURL - URL parsing functions for use with FS::PhishBL
=head1 SYNOPSIS
use FS::PhishBL::ParseURL qw(parse_ip_address url_to_blocklist_hostname);
=head1 AUTHOR
Dan Gardner <dan@netcraft.com>
IP address parsing functions taken from NetcraftToolbar::ParseURL
by Nick Cleaton <njc@netcraft.com>
=head1 COPYRIGHT
Copyright (C) 2005 Netcraft Ltd, all rights reserved
=cut
use Exporter;
use vars qw(@EXPORT_OK $VERSION $allow_octal);
$VERSION = qw/$Revision: 2957 $/[1];
@EXPORT_OK = qw(parse_ip_address url_to_blocklist_hostname);
=item parse_ip_address ( ADDRESS )
ADDRESS is a numeric IP address, possibly obfuscated. Returns
the address converted to the canonical dotted quad form if a
way to do that can be found, otherwise returns C<undef>.
The function errs on the side of returning an IP address if at
all possible. In edge cases, it emulates the way that the MS
Windows resolver works as closely as possible, since obfuscated
IP addresses in phishing URLs are most likely to target that
platform.
=cut
sub parse_ip_address {
local $_ = shift;
s/\.+$//;
if (length $_ <= 15 and /^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) /) {
$_ = $1;
# The windows resolver allows a four part dotted decimal IP address to
# have a space followed by any old rubbish appended, so long as the
# total length of the string doesn't get above 15 chars, e.g.
# <a href="http://10.192.95.89%20xy/">this goes to 10.192.95.89</a>.
# We cannot make our handling of this more general (e.g. with an
# unconditional s/ .*//) since space works in domain names, so for
# example:
# <a href="http://10.192.95.89%20xy.wildcard.example.com">this does not
# go to 10.192.95.89</a>. The second example fails to be recognised
# as an IP (it's too long to be a dotted decimal address) and so falls
# through to DNS, where a wildcard can make it work.
}
return unless m#^((?:0x[0-9a-f]+|[0-9\.])+)$#i;
$_ = $1;
my @bits = split /\.+/, $_;
my @sizes;
if (@bits == 1) {
@sizes = (4);
} elsif (@bits == 2) {
@sizes = (1,3);
} elsif (@bits == 3) {
@sizes = (1,1,2);
} elsif (@bits == 4) {
@sizes = (1,1,1,1);
} else {
return;
}
# If the IP address includes a number with a leading 0 and
# containing an 8 or a 9, then the octal format gets discarded
# (for all parts of the address) and only decimal and hex work.
local $allow_octal = (/(^|\.)0\d*[89]/ ? 0 : 1);
my @decode;
for my $i (0..$#bits) {
my @octets = _canon_num($bits[$i],$sizes[$i]);
return unless @octets;
push @decode, @octets;
}
return join '.', @decode;
}
sub _canon_num {
my ($num, $bytes) = @_;
if ($allow_octal and $num =~ /^0[0-7]*$/) {
# mod 2^32, only need the last 11 octal digits, == 33 bits
$num =~ s/.*(.{11})$/$1/;
$num = oct "0$num";
} elsif ($num =~ /^[0-9]+$/) {
# mod 2^32, only need the last 32 decimal digits, since 10^32
# is a multiple of 2^32.
$num =~ s/.*(.{32})$/$1/;
# If the number is very large then rounding errors when perl
# converts it to a double precision float and then does modulo
# 2^32 could leave us with the wrong IP address. So we
# convert from string to mod32 integer in one operation.
my @digits = split //, $num;
$num = 0;
foreach my $digit (@digits) {
$num *= 10;
$num += $digit;
$num %= (2**32);
}
} elsif ($num =~ /^0[xX]([0-9a-zA-Z]+)$/) {
$num = $1;
# mod 2^32, only need the last 8 hex digits
$num =~ s/.*(.{8})$/$1/;
$num = hex "0x$num";
} else {
return;
}
my @octets;
while ($bytes--) {
unshift @octets, $num%256;
$num -= $num%256;
$num /= 256;
}
return @octets;
}
=item url_to_blocklist_hostname ( URL )
Extracts the hostname part of the URL, in a canonical form
suitable for use as an index into the FS::PhishBL blocklist.
Returns the canonical hostname, or C<undef> if no hostname can be
extracted from the URL.
=cut
sub url_to_blocklist_hostname {
my ($url) = @_;
my $MAX_DOTS = 5; # constant
return unless $url =~ m|^[^:]+://(?:[^\?/#]*@)?([^\\/\#?:]+)|;
my $hostname = $1;
$hostname =~ s/\%([a-f0-9]{2})/chr(hex($1))/ige; # unescape
$hostname =~ s/[\x00-\x1f\x7f-\xff]+//g; # strip dodgy chars
$hostname =~ s/^\.+|\.+$//g; # trim dots from lhs and rhs
$hostname =~ s/\.{2,}/./g; # collapse multiple dots
$hostname = parse_ip_address($hostname) || $hostname;
$hostname =~ s/([^A-Za-z0-9\-\.])/'%'.sprintf("%02x", ord($1))/ge; # re-escape
my @hostparts = split /\./, $hostname;
my $start = $#hostparts - $MAX_DOTS;
$start = 0 if $start < 0;
$hostname = join('.', @hostparts[$start..$#hostparts]);
return lc $hostname;
}
1;