home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
ftp.cs.arizona.edu
/
ftp.cs.arizona.edu.tar
/
ftp.cs.arizona.edu
/
icon
/
historic
/
v941.tgz
/
icon.v941src.tar
/
icon.v941src
/
ipl
/
progs
/
fuzz.icn
< prev
next >
Wrap
Text File
|
2000-07-29
|
6KB
|
180 lines
############################################################################
#
# File: fuzz.icn
#
# Subject: Program to perform fuzzy pattern matching
#
# Author: Alex Cecil
#
# Date: November 10, 1993
#
############################################################################
#
# This file is in the public domain.
#
############################################################################
#
# This program illustrates "fuzzy" string pattern matching. The result
# of matching s and t is a number between 0 and 1 which is based on
# counting matching pairs of characters in increasingly long substrings
# of s and t. Characters may be weighted differently, and the reverse
# tally may be given a negative bias.
#
############################################################################
global bias, rank_list_max, weight1, weight2, weight_set, which_fuzz_value
procedure main()
local alphanum, in_id, in_name, in_record, rank_list,
start_time, word_requested
bias := -2 # Reduce importance of reverse match
rank_list_max := 15 # Number of best matches to write
weight1 := 6 # Weight of chars not in weight_set
weight2 := 2 # Weight of chars in weight_set
weight_set := 'aehiouwy' # Soundex ignore list
write("The ",rank_list_max,
" best matches for the first word in each line will be written.")
writes("\nName of input file: "); in_name := read()
in_id := (open(in_name,"r")) | (stop("Can't open file ",in_name))
writes("\nWord to search for: ")
word_requested := map(read())
writes("\nWhich function: Simple, Optimized, Weighted (1,2,3): ")
which_fuzz_value := case read() of {
"1" : fuzz_value_1 # Simple, "obvious" implementation
"2" : fuzz_value_2 # Simple, linearized for speed
default : fuzz_value_3 # Weights and bias included
}
write("\nSearching for \"",word_requested,"\" in file ",in_name)
start_time := &time
alphanum := &letters ++ &digits
rank_list := [] # [[fuzz-value,in-record],...]
while in_record := read(in_id) do {
in_record ? {
tab(upto(alphanum))
rank(word_requested,map(tab(many(alphanum))),in_record,
rank_list,rank_list_max)
}
}
write("\nFuzz Value of first word\n | Input Record...")
every rank := !rank_list do {
write(left(string(rank[1]),5)," ",left(rank[2],72))
}
write("\nElapsed time in milliseconds: ",&time - start_time)
end
procedure rank(s,t,r,rl,rm)
# Maintain a sorted list (rl) of the rm best Fuzz values with records (r).
# Special cases to save time: strings are the same; or s and t have fewer
# than about 50% characters in common.
local i, v
if s == t then v := 1.0
else if *(s ** t) * 4 <= (*s + *t) then v := 0.0
else v := which_fuzz_value(s,t,weight1,weight2,weight_set,bias)
# 3rd-last args needed by fuzz_value_3
if *rl = 0 then put(rl,[v,r]) # First entry in list
else if v >= rl[*rl][1] then { # If value greater than least in list...
put(rl,[v,r]) # add to list, sort, and trim
every i := *rl to 2 by -1 do {
if rl[i][1] > rl[i-1][1] then rl[i] :=: rl[i-1]
}
if *rl > rm then pull(rl)
}
end
procedure fuzz_value_1(s,t)
# Calculate Fuzz Value of s and t with weight=1 and bias=0
# Simple, non-optomized algorithm.
if *s > *t then s :=: t
return 2.0 * (fuzz_match_1(s,t) + fuzz_match_1(reverse(s),reverse(t)))/
((*s * (*s+1)) + (*t * (*t+1)))
end
procedure fuzz_match_1(s,ti)
# Calculate the Fuzz Matches between s and t. Simple algorithm.
# ASCII NUL is used to mark matched pairs, so can't be used in strings
local i, imax, jmax, m, t, tsdif
tsdif := *ti - *s
m := 0
every imax := 1 to *s do {
t := ti
jmax := imax + tsdif + 1
every i := 1 to imax do
if t[find(s[i],t,1,jmax)] := "\0" then m +:= 1
}
return m
end
procedure fuzz_value_2(s,t)
# Calculate Fuzz Value with weight=1 and bias=0
# Optomized version.
if *s > *t then s :=: t
return 2.0 * (fuzz_match_2(s,t) + fuzz_match_2(reverse(s),reverse(t)))/
((*s * (*s+1)) + (*t * (*t+1)))
end
procedure fuzz_match_2(s,t)
# Calculate the Fuzz Matches between s and t.
# Replace column loop by imperical calculation.
# ASCII NUL is used to mark matched pairs, so can't be used in s or t.
# s(ip) is ith char from right, similarly for t(jp)
local ip, j, jmp, jp, m, si
ip := *s
jmp := *t + 1
m := 0
every si := !s do {
if t[j := find(si,t)] := "\0" then {
jp := jmp - j
m +:= (ip <= jp | ip) - abs(ip - jp) # max column minus column offset
}
ip -:= 1
}
return m
end
procedure fuzz_value_3(s,t,w1,w2,w2c,b,c)
# Calculate Fuzz Value with weight w2 if in cset w2c, else weight w1; bias b.
if *s > *t then s :=: t
return 2.0 * (fuzz_match_3(s,t,w1,w2,w2c) +
fuzz_match_3(reverse(s),reverse(t),w1+b,w2+b,w2c)) /
(fuzz_self_3(s,w1+w1+b,w2+w2+b,w2c) + fuzz_self_3(t,w1+w1+b,w2+w2+b,w2c))
end
procedure fuzz_match_3(s,t,w1,w2,w2c)
# Calculate the Fuzz Matches between s and t.
# Replace column loop by imperical calculation.
# ASCII NUL is used to mark matched pairs, so can't be used in s or t.
# s(ip) is ith char from right, similarly for t(jp)
local ip, j, jmp, jp, m, mo, si
ip := *s
jmp := *t + 1
m := 0
every si := !s do {
if t[j := find(si,t)] := "\0" then {
jp := jmp - j
mo := (ip <= jp | ip) - abs(ip - jp) # max column minus column offset
m +:= (any(w2c,si) & (w2 * mo)) | (w1 * mo)
}
ip -:= 1
}
return m
end
procedure fuzz_self_3(s,w1fr,w2fr,w2c)
# fuzz matches of s with s
# w1fr, w2fr: forward plus reverse weights.
local ip, m, si
ip := *s
m := 0
every si := !s do {
m +:= (any(w2c,si) & (w2fr * ip)) | (w1fr * ip)
ip -:= 1
}
return m
end