home *** CD-ROM | disk | FTP | other *** search
- # SOUNDIX Version 1.0
-
- # Main rule (runs for every input record): walk the fields left to right and
- # report each one together with its sound index.  To emit only the index,
- # print soundex($i) instead of the formatted line.
- {
-     i = 1;
-     while ( i <= NF ) {
-         printf(" Result: %s ==> %s\n", $i, soundex($i));
-         i++;
-     }
- }
-
- # SOUNDIX Version 1.0
- #
- # This program takes a character string such as a person's last
- # name and translates it to a sound index. This index can then be
- # used by an application to perform phonetic (i.e. 'sounds-like')
- # search. Algorithm found in D. Knuth, "Art of Computer Programming",
- # Vol. 3, Page 391-392
- #
- # Rules:
- # =====
- #
- # 1) Retain the first letter of the name and drop all occurrences of
- # a, e, h, i, o, u, w, and y in other positions
- # 2) assign the following numbers to the remaining letters after the first
- # bfpv ==> 1
- # cgjkqsxz ==> 2
- # dt ==> 3
- # l ==> 4
- # mn ==> 5
- # r ==> 6
- #
- # 3) if two or more letters with the same code were adjacent in the original
- # string (before step 1), omit all but the first
- #
- # 4) convert to the form "letter, digit, digit, digit" by adding trailing
- # zeros (if there are fewer than three digits) or by dropping rightmost
- # digits (if there are more than three).
- #
- #
- # Logic:
- # =====
- #
- # 1) Uppercase the string
- # 2) Set the first letter aside; it is glued back on the front of the result
- # 3) Change the following letters:
- # R to 6
- # M,N to 5
- # L to 4
- # D,T to 3
- # C,G,J,K,Q,S,X,Z to 2
- # B,F,P,V to 1
- # AEIOUYHW to 0
- # anything else to 0
- # 4) Remove all adjacent duplicates
- # 5) Remove all zeros
- #
- # Example: ( adjacent duplicate codes are merged, then zeros are deleted )
- # =======
- #
- # McClowry --> 52240060 --> 5246 --> M246
- # McLorey --> 5240600 --> 5246 --> M246
- #
- # Schiller --> 22004406 --> 246 --> S460
- # Shilar --> 200406 --> 246 --> S460
- #
- # Rosen --> 60205 --> 625 --> R250
- # Rozin --> 60205 --> 625 --> R250
- #
- # Moynihan --> 50050005 --> 555 --> M550
- # Monnihan --> 50550005 --> 555 --> M550
- #
- # Abete --> 01030 --> 013 --> A130
- # Abadey --> 010300 --> 013 --> A130
- #
- #
- # soundex(str) -- return the sound index of str: its first character
- # (upper-cased) followed by exactly three code digits, for example
- # soundex("Schiller") is "S460".
- #
- #   str      the word to encode; letter case is ignored
- #   returns  a four-character string, or "" when str is empty
- #
- # Every character that is not a coded consonant (vowels, H, W, Y, digits,
- # punctuation, anything else) is coded 0 and therefore only separates
- # neighbouring consonant codes.
- function soundex(str) {
-     local first;   # first character of the upper-cased input, kept verbatim
-     local code;    # one code digit per input character
-
-     if ( length(str) == 0 ) return "";
-
-     code = strupr(str);
-     first = substr(code,1,1);            # rule 1: retain the first letter
-
-     # rule 2: turn every character into its code digit.  Non-letters go
-     # first so that digits typed in the input cannot pose as code digits.
-     gsub(/[^A-Z]/,"0",code);
-     gsub(/[AEIOUYHW]/,"0",code);
-     gsub(/[BFPV]/,"1",code);
-     gsub(/[CGJKQSXZ]/,"2",code);
-     gsub(/[DT]/,"3",code);
-     gsub(/L/,"4",code);
-     gsub(/[MN]/,"5",code);
-     gsub(/R/,"6",code);
-
-     # rule 3: collapse runs of the same consonant code into a single digit
-     gsub(/11+/,"1",code);
-     gsub(/22+/,"2",code);
-     gsub(/33+/,"3",code);
-     gsub(/44+/,"4",code);
-     gsub(/55+/,"5",code);
-     gsub(/66+/,"6",code);
-
-     # Drop the code of the first character BEFORE deleting the zeros.  Doing
-     # it afterwards would, for a name whose first character is coded 0
-     # (W, a digit, punctuation), throw away the first real consonant code.
-     code = substr(code,2);
-     gsub(/0/,"",code);                   # rule 1: drop the uncoded letters
-
-     # rule 4: pad with zeros or truncate to "letter, digit, digit, digit"
-     return substr(first code "000",1,4);
- }
-