home *** CD-ROM | disk | FTP | other *** search
- # QTAwk Soundix Algorithm
- #
- # Optimized Soundix Algorithm. Adapted for QTAwk from article in:
- # "The C Gazette", Vol. 4, No. 2, Autumn 1989, page 29, by Joe Celko
- #
- BEGIN {
- sl = 4; # set code length
- }
-
- {
- for ( i = 1 ; i <= NF ; i++ ) printf(" Result: %s ==> %s\n",$i,soundix4($i,sl));
- }
-
- function soundix4(inname,len) {
- local workbuf;
- local leading_letter;
-
- #* make a working copy
- workbuf = strupr(inname);
-
- #* convert all vowels to 'A'
- gsub(/[AEIOUY]/,'A',workbuf);
-
- #* prefix transformations: done only once on the front of a name
- sub(/^MAC/,"MCC",workbuf); # MAC -> MCC
- sub(/^KN/ ,"NN" ,workbuf); # KN -> NN
- sub(/^PF/ ,"FF" ,workbuf); # PF -> FF
- sub(/^SCH/,"SSS",workbuf); # SCH -> SSS
- sub(/^K/ ,'C' ,workbuf); # K -> C
-
- #* preserve leading letter
- leading_letter = substr(workbuf,1,1);
- workbuf = substr(workbuf,2);
-
- #* infix transformations: done after the first letter
- #* and are from left to right on the name
- gsub(/DG/ ,"GG" ,workbuf); # DG -> GG
- gsub(/CAAN/,"TAAN",workbuf); # CAAN -> TAAN
- gsub(/D/ ,'T' ,workbuf); # D -> T
- gsub(/NST/ ,"NSS" ,workbuf); # NST -> NSS
- gsub(/AV/ ,"AF" ,workbuf); # AV -> AF
- gsub(/Q/ ,'G' ,workbuf); # Q -> G
- gsub(/Z/ ,'S' ,workbuf); # Z -> S
- gsub(/M/ ,'N' ,workbuf); # M -> N
- gsub(/KN/ ,"NN" ,workbuf); # KN -> NN
- gsub(/K/ ,'C' ,workbuf); # K -> C
- gsub(/AH/ ,"AA" ,workbuf); # AH -> AA
- gsub(/HA/ ,"AA" ,workbuf); # HA -> AA
- gsub(/AW/ ,"AA" ,workbuf); # AW -> AA
- gsub(/PH/ ,"FF" ,workbuf); # PH -> FF
- gsub(/SCH/ ,"SSS" ,workbuf); # SCH -> SSS
-
- #* suffix transformations: done on the end of the word going right to left
- #* (1) remove terminal A's and S's
- sub(/[AS]+$/,"",workbuf);
-
- #* (2) terminal NT-> TT
- sub(/NT$/,"TT",workbuf);
-
- #* now strip out all vowels except the first - remember that all vowels
- #* were transformed to 'A' earlier
- gsub(/A/,"",workbuf);
-
- #* remove all duplicate letters.
- #* Note this is different from the Soundex3 duplicate cleanup because
- #* the letter transforms can create duplicates at the front of the
- #* output name
- gsub(/B+/,'B',workbuf);
- gsub(/C+/,'C',workbuf);
- gsub(/F+/,'F',workbuf);
- gsub(/H+/,'H',workbuf);
- gsub(/G+/,'G',workbuf);
- gsub(/J+/,'J',workbuf);
- gsub(/L+/,'L',workbuf);
- gsub(/N+/,'N',workbuf);
- gsub(/P+/,'P',workbuf);
- gsub(/R+/,'R',workbuf);
- gsub(/S+/,'S',workbuf);
- gsub(/T+/,'T',workbuf);
- gsub(/V+/,'V',workbuf);
- gsub(/W+/,'W',workbuf);
- gsub(/X+/,'X',workbuf);
-
- #* return proper length code
- return leading_letter ∩ (len ? substr(workbuf,1,len - 1) : workbuf);
- }
-