home *** CD-ROM | disk | FTP | other *** search
- ;From the article "Pattern Matching - The Gestalt Approach"
- ;by John W. Ratcliff and David E. Metzener
- ;Dr. Dobbs Journal, July 1988
-
- ; This version is for use in assembly language programs.
- ; See TESTSIM.ASM for a demonstration.
- ;
- ; - All required local variables are right here in this procedure.
- ; - Enter with DS:SI = pointer to string 1,
- ; DS:DI = pointer to string 2
- ; Preserves all registers except AX (which returns the similarity)
- ; It is CRITICAL that SS=CSeg! Else the bp referencing will not work!
- ; I have no intentions of hacking it for other configurations ..
- ; that's left as an exercise for the student.
- ; (Hint: Look in the Turbo Pascal version for an example.)
- ;
- ; David Kirschbaum
- ; Toad Hall
- ; kirsch@braggvax.ARPA
-
-
- ;TITLE SIMIL.ASM written by John W. Ratcliff and David E. Metzener
-
- ; November 10, 1987
- ; Uses the Ratcliff/Obershelp pattern recognition algorithm.
- ; The function SIMIL returns a percentage value corresponding to how
- ; alike any two strings are. Be certain to upper case the two strings
- ; passed if you are not concerned about case sensitivity.
-
- _Simil_a proc near
-
- ;TH Data moved down here so it'll be completely contained in our procedure.
-
- ststr1L dw 25 dup(?) ;contains lefts for string 1
- ststr1R dw 25 dup(?) ;contains rights for string 1
- ststr2L dw 25 dup(?) ;contains lefts for string 2
- ststr2R dw 25 dup(?) ;contains rights for string 2
- stcknum dw ? ;number of elements on the stack
- score dw ? ;the # of chars in common times 2
- total dw ? ;total # of chars in string 1 and 2
- cL1 dw ? ;left of string 1 found in common
- cR1 dw ? ;right of string 1 found in common
- cL2 dw ? ;left of string 2 found in common
- cR2 dw ? ;right of string 2 found in common
- s2ed dw ? ;the end of string 2 used in compare
-
- _simil:
- ; This routine expects pointers passed in two character strings, null
- ; terminated, that you wish compared. It returns a percentage value
- ; from 0 to 100% corresponding to how alike the two strings are.
- ; Usage: DS:SI = pointer to string 1 TH
- ; DS:DI = pointer to string 2 TH
- ; The similarity routine is composed of three major components
- ; pushst --- pushes a string section to be compared on the stack
- ; popst --- pops a string section to be examined off of the stack
- ; compare --- finds the largest group of characters in common between
- ; any two string sections
- ; The similarity routine begins by computing the total length of both
- ; strings passed and placing that value in TOTAL. It then takes
- ; the beginning and ending of both strings passed and pushes them on
- ; the stack. It then falls into the main line code.
- ; The original two strings are immediately popped off of the stack and
- ; are passed to the compare routine. The compare routine will find the
- ; largest group of characters in common between the two strings.
- ; The number of characters in common is multiplied times two and added
- ; to the total score. If there were no characters in common, then there
- ; is nothing to push onto the stack. If there are exactly one character
- ; to the left in both strings, then we needn't push it on the stack.
- ; (We already know they aren't equal from the previous call to compare.)
- ; Otherwise the characters to the left are pushed onto the stack. These
- ; same rules apply to characters to the right of the substring found in
- ; common. This process of pulling substrings off of the stack, comparing
- ; them, and pushing remaining sections on the stack is continued until
- ; the stack is empty. On return the total score is divided by the
- ; number of characters in both strings. This is multiplied times 100 to
- ; yield a percentage. This percentage similarity is returned to the
- ; calling procedure.
-
- push bp ;save BP reg.
- mov bp,sp ;save SP reg in BP for use in program
- push bx ;save all regs we're gonna trash TH
- push cx
- push dx
- push si
- push di
- push ES ;save the ES seg register
-
- mov ax,DS ;copy DS segment reg to ES
- mov ES,ax
-
- xor ax,ax ;zero out AX for clearing of SCORE var
- mov score,ax ;zero out SCORE
- mov stcknum,ax ;initialize number of stack entries to 0
- ;TH DS:SI and DS:DI already point to string 1 and string 2 respectively.
- ;TH mov si,[bp+4] ;move beginning pointer of string 1 to SI
- ;TH mov di,[bp+6] ;move beginning pointer of string 2 to SI
-
- cmp [si],al ;is it a null string?
- je StrErr ;can't process null strings
- cmp [di],al ;is it a null string?
- jne DoCmp ;Neither is a null string, so process them
- StrErr: jmp Donit ;exit routine
-
- DoCmp: push di ;save DI because of SCAS opcode
- push si ;save SI because of SCAS opcode
- xor al,al ;clear out AL to search for end of string
- cld ;set direction flag forward
- mov cx,-1 ;make sure we repeat the correct # of times
-
- repnz scasb ;scan for string delimiter in string 2
- dec di ;point DI to '$00' byte of string 2
- dec di ;point DI to last char of string 2
- mov bp,di ;move DI to BP where it is supposed to be
-
- pop di ;restore SI into DI for SCAS (string 1)
- repnz scasb ;scan for string delimiter in string 1
-
- not cx ;do one's complement for correct length of st
- sub cx,2 ;subtract the two zero bytes at the end of st
- mov total,cx ;store string 2's length
-
- dec di ;point DI to '$00' byte of string 1
- dec di ;point DI to last char of string 1
- mov bx,di ;move DI to BX where it is supposed to be
-
- pop di ;restore DI to what it should be
- call PushSt ;push values for the first call to SIMILIARITY
-
- Main: cmp stcknum,0 ;is there anything on the stack?
- je Done ;no, then all done!
-
- call PopSt ;get regs set up for a compare call
- call Compare ;do compare for this substring set
- or dx,dx ;if nothing in common then nothing to push TH
- jz Main ;try another set TH
-
- shl dx,1 ;*2 for add to score
- add score,dx ;add into score
- mov bp,stcknum ;get number of entry I want to look at
- shl bp,1 ;get AX ready to access string stacks
- mov si,[ststr1L+bp] ;move L1 into SI or L1
- mov bx,cL1 ;move CL1 into BX or R1
- mov di,[ststr2L+bp] ;move L2 into DI or L2
- mov cx,cL2 ;move CL2 into CX temporarily
- mov ax,[ststr1R+bp] ;get old R1 off of stack
- mov cL1,ax ;place in CL1 temporarily
- mov ax,[ststr2R+bp] ;get old R2 off of stack
- mov cL2,ax ;save in CL2 temporarily
- mov bp,cx ;place CL2 into BP
-
- cmp bx,si ;compare CL1 to L1
- je ChRght ;if zero, then nothing on left side string 1
-
- cmp bp,di ;compare CL2 to L2
- je ChRght ;if zero, then nothing on left side string 2
-
- dec bx ;point to last part of left side string 1
- dec bp ;point to last part of left side string 2
- cmp bx,si ;only one char to examine?
- jne PushIt ;no->we need to examine this
- cmp bp,di ;only one char in both?
- je ChRght ;nothing to look at if both only contain 1 char
-
- PushIt: call PushSt ;push left side on stack
- ChRght: mov si,cR1 ;move CR1 into SI or L1
- mov bx,cL1 ;move R1 into BX or R1
- mov di,cR2 ;move CR2 into DI or L2
- mov bp,cL2 ;move R2 into BP or R2
-
- cmp si,bx ;compare CR1 to R1
- je Main ;If zero, then nothing on right side string 1
- cmp di,bp ;compare CR2 to R2
- je Main ;if zero, then nothing on right side string 2
-
- inc si ;point to last part of right side string 1
- inc di ;point to last part of right side string 2
- cmp bx,si ;only one char to examine?
- jne Push2 ;no-> examine it
- cmp bp,di ;only 1 char to examine in both?
- je Main ;yes-> get next string off of stack
-
- Push2: call PushSt ;push right side on stack
- jmp short Main ;do next level of compares
-
- Done: mov ax,score ;get score into AX for MUL
- or ax,ax ;anything? TH
- jz Donit ;nope, forget this mess TH
- mov cx,100 ;get 100 into CX for MUL
- mul cx ;multiply by 100
- mov cx,total ;get total chars for divide
- div cx ;divide by total
- Donit:
- pop ES ;restore ES to entry value
- pop di ;and all other regs also TH
- pop si
- pop dx
- pop cx
- pop bx
- pop bp ;restore BP back to entry value
- ret ;leave with AX holding % similarity
-
-
- Compare:
-
- ; The compare routine locates the largest group of characters between string 1
- ; and string 2. This routine assumes that the direction flag is clear.
- ; Pass to this routine:
- ; BX = R1 (right side of string 1)
- ; DS:SI = L1 (left side of string 1)
- ; ES:Di = L2 (left side of string 2)
- ; BP = R2 (right side of string 2)
- ;
- ; This routine returns:
- ; DX = # of characters matching
- ; CL1 = Left side of first string that matches
- ; CL2 = Left side of second string that matches
- ; CR1 = Right side of first string that matches
- ; CR2 = Right side of second string that matches
- ;
- ; The compare routine is composed of two loops, an inner and an outer.
- ; The worst case scenario is that there are absolutely no characters in
- ; common between string 1 and string 2. In this case N x M compares are
- ; performed. However, when an equal condition occurs in the inner
- ; loop, then the next character to be examined in string 2 (for this loop)
- ; is advanced by the number of characters found equal. Whenever a new
- ; maximum number of characters in common is found, then the ending location
- ; of both the inner and outer loop is backed off by the difference between
- ; the new max chars value and the old max chars value for both loops. In
- ; short, if 5 characters have been found in common part of the way through
- ; the search, then we can cut our search short 5 characters before the
- ; true end of both strings since there is no chance of finding better than
- ; a 5 character match at that point. This technique means that an exact
- ; equal match will require only a single compare and combinations of other
- ; matches will proceed as efficiently as possible.
-
- mov s2ed,bp ;store end of string 2
- xor dx,dx ;init MAXCHARS
- ForL3: push di ;save start of string 2
- ForL4: push di ;save start of string 2
- push si ;save start of string 1
- mov cx,s2ed ;set up for calc of length of string 1
- sub cx,di ;get length of string 1 -1
- inc cx ;make proper length
- push cx ;save starting length of string 1
- repz cmpsb ;compare strings
- jz Equal ;if equal, then skip fixes
- inc cx ;inc back because CMPS decs even if not equal
- Equal: pop ax ;get starting length of string 1
- sub ax,cx ;get length of common characters
- jnz NewMax ;more than 0 chars matched
-
- pop si ;get back start of string 1
- pop di ;get back start of string 2
-
- Reent: inc di ;do the next char no matter what
- Reent2: cmp di,bp ;are we done with string 2?
- jle ForL4 ;no, then do next string compare
- pop di ;get back start of string 2
- inc si ;next char in string 1 to scan
- cmp si,bx ;are we done with string 1?
- jle ForL3 ;no, then do next string compare
- ret ;MAXCHARS is in DX
-
- ; We branch downwards for both newmax and newmx2 because on the
- ; 8086 line of processors a branch not taken is much faster than
- ; one which is. Since the not equal condition is to be found
- ; most often and we would like the inner loop to execute as quickly
- ; as possible, we branch outside of this loop on the less frequent
- ; occurrence. When a match or a new MAXCHARS is found, we branch down
- ; to these two routines, process the new conditions, and then branch
- ; back up to the main line code.
-
- NewMax:
- ;TH: we pop SI and DI in any case
- pop si ;get back start of string 1 TH
- pop di ;get back start of string 2 TH
- cmp ax,dx ;greater than MAXCHARS?
- jg NewMx2 ;yes, update new MAXCHARS and pointers
- ;TH pop si ;get back start of string 1
- ;TH pop di ;get back start of string 2
- add di,ax ;skip past matching chars
- jmp short Reent2 ;reenter main loop
-
- NewMx2:
- ;TH pop si ;get back start of string 1
- ;TH pop di ;get back start of string 2
- mov cL1,si ;put begin of match of string 1
- mov cL2,di ;put begin of match of string 2
- mov cx,ax ;save new MAXCHARS
-
- sub ax,dx ;get delta for adjustment to ends of strings
- sub bx,ax ;adjust end of string 1
- sub bp,ax ;adjust end of string 2
-
- mov dx,cx ;new MAXCHARS
- dec cx ;set up for advance to last matching char
- add di,cx ;advance to last matching char string 2
- mov cR2,di ;put end of match of string 2
- add cx,si ;advance to last matching char string 1
- mov cR1,cx ;put end of match of string 1
- jmp short Reent ;reenter inner loop
-
-
- PushSt:
- ; On Entry:
- ; BX = R1 (right side of string 1)
- ; DS:SI = L1 (left side of string 1)
- ; ES:Di = L2 (left side of string 2)
- ; BP = R2 (right side of string 2)
-
- mov cx,bp ;save R2
- mov bp,stcknum
- shl bp,1 ;*2 for words
- mov [bp+ststr1L],si ;put left side of string 1 on stack
- mov [bp+ststr1R],bx ;put right side of string 1 on stack
- mov [bp+ststr2L],di ;put left side of string 2 on stack
- mov [bp+ststr2R],cx ;put right side of string 2 on stack
- inc stcknum ;add one to number of stack entries
- mov bp,cx ;restore R2
- ret
-
-
- PopSt:
- ; BX = R1 (right side of string 1)
- ; DS:SI = L1 (left side of string 1)
- ; ES:Di = L2 (left side of string 2)
- ; BP = R2 (right side of string 2)
-
- dec stcknum ;point to last entry on stack
- mov bp,stcknum ;get number of stack entries
- shl bp,1 ;*2 for words
- mov si,[bp+ststr1L] ;restore left side of string 1 from stack
- mov bx,[bp+ststr1R] ;restore right side of string 1 from stack
- mov di,[bp+ststr2L] ;restore left side of string 2 from stack
- mov bp,[bp+ststr2R] ;restore right side of string 2 from stack
- ret
-
- _Simil_a endp