home *** CD-ROM | disk | FTP | other *** search
- ;
- ; S I M I L F . A S M
- ; =================
- ; Entered & adapted by J Meeks (& Herb Martin a little); latest
- ; version 05-09-89, 00:13am.
- ;
- ; Adapted from Dr. Dobb's Journal, 07-88 issue, pp. 46-51 (text), pp.
- ; 68-72 (program listing).
- ;
- ; Written by John W. Ratcliff & David E. Metzener, 11-10-87.
- ;
- ; Uses the Ratcliff/Obershelp pattern recognition algorithm.
- ;
- ; This program provides a new function for Turbo Pascal 5.0 on an
- ; 8086-based machine. The function similf returns a percentage value
- ; corresponding to the degree to which two strings are similar to one
- ; another. Be certain to convert both strings to uppercase if you
- ; want to have a comparison that is case-insensitive.
- ;
- ; NOTE: This routine is for use with Turbo Pascal 5.0 programs. The
- ; original simil is for use with C programs, and is almost identical
- ; to this one.
- ;
- TITLE SIMILF.ASM
- ;
- ASSUME CS: Code, DS: Data, ES: Data
- ;
- DATA SEGMENT
- ;
- ststr1l dw 25 dup (?) ; Lefts for string 1.
- ststr1r dw 25 dup (?) ; Rights for string 1.
- ststr2l dw 25 dup (?) ; Lefts for string 2.
- ststr2r dw 25 dup (?) ; Rights for string 2.
- stcknum dw ? ; # elements on stack.
- score dw ? ; # chars in common * 2.
- total dw ? ; Total # chars in strings 1 & 2.
- cl1 dw ? ; Left of string 1 found in common.
- cr1 dw ? ; Right of string 1 found in common.
- cl2 dw ? ; Left of string 2 found in common.
- cr2 dw ? ; Right of string 2 found in common.
- s2ed dw ? ; End of string 2 used in compare.
- ;
- DATA ENDS
- PUBLIC similf
- ;
- Code SEGMENT
- ;
- similf PROC NEAR
- ;
- ; This routine expects pointers passed to two character strings, null
- ; terminated, that you wish compared. It returns a percentage value
- ; from 0% to 100% corresponding to how alike the two strings are.
- ;
- ; Usage: Simil (Str1, Str2: string);
- ;
- ; The similarity routine is composed of three major components:
- ; compare Finds the largest group of characters in common between
- ; any two string sections.
- ; pushst Pushes a string section to be compared onto the stack.
- ; popst Pops a string section to be examined off the stack.
- ;
- ; The similarity routine begins by computing the total length of both
- ; strings passed and placing that value in total. It then takes the
- ; beginnings and endings of both strings passed and pushes them onto
- ; the stack. It then falls through into the main line code.
-
- ; The original two strings are immediately popped off the stack and
- ; are passed to the compare routine. The compare routine finds the
- ; largest group of characters in common between the two strings.
-
- ; The number of characters in common is multiplied by two and added
- ; to the total score. If there were no characters in common, then
- ; there is nothing to push onto the stack. If there is exactly one
- ; character to the left in both strings then we needn't push them
- ; onto the stack (we already know they aren't equal from the previous
- ; call to compare). Otherwise the characters on the left are pushed
- ; onto the stack. These same rules apply to characters to the right
- ; of the substring found in in common. This process of popping
- ; substrings off the stack, comparing them, and pushing remaining
- ; sections onto the stack is continued until the stack is empty.
-
- ; On return, the total score is divided by the number of characters
- ; in both strings. This is multiplied by 100 to yield a percentage.
- ; This percentage similarity is returned to the calling procedure.
-
- push bp ; Save BP.
- mov bp, sp ; Save SP in BP for use in program.
- push es ; Save ES.
- mov ax, ds ; Copy DS into ES.
- mov es, ax
- xor ax, ax ; Clear AX.
- mov score, ax ; Clear score variable.
- mov stcknum, ax ; Clear number of stack entries.
- mov si, [bp+4] ; Move string 1 begin ptr to SI.
- mov di, [bp+8] ; Move string 2 begin ptr to DI.
- cmp [si], al ; Is string 1 a null string?
- je strerr ; Can't process null strings!
- cmp [di], al ; Is string 2 a null string?
- jne docmp ; Neither string null; process 'em.
- strerr: jmp donit ; Jump to exit routine.
- docmp: push di ; Save DI (because of SCAS).
- push si ; Save SI (because of SCAS).
- xor al, al ; Clr AL to search for string end.
- cld ; Clr direction flag to forward.
- mov cx, -1 ; Repeat correct # times.
- repnz scasb ; Scan for delimiter in string 2.
- dec di
- dec di ; Point DI to last char of str 2.
- mov bp, di ; Move DI to BP, where should be.
- pop di ; Original SI into DI for scasb.
- repnz scasb ; Scan for delimiter in string 1.
- not cx ; 1's comp of string 1's lentgh.
- sub cx, 2 ; - 2 zero bytes at end string 1.
- mov total, cx ; Sum of strings' lengths in total.
- dec di
- dec di ; Point DI to last char of str 1.
- mov bx, di ; Move DI to BX, where should be.
- pop di ; Restore original DI.
- call pushst ; Push for 1st call to compare.
- main: cmp stcknum, 0 ; Is there anything on the stack?
- je done ; No. Nothing to compare!
- call popst ; Set up regs for a compare call.
- call compare ; Do compare for substring set.
- cmp dx, 0 ; Nothing in common to push.
- je main ; Try another set substrings.
- shl dx, 1 ; Multiply # common chars by 2.
- add score, dx ; Add 2 * common chars to total.
- mov bp, stcknum ; Get # of entry to be looked at.
- shl bp, 1 ; Ready AX to access string stk.
- mov si, [ststr1l+bp] ; Move l1 into SI (or l1).
- mov bx, cl1 ; Move cl1 into BX (or r1).
- mov di, [ststr2l+bp] ; Move l2 into DI (or l2).
- mov cx, cl2 ; Move cl2 into CX temporarily.
- mov ax, [ststr1r+bp] ; Get old r1 off stack.
- mov cl1, ax ; Place in cl1 temporarily.
- mov ax, [ststr2r+bp] ; Get old r2 off stack.
- mov cl2, ax ; Place in cl2 temporarily.
- mov bp, cx ; Move cl2 into BP.
- cmp bx, si ; Compare cl1 to l1.
- je chrght ; Nothing on left side of string 1.
- cmp bp, di ; Compare cl2 to l2.
- je chrght ; Nothing on left side of string 2.
- dec bx ; Point last part left side str 1.
- dec bp ; Point last part left side str 2.
- cmp bx, si ; Only one char to compare?
- jne pushit ; No. We need to examine this.
- cmp bp, di ; Only one character in both?
- je chrght ; Only 1 char; nothing to look at.
- pushit: call pushst ; Push left side onto stack.
- chrght: mov si, cr1 ; Move cr1 into SI (or l1).
- mov bx, cl1 ; Move r1 into BX (or r1).
- mov di, cr2 ; Move cr2 into DI (or l2).
- mov bp, cl2 ; Move r2 into BP (or r2).
- cmp si, bx ; Compare cr1 to r1.
- je main ; Nothing on right side of str 1.
- cmp di, bp ; Compare cr2 to r2.
- je main ; Nothing on right side of str 2.
- inc si ; Point last part right side str 1.
- inc di ; Point last part right side str 2.
- cmp bx, si ; Only one character to examine?
- jne push2 ; No. Examine it.
- cmp bp, di ; Only one character in both?
- je main ; Yes. Get next strs off stack.
- push2: call pushst ; Push right side onto stack.
- jmp short main ; Do next level of compares.
- done: mov ax, score ; Get score into AX for multiply.
- mov cx, 100 ; Put 100 into CX for multiply.
- mul cx ; Multiply score by 100.
- mov cx, total ; Get total # chars for divide.
- div cx ; Div (score * 100) by # chars.
- donit: pop es ; Restore ES to entry value.
- POP BP ; Restore BP to entry value.
- ret 8 ; Return; AX = % similarity.
- similf ENDP
- ;
- ;
- compare PROC NEAR
- ;
- ; The compare procedure locates the largest matching group of
- ; characters between string 1 and string 2. The routine assumes
- ; the CPU's direction flag is cleared.
- ;
- ; Pass to this routine:
- ; DS:SI l1 (left side of string 1),
- ; BX r1 (right side of string 1),
- ; ES:DI l2 (left side of string 2),
- ; BP r2 (right side of string 2).
- ; The routine returns:
- ; DX # characters matching,
- ; cl1 Left side of first matching substring,
- ; cl2 Left side of second matching substring,
- ; cr1 Right side of first matching substring,
- ; cr2 Right side of second matching substring.
- ;
- ; The compare routine is composed of two loops, an inner and an
- ; outer. The worst case scenario is that there are absolutely no
- ; characters in common between string 1 and string 2. In this case
- ; N * M compares are performed. However, when a substring match
- ; occurs in the inner loop, then the next character to be examined in
- ; string 2 (for this loop) is advanced by the number of characters
- ; found equal. Whenever a new maximum number of characters in common
- ; is found then the ending locations of both the inner and outer
- ; loops are backed off by the difference between the new max chars
- ; value and the old max chars value; i.e., if 5 characters have been
- ; found in common part of the way through the search then we can cut
- ; our search short by 5 characters since there is no chance of
- ; finding better than a 5-character match at that point. This
- ; technique means that an exact match will require only a single
- ; compare, and combinations of other matches will proceed as
- ; efficiently as possible.
- ;
- ; Note that we branch downward for both newmax and newmax2, because
- ; on the 8086 line of processors a branch not taken is much faster
- ; than one that is taken. Since the not-equal condition is to be
- ; found most often, and we would like the inner loop to execute as
- ; quickly as possible, we branch outside the loop's body on the less
- ; frequent occurence. When a match and/or a new maxchars is found
- ; then we branch down to these two routines, process the new
- ; conditions, then jump back up into the main line loop code.
- mov s2ed, bp ; Store end of string 2.
- xor dx, dx ; Initialize maxchars.
- forl3: push di ; Save start of string 2.
- forl4: push di ; Save start of string 2.
- push si ; Save start of string 1.
- mov cx, s2ed ; Set up to calc length of str 1.
- sub cx, di ; Get (length of string 1) - 1.
- inc cx ; Get length of string 1.
- push cx ; Save starting length of str 1.
- repz cmpsb ; Compare strings.
- jz equal ; If equal, skip fixes.
- inc cx ; Back up (CMPSB decs even if <>).
- equal: pop ax ; Get starting length of string 1.
- sub ax, cx ; Get length of common substring.
- jnz newmax ; More than zero chars matched.
- pop si ; Get back start of string 1.
- pop di ; Get back start of string 2.
- reent: inc di ; Do next char no matter what.
- reent2: cmp di, bp ; Are we done with string 2?
- jle forl4 ; No. Do next string compare.
- pop di ; Get back start of string 2.
- inc si ; Next char in string 1 to scan.
- cmp si, bx ; Are we done with string 1?
- jle forl3 ; No. Do next string compare.
- ret ; Return with maxchars in DX.
- ;
- newmax: cmp ax, dx ; Greater than maxchars?
- jg newmx2 ; Yes. Update new maxchars & ptrs.
- pop si ; Get back start of string 1.
- pop di ; Get back start of string 2.
- add di, ax ; Skip past matching chars.
- jmp short reent2 ; Re-enter inner loop.
- newmx2: pop si ; Get back start of string 1.
- pop di ; Get back start of string 2.
- mov cl1, si ; Put beginning match of str 1.
- mov cl2, di ; Put beginning match of str 2.
- mov cx, ax ; Save new maxchars.
- sub ax, dx ; Get delta for str end adjustment.
- sub bx, ax ; Adjust end of string 1.
- sub bp, ax ; Adjust end of string 2.
- mov dx, cx ; New maxchars.
- dec cx ; Set up advance last match char.
- add di, cx ; Advance to last match char str 2.
- mov cr2, di ; Put ending of match of string 2.
- add cx, si ; Advance to last match char str 1.
- mov cr1, cx ; Put ending of match of string 1.
- jmp short reent ; Re-enter inner loop.
- ;
- compare ENDP
- pushst PROC NEAR
- ;
- ; Pass to this routine:
- ; DS:SI l1 (left side of string 1),
- ; BX r1 (right side of string 1),
- ; ES:DI l2 (left side of string 2),
- ; BP r2 (right side of string 2).
- ;
- mov cx, bp ; Save r2.
- mov bp, stcknum ; Get # entries on stack.
- shl bp, 1 ; Multiply by 2 for words.
- mov [bp+ststr1l], si ; Put left side string 1 on stack.
- mov [bp+ststr1r], bx ; Put right side string 1 on stack.
- mov [bp+ststr2l], di ; Put left side string 2 on stack.
- mov [bp+ststr2r], cx ; Put right side string 2 on stack.
- inc stcknum ; Add 1 to number stack entries.
- mov bp, cx ; Restore r2.
- ret ; Return.
- ;
- pushst ENDP
- ;
- ;
- popst PROC NEAR
- ;
- ; Pass to this routine:
- ; DS:SI l1 (left side of string 1),
- ; BX r1 (right side of string 1),
- ; ES:DI l2 (left side of string 2),
- ; BP r2 (right side of string 2).
- ;
- dec stcknum ; Point to last entry in stack.
- mov bp, stcknum ; Get # entries on stack.
- shl bp, 1 ; * 2 for words.
- mov si, [bp+ststr1l] ; Restore left string 1 from stk.
- mov bx, [bp+ststr1r] ; Restore right string 1 from stk.
- mov di, [bp+ststr2l] ; Restore left string 2 from stk.
- mov bp, [bp+ststr2r] ; Restore right string 2 from stk.
- ret ; Return.
- ;
- popst ENDP
- ;
- ;
- Code ENDS
- ;
- END ; of file SimilF.Asm.
-