home *** CD-ROM | disk | FTP | other *** search
- ; This code performs the core operation that computes the MD5
- ; message digest of a bunch of longwords. MD5 is defined over
- ; longwords; when it must be performed over smaller units (bytes
- ; or bits), they are packed into longwords little-endian.
- ; The byteReverse function swaps the bytes around in a buffer of
- ; longwords for this purpose.
-
- ; You start with a 4-long buffer initialized to the magic values
- ; $67452301, $efcdab89, $98badcfe, $10325476
- ; And then, for each 16-longword chunk of input, the Transform()
- ; function is called, which alters the 4-longword buffer in-place
- ; based on the next 16 longwords of input.
- ;
- ; At the end of the input, it is padded with a 1 bit, and then as
- ; many 0 bits as are needed to come 2 longwords short of a full
- ; 16-longword chunk. The last two longwords are a 64-bit count of the
- ; number of bits in the input, exclusive of padding, least signficant
- ; longword first.
- ;
- ; After that trailer has been fed through Transform(), the contents
- ; of the buffer are the MD5 checksum. If they must be expressed as bytes,
- ; they should be presented in little-endian order.
-
- ; Byte-swap a buffer full of longs
-
- xdef _byteReverse
- _byteReverse:
- ; args: buf, longs
- move.l 4(sp),a0 ; Buf
- move.w 10(sp),d0 ; Longs
- subq.w #1,d0 ; We assume this is never 0 to start with
- revloop:
- move.l (a0),d1
- ror.w #8,d1
- swap d1
- ror.w #8,d1
- move.l d1,(a0)+
-
- dbra d0,revloop
-
- rts
-
- ; Macros needed by the MD5 transformation code
-
- ; rotl r1,bits - rotate register r1 left the given number of bits
- rotl macro ; args: reg, bits
- ifc \2,'' ; = 0
- mexit
- endc
- ifle \2-8 ; <= 8
- rol.l #\2,\1
- mexit
- endc
- ifge \2-24 ; >= 24
- ror.l #32-\2,\1
- mexit
- endc
- swap \1
- iflt \2-16 ; < 16
- ror.l #16-\2,\1
- mexit
- endc
- ifgt \2-16 ; > 16
- rol.l #\2-16,\1
- mexit
- endc
- endm
- ; Cycles taken:
- ; 0 - 0 4 - 16 8 - 24 12 - 20 16 - 4 20 - 20 24 - 24 28 - 16
- ; 1 - 10 5 - 18 9 - 26 13 - 18 17 - 14 21 - 22 25 - 22 29 - 14
- ; 2 - 12 6 - 20 10 - 24 14 - 16 18 - 16 22 - 24 26 - 20 30 - 12
- ; 3 - 14 7 - 22 11 - 22 15 - 14 19 - 18 23 - 26 27 - 18 31 - 10
- ; MD5 uses 4 rotations each in the amounts 4, 5, 6, 7, 9, 10, 11, 12,
- ; 14, 15, 16, 17, 20, 21, 22, 23. The total timing for each of
- ; those is 16+18+20+22 + 26+24+22+20 + 16+14+4+14 + 20+22+24+26 cycles,
- ; 4*19 + 4*23 + 4*12 + 4*23 = 4 * 77 = 308 cycles, or 1232 for the 4.
-
- ; Macros to compute the four boolean functions used in MD5
- ; Each one takes three arguments (x, y, z) and returns a result in d0.
-
- f1 macro ; d0 = (x & y) | (~x & z)
- move.l \3,d0
- eor.l \2,d0
- and.l \1,d0
- eor.l \3,d0
- endm ; 4+8+8+8 = 28 cycles
-
- f2 macro ; d0 = (x & z) | (x & ~z)
- f1 \3,\1,\2
- endm ; 28 cycles
-
- f3 macro ; d0 = x ^ y ^ z
- move.l \1,d0
- eor.l \2,d0
- eor.l \3,d0
- endm ; 4+8+8 = 20 cycles
-
- f4 macro ; d0 = y ^ (x | ~z)
- move.l \3,d0
- not.l d0
- or.l \1,d0
- eor.l \2,d0
- endm ; 4+6+8+8 = 26 cycles
-
- ; The total time for these steps is 16 times (28+28+20+26) = 16 * 102 = 1632
- ; cycles.
-
- ; The basic step in the MD5 computation
- ; Note that the current code does not use the constant argument,
- ; getitng it from a table instead. The constants have been left in
- ; the source because it's a real pain in the ass to type them all
- ; in correctly if changing this macro ever becomes a good idea.
-
- md5step macro ; args: f, w, x, y, z, const, input, shift
- \1 \3,\4,\5 ; f(x, y, z) in d0
- add.l (a0)+,d0
- add.l \7,d0
- add.l d0,\2 ; w += f(x, y, z) + const + input
- rotl \2,\8 ; w = ROTL(w, shift)
- add.l \3,\2 ; w += x
- endm ; f + 14 + 6 + data + 8 + rot + 8 = 36 + f + data + rot cycles
- ; In most cases, the data cost is 12 cycles (long with 16-bit offset);
- ; values will be subtracted below where it's less.
- ; So that's a total of 48 cycles, times 64 is 3072 cycles.
-
- ; Okay, the actual code (finally):
-
- xdef _Transform2
- _Transform2:
- ; args: uint32 buf[4], uint32 in[16]
- move.l 4(sp),a0 ; buf 16
- move.l 8(sp),a1 ; in 16 32
- movem.l d2-d4,-(sp) ; 32 64
-
- movem.l (a0),d1-d4 ; 44 108
- lea md5table(pc),a0 ; 8 116
-
- md5step f1,d1,d2,d3,d4,$d76aa478,(a1)+,7
- md5step f1,d4,d1,d2,d3,$e8c7b756,(a1)+,12
- md5step f1,d3,d4,d1,d2,$242070db,(a1)+,17
- md5step f1,d2,d3,d4,d1,$c1bdceee,(a1)+,22
- md5step f1,d1,d2,d3,d4,$f57c0faf,(a1)+,7
- md5step f1,d4,d1,d2,d3,$4787c62a,(a1)+,12
- md5step f1,d3,d4,d1,d2,$a8304613,(a1)+,17
- md5step f1,d2,d3,d4,d1,$fd469501,(a1)+,22
- md5step f1,d1,d2,d3,d4,$698098d8,(a1)+,7
- md5step f1,d4,d1,d2,d3,$8b44f7af,(a1)+,12
- md5step f1,d3,d4,d1,d2,$ffff5bb1,(a1)+,17
- md5step f1,d2,d3,d4,d1,$895cd7be,(a1)+,22
- md5step f1,d1,d2,d3,d4,$6b901122,(a1)+,7
- md5step f1,d4,d1,d2,d3,$fd987193,(a1)+,12
- md5step f1,d3,d4,d1,d2,$a679438e,(a1)+,17
- md5step f1,d2,d3,d4,d1,$49b40821,(a1),22
- ; Subtract 4*16 = 64 cycles for no offsets
-
- md5step f2,d1,d2,d3,d4,$f61e2562,4*1-60(a1),5
- md5step f2,d4,d1,d2,d3,$c040b340,4*6-60(a1),9
- md5step f2,d3,d4,d1,d2,$265e5a51,4*11-60(a1),14
- md5step f2,d2,d3,d4,d1,$e9b6c7aa,4*0-60(a1),20
- md5step f2,d1,d2,d3,d4,$d62f105d,4*5-60(a1),5
- md5step f2,d4,d1,d2,d3,$02441453,4*10-60(a1),9
- md5step f2,d3,d4,d1,d2,$d8a1e681,(a1),14
- md5step f2,d2,d3,d4,d1,$e7d3fbc8,4*4-60(a1),20
- md5step f2,d1,d2,d3,d4,$21e1cde6,4*9-60(a1),5
- md5step f2,d4,d1,d2,d3,$c33707d6,-(a1),9
- md5step f2,d3,d4,d1,d2,$f4d50d87,4*3-56(a1),14
- md5step f2,d2,d3,d4,d1,$455a14ed,4*8-56(a1),20
- md5step f2,d1,d2,d3,d4,$a9e3e905,-(a1),5
- md5step f2,d4,d1,d2,d3,$fcefa3f8,4*2-52(a1),9
- md5step f2,d3,d4,d1,d2,$676f02d9,4*7-52(a1),14
- md5step f2,d2,d3,d4,d1,$8d2a4c8a,-(a1),20
- ; Subtract 4+6 = 10 cycles for no offsets
-
- md5step f3,d1,d2,d3,d4,$fffa3942,4*5-48(a1),4
- md5step f3,d4,d1,d2,d3,$8771f681,4*8-48(a1),11
- md5step f3,d3,d4,d1,d2,$6d9d6122,-(a1),16
- md5step f3,d2,d3,d4,d1,$fde5380c,4*14-44(a1),23
- md5step f3,d1,d2,d3,d4,$a4beea44,4*1-44(a1),4
- md5step f3,d4,d1,d2,d3,$4bdecfa9,4*4-44(a1),11
- md5step f3,d3,d4,d1,d2,$f6bb4b60,4*7-44(a1),16
- md5step f3,d2,d3,d4,d1,$bebfbc70,-(a1),23
- md5step f3,d1,d2,d3,d4,$289b7ec6,4*13-40(a1),4
- md5step f3,d4,d1,d2,d3,$eaa127fa,4*0-40(a1),11
- md5step f3,d3,d4,d1,d2,$d4ef3085,4*3-40(a1),16
- md5step f3,d2,d3,d4,d1,$04881d05,4*6-40(a1),23
- md5step f3,d1,d2,d3,d4,$d9d4d039,4*9-40(a1),4
- md5step f3,d4,d1,d2,d3,$e6db99e5,4*12-40(a1),11
- md5step f3,d3,d4,d1,d2,$1fa27cf8,4*15-40(a1),16
- md5step f3,d2,d3,d4,d1,$c4ac5665,4*2-40(a1),23
- ; subtract 4 cycles for no offsets
-
- md5step f4,d1,d2,d3,d4,$f4292244,4*0-40(a1),6
- md5step f4,d4,d1,d2,d3,$432aff97,4*7-40(a1),10
- md5step f4,d3,d4,d1,d2,$ab9423a7,4*14-40(a1),15
- md5step f4,d2,d3,d4,d1,$fc93a039,4*5-40(a1),21
- md5step f4,d1,d2,d3,d4,$655b59c3,4*12-40(a1),6
- md5step f4,d4,d1,d2,d3,$8f0ccc92,4*3-40(a1),10
- md5step f4,d3,d4,d1,d2,$ffeff47d,(a1)+,15
- md5step f4,d2,d3,d4,d1,$85845dd1,4*1-44(a1),21
- md5step f4,d1,d2,d3,d4,$6fa87e4f,4*8-44(a1),6
- md5step f4,d4,d1,d2,d3,$fe2ce6e0,4*15-44(a1),10
- md5step f4,d3,d4,d1,d2,$a3014314,4*6-44(a1),15
- md5step f4,d2,d3,d4,d1,$4e0811a1,4*13-44(a1),21
- md5step f4,d1,d2,d3,d4,$f7537e82,4*4-44(a1),6
- md5step f4,d4,d1,d2,d3,$bd3af235,(a1),10
- md5step f4,d3,d4,d1,d2,$2ad7d2bb,4*2-44(a1),15
- md5step f4,d2,d3,d4,d1,$eb86d391,4*9-44(a1),21
- ; Subtract 8 cycles for no offsets
-
- ; 116
- move.l 16(sp),a0 ; 16 132
- add.l d1,(a0)+ ; 20 152
- add.l d2,(a0)+ ; 20 172
- add.l d3,(a0)+ ; 20 192
- add.l d4,(a0)+ ; 20 212
-
- movem.l (sp)+,d2-d4 ; 36 248
- rts ; 16 264
-
- ; So, that's a total of 86 cycles subtracted for no offets, from
- ; 264 + 1232 + 1632 + 3072 = 6200 cycles. The final tally is
- ; thus 6114 cycles.
-
- ; A table of the Mysterious Constants used by the MD5 computation.
- md5table:
- dc.l $d76aa478,$e8c7b756,$242070db,$c1bdceee
- dc.l $f57c0faf,$4787c62a,$a8304613,$fd469501
- dc.l $698098d8,$8b44f7af,$ffff5bb1,$895cd7be
- dc.l $6b901122,$fd987193,$a679438e,$49b40821
-
- dc.l $f61e2562,$c040b340,$265e5a51,$e9b6c7aa
- dc.l $d62f105d,$02441453,$d8a1e681,$e7d3fbc8
- dc.l $21e1cde6,$c33707d6,$f4d50d87,$455a14ed
- dc.l $a9e3e905,$fcefa3f8,$676f02d9,$8d2a4c8a
-
- dc.l $fffa3942,$8771f681,$6d9d6122,$fde5380c
- dc.l $a4beea44,$4bdecfa9,$f6bb4b60,$bebfbc70
- dc.l $289b7ec6,$eaa127fa,$d4ef3085,$04881d05
- dc.l $d9d4d039,$e6db99e5,$1fa27cf8,$c4ac5665
-
- dc.l $f4292244,$432aff97,$ab9423a7,$fc93a039
- dc.l $655b59c3,$8f0ccc92,$ffeff47d,$85845dd1
- dc.l $6fa87e4f,$fe2ce6e0,$a3014314,$4e0811a1
- dc.l $f7537e82,$bd3af235,$2ad7d2bb,$eb86d391
-
- end
-
-
- ;;; Crypt2.asm - 68000 assembler versions of heavily-used functions in
- ;;; crypt.c for the IDEA cipher. By Colin Plumb; not part
- ;;; of the original distribution from ETH Zurich
- ;;; isibee.ethz.ch:pub/simpl/idea.V1.0.tar.Z (129.132.38.1)
- ;;;
- ;;; No copyright is claimed on this code; it is in the public domain.
- ;;; It would be nice if you could give me credit if you use it, but
- ;;; the legal definition of "public domain" states that there is no
- ;;; legal obligation to do so.
-
- ;;; An earlier version of this code is circulating with a copyright notice
- ;;; on it; this was due to my mailing out an earlier version without any
- ;;; indication of the intended status, so the recipient assumed I claimed
- ;;; copyright. It is, however, incorrect. And if you want my e-mail
- ;;; address, the best one to use for the long term is <colin@nyx.cs.du.edu>.
-
- ;;; Now, the original code referenced above provides various encryption
- ;;; modes, in particular feedback and chaining modes which have a
- ;;; lag of greater than 1. These are useful only in very specialized
- ;;; applications, and for general use the lag factor of 1 is strongest.
- ;;; There is no reason for a typical application package to use anything
- ;;; else. In particular, the xpkidea.library which associates a "quality"
- ;;; of 100% with a lag factor of 25 is downright dangerous; typical users
- ;;; aren't going to be able to come up with 25 fully random initial vectors.
-
- ;;; Electronic Code Book (ECB) mode, where you pass each block of plaintext
- ;;; through the cipher system, is not very good and should be avoided, as
- ;;; repeated plaintext results in repeated ciphertext, which gives information
- ;;; away. Output Feedback (OFB) mode is vulnerable if you reuse keys and
- ;;; initial vectors. In particular, if you encrypt two uncompressed ASCII
- ;;; messages with the same key and initial vector, it is not difficult
- ;;; (even for a normal user, much less an intelligence agency!) to first,
- ;;; read both messages up to the minimum of their two lengths, and second,
- ;;; read any other shorter message sent with the same key and initial vector.
-
- ;;; The only modes that you should consider are Cipher Block Chaining (CBC)
- ;;; and Cipher Feedback (CFB) modes. CFB has the advantage that the text
- ;;; does not have to be a multiple of a block long, and so does not expand
- ;;; the text. It also does not require a decryption routine (or the moral
- ;;; equyivalent, InvertIdeaKey, below). It is recommended for general use.
-
- ;;; Where the plaintext blocks are the array x[i] and the ciphertext are y[i],
- ;;; IDEA encryption is denoted by IDEA(plain,key) and decryption by
- ;;; IIDEA(cipher,key), the modes are as follows:
-
- ;;; CBC encrypt: y[i] = IDEA(x[i]^y[i-1], key)
- ;;; CBC decrypt: x[i] = y[i-1] ^ IIDEA(y[i], key)
- ;;; CFB encrypt: y[i] = x[i] ^ IDEA(y[i-1], key)
- ;;; CFB decrypt: x[i] = y[i] ^ IDEA(y[i-1], key)
-
- ;;; Both modes require initial vectors y[-1]. This can be a random number,
- ;;; or you can set it to a constant (e.g. 0) and make x[0] be a random
- ;;; number. This saves you the effort of storing it somewhere else.
-
- ;;; For something like a disk or file encryptor, set it to some unique
- ;;; value, like the inode number or disk block number. This way, identical
- ;;; blocks will still be encrypted differently.
-
- ;;; Folks, this is a pretty darn good encryption algorithm. Proving
- ;;; that is a lot of work. Unless you understand that proof, you are
- ;;; not competent to "improve" it. In particular, don't change the
- ;;; way a 128-bit key is expanded to a 832-bit working key. And don't
- ;;; think about "fixing" the reversed addition in the last step. That
- ;;; significantly weakens the cipher. If you don't understand, then
- ;;; just take it as evidence that you shouldn't mess with things you
- ;;; don't understand.
-
- ;;; And don't forget that english text has an average entropy of less
- ;;; than 2 bits per character. That is, you have to compress ASCII by
- ;;; 4:1 before it is totally random and can't be further compressed with
- ;;; sufficient effort. So if you want to use a password, it should be 64
- ;;; bytes long, not 16. (It can be shorter if you use a hard-to-remember
- ;;; string with lots of nonsense words, FUnNy capitalisation or @dd
- ;;; characters.) You have to hash it down to 128 random bits. An
- ;;; algorithm for that is not supplied. Exclusive-oring 16-byte blocks
- ;;; together does not work well.
-
- ;;; Remember, 2^128 is a very large number. 38.5 decimal digits.
- ;;; 340,282,366,920,938,463,463,374,607,431,768,211,456.
- ;;; 340 thousand million billion trillion million. Or, equivalently,
- ;;; 340 million million million million million million. That's
- ;;; a bit over 27 totally random letters. But a standard pass phrase,
- ;;; even a wierd one, is far from totally random. For starters, it's
- ;;; usually easy to figure out where the blanks go even if they're
- ;;; stripped out. "scramblepatternpipelineyellow" is 29 characters
- ;;; long, but nothing like random. A random string of 27 letters
- ;;; looks like "xbrebwcrajhozfepyclvgkchqro". This was a standard
- ;;; pseudo-random number generator; it may look random to you, but
- ;;; it's not good enough for high-security applications.
-
- ;;; (Okay, so the cycle counts indicate an unhealthy obsession.)
- ;;; Multiply takes, according to the 6th edition Motorola MC68000
- ;;; manual, 38+2n cycles, where n is the number of bits set in the
- ;;; source effective address. Since the data are random, on average
- ;;; 8 of the 16 bits will be set, and multiplies will take 54 cycles
- ;;; on average.
-
- nofKeyPerRound equ 6 ; number of used keys per round
- nofRound equ 8 ; number of rounds
-
- ; Multiply two numbers in the range of 1..0x10000, modulo 0x10001.
- ; 0x10000 is represented as 0.
-
- Mult macro ; src1, src2, dest, neither source is 0
- ; dest may equal src2, but not src1 (which is zeroed)
- mulu %2,%1 ; 54
- move.w %1,%3 ; 4
- swap %1 ; 4 %1 now holds high part
- sub.w %1,%3 ; 4
- moveq #0,%1 ; 4 does not affect X flag
- addx.w %1,%3 ; 4
-
- endm ; 74
- ; Code to handle the cases where one argument is 0 = 0x10000 = -1.
- ; Negate the other modulo 0x10001, giving 0x10001 - x = 1 - x.
- ; In-place and out-of-place versions
-
- Invert1 macro ; dest
- neg.w %1 ; 4
- addq.w #1,%1 ; 4
- endm ; 8
-
- Invert2 macro ; src, dest
- moveq #1,%2 ; 4
- sub.w %1,%2 ; 4
- endm ; 8
-
- ; Externally callable version (also a usage example)
-
- xdef _Mul
- xdef _Mul2
- _Mul:
- _Mul2:
- move.w 4(sp),d0 ; 12
- beq.s Mul_a_is_0 ; 8
- move.w 6(sp),d1 ; 12
- beq.s Mul_b_is_0 ; 8
- Mult d1,d0,d0 ; 54 (based on average of 8 bits set)
- rts ; 10 = 104 total
- Mul_a_is_0:
- Invert2 6(sp),d0
- rts
- Mul_b_is_0:
- Invert1 d0
- rts
- MulEnd:
- ;; The cases where a or b are zero are so rare they don't affect the average
- ;; time significantly.
-
-
- ; encryption and decryption algorithm IDEA
-
- ; void Idea(u_int16 dataIn[4], u_int16 dataOut[4], u_int16 key[52])
-
- ; This performs the core IDEA encryption. It leaves no sensitive
- ; information behind in memory or registers, except for a temporary
- ; value from the last KeyMult in d0, which is probably safe to assume will
- ; quickly be trashed.
-
- ; Modifying this code is NOT recommended for the faint of heart.
-
- ; This code is, in a word, *tight*. Note in particular the use of the low
- ; halves of d2-d7 only, to eliminate the overhead of saving and restoring
- ; their high halves, the use of d3 as a constant zero source, and a1
- ; doubling as a loop counter. Also note that the exceptional cases in the
- ; KeyMult macro have been taken entirely out of line, and there has been
- ; some code motion to between KeyMult 5 and 6 so that the branches stay
- ; in short range (which they are, barely: offsets of +126 and -126 bytes
- ; both occur).
-
- ; Unrolling this loop any further would result in the branches going out
- ; of short branch range, which would cost more in instruction fetch time
- ; than would be saved by the unrolling. It would also take it out of a
- ; 68020's cache.
-
- ; Register usage:
- ; d0.l - KeyMult temp
- ; d1.l - Constant zero
- ; d2.w - s2
- ; d3.w - s1
- ; d4.w - x0
- ; d5.w - x2 <- Note reversal
- ; d6.w - x1
- ; d7.w - x3
- ; a0.l - dataIn, dataOut and key pointer limit
- ; a1.l - Pointer to current location in expanded key
-
- ; KeyMult: Multiply value in register by next available key (pointed at by a1).
- ; Requires corresponding MultAux code nearby.
-
- KeyMult macro ; %1 = number, %2 = register
- move.w %2,d0 ; 4
- beq Mul%1_a_is_0 ; 8
- mulu (a1)+,d0 ; 58 (42 if 0)
- beq.s Mul%1_b_is_0 ; 8
- move.w d0,%2 ; 4
- swap d0 ; 4
- sub.w d0,%2 ; 4
- addx.w d1,%2 ; 4
- Mul%1_done:
- endm ; 94 total
-
- ; Note: the following cases should occur 1/65536 of the time, however the
- ; second, if it occurs, will occur for every block encrypted using the
- ; same key, so it is important that it not be significantly slower than the
- ; common case. (It's faster, so all is well.)
-
- MultAux macro ; %1 = number, %2 = register
- Mul%1_a_is_0: ; 14 move and branch overhead
- move.w #1,%2 ; 8
- sub.w (a1)+,%2 ; 8
- bra.s Mul%1_done ; 10 = 40 total for this path
-
- Mul%1_b_is_0: ; 64 overhead to get here
- Invert1 %2 ; 8
- bra.s Mul%1_done ; 10 = 82 total for this path
- endm
-
- MultAux 1,d4
- MultAux 2,d7
- MultAux 3,d5
- MultAux 4,d6
-
- MultAux 5,d4
-
- xdef _Idea
- xdef _Idea2
- _Idea:
- _Idea2:
- move.l 4(sp),a0 ; 16 dataIn
- move.l 12(sp),a1 ; 16 32 key
-
- ; Since I am scrupulously careful never to touch the high halves of
- ; d2..d7, I only need to save the low halves.
- movem.w d2-d7,-(sp) ; 32 68
-
- clr.w d1 ; 4 72 Constant 0
- ; movem.w <mem>,<regs> here would sign-extend.
- move.w (a0)+,d4 ; 8 80
- move.w (a0)+,d6 ; 8 88
- move.w (a0)+,d5 ; 8 96
- move.w (a0),d7 ; 8 104
- ; When a1 points here, stop. 96 = 2 bytes/key * 6 keys/round * 8 rounds
- lea 96(a1),a0 ; 8 112
-
- CryptLoop:
- KeyMult 1,d4 ; 94 x0 *= *key++;
- add.w (a1)+,d6 ; 8 102 x1 += *key++;
- add.w (a1)+,d5 ; 8 110 x2 += *key++;
- KeyMult 2,d7 ; 94 204 x3 *= *key++;
-
- move.w d5,d2 ; 4 208 s2 = x2
- eor.w d4,d5 ; 4 212 x2 ^= x0
- KeyMult 3,d5 ; 94 306 x2 *= *key++;
- move.w d6,d3 ; 4 310 s1 = x1;
- eor.w d7,d6 ; 4 314 x1 ^= x3;
- add.w d5,d6 ; 4 318 x1 += x2;
- KeyMult 4,d6 ; 94 412 x1 *= *key++;
- eor.w d6,d4 ; 4 416 x0 ^= x1;
-
- ;; Second unrolled copy. Some of the first loop is postponed until after the
- ;; first KeyMult so that the branches there will be in short range (4 cycles).
- KeyMult 5,d4
-
- add.w d6,d5 ; 4 420 x2 += x1; FIRST LOOP
- eor.w d5,d7 ; 4 424 x3 ^= x2; FIRST LOOP
- eor.w d2,d6 ; 4 428 x1 ^= s2; FIRST LOOP
- eor.w d3,d5 ; 4 432 x2 ^= s1; FIRST LOOP
-
- add.w (a1)+,d6
- add.w (a1)+,d5
-
- ;; This code logically belongs after the KeyMult, but is moved up so that
- ;; the branches to MultAux will stay in short branch range.
- move.w d5,d2
- eor.w d4,d5
- move.w d6,d3
-
- KeyMult 6,d7
-
- KeyMult 7,d5
- eor.w d7,d6
- add.w d5,d6
- KeyMult 8,d6
- add.w d6,d5
- eor.w d6,d4
- eor.w d5,d7
- eor.w d2,d6
- eor.w d3,d5 ; 432 864
-
- cmp.l a0,a1 ; 6 870
- bne CryptLoop ; 10 880
- ; 2 114 -> From top, if expired
- KeyMult 9,d4 ; 94 208 x0 *= *key++;
- add.w (a1)+,d5 ; 8 216 x2 += *key++;
- add.w (a1)+,d6 ; 8 224 x1 += *key++;
- KeyMult 10,d7 ; 94 318 x3 *= *key++;
-
- move.l 20(a7),a0 ; 16 334 6 regs(12) + retaddr(4) + arg 1(4)
- movem.w d4-d7,(a0) ; 24 358 Store x0,x2,x1,x3
- ; movem.w <mem>,<regs> sign-extends, which is not what we want...
- move.w (sp)+,d2 ; 8 366
- move.w (sp)+,d3 ; 8 374
- move.w (sp)+,d4 ; 8 382
- move.w (sp)+,d5 ; 8 390
- move.w (sp)+,d6 ; 8 398
- move.w (sp)+,d7 ; 8 406
- rts ; 16 422
-
- MultAux 6,d7
- MultAux 7,d5
- MultAux 8,d6
-
- MultAux 9,d4
- MultAux 10,d7
-
- ; 422 + 4 * 880 = 3942 cycles per block, or 492.75 cycles per byte.
- ; At 7159090 Hz, that's 1816.11 blocks per second, or 14528.8
- ; bytes per second.
-
- ; End of Idea
-
- ;; For reference, here's the original source:
-
- ; void Idea(u_int16 *dataIn, u_int16 *dataOut, u_int16 *key)
- ; {
- ; register u_int16 round, x0, x1, x2, x3, s1, s2;
- ;
- ; x0 = *dataIn++; x1 = *dataIn++; x2 = *dataIn++; x3 = *dataIn;
- ;
- ; for (round = nofRound; round > 0; round--) {
- ; x0 = Mul(x0, *key++);
- ; x1 += *key++;
- ; x2 += *key++;
- ; x3 = Mul(x3, *key++);
- ;
- ; s1 = x1; s2 = x2;
- ; x2 ^= x0;
- ; x2 = Mul(x2, *key++);
- ; x1 ^= x3;
- ; x1 += x2;
- ; x1 = Mul(x1, *key++);
- ; x2 += x1;
- ;
- ; x0 ^= x1;
- ; x3 ^= x2;
- ;
- ; x1 ^= s2;
- ; x2 ^= s1;
- ; }
- ; *(dataOut++) = Mul(x0, *key++);
- ; *(dataOut++) = x2 + *key++;
- ; *(dataOut++) = x1 + *key++;
- ; *(dataOut) = Mul(x3, *key);
- ; } /* Idea */
-
- ;;;
- ;;; This expands a user key to a full idea key, copying the 128-bit block
- ;;; with a 25-bit rotation each time.
- ;;;
-
- ; void ExpandUserKey(uint16 userKey[8], uint16 key[52])
- ; Expand an 8-word (16-byte) user key into a 52-word (104-byte)
- ; working key, by rotating the 128-bit big-endian key left 25
- ; bits and storing the result in the next 8 words, successively.
-
- ; 1306 cycles total = (@ 7159090 Hz) 182.425 us
-
- ; Rotate a/b/c/d left 25 bits, by rotating right 7 and shifting
- ; down 1 register
-
- shdown macro ; a, b, c, d, e, temp, const
- ; aaaa bbbb cccc dddd ____ ____
- move.l %1,%5 ; aaaa bbbb cccc dddd aaaa ____ 4
- and.w %7,%1 ; __0a bbbb cccc dddd aaaa ____ 4 8
- eor.w %1,%5 ; __0a bbbb cccc dddd aaa0 ____ 4 12
-
- move.w %4,%6 ; __0a bbbb cccc dddd aaa0 __dd 4 16
- and.w %7,%6 ; __0a bbbb cccc dddd aaa0 __0d 4 20
- eor.w %6,%4 ; __0a bbbb cccc ddd0 aaa0 __0d 4 24
- or.w %6,%5 ; __0a bbbb cccc ddd0 aaad __0d 4 28
- ror.l #7,%5 ; __0a bbbb cccc ddd0 daaa __0d 22 50
-
- move.w %3,%6 ; __0a bbbb cccc ddd0 daaa __cc 4 54
- and.w %7,%6 ; __0a bbbb cccc ddd0 daaa __0c 4 58
- eor.w %6,%3 ; __0a bbbb ccc0 ddd0 daaa __0c 4 62
- or.w %6,%4 ; __0a bbbb ccc0 dddc daaa __0c 4 66
- ror.l #7,%4 ; __0a bbbb ccc0 cddd daaa __0c 22 88
-
- move.w %2,%6 ; __0a bbbb ccc0 cddd daaa __bb 4 92
- and.w %7,%6 ; __0a bbbb ccc0 cddd daaa __0b 4 96
- eor.w %6,%2 ; __0a bbb0 ccc0 cddd daaa __0b 4 100
- or.w %6,%3 ; __0a bbb0 cccb cddd daaa __0b 4 104
- ror.l #7,%3 ; __0a bbb0 bccc cddd daaa __0b 22 126
-
- or.w %1,%2 ; __0a bbba bccc cddd daaa __0b 4 130
- ror.l #7,%2 ; __0a abbb bccc cddd daaa __0b 22 152
-
- ; Rotate a/b/c/d left 25 bits, result in e/f/a/b.
- endm
-
- shup macro ; e, f, a, b, c, d, const
- ; ____ ____ aaaa bbbb cccc dddd
- move.l %4,%1 ; bbbb ____ aaaa bbbb cccc dddd 4
- and.w %7,%4 ; bbbb ____ aaaa __0b cccc cccc 4 8
- eor.w %4,%1 ; bbb0 xxxx aaaa __0b cccc dddd 4 12
-
- move.l %5,%2 ; bbb0 cccc aaaa __0b cccc dddd 4 16
- and.w %7,%5 ; bbb0 cccc aaaa __0b __0c dddd 4 20
- eor.w %5,%2 ; bbb0 ccc0 aaaa __0b __0c dddd 4 24
- or.w %4,%2 ; bbb0 cccb aaaa __0b __0c dddd 4 28
- ror.l #7,%2 ; bbb0 bccc aaaa __0b __0c dddd 22 50
-
- move.l %3,%4 ; bbb0 bccc aaaa aaaa __0c dddd 4 54
- and.w %7,%3 ; bbb0 bccc __0a aaaa __0c dddd 4 58
- eor.w %3,%4 ; bbb0 bccc __0a aaa0 __0c dddd 4 62
- or.w %3,%1 ; bbba bccc __0a aaa0 __0c dddd 4 66
- ror.l #7,%1 ; abbb bccc __0a aaa0 __0c dddd 22 88
-
- move.l %6,%3 ; abbb bccc dddd aaa0 __0c dddd 4 92
- and.w %7,%6 ; abbb bccc dddd aaa0 __0c __0d 4 96
- eor.w %6,%3 ; abbb bccc ddd0 aaa0 __0c __0d 4 100
- or.w %5,%3 ; abbb bccc dddc aaa0 __0c __0d 4 104
- ror.l #7,%3 ; abbb bccc cddd aaa0 __0c __0d 22 126
-
- or.w %6,%4 ; abbb bccc cddd aaad __0c __0d 4 130
- ror.l #7,%4 ; abbb bccc cddd daaa __0c __0d 22 152
-
- endm
-
- xdef _ExpandUserKey
- xdef _ExpandUserKey2
- _ExpandUserKey:
- _ExpandUserKey2:
- move.l 4(sp),a0 ; UserKey 16
- move.l 8(sp),a1 ; Key 16 32
- movem.l d2-d5,-(sp) ; 40 72
- movem.l (a0),d0-d3 ; Fetched into d0-d3 44 116
- move.l d7,a0 ; 4 120
- moveq #127,d7 ; 4 124
-
- movem.l d0-d3,(a1) ; Store words 0..7 40 164
-
- shdown d0,d1,d2,d3,d4,d5,d7 ; 152 318
- movem.l d1-d4,16(a1) ; Store words 8..15 44 360
-
- shdown d1,d2,d3,d4,d5,d0,d7 ; 152 512
- movem.l d2-d5,32(a1) ; store words 16..23 44 556
-
- shup d0,d1,d2,d3,d4,d5,d7 ; 152 708
- movem.l d0-d3,48(a1) ; Store words 24..31 44 752
-
- shdown d0,d1,d2,d3,d4,d5,d7 ; 152 904
- movem.l d1-d4,64(a1) ; Store words 32..39 44 948
-
- shdown d1,d2,d3,d4,d5,d0,d7 ; 152 1100
- movem.l d2-d5,80(a1) ; store words 40..47 44 1144
-
- ; We only need two longwords this iteration
- ; (d2 d3 d4 d5)
-
- ; aaaa bbbb cccc dddd
- and.w d7,d2 ; __0a bbbb cccc dddd 4 1148
-
- move.w d3,d5 ; __0a bbbb cccc __bb 4 1152
- and.w d7,d5 ; __0a bbbb cccc __0b 4 1156
- eor.w d5,d3 ; __0a bbb0 cccc __0b 4 1160
- or.w d2,d3 ; __0a bbba cccc __0b 4 1164
- ror.l #7,d3 ; __0a abbb cccc __0b 22 1186
-
- not.w d7 ; 4 1190
- and.w d7,d4 ; __0a abbb ccc0 __0b 4 1194
- or.w d5,d4 ; __0a abbb cccb __0b 4 1198
- ror.l #7,d4 ; __0a abbb bccc __0b 22 1220
-
- movem.l d3-d4,96(a1) ; store words 48..51 28 1248
-
- move.l a0,d7 ; 4 1252
- movem.l (sp)+,d2-d5 ; 44 1296
- rts ; 10 1306
-
- ;;;
- ;;; This section inverts keys
- ;;; First, the code to find a multiplicative inverse
- ;;;
-
- ; Set up the registers for InvCore
- InvPrep macro ; %1 = in.l, %2 = temp.l, %3 = out.w
- move.l %1,%2
- move.w %2,%3
-
- endm
-
- ; Note: this expects %1 to be set to the input value (high 16 bits zero!),
- ; and be non-zero, %2 to be set to 0x10001, and %3 to be set to 1. It
- ; returns the multiplicative inverse of %1 in %3. It does not affect the
- ; high words of %1, %3 and %4. %2 and %5 have all 32 bits trashed.
-
- InvCore macro ; %1 = in.l, %2 = mod.l, %3 = out.w,
- ; %4 = temp.w, %5 = temp.l, %6 = #1.w
- ;iter1\@:
- divu %1,%2
- bvs Out1\@ ; If ovfl, input=1, so inverse = 1 (in %3).
- move.w %2,%4 ; t0 = t0 + t1 * q = 0 + 1 * q = q
- ; mulu %3,%5 ; If we had copied to d4, we would have done this...
- ; add.w %5,%4 ; But since we know d0=1 and d2=0, we can skip it
- clr.w %2
- swap %2
- ; cmp.w #1,%2
- cmp.w %3,%2 ; Since %3 = 1
- beq Out2\@ ; remainder = 1, so quit - inverse in -%5
-
- InvLoop\@:
- ;iter2\@:
- divu %2,%1
- move.w %1,%5
- mulu %4,%5
- add.w %5,%3
- clr.w %1
- swap %1
- cmp.w %6,%1
- beq Out1\@ ; remainder = 1, so quit - inverse in %3
- ;iter3\@:
- divu %1,%2
- move.w %2,%5
- mulu %3,%5
- add.w %5,%4
- clr.w %2
- swap %2
- cmp.w %6,%2
- ; beq Out2\@ ; remainder = 1, so quit - inverse in -%4
- ; bra InvLoop\@
- bne InvLoop\@
-
- Out2\@: ; Inverse in -%4
- move.w %6,%3
- sub.w %4,%3
- Out1\@: ; Inverse in %3
-
- endm
-
- InputZero:
- moveq #0,d0
- rts
-
- xdef _MulInv
- xdef _MulInv2
- _MulInv:
- _MulInv2:
- moveq #0,d1
- move.w 4(sp),d1
- beq InputZero
- move.l d2,a0
- move.w d3,-(sp)
- move.l d4,a1
-
- moveq #1,d0
- move.l #$10001,d3
-
- InvCore d1,d3,d0,d2,d4,#1
-
- move.l a1,d4
- move.w (sp)+,d3
- move.l a0,d2
-
- rts
- MulInvEnd:
-
- ; Because it's legal to invert a key on top of itself, we push the inverted
- ; key onto the stack and then copy it to the destination at the end.
-
- xdef _InvertIdeaKey:
- xdef _InvertIdeaKey2:
- _InvertIdeaKey:
- _InvertIdeaKey2:
- move.l 4(sp),a0 ; Source
- move.l 8(sp),a1 ; Dest pointer
- movem.l d2-d7,-(sp)
-
- move.l #$10001,d7
-
- move.l (a0)+,d0
- move.l (a0)+,d1
-
- moveq #0,d3 ; Clear high half of d3
- move.w d1,d3
- beq IIKzero1
- InvPrep d7,d4,d1
- InvCore d3,d4,d1,d5,d6,d7
- IIKzero1:
- swap d1
- neg.w d1
- swap d1
-
- move.l d1,-(sp)
-
- swap d0
- move.w d0,d3
- beq IIKzero2
- InvPrep d7,d4,d0
- InvCore d3,d4,d0,d5,d6,d7
- IIKzero2:
- swap d0
- neg.w d0
-
- move.l d0,-(sp)
-
- moveq #nofRound-2,d2
- bra IIKloop
-
- IIKzero3: ; Located here to be in short branch range
- ; (saves 4*7 = 28 cycles; costs one branch = 10 cycles)
- ; d1.w == 0; d0.w is unknown
- move.w d0,d4
- beq IIKzero3a
- InvPrep d7,d5,d1
- InvCore d4,d5,d1,d0,d6,d7
- clr.w d0
- bra IIKzero3a
-
- IIKzero4:
- ; d1.w != 0; d0.w == 0
- clr.w d1
- bra IIKzero4a
-
- IIKloop:
- move.l (a0)+,-(sp)
- move.l (a0)+,d0
- move.l (a0)+,d1
-
- neg.w d0
- swap d0
- move.w d1,d3
- beq IIKzero3
- move.w d0,d4
- beq IIKzero4
- InvPrep d7,d5,d1
- InvCore d4,d5,d1,d0,d6,d7
- IIKzero4a:
- InvPrep d7,d4,d0
- InvCore d3,d4,d0,d5,d6,d7
- IIKzero3a:
- move.l d0,-(sp)
- swap d1
- neg.w d1
- move.l d1,-(sp)
-
- dbra d2,IIKloop
-
- move.l (a0)+,d2
- move.l (a0)+,d0
- move.l (a0)+,d1
-
- move.w d1,d3
- beq IIKzero5
- InvPrep d7,d4,d1
- InvCore d3,d4,d1,d5,d6,d7
- IIKzero5:
- swap d1
- neg.w d1
- swap d1
-
- swap d0
- move.w d0,d3
- beq IIKzero6
- InvPrep d7,d4,d0
- InvCore d3,d4,d0,d5,d6,d7
- IIKzero6:
- swap d0
- neg.w d0
-
- ; move.l d0,(a1)+ ; Install these last three longwords...NOT
- ; move.l d1,(a1)+ ; Merge this with the movem below.
- ; move.l d2,(a1)+
- ; 104 bytes of key = 26 longs = 9+9+8
- movem.l (sp)+,d3-d7/a0
- movem.l d0-d7/a0,(a1) ; 9 longs
- movem.l (sp)+,d0-d7/a0
- movem.l d0-d7/a0,36(a1) ; 9 longs
- movem.l (sp)+,d0-d7
- movem.l d0-d7,72(a1) ; 8 longs
-
- movem.l (sp)+,d2-d7 ; Retrieve registers
- rts
-
- InvertIdeaKeyEnd:
-
- ;; And for reference, the original code...
-
- ; void InvertIdeaKey( u_int16 *key, u_int16 *invKey)
- ; {
- ; register int i;
- ; KeyT(dk);
- ;
- ; dk[nofKeyPerRound * nofRound + 0] = MulInv(*key++);
- ; dk[nofKeyPerRound * nofRound + 1] = (addMod - *key++) & ones;
- ; dk[nofKeyPerRound * nofRound + 2] = (addMod - *key++) & ones;
- ; dk[nofKeyPerRound * nofRound + 3] = MulInv(*key++);
- ; for (i = nofKeyPerRound * (nofRound - 1); i >= 0; i -= nofKeyPerRound) {
- ; dk[i + 4] = *key++;
- ; dk[i + 5] = *key++;
- ; dk[i + 0] = MulInv(*key++);
- ; if (i > 0) {
- ; dk[i + 2] = (addMod - *key++) & ones;
- ; dk[i + 1] = (addMod - *key++) & ones;
- ; } else {
- ; dk[i + 1] = (addMod - *key++) & ones;
- ; dk[i + 2] = (addMod - *key++) & ones;
- ; }
- ; dk[i + 3] = MulInv(*key++);
- ; }
- ; for (i = 0; i < keyLen; i++)
- ; invKey[i] = dk[i];
- ; } /* InvertIdeaKey */
-
-