Meeting Pearls 1

home *** CD-ROM | disk | FTP | other *** search

/ Meeting Pearls 1 / Meeting Pearls Vol 1 (1994).iso / installed_progs / util / pgp / pgpamiga / source / pgp_todo.lha / idea.asm < prev

Wrap

Assembly Source File | 1993-08-07 | 34.3 KB | 964 lines

; This code performs the core operation that computes the MD5 ; message digest of a bunch of longwords. MD5 is defined over ; longwords; when it must be performed over smaller units (bytes ; or bits), they are packed into longwords little-endian. ; The byteReverse function swaps the bytes around in a buffer of ; longwords for this purpose. ; You start with a 4-long buffer initialized to the magic values ; $67452301, $efcdab89, $98badcfe, $10325476 ; And then, for each 16-longword chunk of input, the Transform() ; function is called, which alters the 4-longword buffer in-place ; based on the next 16 longwords of input. ; ; At the end of the input, it is padded with a 1 bit, and then as ; many 0 bits as are needed to come 2 longwords short of a full ; 16-longword chunk. The last two longwords are a 64-bit count of the ; number of bits in the input, exclusive of padding, least signficant ; longword first. ; ; After that trailer has been fed through Transform(), the contents ; of the buffer are the MD5 checksum. If they must be expressed as bytes, ; they should be presented in little-endian order. ; Byte-swap a buffer full of longs xdef _byteReverse _byteReverse: ; args: buf, longs move.l 4(sp),a0 ; Buf move.w 10(sp),d0 ; Longs subq.w #1,d0 ; We assume this is never 0 to start with revloop: move.l (a0),d1 ror.w #8,d1 swap d1 ror.w #8,d1 move.l d1,(a0)+ dbra d0,revloop rts ; Macros needed by the MD5 transformation code ; rotl r1,bits - rotate register r1 left the given number of bits rotl macro ; args: reg, bits ifc \2,'' ; = 0 mexit endc ifle \2-8 ; <= 8 rol.l #\2,\1 mexit endc ifge \2-24 ; >= 24 ror.l #32-\2,\1 mexit endc swap \1 iflt \2-16 ; < 16 ror.l #16-\2,\1 mexit endc ifgt \2-16 ; > 16 rol.l #\2-16,\1 mexit endc endm ; Cycles taken: ; 0 - 0 4 - 16 8 - 24 12 - 20 16 - 4 20 - 20 24 - 24 28 - 16 ; 1 - 10 5 - 18 9 - 26 13 - 18 17 - 14 21 - 22 25 - 22 29 - 14 ; 2 - 12 6 - 20 10 - 24 14 - 16 18 - 16 22 - 24 26 - 20 30 - 12 ; 3 - 14 7 - 22 11 - 22 15 - 14 19 - 18 23 - 26 27 - 18 31 - 10 ; MD5 uses 4 rotations each in the amounts 4, 5, 6, 7, 9, 10, 11, 12, ; 14, 15, 16, 17, 20, 21, 22, 23. The total timing for each of ; those is 16+18+20+22 + 26+24+22+20 + 16+14+4+14 + 20+22+24+26 cycles, ; 4*19 + 4*23 + 4*12 + 4*23 = 4 * 77 = 308 cycles, or 1232 for the 4. ; Macros to compute the four boolean functions used in MD5 ; Each one takes three arguments (x, y, z) and returns a result in d0. f1 macro ; d0 = (x & y) | (~x & z) move.l \3,d0 eor.l \2,d0 and.l \1,d0 eor.l \3,d0 endm ; 4+8+8+8 = 28 cycles f2 macro ; d0 = (x & z) | (x & ~z) f1 \3,\1,\2 endm ; 28 cycles f3 macro ; d0 = x ^ y ^ z move.l \1,d0 eor.l \2,d0 eor.l \3,d0 endm ; 4+8+8 = 20 cycles f4 macro ; d0 = y ^ (x | ~z) move.l \3,d0 not.l d0 or.l \1,d0 eor.l \2,d0 endm ; 4+6+8+8 = 26 cycles ; The total time for these steps is 16 times (28+28+20+26) = 16 * 102 = 1632 ; cycles. ; The basic step in the MD5 computation ; Note that the current code does not use the constant argument, ; getitng it from a table instead. The constants have been left in ; the source because it's a real pain in the ass to type them all ; in correctly if changing this macro ever becomes a good idea. md5step macro ; args: f, w, x, y, z, const, input, shift \1 \3,\4,\5 ; f(x, y, z) in d0 add.l (a0)+,d0 add.l \7,d0 add.l d0,\2 ; w += f(x, y, z) + const + input rotl \2,\8 ; w = ROTL(w, shift) add.l \3,\2 ; w += x endm ; f + 14 + 6 + data + 8 + rot + 8 = 36 + f + data + rot cycles ; In most cases, the data cost is 12 cycles (long with 16-bit offset); ; values will be subtracted below where it's less. ; So that's a total of 48 cycles, times 64 is 3072 cycles. ; Okay, the actual code (finally): xdef _Transform2 _Transform2: ; args: uint32 buf[4], uint32 in[16] move.l 4(sp),a0 ; buf 16 move.l 8(sp),a1 ; in 16 32 movem.l d2-d4,-(sp) ; 32 64 movem.l (a0),d1-d4 ; 44 108 lea md5table(pc),a0 ; 8 116 md5step f1,d1,d2,d3,d4,$d76aa478,(a1)+,7 md5step f1,d4,d1,d2,d3,$e8c7b756,(a1)+,12 md5step f1,d3,d4,d1,d2,$242070db,(a1)+,17 md5step f1,d2,d3,d4,d1,$c1bdceee,(a1)+,22 md5step f1,d1,d2,d3,d4,$f57c0faf,(a1)+,7 md5step f1,d4,d1,d2,d3,$4787c62a,(a1)+,12 md5step f1,d3,d4,d1,d2,$a8304613,(a1)+,17 md5step f1,d2,d3,d4,d1,$fd469501,(a1)+,22 md5step f1,d1,d2,d3,d4,$698098d8,(a1)+,7 md5step f1,d4,d1,d2,d3,$8b44f7af,(a1)+,12 md5step f1,d3,d4,d1,d2,$ffff5bb1,(a1)+,17 md5step f1,d2,d3,d4,d1,$895cd7be,(a1)+,22 md5step f1,d1,d2,d3,d4,$6b901122,(a1)+,7 md5step f1,d4,d1,d2,d3,$fd987193,(a1)+,12 md5step f1,d3,d4,d1,d2,$a679438e,(a1)+,17 md5step f1,d2,d3,d4,d1,$49b40821,(a1),22 ; Subtract 4*16 = 64 cycles for no offsets md5step f2,d1,d2,d3,d4,$f61e2562,4*1-60(a1),5 md5step f2,d4,d1,d2,d3,$c040b340,4*6-60(a1),9 md5step f2,d3,d4,d1,d2,$265e5a51,4*11-60(a1),14 md5step f2,d2,d3,d4,d1,$e9b6c7aa,4*0-60(a1),20 md5step f2,d1,d2,d3,d4,$d62f105d,4*5-60(a1),5 md5step f2,d4,d1,d2,d3,$02441453,4*10-60(a1),9 md5step f2,d3,d4,d1,d2,$d8a1e681,(a1),14 md5step f2,d2,d3,d4,d1,$e7d3fbc8,4*4-60(a1),20 md5step f2,d1,d2,d3,d4,$21e1cde6,4*9-60(a1),5 md5step f2,d4,d1,d2,d3,$c33707d6,-(a1),9 md5step f2,d3,d4,d1,d2,$f4d50d87,4*3-56(a1),14 md5step f2,d2,d3,d4,d1,$455a14ed,4*8-56(a1),20 md5step f2,d1,d2,d3,d4,$a9e3e905,-(a1),5 md5step f2,d4,d1,d2,d3,$fcefa3f8,4*2-52(a1),9 md5step f2,d3,d4,d1,d2,$676f02d9,4*7-52(a1),14 md5step f2,d2,d3,d4,d1,$8d2a4c8a,-(a1),20 ; Subtract 4+6 = 10 cycles for no offsets md5step f3,d1,d2,d3,d4,$fffa3942,4*5-48(a1),4 md5step f3,d4,d1,d2,d3,$8771f681,4*8-48(a1),11 md5step f3,d3,d4,d1,d2,$6d9d6122,-(a1),16 md5step f3,d2,d3,d4,d1,$fde5380c,4*14-44(a1),23 md5step f3,d1,d2,d3,d4,$a4beea44,4*1-44(a1),4 md5step f3,d4,d1,d2,d3,$4bdecfa9,4*4-44(a1),11 md5step f3,d3,d4,d1,d2,$f6bb4b60,4*7-44(a1),16 md5step f3,d2,d3,d4,d1,$bebfbc70,-(a1),23 md5step f3,d1,d2,d3,d4,$289b7ec6,4*13-40(a1),4 md5step f3,d4,d1,d2,d3,$eaa127fa,4*0-40(a1),11 md5step f3,d3,d4,d1,d2,$d4ef3085,4*3-40(a1),16 md5step f3,d2,d3,d4,d1,$04881d05,4*6-40(a1),23 md5step f3,d1,d2,d3,d4,$d9d4d039,4*9-40(a1),4 md5step f3,d4,d1,d2,d3,$e6db99e5,4*12-40(a1),11 md5step f3,d3,d4,d1,d2,$1fa27cf8,4*15-40(a1),16 md5step f3,d2,d3,d4,d1,$c4ac5665,4*2-40(a1),23 ; subtract 4 cycles for no offsets md5step f4,d1,d2,d3,d4,$f4292244,4*0-40(a1),6 md5step f4,d4,d1,d2,d3,$432aff97,4*7-40(a1),10 md5step f4,d3,d4,d1,d2,$ab9423a7,4*14-40(a1),15 md5step f4,d2,d3,d4,d1,$fc93a039,4*5-40(a1),21 md5step f4,d1,d2,d3,d4,$655b59c3,4*12-40(a1),6 md5step f4,d4,d1,d2,d3,$8f0ccc92,4*3-40(a1),10 md5step f4,d3,d4,d1,d2,$ffeff47d,(a1)+,15 md5step f4,d2,d3,d4,d1,$85845dd1,4*1-44(a1),21 md5step f4,d1,d2,d3,d4,$6fa87e4f,4*8-44(a1),6 md5step f4,d4,d1,d2,d3,$fe2ce6e0,4*15-44(a1),10 md5step f4,d3,d4,d1,d2,$a3014314,4*6-44(a1),15 md5step f4,d2,d3,d4,d1,$4e0811a1,4*13-44(a1),21 md5step f4,d1,d2,d3,d4,$f7537e82,4*4-44(a1),6 md5step f4,d4,d1,d2,d3,$bd3af235,(a1),10 md5step f4,d3,d4,d1,d2,$2ad7d2bb,4*2-44(a1),15 md5step f4,d2,d3,d4,d1,$eb86d391,4*9-44(a1),21 ; Subtract 8 cycles for no offsets ; 116 move.l 16(sp),a0 ; 16 132 add.l d1,(a0)+ ; 20 152 add.l d2,(a0)+ ; 20 172 add.l d3,(a0)+ ; 20 192 add.l d4,(a0)+ ; 20 212 movem.l (sp)+,d2-d4 ; 36 248 rts ; 16 264 ; So, that's a total of 86 cycles subtracted for no offets, from ; 264 + 1232 + 1632 + 3072 = 6200 cycles. The final tally is ; thus 6114 cycles. ; A table of the Mysterious Constants used by the MD5 computation. md5table: dc.l $d76aa478,$e8c7b756,$242070db,$c1bdceee dc.l $f57c0faf,$4787c62a,$a8304613,$fd469501 dc.l $698098d8,$8b44f7af,$ffff5bb1,$895cd7be dc.l $6b901122,$fd987193,$a679438e,$49b40821 dc.l $f61e2562,$c040b340,$265e5a51,$e9b6c7aa dc.l $d62f105d,$02441453,$d8a1e681,$e7d3fbc8 dc.l $21e1cde6,$c33707d6,$f4d50d87,$455a14ed dc.l $a9e3e905,$fcefa3f8,$676f02d9,$8d2a4c8a dc.l $fffa3942,$8771f681,$6d9d6122,$fde5380c dc.l $a4beea44,$4bdecfa9,$f6bb4b60,$bebfbc70 dc.l $289b7ec6,$eaa127fa,$d4ef3085,$04881d05 dc.l $d9d4d039,$e6db99e5,$1fa27cf8,$c4ac5665 dc.l $f4292244,$432aff97,$ab9423a7,$fc93a039 dc.l $655b59c3,$8f0ccc92,$ffeff47d,$85845dd1 dc.l $6fa87e4f,$fe2ce6e0,$a3014314,$4e0811a1 dc.l $f7537e82,$bd3af235,$2ad7d2bb,$eb86d391 end ;;; Crypt2.asm - 68000 assembler versions of heavily-used functions in ;;; crypt.c for the IDEA cipher. By Colin Plumb; not part ;;; of the original distribution from ETH Zurich ;;; isibee.ethz.ch:pub/simpl/idea.V1.0.tar.Z (129.132.38.1) ;;; ;;; No copyright is claimed on this code; it is in the public domain. ;;; It would be nice if you could give me credit if you use it, but ;;; the legal definition of "public domain" states that there is no ;;; legal obligation to do so. ;;; An earlier version of this code is circulating with a copyright notice ;;; on it; this was due to my mailing out an earlier version without any ;;; indication of the intended status, so the recipient assumed I claimed ;;; copyright. It is, however, incorrect. And if you want my e-mail ;;; address, the best one to use for the long term is <colin@nyx.cs.du.edu>. ;;; Now, the original code referenced above provides various encryption ;;; modes, in particular feedback and chaining modes which have a ;;; lag of greater than 1. These are useful only in very specialized ;;; applications, and for general use the lag factor of 1 is strongest. ;;; There is no reason for a typical application package to use anything ;;; else. In particular, the xpkidea.library which associates a "quality" ;;; of 100% with a lag factor of 25 is downright dangerous; typical users ;;; aren't going to be able to come up with 25 fully random initial vectors. ;;; Electronic Code Book (ECB) mode, where you pass each block of plaintext ;;; through the cipher system, is not very good and should be avoided, as ;;; repeated plaintext results in repeated ciphertext, which gives information ;;; away. Output Feedback (OFB) mode is vulnerable if you reuse keys and ;;; initial vectors. In particular, if you encrypt two uncompressed ASCII ;;; messages with the same key and initial vector, it is not difficult ;;; (even for a normal user, much less an intelligence agency!) to first, ;;; read both messages up to the minimum of their two lengths, and second, ;;; read any other shorter message sent with the same key and initial vector. ;;; The only modes that you should consider are Cipher Block Chaining (CBC) ;;; and Cipher Feedback (CFB) modes. CFB has the advantage that the text ;;; does not have to be a multiple of a block long, and so does not expand ;;; the text. It also does not require a decryption routine (or the moral ;;; equyivalent, InvertIdeaKey, below). It is recommended for general use. ;;; Where the plaintext blocks are the array x[i] and the ciphertext are y[i], ;;; IDEA encryption is denoted by IDEA(plain,key) and decryption by ;;; IIDEA(cipher,key), the modes are as follows: ;;; CBC encrypt: y[i] = IDEA(x[i]^y[i-1], key) ;;; CBC decrypt: x[i] = y[i-1] ^ IIDEA(y[i], key) ;;; CFB encrypt: y[i] = x[i] ^ IDEA(y[i-1], key) ;;; CFB decrypt: x[i] = y[i] ^ IDEA(y[i-1], key) ;;; Both modes require initial vectors y[-1]. This can be a random number, ;;; or you can set it to a constant (e.g. 0) and make x[0] be a random ;;; number. This saves you the effort of storing it somewhere else. ;;; For something like a disk or file encryptor, set it to some unique ;;; value, like the inode number or disk block number. This way, identical ;;; blocks will still be encrypted differently. ;;; Folks, this is a pretty darn good encryption algorithm. Proving ;;; that is a lot of work. Unless you understand that proof, you are ;;; not competent to "improve" it. In particular, don't change the ;;; way a 128-bit key is expanded to a 832-bit working key. And don't ;;; think about "fixing" the reversed addition in the last step. That ;;; significantly weakens the cipher. If you don't understand, then ;;; just take it as evidence that you shouldn't mess with things you ;;; don't understand. ;;; And don't forget that english text has an average entropy of less ;;; than 2 bits per character. That is, you have to compress ASCII by ;;; 4:1 before it is totally random and can't be further compressed with ;;; sufficient effort. So if you want to use a password, it should be 64 ;;; bytes long, not 16. (It can be shorter if you use a hard-to-remember ;;; string with lots of nonsense words, FUnNy capitalisation or @dd ;;; characters.) You have to hash it down to 128 random bits. An ;;; algorithm for that is not supplied. Exclusive-oring 16-byte blocks ;;; together does not work well. ;;; Remember, 2^128 is a very large number. 38.5 decimal digits. ;;; 340,282,366,920,938,463,463,374,607,431,768,211,456. ;;; 340 thousand million billion trillion million. Or, equivalently, ;;; 340 million million million million million million. That's ;;; a bit over 27 totally random letters. But a standard pass phrase, ;;; even a wierd one, is far from totally random. For starters, it's ;;; usually easy to figure out where the blanks go even if they're ;;; stripped out. "scramblepatternpipelineyellow" is 29 characters ;;; long, but nothing like random. A random string of 27 letters ;;; looks like "xbrebwcrajhozfepyclvgkchqro". This was a standard ;;; pseudo-random number generator; it may look random to you, but ;;; it's not good enough for high-security applications. ;;; (Okay, so the cycle counts indicate an unhealthy obsession.) ;;; Multiply takes, according to the 6th edition Motorola MC68000 ;;; manual, 38+2n cycles, where n is the number of bits set in the ;;; source effective address. Since the data are random, on average ;;; 8 of the 16 bits will be set, and multiplies will take 54 cycles ;;; on average. nofKeyPerRound equ 6 ; number of used keys per round nofRound equ 8 ; number of rounds ; Multiply two numbers in the range of 1..0x10000, modulo 0x10001. ; 0x10000 is represented as 0. Mult macro ; src1, src2, dest, neither source is 0 ; dest may equal src2, but not src1 (which is zeroed) mulu %2,%1 ; 54 move.w %1,%3 ; 4 swap %1 ; 4 %1 now holds high part sub.w %1,%3 ; 4 moveq #0,%1 ; 4 does not affect X flag addx.w %1,%3 ; 4 endm ; 74 ; Code to handle the cases where one argument is 0 = 0x10000 = -1. ; Negate the other modulo 0x10001, giving 0x10001 - x = 1 - x. ; In-place and out-of-place versions Invert1 macro ; dest neg.w %1 ; 4 addq.w #1,%1 ; 4 endm ; 8 Invert2 macro ; src, dest moveq #1,%2 ; 4 sub.w %1,%2 ; 4 endm ; 8 ; Externally callable version (also a usage example) xdef _Mul xdef _Mul2 _Mul: _Mul2: move.w 4(sp),d0 ; 12 beq.s Mul_a_is_0 ; 8 move.w 6(sp),d1 ; 12 beq.s Mul_b_is_0 ; 8 Mult d1,d0,d0 ; 54 (based on average of 8 bits set) rts ; 10 = 104 total Mul_a_is_0: Invert2 6(sp),d0 rts Mul_b_is_0: Invert1 d0 rts MulEnd: ;; The cases where a or b are zero are so rare they don't affect the average ;; time significantly. ; encryption and decryption algorithm IDEA ; void Idea(u_int16 dataIn[4], u_int16 dataOut[4], u_int16 key[52]) ; This performs the core IDEA encryption. It leaves no sensitive ; information behind in memory or registers, except for a temporary ; value from the last KeyMult in d0, which is probably safe to assume will ; quickly be trashed. ; Modifying this code is NOT recommended for the faint of heart. ; This code is, in a word, *tight*. Note in particular the use of the low ; halves of d2-d7 only, to eliminate the overhead of saving and restoring ; their high halves, the use of d3 as a constant zero source, and a1 ; doubling as a loop counter. Also note that the exceptional cases in the ; KeyMult macro have been taken entirely out of line, and there has been ; some code motion to between KeyMult 5 and 6 so that the branches stay ; in short range (which they are, barely: offsets of +126 and -126 bytes ; both occur). ; Unrolling this loop any further would result in the branches going out ; of short branch range, which would cost more in instruction fetch time ; than would be saved by the unrolling. It would also take it out of a ; 68020's cache. ; Register usage: ; d0.l - KeyMult temp ; d1.l - Constant zero ; d2.w - s2 ; d3.w - s1 ; d4.w - x0 ; d5.w - x2 <- Note reversal ; d6.w - x1 ; d7.w - x3 ; a0.l - dataIn, dataOut and key pointer limit ; a1.l - Pointer to current location in expanded key ; KeyMult: Multiply value in register by next available key (pointed at by a1). ; Requires corresponding MultAux code nearby. KeyMult macro ; %1 = number, %2 = register move.w %2,d0 ; 4 beq Mul%1_a_is_0 ; 8 mulu (a1)+,d0 ; 58 (42 if 0) beq.s Mul%1_b_is_0 ; 8 move.w d0,%2 ; 4 swap d0 ; 4 sub.w d0,%2 ; 4 addx.w d1,%2 ; 4 Mul%1_done: endm ; 94 total ; Note: the following cases should occur 1/65536 of the time, however the ; second, if it occurs, will occur for every block encrypted using the ; same key, so it is important that it not be significantly slower than the ; common case. (It's faster, so all is well.) MultAux macro ; %1 = number, %2 = register Mul%1_a_is_0: ; 14 move and branch overhead move.w #1,%2 ; 8 sub.w (a1)+,%2 ; 8 bra.s Mul%1_done ; 10 = 40 total for this path Mul%1_b_is_0: ; 64 overhead to get here Invert1 %2 ; 8 bra.s Mul%1_done ; 10 = 82 total for this path endm MultAux 1,d4 MultAux 2,d7 MultAux 3,d5 MultAux 4,d6 MultAux 5,d4 xdef _Idea xdef _Idea2 _Idea: _Idea2: move.l 4(sp),a0 ; 16 dataIn move.l 12(sp),a1 ; 16 32 key ; Since I am scrupulously careful never to touch the high halves of ; d2..d7, I only need to save the low halves. movem.w d2-d7,-(sp) ; 32 68 clr.w d1 ; 4 72 Constant 0 ; movem.w <mem>,<regs> here would sign-extend. move.w (a0)+,d4 ; 8 80 move.w (a0)+,d6 ; 8 88 move.w (a0)+,d5 ; 8 96 move.w (a0),d7 ; 8 104 ; When a1 points here, stop. 96 = 2 bytes/key * 6 keys/round * 8 rounds lea 96(a1),a0 ; 8 112 CryptLoop: KeyMult 1,d4 ; 94 x0 *= *key++; add.w (a1)+,d6 ; 8 102 x1 += *key++; add.w (a1)+,d5 ; 8 110 x2 += *key++; KeyMult 2,d7 ; 94 204 x3 *= *key++; move.w d5,d2 ; 4 208 s2 = x2 eor.w d4,d5 ; 4 212 x2 ^= x0 KeyMult 3,d5 ; 94 306 x2 *= *key++; move.w d6,d3 ; 4 310 s1 = x1; eor.w d7,d6 ; 4 314 x1 ^= x3; add.w d5,d6 ; 4 318 x1 += x2; KeyMult 4,d6 ; 94 412 x1 *= *key++; eor.w d6,d4 ; 4 416 x0 ^= x1; ;; Second unrolled copy. Some of the first loop is postponed until after the ;; first KeyMult so that the branches there will be in short range (4 cycles). KeyMult 5,d4 add.w d6,d5 ; 4 420 x2 += x1; FIRST LOOP eor.w d5,d7 ; 4 424 x3 ^= x2; FIRST LOOP eor.w d2,d6 ; 4 428 x1 ^= s2; FIRST LOOP eor.w d3,d5 ; 4 432 x2 ^= s1; FIRST LOOP add.w (a1)+,d6 add.w (a1)+,d5 ;; This code logically belongs after the KeyMult, but is moved up so that ;; the branches to MultAux will stay in short branch range. move.w d5,d2 eor.w d4,d5 move.w d6,d3 KeyMult 6,d7 KeyMult 7,d5 eor.w d7,d6 add.w d5,d6 KeyMult 8,d6 add.w d6,d5 eor.w d6,d4 eor.w d5,d7 eor.w d2,d6 eor.w d3,d5 ; 432 864 cmp.l a0,a1 ; 6 870 bne CryptLoop ; 10 880 ; 2 114 -> From top, if expired KeyMult 9,d4 ; 94 208 x0 *= *key++; add.w (a1)+,d5 ; 8 216 x2 += *key++; add.w (a1)+,d6 ; 8 224 x1 += *key++; KeyMult 10,d7 ; 94 318 x3 *= *key++; move.l 20(a7),a0 ; 16 334 6 regs(12) + retaddr(4) + arg 1(4) movem.w d4-d7,(a0) ; 24 358 Store x0,x2,x1,x3 ; movem.w <mem>,<regs> sign-extends, which is not what we want... move.w (sp)+,d2 ; 8 366 move.w (sp)+,d3 ; 8 374 move.w (sp)+,d4 ; 8 382 move.w (sp)+,d5 ; 8 390 move.w (sp)+,d6 ; 8 398 move.w (sp)+,d7 ; 8 406 rts ; 16 422 MultAux 6,d7 MultAux 7,d5 MultAux 8,d6 MultAux 9,d4 MultAux 10,d7 ; 422 + 4 * 880 = 3942 cycles per block, or 492.75 cycles per byte. ; At 7159090 Hz, that's 1816.11 blocks per second, or 14528.8 ; bytes per second. ; End of Idea ;; For reference, here's the original source: ; void Idea(u_int16 *dataIn, u_int16 *dataOut, u_int16 *key) ; { ; register u_int16 round, x0, x1, x2, x3, s1, s2; ; ; x0 = *dataIn++; x1 = *dataIn++; x2 = *dataIn++; x3 = *dataIn; ; ; for (round = nofRound; round > 0; round--) { ; x0 = Mul(x0, *key++); ; x1 += *key++; ; x2 += *key++; ; x3 = Mul(x3, *key++); ; ; s1 = x1; s2 = x2; ; x2 ^= x0; ; x2 = Mul(x2, *key++); ; x1 ^= x3; ; x1 += x2; ; x1 = Mul(x1, *key++); ; x2 += x1; ; ; x0 ^= x1; ; x3 ^= x2; ; ; x1 ^= s2; ; x2 ^= s1; ; } ; *(dataOut++) = Mul(x0, *key++); ; *(dataOut++) = x2 + *key++; ; *(dataOut++) = x1 + *key++; ; *(dataOut) = Mul(x3, *key); ; } /* Idea */ ;;; ;;; This expands a user key to a full idea key, copying the 128-bit block ;;; with a 25-bit rotation each time. ;;; ; void ExpandUserKey(uint16 userKey[8], uint16 key[52]) ; Expand an 8-word (16-byte) user key into a 52-word (104-byte) ; working key, by rotating the 128-bit big-endian key left 25 ; bits and storing the result in the next 8 words, successively. ; 1306 cycles total = (@ 7159090 Hz) 182.425 us ; Rotate a/b/c/d left 25 bits, by rotating right 7 and shifting ; down 1 register shdown macro ; a, b, c, d, e, temp, const ; aaaa bbbb cccc dddd ____ ____ move.l %1,%5 ; aaaa bbbb cccc dddd aaaa ____ 4 and.w %7,%1 ; __0a bbbb cccc dddd aaaa ____ 4 8 eor.w %1,%5 ; __0a bbbb cccc dddd aaa0 ____ 4 12 move.w %4,%6 ; __0a bbbb cccc dddd aaa0 __dd 4 16 and.w %7,%6 ; __0a bbbb cccc dddd aaa0 __0d 4 20 eor.w %6,%4 ; __0a bbbb cccc ddd0 aaa0 __0d 4 24 or.w %6,%5 ; __0a bbbb cccc ddd0 aaad __0d 4 28 ror.l #7,%5 ; __0a bbbb cccc ddd0 daaa __0d 22 50 move.w %3,%6 ; __0a bbbb cccc ddd0 daaa __cc 4 54 and.w %7,%6 ; __0a bbbb cccc ddd0 daaa __0c 4 58 eor.w %6,%3 ; __0a bbbb ccc0 ddd0 daaa __0c 4 62 or.w %6,%4 ; __0a bbbb ccc0 dddc daaa __0c 4 66 ror.l #7,%4 ; __0a bbbb ccc0 cddd daaa __0c 22 88 move.w %2,%6 ; __0a bbbb ccc0 cddd daaa __bb 4 92 and.w %7,%6 ; __0a bbbb ccc0 cddd daaa __0b 4 96 eor.w %6,%2 ; __0a bbb0 ccc0 cddd daaa __0b 4 100 or.w %6,%3 ; __0a bbb0 cccb cddd daaa __0b 4 104 ror.l #7,%3 ; __0a bbb0 bccc cddd daaa __0b 22 126 or.w %1,%2 ; __0a bbba bccc cddd daaa __0b 4 130 ror.l #7,%2 ; __0a abbb bccc cddd daaa __0b 22 152 ; Rotate a/b/c/d left 25 bits, result in e/f/a/b. endm shup macro ; e, f, a, b, c, d, const ; ____ ____ aaaa bbbb cccc dddd move.l %4,%1 ; bbbb ____ aaaa bbbb cccc dddd 4 and.w %7,%4 ; bbbb ____ aaaa __0b cccc cccc 4 8 eor.w %4,%1 ; bbb0 xxxx aaaa __0b cccc dddd 4 12 move.l %5,%2 ; bbb0 cccc aaaa __0b cccc dddd 4 16 and.w %7,%5 ; bbb0 cccc aaaa __0b __0c dddd 4 20 eor.w %5,%2 ; bbb0 ccc0 aaaa __0b __0c dddd 4 24 or.w %4,%2 ; bbb0 cccb aaaa __0b __0c dddd 4 28 ror.l #7,%2 ; bbb0 bccc aaaa __0b __0c dddd 22 50 move.l %3,%4 ; bbb0 bccc aaaa aaaa __0c dddd 4 54 and.w %7,%3 ; bbb0 bccc __0a aaaa __0c dddd 4 58 eor.w %3,%4 ; bbb0 bccc __0a aaa0 __0c dddd 4 62 or.w %3,%1 ; bbba bccc __0a aaa0 __0c dddd 4 66 ror.l #7,%1 ; abbb bccc __0a aaa0 __0c dddd 22 88 move.l %6,%3 ; abbb bccc dddd aaa0 __0c dddd 4 92 and.w %7,%6 ; abbb bccc dddd aaa0 __0c __0d 4 96 eor.w %6,%3 ; abbb bccc ddd0 aaa0 __0c __0d 4 100 or.w %5,%3 ; abbb bccc dddc aaa0 __0c __0d 4 104 ror.l #7,%3 ; abbb bccc cddd aaa0 __0c __0d 22 126 or.w %6,%4 ; abbb bccc cddd aaad __0c __0d 4 130 ror.l #7,%4 ; abbb bccc cddd daaa __0c __0d 22 152 endm xdef _ExpandUserKey xdef _ExpandUserKey2 _ExpandUserKey: _ExpandUserKey2: move.l 4(sp),a0 ; UserKey 16 move.l 8(sp),a1 ; Key 16 32 movem.l d2-d5,-(sp) ; 40 72 movem.l (a0),d0-d3 ; Fetched into d0-d3 44 116 move.l d7,a0 ; 4 120 moveq #127,d7 ; 4 124 movem.l d0-d3,(a1) ; Store words 0..7 40 164 shdown d0,d1,d2,d3,d4,d5,d7 ; 152 318 movem.l d1-d4,16(a1) ; Store words 8..15 44 360 shdown d1,d2,d3,d4,d5,d0,d7 ; 152 512 movem.l d2-d5,32(a1) ; store words 16..23 44 556 shup d0,d1,d2,d3,d4,d5,d7 ; 152 708 movem.l d0-d3,48(a1) ; Store words 24..31 44 752 shdown d0,d1,d2,d3,d4,d5,d7 ; 152 904 movem.l d1-d4,64(a1) ; Store words 32..39 44 948 shdown d1,d2,d3,d4,d5,d0,d7 ; 152 1100 movem.l d2-d5,80(a1) ; store words 40..47 44 1144 ; We only need two longwords this iteration ; (d2 d3 d4 d5) ; aaaa bbbb cccc dddd and.w d7,d2 ; __0a bbbb cccc dddd 4 1148 move.w d3,d5 ; __0a bbbb cccc __bb 4 1152 and.w d7,d5 ; __0a bbbb cccc __0b 4 1156 eor.w d5,d3 ; __0a bbb0 cccc __0b 4 1160 or.w d2,d3 ; __0a bbba cccc __0b 4 1164 ror.l #7,d3 ; __0a abbb cccc __0b 22 1186 not.w d7 ; 4 1190 and.w d7,d4 ; __0a abbb ccc0 __0b 4 1194 or.w d5,d4 ; __0a abbb cccb __0b 4 1198 ror.l #7,d4 ; __0a abbb bccc __0b 22 1220 movem.l d3-d4,96(a1) ; store words 48..51 28 1248 move.l a0,d7 ; 4 1252 movem.l (sp)+,d2-d5 ; 44 1296 rts ; 10 1306 ;;; ;;; This section inverts keys ;;; First, the code to find a multiplicative inverse ;;; ; Set up the registers for InvCore InvPrep macro ; %1 = in.l, %2 = temp.l, %3 = out.w move.l %1,%2 move.w %2,%3 endm ; Note: this expects %1 to be set to the input value (high 16 bits zero!), ; and be non-zero, %2 to be set to 0x10001, and %3 to be set to 1. It ; returns the multiplicative inverse of %1 in %3. It does not affect the ; high words of %1, %3 and %4. %2 and %5 have all 32 bits trashed. InvCore macro ; %1 = in.l, %2 = mod.l, %3 = out.w, ; %4 = temp.w, %5 = temp.l, %6 = #1.w ;iter1\@: divu %1,%2 bvs Out1\@ ; If ovfl, input=1, so inverse = 1 (in %3). move.w %2,%4 ; t0 = t0 + t1 * q = 0 + 1 * q = q ; mulu %3,%5 ; If we had copied to d4, we would have done this... ; add.w %5,%4 ; But since we know d0=1 and d2=0, we can skip it clr.w %2 swap %2 ; cmp.w #1,%2 cmp.w %3,%2 ; Since %3 = 1 beq Out2\@ ; remainder = 1, so quit - inverse in -%5 InvLoop\@: ;iter2\@: divu %2,%1 move.w %1,%5 mulu %4,%5 add.w %5,%3 clr.w %1 swap %1 cmp.w %6,%1 beq Out1\@ ; remainder = 1, so quit - inverse in %3 ;iter3\@: divu %1,%2 move.w %2,%5 mulu %3,%5 add.w %5,%4 clr.w %2 swap %2 cmp.w %6,%2 ; beq Out2\@ ; remainder = 1, so quit - inverse in -%4 ; bra InvLoop\@ bne InvLoop\@ Out2\@: ; Inverse in -%4 move.w %6,%3 sub.w %4,%3 Out1\@: ; Inverse in %3 endm InputZero: moveq #0,d0 rts xdef _MulInv xdef _MulInv2 _MulInv: _MulInv2: moveq #0,d1 move.w 4(sp),d1 beq InputZero move.l d2,a0 move.w d3,-(sp) move.l d4,a1 moveq #1,d0 move.l #$10001,d3 InvCore d1,d3,d0,d2,d4,#1 move.l a1,d4 move.w (sp)+,d3 move.l a0,d2 rts MulInvEnd: ; Because it's legal to invert a key on top of itself, we push the inverted ; key onto the stack and then copy it to the destination at the end. xdef _InvertIdeaKey: xdef _InvertIdeaKey2: _InvertIdeaKey: _InvertIdeaKey2: move.l 4(sp),a0 ; Source move.l 8(sp),a1 ; Dest pointer movem.l d2-d7,-(sp) move.l #$10001,d7 move.l (a0)+,d0 move.l (a0)+,d1 moveq #0,d3 ; Clear high half of d3 move.w d1,d3 beq IIKzero1 InvPrep d7,d4,d1 InvCore d3,d4,d1,d5,d6,d7 IIKzero1: swap d1 neg.w d1 swap d1 move.l d1,-(sp) swap d0 move.w d0,d3 beq IIKzero2 InvPrep d7,d4,d0 InvCore d3,d4,d0,d5,d6,d7 IIKzero2: swap d0 neg.w d0 move.l d0,-(sp) moveq #nofRound-2,d2 bra IIKloop IIKzero3: ; Located here to be in short branch range ; (saves 4*7 = 28 cycles; costs one branch = 10 cycles) ; d1.w == 0; d0.w is unknown move.w d0,d4 beq IIKzero3a InvPrep d7,d5,d1 InvCore d4,d5,d1,d0,d6,d7 clr.w d0 bra IIKzero3a IIKzero4: ; d1.w != 0; d0.w == 0 clr.w d1 bra IIKzero4a IIKloop: move.l (a0)+,-(sp) move.l (a0)+,d0 move.l (a0)+,d1 neg.w d0 swap d0 move.w d1,d3 beq IIKzero3 move.w d0,d4 beq IIKzero4 InvPrep d7,d5,d1 InvCore d4,d5,d1,d0,d6,d7 IIKzero4a: InvPrep d7,d4,d0 InvCore d3,d4,d0,d5,d6,d7 IIKzero3a: move.l d0,-(sp) swap d1 neg.w d1 move.l d1,-(sp) dbra d2,IIKloop move.l (a0)+,d2 move.l (a0)+,d0 move.l (a0)+,d1 move.w d1,d3 beq IIKzero5 InvPrep d7,d4,d1 InvCore d3,d4,d1,d5,d6,d7 IIKzero5: swap d1 neg.w d1 swap d1 swap d0 move.w d0,d3 beq IIKzero6 InvPrep d7,d4,d0 InvCore d3,d4,d0,d5,d6,d7 IIKzero6: swap d0 neg.w d0 ; move.l d0,(a1)+ ; Install these last three longwords...NOT ; move.l d1,(a1)+ ; Merge this with the movem below. ; move.l d2,(a1)+ ; 104 bytes of key = 26 longs = 9+9+8 movem.l (sp)+,d3-d7/a0 movem.l d0-d7/a0,(a1) ; 9 longs movem.l (sp)+,d0-d7/a0 movem.l d0-d7/a0,36(a1) ; 9 longs movem.l (sp)+,d0-d7 movem.l d0-d7,72(a1) ; 8 longs movem.l (sp)+,d2-d7 ; Retrieve registers rts InvertIdeaKeyEnd: ;; And for reference, the original code... ; void InvertIdeaKey( u_int16 *key, u_int16 *invKey) ; { ; register int i; ; KeyT(dk); ; ; dk[nofKeyPerRound * nofRound + 0] = MulInv(*key++); ; dk[nofKeyPerRound * nofRound + 1] = (addMod - *key++) & ones; ; dk[nofKeyPerRound * nofRound + 2] = (addMod - *key++) & ones; ; dk[nofKeyPerRound * nofRound + 3] = MulInv(*key++); ; for (i = nofKeyPerRound * (nofRound - 1); i >= 0; i -= nofKeyPerRound) { ; dk[i + 4] = *key++; ; dk[i + 5] = *key++; ; dk[i + 0] = MulInv(*key++); ; if (i > 0) { ; dk[i + 2] = (addMod - *key++) & ones; ; dk[i + 1] = (addMod - *key++) & ones; ; } else { ; dk[i + 1] = (addMod - *key++) & ones; ; dk[i + 2] = (addMod - *key++) & ones; ; } ; dk[i + 3] = MulInv(*key++); ; } ; for (i = 0; i < keyLen; i++) ; invKey[i] = dk[i]; ; } /* InvertIdeaKey */