home *** CD-ROM | disk | FTP | other *** search
- ; multiple-precision arithmetic routines
- ; the ARM is a 32-bit little-endian RISC processor
-
- r0 RN 0
- r1 RN 1
- r2 RN 2
- r3 RN 3
- r4 RN 4
- r5 RN 5
- r6 RN 6
- r7 RN 7
- r8 RN 8
- r9 RN 9
- r10 RN 10
- r11 RN 11
- r12 RN 12
- r14 RN 14
- sp RN 13
- lr RN 14
- pc RN 15
-
- AREA |A$$Code|, CODE, READONLY
-
- IMPORT global_precision
-
- gp_ DCD global_precision
-
- EXPORT _p_addc
- ; _p_addc
- ; r0,r1 point to the numbers, r2 is carry (0 or 1)
- ; return with number at r0 increased, new carry in r0
- _p_addc STMFD sp!,{r4,r5,lr}
- LDR r3,gp_ ; addr of gp variable
- LDR r3,[r3] ; value of gp variable
- MOVS r3,r3,LSL#16 ; <<16 [it's a short]
- MOVEQ r0,r2 ; precision = 0 ??!
- LDMEQFD sp!,{r4,r5,pc}^
- MOVS r2,r2,LSR#1 ; carry into C flag
- pa1 LDR r4,[r0]
- LDR r5,[r1],#4
- ADCS r4,r4,r5
- STR r4,[r0],#4
- SUB r3,r3,#1<<16
- TEQ r3,#0
- BNE pa1
- ADC r0,r3,#0 ; r3==0 here, so r0 = carry
- LDMFD sp!,{r4,r5,pc}^
-
- EXPORT _p_subb
- ; _p_subb
- ; r0,r1 point to the numbers, r2 is borrow (0 or 1)
- ; return with number at r1 decreased, new borrow in r0
- _p_subb STMFD sp!,{r4,r5,lr}
- LDR r3,gp_ ; addr of gp variable
- LDR r3,[r3] ; value of gp variable
- MOVS r3,r3,LSL#16 ; <<16 [it's a short]
- MOVEQ r0,r2 ; precision = 0 ??!
- LDMEQFD sp!,{r4,r5,pc}^
- EOR r2,r2,#1 ; carry = 1-borrow
- MOVS r2,r2,LSR#1 ; carry into C flag
- ps1 LDR r4,[r0]
- LDR r5,[r1],#4
- SBCS r4,r4,r5
- STR r4,[r0],#4
- SUB r3,r3,#1<<16
- TEQ r3,#0
- BNE ps1
- ADC r0,r3,#0 ; r3==0 here, so r0 = carry
- EOR r0,r0,#1 ; borrow = 1-carry
- LDMFD sp!,{r4,r5,pc}^
-
- EXPORT _p_rol
- ; _p_rol
- ; r0 points to a number, r1 is carry.
- ; rotate left 1 bit, return new carry
- _p_rol LDR r3,gp_
- LDR r3,[r3]
- MOVS r3,r3,LSL#16
- MOVEQ r0,r1
- MOVEQS pc,lr
- MOVS r1,r1,LSR#1 ; carry into C flag
- pr1 LDR r2,[r0]
- ADCS r2,r2,r2 ; left shift 1b with carry -> b0
- STR r2,[r0],#4
- SUB r3,r3,#1<<16
- TEQ r3,#0
- BNE pr1
- ADC r0,r3,#0
- MOVS pc,lr
-
- EXPORT _p_smul
- ; _p_smul
- ; r0:prod r1:multiplicand r2=multiplier
- ; accumulate into prod
- _p_smul STMFD sp!,{r5-r9,lr}
- LDR r3,gp_
- LDR r3,[r3]
- MOVS r3,r3,LSL#16
- LDMEQFD sp!,{r5-r9,pc}^
- MOV r14,#0 ; carried-over high word
- MOV r7,r2,LSR#16 ; high1 in r7
- pm1 LDR r5,[r1],#4 ; get a digit
- ; now a 32x32->64 multiply...
- ;NBNBNB factor out splitting of r2, since it doesn't change
- MOV r6,r5,LSR#16 ; high0 in r6
- SUB r5,r5,r6,LSL#16 ; low0 in r5
- SUB r8,r2,r7,LSL#16 ; low1 in r8
- MUL r9,r5,r8 ; ll in r9
- MUL r8,r6,r8 ; hl in r8
- MUL r5,r7,r5 ; lh in r5
- MUL r6,r7,r6 ; hh in r6
- ADDS r8,r5,r8 ; sum of cross terms in r8
- ADDCS r6,r6,#&10000 ; bit 16 of hh
- ADDS r5,r9,r8,LSL#16 ; ll + low(cross)
- ADC r8,r6,r8,LSR#16 ; hh + high(cross) + carry
- ; now low in r5, high in r8
- ADDS r5,r5,r14 ; add in carry
- ADC r8,r8,#0
- LDR r14,[r0] ; digit from PROD
- ADDS r5,r5,r14 ; add it in. There will be at most 1
- ADC r14,r8,#0 ; carry between prev add and this.
- STR r5,[r0],#4 ; store a digit
- LDR r5,[r0]
- SUBS r3,r3,#1<<16
- BNE pm1
- ADD r5,r5,r14
- STR r5,[r0] ; add final carry back in
- LDMFD sp!,{r5-r9,pc}^
-
- ; I really ought to implement mp_dmul in assembler too.
- ; It's pretty speed-critical.
- ; Actually if I do this my effort on mp_smul will have been
- ; wasted :-)
- ; Well, here goes...
-
- EXPORT _p_dmul
- ; _p_dmul
- ; r0 points to double-length space. r1,r2 point to bignums.
- _p_dmul STMFD sp!,{r4-r11,lr}
- ; next few instructions are nasty hack so I can just drop in
- ; my existing code...
- MOV r4,r0
- MOV r0,r2
- LDR r3,gp_
- LDR r3,[r3]
- MOV r3,r3,LSL#16
- MOV r3,r3,LSR#16
- MOV r2,r3
- ; now for the code itself
- mul9999 MOV r6,#0
- MOV r7,r4 ; save start of answer
- ANDS r5,r3,#3
- BEQ m99l1 ; multiple of 4 words
- CMP r5,#2
- STR r6,[r4],#4
- STRGE r6,[r4],#4
- STRGT r6,[r4],#4
- m99l1 BICS r5,r3,#3
- BEQ m99l3
- MOV r8,#0
- MOV r9,#0
- MOV r10,#0
- m99l2 STMIA r4!,{r6,r8,r9,r10}
- SUBS r5,r5,#4
- BGT m99l2
- ; destination now has lots of 0s
- ; it's worth setting out the register allocation here:
- ; r0,r1 are digit pointers. r0 points to a[i], r1 to b[0]
- ; r2,r3 are lengths. r2 decrements, r3 doesn't
- ; r4 points to c[i]
- ; r5 is the digit (of a) we're multiplying by
- ; r6 is the carry
- ; r7 is the inner loop counter ("j")
- ; r8,9,10,11,12 are used by the 32x32 multiply
- ; r13 is SP
- ; r14 is the (unmodified) length of the first number
- m99l3 ;STMFD sp!,{r14}
- MOV r4,r7 ; restore c
- MOV r14,r2 ; save la
- m99l4 LDR r5,[r0],#4 ; f=a[i++] START OF OUTER LOOP
- MOV r6,#0 ; ca=0
- MOV r7,#0 ; j=0
- m99l5 LDR r8,[r1,r7,LSL#2] ; b[j] START OF INNER LOOP
- ; start of multiply routine -- mul3232 with all regs +8
- MOV r10,r8,LSR#16 ; high halfwords in r10,r11
- MOV r11,r5,LSR#16
- SUB r8,r8,r10,LSL#16 ; low halfwords in r8,r9
- SUB r9,r5,r11,LSL#16
- MUL r12,r8,r9 ; ll in r12
- MUL r9,r10,r9 ; hl in r9
- MUL r8,r11,r8 ; lh in r8
- MUL r10,r11,r10 ; hh in r10
- ADDS r9,r8,r9 ; sum of cross terms in r9; now r8 spare
- ADDCS r10,r10,#&10000 ; insert carry
- ADDS r8,r12,r9,LSL#16 ; low bits in r8
- ADC r9,r10,r9,LSR#16 ; high bits in r9
- ; end of multiply routine. H in r9, L in r8
- LDR r10,[r4,r7,LSL#2] ; c[i+j]
- ADDS r8,r8,r10 ; f*b[j] + c[i+j]
- ADC r9,r9,#0 ; including high bits
- ADDS r8,r8,r6 ; ... + ca
- ADC r6,r9,#0 ; high bits are new carry
- STR r8,[r4,r7,LSL#2] ; low bits are new value for c[i+j]
- ADD r7,r7,#1 ; ++j
- CMP r7,r3 ; if (j<lb)
- BLT m99l5 ; then continue with inner loop
- ; END OF INNER LOOP
- STR r6,[r4,r7,LSL#2] ; c[i+j]=ca
- ADD r4,r4,#4 ; advance &c[i]
- SUBS r2,r2,#1 ; finished outer loop?
- BGT m99l4 ; nope
- ; END OF OUTER LOOP
- LDMFD sp!,{r4-r11,pc}^
-
- END
-