home *** CD-ROM | disk | FTP | other *** search
Text File | 1994-12-21 | 78.1 KB | 1,816 lines |
- # ariarm.d (c) Copyright 1994 P.J.Burwood
- # external routines for arilev1.d
- # Processor: ARM in APCS mode
- # Assembler-Syntax: ObjAsm
- # Assumptions: intCsize=32, intDsize=32.
- # Parameter passing conventions: APCS means that registers a1-a4 and ip do not
- # have to be preserved across function calls.
- # Note: A sequence of up to 4 conditional instructions is used in preference
- # to a branch.
-
- # When INCLUDED_FROM_C is defined only the macros below are emitted; the
- # assembler source proper lives in the #else branch.
- # NOTE(review): presumably each macro tells the C side that the matching
- # family of loops is provided by this file - confirm against the includer.
- #ifdef INCLUDED_FROM_C
-
- #define COPY_LOOPS
- #define FILL_LOOPS
- #define CLEAR_LOOPS
- #define LOG_LOOPS
- #define TEST_LOOPS
- #define ADDSUB_LOOPS
- #define SHIFT_LOOPS
- #define MUL_LOOPS
-
- #else
-
- # Symbolic names for the sixteen core registers r0-r15.
- # a1-a4 (and ip) are the registers that, per the note at the top of this
- # file, do not have to be preserved across function calls.
- a1 RN 0
- a2 RN 1
- a3 RN 2
- a4 RN 3
- # v1-v6 are not in that scratch set, so a routine has to save them
- # before using them as work registers.
- v1 RN 4
- v2 RN 5
- v3 RN 6
- v4 RN 7
- v5 RN 8
- v6 RN 9
- sl RN 10
- fp RN 11
- ip RN 12
- sp RN 13
- lr RN 14
- pc RN 15
-
- # Symbolic names for the eight floating point registers.
- f0 FN 0
- f1 FN 1
- f2 FN 2
- f3 FN 3
- f4 FN 4
- f5 FN 5
- f6 FN 6
- f7 FN 7
-
- AREA |C$$code|,CODE,READONLY
-
- # Literal words holding the addresses of imported variables. The routines
- # below load them pc-relative and store their secondary results (high
- # product word, remainders) through them.
- /* with GNU C we pass the second value in a2, don't need a global variable */
- #ifndef __GNUC__
- ptr_mulu32_high
- IMPORT mulu32_high
- DCD mulu32_high
- #endif
- # divu_16_rest is written unconditionally by divu_3216_1616_, so its
- # address is needed whatever the compiler.
- ptr_divu_16_rest
- IMPORT divu_16_rest
- DCD divu_16_rest
- #ifndef __GNUC__
- ptr_divu_32_rest
- IMPORT divu_32_rest
- DCD divu_32_rest
- #endif
-
- # extern uint32 mulu32_ (uint32 arg1, uint32 arg2);
- # Unsigned 32x32 -> 64 bit multiply built from four 16x16 bit MULs:
- # x*y = xhi*yhi*2^32 + (xhi*ylo + xlo*yhi)*2^16 + xlo*ylo
- # entry
- # a1 = x
- # a2 = y
- # exit
- # a1 = low32(x*y)
- # a2 = high32(x*y)
- # mulu32_high = high32(x*y) (stored only when not built with GNU C)
- # a3,a4,ip destroyed
- EXPORT mulu32_
- mulu32_
- MOV ip,a1,LSR #16 # temp := top half of x
- MOV a3,a2,LSR #16 # hi := top half of y
- BIC a1,a1,ip,LSL #16 # x := bottom half of x
- BIC a2,a2,a3,LSL #16 # y := bottom half of y
- MUL a4,a1,a2 # low section of result (xlo*ylo)
- MUL a2,ip,a2 # ) middle sections (xhi*ylo)
- MUL a1,a3,a1 # ) of result (yhi*xlo)
- MUL a3,ip,a3 # high section of result (xhi*yhi)
- ADDS a2,a2,a1 # add middle sections
- # (can't use mla as we need carry)
- ADDCS a3,a3,#&10000 # a carry out of the middle sum is worth 2^16 in the high word
- ADDS a1,a4,a2,LSL #16 # x is now bottom 32 bits of result
- ADC a2,a3,a2,LSR #16 # hi is top 32 bits (plus carry from the low word)
- #ifndef __GNUC__
- LDR a3,[pc,#ptr_mulu32_high-.-8] # a3 = address of mulu32_high (pc reads 8 bytes ahead)
- STR a2,[a3,#0] # mulu32_high = high32(x*y)
- #endif
- MOVS pc,lr # return
-
- # divu_3216_1616_: unsigned division producing a 16 bit quotient, done as
- # 16 unrolled non-restoring steps against the negated divisor * 2^15.
- # entry
- # a1 = x (dividend)
- # a2 = y (divisor)
- # exit
- # a1 = quotient, truncated to 16 bits
- # a2 = remainder
- # divu_16_rest = remainder
- # a3 destroyed
- # NOTE(review): only 16 quotient bits are produced, so callers presumably
- # guarantee x < y*2^16 - confirm against arilev0.
- EXPORT divu_3216_1616_
- divu_3216_1616_
- MOV a2,a2,LSL#15 # multiply divisor by 2^15
- RSB a2,a2,#0 # negate divisor
- ADDS a1,a2,a1 # dividend = dividend + -divisor/2
- SUBCC a1,a1,a2 # dividend = dividend - -divisor/2
- ADCS a1,a2,a1,LSL#1 # dividend = dividend*2 + -divisor
- # and shift quotient
- SUBCC a1,a1,a2 # undo the subtraction if it did not fit;
- # the ADCS/SUBCC step is repeated another 14 times
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- ADCS a1,a2,a1,LSL#1
- SUBCC a1,a1,a2
- MOV a2,a1,LSR#15 # move remainder into a2 and shift
- ADC a1,a1,a1 # move last bit of quotient in
- MOV a1,a1,LSL#16 # AND out top 16 bits by shifting up
- MOV a1,a1,LSR#16 # and back down again
- LDR a3,[pc,#ptr_divu_16_rest-.-8] # save rest so can be picked up later
- STR a2,[a3,#0] # the result is 16 bits
- MOVS pc, lr # return
-
- # extern uint32 divu_6432_3232_ (uint32 xhi, uint32 xlo, uint32 y); # -> Quotient q
- # extern uint32 divu_32_rest; # -> Rest r
- # see arilev0 for algorithm
- # Divides the 64 bit value xhi*2^32+xlo by y, producing the quotient 16 bits
- # at a time with divu_3216_1616_.
- # entry
- # a1 = xhi (dividend)
- # a2 = xlo (dividend)
- # a3 = y (divisor)
- # exit
- # a1 = 32 bit quotient
- # a2 = 32 bit remainder
- # divu_32_rest = remainder (stored only when not built with GNU C)
- # a3, a4 destroyed
- EXPORT divu_6432_3232_
- divu_6432_3232_
- STMFD sp!, {v1,v2,v3,v4,v5,v6,lr}
- MOV v2, a2 # = xlo
- MOV v1, a3 # = y
- CMP a3,#&10000 # y <= (uint32)(bit(16)-1)
- BCS |L0005b0.J4.divu_6432_3232_| # no: y needs more than 16 bits
- # y fits in 16 bits: two plain 32/16 divisions
- MOV a2, v2, LSR #16
- ORR a1, a2, a1, ASL #16 # = highlow32(low16(xhi),high16(xlo))
- MOV a2, v1
- BL divu_3216_1616_ # a1 = q1, a2 = r1
- MOV v3, a1 # = q1
- MOV a1, v2, ASL #16
- MOV a1, a1, LSR #16
- ORR a1, a1, a2, ASL #16 # = highlow32(r1,low16(xlo))
- MOV a2, v1
- BL divu_3216_1616_ # a1 = q0, a2 = remainder
- ORR a1, a1, v3, ASL #16 # = highlow32(q1,q0)
- #ifndef __GNUC__
- LDR a4,[pc,#ptr_divu_32_rest-.-8]
- STR a2,[a4,#0] # divu_32_rest = remainder
- #endif
- LDMFD sp!, {v1,v2,v3,v4,v5,v6,pc}^
-
- # y >= 2^16: normalise y so that its top bit is set, counting the shift in s
- |L0005b0.J4.divu_6432_3232_|
- MOV v3, #0 # s = 0
- MOVS a4, v1, LSR #16 # while ((sint32)y >= 0)
- ADDEQ v3, v3, #16 # { y = y<<1; s++; }
- MOVEQ v1, v1, ASL #16 # (done as a binary search: 16,8,4,2,1)
- MOVS a4, v1, LSR #24
- ADDEQ v3, v3, #8
- MOVEQ v1, v1, ASL #8
- MOVS a4, v1, LSR #28
- ADDEQ v3, v3, #4
- MOVEQ v1, v1, ASL #4
- MOVS a4, v1, LSR #30
- ADDEQ v3, v3, #2
- MOVEQ v1, v1, ASL #2
- MOVS a4, v1, LSR #31
- ADDEQ v3, v3, #1
- MOVEQ v1, v1, ASL #1
-
- CMPS v3, #0
- MOVNE a2, a1, ASL v3 # if (!(s==0))
- RSBNE a1, v3, #32 # { xhi = (xhi << s)
- ORRNE a1, a2, v2, LSR a1 # | (xlo >> (32-s));
- MOVNE v2, v2, ASL v3 # xlo = xlo << s; }
- ADD a2, v1, #&10000 # y1_1 = high16(y)+1
- MOVS v5, a2, LSR #16 # if (y1_1 = 0)
- MOVEQ v4, a1, ASL #16 # r16 = low16(xhi) * 2^16
- MOVEQ a1, a1, LSR #16 # q1 = high16(xhi)
- MOVNE a2, v5
- BLNE divu_3216_1616_ # divu_3216_1616(xhi,y1_1, q1=,r16=)
- # NOTE(review): the MOVNE below needs the NE condition to survive the call;
- # presumably the callee's MOVS pc,lr hands the caller's flags back - confirm.
- MOVNE v4, a2, ASL #16 # r16 = r16 * 2^16
- ORR v4, v4, v2, LSR #16 # r = highlow32(r16,high16(xlo))
- MOV a4, v1, ASL #16 # tmp = mulu16(low16(y),q1)
- MOV a4, a4, LSR #16
- MUL a3, a4, a1
- RSB a3, a3, a1, ASL #16 # r2 = highlow32_0(q1) - tmp
- MOV v6, a1 # = q1
- ADDS a1, v4, a3 # r += r2
- ADDCS v6, v6, #1 # if ( r < r2 ) { q1 += 1
- SUBCS a1, a1, v1 # r -= y }
- CMP a1, v1 # if (r >= y)
- ADDCS v6, v6, #1 # { q1 += 1
- SUBCS a1, a1, v1 # r -= y }
- CMP v5, #0 # if (y1_1 = 0)
- MOVEQ v4, a1, ASL #16 # { r16 = low16(r) * 2^16
- MOVEQ a1, a1, LSR #16 # q0 = high16(r) }
- MOVNE a2, v5
- BLNE divu_3216_1616_ # divu_3216_1616(r,y1_1, q0=,r16=)
- MOVNE v4, a2, ASL #16 # r16 = r16 * 2^16
- MOV v2, v2, ASL #16
- ORR v4, v4, v2, LSR #16 # r = highlow32(r16,low16(xlo))
- MOV a4, v1, ASL #16 # tmp = mulu16(low16(y),q0)
- MOV a4, a4, LSR #16
- MUL a3, a4, a1
- RSB a3, a3, a1, ASL #16 # r2 = highlow32_0(q0) - tmp
- ADDS v4, v4, a3 # r += r2
- ADDCS a1, a1, #1 # if ( r < r2 ) { q0 += 1
- SUBCS v4, v4, v1 # r -= y }
- CMP v4, v1 # if (r >= y)
- ADDCS a1, a1, #1 # { q0 += 1
- SUBCS v4, v4, v1 # r -= y }
- MOV a2, v4, LSR v3 # remainder = r >> s
- ORR a1, a1, v6, ASL #16 # return highlow32(q1,q0)
- #ifndef __GNUC__
- LDR a3,[pc,#ptr_divu_32_rest-.-8]
- STR a2,[a3,#0] # divu_32_rest = remainder
- #endif
- LDMFD sp!, {v1,v2,v3,v4,v5,v6,pc}^
-
- # extern uintD* copy_loop_up (uintD* sourceptr, uintD* destptr, uintC count);
- # Copies count words from sourceptr to destptr, working towards higher
- # addresses: first count mod 4 single words, then blocks of 4.
- # entry
- # a1 = source pointer
- # a2 = destination pointer
- # a3 = count of words to store
- # exit
- # a1 = address of last word stored + 1
- # a2 - a4, ip destroyed
- EXPORT |copy_loop_up| # word aligned copy loop up
- # NAME |copy_loop_up|
- |copy_loop_up|
- ANDS a4,a3,#3 # multiple of 4 words ?
- BEQ |copy_loop_up_l1| # yup, so branch
- CMP a4,#2 # copy the first 1-3 words
- LDR a4,[a1],#4 # to align the total to a multiple
- STR a4,[a2],#4 # of 4 words
- LDRGE a4,[a1],#4 # second word if count mod 4 >= 2
- STRGE a4,[a2],#4
- LDRGT a4,[a1],#4 # third word if count mod 4 = 3
- STRGT a4,[a2],#4
- |copy_loop_up_l1|
- BICS a4,a3,#3 # set counter to multiple of 4
- MOVEQ a1,a2 # return the advanced destination pointer
- MOVEQS pc,lr # if zero then we're done
- STMFD sp!,{v1,lr} # save work regs
- |copy_loop_up_l2|
- LDMIA a1!,{a3,v1,ip,lr} # copy 4 words in one go
- STMIA a2!,{a3,v1,ip,lr}
- SUBS a4,a4,#8 # decrement counter by 8
- LDMGEIA a1!,{a3,v1,ip,lr} # if at least 8 words were left then copy
- STMGEIA a2!,{a3,v1,ip,lr} # 4 more words
- BGT |copy_loop_up_l2| # and loop
- MOV a1,a2 # return the advanced destination pointer
- LDMFD sp!,{v1,pc}^ # restore work regs and return
-
- # extern uintD* copy_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
- # Copies count words working towards lower addresses; both pointers are
- # pre-decremented, so on entry they point just above the first word.
- # entry
- # a1 = source pointer
- # a2 = destination pointer
- # a3 = count of words to store
- # exit
- # a1 = address of last word stored
- # a2 - a4, ip destroyed
- EXPORT |copy_loop_down| # word aligned copy loop down
- # NAME |copy_loop_down|
- |copy_loop_down|
- ANDS a4,a3,#3 # multiple of 4 words ?
- BEQ |copy_loop_down_l1| # yup, so branch
- CMP a4,#2 # copy the first 1-3 words
- LDR a4,[a1,#-4]! # to align the total to a multiple
- STR a4,[a2,#-4]! # of 4 words
- LDRGE a4,[a1,#-4]! # second word if count mod 4 >= 2
- STRGE a4,[a2,#-4]!
- LDRGT a4,[a1,#-4]! # third word if count mod 4 = 3
- STRGT a4,[a2,#-4]!
- |copy_loop_down_l1|
- BICS a4,a3,#3 # set counter to multiple of 4
- MOVEQ a1,a2 # return addr of last word stored
- MOVEQS pc,lr # if zero then we're done
- STMFD sp!,{v1,lr} # save work regs
- |copy_loop_down_l2|
- LDMDB a1!,{a3,v1,ip,lr} # copy 4 words in one go
- STMDB a2!,{a3,v1,ip,lr}
- SUBS a4,a4,#8 # decrement counter by 8
- LDMGEDB a1!,{a3,v1,ip,lr} # if at least 8 words were left then copy
- STMGEDB a2!,{a3,v1,ip,lr} # 4 more words
- BGT |copy_loop_down_l2| # and loop
- MOV a1,a2 # return addr of last word stored
- LDMFD sp!,{v1,pc}^ # restore work regs and return
-
- # extern uintD* clear_loop_up (uintD* destptr, uintC count);
- # Stores count zero words; implemented as fill_loop_up with filler 0.
- # entry
- # a1 = destination pointer
- # a2 = count of words to store
- # exit
- # a1 = address of last word stored + 1
- # a2 - a4, ip destroyed
- EXPORT |clear_loop_up| # word aligned clear loop up
- |clear_loop_up|
- MOV a3,#0 # set filler to 0
- # and drop into fill_loop_up
-
- # extern uintD* fill_loop_up (uintD* destptr, uintC count, uintD filler);
- # Stores the filler word count times, working towards higher addresses.
- # entry
- # a1 = destination pointer
- # a2 = count of words to store
- # a3 = word to store
- # exit
- # a1 = address of last word stored + 1
- # a2 - a4, ip destroyed
- EXPORT |fill_loop_up| # word aligned fill loop up
- |fill_loop_up|
- ANDS a4,a2,#3 # multiple of 4 words ?
- BEQ |fill_loop_up_l1| # yup, so branch
- CMP a4,#2 # store the first 1-3 words
- STR a3,[a1],#4 # to align the total to a multiple
- STRGE a3,[a1],#4 # of 4 words
- STRGT a3,[a1],#4
- |fill_loop_up_l1|
- BICS a4,a2,#3 # set counter to multiple of 4
- MOVEQS pc,lr # if zero then we're done (a1 already advanced)
- STMFD sp!,{v1,lr} # save work regs
- MOV v1,a3 # copy filler to three other
- MOV ip,a3 # registers
- MOV lr,a3
- |fill_loop_up_l2|
- STMIA a1!,{a3,v1,ip,lr} # store 4 fillers in one go
- SUBS a4,a4,#8 # decrement counter by 8
- STMGEIA a1!,{a3,v1,ip,lr} # if at least 8 words were left then store 4
- BGT |fill_loop_up_l2| # more and loop
- LDMFD sp!,{v1,pc}^ # restore work regs and return
-
-
- # extern uintD* clear_loop_down (uintD* destptr, uintC count);
- # Stores count zero words; implemented as fill_loop_down with filler 0.
- # entry
- # a1 = destination pointer
- # a2 = count of words to store
- # exit
- # a1 = address of last word stored
- # a2 - a4, ip destroyed
- EXPORT |clear_loop_down| # word aligned clear loop down
- |clear_loop_down|
- MOV a3,#0 # set filler to 0
- # and drop into fill_loop_down
-
- # extern uintD* fill_loop_down (uintD* destptr, uintC count, uintD filler);
- # Stores the filler word count times, working towards lower addresses; the
- # pointer is pre-decremented, so on entry it points just above the first word.
- # entry
- # a1 = destination pointer
- # a2 = count of words to store
- # a3 = word to store
- # exit
- # a1 = address of last word stored
- # a2 - a4, ip destroyed
- EXPORT |fill_loop_down| # word aligned fill loop down
- |fill_loop_down|
- ANDS a4,a2,#3 # multiple of 4 words ?
- BEQ |fill_loop_down_l1| # yup, so branch
- CMP a4,#2 # store the first 1-3 words
- STR a3,[a1,#-4]! # to align the total to a multiple
- STRGE a3,[a1,#-4]! # of 4 words
- STRGT a3,[a1,#-4]!
- |fill_loop_down_l1|
- BICS a4,a2,#3 # set counter to multiple of 4
- MOVEQS pc,lr # if zero then we're done (a1 already moved down)
- STMFD sp!,{v1,lr} # save work regs
- MOV v1,a3 # copy filler to three other
- MOV ip,a3 # registers
- MOV lr,a3
- |fill_loop_down_l2|
- STMDB a1!,{a3,v1,ip,lr} # store 4 fillers in one go
- SUBS a4,a4,#8 # decrement counter by 8
- STMGEDB a1!,{a3,v1,ip,lr} # if at least 8 words were left then store 4
- BGT |fill_loop_down_l2| # more and loop
- LDMFD sp!,{v1,pc}^ # restore work regs and return
-
- # extern void or_loop_up (uintD* xptr, uintD* yptr, uintC count);
- # In-place bitwise OR of two word vectors, ascending addresses:
- # xptr[i] |= yptr[i] for 0 <= i < count
- # entry
- # a1 = xptr (read and written)
- # a2 = yptr (read only)
- # a3 = count of words
- # exit
- # a1 - a4, ip destroyed
- EXPORT |or_loop_up| # OR loop, ascending, word aligned
- |or_loop_up|
- ANDS a4,a3,#3 # a4 = count mod 4
- BEQ |or_loop_up_l1| # no odd words: go straight to the blocks of 4
- CMP a4,#2 # flags: LT = 1 odd word, EQ = 2, GT = 3
- LDR ip,[a1] # first odd word: fetch x
- LDR a4,[a2],#4 # and y (a4 is free, the flags hold the count)
- ORR ip,a4,ip
- STR ip,[a1],#4 # write back and advance xptr
- BLT |or_loop_up_l1| # there was only one odd word
- LDRGE ip,[a1] # second odd word
- LDRGE a4,[a2],#4
- ORRGE ip,a4,ip
- STRGE ip,[a1],#4
- LDRGT ip,[a1] # third odd word, if there is one
- LDRGT a4,[a2],#4
- ORRGT ip,a4,ip
- STRGT ip,[a1],#4
- |or_loop_up_l1|
- BICS a4,a3,#3 # a4 = words left, a multiple of 4
- MOVEQS pc,lr # none left: return
- STMFD sp!,{v1-v5,lr} # six more registers are needed
- |or_loop_up_l2|
- LDMIA a1,{v3,v4,v5,lr} # four x words (xptr advances at the store)
- LDMIA a2!,{a3,v1,v2,ip} # four y words, advance yptr
- ORR lr,ip,lr # combine the pairs
- ORR v5,v2,v5
- ORR v4,v1,v4
- ORR v3,a3,v3
- STMIA a1!,{v3,v4,v5,lr} # write the four results, advance xptr
- SUBS a4,a4,#4 # four fewer to go
- BGT |or_loop_up_l2| # repeat while any remain
- LDMFD sp!,{v1-v5,pc}^ # pop work registers and return
-
-
- # extern void xor_loop_up (uintD* xptr, uintD* yptr, uintC count);
- # In-place bitwise exclusive OR of two word vectors, ascending addresses:
- # xptr[i] ^= yptr[i] for 0 <= i < count
- # entry
- # a1 = xptr (read and written)
- # a2 = yptr (read only)
- # a3 = count of words
- # exit
- # a1 - a4, ip destroyed
- EXPORT |xor_loop_up| # XOR loop, ascending, word aligned
- |xor_loop_up|
- ANDS a4,a3,#3 # a4 = count mod 4
- BEQ |xor_loop_up_l1| # no odd words: go straight to the blocks of 4
- CMP a4,#2 # flags: LT = 1 odd word, EQ = 2, GT = 3
- LDR ip,[a1] # first odd word: fetch x
- LDR a4,[a2],#4 # and y (a4 is free, the flags hold the count)
- EOR ip,a4,ip
- STR ip,[a1],#4 # write back and advance xptr
- BLT |xor_loop_up_l1| # there was only one odd word
- LDRGE ip,[a1] # second odd word
- LDRGE a4,[a2],#4
- EORGE ip,a4,ip
- STRGE ip,[a1],#4
- LDRGT ip,[a1] # third odd word, if there is one
- LDRGT a4,[a2],#4
- EORGT ip,a4,ip
- STRGT ip,[a1],#4
- |xor_loop_up_l1|
- BICS a4,a3,#3 # a4 = words left, a multiple of 4
- MOVEQS pc,lr # none left: return
- STMFD sp!,{v1-v5,lr} # six more registers are needed
- |xor_loop_up_l2|
- LDMIA a1,{v3,v4,v5,lr} # four x words (xptr advances at the store)
- LDMIA a2!,{a3,v1,v2,ip} # four y words, advance yptr
- EOR lr,ip,lr # combine the pairs
- EOR v5,v2,v5
- EOR v4,v1,v4
- EOR v3,a3,v3
- STMIA a1!,{v3,v4,v5,lr} # write the four results, advance xptr
- SUBS a4,a4,#4 # four fewer to go
- BGT |xor_loop_up_l2| # repeat while any remain
- LDMFD sp!,{v1-v5,pc}^ # pop work registers and return
-
- # extern void and_loop_up (uintD* xptr, uintD* yptr, uintC count);
- # In-place bitwise AND of two word vectors, ascending addresses:
- # xptr[i] &= yptr[i] for 0 <= i < count
- # entry
- # a1 = xptr (read and written)
- # a2 = yptr (read only)
- # a3 = count of words
- # exit
- # a1 - a4, ip destroyed
- EXPORT |and_loop_up| # AND loop, ascending, word aligned
- |and_loop_up|
- ANDS a4,a3,#3 # a4 = count mod 4
- BEQ |and_loop_up_l1| # no odd words: go straight to the blocks of 4
- CMP a4,#2 # flags: LT = 1 odd word, EQ = 2, GT = 3
- LDR ip,[a1] # first odd word: fetch x
- LDR a4,[a2],#4 # and y (a4 is free, the flags hold the count)
- AND ip,a4,ip
- STR ip,[a1],#4 # write back and advance xptr
- BLT |and_loop_up_l1| # there was only one odd word
- LDRGE ip,[a1] # second odd word
- LDRGE a4,[a2],#4
- ANDGE ip,a4,ip
- STRGE ip,[a1],#4
- LDRGT ip,[a1] # third odd word, if there is one
- LDRGT a4,[a2],#4
- ANDGT ip,a4,ip
- STRGT ip,[a1],#4
- |and_loop_up_l1|
- BICS a4,a3,#3 # a4 = words left, a multiple of 4
- MOVEQS pc,lr # none left: return
- STMFD sp!,{v1-v5,lr} # six more registers are needed
- |and_loop_up_l2|
- LDMIA a1,{v3,v4,v5,lr} # four x words (xptr advances at the store)
- LDMIA a2!,{a3,v1,v2,ip} # four y words, advance yptr
- AND lr,ip,lr # combine the pairs
- AND v5,v2,v5
- AND v4,v1,v4
- AND v3,a3,v3
- STMIA a1!,{v3,v4,v5,lr} # write the four results, advance xptr
- SUBS a4,a4,#4 # four fewer to go
- BGT |and_loop_up_l2| # repeat while any remain
- LDMFD sp!,{v1-v5,pc}^ # pop work registers and return
-
- # extern void eqv_loop_up (uintD* xptr, uintD* yptr, uintC count);
- # In-place bitwise equivalence of two word vectors, ascending addresses:
- # xptr[i] = ~(xptr[i] ^ yptr[i]) for 0 <= i < count
- # entry
- # a1 = xptr (read and written)
- # a2 = yptr (read only)
- # a3 = count of words to be EQVed
- # exit
- # a1 - a4, ip destroyed
- EXPORT |eqv_loop_up| # EQV loop, ascending, word aligned
- |eqv_loop_up|
- ANDS a4,a3,#3 # a4 = count mod 4
- BEQ |eqv_loop_up_l1| # no odd words: go straight to the blocks of 4
- CMP a4,#2 # flags: LT = 1 odd word, EQ = 2, GT = 3
- LDR ip,[a1] # first odd word: fetch x
- LDR a4,[a2],#4 # and y (a4 is free, the flags hold the count)
- EOR ip,a4,ip # x ^ y
- MVN ip,ip # complemented
- STR ip,[a1],#4 # write back and advance xptr
- BLT |eqv_loop_up_l1| # there was only one odd word
- LDRGE ip,[a1] # second odd word
- LDRGE a4,[a2],#4
- EORGE ip,a4,ip
- MVNGE ip,ip
- STRGE ip,[a1],#4
- BLE |eqv_loop_up_l1| # exactly two odd words: skip the third
- LDRGT ip,[a1] # third odd word
- LDRGT a4,[a2],#4
- EORGT ip,a4,ip
- MVNGT ip,ip
- STRGT ip,[a1],#4
- |eqv_loop_up_l1|
- BICS a4,a3,#3 # a4 = words left, a multiple of 4
- MOVEQS pc,lr # none left: return
- STMFD sp!,{v1-v5,lr} # six more registers are needed
- |eqv_loop_up_l2|
- LDMIA a1,{v3,v4,v5,lr} # four x words (xptr advances at the store)
- LDMIA a2!,{a3,v1,v2,ip} # four y words, advance yptr
- EOR v3,a3,v3 # first x ^ y for all four pairs
- EOR v4,v1,v4
- EOR v5,v2,v5
- EOR lr,ip,lr
- MVN v3,v3 # then complement the four results
- MVN v4,v4
- MVN v5,v5
- MVN lr,lr
- STMIA a1!,{v3,v4,v5,lr} # write the four results, advance xptr
- SUBS a4,a4,#4 # four fewer to go
- BGT |eqv_loop_up_l2| # repeat while any remain
- LDMFD sp!,{v1-v5,pc}^ # pop work registers and return
-
- # extern void nand_loop_up (uintD* xptr, uintD* yptr, uintC count);
- # In-place bitwise NAND of two word vectors, ascending addresses:
- # xptr[i] = ~(xptr[i] & yptr[i]) for 0 <= i < count
- # entry
- # a1 = xptr (read and written)
- # a2 = yptr (read only)
- # a3 = count of words
- # exit
- # a1 - a4, ip destroyed
- EXPORT |nand_loop_up| # NAND loop, ascending, word aligned
- |nand_loop_up|
- ANDS a4,a3,#3 # a4 = count mod 4
- BEQ |nand_loop_up_l1| # no odd words: go straight to the blocks of 4
- CMP a4,#2 # flags: LT = 1 odd word, EQ = 2, GT = 3
- LDR ip,[a1] # first odd word: fetch x
- LDR a4,[a2],#4 # and y (a4 is free, the flags hold the count)
- AND ip,a4,ip # x & y
- MVN ip,ip # complemented
- STR ip,[a1],#4 # write back and advance xptr
- BLT |nand_loop_up_l1| # there was only one odd word
- LDRGE ip,[a1] # second odd word
- LDRGE a4,[a2],#4
- ANDGE ip,a4,ip
- MVNGE ip,ip
- STRGE ip,[a1],#4
- BLE |nand_loop_up_l1| # exactly two odd words: skip the third
- LDRGT ip,[a1] # third odd word
- LDRGT a4,[a2],#4
- ANDGT ip,a4,ip
- MVNGT ip,ip
- STRGT ip,[a1],#4
- |nand_loop_up_l1|
- BICS a4,a3,#3 # a4 = words left, a multiple of 4
- MOVEQS pc,lr # none left: return
- STMFD sp!,{v1-v5,lr} # six more registers are needed
- |nand_loop_up_l2|
- LDMIA a1,{v3,v4,v5,lr} # four x words (xptr advances at the store)
- LDMIA a2!,{a3,v1,v2,ip} # four y words, advance yptr
- AND v3,a3,v3 # first x & y for all four pairs
- AND v4,v1,v4
- AND v5,v2,v5
- AND lr,ip,lr
- MVN v3,v3 # then complement the four results
- MVN v4,v4
- MVN v5,v5
- MVN lr,lr
- STMIA a1!,{v3,v4,v5,lr} # write the four results, advance xptr
- SUBS a4,a4,#4 # four fewer to go
- BGT |nand_loop_up_l2| # repeat while any remain
- LDMFD sp!,{v1-v5,pc}^ # pop work registers and return
-
- # extern void nor_loop_up (uintD* xptr, uintD* yptr, uintC count);
- # In-place bitwise NOR of two word vectors, ascending addresses:
- # xptr[i] = ~(xptr[i] | yptr[i]) for 0 <= i < count
- # entry
- # a1 = xptr (read and written)
- # a2 = yptr (read only)
- # a3 = count of words
- # exit
- # a1 - a4, ip destroyed
- EXPORT |nor_loop_up| # NOR loop, ascending, word aligned
- |nor_loop_up|
- ANDS a4,a3,#3 # a4 = count mod 4
- BEQ |nor_loop_up_l1| # no odd words: go straight to the blocks of 4
- CMP a4,#2 # flags: LT = 1 odd word, EQ = 2, GT = 3
- LDR ip,[a1] # first odd word: fetch x
- LDR a4,[a2],#4 # and y (a4 is free, the flags hold the count)
- ORR ip,a4,ip # x | y
- MVN ip,ip # complemented
- STR ip,[a1],#4 # write back and advance xptr
- BLT |nor_loop_up_l1| # there was only one odd word
- LDRGE ip,[a1] # second odd word
- LDRGE a4,[a2],#4
- ORRGE ip,a4,ip
- MVNGE ip,ip
- STRGE ip,[a1],#4
- BLE |nor_loop_up_l1| # exactly two odd words: skip the third
- LDRGT ip,[a1] # third odd word
- LDRGT a4,[a2],#4
- ORRGT ip,a4,ip
- MVNGT ip,ip
- STRGT ip,[a1],#4
- |nor_loop_up_l1|
- BICS a4,a3,#3 # a4 = words left, a multiple of 4
- MOVEQS pc,lr # none left: return
- STMFD sp!,{v1-v5,lr} # six more registers are needed
- |nor_loop_up_l2|
- LDMIA a1,{v3,v4,v5,lr} # four x words (xptr advances at the store)
- LDMIA a2!,{a3,v1,v2,ip} # four y words, advance yptr
- ORR v3,a3,v3 # first x | y for all four pairs
- ORR v4,v1,v4
- ORR v5,v2,v5
- ORR lr,ip,lr
- MVN v3,v3 # then complement the four results
- MVN v4,v4
- MVN v5,v5
- MVN lr,lr
- STMIA a1!,{v3,v4,v5,lr} # write the four results, advance xptr
- SUBS a4,a4,#4 # four fewer to go
- BGT |nor_loop_up_l2| # repeat while any remain
- LDMFD sp!,{v1-v5,pc}^ # pop work registers and return
-
- # extern void andc2_loop_up (uintD* xptr, uintD* yptr, uintC count);
- # In-place AND with the complement of the second vector, ascending addresses:
- # xptr[i] &= ~yptr[i] for 0 <= i < count
- # entry
- # a1 = xptr (read and written)
- # a2 = yptr (read only)
- # a3 = count of words
- # exit
- # a1 - a4, ip destroyed
- EXPORT |andc2_loop_up| # ANDC2 loop, ascending, word aligned
- |andc2_loop_up|
- ANDS a4,a3,#3 # a4 = count mod 4
- BEQ |andc2_loop_up_l1| # no odd words: go straight to the blocks of 4
- CMP a4,#2 # flags: LT = 1 odd word, EQ = 2, GT = 3
- LDR ip,[a1] # first odd word: fetch x
- LDR a4,[a2],#4 # and y (a4 is free, the flags hold the count)
- BIC ip,ip,a4 # x & ~y
- STR ip,[a1],#4 # write back and advance xptr
- BLT |andc2_loop_up_l1| # there was only one odd word
- LDRGE ip,[a1] # second odd word
- LDRGE a4,[a2],#4
- BICGE ip,ip,a4
- STRGE ip,[a1],#4
- LDRGT ip,[a1] # third odd word, if there is one
- LDRGT a4,[a2],#4
- BICGT ip,ip,a4
- STRGT ip,[a1],#4
- |andc2_loop_up_l1|
- BICS a4,a3,#3 # a4 = words left, a multiple of 4
- MOVEQS pc,lr # none left: return
- STMFD sp!,{v1-v5,lr} # six more registers are needed
- |andc2_loop_up_l2|
- LDMIA a1,{v3,v4,v5,lr} # four x words (xptr advances at the store)
- LDMIA a2!,{a3,v1,v2,ip} # four y words, advance yptr
- BIC lr,lr,ip # clear the y bits in each x word
- BIC v5,v5,v2
- BIC v4,v4,v1
- BIC v3,v3,a3
- STMIA a1!,{v3,v4,v5,lr} # write the four results, advance xptr
- SUBS a4,a4,#4 # four fewer to go
- BGT |andc2_loop_up_l2| # repeat while any remain
- LDMFD sp!,{v1-v5,pc}^ # pop work registers and return
-
- # extern void orc2_loop_up (uintD* xptr, uintD* yptr, uintC count);
- # In-place OR with the complement of the second vector, ascending addresses:
- # xptr[i] |= ~yptr[i] for 0 <= i < count
- # entry
- # a1 = xptr (read and written)
- # a2 = yptr (read only)
- # a3 = count of words to be ORC2ed
- # exit
- # a1 - a4, ip destroyed
- EXPORT |orc2_loop_up| # ORC2 loop, ascending, word aligned
- |orc2_loop_up|
- ANDS a4,a3,#3 # a4 = count mod 4
- BEQ |orc2_loop_up_l1| # no odd words: go straight to the blocks of 4
- CMP a4,#2 # flags: LT = 1 odd word, EQ = 2, GT = 3
- LDR ip,[a1] # first odd word: fetch x
- LDR a4,[a2],#4 # and y (a4 is free, the flags hold the count)
- MVN a4,a4 # ~y
- ORR ip,a4,ip # x | ~y
- STR ip,[a1],#4 # write back and advance xptr
- BLT |orc2_loop_up_l1| # there was only one odd word
- LDRGE ip,[a1] # second odd word
- LDRGE a4,[a2],#4
- MVNGE a4,a4
- ORRGE ip,a4,ip
- STRGE ip,[a1],#4
- BLE |orc2_loop_up_l1| # exactly two odd words: skip the third
- LDRGT ip,[a1] # third odd word
- LDRGT a4,[a2],#4
- MVNGT a4,a4
- ORRGT ip,a4,ip
- STRGT ip,[a1],#4
- |orc2_loop_up_l1|
- BICS a4,a3,#3 # a4 = words left, a multiple of 4
- MOVEQS pc,lr # none left: return
- STMFD sp!,{v1-v5,lr} # six more registers are needed
- |orc2_loop_up_l2|
- LDMIA a1,{v3,v4,v5,lr} # four x words (xptr advances at the store)
- LDMIA a2!,{a3,v1,v2,ip} # four y words, advance yptr
- MVN a3,a3 # first complement the four y words
- MVN v1,v1
- MVN v2,v2
- MVN ip,ip
- ORR v3,a3,v3 # then OR them into the x words
- ORR v4,v1,v4
- ORR v5,v2,v5
- ORR lr,ip,lr
- STMIA a1!,{v3,v4,v5,lr} # write the four results, advance xptr
- SUBS a4,a4,#4 # four fewer to go
- BGT |orc2_loop_up_l2| # repeat while any remain
- LDMFD sp!,{v1-v5,pc}^ # pop work registers and return
-
- # extern void not_loop_up (uintD* xptr, uintC count);
- # In-place bitwise complement of a word vector, ascending addresses:
- # xptr[i] = ~xptr[i] for 0 <= i < count
- # entry
- # a1 = xptr (read and written)
- # a2 = count of words
- # exit
- # a1 - a4, ip destroyed
- EXPORT |not_loop_up| # NOT loop, ascending, word aligned
- |not_loop_up|
- ANDS a3,a2,#3 # a3 = count mod 4
- BEQ |not_loop_up_l1| # no odd words: go straight to the blocks of 4
- CMP a3,#2 # flags: LT = 1 odd word, EQ = 2, GT = 3
- LDR a3,[a1] # first odd word (a3 is free, the flags hold the count)
- MVN a3,a3
- STR a3,[a1],#4 # write back and advance xptr
- BLT |not_loop_up_l1| # there was only one odd word
- LDRGE a3,[a1] # second odd word
- MVNGE a3,a3
- STRGE a3,[a1],#4
- LDRGT a3,[a1] # third odd word, if there is one
- MVNGT a3,a3
- STRGT a3,[a1],#4
- |not_loop_up_l1|
- BICS a4,a2,#3 # a4 = words left, a multiple of 4
- MOVEQS pc,lr # none left: return
- STMFD sp!,{lr} # lr doubles as the fourth data register
- |not_loop_up_l2|
- LDMIA a1,{a2,a3,ip,lr} # four words (xptr advances at the store)
- MVN lr,lr # complement each of them
- MVN ip,ip
- MVN a3,a3
- MVN a2,a2
- STMIA a1!,{a2,a3,ip,lr} # write the four results, advance xptr
- SUBS a4,a4,#4 # four fewer to go
- BGT |not_loop_up_l2| # repeat while any remain
- LDMFD sp!,{pc}^ # pop the return address and return
-
- # extern void and_test_loop_up (uintD* xptr, uintD* yptr, uintC count);
- # Tests whether xptr[i] & yptr[i] is non-zero for any 0 <= i < count,
- # scanning towards higher addresses and returning at the first hit.
- # (The result is returned in a1, despite the "void" in the line above.)
- # entry
- # a1 = xptr
- # a2 = yptr
- # a3 = count of words to be AND_TESTed
- # exit
- # a1 = TRUE if any words ANDed together are non-zero else FALSE
- # a2 - a4, ip destroyed
- EXPORT |and_test_loop_up| # word aligned and_test loop up
- |and_test_loop_up|
- ANDS a4,a3,#3 # multiple of 4 words ?
- BEQ |and_test_loop_up_l1| # yup, so branch
- CMP a4,#2 # C clear if only one odd word
- LDR a4,[a2],#4 # AND_TEST the first 1-3 words
- LDR ip,[a1],#4 # to align the total to a multiple
- TST ip,a4 # of 4 words
- MOVNE a1,#1 # return TRUE if AND_TEST ok
- MOVNES pc,lr
- BCC |and_test_loop_up_l1| # better to branch than skip instrs.
- LDRGE a4,[a2],#4 # second odd word
- LDRGE ip,[a1],#4
- TSTGE ip,a4
- MOVNE a1,#1
- MOVNES pc,lr
- ANDS a4,a3,#3 # TST changed the flags, so work out
- CMP a4,#2 # the number of odd words again
- BLE |and_test_loop_up_l1| # better to branch than skip instrs.
- LDRGT a4,[a2],#4 # third odd word
- LDRGT ip,[a1],#4
- TSTGT ip,a4
- MOVNE a1,#1
- MOVNES pc,lr
- |and_test_loop_up_l1|
- BICS a4,a3,#3 # set counter to multiple of 4
- MOVEQ a1,#0 # return FALSE
- MOVEQS pc,lr # if zero then we're done
- STMFD sp!,{v1-v6,lr} # save work regs; v6 is overwritten below, so
- # it has to be saved, and both exits pop v1-v6
- MOV v6,a1 # move xptr to v6
- MOV a1,#1 # set result to TRUE
- |and_test_loop_up_l2|
- LDMIA a2!,{a3,v1,v2,ip} # load 4 words in one go
- LDMIA v6!,{v3,v4,v5,lr} # load target words
- TST v3,a3 # AND_TEST the four words
- TSTEQ v4,v1 # each test only runs while all
- TSTEQ v5,v2 # earlier ones gave zero
- TSTEQ lr,ip
- LDMNEFD sp!,{v1-v6,pc}^ # a pair overlapped: return TRUE
- SUBS a4,a4,#4 # decrement counter by 4
- BGT |and_test_loop_up_l2| # if count still positive then loop
- MOV a1,#0 # nothing overlapped: return FALSE
- LDMFD sp!,{v1-v6,pc}^ # restore work regs and return
-
- # extern void test_loop_up (uintD* xptr, uintC count);
- # Tests whether any of count words is non-zero, scanning towards higher
- # addresses and returning at the first non-zero word.
- # entry
- # a1 = xptr
- # a2 = count of words to be TESTed
- # exit
- # a1 = TRUE if any words are non-zero else FALSE
- # a2 - a4, ip destroyed
- EXPORT |test_loop_up| # word aligned test loop up
- |test_loop_up|
- MOV ip,a1 # move xptr to ip
- MOV a1,#1 # set result to TRUE
- ANDS a3,a2,#3 # multiple of 4 words ?
- BEQ |test_loop_up_l1| # yup, so branch
- LDR a4,[ip],#4 # TEST the first 1-3 words
- TEQ a4,#0 # align the total to a multiple of 4
- MOVNES pc,lr # return TRUE if the word is non-zero
- CMP a3,#2 # at least two odd words ?
- BLT |test_loop_up_l1| # need to branch 'cos PSR set
- LDRGE a4,[ip],#4 # when checking against zero
- TEQGE a4,#0
- MOVNES pc,lr
- CMP a3,#2 # three odd words ?
- BLE |test_loop_up_l1| # need to branch 'cos PSR set
- LDRGT a4,[ip],#4 # when checking against zero
- TEQGT a4,#0
- MOVNES pc,lr
- |test_loop_up_l1|
- BICS a4,a2,#3 # set counter to multiple of 4
- MOVEQ a1,#0 # return FALSE
- MOVEQS pc,lr # if zero then we're done
- STMFD sp!,{v1,lr} # save work regs
- |test_loop_up_l2|
- LDMIA ip!,{a2,a3,v1,lr} # load 4 words in one go
- TEQ a2,#0 # TEST the four words
- TEQEQ a3,#0 # each test only runs while all
- TEQEQ v1,#0 # earlier words were zero
- TEQEQ lr,#0
- LDMNEFD sp!,{v1,pc}^ # a non-zero word: return TRUE (a1 is still 1)
- SUBS a4,a4,#4 # decrement counter by 4
- BGT |test_loop_up_l2| # if count still positive then loop
- MOV a1,#0 # all words were zero: return FALSE
- LDMFD sp!,{v1,pc}^ # restore work regs and return
-
- # extern void compare_loop_up (uintD* xptr, uintD* yptr, uintC count);
- # Compares two word vectors towards higher addresses using unsigned word
- # comparisons; the first differing pair decides the result.
- # entry
- # a1 = xptr
- # a2 = yptr
- # a3 = count of words to be COMPAREd
- # exit
- # a1 = +1 if first non-equal word in xptr[] and yptr[]
- # xptr[i] > yptr[i]
- # -1 if xptr[i] < yptr[i]
- # 0 otherwise
- # a2 - a4, ip destroyed
- EXPORT |compare_loop_up| # word aligned compare loop up
- |compare_loop_up|
- ANDS a4,a3,#3 # multiple of 4 words ?
- BEQ |compare_loop_up_l1| # yup, so branch
- LDR a4,[a2],#4 # COMPARE the first 1-3 words
- LDR ip,[a1],#4 # to align the total to a multiple
- CMP ip,a4 # of 4 words
- MVNLO a1,#0 # x < y -> -1
- MOVHI a1,#1 # x > y -> +1
- MOVNES pc,lr # and return result if not equal
- ANDS a4,a3,#3 # the compare changed the flags, so work out
- CMP a4,#2 # the number of odd words again
- BLT |compare_loop_up_l1| # need to branch 'cos PSR used
- LDR a4,[a2],#4 # second odd word
- LDR ip,[a1],#4
- CMP ip,a4
- MVNLO a1,#0
- MOVHI a1,#1
- MOVNES pc,lr
- ANDS a4,a3,#3 # and once more for the
- CMP a4,#2 # third odd word
- BLE |compare_loop_up_l1| # need to branch 'cos PSR used
- LDR a4,[a2],#4
- LDR ip,[a1],#4
- CMP ip,a4
- MVNLO a1,#0
- MOVHI a1,#1
- MOVNES pc,lr
- |compare_loop_up_l1|
- BICS a4,a3,#3 # set counter to multiple of 4
- MOVEQ a1,#0 # xptr[] == yptr[] -> 0
- MOVEQS pc,lr # if zero then we're done
- STMFD sp!,{v1-v6,lr} # save work regs
- MOV v6,a1 # move xptr to v6
- MOV a1,#1 # set result to +1
- |compare_loop_up_l2|
- LDMIA a2!,{a3,v1,v2,ip} # load 4 words in one go
- LDMIA v6!,{v3,v4,v5,lr} # load test words
- CMP v3,a3 # COMPARE the four words
- CMPEQ v4,v1 # each compare only runs while all earlier
- CMPEQ v5,v2 # pairs were equal, so the flags describe
- CMPEQ lr,ip # the first differing pair
- MVNLO a1,#0 # x < y -> -1 (a1 already holds +1)
- LDMNEFD sp!,{v1-v6,pc}^ # a pair differed: return +1 or -1
- SUBS a4,a4,#4 # decrement counter by 4
- BGT |compare_loop_up_l2| # if count still positive then loop
- MOV a1,#0 # all words equal -> 0
- LDMFD sp!,{v1-v6,pc}^ # restore work regs and return
-
- # extern uintD addto_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
- # Implemented as add_loop_down with destptr used both as second source
- # and as destination.
- # entry
- # a1 = sourceptr
- # a2 = destptr
- # a3 = count of words to be added
- # exit
- # destptr[] = sourceptr[] + destptr[]
- # a1 = last carry
- # a2 - a4, ip destroyed
- EXPORT |addto_loop_down| # word aligned addto loop down
- |addto_loop_down|
- MOV a4,a3 # set regs for a call
- MOV a3,a2 # to add_loop_down
- # and drop into add_loop_down
-
- # extern uintD add_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
- # Multi-word addition working towards lower addresses; all three pointers
- # are pre-decremented and the carry is propagated from word to word.
- # entry
- # a1 = sourceptr1
- # a2 = sourceptr2
- # a3 = destptr
- # a4 = count of words to be added
- # exit
- # destptr[] = sourceptr1[] + sourceptr2[]
- # a1 = last carry
- # a2 - a4, ip destroyed
- EXPORT |add_loop_down| # word aligned add loop down
- |add_loop_down|
- ANDS ip,a4,#3 # multiple of 4 words ?
- BEQ |add_loop_down_l1| # yup, so branch
- STMFD sp!,{v6,lr} # v6 and lr serve as the two data registers
- LDR v6,[a2,#-4]! # add the first 1-3 words
- LDR lr,[a1,#-4]! # to align the total to a multiple
- ADDS lr,lr,v6 # of 4 words
- STR lr,[a3,#-4]!
- TEQ ip,#1 # the carry has to survive this test
- BEQ |add_loop_down_l0| # need to branch 'cos PSR used
- LDR v6,[a2,#-4]! # second odd word, with carry
- LDR lr,[a1,#-4]!
- ADCS lr,lr,v6
- STR lr,[a3,#-4]!
- TEQ ip,#2
- BEQ |add_loop_down_l0| # need to branch 'cos PSR used
- LDR v6,[a2,#-4]! # third odd word, with carry
- LDR lr,[a1,#-4]!
- ADCS lr,lr,v6
- STR lr,[a3,#-4]!
- |add_loop_down_l0| # at least one add has happened
- BICS a4,a4,#3 # set counter to multiple of 4
- BNE |add_loop_down_l3| # branch if more adds to do
- ADCEQ a1,a4,a4 # set result to Carry (a4 is 0)
- LDMEQFD sp!,{v6,pc}^ # and return
- |add_loop_down_l1|
- BICS a4,a4,#3 # set counter to multiple of 4
- MOVEQ a1,#0 # no adds, so C = 0
- MOVEQS pc,lr # if zero then we're done
- CMN a4,#0 # clear carry bit
- STMFD sp!,{v6,lr} # same frame as on the odd-word path
- |add_loop_down_l3|
- STMFD sp!,{v1-v5} # save work regs (below v6 and lr, so that
- # one LDMFD of v1-v6,pc unwinds both pushes)
- |add_loop_down_l2|
- LDMDB a2!,{v1,v2,v3,ip} # load 4 words in one go
- LDMDB a1!,{v4,v5,v6,lr} # and from source2
- ADCS lr,lr,ip # add the four words with carry
- ADCS v6,v6,v3 # (highest address first)
- ADCS v5,v5,v2
- ADCS v4,v4,v1
- STMDB a3!,{v4,v5,v6,lr} # store 4 results
- SUB a4,a4,#4 # decrement counter by 4, preserve C
- TEQ a4,#0 # are we done ?
- BNE |add_loop_down_l2| # if count non-zero then loop
- ADC a1,a4,a4 # set result to Carry (a4 is 0)
- LDMFD sp!,{v1-v6,pc}^ # restore work regs and return
-
- # extern uintD inc_loop_down (uintD* ptr, uintC count);
- # Increments a multi-word number working towards lower addresses: one
- # single word if count is odd, then one pair if needed, then blocks of 4.
- # entry
- # a1 = ptr
- # a2 = count of words to be INCed
- # exit
- # a1 = 0 if any words are non-zero after increment else 1
- # stop incrementing when first word becomes non-zero
- # a2 - a4, ip destroyed
- EXPORT |inc_loop_down| # word aligned inc loop down
- |inc_loop_down|
- ANDS a3,a2,#1 # multiple of 2 words ?
- BEQ |inc_loop_down_l1| # yup, so branch
- LDR a4,[a1,#-4]! # INC the first word
- ADDS a4,a4,#1 # align the total to a multiple of 2
- STR a4,[a1]
- MOVNE a1,#0 # set result to 0
- MOVNES pc,lr # return 0 if non-zero result
- |inc_loop_down_l1|
- BICS a4,a2,#1 # set counter to multiple of 2
- MOVEQ a1,#1 # return 1
- MOVEQS pc,lr # if zero then we're done
- MOV ip,a1 # move ptr to ip
- MOV a1,#0 # set result to 0
- ANDS a3,a4,#3 # is the rest a multiple of 4 words ?
- BEQ |inc_loop_down_l3| # yup, so no pair to do first
- LDMDB ip,{a2,a3} # load 2 words in one go
- ADDS a3,a3,#1 # INC the two words
- ADDEQS a2,a2,#1 # stopping when first word non-zero
- STMDB ip!,{a2,a3} # store 2 results
- MOVNES pc,lr # return 0 if any result non-zero
- SUBS a4,a4,#2 # decrement counter by 2
- MOVEQ a1,#1 # if finished loop then
- MOVEQS pc,lr # return 1
- |inc_loop_down_l3| # now a multiple of 4 words
- STMFD sp!,{v1,lr} # save work regs
- |inc_loop_down_l2|
- LDMDB ip,{a2,a3,v1,lr} # load 4 words in one go
- ADDS lr,lr,#1 # INC the four words
- ADDEQS v1,v1,#1 # stopping when first word non-zero
- ADDEQS a3,a3,#1
- ADDEQS a2,a2,#1
- STMDB ip!,{a2,a3,v1,lr} # store 4 results
- LDMNEFD sp!,{v1,pc}^ # return 0 if any result non-zero
- SUBS a4,a4,#4 # decrement counter by 4
- BGT |inc_loop_down_l2| # if count still positive then loop
- MOV a1,#1 # every word wrapped to zero
- LDMFD sp!,{v1,pc}^ # restore work regs and return 1
-
- # extern uintD sub_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
- # entry
- # a1 = sourceptr1 (minuend; all three pointers are used with pre-decrement)
- # a2 = sourceptr2 (subtrahend)
- # a3 = destptr
- # a4 = count of words to be subtracted
- # exit
- # destptr[] = sourceptr1[] - sourceptr2[]
- # a1 = 0 if the last subtraction did not borrow (or count = 0), -1 if it did
- # a2 - a4, ip destroyed
- EXPORT |sub_loop_down| # word aligned sub loop down
- |sub_loop_down|
- ANDS ip,a4,#3 # multiple of 4 words ?
- BEQ |sub_loop_down_l1| # yup, so branch
- STMFD sp!,{v6,lr} # v6 and lr are the scratch regs for the leading words
- LDR v6,[a2,#-4]! # subtract the first 1-3 words
- LDR lr,[a1,#-4]! # to align the total to a multiple
- SUBS lr,lr,v6 # of 4 words (plain SUBS: no borrow into the first word)
- STR lr,[a3,#-4]!
- TEQ ip,#1 # just one leading word ? (the code relies on C surviving this test)
- BNE |sub_loop_down_l0| # branch if more than one subtract
- |sub_loop_down_l4| # drop through for better instr. timings
- BICS a4,a4,#3 # set counter to multiple of 4
- SBCEQ a1,a4,a4 # set result from Carry: 0 - 0 - NOT C (a4 is 0)
- LDMEQFD sp!,{v6,pc}^ # and return
- STMFD sp!,{v1-v5} # save the remaining work regs (v6,lr are already stacked)
- B |sub_loop_down_l2| # branch if more subtracts to do
- |sub_loop_down_l0|
- LDR v6,[a2,#-4]! # second leading word, this time with borrow
- LDR lr,[a1,#-4]!
- SBCS lr,lr,v6
- STR lr,[a3,#-4]!
- TEQ ip,#2 # exactly two leading words ?
- BEQ |sub_loop_down_l4| # need to branch 'cos PSR used
- LDR v6,[a2,#-4]! # third leading word
- LDR lr,[a1,#-4]!
- SBCS lr,lr,v6
- STR lr,[a3,#-4]!
- B |sub_loop_down_l4|
- |sub_loop_down_l1|
- BICS a4,a4,#3 # set counter to multiple of 4
- MOVEQ a1,#0 # no subtracts, so the result is 0
- MOVEQS pc,lr # if zero then we're done
- CMP a4,#0 # set carry bit, since a4 > 0 (C set = no borrow into the first SBCS)
- STMFD sp!,{v1-v6,lr} # save work regs
- |sub_loop_down_l2|
- LDMDB a2!,{v1,v2,v3,ip} # load 4 words of sourceptr2 in one go
- LDMDB a1!,{v4,v5,v6,lr} # and 4 words of sourceptr1
- SBCS lr,lr,ip # subtract the four words with carry,
- SBCS v6,v6,v3 # starting with the word at the highest address
- SBCS v5,v5,v2
- SBCS v4,v4,v1
- STMDB a3!,{v4,v5,v6,lr} # store 4 results
- SUB a4,a4,#4 # decrement counter by 4, preserve C
- TEQ a4,#0 # are we done ?
- BNE |sub_loop_down_l2| # if count non-zero then loop
- SBC a1,a4,a4 # set result from Carry: 0 if no borrow, else -1 (a4 is 0)
- LDMFD sp!,{v1-v6,pc}^ # restore work regs and return
-
- # extern uintD subx_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count, uintD carry);
- # entry
- # a1 = sourceptr1 (minuend; all three pointers are used with pre-decrement)
- # a2 = sourceptr2 (subtrahend)
- # a3 = destptr
- # a4 = count of words to be subtracted
- # [sp] = carry (0 = no borrow in; any non-zero value is treated as a borrow)
- # exit
- # destptr[] = sourceptr1[] - sourceptr2[] - incoming borrow
- # a1 = 0 if the last subtraction did not borrow, -1 if it did
- # a2 - a4, ip destroyed
- EXPORT |subx_loop_down| # word aligned xsub loop down
- |subx_loop_down|
- LDR ip,[sp] # get starting value of carry
- |subx_loop_down_lsub| # NOTE(review): not referenced in this part of the file; presumably an entry with the carry already in ip - confirm
- RSBS ip,ip,#0 # set carry in PSR: 0 - carry leaves C set only when carry = 0
- ANDS ip,a4,#3 # multiple of 4 words ?
- BEQ |subx_loop_down_l1| # yup, so branch
- STMFD sp!,{v6,lr} # v6 and lr are the scratch regs for the leading words
- LDR v6,[a2,#-4]! # subtract the first 1-3 words
- LDR lr,[a1,#-4]! # to align the total to a multiple
- SBCS lr,lr,v6 # of 4 words (SBCS: the incoming borrow is applied here)
- STR lr,[a3,#-4]!
- TEQ ip,#1 # just one leading word ? (the code relies on C surviving this test)
- BNE |subx_loop_down_l0| # branch if more than one subtract
- |subx_loop_down_l4| # drop through for better instr. timings
- BICS a4,a4,#3 # set counter to multiple of 4
- SBCEQ a1,a4,a4 # set result from Carry: 0 - 0 - NOT C (a4 is 0)
- LDMEQFD sp!,{v6,pc}^ # and return
- STMFD sp!,{v1-v5} # save the remaining work regs (v6,lr are already stacked)
- B |subx_loop_down_l2| # branch if more subtracts to do
- |subx_loop_down_l0|
- LDR v6,[a2,#-4]! # second leading word
- LDR lr,[a1,#-4]!
- SBCS lr,lr,v6
- STR lr,[a3,#-4]!
- TEQ ip,#2 # exactly two leading words ?
- BEQ |subx_loop_down_l4| # need to branch 'cos PSR used
- LDR v6,[a2,#-4]! # third leading word
- LDR lr,[a1,#-4]!
- SBCS lr,lr,v6
- STR lr,[a3,#-4]!
- B |subx_loop_down_l4|
- |subx_loop_down_l1|
- BICS a4,a4,#3 # set counter to multiple of 4
- SBCEQ a1,a4,a4 # no subtracts: result is just the incoming borrow (a4 is 0)
- MOVEQS pc,lr # if zero then we're done
- STMFD sp!,{v1-v6,lr} # save work regs
- |subx_loop_down_l2|
- LDMDB a2!,{v1,v2,v3,ip} # load 4 words of sourceptr2 in one go
- LDMDB a1!,{v4,v5,v6,lr} # and 4 words of sourceptr1
- SBCS lr,lr,ip # subtract the four words with carry,
- SBCS v6,v6,v3 # starting with the word at the highest address
- SBCS v5,v5,v2
- SBCS v4,v4,v1
- STMDB a3!,{v4,v5,v6,lr} # store 4 results
- SUB a4,a4,#4 # decrement counter by 4, preserve C
- TEQ a4,#0 # are we done ?
- BNE |subx_loop_down_l2| # if count non-zero then loop
- SBC a1,a4,a4 # set result from Carry: 0 if no borrow, else -1 (a4 is 0)
- LDMFD sp!,{v1-v6,pc}^ # restore work regs and return
-
- # extern uintD subfrom_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
- # entry
- # a1 = sourceptr (subtrahend; both pointers are used with pre-decrement)
- # a2 = destptr (minuend, overwritten in place with the difference)
- # a3 = count of words to be subtracted
- # exit
- # destptr[] = destptr[] - sourceptr[]
- # a1 = 0 if the last subtraction did not borrow (or count = 0), -1 if it did
- # a2 - a4, ip destroyed
- EXPORT |subfrom_loop_down|
- |subfrom_loop_down| # word aligned subfrom loop down
- ANDS ip,a3,#3 # multiple of 4 words ?
- BEQ |subfrom_loop_down_l1| # yup, so branch
- STMFD sp!,{lr} # lr is used as a data register below
- LDR a4,[a1,#-4]! # subtract the first 1-3 words
- LDR lr,[a2,#-4]! # to align the total to a multiple
- SUBS lr,lr,a4 # of 4 words (plain SUBS: no borrow into the first word)
- STR lr,[a2] # store back in place
- TEQ ip,#1 # just one leading word ? (the code relies on C surviving this test)
- BNE |subfrom_loop_down_l0| # branch if more than one subtract
- |subfrom_loop_down_l4| # drop through for better instr. timings
- BICS a4,a3,#3 # set counter to multiple of 4
- SBCEQ a1,a4,a4 # set result from Carry: 0 - 0 - NOT C (a4 is 0)
- LDMEQFD sp!,{pc}^ # and return
- STMFD sp!,{v1-v5} # save work regs (lr is already stacked)
- B |subfrom_loop_down_l2| # branch if more subtracts to do
- |subfrom_loop_down_l0|
- LDR a4,[a1,#-4]! # second leading word, this time with borrow
- LDR lr,[a2,#-4]!
- SBCS lr,lr,a4
- STR lr,[a2]
- TEQ ip,#2 # exactly two leading words ?
- BEQ |subfrom_loop_down_l4| # need to branch 'cos PSR used
- LDR a4,[a1,#-4]! # third leading word
- LDR lr,[a2,#-4]!
- SBCS lr,lr,a4
- STR lr,[a2]
- B |subfrom_loop_down_l4|
- |subfrom_loop_down_l1|
- BICS a4,a3,#3 # set counter to multiple of 4
- MOVEQ a1,#0 # no subtracts, so the result is 0
- MOVEQS pc,lr # if zero then we're done
- CMP a4,#0 # set carry bit, since a4 > 0 (C set = no borrow into the first SBCS)
- STMFD sp!,{v1-v5,lr} # save work regs
- |subfrom_loop_down_l2|
- LDMDB a1!,{a3,v1,v2,ip} # load 4 source words in one go (a3 is free, the count is in a4)
- LDMDB a2,{v3,v4,v5,lr} # and from destptr, no writeback yet
- SBCS lr,lr,ip # subtract the four words with carry,
- SBCS v5,v5,v2 # starting with the word at the highest address
- SBCS v4,v4,v1
- SBCS v3,v3,a3
- STMDB a2!,{v3,v4,v5,lr} # store 4 results, now with writeback
- SUB a4,a4,#4 # decrement counter by 4, preserve C
- TEQ a4,#0 # are we done ?
- BNE |subfrom_loop_down_l2| # if count non-zero then loop
- SBC a1,a4,a4 # set result from Carry: 0 if no borrow, else -1 (a4 is 0)
- LDMFD sp!,{v1-v5,pc}^ # restore work regs and return
-
- # extern uintD dec_loop_down (uintD* ptr, uintC count);
- # entry: subtracts 1 from the multi-word number held in the count words below ptr
- # a1 = ptr (words are read with pre-decrement, so ptr[-1] is decremented first)
- # a2 = count of words to be DECed
- # exit
- # a1 = 0 if any words are non-zero before decrement else -1
- # stop decrementing when first word is non-zero
- # a2 - a4, ip destroyed
- EXPORT |dec_loop_down|
- |dec_loop_down| # word aligned dec loop down
- ANDS a3,a2,#1 # multiple of 2 words ?
- BEQ |dec_loop_down_l1| # yup, so skip the single-word step
- LDR a4,[a1,#-4]! # DEC the first word (a1 is updated by the pre-decrement)
- SUBS a4,a4,#1 # align the total to a multiple of 2; C set = no borrow
- STR a4,[a1]
- MOVCS a1,#0 # no borrow (word was non-zero): result is 0
- MOVCSS pc,lr # return 0 if the word did not borrow
- |dec_loop_down_l1|
- BICS a4,a2,#1 # set counter to multiple of 2
- MVNEQ a1,#0 # no words left to absorb the borrow: return -1
- MOVEQS pc,lr # if zero then we're done
- MOV ip,a1 # move ptr to ip, a1 is needed for the result
- MOV a1,#0 # preset result to 0 for the early exits below
- ANDS a3,a4,#3 # remaining count a multiple of 4 ?
- BEQ |dec_loop_down_l3| # yup, go straight to the 4-word loop
- LDMDB ip,{a2,a3} # load 2 words in one go (a3 = word at the higher address)
- SUBS a3,a3,#1 # DEC the two words
- SUBCCS a2,a2,#1 # second word only if the first one borrowed
- STMDB ip!,{a2,a3} # store 2 results, now with writeback
- MOVCSS pc,lr # return 0 once a word did not borrow
- SUBS a4,a4,#2 # decrement counter by 2
- MVNEQ a1,#0 # if finished loop then
- MOVEQS pc,lr # return -1
- |dec_loop_down_l3| # now a multiple of 4 words
- STMFD sp!,{v1,lr} # save work regs (lr is used as a data register below)
- |dec_loop_down_l2|
- LDMDB ip,{a2,a3,v1,lr} # load 4 words in one go (lr = word at the highest address)
- SUBS lr,lr,#1 # DEC the four words
- SUBCCS v1,v1,#1 # each further word only while the previous one borrowed
- SUBCCS a3,a3,#1
- SUBCCS a2,a2,#1
- STMDB ip!,{a2,a3,v1,lr} # store 4 results
- LDMCSFD sp!,{v1,pc}^ # return 0 (a1 was preset) once a word did not borrow
- SUBS a4,a4,#4 # decrement counter by 4
- BGT |dec_loop_down_l2| # if count still positive then loop
- MVN a1,#0 # every word borrowed: result is -1
- LDMFD sp!,{v1,pc}^ # restore work regs and return -1
-
- # extern void neg_loop_down (uintD* ptr, uintC count);
- # entry
- # a1 = ptr (words are read with pre-decrement, so ptr[-1] is handled first)
- # a2 = count of words. The long integer is to be NEGated
- # exit
- # ptr[] = -ptr[] for count words
- # a1 = 0 if all count words were zero (or count = 0), else -1 (set by the code although the prototype above says void)
- # a2 - a4, ip destroyed
- EXPORT |neg_loop_down|
- |neg_loop_down| # word aligned neg loop down
- CMPS a2,#0 # count = 0 ?
- MOVEQ a1,#0 # yup, so return 0
- MOVEQS pc,lr
- |neg_loop_down_l1| # skip all the zero words first
- LDR a3,[a1,#-4]! # compare words against zero
- CMPS a3,#0 # downwards in memory
- BNE |neg_loop_down_l2| # non-zero, so negate rest of words
- SUBS a2,a2,#1 # reduce count of words
- BNE |neg_loop_down_l1| # more ?, so loop
- MOV a1,#0 # all words were zero (left unchanged): return 0
- MOVS pc,lr
- |neg_loop_down_l2|
- RSB a3,a3,#0 # first non-zero word = -word
- STR a3,[a1]
- SUBS a2,a2,#1 # a2 = number of words still to be complemented
- MVNEQ a1,#0 # done ? -> return -1
- MOVEQS pc,lr
- # now NOT rest of the words
- ANDS a3,a2,#3 # multiple of 4 words ?
- BEQ |neg_loop_down_l3| # yup, so branch
- CMP a3,#2 # NOT the first 1-3 words; flags stay live for the BLT/GE/GT below
- LDR a3,[a1,#-4]! # to align the total to a multiple
- MVN a3,a3 # of 4 words
- STR a3,[a1]
- BLT |neg_loop_down_l3| # better to branch than skip instrs.
- LDRGE a3,[a1,#-4]! # second word if 2 or 3 leading words
- MVNGE a3,a3
- STRGE a3,[a1]
- LDRGT a3,[a1,#-4]! # third word if 3 leading words
- MVNGT a3,a3
- STRGT a3,[a1]
- |neg_loop_down_l3|
- BICS a4,a2,#3 # set counter to multiple of 4
- MVNEQ a1,#0 # set result to -1
- MOVEQS pc,lr # if zero then we're done
- STMFD sp!,{lr} # save work regs (lr is used as a data register below)
- |neg_loop_down_l4|
- LDMDB a1,{a2,a3,ip,lr} # load 4 words in one go,NO writeback
- MVN a2,a2 # NOT the four words
- MVN a3,a3
- MVN ip,ip
- MVN lr,lr
- STMDB a1!,{a2,a3,ip,lr} # store 4 results, now with writeback
- SUBS a4,a4,#4 # decrement counter by 4
- BGT |neg_loop_down_l4| # if count still positive then loop
- MVN a1,#0 # set result to -1
- LDMFD sp!,{pc}^ # restore work regs and return -1
-
- # extern uintD shift1left_loop_down (uintD* ptr, uintC count);
- # entry
- # a1 = ptr (words are read with pre-decrement, so ptr[-1] is shifted first)
- # a2 = count of words to be shifted left
- # exit
- # a1 = carry out from last shift left (0 or 1), 0 is shifted into the first word
- # a2 - a4, ip destroyed
- EXPORT |shift1left_loop_down|
- |shift1left_loop_down| # word aligned shift1left loop down
- CMN a1,#0 # clear carry bit (a1 + 0 never carries out)
- ANDS a3,a2,#1 # multiple of 2 words ?
- BEQ |shift1left_loop_down_l1| # yup, so branch
- LDR a4,[a1,#-4]! # shift left the first word
- ADDS a4,a4,a4 # word + word = word shifted left by 1, top bit goes to C
- STR a4,[a1]
- |shift1left_loop_down_l1|
- BICS a4,a2,#1 # set counter to multiple of 2
- ADCEQ a1,a4,a4 # if zero set result to C (a4 is 0)
- MOVEQS pc,lr # and return
- ANDS a3,a4,#3 # multiple of 4 words ?
- BEQ |shift1left_loop_down_l3| # yup, so branch
- LDMDB a1,{a2,a3} # load 2 words in one go (a3 = word at the higher address)
- ADCS a3,a3,a3 # shift left the two words, C carries the bit between them
- ADCS a2,a2,a2
- STMDB a1!,{a2,a3} # store 2 results
- BICS a4,a4,#2 # decrement counter by 2
- ADCEQ a1,a4,a4 # set result to Carry (a4 is 0)
- MOVEQS pc,lr # and return
- |shift1left_loop_down_l3| # now a multiple of 4 words
- STMFD sp!,{lr} # save work regs (lr is used as a data register below)
- |shift1left_loop_down_l2|
- LDMDB a1,{a2,a3,ip,lr} # load 4 words in one go (lr = word at the highest address)
- ADCS lr,lr,lr # shift left the four words
- ADCS ip,ip,ip
- ADCS a3,a3,a3
- ADCS a2,a2,a2
- STMDB a1!,{a2,a3,ip,lr} # store 4 results
- SUB a4,a4,#4 # decrement counter by 4, C must stay intact for the next round
- TEQ a4,#0 # are we done ?
- BNE |shift1left_loop_down_l2| # if count non-zero then loop
- ADC a1,a4,a4 # set result to Carry (a4 is 0)
- LDMFD sp!,{pc}^ # restore work regs and return the carry
-
- # extern uintD shiftleft_loop_down (uintD* ptr, uintC count, uintC i, uintD carry);
- # entry
- # a1 = ptr (words are read with pre-decrement, so ptr[-1] is shifted first)
- # a2 = count of words to be shifted left
- # a3 = size of left shift (32 - a3 is used as the complementary right shift; presumably 0 < a3 < 32 - confirm with callers)
- # a4 = value to ORR in for first shift
- # exit
- # a1 = shift out from last shift left (the bits pushed out, right-aligned; a4 itself if count = 0)
- # a2 - a4, ip destroyed
- EXPORT |shiftleft_loop_down|
- |shiftleft_loop_down| # word aligned shiftleft loop down
- STMFD sp!,{v6,lr} # v6 holds the complementary shift, lr is a data register
- RSB v6,a3,#32 # size of complementary right shift
- ANDS ip,a2,#3 # multiple of 4 words ?
- BEQ |shiftleft_loop_down_l1| # yup, so branch
- LDR lr,[a1,#-4]! # shiftleft the first 1-3 words
- ORR a4,a4,lr,ASL a3 # to align the total to a multiple
- STR a4,[a1,#0] # of 4 words
- MOV a4,lr,LSR v6 # bits shifted out become the carry for the next word
- CMP ip,#2 # flags stay live for the BLT/GE/GT below
- BLT |shiftleft_loop_down_l1| # better to branch than skip instrs.
- LDRGE lr,[a1,#-4]! # second word if 2 or 3 leading words
- ORRGE a4,a4,lr,ASL a3
- STRGE a4,[a1,#0]
- MOVGE a4,lr,LSR v6
- LDRGT lr,[a1,#-4]! # third word if 3 leading words
- ORRGT a4,a4,lr,ASL a3
- STRGT a4,[a1,#0]
- MOVGT a4,lr,LSR v6
- |shiftleft_loop_down_l1|
- BICS ip,a2,#3 # set counter to multiple of 4
- MOVEQ a1,a4 # if zero then we're done
- LDMEQFD sp!,{v6,pc}^ # so return last shift out
- STMFD sp!,{v1-v3} # save work regs (v6,lr are already stacked)
- |shiftleft_loop_down_l2|
- LDMDB a1,{a2,v1,v2,v3} # load 4 words in one go (v3 = word at the highest address)
- ORR lr,a4,v3,ASL a3 # shiftleft the four words
- MOV a4,v3,LSR v6 # keep carry in a4
- ORR v3,a4,v2,ASL a3 # and store results up a register
- MOV a4,v2,LSR v6 # to regs v1-v3,lr
- ORR v2,a4,v1,ASL a3
- MOV a4,v1,LSR v6
- ORR v1,a4,a2,ASL a3
- MOV a4,a2,LSR v6
- STMDB a1!,{v1,v2,v3,lr} # store 4 results
- SUBS ip,ip,#4 # decrement counter by 4
- BGT |shiftleft_loop_down_l2| # if count still positive then loop
- MOV a1,a4 # result = last shift out
- LDMFD sp!,{v1-v3,v6,pc}^ # restore work regs and return
-
- # extern uintD shiftleftcopy_loop_down (uintD* sourceptr, uintD* destptr, uintC count, uintC i);
- # entry
- # a1 = sourceptr (read with pre-decrement, never written)
- # a2 = destptr (written with pre-decrement)
- # a3 = count of words to be shifted left
- # a4 = size of left shift (32 - a4 is used as the complementary right shift; presumably 0 < a4 < 32 - confirm with callers)
- # exit
- # a1 = shift out from last shift left (the bits pushed out, right-aligned; 0 if count = 0)
- # a2 - a4, ip destroyed
- EXPORT |shiftleftcopy_loop_down|
- |shiftleftcopy_loop_down| # word aligned shiftleftcopy loop down
- STMFD sp!,{v5,v6,lr} # v5 = running carry, v6 = complementary shift, lr = data register
- MOV v5,#0 # initial shift carry
- RSB v6,a4,#32 # size of complementary right shift
- ANDS ip,a3,#3 # multiple of 4 words ?
- BEQ |shiftleftcopy_loop_down_l1| # yup, so branch
- LDR lr,[a1,#-4]! # shiftleft the first 1-3 words
- ORR v5,v5,lr,ASL a4 # to align the total to a multiple
- STR v5,[a2,#-4]! # of 4 words
- MOV v5,lr,LSR v6 # bits shifted out become the carry for the next word
- CMP ip,#2 # flags stay live for the BLT/GE/GT below
- BLT |shiftleftcopy_loop_down_l1| # better to branch than skip instrs.
- LDRGE lr,[a1,#-4]! # second word if 2 or 3 leading words
- ORRGE v5,v5,lr,ASL a4
- STRGE v5,[a2,#-4]!
- MOVGE v5,lr,LSR v6
- LDRGT lr,[a1,#-4]! # third word if 3 leading words
- ORRGT v5,v5,lr,ASL a4
- STRGT v5,[a2,#-4]!
- MOVGT v5,lr,LSR v6
- |shiftleftcopy_loop_down_l1|
- BICS ip,a3,#3 # set counter to multiple of 4
- MOVEQ a1,v5 # if zero then we're done
- LDMEQFD sp!,{v5,v6,pc}^ # so return last shift out
- STMFD sp!,{v1-v3} # save work regs (v5,v6,lr are already stacked)
- |shiftleftcopy_loop_down_l2|
- LDMDB a1!,{a3,v1,v2,v3} # load 4 words in one go (v3 = word at the highest address)
- ORR lr,v5,v3,ASL a4 # shiftleft the four words
- MOV v5,v3,LSR v6 # keep carry in v5
- ORR v3,v5,v2,ASL a4 # and store results up a register
- MOV v5,v2,LSR v6 # to regs v1-v3,lr
- ORR v2,v5,v1,ASL a4
- MOV v5,v1,LSR v6
- ORR v1,v5,a3,ASL a4
- MOV v5,a3,LSR v6
- STMDB a2!,{v1,v2,v3,lr} # store 4 results
- SUBS ip,ip,#4 # decrement counter by 4
- BGT |shiftleftcopy_loop_down_l2| # if count still positive then loop
- MOV a1,v5 # result = last shift out
- LDMFD sp!,{v1-v3,v5,v6,pc}^ # restore work regs and return
-
- # extern uintD shift1right_loop_up (uintD* ptr, uintC count, uintD carry);
- # entry
- # a1 = ptr (words are processed upwards in memory, starting at ptr[0])
- # a2 = count of words to be shifted right
- # a3 = carry (only bit 0 is used; it is shifted into the top of the first word)
- # exit
- # a1 = carry out from last shift right: zero or non-zero; the set bit is bit 31 on the RRX exits and bit 0 on the ADC exit
- # a2 - a4, ip destroyed
- EXPORT |shift1right_loop_up|
- |shift1right_loop_up| # word aligned shift1right loop up
- MOVS a3,a3,LSR #1 # set carry from bit 0 of the carry argument
- ANDS a3,a2,#1 # multiple of 2 words ?
- BEQ |shift1right_loop_up_l1| # yup, so branch
- LDR a4,[a1] # shift right the first word
- MOVS a4,a4,RRX # C goes into bit 31, bit 0 becomes the new C
- STR a4,[a1],#4
- |shift1right_loop_up_l1|
- BICS a4,a2,#1 # set counter to multiple of 2
- MOVEQ a1,a4,RRX # if zero set result to C, placed in bit 31 (a4 is 0)
- MOVEQS pc,lr # and return
- ANDS a3,a4,#3 # multiple of 4 words ?
- BEQ |shift1right_loop_up_l3| # yup, so branch
- LDMIA a1,{a2,a3} # load 2 words in one go (a2 = word at the lower address)
- MOVS a2,a2,RRX # shift right the two words, C carries the bit between them
- MOVS a3,a3,RRX
- STMIA a1!,{a2,a3} # store 2 results
- BICS a4,a4,#2 # decrement counter by 2
- ADCEQ a1,a4,a4 # set result to Carry, placed in bit 0 on this exit (a4 is 0)
- MOVEQS pc,lr # and return
- |shift1right_loop_up_l3| # now a multiple of 4 words
- STMFD sp!,{lr} # save work regs (lr is used as a data register below)
- |shift1right_loop_up_l2|
- LDMIA a1,{a2,a3,ip,lr} # load 4 words in one go (a2 = word at the lowest address)
- MOVS a2,a2,RRX # shift right the four words
- MOVS a3,a3,RRX
- MOVS ip,ip,RRX
- MOVS lr,lr,RRX
- STMIA a1!,{a2,a3,ip,lr} # store 4 results
- SUB a4,a4,#4 # decrement counter by 4, C must stay intact for the next round
- TEQ a4,#0 # are we done ?
- BNE |shift1right_loop_up_l2| # if count non-zero then loop
- MOV a1,a4,RRX # set result to Carry, placed in bit 31 (a4 is 0)
- LDMFD sp!,{pc}^ # restore work regs and return the carry
-
- # extern uintD shiftright_loop_up (uintD* ptr, uintC count, uintC i);
- # entry
- # a1 = ptr (words are processed upwards in memory, starting at ptr[0])
- # a2 = count of words to be shifted right
- # a3 = size of right shift (32 - a3 is used as the complementary left shift; presumably 0 < a3 < 32 - confirm with callers)
- # exit
- # a1 = shift out from last shift right (the bits pushed out, left-aligned; 0 if count = 0)
- # a2 - a4, ip destroyed
- EXPORT |shiftright_loop_up|
- |shiftright_loop_up| # word aligned shiftright loop up
- STMFD sp!,{v6,lr} # v6 holds the complementary shift, lr is a data register
- MOV a4,#0 # initial shift carry
- RSB v6,a3,#32 # size of complementary left shift
- |shiftright_loop_up_l0| # shared entry: shiftrightsigned_loop_up branches here with a4, v6 and the stack set up
- ANDS ip,a2,#3 # multiple of 4 words ?
- BEQ |shiftright_loop_up_l1| # yup, so branch
- LDR lr,[a1] # shiftright the first 1-3 words
- ORR a4,a4,lr,LSR a3 # to align the total to a multiple
- STR a4,[a1],#4 # of 4 words
- MOV a4,lr,ASL v6 # bits shifted out become the carry for the next word
- CMP ip,#2 # flags stay live for the BLT/GE/GT below
- BLT |shiftright_loop_up_l1| # better to branch than skip instrs.
- LDRGE lr,[a1] # second word if 2 or 3 leading words
- ORRGE a4,a4,lr,LSR a3
- STRGE a4,[a1],#4
- MOVGE a4,lr,ASL v6
- LDRGT lr,[a1] # third word if 3 leading words
- ORRGT a4,a4,lr,LSR a3
- STRGT a4,[a1],#4
- MOVGT a4,lr,ASL v6
- |shiftright_loop_up_l1|
- BICS ip,a2,#3 # set counter to multiple of 4
- MOVEQ a1,a4 # if zero then we're done
- LDMEQFD sp!,{v6,pc}^ # so return last shift out
- STMFD sp!,{v1-v3} # save work regs (v6,lr are already stacked)
- |shiftright_loop_up_l2|
- LDMIA a1,{v1,v2,v3,lr} # load 4 words in one go (v1 = word at the lowest address)
- ORR a2,a4,v1,LSR a3 # shiftright the four words
- MOV a4,v1,ASL v6 # keep carry in a4
- ORR v1,a4,v2,LSR a3 # and store results down a register
- MOV a4,v2,ASL v6 # to regs a2,v1-v3
- ORR v2,a4,v3,LSR a3
- MOV a4,v3,ASL v6
- ORR v3,a4,lr,LSR a3
- MOV a4,lr,ASL v6
- STMIA a1!,{a2,v1,v2,v3} # store 4 results
- SUBS ip,ip,#4 # decrement counter by 4
- BGT |shiftright_loop_up_l2| # if count still positive then loop
- MOV a1,a4 # result = last shift out
- LDMFD sp!,{v1-v3,v6,pc}^ # restore work regs and return
-
- # extern uintD shiftrightsigned_loop_up (uintD* ptr, uintC count, uintC i);
- # entry
- # a1 = ptr (ptr[0] is always read to obtain the sign, so presumably count > 0 - confirm with callers)
- # a2 = count of words to be shifted right signed
- # a3 = size of right shift
- # exit
- # a1 = shift out from last shift right (produced by the shared shiftright_loop_up code)
- # a2 - a4, ip destroyed
- EXPORT |shiftrightsigned_loop_up|
- |shiftrightsigned_loop_up| # word aligned shiftrightsigned loop up
- STMFD sp!,{v6,lr} # same stack frame as shiftright_loop_up, whose body is reused
- RSB v6,a3,#32 # size of complementary left shift
- LDR lr,[a1] # setup carry for first shift.
- MOV a4,lr,ASR #31 # this is the sign extended bits
- AND a4,a4,a4,LSL v6 # 31->(32-i) of the first word
- B |shiftright_loop_up_l0| # use right shift code now
-
- # extern uintD shiftrightcopy_loop_up (uintD* sourceptr, uintD* destptr, uintC count, uintC i, uintD carry);
- # entry
- # a1 = sourceptr (read upwards with post-increment, never written)
- # a2 = destptr (written upwards with post-increment)
- # a3 = count of words to be shifted right
- # a4 = size of right shift (32 - a4 is used as the complementary left shift; presumably 0 < a4 < 32 - confirm with callers)
- # [sp] = carry for first shift (shifted left by 32 - a4 before it is ORed into the first result)
- # exit
- # a1 = shift out from last shift right (the bits pushed out, left-aligned)
- # a2 - a4, ip destroyed
- EXPORT |shiftrightcopy_loop_up|
- |shiftrightcopy_loop_up| # word aligned shiftrightcopy loop up
- STMFD sp!,{v5,v6,lr} # v5 = running carry, v6 = complementary shift, lr = data register
- LDR v5,[sp,#12] # initial shift carry (offset 12 skips the 3 regs just pushed)
- RSB v6,a4,#32 # size of complementary left shift
- MOV v5,v5,ASL v6 # move the carry bits to the top of the word
- |shiftrightcopy_loop_up_l0| # NOTE(review): not referenced in this part of the file
- ANDS ip,a3,#3 # multiple of 4 words ?
- BEQ |shiftrightcopy_loop_up_l1| # yup, so branch
- LDR lr,[a1],#4 # shiftright the first 1-3 words
- ORR v5,v5,lr,LSR a4 # to align the total to a multiple
- STR v5,[a2],#4 # of 4 words
- MOV v5,lr,ASL v6 # bits shifted out become the carry for the next word
- CMP ip,#2 # flags stay live for the BLT/GE/GT below
- BLT |shiftrightcopy_loop_up_l1| # better to branch than skip instrs.
- LDRGE lr,[a1],#4 # second word if 2 or 3 leading words
- ORRGE v5,v5,lr,LSR a4
- STRGE v5,[a2],#4
- MOVGE v5,lr,ASL v6
- LDRGT lr,[a1],#4 # third word if 3 leading words
- ORRGT v5,v5,lr,LSR a4
- STRGT v5,[a2],#4
- MOVGT v5,lr,ASL v6
- |shiftrightcopy_loop_up_l1|
- BICS ip,a3,#3 # set counter to multiple of 4
- MOVEQ a1,v5 # if zero then we're done
- LDMEQFD sp!,{v5,v6,pc}^ # so return last shift out
- STMFD sp!,{v1-v3} # save work regs (v5,v6,lr are already stacked)
- |shiftrightcopy_loop_up_l2|
- LDMIA a1!,{v1,v2,v3,lr} # load 4 words in one go (v1 = word at the lowest address)
- ORR a3,v5,v1,LSR a4 # shiftright the four words
- MOV v5,v1,ASL v6 # keep carry in v5
- ORR v1,v5,v2,LSR a4 # and store results down a register
- MOV v5,v2,ASL v6 # to regs a3,v1-v3
- ORR v2,v5,v3,LSR a4
- MOV v5,v3,ASL v6
- ORR v3,v5,lr,LSR a4
- MOV v5,lr,ASL v6
- STMIA a2!,{a3,v1,v2,v3} # store 4 results
- SUBS ip,ip,#4 # decrement counter by 4
- BGT |shiftrightcopy_loop_up_l2| # if count still positive then loop
- MOV a1,v5 # result = last shift out
- LDMFD sp!,{v1-v3,v5,v6,pc}^ # restore work regs and return
-
- # mulu32_64_vregs: 32 x 32 -> 64 bit unsigned multiply built from four 16 x 16 products; internal helper, not exported
- # entry
- # a1 = x (only read, so preserved)
- # ip = y
- # exit
- # v1 = low32(x*y)
- # ip = high32(x*y)
- # v2,v3,v4 destroyed
- mulu32_64_vregs
- MOV v1,a1,LSR #16 # v1 := top half of x
- MOV v2,ip,LSR #16 # v2 := top half of y
- BIC v3,a1,v1,LSL #16 # v3 := bottom half of x
- BIC ip,ip,v2,LSL #16 # ip := bottom half of y
- MUL v4,v3,ip # low section of result: xlo*ylo
- MUL ip,v1,ip # ) middle sections: xhi*ylo
- MUL v3,v2,v3 # ) of result: yhi*xlo
- MUL v2,v1,v2 # high section of result: xhi*yhi
- ADDS ip,ip,v3 # add middle sections
- # (can't use mla as we need carry)
- ADDCS v2,v2,#&10000 # carry from above add, worth 2^16 in the high section
- ADDS v1,v4,ip,LSL #16 # v1 is now bottom 32 bits of result
- ADC ip,v2,ip,LSR #16 # ip is top 32 bits
- MOVS pc,lr # return to the caller
-
- # extern uintD mulusmall_loop_down (uintD digit, uintD* ptr, uintC len, uintD newdigit);
- # entry
- # a1 = digit (a "small" multiplier: each 16-bit half product must fit in 32 bits, presumably digit < 2^16 - confirm)
- # a2 = ptr (words are read with pre-decrement and multiplied in place)
- # a3 = count of words to be multiplied down
- # a4 = new digit = carry added into the first product
- # exit
- # a1 = final carry of multiply (a4 unchanged if count = 0)
- # a2 - a4, ip destroyed
- EXPORT |mulusmall_loop_down|
- mulusmall_loop_down
- CMP a3,#0 # nothing to do ?
- MOVEQ a1,a4 # then the carry is returned as is
- MOVEQS pc,lr
- STMFD sp!,{v1-v2,lr} # save work regs
- mulusmall_loop_down_l1
- LDR ip,[a2,#-4]! # ip = y = next word, downwards in memory
- 
- # BL mulu32_64_vregs # muluD(digit,*--ptr,hi=,lo=)
- # replaced by multiplication of a small x = a1 and a big y = ip :
- MOV v1,ip,LSR #16 # top half of y
- BIC ip,ip,v1,LSL #16 # bottom half of y
- MUL v2,a1,v1 # middle section of result
- MUL v1,a1,ip # low section of result
- MOV ip,#0 # high section of result
- ADDS v1,v1,v2,LSL #16 # bottom 32 bits of result
- ADC ip,ip,v2,LSR #16 # top 32 bits of result
- 
- ADDS v1,v1,a4 # lo += carry
- ADC a4,ip,#0 # if (lo<carry) { hi += 1 }; carry=hi
- STR v1,[a2,#0] # *ptr = lo
- SUBS a3,a3,#1 # len--
- BNE mulusmall_loop_down_l1 # until len==0
- MOV a1,a4 # return carry
- LDMFD sp!,{v1-v2,pc}^ # restore work regs and return
-
- # extern void mulu_loop_down (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
- # entry
- # a1 = digit
- # a2 = sourceptr (read with pre-decrement)
- # a3 = destptr (written with pre-decrement; len product words plus one final carry word are stored)
- # a4 = count of words to be multiplied down (tested only after the first pass, so len = 0 is not handled)
- # exit
- # a1 - a4, ip destroyed
- EXPORT |mulu_loop_down|
- mulu_loop_down
- STMFD sp!,{v1-v5,lr} # v1-v4 are used by mulu32_64_vregs, v5 is the carry, lr by BL
- MOV v5,#0 # carry = 0
- mulu_loop_down_l1
- LDR ip,[a2,#-4]! # ip = *--sourceptr
- BL mulu32_64_vregs # muluD(digit,*--sourceptr,hi=,lo=)
- ADDS v1,v1,v5 # lo += carry
- ADC v5,ip,#0 # if (lo<carry) { hi += 1 }; carry=hi
- STR v1,[a3,#-4]! # *--destptr = lo
- SUBS a4,a4,#1 # len--
- BNE mulu_loop_down_l1 # until len==0
- STR v5,[a3,#-4]! # *--destptr = carry
- LDMFD sp!,{v1-v5,pc}^ # restore work regs and return
-
- # extern void muluadd_loop_down (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
- # entry
- # a1 = digit
- # a2 = sourceptr (read with pre-decrement)
- # a3 = destptr (each word is read with pre-decrement and the low product word added to it in place)
- # a4 = count of words to be multiplied added down (tested only after the first pass, so len = 0 is not handled)
- # exit
- # a1 = final carry (moved to a1 by the code although the prototype above says void); a2 - a4, ip destroyed
- EXPORT |muluadd_loop_down|
- muluadd_loop_down
- STMFD sp!,{v1-v5,lr} # v1-v4 are used by mulu32_64_vregs, v5 is the carry, lr by BL
- MOV v5,#0 # carry = 0
- muluadd_loop_down_l1
- LDR ip,[a2,#-4]! # ip = *--sourceptr
- BL mulu32_64_vregs # muluD(digit,*--sourceptr,hi=,lo=)
- ADDS v1,v1,v5 # lo += carry
- ADCCS ip,ip,#0 # if (lo<carry) { hi += 1 }; (ADC executed only when C is set)
- LDR v5,[a3,#-4]! # carry = *--destptr
- ADDS v1,v1,v5 # lo += carry
- ADC v5,ip,#0 # if (lo<carry) { hi += 1 }; carry=hi
- STR v1,[a3,#0] # *destptr = lo
- SUBS a4,a4,#1 # len--
- BNE muluadd_loop_down_l1 # until len==0
- MOV a1,v5 # return carry
- LDMFD sp!,{v1-v5,pc}^ # restore work regs and return
-
- # extern void mulusub_loop_down (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
- # entry
- # a1 = digit
- # a2 = sourceptr (read with pre-decrement)
- # a3 = destptr (each word is read with pre-decrement and the low product word subtracted from it in place)
- # a4 = count of words to be multiplied subtracted down (tested only after the first pass, so len = 0 is not handled)
- # exit
- # a1 = final carry (moved to a1 by the code although the prototype above says void); a2 - a4, ip destroyed
- EXPORT |mulusub_loop_down|
- mulusub_loop_down
- STMFD sp!,{v1-v5,lr} # v1-v4 are used by mulu32_64_vregs, v5 is the carry, lr by BL
- MOV v5,#0 # carry = 0
- mulusub_loop_down_l1
- LDR ip,[a2,#-4]! # ip = *--sourceptr
- BL mulu32_64_vregs # muluD(digit,*--sourceptr,hi=,lo=)
- ADDS v1,v1,v5 # lo += carry
- ADCCS ip,ip,#0 # if (lo<carry) { hi += 1 }; (ADC executed only when C is set)
- LDR v5,[a3,#-4]! # carry = *--destptr
- SUBS v4,v5,v1 # C clear here means the subtraction borrowed
- STR v4,[a3,#0] # *destptr = carry - lo
- ADDCC v5,ip,#1 # if (carry<lo) { hi += 1 }; carry=hi
- MOVCS v5,ip # no borrow: carry = hi unchanged
- SUBS a4,a4,#1 # len--
- BNE mulusub_loop_down_l1 # until len==0
- MOV a1,v5 # return carry
- LDMFD sp!,{v1-v5,pc}^ # restore work regs and return
-
- END
-
- #endif
-