home *** CD-ROM | disk | FTP | other *** search
- Xref: sparky comp.sys.sun.misc:3255 comp.sys.sun.apps:1434
- Path: sparky!uunet!munnari.oz.au!yoyo.aarnet.edu.au!sirius.ucs.adelaide.edu.au!winnie.cs.adelaide.edu.au
- From: gordoni@winnie.cs.adelaide.edu.au (Gordon Irlam)
- Newsgroups: comp.sys.sun.misc,comp.sys.sun.apps
- Subject: Re: Zeroing memory, quickly.
- Message-ID: <7871@sirius.ucs.adelaide.edu.au>
- Date: 21 Jul 92 08:28:10 GMT
- Sender: news@ucs.adelaide.edu.au
- Reply-To: gordoni@cs.adelaide.edu.au
- Followup-To: comp.sys.sun.misc
- Lines: 950
- Nntp-Posting-Host: winnie.cs.adelaide.edu.au
-
- > Hi,
- > I'm looking for a really fast way for zeroing (clearing) a
- > section of memory. Same functionality as bzero(), basically; but as
- > fast as possible. I have an application that needs to clear lots of
- > memory; and the bzero() is the biggest time hog in it.
-
- I wrote VERY heavily optimized versions of bcopy() and bzero() a few
- years back for an experimental SPARC system we were developing.
-
- The routines also seem to work fairly well on various Sun systems.
- Although they have not been optimized for Suns.
-
- The minimal write buffer on the SPARCstation 1 means that for this
- machine the code can probably be improved considerably by splitting up
- adjacent stores. For machines with write-through caches (all
- currently available SPARCstations) the code probably carries loop
- unrolling to excess - write buffer stalls limit the potential gains of
- unrolling.
-
- I don't have any times for bzero(), and they probably aren't quite as
- impressive, but on a SPARCstation 2 my bcopy() typically consumes 50%
- to 60% of the time of Sun's libc version - depending upon whether the
- data is cached or not.
-
- Warning: Readability was totally sacrificed in the pursuit of speed.
- This was on account of simulated bcopy()'s/bzero()'s constituting the
- prime bottleneck during the simulated bootstrapping of the kernel we
- were porting.
-
- Code supplied, just add water.
-
- Gordon Irlam
- (gordoni@cs.adelaide.edu.au)
-
-
- *** IMPORTANT NOTE ***
- It is University policy to protect its intellectual property.
- Consequently use of the following code in commercial products requires
- permission. Use of this code for non-commercial purposes is
- permitted.
- *** END IMPORTANT NOTE ***
-
- [Yes, this is silly for something as trivial as this, but there is
- little I can do about it, sorry.]
-
- ----------------------------- cut here ----------------------------
-
- /**************************************************************************
- ** Actor =
- **
- ** Component = Libraries
- ** Module = SPARC/memset.x
- **
- ** Synopsis = System V memset.
- **
- ** Originally written by = Gordon Irlam
- ** Responsible = Gordon Irlam
- **
- ** Copyright (c) 1990 Department of Computer Science, Adelaide University
- **
- ***************************************************************************
- ** $Header: /usr/home/projects/mars/people/gordoni/rcs.files/chorus/chorus_3.2/lib/SPARC/RCS/utMemSet.x,v 1.2 90/09/10 14:11:30 gordoni Exp $
- ***************************************************************************
- * The interface between assembly code and C code follows the Sparc register
- * conventions. This requires the caller to place parameters in %o0 through
- * %o5. The return value, if any is returned in %o0. Register %o6 is the stack
- * pointer, and %o7 stores the return address.
- *
- * The callee can use %o0 through %o5, and %g1 through %g3 for scratch values.
- * Registers %g4 through %g7 must never be used. Register %o6 must always
- * contain a pointer to the current top of the stack. All other registers can
- * be used only if they are restored prior to return.
- *
- * The stack grows downwards and the stack pointer must be double word aligned.
- * On the top of the stack is space for the operating system to store the 16 in
- * and local registers currently used. This is performed on window overflow,
- * and context switch. Below this is a word used to hold the address at which
- * structure return values should be stored. This is followed by six words that
- * can be used by the called routine to store the first six arguments it was
- * passed in registers. This is followed by any additional arguments that are
- * passed.
- */
-
- ! Exported symbols.
- .global _memset ! ANSI C memory fill. NOTE(review): the leading underscore presumably matches the C compiler's symbol prefix - confirm for the target toolchain.
-
- /***************************************************************************
- * void *memset(void *s, int c, int n): Fill the first n bytes starting at s
- * with the value c. Return s.
- ***************************************************************************
- */
-
- _memset: ! In: %o0 = s, %o1 = c, %o2 = n. %o0 is never written, so s is returned as is.
- tst %o2 ! Finished if n < 0.
- bneg fill_done
- mov %o0, %o3 ! Current s in %o3 (delay slot).
- cmp %o2, 7
- ble last_fill ! Make sure can fill to a double word boundary (n <= 7 is filled byte by byte).
-
- ! Fill byte by byte until reach double word boundary.
- btst 7, %o3 ! Delay slot of the ble above; tests whether %o3 is already double word aligned.
- be,a big_fill
- nop ! Annulled delay slot: executed only when the branch is taken.
-
- first_chunk:
- stb %o1, [%o3] ! Fill byte.
- inc %o3
- btst 7, %o3
- bne first_chunk
- dec %o2 ! Delay slot (not annulled, so every byte stored is counted).
-
- big_fill:
- ! Fill %o4 and %o5 with the fill value.
- and %o1, 0xff, %o1 ! Make sure fill value is a single byte.
- sll %o1, 8, %o4
- or %o4, %o1, %o4 ! Fill value in low halfword of %o4.
- sll %o4, 16, %o5
- or %o4, %o5, %o4 ! Fill value in entire word of %o4.
- mov %o4, %o5 ! std writes the register pair %o4, %o5, so both must hold the pattern.
-
- deccc 64, %o2 ! See if can fill a 64 byte chunk.
- bneg,a small_fill
- inc 64, %o2 ! Delay slot (annulled unless taken): undo the trial subtraction.
-
- big_chunk:
- std %o4, [%o3 + 0] ! Fill a 64 byte chunk.
- std %o4, [%o3 + 8]
- std %o4, [%o3 + 16]
- std %o4, [%o3 + 24]
- std %o4, [%o3 + 32]
- std %o4, [%o3 + 40]
- std %o4, [%o3 + 48]
- deccc 64, %o2 ! Sets the condition codes tested by the bpos below (std does not change them).
- std %o4, [%o3 + 56]
- bpos big_chunk
- inc 64, %o3 ! Delay slot.
- inc 64, %o2 ! The loop exits with %o2 64 too low; restore the true remaining count.
-
- small_fill:
- deccc 8, %o2 ! See if can fill an 8 byte chunk.
- bneg,a last_fill
- inc 8, %o2 ! Delay slot (annulled unless taken): undo the trial subtraction.
-
- small_chunk:
- deccc 8, %o2 ! Sets the condition codes tested by the bpos below.
- std %o4, [%o3 + 0] ! Fill an 8 byte chunk.
- bpos small_chunk
- inc 8, %o3 ! Delay slot.
- inc 8, %o2 ! The loop exits with %o2 8 too low; restore the true remaining count.
-
- last_fill:
- deccc %o2 ! See if can fill next byte.
- bneg,a fill_done
- nop
-
- last_chunk:
- deccc %o2 ! Sets the condition codes tested by the bpos,a below.
- stb %o1, [%o3] ! Fill byte.
- bpos,a last_chunk
- inc %o3 ! Delay slot (annulled when the loop exits).
-
- fill_done:
- retl ! Return value will still be in %o0.
- nop ! Delay slot of retl.
-
- ----------------------------- cut here ----------------------------
-
- /**************************************************************************
- ** Actor =
- **
- ** Component = Libraries
- ** Module = SPARC/memmove.x
- **
- ** Synopsis = System V memmove.
- **
- ** Originally written by = Gordon Irlam
- ** Responsible = Gordon Irlam
- **
- ** Copyright (c) 1990 Department of Computer Science, Adelaide University
- **
- ***************************************************************************
- ** $Header: /a/berlioz/ed/projects/mars/people/gordoni/rcs.files/chorus/chorus_3.2/lib/SPARC/RCS/utMemMove.x,v 1.3 90/09/27 18:18:40 gordoni Exp $
- ***************************************************************************
- * The interface between assembly code and C code follows the Sparc register
- * conventions. This requires the caller to place parameters in %o0 through
- * %o5. The return value, if any is returned in %o0. Register %o6 is the stack
- * pointer, and %o7 stores the return address.
- *
- * The callee can use %o0 through %o5, and %g1 through %g3 for scratch values.
- * Registers %g4 through %g7 must never be used. Register %o6 must always
- * contain a pointer to the current top of the stack. All other registers can
- * be used only if they are restored prior to return.
- *
- * The stack grows downwards and the stack pointer must be double word aligned.
- * On the top of the stack is space for the operating system to store the 16 in
- * and local registers currently used. This is performed on window overflow,
- * and context switch. Below this is a word used to hold the address at which
- * structure return values should be stored. This is followed by six words that
- * can be used by the called routine to store the first six arguments it was
- * passed in registers. This is followed by any additional arguments that are
- * passed.
- */
-
- ! Exported symbols.
- .global _memmove ! ANSI C Overlapping memory copy. NOTE(review): the leading underscore presumably matches the C compiler's symbol prefix - confirm for the target toolchain.
-
- /***************************************************************************
- * void *memmove(void *dest, void *src, int n): Copy n bytes from src to dest.
- * Safe to use even if memory areas overlap. Return dest.
- ***************************************************************************
- */
-
- _memmove: ! In: %o0 = dest, %o1 = src, %o2 = n. %o0 is never written, so dest is returned as is.
- tst %o2 ! Finished if n <= 0.
- ble copy_done
- cmp %o0, %o1 ! Copy backwards if dest >= src (delay slot of the ble above).
- bgeu bwd_copy ! Unsigned comparison of the two addresses.
- cmp %o2, 7 ! Delay slot; result is tested by the ble below and by the ble at bwd_copy.
- ble fwd_last_copy ! Make sure can copy first few bytes.
- mov %o0, %o3 ! Current destination in %o3 (delay slot).
-
- ! Copy slowly until reach word boundary for source.
- btst 3, %o1 ! Is the source already word aligned? (Not a delay slot: the mov above fills it.)
- be fwd_third_copy
- btst 1, %o1 ! Delay slot.
- be,a fwd_second_copy ! Source is halfword aligned: no single byte to copy first.
- lduh [%o1], %o4 ! Delay slot (annulled unless taken).
-
- ldub [%o1], %o4 ! Copy byte.
- inc %o1
- stb %o4, [%o3]
- inc %o3
- btst 3, %o1
- be fwd_third_copy
- dec %o2 ! Delay slot.
-
- lduh [%o1], %o4 ! Copy half word.
- fwd_second_copy:
- inc 2, %o1
- srl %o4, 8, %o5
- stb %o5, [%o3 + 0] ! Upper byte of the halfword goes to the lower address.
- stb %o4, [%o3 + 1]
- inc 2, %o3
- dec 2, %o2
-
- fwd_third_copy: ! The source pointer %o1 is word aligned from here on.
- btst 3, %o3
- beq fwd_word_copy ! Destination mod 4 = 0.
- btst 4, %o3 ! Delay slot; result is tested by the bne,a below and by the beq,a at fwd_word_copy.
-
- ! Copy next word if destination is in low half of a double word.
- bne,a fwd_test_alignment
- deccc 32 + 4, %o2 ! Delay slot (annulled unless taken).
-
- ld [%o1], %o4 ! Copy word.
- inc 4, %o1
- srl %o4, 24, %g3
- srl %o4, 16, %g2
- srl %o4, 8, %o5
- stb %g3, [%o3 + 0] ! Destination is not word aligned, so the word is stored a byte at a time.
- stb %g2, [%o3 + 1]
- stb %o5, [%o3 + 2]
- stb %o4, [%o3 + 3]
- inc 4, %o3
-
- deccc 32 + 4 + 4, %o2 ! See if can copy a 32 + 4 byte chunk (the extra 4 counts the word just copied).
- fwd_test_alignment:
- bneg,a fwd_small_byte_copy
- inc 32 + 4, %o2 ! Delay slot (annulled unless taken): undo the trial subtraction.
- ! Call appropriate routine according to alignment of destination.
- btst 1, %o3
- beq fwd_byte_2_copy ! Destination mod 4 = 2.
- btst 2, %o3 ! Delay slot.
- beq,a fwd_byte_1_copy ! Destination mod 4 = 1.
- nop
- ba,a fwd_byte_3_copy ! Destination mod 4 = 3. (ba,a annuls its delay slot, so none is supplied.)
-
- fwd_word_copy: ! Source and destination are both word aligned here.
- ! Copy slowly until destination is double word aligned.
- beq,a fwd_word_test_alignment ! Tests the btst 4, %o3 done in the delay slot at fwd_third_copy.
- deccc 64, %o2 ! See if can copy a 64 byte chunk (delay slot).
-
- ld [%o1], %o4 ! Copy word.
- inc 4, %o1
- st %o4, [%o3]
- inc 4, %o3
- deccc 64 + 4, %o2 ! See if can copy a 64 byte chunk (the extra 4 counts the word just copied).
-
- fwd_word_test_alignment:
- bneg,a fwd_small_word_copy
- inccc 64 - 8, %o2 ! See if can copy an 8 byte chunk (delay slot).
- btst 7, %o1 ! Source double word aligned as well?
- beq,a fwd_doubleword_chunk
- nop
-
- fwd_word_chunk: ! Source only word aligned: load with ld, store with std.
- ld [%o1 + 0], %o4 ! Copy a 64 byte chunk.
- ld [%o1 + 4], %o5
- ld [%o1 + 8], %g2
- ld [%o1 + 12], %g3
- std %o4, [%o3 + 0]
- std %g2, [%o3 + 8]
- ld [%o1 + 16], %o4
- ld [%o1 + 20], %o5
- ld [%o1 + 24], %g2
- ld [%o1 + 28], %g3
- std %o4, [%o3 + 16]
- std %g2, [%o3 + 24]
- ld [%o1 + 32], %o4
- ld [%o1 + 36], %o5
- ld [%o1 + 40], %g2
- ld [%o1 + 44], %g3
- std %o4, [%o3 + 32]
- std %g2, [%o3 + 40]
- ld [%o1 + 48], %o4
- ld [%o1 + 52], %o5
- ld [%o1 + 56], %g2
- ld [%o1 + 60], %g3
- std %o4, [%o3 + 48]
- std %g2, [%o3 + 56]
- deccc 64, %o2 ! Sets the condition codes tested by the bpos below.
- inc 64, %o1
- bpos fwd_word_chunk
- inc 64, %o3 ! Delay slot.
- ba fwd_small_word_copy
- inccc 64 - 8, %o2 ! See if can copy an 8 byte chunk (delay slot).
-
- fwd_doubleword_chunk: ! Source and destination both double word aligned: ldd/std throughout.
- ldd [%o1 + 0], %o4 ! Copy a 64 byte chunk.
- ldd [%o1 + 8], %g2
- std %o4, [%o3 + 0]
- std %g2, [%o3 + 8]
- ldd [%o1 + 16], %o4
- ldd [%o1 + 24], %g2
- std %o4, [%o3 + 16]
- std %g2, [%o3 + 24]
- ldd [%o1 + 32], %o4
- ldd [%o1 + 40], %g2
- std %o4, [%o3 + 32]
- std %g2, [%o3 + 40]
- ldd [%o1 + 48], %o4
- ldd [%o1 + 56], %g2
- std %o4, [%o3 + 48]
- std %g2, [%o3 + 56]
- deccc 64, %o2 ! Sets the condition codes tested by the bpos below.
- inc 64, %o1
- bpos fwd_doubleword_chunk
- inc 64, %o3 ! Delay slot.
- inccc 64 - 8, %o2 ! See if can copy an 8 byte chunk.
-
- fwd_small_word_copy: ! Entered with the condition codes set by an inccc 64 - 8, %o2.
- bneg,a fwd_last_copy_tested
- inccc 8, %o2 ! See if can copy next byte (delay slot).
-
- fwd_small_word_chunk:
- ld [%o1 + 0], %o4 ! Copy an 8 byte chunk.
- ld [%o1 + 4], %o5
- deccc 8, %o2 ! Sets the condition codes tested by the bpos below.
- std %o4, [%o3]
- inc 8, %o1
- bpos fwd_small_word_chunk
- inc 8, %o3 ! Delay slot.
- inc 8, %o2 ! The loop exits with %o2 8 too low; restore the true remaining count.
-
- fwd_last_copy:
- tst %o2 ! See if can copy next byte.
-
- fwd_last_copy_tested: ! Entered with the condition codes reflecting the remaining count in %o2.
- ble copy_done
-
- fwd_last_chunk: ! Copies the tail two bytes per pass.
- deccc 2, %o2 ! Delay slot of the ble above on first entry; also the loop head.
- ldub [%o1 + 0], %o4 ! Copy byte.
- bneg copy_done ! Only one byte was left: it is stored in the delay slot.
- stb %o4, [%o3 + 0] ! Delay slot.
- ldub [%o1 + 1], %o4 ! Copy byte.
- inc 2, %o1
- stb %o4, [%o3 + 1]
- bne,a fwd_last_chunk ! Still tests the deccc 2 above; loop unless the count reached 0.
- inc 2, %o3 ! Delay slot.
- retl ! Return value will still be in %o0.
- nop ! Delay slot of retl.
-
- fwd_small_byte_copy: ! Source word aligned, destination not: load words, store bytes.
- deccc 8, %o2 ! See if can copy an 8 byte chunk.
- bneg,a fwd_last_copy_tested
- inccc 8, %o2 ! See if can copy next byte (delay slot).
-
- fwd_small_byte_chunk:
- ld [%o1 + 0], %o4 ! Copy an 8 byte chunk.
- ld [%o1 + 4], %o5
- srl %o4, 24, %g2
- srl %o4, 16, %g3
- stb %g2, [%o3 + 0] ! Most significant byte goes to the lowest address.
- srl %o4, 8, %g2
- stb %g3, [%o3 + 1]
- stb %g2, [%o3 + 2]
- stb %o4, [%o3 + 3]
- srl %o5, 24, %g2
- srl %o5, 16, %g3
- stb %g2, [%o3 + 4]
- srl %o5, 8, %g2
- stb %g3, [%o3 + 5]
- stb %g2, [%o3 + 6]
- stb %o5, [%o3 + 7]
- inc 8, %o1
- ba fwd_small_byte_copy ! Loop back to the count test at the top.
- inc 8, %o3 ! Delay slot.
-
- fwd_byte_1_copy: ! Destination mod 4 = 1, source word aligned, %o2 = remaining count - (32 + 4) >= 0.
- ld [%o1], %o4 ! Load first 4 bytes.
- srl %o4, 24, %g2
- srl %o4, 8, %g3
- sll %o4, 24, %g1 ! Last byte in top part of %g1.
- stb %g2, [%o3 + 0] ! Store first 3 bytes.
- sth %g3, [%o3 + 1] ! sth writes the low halfword of %g3, i.e. bytes 1 and 2 of the word.
- inc 4, %o1
- inc 4, %o3
-
- fwd_byte_1_chunk: ! %g1 carries the one byte not yet stored; every std lands at %o3 - 1.
- ld [%o1 + 0], %o4 ! Copy a 32 byte chunk.
- ld [%o1 + 4], %o5
- deccc 32, %o2 ! Sets the condition codes tested by the bpos at the bottom of the loop.
- srl %o4, 8, %g2 ! First 3 bytes in %g2.
- or %g2, %g1, %g2 ! Add top byte in %g1 from previous cycle.
- sll %o4, 24, %o4 ! Fourth byte in high part of %o4.
- srl %o5, 8, %g3 ! Next 3 bytes in low part of %g3.
- or %o4, %g3, %g3 ! Add fourth byte from %o4.
- sll %o5, 24, %g1 ! Save last byte in %g1 for next cycle.
- ld [%o1 + 8], %o4
- ld [%o1 + 12], %o5
- std %g2, [%o3 + 0 - 1]
- srl %o4, 8, %g2
- or %g2, %g1, %g2
- sll %o4, 24, %o4
- srl %o5, 8, %g3
- or %o4, %g3, %g3
- sll %o5, 24, %g1
- ld [%o1 + 16], %o4
- ld [%o1 + 20], %o5
- std %g2, [%o3 + 8 - 1]
- srl %o4, 8, %g2
- or %g2, %g1, %g2
- sll %o4, 24, %o4
- srl %o5, 8, %g3
- or %o4, %g3, %g3
- sll %o5, 24, %g1
- ld [%o1 + 24], %o4
- ld [%o1 + 28], %o5
- std %g2, [%o3 + 16 - 1]
- srl %o4, 8, %g2
- or %g2, %g1, %g2
- sll %o4, 24, %o4
- srl %o5, 8, %g3
- or %o4, %g3, %g3
- sll %o5, 24, %g1
- std %g2, [%o3 + 24 - 1]
- inc 32, %o1
- bpos fwd_byte_1_chunk
- inc 32, %o3 ! Delay slot.
- inc 32, %o2 ! The loop exits with %o2 32 too low; restore the true remaining count.
- srl %g1, 24, %g1 ! Bring the carried byte down to the low byte for stb.
- ba fwd_small_byte_copy
- stb %g1, [%o3 - 1] ! Store last byte (delay slot).
-
- fwd_byte_2_copy: ! Destination mod 4 = 2, source word aligned, %o2 = remaining count - (32 + 4) >= 0.
- ld [%o1], %o4 ! Load first 4 bytes.
- srl %o4, 16, %g3
- sll %o4, 16, %g1 ! Second 2 bytes in top half of %g1.
- sth %g3, [%o3] ! Store first 2 bytes.
- inc 4, %o1
- inc 4, %o3
-
- fwd_byte_2_chunk: ! %g1 carries the two bytes not yet stored; every std lands at %o3 - 2.
- ld [%o1 + 0], %o4 ! Copy a 32 byte chunk.
- ld [%o1 + 4], %o5
- deccc 32, %o2 ! Sets the condition codes tested by the bpos at the bottom of the loop.
- srl %o4, 16, %g2 ! First 2 bytes in %g2.
- or %g2, %g1, %g2 ! Add top 2 bytes in %g1 from previous cycle.
- sll %o4, 16, %o4 ! Second 2 bytes in high part of %o4.
- srl %o5, 16, %g3 ! Third 2 bytes in low part of %g3.
- or %o4, %g3, %g3 ! Add second 2 bytes from %o4.
- sll %o5, 16, %g1 ! Save last 2 bytes in %g1 for next cycle.
- ld [%o1 + 8], %o4
- ld [%o1 + 12], %o5
- std %g2, [%o3 + 0 - 2]
- srl %o4, 16, %g2
- or %g2, %g1, %g2
- sll %o4, 16, %o4
- srl %o5, 16, %g3
- or %o4, %g3, %g3
- sll %o5, 16, %g1
- ld [%o1 + 16], %o4
- ld [%o1 + 20], %o5
- std %g2, [%o3 + 8 - 2]
- srl %o4, 16, %g2
- or %g2, %g1, %g2
- sll %o4, 16, %o4
- srl %o5, 16, %g3
- or %o4, %g3, %g3
- sll %o5, 16, %g1
- ld [%o1 + 24], %o4
- ld [%o1 + 28], %o5
- std %g2, [%o3 + 16 - 2]
- srl %o4, 16, %g2
- or %g2, %g1, %g2
- sll %o4, 16, %o4
- srl %o5, 16, %g3
- or %o4, %g3, %g3
- sll %o5, 16, %g1
- std %g2, [%o3 + 24 - 2]
- inc 32, %o1
- bpos fwd_byte_2_chunk
- inc 32, %o3 ! Delay slot.
- inc 32, %o2 ! The loop exits with %o2 32 too low; restore the true remaining count.
- srl %g1, 16, %g1 ! Bring the carried halfword down to the low half for sth.
- ba fwd_small_byte_copy
- sth %g1, [%o3 - 2] ! Store last 2 bytes (delay slot).
-
- fwd_byte_3_copy: ! Destination mod 4 = 3, source word aligned, %o2 = remaining count - (32 + 4) >= 0.
- ld [%o1], %o4 ! Load first 4 bytes.
- srl %o4, 24, %g2
- sll %o4, 8, %g1 ! Last 3 bytes in top part of %g1.
- stb %g2, [%o3 + 0] ! Store first byte.
- inc 4, %o1
- inc 4, %o3
-
- fwd_byte_3_chunk: ! %g1 carries the three bytes not yet stored; every std lands at %o3 - 3.
- ld [%o1 + 0], %o4 ! Copy a 32 byte chunk.
- ld [%o1 + 4], %o5
- deccc 32, %o2 ! Sets the condition codes tested by the bpos at the bottom of the loop.
- srl %o4, 24, %g2 ! First byte in %g2.
- or %g2, %g1, %g2 ! Add top 3 bytes in %g1 from previous cycle.
- sll %o4, 8, %o4 ! Next 3 bytes in high part of %o4.
- srl %o5, 24, %g3 ! Next byte in low part of %g3.
- or %o4, %g3, %g3 ! Add 3 bytes from %o4.
- sll %o5, 8, %g1 ! Save last 3 bytes in %g1 for next cycle.
- ld [%o1 + 8], %o4
- ld [%o1 + 12], %o5
- std %g2, [%o3 + 0 - 3]
- srl %o4, 24, %g2
- or %g2, %g1, %g2
- sll %o4, 8, %o4
- srl %o5, 24, %g3
- or %o4, %g3, %g3
- sll %o5, 8, %g1
- ld [%o1 + 16], %o4
- ld [%o1 + 20], %o5
- std %g2, [%o3 + 8 - 3]
- srl %o4, 24, %g2
- or %g2, %g1, %g2
- sll %o4, 8, %o4
- srl %o5, 24, %g3
- or %o4, %g3, %g3
- sll %o5, 8, %g1
- ld [%o1 + 24], %o4
- ld [%o1 + 28], %o5
- std %g2, [%o3 + 16 - 3]
- srl %o4, 24, %g2
- or %g2, %g1, %g2
- sll %o4, 8, %o4
- srl %o5, 24, %g3
- or %o4, %g3, %g3
- sll %o5, 8, %g1
- std %g2, [%o3 + 24 - 3]
- inc 32, %o1
- bpos fwd_byte_3_chunk
- inc 32, %o3 ! Delay slot.
- inc 32, %o2 ! The loop exits with %o2 32 too low; restore the true remaining count.
- srl %g1, 16, %g2 ! First two carried bytes in the low halfword of %g2.
- srl %g1, 8, %g1 ! Third carried byte in the low byte of %g1.
- sth %g2, [%o3 - 3] ! Store last 3 bytes.
- ba fwd_small_byte_copy
- stb %g1, [%o3 - 1] ! Delay slot.
-
- bwd_copy: ! Entered with the condition codes from cmp %o2, 7 (delay slot of the bgeu at _memmove).
- add %o1, %o2, %o1 ! Start from other end (add leaves the condition codes alone).
- ble bwd_last_copy ! Make sure can copy first few bytes.
- add %o0, %o2, %o3 ! Current destination in %o3 (delay slot).
-
- ! Copy slowly until reach word boundary for source.
- btst 3, %o1 ! Is the source end already word aligned? (Not a delay slot: the add above fills it.)
- be bwd_third_copy
- btst 1, %o1 ! Delay slot.
- be,a bwd_second_copy ! Source end is halfword aligned: no single byte to copy first.
- lduh [%o1 - 2], %o4 ! Delay slot (annulled unless taken).
-
- ldub [%o1 - 1], %o4 ! Copy byte.
- dec %o1
- stb %o4, [%o3 - 1]
- dec %o3
- btst 3, %o1
- be bwd_third_copy
- dec %o2 ! Delay slot.
-
- lduh [%o1 - 2], %o4 ! Copy half word.
- bwd_second_copy:
- dec 2, %o1
- srl %o4, 8, %o5
- stb %o4, [%o3 - 1] ! Lower byte of the halfword goes to the higher address.
- stb %o5, [%o3 - 2]
- dec 2, %o3
- dec 2, %o2
-
- bwd_third_copy: ! The source pointer %o1 is word aligned from here on.
- btst 3, %o3
- beq bwd_word_copy ! Destination mod 4 = 0.
- btst 4, %o3 ! Delay slot; result is tested by the beq,a below and by the beq,a at bwd_word_copy.
-
- ! Copy next word if destination is in high half of a double word.
- beq,a bwd_test_alignment
- deccc 32 + 4, %o2 ! Delay slot (annulled unless taken).
-
- ld [%o1 - 4], %o4 ! Copy word.
- dec 4, %o1
- srl %o4, 8, %o5
- srl %o4, 16, %g2
- srl %o4, 24, %g3
- stb %o4, [%o3 - 1] ! Destination is not word aligned, so the word is stored a byte at a time.
- stb %o5, [%o3 - 2]
- stb %g2, [%o3 - 3]
- stb %g3, [%o3 - 4]
- dec 4, %o3
-
- deccc 32 + 4 + 4, %o2 ! See if can copy a 32 + 4 byte chunk (the extra 4 counts the word just copied).
- bwd_test_alignment:
- bneg,a bwd_small_byte_copy
- inc 32 + 4, %o2 ! Delay slot (annulled unless taken): undo the trial subtraction.
- ! Call appropriate routine according to alignment of dest.
- btst 1, %o3
- beq bwd_byte_2_copy ! Destination mod 4 = 2.
- btst 2, %o3 ! Delay slot.
- beq,a bwd_byte_1_copy ! Destination mod 4 = 1.
- nop
- ba,a bwd_byte_3_copy ! Destination mod 4 = 3. (ba,a annuls its delay slot, so none is supplied.)
-
- bwd_word_copy: ! Source and destination ends are both word aligned here.
- ! Copy slowly until destination is double word aligned.
- beq,a bwd_word_test_alignment ! Tests the btst 4, %o3 done in the delay slot at bwd_third_copy.
- deccc 64, %o2 ! See if can copy a 64 byte chunk (delay slot).
-
- ld [%o1 - 4], %o4 ! Copy word.
- dec 4, %o1
- st %o4, [%o3 - 4]
- dec 4, %o3
- deccc 64 + 4, %o2 ! See if can copy a 64 byte chunk (the extra 4 counts the word just copied).
-
- bwd_word_test_alignment:
- bneg,a bwd_small_word_copy
- inccc 64 - 8, %o2 ! See if can copy an 8 byte chunk (delay slot).
- btst 7, %o1 ! Source end double word aligned as well?
- beq,a bwd_doubleword_chunk
- nop
-
- bwd_word_chunk: ! Source only word aligned: the higher word is loaded into the odd register of each pair.
- ld [%o1 - 4], %o5 ! Copy a 64 byte chunk.
- ld [%o1 - 8], %o4
- ld [%o1 - 12], %g3
- ld [%o1 - 16], %g2
- std %o4, [%o3 - 8]
- std %g2, [%o3 - 16]
- ld [%o1 - 20], %o5
- ld [%o1 - 24], %o4
- ld [%o1 - 28], %g3
- ld [%o1 - 32], %g2
- std %o4, [%o3 - 24]
- std %g2, [%o3 - 32]
- ld [%o1 - 36], %o5
- ld [%o1 - 40], %o4
- ld [%o1 - 44], %g3
- ld [%o1 - 48], %g2
- std %o4, [%o3 - 40]
- std %g2, [%o3 - 48]
- ld [%o1 - 52], %o5
- ld [%o1 - 56], %o4
- ld [%o1 - 60], %g3
- ld [%o1 - 64], %g2
- std %o4, [%o3 - 56]
- std %g2, [%o3 - 64]
- deccc 64, %o2 ! Sets the condition codes tested by the bpos below.
- dec 64, %o1
- bpos bwd_word_chunk
- dec 64, %o3 ! Delay slot.
- ba bwd_small_word_copy
- inccc 64 - 8, %o2 ! See if can copy an 8 byte chunk (delay slot).
-
- bwd_doubleword_chunk: ! Source and destination both double word aligned: ldd/std throughout.
- ldd [%o1 - 8], %o4 ! Copy a 64 byte chunk.
- ldd [%o1 - 16], %g2
- std %o4, [%o3 - 8]
- std %g2, [%o3 - 16]
- ldd [%o1 - 24], %o4
- ldd [%o1 - 32], %g2
- std %o4, [%o3 - 24]
- std %g2, [%o3 - 32]
- ldd [%o1 - 40], %o4
- ldd [%o1 - 48], %g2
- std %o4, [%o3 - 40]
- std %g2, [%o3 - 48]
- ldd [%o1 - 56], %o4
- ldd [%o1 - 64], %g2
- std %o4, [%o3 - 56]
- std %g2, [%o3 - 64]
- deccc 64, %o2 ! Sets the condition codes tested by the bpos below.
- dec 64, %o1
- bpos bwd_doubleword_chunk
- dec 64, %o3 ! Delay slot.
- inccc 64 - 8, %o2 ! See if can copy an 8 byte chunk.
-
- bwd_small_word_copy: ! Entered with the condition codes set by an inccc 64 - 8, %o2.
- bneg,a bwd_last_copy_tested
- inccc 8, %o2 ! See if can copy next byte (delay slot).
-
- bwd_small_word_chunk:
- ld [%o1 - 4], %o5 ! Copy an 8 byte chunk.
- ld [%o1 - 8], %o4
- deccc 8, %o2 ! Sets the condition codes tested by the bpos below.
- std %o4, [%o3 - 8]
- dec 8, %o1
- bpos bwd_small_word_chunk
- dec 8, %o3 ! Delay slot.
- inc 8, %o2 ! The loop exits with %o2 8 too low; restore the true remaining count.
-
- bwd_last_copy:
- tst %o2 ! See if can copy next byte.
-
- bwd_last_copy_tested: ! Entered with the condition codes reflecting the remaining count in %o2.
- ble copy_done
-
- bwd_last_chunk: ! Copies the tail two bytes per pass, working downwards.
- deccc 2, %o2 ! Delay slot of the ble above on first entry; also the loop head.
- ldub [%o1 - 1], %o4 ! Copy byte.
- bneg copy_done ! Only one byte was left: it is stored in the delay slot.
- stb %o4, [%o3 - 1] ! Delay slot.
- ldub [%o1 - 2], %o4 ! Copy byte.
- dec 2, %o1
- stb %o4, [%o3 - 2]
- bne,a bwd_last_chunk ! Still tests the deccc 2 above; loop unless the count reached 0.
- dec 2, %o3 ! Delay slot.
- retl ! Return value will still be in %o0.
- nop ! Delay slot of retl.
-
- bwd_small_byte_copy: ! Source word aligned, destination not: load words, store bytes.
- deccc 8, %o2 ! See if can copy an 8 byte chunk.
- bneg,a bwd_last_copy_tested
- inccc 8, %o2 ! See if can copy next byte (delay slot).
-
- bwd_small_byte_chunk:
- ld [%o1 - 4], %o4 ! Copy an 8 byte chunk.
- ld [%o1 - 8], %o5
- srl %o4, 8, %g2
- stb %o4, [%o3 - 1] ! Least significant byte goes to the highest address.
- stb %g2, [%o3 - 2]
- srl %o4, 16, %g2
- srl %o4, 24, %g3
- stb %g2, [%o3 - 3]
- stb %g3, [%o3 - 4]
- srl %o5, 8, %g2
- stb %o5, [%o3 - 5]
- stb %g2, [%o3 - 6]
- srl %o5, 16, %g2
- srl %o5, 24, %g3
- stb %g2, [%o3 - 7]
- stb %g3, [%o3 - 8]
- dec 8, %o1
- ba bwd_small_byte_copy ! Loop back to the count test at the top.
- dec 8, %o3 ! Delay slot.
-
- bwd_byte_1_copy: ! Destination end mod 4 = 1, source word aligned, %o2 = remaining count - (32 + 4) >= 0.
- ld [%o1 - 4], %o5 ! Load first 4 bytes.
- srl %o5, 8, %g1 ! Last 3 bytes in low part of %g1.
- stb %o5, [%o3 - 1] ! Store first byte.
- dec 4, %o1
- dec 4, %o3
-
- bwd_byte_1_chunk: ! %g1 carries the three bytes not yet stored; every std lands 3 bytes above %o3 - 8k.
- ld [%o1 - 4], %o5 ! Copy a 32 byte chunk.
- ld [%o1 - 8], %o4
- deccc 32, %o2 ! Sets the condition codes tested by the bpos at the bottom of the loop.
- sll %o5, 24, %g3 ! First byte in %g3.
- or %g3, %g1, %g3 ! Add 3 low bytes in %g1 from previous cycle.
- srl %o5, 8, %o5 ! Next 3 bytes in low part of %o5.
- sll %o4, 24, %g2 ! Next byte in high part of %g2.
- or %o5, %g2, %g2 ! Add low 3 bytes from %o5.
- srl %o4, 8, %g1 ! Save last 3 bytes in %g1 for next cycle.
- ld [%o1 - 12], %o5
- ld [%o1 - 16], %o4
- std %g2, [%o3 - 8 + 3]
- sll %o5, 24, %g3
- or %g3, %g1, %g3
- srl %o5, 8, %o5
- sll %o4, 24, %g2
- or %o5, %g2, %g2
- srl %o4, 8, %g1
- ld [%o1 - 20], %o5
- ld [%o1 - 24], %o4
- std %g2, [%o3 - 16 + 3]
- sll %o5, 24, %g3
- or %g3, %g1, %g3
- srl %o5, 8, %o5
- sll %o4, 24, %g2
- or %o5, %g2, %g2
- srl %o4, 8, %g1
- ld [%o1 - 28], %o5
- ld [%o1 - 32], %o4
- std %g2, [%o3 - 24 + 3]
- sll %o5, 24, %g3
- or %g3, %g1, %g3
- srl %o5, 8, %o5
- sll %o4, 24, %g2
- or %o5, %g2, %g2
- srl %o4, 8, %g1
- std %g2, [%o3 - 32 + 3]
- dec 32, %o1
- bpos bwd_byte_1_chunk
- dec 32, %o3 ! Delay slot.
- inc 32, %o2 ! The loop exits with %o2 32 too low; restore the true remaining count.
- srl %g1, 16, %g2 ! First carried byte in the low byte of %g2.
- sth %g1, [%o3 + 1] ! Store last 3 bytes.
- ba bwd_small_byte_copy
- stb %g2, [%o3 + 0] ! Delay slot.
-
- bwd_byte_2_copy: ! Destination end mod 4 = 2, source word aligned, %o2 = remaining count - (32 + 4) >= 0.
- ld [%o1 - 4], %o4 ! Load first 4 bytes.
- srl %o4, 16, %g1 ! Second 2 bytes in low half of %g1.
- sth %o4, [%o3 - 2] ! Store first 2 bytes.
- dec 4, %o1
- dec 4, %o3
-
- bwd_byte_2_chunk: ! %g1 carries the two bytes not yet stored; every std lands 2 bytes above %o3 - 8k.
- ld [%o1 - 4], %o5 ! Copy a 32 byte chunk.
- ld [%o1 - 8], %o4
- deccc 32, %o2 ! Sets the condition codes tested by the bpos at the bottom of the loop.
- sll %o5, 16, %g3 ! First 2 bytes in %g3.
- or %g3, %g1, %g3 ! Add low 2 bytes in %g1 from previous cycle.
- srl %o5, 16, %o5 ! Second 2 bytes in low part of %o5.
- sll %o4, 16, %g2 ! Third 2 bytes in high part of %g2.
- or %o5, %g2, %g2 ! Add second 2 bytes from %o5.
- srl %o4, 16, %g1 ! Save last 2 bytes in %g1 for next cycle.
- ld [%o1 - 12], %o5
- ld [%o1 - 16], %o4
- std %g2, [%o3 - 8 + 2]
- sll %o5, 16, %g3
- or %g3, %g1, %g3
- srl %o5, 16, %o5
- sll %o4, 16, %g2
- or %o5, %g2, %g2
- srl %o4, 16, %g1
- ld [%o1 - 20], %o5
- ld [%o1 - 24], %o4
- std %g2, [%o3 - 16 + 2]
- sll %o5, 16, %g3
- or %g3, %g1, %g3
- srl %o5, 16, %o5
- sll %o4, 16, %g2
- or %o5, %g2, %g2
- srl %o4, 16, %g1
- ld [%o1 - 28], %o5
- ld [%o1 - 32], %o4
- std %g2, [%o3 - 24 + 2]
- sll %o5, 16, %g3
- or %g3, %g1, %g3
- srl %o5, 16, %o5
- sll %o4, 16, %g2
- or %o5, %g2, %g2
- srl %o4, 16, %g1
- std %g2, [%o3 - 32 + 2]
- dec 32, %o1
- bpos bwd_byte_2_chunk
- dec 32, %o3 ! Delay slot.
- inc 32, %o2 ! The loop exits with %o2 32 too low; restore the true remaining count.
- ba bwd_small_byte_copy
- sth %g1, [%o3 + 0] ! Store last 2 bytes (delay slot).
-
- bwd_byte_3_copy: ! Destination end mod 4 = 3, source word aligned, %o2 = remaining count - (32 + 4) >= 0.
- ld [%o1 - 4], %o4 ! Load first 4 bytes.
- srl %o4, 8, %g3
- srl %o4, 24, %g1 ! Last byte in low part of %g1.
- stb %o4, [%o3 - 1] ! Store first 3 bytes.
- sth %g3, [%o3 - 3] ! sth writes the low halfword of %g3, i.e. bytes 1 and 2 of the word.
- dec 4, %o1
- dec 4, %o3
-
- bwd_byte_3_chunk: ! %g1 carries the one byte not yet stored; every std lands 1 byte above %o3 - 8k.
- ld [%o1 - 4], %o5 ! Copy a 32 byte chunk.
- ld [%o1 - 8], %o4
- deccc 32, %o2 ! Sets the condition codes tested by the bpos at the bottom of the loop.
- sll %o5, 8, %g3 ! First 3 bytes in %g3.
- or %g3, %g1, %g3 ! Add low byte in %g1 from previous cycle.
- srl %o5, 24, %o5 ! Fourth byte in low part of %o5.
- sll %o4, 8, %g2 ! Next 3 bytes in high part of %g2.
- or %o5, %g2, %g2 ! Add low byte from %o5.
- srl %o4, 24, %g1 ! Save last byte in %g1 for next cycle.
- ld [%o1 - 12], %o5
- ld [%o1 - 16], %o4
- std %g2, [%o3 - 8 + 1]
- sll %o5, 8, %g3
- or %g3, %g1, %g3
- srl %o5, 24, %o5
- sll %o4, 8, %g2
- or %o5, %g2, %g2
- srl %o4, 24, %g1
- ld [%o1 - 20], %o5
- ld [%o1 - 24], %o4
- std %g2, [%o3 - 16 + 1]
- sll %o5, 8, %g3
- or %g3, %g1, %g3
- srl %o5, 24, %o5
- sll %o4, 8, %g2
- or %o5, %g2, %g2
- srl %o4, 24, %g1
- ld [%o1 - 28], %o5
- ld [%o1 - 32], %o4
- std %g2, [%o3 - 24 + 1]
- sll %o5, 8, %g3
- or %g3, %g1, %g3
- srl %o5, 24, %o5
- sll %o4, 8, %g2
- or %o5, %g2, %g2
- srl %o4, 24, %g1
- std %g2, [%o3 - 32 + 1]
- dec 32, %o1
- bpos bwd_byte_3_chunk
- dec 32, %o3 ! Delay slot.
- inc 32, %o2 ! The loop exits with %o2 32 too low; restore the true remaining count.
- ba bwd_small_byte_copy
- stb %g1, [%o3 + 0] ! Store last byte (delay slot).
-
- copy_done: ! Shared exit for the n <= 0 early-out and for the forward and backward byte tails.
- retl ! Return value will still be in %o0.
- nop ! Delay slot of retl.
-
- ----------------------------- cut here ----------------------------
-
- /* bzero: clear length bytes starting at dst by delegating to memset(). */
- void bzero(dst, length)
- char *dst;
- int length;
- {
- void *start = (void *) dst;
- int count = length;
-
- (void) memset(start, 0, count);
- }
-
- ----------------------------- cut here ----------------------------
-
- /*
-  * bcopy: BSD-style copy of length bytes from src to dst.  Note the argument
-  * order: source first, destination second (the reverse of memmove()).
-  *
-  * Callers of bcopy() are allowed to pass overlapping regions, so this must
-  * forward to memmove() - the overlap-safe routine supplied above - and not
-  * to memcpy(), whose behaviour is undefined when the regions overlap.
-  */
- void bcopy(src, dst, length)
- char *src;
- char *dst;
- int length;
- {
- (void) memmove((void *) dst, (void *) src, (int) length);
- }
-