home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 6 File
/
06-File.zip
/
ramfs102.zip
/
src
/
vmheap.asm
< prev
next >
Wrap
Assembly Source File
|
2002-10-20
|
17KB
|
663 lines
; $Id: vmheap.asm,v 1.1.2.3 2002/10/21 00:11:36 root Exp $
;
; VM block transfer routines. ALP v 4.00.007 or later required.
.586p
.MMX
.XMM
extern DOS32FLATDS:abs
extern _fix_kernel:FAR32
public _threednow
OPTION SEGMENT:USE16
_BSS segment public 'BSS'
extern _DevHlp: dword
extern _flat_ds: word
far_gate fword ?
_threednow dw ?
_BSS ends
DGROUP GROUP _BSS
_TEXT32 segment para public use32 'CODE'
assume cs:_TEXT32
assume ds:FLAT, es:FLAT
; Workhorse procedure for SIMD data transfer. Derived from:
;
; AMD Athlon<TM> Processor x86 Code Optimization 22007J August 2001
; Chapter 10: "3DNow!<TM> and MMX<TM> Optimizations"
;
; AMD Athlon Processor-Specific Code
;
; The following memory copy example is written with Microsoft
; Visual C++ in-line assembler syntax, and assumes that the
; Microsoft Processor Pack is installed (available from
; Microsoft's web site). This is a general purpose memcpy()
; routine, which can efficiently copy any size block, small or
; large. Data alignment is strongly recommended for good
; performance, but this code can handle non-aligned blocks.
;
; Example 2: Optimized memcpy() for Any Data Size or Alignment
;
TINY_BLOCK_COPY EQU 64 ; upper limit for movsd type copy
; The smallest copy uses the X86 "movsd" instruction, in an optimized
; form which is an "unrolled loop".
IN_CACHE_COPY EQU 64 * 1024 ; upper limit for movq/movq copy w/SW prefetch
; Next is a copy that uses the MMX registers to copy 8 bytes at a time,
; also using the "unrolled loop" optimization. This code uses
; the software prefetch instruction to get the data into the cache.
UNCACHED_COPY EQU 197 * 1024 ; upper limit for movq/movntq w/SW prefetch
; For larger blocks, which will spill beyond the cache, it's faster to
; use the Streaming Store instruction MOVNTQ. This write instruction
; bypasses the cache and writes straight to main memory. This code also
; uses the software prefetch instruction to pre-read the data.
; USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
CACHEBLOCK EQU 80h ; # of 64-byte blocks (cache lines) for block prefetch
; For the largest size blocks, a special technique called Block Prefetch
; can be used to accelerate the read operations. Block Prefetch reads
; one address per cache line, for a series of cache lines, in a short loop.
; This is faster than using software prefetch. The technique is great for
; getting maximum read bandwidth, especially in DDR memory systems.
ramfs_3dnow_transfer proc far
push ebp
mov ebp, esp
sub esp, 108
fsave [ebp-108]
mov ebx, ecx ; keep a copy of count
cld
cmp ecx, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy
cmp ecx, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_do_align ; it appears to be slower
cmp ecx, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset FLAT:$memcpy_align_done
jmp ecx ; jump to array of movsb`s
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes
cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
jae $memcpy_uc_test
; This is small block copy that uses the MMX registers to copy 8 bytes
; at a time. It uses the "unrolled loop" optimization, and also uses
; the software prefetch instruction to get the data into the cache.
ALIGN 16
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
prefetchnta ds:[esi+(200*64/34+192)] ; start reading ahead
movq mm0, ds:[esi+0] ; read 64 bits
movq mm1, ds:[esi+8]
movq es:[edi+0], mm0 ; write 64 bits
movq es:[edi+8], mm1 ; note: the normal movq writes the
movq mm2, ds:[esi+16] ; data to cache; a cache line will be
movq mm3, ds:[esi+24] ; allocated as needed, to store the data
movq es:[edi+16], mm2
movq es:[edi+24], mm3
movq mm0, ds:[esi+32]
movq mm1, ds:[esi+40]
movq es:[edi+32], mm0
movq es:[edi+40], mm1
movq mm2, ds:[esi+48]
movq mm3, ds:[esi+56]
movq es:[edi+48], mm2
movq es:[edi+56], mm3
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_ic_1 ; last 64-byte block?
$memcpy_ic_2:
mov ecx, ebx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset FLAT:$memcpy_last_few
jmp ecx ; jump to array of movsd`s
$memcpy_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_bp_1
$memcpy_64_test:
or ecx, ecx ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left
; For larger blocks, which will spill beyond the cache, it's faster to
; use the Streaming Store instruction MOVNTQ. This write instruction
; bypasses the cache and writes straight to main memory. This code also
; uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy
prefetchnta ds:[esi+(200*64/34+192)] ; start reading ahead
movq mm0, ds:[esi+0] ; read 64 bits
add edi,64 ; update destination pointer
movq mm1, ds:[esi+8]
add esi,64 ; update source pointer
movq mm2, ds:[esi-48]
movntq es:[edi-64], mm0 ; write 64 bits, bypassing the cache
movq mm0, ds:[esi-40] ; note: movntq also prevents the CPU
movntq es:[edi-56], mm1 ; from READING the destination address
movq mm1, ds:[esi-32] ; into the cache, only to be over-written
movntq es:[edi-48], mm2 ; so that also helps performance
movq mm2, ds:[esi-24]
movntq es:[edi-40], mm0
movq mm0, ds:[esi-16]
movntq es:[edi-32], mm1
movq mm1, ds:[esi-8]
movntq es:[edi-24], mm2
movntq es:[edi-16], mm0
dec ecx
movntq es:[edi-8], mm1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost dont
; For the largest size blocks, a special technique called Block Prefetch
; can be used to accelerate the read operations. Block Prefetch reads
; one address per cache line, for a series of cache lines, in a short loop.
; This is faster than using software prefetch. The technique is great for
; getting maximum read bandwidth, especially in DDR memory systems.
$memcpy_bp_1: ; large blocks, block prefetch copy
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
jl $memcpy_64_test ; no, back to regular uncached copy
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
mov edx, ds:[esi-64] ; grab one address per cache line
mov edx, ds:[esi-128] ; grab one address per cache line
sub esi, 128 ; go reverse order to suppress HW prefetcher
dec eax ; count down the cache lines
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
movq mm0, ds:[esi] ; read 64 bits
movq mm1, ds:[esi+8]
movq mm2, ds:[esi+16]
movq mm3, ds:[esi+24]
movq mm4, ds:[esi+32]
movq mm5, ds:[esi+40]
movq mm6, ds:[esi+48]
movq mm7, ds:[esi+56]
add esi, 64 ; update source pointer
movntq es:[edi], mm0 ; write 64 bits, bypassing cache
movntq es:[edi+8], mm1 ; note: movntq also prevents the CPU
movntq es:[edi+16], mm2 ; from READING the destination address
movntq es:[edi+24], mm3 ; into the cache, only to be over-written,
movntq es:[edi+32], mm4 ; so that also helps performance
movntq es:[edi+40], mm5
movntq es:[edi+48], mm6
movntq es:[edi+56], mm7
add edi, 64 ; update dest pointer
dec eax ; count down
jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing blocks
; The smallest copy uses the X86 "movsd" instruction, in an optimized
; form which is an "unrolled loop". Then it handles the last few bytes.
align 4
movsd
movsd ; perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd ; perform last 1-7 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
$memcpy_last_few: ; dword aligned from before movsd`s
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
emms ; clean up the MMX state
sfence ; flush the write buffer
frstor [ebp-108]
mov esp, ebp
pop ebp
retf
ramfs_3dnow_transfer endp
; The old yet reliable DWORD copy procedure
ramfs_dword_transfer proc far
mov ebx, ecx
cld
shr ecx, 2
and ebx, 3
rep movsd
mov ecx, ebx
rep movsb
retf
ramfs_dword_transfer endp
_TEXT32 ends
_TEXT segment public 'CODE'
assume es:nothing, ss:nothing, ds:DGROUP, fs:nothing, gs:nothing
public VMVIRTTOFLAT
public VMALLOC
public VMFREE
public VMREADUCHAR
public VMREADUSHORT
public VMREADBLK
public VMWRITEBLK
public VMREAD
public VMWRITE
public VMCOPY
;-----------------------------------------------------------------------------
; FLAT _pascal VMVirtToFlat (void *p);
;-----------------------------------------------------------------------------
VMVIRTTOFLAT proc near
push bp
mov bp, sp
push si
movzx esi, word ptr [bp+4] ; offset of p
mov ax, word ptr [bp+6] ; selector of p
mov dl, 5Bh ; _DevHlp_VirtToLin
call [_DevHlp]
jnc vvtf_end
xor eax, eax
vvtf_end:
shld edx, eax, 16
pop si
pop bp
ret 4
VMVIRTTOFLAT endp
;-----------------------------------------------------------------------------
; FLAT _pascal VMAlloc (ULONG cbSize);
;-----------------------------------------------------------------------------
VMALLOC proc near
push bp
mov bp, sp
mov ecx, [bp+4] ; cbSize
mov eax, 00000004h ; flags
mov dl, 57h ; _DevHlp_VMAlloc
call [_DevHlp]
jnc va_end
xor eax, eax
va_end:
shld edx, eax, 16
pop bp
ret 4
VMALLOC endp
;-----------------------------------------------------------------------------
; void _pascal VMFree (FLAT flatBlock);
;-----------------------------------------------------------------------------
VMFREE proc near
push bp
mov bp, sp
mov eax, [bp+4] ; flatBlock
mov dl, 58h ; _DevHlp_VMFree
call [_DevHlp]
jnc vf_end
int 3
vf_end:
pop bp
ret 4
VMFREE endp
;-----------------------------------------------------------------------------
; UCHAR _pascal VMReadUChar (FLAT flatSrc);
;-----------------------------------------------------------------------------
VMREADUCHAR proc near
push bp
mov bp, sp
mov ax, DOS32FLATDS
mov es, ax
mov eax, [bp+4] ; flatSrc
mov al, es:[eax]
pop bp
ret 4
VMREADUCHAR endp
;-----------------------------------------------------------------------------
; USHORT _pascal VMReadUShort (FLAT flatSrc);
;-----------------------------------------------------------------------------
VMREADUSHORT proc near
push bp
mov bp, sp
mov ax, DOS32FLATDS
mov es, ax
mov eax, [bp+4] ; flatSrc
mov ax, es:[eax]
pop bp
ret 4
VMREADUSHORT endp
;-----------------------------------------------------------------------------
; void _pascal VMReadBlk (BLOCK _ss *pBlk, FLAT flatBlk);
;-----------------------------------------------------------------------------
VMREADBLK proc near
push bp
mov bp, sp
push ds
mov ax, DOS32FLATDS
mov es, ax
mov ebx, [bp+4] ; flatBlk
mov eax, es:[ebx+0]
mov edx, es:[ebx+4]
lds bx, dword ptr [bp+8] ; pBlk
mov ds:[bx+0], eax
mov ds:[bx+4], edx
pop ds
pop bp
ret 8
VMREADBLK endp
;-----------------------------------------------------------------------------
; void _pascal VMWriteBlk (FLAT flatBlk, BLOCK _ss *pBlk);
;-----------------------------------------------------------------------------
VMWRITEBLK proc near
push bp
mov bp, sp
push ds
mov ax, DOS32FLATDS
mov es, ax
lds bx, dword ptr [bp+4] ; pBlk
mov eax, ds:[bx+0]
mov edx, ds:[bx+4]
mov ebx, [bp+8] ; flatBlk
mov es:[ebx+0], eax
mov es:[ebx+4], edx
pop ds
pop bp
ret 8
VMWRITEBLK endp
;-----------------------------------------------------------------------------
; void _pascal VMRead (void *pDest, FLAT flatSrc, USHORT cbLen);
;-----------------------------------------------------------------------------
VMREAD proc near
push bp
mov bp, sp
push si
push di
push ds
push es
mov ax, DOS32FLATDS
mov ds, ax
sub ecx, ecx
mov cx, [bp+4] ; number of bytes to copy
xor edi, edi
mov esi, [bp+6] ; source
les di, [bp+10] ; destination
push fs
mov ax, seg DGROUP
mov fs, ax
call fword ptr fs:far_gate
pop fs
pop es
pop ds
pop di
pop si
pop bp
ret 10
VMREAD endp
;-----------------------------------------------------------------------------
; void _pascal VMWrite (FLAT flatDest, void *pSrc, USHORT cbLen);
;-----------------------------------------------------------------------------
VMWRITE proc near
push bp
mov bp, sp
push si
push di
push ds
push es
mov ax, DOS32FLATDS
mov es, ax
mov edi, [bp+10] ; flatDest
xor esi, esi
lds si, [bp+6] ; pSrc
xor ecx, ecx
mov cx, word ptr [bp+4] ; cbLen
push fs
mov ax, seg DGROUP
mov fs, ax
call fword ptr fs:far_gate
pop fs
pop es
pop ds
pop di
pop si
pop bp
ret 10
VMWRITE endp
;-----------------------------------------------------------------------------
; void _pascal VMCopy (FLAT flatDest, FLAT flatSrc, ULONG cbLen);
;-----------------------------------------------------------------------------
VMCOPY proc near
push bp
mov bp, sp
push es
push ds
push si
push di
mov ax, DOS32FLATDS
mov es, ax
mov ds, ax
mov edi, [bp+12] ; flatDest
mov esi, [bp+8] ; flatSrc
mov ecx, [bp+4] ; cbLen
mov edx, ecx
cmp edi, esi
je vc_end
ja vc_backwards
; move forwards
push fs
mov ax, seg DGROUP
mov fs, ax
call fword ptr fs:far_gate
pop fs
jmp vc_end
vc_backwards:
; move backwards
lea esi, [esi+ecx-1]
lea edi, [edi+ecx-1]
and ecx, 0003h
std
rep movs byte ptr es:[edi], byte ptr es:[esi]
sub esi, 3
sub edi, 3
mov ecx, edx
shr ecx, 2
rep movs dword ptr es:[edi], dword ptr es:[esi]
cld
vc_end:
pop di
pop si
pop ds
pop es
pop bp
ret 12
VMCOPY endp
;
; Weird initialization routine. _fix_kernel can't be issued at the init time,
; so it's deferred as we plant init_cont to sit in place of real transfer
; subroutine.
;
VMINIT proc near
push ds
mov ax, seg DGROUP
mov ds, ax
mov dword ptr ds:far_gate, 0
mov word ptr ds:far_gate, offset init_cont
mov word ptr ds:far_gate+4, seg init_cont
pop ds
ret
init_cont:
pushad
push ds
push es
push fs
mov ax, seg DGROUP
mov ds, ax
mov fs, ax
mov al, 9 ; This is an undocumented DosEnv entry
mov dl, 24h
call [_DevHlp]
mov es, ax
mov cx, seg DGROUP
movzx ax, byte ptr es:[bx]
shl ax, 2
mov ds, cx
add bx, ax
mov ax, word ptr es:[bx+26h] ; Flat CS
mov word ptr ds:far_gate+4, ax
mov ax, word ptr es:[bx+2Ah] ; Flat DS
mov word ptr ds:_flat_ds, ax
; Check if 3DNow! was forced
mov ax, ds:_threednow
or ax, ax
jz no_3dnow
cmp al, 1
je ok_3dnow
; 3DNow!<TM> detection - AMD technote #21928G/0 March 2000
pushfd
pop eax
mov ebx, eax
xor eax, 00200000h ; Flip CPUID bit
push eax
popfd
pushfd
pop eax
cmp eax, ebx
je no_3dnow
mov eax, 80000000h
cpuid
cmp eax, 80000000h
jbe no_3dnow
mov eax, 80000001h
cpuid
test edx, 80000000h
je no_3dnow
; Now try to apply the patch. Compose the address manually using Flat CS
; provided by kernel.
ok_3dnow:
push es
push ds
mov dword ptr fs:far_gate, offset FLAT:_fix_kernel
mov ax, fs:_flat_ds
mov ds, ax
mov es, ax
call fword ptr fs:far_gate
pop ds
pop es
or eax, eax
jz kernel_fixed
mov ax, fs:_threednow
cmp al, 1 ; If 3DNow! was forced, let it stay
jne no_3dnow
kernel_fixed:
; Install the appropriate handler
mov dword ptr ds:far_gate, offset FLAT:ramfs_3dnow_transfer
jmp short far_gate_done
no_3dnow:
mov dword ptr ds:far_gate, offset FLAT:ramfs_dword_transfer
far_gate_done:
pop fs
pop es
pop ds
popad
jmp fword ptr fs:far_gate
VMINIT endp
_TEXT ends
end