home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Fresh Fish 5
/
FreshFish_July-August1994.bin
/
bbs
/
dev
/
c2p.lha
/
C2P
/
CPU+Blitter
/
c2p8.s
< prev
next >
Wrap
Text File
|
1994-07-01
|
21KB
|
834 lines
opt o+,l+,d+
incdir inc:
include "exec/exec_lib.i"
include "exec/exec.i"
include "graphics/graphics_lib.i"
include "hardware/custom.i"
xdef _c2p8_init
xdef _c2p8_go
; ---------------------------------------------------------------------
; void __asm c2p8_init (register __a0 UBYTE *chunky,
; register __a1 UBYTE *chunky_cmp,
; register __a2 PLANEPTR *planes,
; register __d0 ULONG signals1,
; register __d1 ULONG signals2,
; register __d2 ULONG pixels, // width*height
; register __d3 ULONG offset, // byte offset into plane
; register __d4 UBYTE *buff2, // Chip buffer width*height
; register __d5 UBYTE *buff3, // Chip buffer width*height
; register __a3 struct GfxBase *GfxBase);
;
; void c2p8_go();
;
; -------------------------------------------------------------------
;
; Pipelined CPU+blitter 8-plane chunky to planar converter.
; Optimised for 68020/30 with fastmem.
;
; Author: Peter McGavin (e-mail peterm@maths.grace.cri.nz), 21 April 1994
; Based on James McCoull's 4-pass blitter algorithm.
;
; Modified by Conrad Sanderson (g.sanderson@ais.gu.edu.au), 4 June 1994
;
; This code is public domain.
;
; algorithm:
;
; Uses chunky comparison buffer. Returns immediately if no diffs found.
; Performs first 2 passes (Fast->Chip) with the CPU (in 1 pass).
; Only converts 32-pixel "units" that changed since last time.
; Updates chunky comparison buffer.
; If nothing has changed, signals signals1 and returns immediately.
; Waits for previous QBlit() to completely finish (signals2).
; Then launches passes 3 & 4 with QBlit().
; Returns immediately after launching passes 3 & 4.
; ** your task can render the next frame while the converter is still going **
; Signals via signals1 (asynchronously) after completion of pass 3.
; Signals via signals2 from CleanUp() on completion of QBlit().
;
; Approx timing for A4000/030, (320x200x8):
; CPU pass min 13ms, max 37ms, depending how different (then return)
; Asynchronous blitter passes add 62ms
; Max framerate (with changes every frame) is 62ms/frame = 16fps
; occurs when fBUFFER rendering time <= 25ms (= 62ms-37ms)
;
; Approx timing for A1200+fast ram, (320x200x8):
; CPU pass min 18ms, max 55ms
;
;
; see c2p8_demo.c for example usage
; -------------------------------------------------------------------
section chunks,code
; ---------------------------------------------------------------------
; _c2p8_init -- record the converter's parameters for later use.
;
; Every argument is copied into a variable addressed relative to
; mybltnode; nothing is converted or blitted here.
;
; In:   a0 = chunky          d0 = signals1      d3 = offset
;       a1 = chunky_cmp      d1 = signals2      d4 = buff2
;       a2 = planes          d2 = pixels        d5 = buff3
;       a3 = GfxBase
; Out:  nothing
; Regs: d2-d3/a2-a4/a6 are saved on entry and restored on exit;
;       no other register is written.
; NOTE(review): _c2p8_go consumes 32 pixels per loop pass, so pixels is
;       presumably expected to be a multiple of 32 -- confirm with caller.
; ---------------------------------------------------------------------
_c2p8_init:
        movem.l d2-d3/a2-a4/a6,-(sp)
        lea     mybltnode(pc),a4        ; a4 = base for all variable stores

; buffers and bitplane placement
        move.l  d4,tmp_buff2-mybltnode(a4)
        move.l  d5,tmp_buff3-mybltnode(a4)
        move.l  a0,chunky-mybltnode(a4)
        move.l  a1,chunky_cmp-mybltnode(a4)
        move.l  a2,planes-mybltnode(a4)
        move.l  d3,offset-mybltnode(a4)

; signal masks
        move.l  d0,signals1-mybltnode(a4)
        move.l  d1,signals2-mybltnode(a4)

; pixel count and its binary fractions (d2 is halved after each store)
        move.l  d2,pixels-mybltnode(a4)
        lsr.l   #1,d2
        move.l  d2,pixels2-mybltnode(a4)        ; pixels/2
        lsr.l   #1,d2
        move.l  d2,pixels4-mybltnode(a4)        ; pixels/4
        lsr.l   #1,d2
        move.l  d2,pixels8-mybltnode(a4)        ; pixels/8
        lsr.l   #1,d2
        move.l  d2,pixels16-mybltnode(a4)       ; pixels/16

; library bases and the calling task
        move.l  a3,gfxbase-mybltnode(a4)
        movea.l (4).w,a6                        ; longword at absolute address 4
        move.l  a6,sysbase-mybltnode(a4)
        move.l  ThisTask(a6),task-mybltnode(a4) ; remember the current task pointer

        movem.l (sp)+,d2-d3/a2-a4/a6
        rts
cnop 0,4
_c2p8_go: movem.l d2-d7/a2-a6,-(sp)
lea mybltnode(pc),a2
move.l a2,a0
; wait for previous call to c2p8_go to finish pass 3
move.l (signals1-mybltnode,a0),d0
move.l (sysbase,pc),a6
jsr (_LVOWait,a6) ; signals1 in d0
move.l a2,a0
move.l (chunky-mybltnode,a0),a2
move.l (chunky_cmp-mybltnode,a0),a3
;-------------------------------------------------
;original chunky data
;0 a7a6a5a4a3a2a1a0 b7b6b5b4b3b2b1b0
;2 c7c6c5c4c3c2c1c0 d7d6d5d4d3d2d1d0
;4 e7e6e5e4e3e2e1e0 f7f6f5f4f3f2f1f0
;6 g7g6g5g4g3g2g1g0 h7h6h5h4h3h2h1h0
;8 i7i6i5i4i3i2i1i0 j7j6j5j4j3j2j1j0
;10 k7k6k5k4k3k2k1k0 l7l6l5l4l3l2l1l0
;12 m7m6m5m4m3m2m1m0 n7n6n5n4n3n2n1n0
;14 o7o6o5o4o3o2o1o0 p7p6p5p4p3p2p1p0
;16 q7q6q5q4q3q2q1q0 r7r6r5r4r3r2r1r0
;18 s7s6s5s4s3s2s1s0 t7t6t5t4t3t2t1t0
;20 u7u6u5u4u3u2u1u0 v7v6v5v4v3v2v1v0
;22 w7w6w5w4w3w2w1w0 x7x6x5x4x3x2x1x0
;24 y7y6y5y4y3y2y1y0 z7z6z5z4z3z2z1z0
;26 A7A6A5A4A3A2A1A0 B7B6B5B4B3B2B1B0
;28 C7C6C5C4C3C2C1C0 D7D6D5D4D3D2D1D0
;30 E7E6E5E4E3E2E1E0 F7F6F5F4F3F2F1F0
;-------------------------------------------------
move.l (pixels16-mybltnode,a0),d6
lsr.l #1,d6 ; loop count = pixels/32
move.l (pixels4-mybltnode,a0),d0
move.l (tmp_buff2,pc),a0
lea (a0,d0.l),a1 ; a1 -> buff2+pixels/4
lea (a1,d0.l),a4 ; a4 -> buff2+pixels/2
lea (a4,d0.l),a5 ; a5 -> buff2+3*pixels/4
move.l #$0f0f0f0f,d7 ; constant
move.l #$00ff00ff,a6 ; constant
bra.b end_pass1loop
cnop 0,4 ; align to 32 bits
; main loop (starts here) processes 32 chunky pixels at a time
; compare next 32 pixels with compare page, looking for differences
initpass1loop: cmpm.l (a2)+,(a3)+
bne.w fix1
cmpm.l (a2)+,(a3)+
bne.w fix2
cmpm.l (a2)+,(a3)+
bne.w fix3
cmpm.l (a2)+,(a3)+
bne.b fix4
cmpm.l (a2)+,(a3)+
bne.b fix5
cmpm.l (a2)+,(a3)+
bne.b fix6
cmpm.l (a2)+,(a3)+
bne.b fix7
cmpm.l (a2)+,(a3)+
bne.b fix8
addq.l #8,a0 ; skip 8 bytes in output
addq.l #8,a1 ; skip 8 bytes in output
addq.l #8,a4 ; skip 8 bytes in output
addq.l #8,a5 ; skip 8 bytes in output
end_pass1loop: dbra d6,initpass1loop
bra.w done2
cnop 0,4
; This becomes the main loop after the first difference is found
pass1loop: cmpm.l (a2)+,(a3)+
bne.b fix1
cmpm.l (a2)+,(a3)+
bne.b fix2
cmpm.l (a2)+,(a3)+
bne.b fix3
cmpm.l (a2)+,(a3)+
bne.b fix4
cmpm.l (a2)+,(a3)+
bne.b fix5
cmpm.l (a2)+,(a3)+
bne.b fix6
cmpm.l (a2)+,(a3)+
bne.b fix7
cmpm.l (a2)+,(a3)+
bne.b fix8
addq.l #8,a0 ; skip 8 bytes in output
addq.l #8,a1 ; skip 8 bytes in output
addq.l #8,a4 ; skip 8 bytes in output
addq.l #8,a5 ; skip 8 bytes in output
dbra d6,pass1loop
bra.w done
cnop 0,4
; difference found, restore a2 and a3
fix8: sub.w #32,a2
sub.w #32,a3
bra.b go_c2p
fix7: sub.w #28,a2
sub.w #28,a3
bra.b go_c2p
fix6: sub.w #24,a2
sub.w #24,a3
bra.b go_c2p
fix5: sub.w #20,a2
sub.w #20,a3
bra.b go_c2p
fix4: sub.w #16,a2
sub.w #16,a3
bra.b go_c2p
cnop 0,4
fix3: subq.l #4,a2
subq.l #4,a3
fix2: subq.l #4,a2
subq.l #4,a3
fix1: subq.l #4,a2
subq.l #4,a3
; convert 32 pixels (passes 1 and 2 combined)
go_c2p: movem.l (a2)+,d0-d3 ; AaBbCcDd EeFfGgHh IiJjKkLl MmNnOoPp
movem.l d0-d3,(a3) ; update compare buffer
adda.w #16,a3
exg d7,a6 ; d7=$00ff00ff
move.l d0,d4 ; AaBbCcDd
and.l d7,d4 ; ..Bb..Dd
eor.l d4,d0 ; Aa..Cc..
lsl.l #8,d4 ; Bb..Dd..
move.l d2,d5 ; IiJjKkLl
and.l d7,d5 ; ..Jj..Ll
eor.l d5,d2 ; Ii..Kk..
lsr.l #8,d2 ; ..Ii..Kk
or.l d2,d0 ; AaIiCcKk
or.l d5,d4 ; BbJjDdLl
move.l d1,d2 ; EeFfGgHh
and.l d7,d2 ; ..Ff..Hh
eor.l d2,d1 ; Ee..Gg..
lsl.l #8,d2 ; Ff..Hh..
move.l d3,d5 ; MmNnOoPp
and.l d7,d5 ; ..Nn..Pp
eor.l d5,d3 ; Mm..Oo..
lsr.l #8,d3 ; ..Mm..Oo
or.l d3,d1 ; EeMmGgOo
or.l d5,d2 ; FfNnHhPp
exg d7,a6 ; d7 = $0f0f0f0f
move.l d0,d3 ; AaIiCcKk
and.l d7,d3 ; .a.i.c.k
eor.l d3,d0 ; A.I.C.K.
lsl.l #4,d3 ; a.i.c.k.
move.l d1,d5 ; EeMmGgOo
and.l d7,d5 ; .e.m.g.o
or.l d5,d3 ; aeimcgko
move.l d3,(a4)+
eor.l d5,d1 ; E.M.G.O.
lsr.l #4,d1 ; .E.M.G.O
or.l d1,d0 ; AEIMCGKO
move.l d0,(a0)+
move.l d4,d1 ; BbJjDdLl
and.l d7,d1 ; .b.j.d.l
eor.l d1,d4 ; B.J.D.L.
lsl.l #4,d1 ; b.j.d.l.
move.l d2,d5 ; FfNnHhPp
and.l d7,d5 ; .f.n.h.p
or.l d5,d1 ; bfjndhlp
move.l d1,(a5)+
eor.l d5,d2 ; F.N.H.P.
lsr.l #4,d2 ; .F.N.H.P
or.l d2,d4 ; BFJNDHLP
move.l d4,(a1)+
bchg #16,d6 ; repeat inner loop twice
beq.b go_c2p
dbra d6,pass1loop
; wait until previous QBlit() has completely finished (signals2)
; then start the blitter in the background for passes 3 & 4
done: lea mybltnode(pc),a2 ; a2->mybltnode
move.l sysbase(pc),a6 ; a6->SysBase
move.l (signals2-mybltnode,a2),d0
jsr (_LVOWait,a6)
move.l a2,a1
move.l (gfxbase-mybltnode,a2),a6
jsr (_LVOQBlit,a6)
bra.b ret
; If we get to here then no difference was found.
; Signal the task (signals1) and return.
cnop 0,4
done2: lea mybltnode(pc),a2
move.l (signals1-mybltnode,a2),d0
move.l d0,d1
mov