Fresh Fish 5

home *** CD-ROM | disk | FTP | other *** search

/ Fresh Fish 5 / FreshFish_July-August1994.bin / bbs / dev / c2p.lha / C2P / CPU+Blitter / c2p8.s < prev next >

Wrap

Text File | 1994-07-01 | 21KB | 834 lines

opt o+,l+,d+ incdir inc: include "exec/exec_lib.i" include "exec/exec.i" include "graphics/graphics_lib.i" include "hardware/custom.i" xdef _c2p8_init xdef _c2p8_go ; --------------------------------------------------------------------- ; void __asm c2p8_init (register __a0 UBYTE *chunky, ; register __a1 UBYTE *chunky_cmp, ; register __a2 PLANEPTR *planes, ; register __d0 ULONG signals1, ; register __d1 ULONG signals2, ; register __d2 ULONG pixels, // width*height ; register __d3 ULONG offset, // byte offset into plane ; register __d4 UBYTE *buff2, // Chip buffer width*height ; register __d5 UBYTE *buff3, // Chip buffer width*height ; register __a3 struct GfxBase *GfxBase); ; ; void c2p8_go(); ; ; ------------------------------------------------------------------- ; ; Pipelined CPU+blitter 8-plane chunky to planar converter. ; Optimised for 68020/30 with fastmem. ; ; Author: Peter McGavin (e-mail peterm@maths.grace.cri.nz), 21 April 1994 ; Based on James McCoull's 4-pass blitter algorithm. ; ; Modified by Conrad Sanderson (g.sanderson@ais.gu.edu.au), 4 June 1994 ; ; This code is public domain. ; ; algorithm: ; ; Uses chunky comparison buffer. Returns immediately if no diffs found. ; Performs first 2 passes (Fast->Chip) with the CPU (in 1 pass). ; Only converts 32-pixel "units" that changed since last time. ; Updates chunky comparison buffer. ; If nothing has changed, signals signals1 and return immediately. ; Waits for previous QBlit() to completely finish (signals2). ; Then launches passes 3 & 4 with QBlit(). ; Return immediately after launching passes 3 & 4. ; ** your task can render the next frame while the converter is still going ** ; Signals via signals1 (asynchronously) after completion of pass 3. ; Signals via signals2 from CleanUp() on completion of QBlit(). 
;
; Approx timing for A4000/030, (320x200x8):
;   CPU pass min 13ms, max 37ms, depending how different (then return)
;   Asynchronous blitter passes add 62ms
;   Max framerate (with changes every frame) is 62ms/frame = 16fps
;     occurs when fBUFFER rendering time <= 25ms (= 62ms-37ms)
;
; Approx timing for A1200+fast ram, (320x200x8):
;   CPU pass min 18ms, max 55ms
;
;
; see c2p8_demo.c for example usage
; -------------------------------------------------------------------

        section chunks,code

; -------------------------------------------------------------------
; _c2p8_init -- record the converter's parameters for later _c2p8_go calls.
;
; In (register arguments, as listed in the C prototype in the file header):
;   a0 = chunky          a1 = chunky_cmp      a2 = planes
;   d0 = signals1        d1 = signals2        d2 = pixels (width*height)
;   d3 = offset          d4 = buff2           d5 = buff3
;   a3 = GfxBase
; Out:
;   nothing (void).
; Registers:
;   d2-d3/a2-a4/a6 are pushed on entry and popped before rts.
;   d4 and d5 are only read.
;
; Every value is stored into a variable addressed relative to mybltnode
; (a4 = mybltnode, slot = label-mybltnode).  The variables themselves are
; defined outside this part of the file.
;
; NOTE(review): pixels/2 .. pixels/16 are produced by logical right shifts,
; so low-order bits of pixels are discarded -- presumably callers pass a
; multiple of 32; confirm against the demo program.
; -------------------------------------------------------------------
_c2p8_init:
        movem.l d2-d3/a2-a4/a6,-(sp)            ; preserve the registers in this list
        lea     mybltnode(pc),a4                ; a4 = base for all variable accesses
        move.l  a0,(chunky-mybltnode,a4)        ; chunky source buffer
        move.l  a1,(chunky_cmp-mybltnode,a4)    ; chunky comparison buffer
        move.l  a2,(planes-mybltnode,a4)        ; pointer to the plane pointers
        move.l  d0,(signals1-mybltnode,a4)
        move.l  d1,(signals2-mybltnode,a4)
        move.l  d2,(pixels-mybltnode,a4)        ; pixels
        lsr.l   #1,d2
        move.l  d2,(pixels2-mybltnode,a4)       ; pixels/2
        lsr.l   #1,d2
        move.l  d2,(pixels4-mybltnode,a4)       ; pixels/4
        lsr.l   #1,d2
        move.l  d2,(pixels8-mybltnode,a4)       ; pixels/8
        lsr.l   #1,d2
        move.l  d2,(pixels16-mybltnode,a4)      ; pixels/16
        move.l  d3,(offset-mybltnode,a4)        ; byte offset into plane
        move.l  d4,(tmp_buff2-mybltnode,a4)     ; buff2
        move.l  d5,(tmp_buff3-mybltnode,a4)     ; buff3
        move.l  a3,(gfxbase-mybltnode,a4)
        move.l  (4).w,a6                        ; longword at absolute address 4 (stored as sysbase)
        move.l  a6,(sysbase-mybltnode,a4)
        move.l  (ThisTask,a6),(task-mybltnode,a4)       ; save task ptr
        movem.l (sp)+,d2-d3/a2-a4/a6            ; restore saved registers
        rts

        cnop    0,4

; -------------------------------------------------------------------
; _c2p8_go -- run one conversion using the parameters saved by _c2p8_init.
; Only the entry sequence appears here; the routine continues below.
; -------------------------------------------------------------------
_c2p8_go:
        movem.l d2-d7/a2-a6,-(sp)
        lea     mybltnode(pc),a2                ; a2 = variable base, kept across the call
        move.l  a2,a0

; wait for the previous c2p8_go call to finish pass 3 (signals1)

        move.l  (signals1-mybltnode,a0),d0      ; signals1 in d0
        move.l  (sysbase,pc),a6
        jsr     (_LVOWait,a6)

        move.l  a2,a0                           ; reload a0 after the call (presumably a0 is not preserved across it)
        move.l  (chunky-mybltnode,a0),a2        ; a2 -> chunky buffer
        move.l  (chunky_cmp-mybltnode,a0),a3    ; a3 -> chunky comparison buffer

;-------------------------------------------------
;original chunky data
; (each row: byte offset, then the bit names of two consecutive bytes)
;0  a7a6a5a4a3a2a1a0 b7b6b5b4b3b2b1b0
;2  c7c6c5c4c3c2c1c0 d7d6d5d4d3d2d1d0
;4  e7e6e5e4e3e2e1e0 f7f6f5f4f3f2f1f0
;6  g7g6g5g4g3g2g1g0 h7h6h5h4h3h2h1h0
;8  i7i6i5i4i3i2i1i0 j7j6j5j4j3j2j1j0
;10 k7k6k5k4k3k2k1k0 l7l6l5l4l3l2l1l0
;12 m7m6m5m4m3m2m1m0 n7n6n5n4n3n2n1n0
;14 o7o6o5o4o3o2o1o0 p7p6p5p4p3p2p1p0
;16 q7q6q5q4q3q2q1q0 r7r6r5r4r3r2r1r0
;18 s7s6s5s4s3s2s1s0 t7t6t5t4t3t2t1t0
;20 u7u6u5u4u3u2u1u0 v7v6v5v4v3v2v1v0
;22
w7w6w5w4w3w2w1w0 x7x6x5x4x3x2x1x0 ;24 y7y6y5y4y3y2y1y0 z7z6z5z4z3z2z1z0 ;26 A7A6A5A4A3A2A1A0 B7B6B5B4B3B2B1B0 ;28 C7C6C5C4C3C2C1C0 D7D6D5D4D3D2D1D0 ;30 E7E6E5E4E3E2E1E0 F7F6F5F4F3F2F1F0 ;------------------------------------------------- move.l (pixels16-mybltnode,a0),d6 lsr.l #1,d6 ; loop count = pixels/32 move.l (pixels4-mybltnode,a0),d0 move.l (tmp_buff2,pc),a0 lea (a0,d0.l),a1 ; a1 -> buff2+pixels/4 lea (a1,d0.l),a4 ; a4 -> buff2+pixels/2 lea (a4,d0.l),a5 ; a5 -> buff2+3*pixels/4 move.l #$0f0f0f0f,d7 ; constant move.l #$00ff00ff,a6 ; constant bra.b end_pass1loop cnop 0,4 ; align to 32 bits ; main loop (starts here) processes 32 chunky pixels at a time ; compare next 32 pixels with compare page, looking for differences initpass1loop: cmpm.l (a2)+,(a3)+ bne.w fix1 cmpm.l (a2)+,(a3)+ bne.w fix2 cmpm.l (a2)+,(a3)+ bne.w fix3 cmpm.l (a2)+,(a3)+ bne.b fix4 cmpm.l (a2)+,(a3)+ bne.b fix5 cmpm.l (a2)+,(a3)+ bne.b fix6 cmpm.l (a2)+,(a3)+ bne.b fix7 cmpm.l (a2)+,(a3)+ bne.b fix8 addq.l #8,a0 ; skip 8 bytes in output addq.l #8,a1 ; skip 8 bytes in output addq.l #8,a4 ; skip 8 bytes in output addq.l #8,a5 ; skip 8 bytes in output end_pass1loop: dbra d6,initpass1loop bra.w done2 cnop 0,4 ; This becomes the main loop after the first difference is found pass1loop: cmpm.l (a2)+,(a3)+ bne.b fix1 cmpm.l (a2)+,(a3)+ bne.b fix2 cmpm.l (a2)+,(a3)+ bne.b fix3 cmpm.l (a2)+,(a3)+ bne.b fix4 cmpm.l (a2)+,(a3)+ bne.b fix5 cmpm.l (a2)+,(a3)+ bne.b fix6 cmpm.l (a2)+,(a3)+ bne.b fix7 cmpm.l (a2)+,(a3)+ bne.b fix8 addq.l #8,a0 ; skip 8 bytes in output addq.l #8,a1 ; skip 8 bytes in output addq.l #8,a4 ; skip 8 bytes in output addq.l #8,a5 ; skip 8 bytes in output dbra d6,pass1loop bra.w done cnop 0,4 ; difference found, restore a2 and a3 fix8: sub.w #32,a2 sub.w #32,a3 bra.b go_c2p fix7: sub.w #28,a2 sub.w #28,a3 bra.b go_c2p fix6: sub.w #24,a2 sub.w #24,a3 bra.b go_c2p fix5: sub.w #20,a2 sub.w #20,a3 bra.b go_c2p fix4: sub.w #16,a2 sub.w #16,a3 bra.b go_c2p cnop 0,4 fix3: subq.l #4,a2 subq.l 
#4,a3 fix2: subq.l #4,a2 subq.l #4,a3 fix1: subq.l #4,a2 subq.l #4,a3 ; convert 32 pixels (passes 1 and 2 combined) go_c2p: movem.l (a2)+,d0-d3 ; AaBbCcDd EeFfGgHh IiJjKkLl MmNnOoPp movem.l d0-d3,(a3) ; update compare buffer adda.w #16,a3 exg d7,a6 ; d7=$00ff00ff move.l d0,d4 ; AaBbCcDd and.l d7,d4 ; ..Bb..Dd eor.l d4,d0 ; Aa..Cc.. lsl.l #8,d4 ; Bb..Dd.. move.l d2,d5 ; IiJjKkLl and.l d7,d5 ; ..Jj..Ll eor.l d5,d2 ; Ii..Kk.. lsr.l #8,d2 ; ..Ii..Kk or.l d2,d0 ; AaIiCcKk or.l d5,d4 ; BbJjDdLl move.l d1,d2 ; EeFfGgHh and.l d7,d2 ; ..Ff..Hh eor.l d2,d1 ; Ee..Gg.. lsl.l #8,d2 ; Ff..Hh.. move.l d3,d5 ; MmNnOoPp and.l d7,d5 ; ..Nn..Pp eor.l d5,d3 ; Mm..Oo.. lsr.l #8,d3 ; ..Mm..Oo or.l d3,d1 ; EeMmGgOo or.l d5,d2 ; FfNnHhPp exg d7,a6 ; d7 = $0f0f0f0f move.l d0,d3 ; AaIiCcKk and.l d7,d3 ; .a.i.c.k eor.l d3,d0 ; A.I.C.K. lsl.l #4,d3 ; a.i.c.k. move.l d1,d5 ; EeMmGgOo and.l d7,d5 ; .e.m.g.o or.l d5,d3 ; aeimcgko move.l d3,(a4)+ eor.l d5,d1 ; E.M.G.O. lsr.l #4,d1 ; .E.M.G.O or.l d1,d0 ; AEIMCGKO move.l d0,(a0)+ move.l d4,d1 ; BbJjDdLl and.l d7,d1 ; .b.j.d.l eor.l d1,d4 ; B.J.D.L. lsl.l #4,d1 ; b.j.d.l. move.l d2,d5 ; FfNnHhPp and.l d7,d5 ; .f.n.h.p or.l d5,d1 ; bfjndhlp move.l d1,(a5)+ eor.l d5,d2 ; F.N.H.P. lsr.l #4,d2 ; .F.N.H.P or.l d2,d4 ; BFJNDHLP move.l d4,(a1)+ bchg #16,d6 ; repeat inner loop twice beq.b go_c2p dbra d6,pass1loop ; wait until previous QBlit() has completely finished (signals2) ; then start the blitter in the background for passes 3 & 4 done: lea mybltnode(pc),a2 ; a2->mybltnode move.l sysbase(pc),a6 ; a6->SysBase move.l (signals2-mybltnode,a2),d0 jsr (_LVOWait,a6) move.l a2,a1 move.l (gfxbase-mybltnode,a2),a6 jsr (_LVOQBlit,a6) bra.b ret ; If we get to here then no difference was found. ; Signal the task (signals1) and return. cnop 0,4 done2: lea mybltnode(pc),a2 move.l (signals1-mybltnode,a2),d0 move.l d0,d1 mov