Amiga ACS 1998 #2

home *** CD-ROM | disk | FTP | other *** search

/ Amiga ACS 1998 #2 / amigaacscoverdisc1998-021998.iso / games / doom / adoom / src / amiga_draw.s next >

Wrap

Text File | 1998-01-08 | 17KB | 780 lines

; 680x0 column and span renderers.
; Exports an 030/040 and an 060 flavour of R_DrawColumn and R_DrawSpan.
; The routines take no register arguments; they read the _dc_* / _ds_*
; globals imported below.

		mc68020
		multipass
		debug	on,lattice4

		xdef	@R_DrawColumn_040
		xdef	@R_DrawSpan_040
		xdef	@R_DrawColumn_060
		xdef	@R_DrawSpan_060

; inputs of the column renderers
		xref	_dc_yl
		xref	_dc_yh
		xref	_dc_x
		xref	_columnofs
		xref	_ylookup
		xref	_dc_iscale
		xref	_centery
		xref	_dc_texturemid
		xref	_dc_source
		xref	_dc_colormap

; inputs of the span renderers
		xref	_ds_xfrac
		xref	_ds_yfrac
		xref	_ds_x1
		xref	_ds_y
		xref	_ds_x2
		xref	_ds_xstep
		xref	_ds_ystep
		xref	_ds_source
		xref	_ds_colormap

SCREENWIDTH	equ	320		; bytes between two rows of the chunky buffer
FRACBITS	equ	16		; 16.16 fixed point

;***********************************************************************

; Original (disabled) R_DrawColumn, kept for reference.

;@R_DrawColumn	movem.l	d3-d7/a2-a5,-(sp)
;		move.l	(_dc_yl),d0
;		move.l	(_dc_yh),d7
;		sub.l	d0,d7
;		bmi.b	1$
;		move.l	(_dc_x),d1
;		lea	(_columnofs),a5
;		lea	(a5,d1.l*4),a1
;		lea	(_ylookup),a5
;		movea.l	(a5,d0.l*4),a2
;		adda.l	(a1),a2
;		move.l	(_dc_iscale),d6
;		sub.l	(_centery),d0
;		muls.l	d6,d0
;		move.l	(_dc_texturemid),d5
;		add.l	d0,d5
;		movea.l	(_dc_source),a3
;		movea.l	(_dc_colormap),a4
;		moveq	#127,d4
;		move.l	#SCREENWIDTH,d3
;		moveq	#0,d1		; ensure high bits of d1 are clear
;
;;2$		move.l	d5,d0		; frac
;;		swap	d0
;;		and.w	d4,d0		; (frac>>16)&127
;;		move.b	(a3,d0.w),d1	; dc_source[(frac>>FRACBITS)&127]
;;		move.b	(a4,d1.w),(a2)	; *dest = dc_colormap[d1]
;;		adda.l	d3,a2		; dest += SCREENWIDTH
;;		add.l	d6,d5		; frac += fracstep
;;		dbra	d7,2$
;;1$		movem.l	(sp)+,d3-d7/a2-a5
;;		rts
;
;; faster routine from j.selck@flensburg.netsurf.de:
;
;		add.w	d6,d5		; frac += fracstep (also sets X flag)
;		swap	d5		; swap(frac)
;		swap	d6		; swap(fracstep)
;		and.w	d4,d5		; (frac>>16)&127
;2$		move.b	(a3,d5.w),d1	; dc_source[(frac>>FRACBITS)&127]
;		move.b	(a4,d1.w),(a2)	; *dest = dc_colormap[d1]
;		addx.l	d6,d5		; swap(frac += fracstep), use & set X
;		adda.l	d3,a2		; dest += SCREENWIDTH
;		and.w	d4,d5		; (frac>>16)&127
;		dbra	d7,2$		; !! dbra slow on 68060 !!
;1$		movem.l	(sp)+,d3-d7/a2-a5
;		rts

;-----------------------------------------------------------------------
; @R_DrawColumn_060 -- draw one vertical, colour-mapped texture column.
;
; In:    nothing in registers; reads _dc_yl, _dc_yh, _dc_x, _dc_iscale,
;        _dc_texturemid, _dc_source, _dc_colormap, _centery, _ylookup
;        and _columnofs.
; Out:   (_dc_yh - _dc_yl + 1) bytes written, SCREENWIDTH bytes apart,
;        starting at ylookup[_dc_yl] + columnofs[_dc_x].  Nothing is
;        drawn when _dc_yh < _dc_yl.
; Saves: d2-d3/d5-d7/a2/a3.  d0/d1/a0/a1 are used without being saved.
; Note:  the loop counters are handled as words (dbf) and d6 is used as
;        a long index, so the column is assumed to be far shorter than
;        65536 pixels.
;
; The C routine this implements is quoted after the 030/040 version.
;
; This even faster version by Aki M Laukkanen <amlaukka@cc.helsinki.fi>
;-----------------------------------------------------------------------

		cnop	0,4

@R_DrawColumn_060
		movem.l	d2-d3/d5-d7/a2/a3,-(sp)

		move.l	(_dc_yh),d7	; count = _dc_yh - _dc_yl
		move.l	(_dc_yl),d0
		sub.l	d0,d7
		bmi	.end

		move.l	(_dc_x),d1	; dest = ylookup[_dc_yl] + columnofs[_dc_x]
		lea	(_ylookup),a0
		move.l	(a0,d0.l*4),a0
		lea	(_columnofs),a1
		add.l	(a1,d1.l*4),a0

		move.l	(_dc_colormap),a2
		move.l	(_dc_source),a1

		move.l	(_dc_iscale),d1	; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
		sub.l	(_centery),d0
		muls.l	d1,d0
		add.l	(_dc_texturemid),d0

		moveq	#$7f,d3
		move.l	#SCREENWIDTH,a3

		move.l	d7,d6		; Do the leftover iterations in
		and.w	#3,d6		; this loop.
		beq	.skip
.skip_loop
		move.l	d0,d5
		swap	d5
		and.l	d3,d5		; (frac>>16)&127
		move.b	(a1,d5.w),d5	; texel
		add.l	d1,d0		; frac += fracstep
		move.b	(a2,d5.w),(a0)	; *dest = colormap[texel]
		add.l	a3,a0		; dest += SCREENWIDTH
		subq.l	#1,d6
		bne	.skip_loop

; d7: cnt >> 2
; a0: chunky
; a1: texture
; a2: light_table
; d0: frac  (uuuu uuuu uuuu uuuu 0000 0000 0UUU UUUU)
; d1: dfrac*2   (.......................................)
; d2: frac+dfrac(.......................................)
; d3: $7f
; a3: SCREENWIDTH

.skip
		lsr.l	#2,d7
		subq.l	#1,d7
		bmi	.short		; no group of four: still one pixel to draw

		add.l	a3,a3
		move.l	d0,d2
		add.l	a3,a3		; a3 = 4*SCREENWIDTH
		add.l	d1,d2		; d2 = frac + dfrac (odd pixels)
		add.l	d1,d1		; d1 = 2*dfrac

		eor.w	d0,d2		; swap the fraction part for addx
		eor.w	d2,d0		; assuming 16.16 fixed point
		eor.w	d0,d2

		swap	d0		; swap decimals and fraction
		swap	d1
		swap	d2

		moveq	#0,d5
		and.w	d3,d2
		and.w	d3,d0

		sub.w	d1,d0
		add.l	d1,d0		; setup the X flag

		move.b	(a1,d2.w),d5	; preload the texel of the first odd pixel
.loop
		; This should be reasonably scheduled for
		; m68060. It should perform well on other processors
		; too. That AGU stall still bothers me though.

		move.b	(a1,d0.w),d6	; stall + pOEP but allows sOEP
		addx.l	d1,d2		; pOEP only
		move.b	(a2,d5.l),d5	; pOEP but allows sOEP
		and.w	d3,d2		; sOEP
		move.b	(a2,d6.l),d6	; pOEP but allows sOEP
		move.b	d5,(SCREENWIDTH,a0)	; sOEP
		addx.l	d1,d0		; pOEP only
		move.b	(a1,d2.w),d5	; pOEP but allows sOEP
		and.w	d3,d0		; sOEP
		move.b	d6,(a0)		; pOEP
					; = ~4 cycles/pixel
					; + cache misses

		; The vertical writes are the true timehog of the loop
		; because of the characteristics of the copyback cache
		; operation.

		; Better mark the chunky buffer as write through
		; with the MMU and have all the horizontal writes
		; be longs aligned to longword boundary.

		move.b	(a1,d0.w),d6
		addx.l	d1,d2
		move.b	(a2,d5.l),d5
		and.w	d3,d2
		move.b	(a2,d6.l),d6
		move.b	d5,(SCREENWIDTH*3,a0)
		addx.l	d1,d0
		move.b	(a1,d2.w),d5
		and.w	d3,d0
		move.b	d6,(SCREENWIDTH*2,a0)

		add.l	a3,a0
.loop_end
		dbf	d7,.loop

		; it's faster to divide it to two lines on 060
		; and shouldn't be slower on 040.
.last
		; The loops above draw "count" pixels; this is the
		; (count+1)th.  d0.w holds its texture index.
		move.b	(a1,d0.w),d6	; new
		move.b	(a2,d6.l),d6	; new
		move.b	d6,(a0)		; new
.end
		movem.l	(sp)+,d2-d3/d5-d7/a2/a3
		rts

.short
		; count < 4: the unrolled loop was skipped, so d0 is still a
		; plain 16.16 frac.  Turn it into the index .last expects.
		; d6 is already 0 here: either .skip_loop counted it down to
		; zero, or count&3 was 0 with count < 4.
		swap	d0
		and.w	d3,d0		; (frac>>16)&127
		bra	.last

;-----------------------------------------------------------------------
; @R_DrawColumn_040 -- same job as @R_DrawColumn_060, for 030/040.
;
; In/Out: as for @R_DrawColumn_060.
; Saves:  d2-d4/d6-d7/a2/a3.  d0/d1/a0/a1 are used without being saved.
; Note:   the colour lookup replaces the low byte of the _dc_colormap
;         address with the texel, so the light table must be aligned
;         to a 256 byte boundary.
;-----------------------------------------------------------------------

		cnop	0,4

; 030/040 version

@R_DrawColumn_040
		movem.l	d2-d4/d6-d7/a2/a3,-(sp)

		move.l	(_dc_yh),d7	; count = _dc_yh - _dc_yl
		move.l	(_dc_yl),d0
		sub.l	d0,d7
		bmi	.end

		addq.l	#1,d7		; number of pixels to draw

		move.l	(_dc_x),d1	; dest = ylookup[_dc_yl] + columnofs[_dc_x]
		lea	(_ylookup),a0
		move.l	(a0,d0.l*4),a0
		lea	(_columnofs),a1
		add.l	(a1,d1.l*4),a0

		move.l	(_dc_colormap),d4
		move.l	(_dc_source),a1

		move.l	(_dc_iscale),d1	; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
		sub.l	(_centery),d0
		muls.l	d1,d0
		add.l	(_dc_texturemid),d0

		moveq	#$7f,d3
		move.l	#SCREENWIDTH,a3

		move.l	d7,d6		; Do the leftover iterations in
		and.w	#3,d6		; this loop.
		beq	.skip
.skip_loop
		move.l	d0,d2
		swap	d2
		and.l	d3,d2		; (frac>>16)&127
		move.b	(a1,d2.w),d4	; texel becomes low byte of the table address
		move.l	d4,a2
		move.b	(a2),(a0)	; *dest = colormap[texel]
		add.l	d1,d0		; frac += fracstep
		add.l	a3,a0		; dest += SCREENWIDTH
		subq.l	#1,d6
		bne	.skip_loop

; d7: cnt >> 2
; a0: chunky
; a1: texture
; d0: frac  (uuuu uuuu uuuu uuuu 0000 0000 0UUU UUUU)
; d1: dfrac (.......................................)
; d3: $7f
; d4: light table aligned to 256 byte boundary
; a3: SCREENWIDTH

.skip
		lsr.l	#2,d7
		subq.l	#1,d7
		bmi	.end

		add.l	a3,a3
		add.l	a3,a3		; a3 = 4*SCREENWIDTH

		swap	d0		; swap decimals and fraction
		swap	d1
		and.w	d3,d0
		sub.w	d1,d0
		add.l	d1,d0		; setup the X flag
.loop
		move.b	(a1,d0.w),d4
		addx.l	d1,d0
		move.l	d4,a2
		and.w	d3,d0
		move.b	(a2),(a0)

		move.b	(a1,d0.w),d4
		addx.l	d1,d0
		move.l	d4,a2
		and.w	d3,d0
		move.b	(a2),(SCREENWIDTH,a0)

		move.b	(a1,d0.w),d4
		addx.l	d1,d0
		move.l	d4,a2
		and.w	d3,d0
		move.b	(a2),(SCREENWIDTH*2,a0)

		move.b	(a1,d0.w),d4
		addx.l	d1,d0
		move.l	d4,a2
		and.w	d3,d0
		move.b	(a2),(SCREENWIDTH*3,a0)

		add.l	a3,a0
.loop_end
		dbf	d7,.loop
.end
		movem.l	(sp)+,d2-d4/d6-d7/a2/a3
		rts

;void R_DrawColumn (void)
;{
;	int		count;
;	byte*		dest;
;	fixed_t		frac;
;	fixed_t		fracstep;
;
;	count = dc_yh - dc_yl;
;	if (count < 0)
;		return;
;	dest = ylookup[dc_yl] + columnofs[dc_x];
;	fracstep = dc_iscale;
;	frac = dc_texturemid + (dc_yl-centery)*fracstep;
;	do {
;		*dest = dc_colormap[dc_source[(frac>>FRACBITS)&127]];
;		dest += SCREENWIDTH;
;		frac += fracstep;
;	} while (count--);
;}

;***********************************************************************

; Original (disabled) R_DrawSpan, kept for reference.

;@R_DrawSpan	movem.l	d2-d7/a2-a5,-(a7)
;		move.l	(_ds_x1),d0
;		lea	(_columnofs),a5
;		lea	(a5,d0.l*4),a1
;		move.l	(_ds_y),d1
;		lea	(_ylookup),a5
;		movea.l	(a5,d1.l*4),a2
;		adda.l	(a1),a2
;		move.l	(_ds_x2),d5
;		sub.l	d0,d5		; count
;		movea.l	(_ds_source),a3
;		movea.l	(_ds_colormap),a4
;		move.l	(_ds_xstep),d3
;		move.l	(_ds_ystep),d4
;;-
;		moveq	#10,d2
;		moveq	#63,d6
;		move.l	#63*64,d7
;		movea.l	(_ds_xfrac),a0	; xfrac
;		movea.l	(_ds_yfrac),a1	; yfrac
;1$		move.l	a0,d0		; xfrac
;		swap	d0
;		and.l	d6,d0		; (xfrac>>16)&63
;		move.l	a1,d1		; yfrac
;		asr.l	d2,d1
;		and.l	d7,d1		; (yfrac>>10)&(63*64)
;		add.l	d0,d1		; spot
;		moveq	#0,d0
;		move.b	(a3,d1.l),d0	; ds_source[spot]
;		move.b	(a4,d0.w),(a2)+	; *dest++ = ds_colormap[...]
;		adda.l	d3,a0		; xfrac += ds_xstep
;		adda.l	d4,a1		; yfrac += ds_ystep
;		dbra	d5,1$
;		movem.l	(a7)+,d2-d7/a2-a5
;		rts

;-----------------------------------------------------------------------
; @R_DrawSpan_060 -- draw one horizontal, colour-mapped span of a
; 64x64 texture.
;
; In:    nothing in registers; reads _ds_y, _ds_x1, _ds_x2, _ds_xfrac,
;        _ds_yfrac, _ds_xstep, _ds_ystep, _ds_source, _ds_colormap,
;        _ylookup and _columnofs.
; Out:   (_ds_x2 - _ds_x1 + 1) consecutive bytes written starting at
;        ylookup[_ds_y] + columnofs[_ds_x1].
; Saves: d2-d7/a2/a3.  d0/d1/a0/a1 are used without being saved.
; Notes: - no check is made for _ds_x2 < _ds_x1.
;        - leading pixels are written singly / as a word so that the
;          main loop stores longs to long word aligned addresses.
;        - the word reads from the light table fetch one byte past the
;          entry that is actually used.
;
; The C routine this implements is quoted at the end of the file.
;
; This faster version by Aki M Laukkanen <amlaukka@cc.helsinki.fi>
;-----------------------------------------------------------------------

		cnop	0,4

@R_DrawSpan_060
		movem.l	d2-d7/a2/a3,-(sp)

		move.l	(_ds_y),d0
		move.l	(_ds_x1),d1	; dest = ylookup[_ds_y] + columnofs[_ds_x1]
		lea	(_ylookup),a0
		move.l	(a0,d0.l*4),a0
		lea	(_columnofs),a1
		add.l	(a1,d1.l*4),a0

		move.l	(_ds_source),a1
		move.l	(_ds_colormap),a2

		move.l	(_ds_x2),d7	; count = _ds_x2 - _ds_x1
		sub.l	d1,d7
		addq.l	#1,d7		; number of pixels to draw

		move.l	(_ds_xfrac),d0
		move.l	(_ds_yfrac),d1
		move.l	(_ds_xstep),d2
		move.l	(_ds_ystep),d3

		move.l	a0,d4
		btst	#0,d4
		beq	.skipb

		move.l	d0,d5		; do the unaligned pixels
		move.l	d1,d6		; so we can write to longword
		swap	d5		; boundary in the main loop
		swap	d6
		and.w	#$3f,d5
		and.w	#$3f,d6
		lsl.w	#6,d6
		or.w	d5,d6		; spot = (v&63)*64 + (u&63)
		move.b	(a1,d6.w),d5
		add.l	d2,d0
		move.b	(a2,d5.w),(a0)+
		add.l	d3,d1
		move.l	a0,d4
		subq.l	#1,d7
.skipb
		btst	#1,d4
		beq	.skips
		moveq	#2,d4
		cmp.l	d4,d7
		bls	.skips		; 2 or fewer left: the tail loop takes them

		move.l	d0,d5		; write two pixels
		move.l	d1,d6
		swap	d5
		swap	d6
		and.w	#$3f,d5
		and.w	#$3f,d6
		lsl.w	#6,d6
		or.w	d5,d6
		move.b	(a1,d6.w),d5
		move.w	(a2,d5.w),d4	; first pixel ends up in the high byte
		add.l	d2,d0
		add.l	d3,d1
		move.l	d0,d5
		move.l	d1,d6
		swap	d5
		swap	d6
		and.w	#$3f,d5
		and.w	#$3f,d6
		lsl.w	#6,d6
		or.w	d5,d6
		move.b	(a1,d6.w),d5
		move.b	(a2,d5.w),d4	; second pixel in the low byte
		add.l	d2,d0
		move.w	d4,(a0)+
		add.l	d3,d1
		subq.l	#2,d7
.skips

; a0: chunky
; a1: texture
; a2: light_table
; d7: count >> 2
; d0: xfrac  (vvvv vvvv vvvv vvvv 1111 1111 11UU UUUU)
; d1: yfrac  (uuuu uuuu uuuu uuuu 1111 VVVV VV11 1111)
; d2: dxfrac
; d3: dyfrac

		move.l	d7,d6		; setup registers
		and.w	#3,d6
		move.l	d6,a3		; a3 = pixels left over for .loop3

		eor.w	d0,d1		; swap fraction parts for addx
		eor.w	d2,d3
		eor.w	d1,d0
		eor.w	d3,d2
		eor.w	d0,d1
		eor.w	d2,d3

		swap	d0
		swap	d1
		swap	d2
		swap	d3

		lsl.w	#6,d1
		lsl.w	#6,d3

		moveq	#0,d6
		moveq	#0,d5

		sub.l	#$f000,a1	; the spot word always has its top nibble set

		lsr.l	#2,d7
		beq	.skip_loop2
		subq.l	#1,d7

		sub.w	d3,d1
		add.l	d3,d1		; setup the X flag

		or.w	#$ffc0,d0
		or.w	#$f03f,d1
		move.w	d0,d6
		and.w	d1,d6		; spot of the first pixel
		bra	.start_loop2

		cnop	0,8
.loop2
		; This should be reasonably scheduled for m68060.
		; It writes long words to long word aligned locations.
		; First of all that's the optimal way if you write
		; directly to a frame buffer on graphics cards.
		; Same holds true if you change the chunky buffer
		; cache mode to write through. See R_DrawColumn().

		or.w	#$ffc0,d0	; pOEP
		or.w	#$f03f,d1	; sOEP
		move.b	(a2,d5.l),d4	; pOEP but allows sOEP
		move.w	d0,d6		; sOEP
		and.w	d1,d6		; pOEP
		move.l	d4,(a0)+	; sOEP
.start_loop2
		addx.l	d2,d0		; pOEP only
		addx.l	d3,d1		; pOEP only
		move.b	(a1,d6.l),d5	; pOEP but allows sOEP
		or.w	#$ffc0,d0	; sOEP
		or.w	#$f03f,d1	; pOEP
		move.w	d0,d6		; sOEP
		move.w	(a2,d5.l),d4	; pOEP but allows sOEP
		and.w	d1,d6		; sOEP

		addx.l	d2,d0		; pOEP only
		addx.l	d3,d1		; pOEP only
		move.b	(a1,d6.l),d5	; pOEP but allows sOEP
		or.w	#$ffc0,d0	; sOEP
		or.w	#$f03f,d1	; pOEP
		move.w	d0,d6		; sOEP
		move.b	(a2,d5.l),d4	; pOEP but allows sOEP
		and.w	d1,d6		; sOEP

		addx.l	d2,d0		; pOEP only
		addx.l	d3,d1		; pOEP only
		move.b	(a1,d6.l),d5	; pOEP but allows sOEP
		or.w	#$ffc0,d0	; sOEP
		or.w	#$f03f,d1	; pOEP
		move.w	d0,d6		; sOEP
		swap	d4		; pOEP only
		move.w	(a2,d5.l),d4	; pOEP but allows sOEP
		and.w	d1,d6		; sOEP

		addx.l	d2,d0		; pOEP only
		addx.l	d3,d1		; pOEP only
		move.b	(a1,d6.l),d5	; pOEP but allows sOEP
		dbf	d7,.loop2	; pOEP only
					; = 7.75 cycles/pixel

		move.b	(a2,d5.l),d4
		move.l	d4,(a0)+

		; X still holds the carry produced by the last addx and
		; the coordinates are already stepped to the next pixel.
		; Running the setup below again would add the fraction
		; step a second time and lose that carry.
		bra	.tail
.skip_loop2
		sub.w	d3,d1
		add.l	d3,d1		; setup the X flag
.tail
		move.l	a3,d7		; move to Dn leaves X alone
		bra	.loop_end2
.loop3
		or.w	#$ffc0,d0
		or.w	#$f03f,d1
		move.w	d0,d6
		and.w	d1,d6
		addx.l	d2,d0
		addx.l	d3,d1
		move.b	(a1,d6.l),d5
		move.b	(a2,d5.l),(a0)+
.loop_end2
		dbf	d7,.loop3
.end2
		movem.l	(sp)+,d2-d7/a2/a3
		rts

;-----------------------------------------------------------------------
; @R_DrawSpan_040 -- same job as @R_DrawSpan_060, for 030/040.
;
; In/Out: as for @R_DrawSpan_060.
; Saves:  d2-d7/a2-a4.  d0/d1/a0/a1 are used without being saved.
; Notes:  - the main loops replace the low byte of the _ds_colormap
;           address with the texel, so the light table must be aligned
;           to a 256 byte boundary.
;         - the main-loop counter is shifted as a word (lsr.w), so the
;           span is assumed to be shorter than 65536 pixels.
;-----------------------------------------------------------------------

		cnop	0,4

; 030/040 version

@R_DrawSpan_040
		movem.l	d2-d7/a2-a4,-(sp)

		move.l	(_ds_y),d0
		move.l	(_ds_x1),d1	; dest = ylookup[_ds_y] + columnofs[_ds_x1]
		lea	(_ylookup),a0
		move.l	(a0,d0.l*4),a0
		lea	(_columnofs),a1
		add.l	(a1,d1.l*4),a0

		move.l	(_ds_source),a1
		move.l	(_ds_colormap),a2

		move.l	(_ds_x2),d7	; count = _ds_x2 - _ds_x1
		sub.l	d1,d7
		addq.l	#1,d7		; number of pixels to draw

		move.l	(_ds_xfrac),d0
		move.l	(_ds_yfrac),d1
		move.l	(_ds_xstep),d2
		move.l	(_ds_ystep),d3

		move.l	a0,d4
		btst	#0,d4
		beq	.skipb

		move.l	d0,d5		; do the unaligned pixels
		move.l	d1,d6		; so we can write to longword
		swap	d5		; boundary in the main loop
		swap	d6
		and.w	#$3f,d5
		and.w	#$3f,d6
		lsl.w	#6,d6
		or.w	d5,d6		; spot = (v&63)*64 + (u&63)
		move.b	(a1,d6.w),d5
		add.l	d2,d0
		move.b	(a2,d5.w),(a0)+
		add.l	d3,d1
		move.l	a0,d4
		subq.l	#1,d7
.skipb
		btst	#1,d4
		beq	.skips
		moveq	#2,d4
		cmp.l	d4,d7
		bls	.skips		; 2 or fewer left: the tail loop takes them

		move.l	d0,d5		; write two pixels
		move.l	d1,d6
		swap	d5
		swap	d6
		and.w	#$3f,d5
		and.w	#$3f,d6
		lsl.w	#6,d6
		or.w	d5,d6
		move.b	(a1,d6.w),d5
		move.w	(a2,d5.w),d4	; first pixel ends up in the high byte
		add.l	d2,d0
		add.l	d3,d1
		move.l	d0,d5
		move.l	d1,d6
		swap	d5
		swap	d6
		and.w	#$3f,d5
		and.w	#$3f,d6
		lsl.w	#6,d6
		or.w	d5,d6
		move.b	(a1,d6.w),d5
		move.b	(a2,d5.w),d4	; second pixel in the low byte
		add.l	d2,d0
		move.w	d4,(a0)+
		add.l	d3,d1
		subq.l	#2,d7
.skips

; a0: chunky
; a3: chunky end
; a4: chunky end of the long word part
; a1: texture
; d4: light_table
; d0: xfrac  (vvvv vvvv vvvv vvvv 1111 1111 11UU UUUU)
; d1: yfrac  (uuuu uuuu uuuu uuuu 1111 VVVV VV11 1111)
; d2: dxfrac
; d3: dyfrac
; d6: x_or
; d7: y_or

		move.l	a2,d4
		add.l	#$1000,a1	; catch 22 (spot word is sign extended: $f000 = -$1000)
		move.l	a0,a3
		add.l	d7,a3		; a3 = end of span
		move.l	d7,d5
		and.b	#~3,d5
		move.l	a0,a4
		add.l	d5,a4		; a4 = end of the 4-pixel groups

		eor.w	d0,d1		; swap fraction parts for addx
		eor.w	d2,d3
		eor.w	d1,d0
		eor.w	d3,d2
		eor.w	d0,d1
		eor.w	d2,d3

		swap	d0
		swap	d1
		swap	d2
		swap	d3

		lsl.w	#6,d1
		lsl.w	#6,d3

		move.w	#$ffc0,d6
		move.w	#$f03f,d7

		lsr.w	#2,d5
		beq	.skip_loop2

		sub.w	d3,d1
		add.l	d3,d1		; setup the X flag

		; Each pixel: build the spot in d0.w, fetch the texel, THEN
		; step the coordinates.  Fetching after the addx would use u
		; already stepped, and its overflow/borrow would leak into
		; the v bits of the index.
.loop2
		or.w	d6,d0
		or.w	d7,d1
		and.w	d1,d0		; d0.w = $f000 | v<<6 | u
		move.b	(a1,d0.w),d4
		addx.l	d2,d0
		addx.l	d3,d1
		move.l	d4,a2
		move.w	(a2),d5		; pixel 1 -> high byte

		or.w	d6,d0
		or.w	d7,d1
		and.w	d1,d0
		move.b	(a1,d0.w),d4
		addx.l	d2,d0
		addx.l	d3,d1
		move.l	d4,a2
		move.b	(a2),d5		; pixel 2 -> low byte
		swap	d5

		or.w	d6,d0
		or.w	d7,d1
		and.w	d1,d0
		move.b	(a1,d0.w),d4
		addx.l	d2,d0
		addx.l	d3,d1
		move.l	d4,a2
		move.w	(a2),d5		; pixel 3 -> high byte

		or.w	d6,d0
		or.w	d7,d1
		and.w	d1,d0
		move.b	(a1,d0.w),d4
		addx.l	d2,d0
		addx.l	d3,d1
		move.l	d4,a2
		move.b	(a2),d5		; pixel 4 -> low byte

		move.l	d5,(a0)+
		cmp.l	a0,a4		; cmp leaves X alone
		bne	.loop2

		; X is still valid and the coordinates are already stepped:
		; do not run the setup below a second time.
		bra	.loop_end2
.skip_loop2
		sub.w	d3,d1
		add.l	d3,d1		; setup the X flag
		bra	.loop_end2
.loop3
		or.w	d6,d0
		or.w	d7,d1
		and.w	d1,d0
		move.b	(a1,d0.w),d4
		addx.l	d2,d0
		addx.l	d3,d1
		move.l	d4,a2
		move.b	(a2),(a0)+
.loop_end2
		cmp.l	a0,a3
		bne	.loop3
.end2
		movem.l	(sp)+,d2-d7/a2-a4
		rts

;void R_DrawSpan (void)
;{
;	fixed_t		xfrac, yfrac;
;	byte*		dest;
;	int		count, spot;
;
;	xfrac = ds_xfrac;
;	yfrac = ds_yfrac;
;	dest = ylookup[ds_y] + columnofs[ds_x1];
;	count = ds_x2 - ds_x1;
;	do {
;		spot = ((yfrac>>(16-6))&(63*64)) + ((xfrac>>16)&63);
;		*dest++ = ds_colormap[ds_source[spot]];
;		xfrac += ds_xstep;
;		yfrac += ds_ystep;
;	} while (count--);
;}

;***********************************************************************

		end