home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Amiga ACS 1998 #2
/
amigaacscoverdisc1998-021998.iso
/
games
/
doom
/
adoom
/
src
/
amiga_draw.s
next >
Wrap
Text File
|
1998-01-08
|
17KB
|
780 lines
; ----------------------------------------------------------------------
; Assembler set-up.
; The routines in this file use 68020+ features (scaled-index addressing
; such as (a0,d0.l*4) and the 32-bit muls.l), hence the CPU selection.
; NOTE(review): `multipass` and `debug on,lattice4` are assembler-specific
; option directives; their exact effect depends on the assembler used to
; build this file -- confirm against its manual.
; ----------------------------------------------------------------------
mc68020
multipass
debug on,lattice4
; Entry points defined in this file and exported to the linker.
; There is one column drawer and one span drawer per CPU flavour.
xdef @R_DrawColumn_040
xdef @R_DrawSpan_040
xdef @R_DrawColumn_060
xdef @R_DrawSpan_060
; External variables read by the column drawers.
xref _dc_yl
xref _dc_yh
xref _dc_x
; Lookup tables indexed with a *4 scale (longword entries) to build the
; destination address; both the column and the span drawers use them.
xref _columnofs
xref _ylookup
xref _dc_iscale
xref _centery
xref _dc_texturemid
xref _dc_source
xref _dc_colormap
; External variables read by the span drawers.
xref _ds_xfrac
xref _ds_yfrac
xref _ds_x1
xref _ds_y
xref _ds_x2
xref _ds_xstep
xref _ds_ystep
xref _ds_source
xref _ds_colormap
; Distance in bytes between two vertically adjacent pixels, as used by
; the column drawers.
SCREENWIDTH equ 320
; Number of fraction bits in the fixed-point values. The live code does
; not reference this symbol; it hard-wires the 16.16 split through
; `swap`. Only the commented-out legacy routines mention it.
FRACBITS equ 16
;***********************************************************************
;@R_DrawColumn movem.l d3-d7/a2-a5,-(sp)
; move.l (_dc_yl),d0
; move.l (_dc_yh),d7
; sub.l d0,d7
; bmi.b 1$
; move.l (_dc_x),d1
; lea (_columnofs),a5
; lea (a5,d1.l*4),a1
; lea (_ylookup),a5
; movea.l (a5,d0.l*4),a2
; adda.l (a1),a2
; move.l (_dc_iscale),d6
; sub.l (_centery),d0
; muls.l d6,d0
; move.l (_dc_texturemid),d5
; add.l d0,d5
; movea.l (_dc_source),a3
; movea.l (_dc_colormap),a4
; moveq #127,d4
; move.l #SCREENWIDTH,d3
; moveq #0,d1 ; ensure high bits of d1 are clear
;
;;2$ move.l d5,d0 ; frac
;; swap d0
;; and.w d4,d0 ; (frac>>16)&127
;; move.b (a3,d0.w),d1 ; dc_source[(frac>>FRACBITS)&127]
;; move.b (a4,d1.w),(a2) ; *dest = dc_colormap[d1]
;; adda.l d3,a2 ; dest += SCREENWIDTH
;; add.l d6,d5 ; frac += fracstep
;; dbra d7,2$
;;1$ movem.l (sp)+,d3-d7/a2-a5
;; rts
;
;; faster routine from j.selck@flensburg.netsurf.de:
;
; add.w d6,d5 ; frac += fracstep (also sets X flag)
; swap d5 ; swap(frac)
; swap d6 ; swap(fracstep)
; and.w d4,d5 ; (frac>>16)&127
;2$ move.b (a3,d5.w),d1 ; dc_source[(frac>>FRACBITS)&127]
; move.b (a4,d1.w),(a2) ; *dest = dc_colormap[d1]
; addx.l d6,d5 ; swap(frac += fracstep), use & set X
; adda.l d3,a2 ; dest += SCREENWIDTH
; and.w d4,d5 ; (frac>>16)&127
; dbra d7,2$ ; !! dbra slow on 68060 !!
;1$ movem.l (sp)+,d3-d7/a2-a5
; rts
; This even faster version by Aki M Laukkanen <amlaukka@cc.helsinki.fi>
;-----------------------------------------------------------------------
; @R_DrawColumn_060
;
; Draws one vertical run of texture-mapped, light-mapped pixels:
;
;       *dest = dc_colormap[dc_source[(frac >> 16) & 127]]
;       dest += SCREENWIDTH ; frac += dc_iscale
;
; for every row from _dc_yl to _dc_yh inclusive.  The unrolled loop is
; scheduled for the 68060 (see the pOEP/sOEP notes inside it).
;
; In:   no register arguments.  Reads _dc_yl, _dc_yh, _dc_x, _dc_iscale,
;       _centery, _dc_texturemid, _dc_source, _dc_colormap and the
;       _ylookup / _columnofs tables.
; Out:  nothing.  Returns without drawing when _dc_yh < _dc_yl.
; Regs: d2-d3/d5-d7/a2/a3 are saved and restored; d0/d1/a0/a1 are
;       overwritten.
;
; Work split for count = _dc_yh - _dc_yl (count + 1 pixels in total):
;   1. count & 3 pixels, one at a time          (.skip_loop)
;   2. (count >> 2) * 4 pixels, four per pass   (.loop)
;   3. the single closing pixel                 (.last)
; A column with count < 4 has no pass of step 2.  It still owes the
; closing pixel, so it goes through .short, which converts the plain
; 16.16 frac into the texel index that .last expects.
;
; The (a2,d5.l) / (a2,d6.l) lookups rely on bits 8-31 of d5/d6 being
; clear.  d5 is cleared explicitly; d6 is a copy of count whose low
; word has been masked or counted down to zero, so this holds as long
; as count fits in 16 bits.
;-----------------------------------------------------------------------
        cnop    0,4
@R_DrawColumn_060
        movem.l d2-d3/d5-d7/a2/a3,-(sp)
        move.l  (_dc_yh),d7             ; count = _dc_yh - _dc_yl
        move.l  (_dc_yl),d0
        sub.l   d0,d7
        bmi     .end                    ; empty column
        move.l  (_dc_x),d1              ; dest = ylookup[_dc_yl] + columnofs[_dc_x]
        lea     (_ylookup),a0
        move.l  (a0,d0.l*4),a0
        lea     (_columnofs),a1
        add.l   (a1,d1.l*4),a0
        move.l  (_dc_colormap),a2
        move.l  (_dc_source),a1
        move.l  (_dc_iscale),d1         ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
        sub.l   (_centery),d0
        muls.l  d1,d0
        add.l   (_dc_texturemid),d0
        moveq   #$7f,d3                 ; texture row mask
        move.l  #SCREENWIDTH,a3
        move.l  d7,d6                   ; Do the leftover iterations
        and.w   #3,d6                   ; (count & 3) in this loop.
        beq     .skip
.skip_loop
        move.l  d0,d5
        swap    d5
        and.l   d3,d5                   ; (frac >> 16) & 127
        move.b  (a1,d5.w),d5            ; texel
        add.l   d1,d0                   ; frac += fracstep
        move.b  (a2,d5.w),(a0)          ; *dest = colormap[texel]
        add.l   a3,a0                   ; dest += SCREENWIDTH
        subq.l  #1,d6
        bne     .skip_loop

; Register use in the unrolled loop:
;   d7: cnt >> 2 (minus one, for dbf)
;   a0: chunky
;   a1: texture
;   a2: light_table
;   d0: even pixels - high word fraction, low word row index (7 bits)
;   d1: dfrac*2, halves swapped the same way
;   d2: odd pixels (frac+dfrac), same layout
;   d3: $7f
;   a3: SCREENWIDTH*4
; The fraction words of d0 and d2 are exchanged on purpose: each addx
; produces in X the carry that the *next* addx (on the other register)
; has to add to its row index.
.skip
        lsr.l   #2,d7
        subq.l  #1,d7
        bmi     .short                  ; no four-pixel pass: only the closing pixel is left
        add.l   a3,a3
        move.l  d0,d2
        add.l   a3,a3                   ; a3 = SCREENWIDTH*4
        add.l   d1,d2                   ; d2 = frac + dfrac
        add.l   d1,d1                   ; d1 = dfrac*2
        eor.w   d0,d2                   ; swap the fraction part for addx
        eor.w   d2,d0                   ; assuming 16.16 fixed point
        eor.w   d0,d2
        swap    d0                      ; swap decimals and fraction
        swap    d1
        swap    d2
        moveq   #0,d5
        and.w   d3,d2
        and.w   d3,d0
        sub.w   d1,d0                   ; advance only the fraction word ...
        add.l   d1,d0                   ; ... which sets up the X flag
        move.b  (a1,d2.w),d5            ; preload texel of pixel 1
.loop
; This should be reasonably scheduled for
; m68060. It should perform well on other processors
; too. That AGU stall still bothers me though.
        move.b  (a1,d0.w),d6            ; stall + pOEP but allows sOEP
        addx.l  d1,d2                   ; pOEP only
        move.b  (a2,d5.l),d5            ; pOEP but allows sOEP
        and.w   d3,d2                   ; sOEP
        move.b  (a2,d6.l),d6            ; pOEP but allows sOEP
        move.b  d5,(SCREENWIDTH,a0)     ; sOEP
        addx.l  d1,d0                   ; pOEP only
        move.b  (a1,d2.w),d5            ; pOEP but allows sOEP
        and.w   d3,d0                   ; sOEP
        move.b  d6,(a0)                 ; pOEP
; = ~4 cycles/pixel
; + cache misses
; The vertical writes are the true timehog of the loop
; because of the characteristics of the copyback cache
; operation.
; Better mark the chunky buffer as write through
; with the MMU and have all the horizontal writes
; be longs aligned to longword boundary.
        move.b  (a1,d0.w),d6
        addx.l  d1,d2
        move.b  (a2,d5.l),d5
        and.w   d3,d2
        move.b  (a2,d6.l),d6
        move.b  d5,(SCREENWIDTH*3,a0)
        addx.l  d1,d0
        move.b  (a1,d2.w),d5
        and.w   d3,d0
        move.b  d6,(SCREENWIDTH*2,a0)
        add.l   a3,a0                   ; dest += SCREENWIDTH*4
.loop_end
        dbf     d7,.loop
; Closing pixel.  Expects the row index in d0.w and bits 8-31 of d6
; clear.
; it's faster to divide it to two lines on 060
; and shouldn't be slower on 040.
.last
        move.b  (a1,d0.w),d6            ; new
        move.b  (a2,d6.l),d6            ; new
        move.b  d6,(a0)                 ; new
.end
        movem.l (sp)+,d2-d3/d5-d7/a2/a3
        rts
; Short-column fix-up (count < 4), kept off the hot path: frac is still
; plain 16.16 here, so turn it into the masked row index first.
.short
        swap    d0
        moveq   #0,d6
        and.w   d3,d0                   ; (frac >> 16) & 127
        bra     .last
;-----------------------------------------------------------------------
; @R_DrawColumn_040 - 030/040 flavour of the column drawer.
;
; Writes _dc_yh - _dc_yl + 1 pixels, SCREENWIDTH bytes apart, starting
; at ylookup[_dc_yl] + columnofs[_dc_x].  Each pixel is
; dc_colormap[dc_source[(frac >> 16) & 127]], with frac starting at
; _dc_texturemid + (_dc_yl - _centery) * _dc_iscale and growing by
; _dc_iscale per row.  Returns at once when _dc_yh < _dc_yl.
;
; Regs: d2-d4/d6-d7/a2/a3 are saved and restored; d0/d1/a0/a1 are
;       overwritten.
;
; The colormap lookup is done by dropping the texel into the low byte
; of d4 (which holds _dc_colormap) and using the result as an address.
; That only lands on colormap[texel] if the light table starts on a
; 256-byte boundary.
;-----------------------------------------------------------------------
        cnop    0,4
@R_DrawColumn_040
        movem.l d2-d4/d6-d7/a2/a3,-(sp)

        move.l  (_dc_yl),d0
        move.l  (_dc_yh),d7
        sub.l   d0,d7                   ; _dc_yh - _dc_yl
        bmi     .done                   ; negative: nothing to draw
        addq.l  #1,d7                   ; d7 = number of pixels

        ; a0 = ylookup[_dc_yl] + columnofs[_dc_x]
        lea     (_ylookup),a0
        movea.l (a0,d0.l*4),a0
        move.l  (_dc_x),d1
        lea     (_columnofs),a1
        adda.l  (a1,d1.l*4),a0

        ; texture, light table, constants
        movea.l (_dc_source),a1
        move.l  (_dc_colormap),d4
        movea.l #SCREENWIDTH,a3
        moveq   #127,d3

        ; d1 = step, d0 = starting frac (16.16)
        move.l  (_dc_iscale),d1
        sub.l   (_centery),d0
        muls.l  d1,d0
        add.l   (_dc_texturemid),d0

        ; pixels that do not fill a group of four are drawn singly
        move.l  d7,d6
        andi.w  #3,d6
        beq     .quads
.single
        move.l  d0,d2
        add.l   d1,d0                   ; frac += step
        swap    d2
        and.l   d3,d2                   ; (old frac >> 16) & 127
        move.b  (a1,d2.w),d4            ; texel -> low byte of table address
        movea.l d4,a2
        move.b  (a2),(a0)
        adda.l  a3,a0                   ; next row
        subq.l  #1,d6
        bne     .single

; From here on:
;   d7 = groups of four left (minus one, for dbra)
;   d0 = high word fraction, low word row index (7 bits)
;   d1 = step with its halves swapped the same way
;   a3 = SCREENWIDTH*4
;   X  = carry of the fraction, consumed by the next addx
.quads
        lsr.l   #2,d7
        subq.l  #1,d7
        bmi     .done
        swap    d0
        swap    d1
        and.w   d3,d0
        sub.w   d1,d0                   ; step only the fraction word once ...
        add.l   d1,d0                   ; ... so that X holds its carry
        adda.l  a3,a3                   ; adda leaves X alone
        adda.l  a3,a3
.quad_loop
        move.b  (a1,d0.w),d4            ; row 0
        movea.l d4,a2
        addx.l  d1,d0
        move.b  (a2),(a0)
        and.w   d3,d0

        move.b  (a1,d0.w),d4            ; row 1
        movea.l d4,a2
        addx.l  d1,d0
        move.b  (a2),(SCREENWIDTH,a0)
        and.w   d3,d0

        move.b  (a1,d0.w),d4            ; row 2
        movea.l d4,a2
        addx.l  d1,d0
        move.b  (a2),(SCREENWIDTH*2,a0)
        and.w   d3,d0

        move.b  (a1,d0.w),d4            ; row 3
        movea.l d4,a2
        addx.l  d1,d0
        move.b  (a2),(SCREENWIDTH*3,a0)
        and.w   d3,d0

        adda.l  a3,a0                   ; four rows down
        dbra    d7,.quad_loop
.done
        movem.l (sp)+,d2-d4/d6-d7/a2/a3
        rts
;void R_DrawColumn (void)
;{
; int count;
; byte* dest;
; fixed_t frac;
; fixed_t fracstep;
;
; count = dc_yh - dc_yl;
; if (count < 0)
; return;
; dest = ylookup[dc_yl] + columnofs[dc_x];
; fracstep = dc_iscale;
; frac = dc_texturemid + (dc_yl-centery)*fracstep;
; do {
; *dest = dc_colormap[dc_source[(frac>>FRACBITS)&127]];
; dest += SCREENWIDTH;
; frac += fracstep;
; } while (count--);
;}
;***********************************************************************
;@R_DrawSpan movem.l d2-d7/a2-a5,-(a7)
; move.l (_ds_x1),d0
; lea (_columnofs),a5
; lea (a5,d0.l*4),a1
; move.l (_ds_y),d1
; lea (_ylookup),a5
; movea.l (a5,d1.l*4),a2
; adda.l (a1),a2
; move.l (_ds_x2),d5
; sub.l d0,d5 ; count
; movea.l (_ds_source),a3
; movea.l (_ds_colormap),a4
; move.l (_ds_xstep),d3
; move.l (_ds_ystep),d4
;;-
; moveq #10,d2
; moveq #63,d6
; move.l #63*64,d7
; movea.l (_ds_xfrac),a0 ; xfrac
; movea.l (_ds_yfrac),a1 ; yfrac
;1$ move.l a0,d0 ; xfrac
; swap d0
; and.l d6,d0 ; (xfrac>>16)&63
; move.l a1,d1 ; yfrac
; asr.l d2,d1
; and.l d7,d1 ; (yfrac>>10)&(63*64)
; add.l d0,d1 ; spot
; moveq #0,d0
; move.b (a3,d1.l),d0 ; ds_source[spot]
; move.b (a4,d0.w),(a2)+ ; *dest++ = ds_colormap[...]
; adda.l d3,a0 ; xfrac += ds_xstep
; adda.l d4,a1 ; yfrac += ds_ystep
; dbra d5,1$
; movem.l (a7)+,d2-d7/a2-a5
; rts
; This faster version by Aki M Laukkanen <amlaukka@cc.helsinki.fi>
;-----------------------------------------------------------------------
; @R_DrawSpan_060
;
; Draws one horizontal run of pixels from a 64x64 texture:
;
;       spot    = ((yfrac >> 16) & 63) * 64 + ((xfrac >> 16) & 63)
;       *dest++ = ds_colormap[ds_source[spot]]
;       xfrac += ds_xstep ; yfrac += ds_ystep
;
; for _ds_x2 - _ds_x1 + 1 pixels starting at
; ylookup[_ds_y] + columnofs[_ds_x1].  The main loop is scheduled for
; the 68060 and writes aligned longwords.
;
; In:   no register arguments.  Reads _ds_y, _ds_x1, _ds_x2, _ds_xfrac,
;       _ds_yfrac, _ds_xstep, _ds_ystep, _ds_source, _ds_colormap and
;       the _ylookup / _columnofs tables.
; Out:  nothing.
; Regs: d2-d7/a2/a3 are saved and restored; d0/d1/a0/a1 are overwritten.
;
; Phases: one pixel if dest is odd; two pixels if dest is then not
; longword aligned (only when more than two pixels remain); four pixels
; per pass (.loop2); the last count & 3 pixels singly (.loop3).
;
; The `move.w (a2,d5.l),d4` loads deliberately fetch two bytes: the
; wanted colour lands in bits 8-15 and the byte after it is overwritten
; by the following `move.b`.
;-----------------------------------------------------------------------
        cnop    0,4
@R_DrawSpan_060
        movem.l d2-d7/a2/a3,-(sp)
        move.l  (_ds_y),d0
        move.l  (_ds_x1),d1             ; dest = ylookup[_ds_y] + columnofs[_ds_x1]
        lea     (_ylookup),a0
        move.l  (a0,d0.l*4),a0
        lea     (_columnofs),a1
        add.l   (a1,d1.l*4),a0
        move.l  (_ds_source),a1
        move.l  (_ds_colormap),a2
        move.l  (_ds_x2),d7             ; pixels = _ds_x2 - _ds_x1 + 1
        sub.l   d1,d7
        addq.l  #1,d7
        move.l  (_ds_xfrac),d0
        move.l  (_ds_yfrac),d1
        move.l  (_ds_xstep),d2
        move.l  (_ds_ystep),d3
        move.l  a0,d4
        btst    #0,d4
        beq     .skipb
        move.l  d0,d5                   ; do the unaligned pixels
        move.l  d1,d6                   ; so we can write to longword
        swap    d5                      ; boundary in the main loop
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6                   ; spot
        move.b  (a1,d6.w),d5
        add.l   d2,d0
        move.b  (a2,d5.w),(a0)+
        add.l   d3,d1
        move.l  a0,d4
        subq.l  #1,d7
.skipb
        btst    #1,d4
        beq     .skips
        moveq   #2,d4
        cmp.l   d4,d7
        bls     .skips                  ; two or fewer left: the tail loop handles them
        move.l  d0,d5                   ; write two pixels
        move.l  d1,d6
        swap    d5
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6
        move.b  (a1,d6.w),d5
        move.w  (a2,d5.w),d4            ; first colour into bits 8-15
        add.l   d2,d0
        add.l   d3,d1
        move.l  d0,d5
        move.l  d1,d6
        swap    d5
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6
        move.b  (a1,d6.w),d5
        move.b  (a2,d5.w),d4            ; second colour into bits 0-7
        add.l   d2,d0
        move.w  d4,(a0)+
        add.l   d3,d1
        subq.l  #2,d7
.skips
; Register use from here on:
;   a0: chunky
;   a1: texture, biased by -$f000 (see below)
;   a2: light_table
;   a3: count & 3 (pixels left for .loop3)
;   d7: count >> 2
;   d0: high word y fraction, low word x index  (1111 1111 11UU UUUU)
;   d1: high word x fraction, low word y index  (1111 VVVV VV11 1111)
;   d2: x/y step, halves arranged like d0
;   d3: x/y step, halves arranged like d1
; The bits forced to 1 by the `or.w` masks let `d0.w AND d1.w` yield
; $f000 + spot directly; a1 is lowered by $f000 to compensate.  The six
; low 1 bits of d1 also carry the X flag up into the y index.
; Invariant before every addx pair: the x fraction in d1 is one step
; ahead and X holds the carry of that step.
        move.l  d7,d6                   ; setup registers
        and.w   #3,d6
        move.l  d6,a3
        eor.w   d0,d1                   ; swap fraction parts for addx
        eor.w   d2,d3
        eor.w   d1,d0
        eor.w   d3,d2
        eor.w   d0,d1
        eor.w   d2,d3
        swap    d0
        swap    d1
        swap    d2
        swap    d3
        lsl.w   #6,d1
        lsl.w   #6,d3
        moveq   #0,d6
        moveq   #0,d5
        sub.l   #$f000,a1
        lsr.l   #2,d7
        beq     .skip_loop2
        subq.l  #1,d7
        sub.w   d3,d1
        add.l   d3,d1                   ; setup the X flag
        or.w    #$ffc0,d0
        or.w    #$f03f,d1
        move.w  d0,d6
        and.w   d1,d6
        bra     .start_loop2
        cnop    0,8
.loop2
; This should be reasonably scheduled for m68060.
; It writes long words to long word aligned locations.
; First of all that's the optimal way if you write
; directly to a frame buffer on graphics cards.
; Same holds true if you change the chunky buffer
; cache mode to write through. See R_DrawColumn().
        or.w    #$ffc0,d0               ; pOEP
        or.w    #$f03f,d1               ; sOEP
        move.b  (a2,d5.l),d4            ; pOEP but allows sOEP
        move.w  d0,d6                   ; sOEP
        and.w   d1,d6                   ; pOEP
        move.l  d4,(a0)+                ; sOEP
.start_loop2
        addx.l  d2,d0                   ; pOEP only
        addx.l  d3,d1                   ; pOEP only
        move.b  (a1,d6.l),d5            ; pOEP but allows sOEP
        or.w    #$ffc0,d0               ; sOEP
        or.w    #$f03f,d1               ; pOEP
        move.w  d0,d6                   ; sOEP
        move.w  (a2,d5.l),d4            ; pOEP but allows sOEP
        and.w   d1,d6                   ; sOEP
        addx.l  d2,d0                   ; pOEP only
        addx.l  d3,d1                   ; pOEP only
        move.b  (a1,d6.l),d5            ; pOEP but allows sOEP
        or.w    #$ffc0,d0               ; sOEP
        or.w    #$f03f,d1               ; pOEP
        move.w  d0,d6                   ; sOEP
        move.b  (a2,d5.l),d4            ; pOEP but allows sOEP
        and.w   d1,d6                   ; sOEP
        addx.l  d2,d0                   ; pOEP only
        addx.l  d3,d1                   ; pOEP only
        move.b  (a1,d6.l),d5            ; pOEP but allows sOEP
        or.w    #$ffc0,d0               ; sOEP
        or.w    #$f03f,d1               ; pOEP
        move.w  d0,d6                   ; sOEP
        swap    d4                      ; pOEP only
        move.w  (a2,d5.l),d4            ; pOEP but allows sOEP
        and.w   d1,d6                   ; sOEP
        addx.l  d2,d0                   ; pOEP only
        addx.l  d3,d1                   ; pOEP only
        move.b  (a1,d6.l),d5            ; pOEP but allows sOEP
        dbf     d7,.loop2               ; pOEP only
; = 7.75 cycles/pixel
        move.b  (a2,d5.l),d4
        move.l  d4,(a0)+
; The loop leaves the x fraction one step ahead and its carry in X,
; which is exactly what .loop3 needs.  Repeating the X set-up here
; would step the fraction twice and drop the pending carry.
        bra     .tail
.skip_loop2
        sub.w   d3,d1                   ; main loop was skipped:
        add.l   d3,d1                   ; set up the X flag now
.tail
        move.l  a3,d7                   ; move does not touch X
        bra     .loop_end2
.loop3
        or.w    #$ffc0,d0
        or.w    #$f03f,d1
        move.w  d0,d6
        and.w   d1,d6
        addx.l  d2,d0
        addx.l  d3,d1
        move.b  (a1,d6.l),d5
        move.b  (a2,d5.l),(a0)+
.loop_end2
        dbf     d7,.loop3
.end2
        movem.l (sp)+,d2-d7/a2/a3
        rts
;-----------------------------------------------------------------------
; @R_DrawSpan_040 - 030/040 flavour of the span drawer.
;
; Draws _ds_x2 - _ds_x1 + 1 pixels from a 64x64 texture, starting at
; ylookup[_ds_y] + columnofs[_ds_x1], stepping the texture position by
; (_ds_xstep, _ds_ystep) per pixel and passing every texel through
; _ds_colormap.
;
; In:   no register arguments.  Reads _ds_y, _ds_x1, _ds_x2, _ds_xfrac,
;       _ds_yfrac, _ds_xstep, _ds_ystep, _ds_source, _ds_colormap and
;       the _ylookup / _columnofs tables.
; Out:  nothing.
; Regs: d2-d7/a2-a4 are saved and restored; d0/d1/a0/a1 are overwritten.
;
; Phases: one pixel if dest is odd; two pixels if dest is then not
; longword aligned (only when more than two pixels remain); four pixels
; per pass up to a4 (.loop2); single pixels up to a3 (.loop3).
;
; In .loop2/.loop3 the colormap lookup is done by dropping the texel
; into the low byte of d4 (= _ds_colormap) and using that as an
; address, which only lands on colormap[texel] if the light table
; starts on a 256-byte boundary.
;
; NOTE(review): in .loop2/.loop3 the texel index is d0.w as left by the
; addx, i.e. the x bits have already been stepped while the y bits in
; that word are the pre-step ones, and a carry out of the x field runs
; into the y field.  The 060 routine samples before stepping.  This
; looks like a deliberate speed trade-off -- confirm before changing.
;-----------------------------------------------------------------------
        cnop    0,4
@R_DrawSpan_040
        movem.l d2-d7/a2-a4,-(sp)
        move.l  (_ds_y),d0
        move.l  (_ds_x1),d1             ; dest = ylookup[_ds_y] + columnofs[_ds_x1]
        lea     (_ylookup),a0
        move.l  (a0,d0.l*4),a0
        lea     (_columnofs),a1
        add.l   (a1,d1.l*4),a0
        move.l  (_ds_source),a1
        move.l  (_ds_colormap),a2
        move.l  (_ds_x2),d7             ; pixels = _ds_x2 - _ds_x1 + 1
        sub.l   d1,d7
        addq.l  #1,d7
        move.l  (_ds_xfrac),d0
        move.l  (_ds_yfrac),d1
        move.l  (_ds_xstep),d2
        move.l  (_ds_ystep),d3
        move.l  a0,d4
        btst    #0,d4
        beq     .skipb
        move.l  d0,d5                   ; do the unaligned pixels
        move.l  d1,d6                   ; so we can write to longword
        swap    d5                      ; boundary in the main loop
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6                   ; spot
        move.b  (a1,d6.w),d5
        add.l   d2,d0
        move.b  (a2,d5.w),(a0)+
        add.l   d3,d1
        move.l  a0,d4
        subq.l  #1,d7
.skipb
        btst    #1,d4
        beq     .skips
        moveq   #2,d4
        cmp.l   d4,d7
        bls     .skips                  ; two or fewer left: the tail loop handles them
        move.l  d0,d5                   ; write two pixels
        move.l  d1,d6
        swap    d5
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6
        move.b  (a1,d6.w),d5
        move.w  (a2,d5.w),d4            ; first colour into bits 8-15
        add.l   d2,d0
        add.l   d3,d1
        move.l  d0,d5
        move.l  d1,d6
        swap    d5
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6
        move.b  (a1,d6.w),d5
        move.b  (a2,d5.w),d4            ; second colour into bits 0-7
        add.l   d2,d0
        move.w  d4,(a0)+
        add.l   d3,d1
        subq.l  #2,d7
.skips
; Register use from here on:
;   a0: chunky
;   a3: chunky end (a0 + pixels left)
;   a4: end of the part written as longwords (a0 + (pixels left & ~3))
;   a1: texture, biased by +$1000 (see below)
;   d4: light_table; its low byte is replaced by each texel
;   d5: longword being assembled
;   d0: high word y fraction, low word x index  (1111 1111 11UU UUUU)
;   d1: high word x fraction, low word y index  (1111 VVVV VV11 1111)
;   d2: x/y step, halves arranged like d0
;   d3: x/y step, halves arranged like d1
;   d6: x_or ($ffc0)
;   d7: y_or ($f03f)
; Invariant before every addx pair: the x fraction in d1 is one step
; ahead and X holds the carry of that step.
        move.l  a2,d4
; The index word has its top four bits set and (a1,d0.w) sign-extends
; it, so it counts from -$1000; raise a1 by $1000 to compensate.
        add.l   #$1000,a1               ; catch 22
        move.l  a0,a3
        add.l   d7,a3
        move.l  d7,d5
        and.b   #~3,d5
        move.l  a0,a4
        add.l   d5,a4
        eor.w   d0,d1                   ; swap fraction parts for addx
        eor.w   d2,d3
        eor.w   d1,d0
        eor.w   d3,d2
        eor.w   d0,d1
        eor.w   d2,d3
        swap    d0
        swap    d1
        swap    d2
        swap    d3
        lsl.w   #6,d1
        lsl.w   #6,d3
        move.w  #$ffc0,d6
        move.w  #$f03f,d7
        lsr.w   #2,d5
        beq     .skip_loop2
        sub.w   d3,d1
        add.l   d3,d1                   ; setup the X flag
.loop2
        or.w    d6,d0
        or.w    d7,d1
        and.w   d1,d0
        addx.l  d2,d0
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        move.l  d4,a2
        move.w  (a2),d5                 ; pixel 0 into bits 8-15
        or.w    d6,d0
        or.w    d7,d1
        and.w   d1,d0
        addx.l  d2,d0
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        move.l  d4,a2
        move.b  (a2),d5                 ; pixel 1 into bits 0-7
        swap    d5
        or.w    d6,d0
        or.w    d7,d1
        and.w   d1,d0
        addx.l  d2,d0
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        move.l  d4,a2
        move.w  (a2),d5                 ; pixel 2
        or.w    d6,d0
        or.w    d7,d1
        and.w   d1,d0
        addx.l  d2,d0
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        move.l  d4,a2
        move.b  (a2),d5                 ; pixel 3
        move.l  d5,(a0)+
        cmp.l   a0,a4                   ; cmp leaves X untouched
        bne     .loop2
; The loop leaves the x fraction one step ahead and its carry in X,
; which is exactly what .loop3 needs.  Repeating the X set-up here
; would step the fraction twice and drop the pending carry.
        bra     .loop_end2
.skip_loop2
        sub.w   d3,d1                   ; main loop was skipped:
        add.l   d3,d1                   ; set up the X flag now
        bra     .loop_end2
.loop3
        or.w    d6,d0
        or.w    d7,d1
        and.w   d1,d0
        addx.l  d2,d0
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        move.l  d4,a2
        move.b  (a2),(a0)+
.loop_end2
        cmp.l   a0,a3
        bne     .loop3
.end2
        movem.l (sp)+,d2-d7/a2-a4
        rts
;void R_DrawSpan (void)
;{
; fixed_t xfrac, yfrac;
; byte* dest;
; int count, spot;
;
; xfrac = ds_xfrac;
; yfrac = ds_yfrac;
; dest = ylookup[ds_y] + columnofs[ds_x1];
; count = ds_x2 - ds_x1;
; do {
; spot = ((yfrac>>(16-6))&(63*64)) + ((xfrac>>16)&63);
; *dest++ = ds_colormap[ds_source[spot]];
; xfrac += ds_xstep;
; yfrac += ds_ystep;
; } while (count--);
;}
;***********************************************************************
end