home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Amiga Format CD 28
/
amigaformatcd28.iso
/
-websites-
/
amidoom
/
adoom_src-0.7.lha
/
ADoom_src
/
amiga_draw.s
< prev
next >
Wrap
Text File
|
1998-01-16
|
26KB
|
1,285 lines
*
* amiga_draw.s - optimized rendering
* by Aki Laukkanen <amlaukka@cc.helsinki.fi>
*
* This file is public domain.
*
; Assembler setup. The routines in this file use scaled-index addressing
; ((a0,d0.l*4)) and muls.l, both of which need a 68020 or later.
mc68020
multipass
;;; debug on,lattice4
;;; include "exec/types.i"
; SCREENWIDTH is used below as the byte distance between two vertically
; adjacent pixels of the chunky buffer.
SCREENWIDTH equ 320
; FUZZTABLE, FUZZOFF, FRACBITS and FRACUNIT are not referenced by any
; routine visible in this file.
FUZZTABLE equ 250
FUZZOFF equ SCREENWIDTH
FRACBITS equ 16
FRACUNIT equ (1<<FRACBITS)
*
* global functions
*
; Every routine is exported under two names, _name and @name; both labels
; sit on the same address.
; NOTE(review): presumably these are the stack-argument and
; register-argument link names of the C compiler - confirm.
xdef _R_DrawColumn_040 ; high detail
xdef @R_DrawColumn_040
xdef _R_DrawSpan_040
xdef @R_DrawSpan_040
xdef _R_DrawColumn_060
xdef @R_DrawColumn_060
xdef _R_DrawSpan_060
xdef @R_DrawSpan_060
xdef _R_DrawFuzzColumn
xdef @R_DrawFuzzColumn
; R_DrawTranslatedColumn is defined further down but its export is
; commented out, so it is not visible to other modules.
;; xdef _R_DrawTranslatedColumn
;; xdef @R_DrawTranslatedColumn
xdef _R_DrawSpanLow ; low detail
xdef @R_DrawSpanLow
xdef _R_DrawColumnLow
xdef @R_DrawColumnLow
xdef _R_DrawFuzzColumnLow
xdef @R_DrawFuzzColumnLow
; Same for the low detail translated column routine: defined, not exported.
;; xdef _R_DrawTranslatedColumnLow
;; xdef @R_DrawTranslatedColumnLow
*
* needed symbols/labels
*
; External variables. All of them are only read by the routines below,
; except _fuzzpos, which the fuzz routines also write back.
; Column drawing parameters:
xref _dc_yl
xref _dc_yh
xref _dc_x
xref _columnofs
xref _ylookup
xref _dc_iscale
xref _centery
xref _dc_texturemid
xref _dc_source
xref _dc_colormap
; Span drawing parameters:
xref _ds_xfrac
xref _ds_yfrac
xref _ds_x1
xref _ds_y
xref _ds_x2
xref _ds_xstep
xref _ds_ystep
xref _ds_source
xref _ds_colormap
; Fuzz column state and miscellaneous:
xref _fuzzoffset
xref _fuzzpos
xref _viewheight
xref _dc_translation
xref _colormaps
; low detail drawing functions
cnop 0,4
;-----------------------------------------------------------------------
; R_DrawColumnLow - draw one textured vertical column, low detail.
; Each lit texel is stored as a word holding the same colour byte twice,
; so one column covers two horizontally adjacent screen bytes.
;
; In:   globals _dc_yl, _dc_yh, _dc_x, _dc_iscale, _centery,
;       _dc_texturemid, _dc_source, _dc_colormap, _ylookup, _columnofs
; Out:  (_dc_yh - _dc_yl + 1) words stored SCREENWIDTH bytes apart;
;       returns without drawing when _dc_yh < _dc_yl
; Regs: d3-d4/d6-d7/a2/a3 are saved and restored; d0-d1/a0-a1 are
;       used without being saved
; Note: colormap addresses are formed by overwriting the low byte of the
;       _dc_colormap pointer, so the table has to start on a 256 byte
;       boundary.
;-----------------------------------------------------------------------
_R_DrawColumnLow
@R_DrawColumnLow
movem.l d3-d4/d6-d7/a2/a3,-(sp)
move.l (_dc_yh),d7 ; count = _dc_yh - _dc_yl
move.l (_dc_yl),d0
sub.l d0,d7
bmi .end
move.l (_dc_x),d1 ; dest = ylookup[_dc_yl] + columnofs[_dc_x]
lea (_ylookup),a0
add.l d1,d1 ; dc_x <<= 1
move.l (a0,d0.l*4),a0
lea (_columnofs),a1
add.l (a1,d1.l*4),a0
move.l (_dc_colormap),d4
move.l (_dc_source),a1
move.l (_dc_iscale),d1 ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
sub.l (_centery),d0
muls.l d1,d0
add.l (_dc_texturemid),d0
moveq #$7f,d3
lea (SCREENWIDTH*4).w,a3
; Register use from here on (d0/d1 are described after the swaps below):
; d7: cnt >> 2
; a0: chunky
; a1: texture
; d0: frac (uuuu uuuu uuuu uuuu 0000 0000 0UUU UUUU)
; d1: dfrac (.......................................)
; d3: $7f
; d4: light table aligned to 256 byte boundary
; a3: SCREENWIDTH*4, the row advance per pass of the unrolled loop
;
; The loop is unrolled four times and entered part-way: (count & 3)
; picks the entry point from .tmap_tab, and .width_tab moves a0 back so
; that the first store of the first, partial pass hits the first row.
move.l d7,d6
and.w #3,d6
swap d0 ; swap decimals and fraction
swap d1
add.w (.width_tab,pc,d6.w*2),a0
lsr.w #2,d7
move.w (.tmap_tab,pc,d6.w*2),d6
and.w d3,d0
; sub.w/add.l leaves the integer word as it was and adds the step
; fraction to the fraction word; the carry of that add ends up in X,
; where the first addx.l of the loop picks it up.
sub.w d1,d0
add.l d1,d0 ; setup the X flag
jmp (.loop,pc,d6.w)
cnop 0,4
; a0 adjustment for each value of (count & 3)
.width_tab
dc.w -3*SCREENWIDTH
dc.w -2*SCREENWIDTH
dc.w -1*SCREENWIDTH
dc.w 0
; loop entry offset for each value of (count & 3)
.tmap_tab
dc.w .0-.loop
dc.w .1-.loop
dc.w .2-.loop
dc.w .3-.loop
; Per pixel: fetch the texel into the low byte of d4 (turning d4 into
; the address of the colormap entry), step the texture position, then
; load the entry with move.w followed by move.b so that both bytes of
; d6.w hold the same colour, and store that word.
.loop
.3
move.b (a1,d0.w),d4
addx.l d1,d0
move.l d4,a2
move.w (a2),d6
and.w d3,d0
move.b (a2),d6
move.w d6,(a0)
.2
move.b (a1,d0.w),d4
addx.l d1,d0
move.l d4,a2
move.w (a2),d6
and.w d3,d0
move.b (a2),d6
move.w d6,(SCREENWIDTH,a0)
.1
move.b (a1,d0.w),d4
addx.l d1,d0
move.l d4,a2
move.w (a2),d6
and.w d3,d0
move.b (a2),d6
move.w d6,(SCREENWIDTH*2,a0)
.0
move.b (a1,d0.w),d4
addx.l d1,d0
move.l d4,a2
move.w (a2),d6
and.w d3,d0
move.b (a2),d6
move.w d6,(SCREENWIDTH*3,a0)
add.l a3,a0
.loop_end
dbf d7,.loop
.end
movem.l (sp)+,d3-d4/d6-d7/a2/a3
rts
cnop 0,4
;-----------------------------------------------------------------------
; R_DrawSpanLow - draw one horizontal span of a 64x64 texture, low
; detail: every texel is stored twice, so the span covers
; 2*(_ds_x2 - _ds_x1 + 1) screen bytes.
;
; In:   globals _ds_y, _ds_x1, _ds_x2, _ds_xfrac, _ds_yfrac, _ds_xstep,
;       _ds_ystep, _ds_source, _ds_colormap, _ylookup, _columnofs
; Regs: d2-d7/a2-a4 are saved and restored; d0-d1/a0-a1 are used
;       without being saved
; Note: colormap addresses are formed by overwriting the low byte of the
;       _ds_colormap pointer, so the table has to start on a 256 byte
;       boundary. The destination has to be word aligned.
;
; Fix:  after the 8-byte main loop the code used to fall into the
;       "setup the X flag" sequence a second time. That added the y
;       step fraction once more and threw away the carry still pending
;       in X, so the tail pixels could be sampled up to one texel off.
;       The tail loop is now entered directly when the main loop ran.
;-----------------------------------------------------------------------
_R_DrawSpanLow
@R_DrawSpanLow
        movem.l d2-d7/a2-a4,-(sp)

        move.l  (_ds_y),d0
        move.l  (_ds_x1),d1             ; dest = ylookup[_ds_y] + columnofs[_ds_x1*2]
        lea     (_ylookup),a0
        add.l   d1,d1                   ; two screen bytes per span pixel
        move.l  (a0,d0.l*4),a0
        lea     (_columnofs),a1
        add.l   (a1,d1.l*4),a0
        move.l  (_ds_x2),d7             ; d7 = 2*(_ds_x2 - _ds_x1 + 1), bytes to write
        move.l  (_ds_source),a1
        add.l   d7,d7
        move.l  (_ds_colormap),a2
        sub.l   d1,d7
        addq.l  #2,d7
        move.l  (_ds_xfrac),d0
        move.l  (_ds_yfrac),d1
        move.l  (_ds_xstep),d2
        move.l  (_ds_ystep),d3

        move.l  a0,d4                   ; notice, that this address must already be aligned by word
        btst    #1,d4
        beq     .skips
        move.l  d0,d5                   ; do the unaligned pixels
        move.l  d1,d6                   ; so we can write to longword
        swap    d5                      ; boundary in the main loop
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6                   ; d6 = (y & 63)*64 + (x & 63)
        move.b  (a1,d6.w),d5
        add.l   d2,d0
        move.b  (a2,d5.w),(a0)+         ; same colour byte, written twice
        add.l   d3,d1
        move.b  (a2,d5.w),(a0)+
        subq.l  #2,d7

.skips
        move.l  a2,d4                   ; d4 = colormap base, low byte replaced per texel
        lea     ($1000,a1),a1           ; the index word below is $f000+offset, i.e.
                                        ; offset-$1000 as a signed 16 bit index
        move.l  a0,a3
        add.l   d7,a3                   ; a3 = end of the span
        move.l  d7,d5
        and.b   #~7,d5
        move.l  a0,a4
        add.l   d5,a4                   ; a4 = end of the part done 8 bytes at a time

        eor.w   d0,d1                   ; swap fraction parts for addx
        eor.w   d2,d3
        eor.w   d1,d0
        eor.w   d3,d2
        eor.w   d0,d1
        eor.w   d2,d3
        swap    d0                      ; d0 = y fraction : x integer
        swap    d1                      ; d1 = x fraction : y integer
        swap    d2                      ; d2 = y step fraction : x step integer
        swap    d3                      ; d3 = x step fraction : y step integer
        lsl.w   #6,d1                   ; y integer parts are kept pre-shifted by 6
        lsl.w   #6,d3
        move.w  #$ffc0,d6               ; or-mask for the x word
        move.w  #$f03f,d7               ; or-mask for the y word
        lsr.w   #3,d5
        beq     .skip_loop2
        sub.w   d2,d0                   ; integer word unchanged, y fraction
        add.l   d2,d0                   ; stepped once, its carry left in X

; Main loop: four texels, eight bytes, two longword stores per pass.
; Only the addx.l instructions change X, so the carry out of one
; fraction add reaches the integer add of the other coordinate.
.loop2
        or.w    d6,d0                   ; Not really an exercise in optimizing
        or.w    d7,d1                   ; but I guess it's faster than 1x1 for 030
        and.w   d1,d0                   ; where this low detail business is needed.
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        addx.l  d2,d0
        move.l  d4,a2
        move.w  (a2),d5
        or.w    d6,d0
        move.b  (a2),d5                 ; d5.w = colour byte twice
        or.w    d7,d1
        and.w   d1,d0
        swap    d5
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        addx.l  d2,d0
        move.l  d4,a2
        move.w  (a2),d5
        or.w    d6,d0
        move.b  (a2),d5
        or.w    d7,d1
        and.w   d1,d0
        move.l  d5,(a0)+
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        addx.l  d2,d0
        move.l  d4,a2
        move.w  (a2),d5
        or.w    d6,d0
        move.b  (a2),d5
        or.w    d7,d1
        and.w   d1,d0
        swap    d5
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        addx.l  d2,d0
        move.l  d4,a2
        move.w  (a2),d5
        move.b  (a2),d5
        move.l  d5,(a0)+
        cmp.l   a0,a4                   ; cmp leaves X alone
        bne     .loop2
        bra     .loop_end2              ; X is still primed, do not step again

.skip_loop2
        sub.w   d2,d0                   ; main loop was skipped: prime X here
        add.l   d2,d0
        bra     .loop_end2

; Tail loop: one texel, two bytes, per pass.
.loop3
        or.w    d6,d0
        or.w    d7,d1
        and.w   d1,d0
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        addx.l  d2,d0
        move.l  d4,a2
        move.b  (a2),(a0)+
        move.b  (a2),(a0)+
.loop_end2
        cmp.l   a0,a3
        bne     .loop3
.end2
        movem.l (sp)+,d2-d7/a2-a4
        rts
cnop 0,4
;-----------------------------------------------------------------------
; R_DrawTranslatedColumnLow - draw one textured vertical column through
; a translation table and a light table, low detail. Each resulting
; colour is stored as a word holding the same byte twice.
;
; In:   globals _dc_yl, _dc_yh, _dc_x, _dc_iscale, _centery,
;       _dc_texturemid, _dc_source, _dc_translation, _dc_colormap,
;       _ylookup, _columnofs
; Out:  (_dc_yh - _dc_yl + 1) words stored SCREENWIDTH bytes apart;
;       returns without drawing when _dc_yh < _dc_yl
; Regs: d2-d4/d6-d7/a2/a3 are saved and restored; d0-d1/a0-a1 are
;       used without being saved
; Note: table addresses are formed by overwriting the low byte of the
;       _dc_translation and _dc_colormap pointers, so both tables have
;       to start on a 256 byte boundary.
;
; Fix:  the last slot of the unrolled loop stored its pixel with move.b
;       while the other three slots (and R_DrawColumnLow) store the
;       doubled word, so every fourth row lost its right-hand byte.
;       It now uses move.w as well.
;-----------------------------------------------------------------------
_R_DrawTranslatedColumnLow
@R_DrawTranslatedColumnLow
        movem.l d2-d4/d6-d7/a2/a3,-(sp)

        move.l  (_dc_yh),d7             ; count = _dc_yh - _dc_yl
        move.l  (_dc_yl),d0
        sub.l   d0,d7
        bmi     .end

        move.l  (_dc_x),d1              ; dest = ylookup[_dc_yl] + columnofs[_dc_x*2]
        lea     (_ylookup),a0
        add.l   d1,d1
        move.l  (a0,d0.l*4),a0
        lea     (_columnofs),a1
        add.l   (a1,d1.l*4),a0

        move.l  (_dc_translation),d2
        move.l  (_dc_colormap),d4
        move.l  (_dc_source),a1

        move.l  (_dc_iscale),d1         ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
        sub.l   (_centery),d0
        muls.l  d1,d0
        add.l   (_dc_texturemid),d0

        moveq   #$7f,d3
        lea     (SCREENWIDTH*4).w,a3

; d7: cnt >> 2
; a0: chunky
; a1: texture
; d0: frac  (uuuu uuuu uuuu uuuu 0000 0000 0UUU UUUU)
; d1: dfrac (.......................................)
; d3: $7f
; d4: light table aligned to 256 byte boundary
; d2: translation table aligned to 256 byte boundary
; a3: SCREENWIDTH*4, the row advance per pass of the unrolled loop

        move.l  d7,d6
        and.w   #3,d6                   ; (count & 3) selects the loop entry

        swap    d0                      ; swap decimals and fraction
        swap    d1
        add.w   (.width_tab,pc,d6.w*2),a0
        lsr.w   #2,d7
        move.w  (.tmap_tab,pc,d6.w*2),d6

        and.w   d3,d0
        sub.w   d1,d0                   ; integer word unchanged afterwards,
        add.l   d1,d0                   ; setup the X flag

        jmp     (.loop,pc,d6.w)

        cnop    0,4
.width_tab
        dc.w    -3*SCREENWIDTH
        dc.w    -2*SCREENWIDTH
        dc.w    -1*SCREENWIDTH
        dc.w    0
.tmap_tab
        dc.w    .0-.loop
        dc.w    .1-.loop
        dc.w    .2-.loop
        dc.w    .3-.loop

; Per pixel: texel -> translation table -> light table, then move.w and
; move.b from the same address put the colour into both bytes of d6.w.
.loop
.3
        move.b  (a1,d0.w),d2
        move.l  d2,a2
        addx.l  d1,d0
        move.b  (a2),d4
        move.l  d4,a2
        and.w   d3,d0
        move.w  (a2),d6
        move.b  (a2),d6
        move.w  d6,(a0)
.2
        move.b  (a1,d0.w),d2
        move.l  d2,a2
        addx.l  d1,d0
        move.b  (a2),d4
        move.l  d4,a2
        and.w   d3,d0
        move.w  (a2),d6
        move.b  (a2),d6
        move.w  d6,(SCREENWIDTH,a0)
.1
        move.b  (a1,d0.w),d2
        move.l  d2,a2
        addx.l  d1,d0
        move.b  (a2),d4
        move.l  d4,a2
        and.w   d3,d0
        move.w  (a2),d6
        move.b  (a2),d6
        move.w  d6,(SCREENWIDTH*2,a0)
.0
        move.b  (a1,d0.w),d2
        move.l  d2,a2
        addx.l  d1,d0
        move.b  (a2),d4
        move.l  d4,a2
        and.w   d3,d0
        move.w  (a2),d6
        move.b  (a2),d6
        move.w  d6,(SCREENWIDTH*3,a0)   ; was move.b: store both bytes like the other slots

        add.l   a3,a0
.loop_end
        dbf     d7,.loop
.end
        movem.l (sp)+,d2-d4/d6-d7/a2/a3
        rts
cnop 0,4
;-----------------------------------------------------------------------
; R_DrawFuzzColumnLow - draw one "fuzz" column, low detail.
; For every row the routine reads a byte that is already in the chunky
; buffer, at the row's own address plus the next longword entry of
; _fuzzoffset, looks it up in the table at _colormaps + 6*256 and stores
; the result as a word holding that byte twice.
;
; In:   globals _dc_yl, _dc_yh, _dc_x, _viewheight, _colormaps,
;       _fuzzoffset, _fuzzpos, _ylookup, _columnofs
; Out:  screen words written; _fuzzpos updated (it is kept as a byte
;       offset into _fuzzoffset)
; Regs: d4/d6-d7/a2/a3 are saved and restored; d0-d1/a0-a1 are used
;       without being saved
; Note: the colormap address is formed by overwriting the low byte of
;       d4, so _colormaps has to start on a 256 byte boundary.
;-----------------------------------------------------------------------
_R_DrawFuzzColumnLow
@R_DrawFuzzColumnLow
movem.l d4/d6-d7/a2/a3,-(sp)
; Keep the column off the first and last view row: _dc_yh equal to
; _viewheight-1 becomes _viewheight-2, and _dc_yl equal to 0 becomes 1.
move.l (_viewheight),d1
subq.l #1,d1
move.l (_dc_yh),d7 ; count = _dc_yh - _dc_yl
cmp.l d1,d7
bne .skip_yh
subq.l #1,d1
move.l d1,d7
.skip_yh
move.l (_dc_yl),d0
bne .skip_yl
moveq #1,d0
.skip_yl
sub.l d0,d7
bmi .end
move.l (_dc_x),d1 ; dest = ylookup[_dc_yl] + columnofs[_dc_x]
lea (_ylookup),a0
add.l d1,d1
move.l (a0,d0.l*4),a0
lea (_columnofs),a1
add.l (a1,d1.l*4),a0
move.l (_colormaps),d4
add.l #6*256,d4
lea (_fuzzoffset),a1
; Reduce the saved position to the range 0..199 by repeated subtraction.
; NOTE(review): 200 is presumably 50 table entries of 4 bytes, and the
; loop below reads on past that point without wrapping, so the table
; needs extra entries after the first 200 bytes - confirm against the
; definition of _fuzzoffset.
move.l (_fuzzpos),d0 ; bring it down
.pos_loop sub.w #200,d0
bpl .pos_loop
add.w #200,d0
add.l d0,a1
lea (SCREENWIDTH*4).w,a3
; d7: cnt >> 2
; a0: chunky
; a1: current position in fuzzoffset (longword entries)
; d4: light table aligned to 256 byte boundary
; d6: entry selector first, then the doubled colour word in the loop
; a2: scratch address
; a3: SCREENWIDTH*4, the row advance per pass of the unrolled loop
;
; The loop is unrolled four times and entered part-way: (count & 3)
; picks the entry point, and .width_tab moves a0 back to match.
move.l d7,d6
and.w #3,d6
add.w (.width_tab,pc,d6.w*2),a0
lsr.w #2,d7
move.w (.tmap_tab,pc,d6.w*2),d6
jmp (.loop,pc,d6.w)
cnop 0,4
.width_tab
dc.w -3*SCREENWIDTH
dc.w -2*SCREENWIDTH
dc.w -1*SCREENWIDTH
dc.w 0
.tmap_tab
dc.w .0-.loop
dc.w .1-.loop
dc.w .2-.loop
dc.w .3-.loop
; Per row: a2 = row address + fuzz offset, read that screen byte into
; the low byte of d4, then move.w/move.b from the colormap entry put
; the shaded colour into both bytes of d6.w.
.loop
.3 move.l a0,a2 ; This is essentially
add.l (a1)+,a2 ; just moving memory around.
move.b (a2),d4
move.l d4,a2
move.w (a2),d6
move.b (a2),d6
move.w d6,(a0)
.2 lea (SCREENWIDTH,a0),a2
add.l (a1)+,a2
move.b (a2),d4
move.l d4,a2
move.w (a2),d6
move.b (a2),d6
move.w d6,(SCREENWIDTH,a0)
.1 lea (2*SCREENWIDTH,a0),a2
add.l (a1)+,a2
move.b (a2),d4
move.l d4,a2
move.w (a2),d6
move.b (a2),d6
move.w d6,(2*SCREENWIDTH,a0)
.0 lea (3*SCREENWIDTH,a0),a2
add.l (a1)+,a2
move.b (a2),d4
move.l d4,a2
move.w (a2),d6
move.b (a2),d6
move.w d6,(3*SCREENWIDTH,a0)
add.l a3,a0
.loop_end
dbf d7,.loop
; Save the new table position as a byte offset from _fuzzoffset.
sub.l #_fuzzoffset,a1
move.l a1,_fuzzpos
.end
movem.l (sp)+,d4/d6-d7/a2/a3
rts
; high detail versions
cnop 0,4
;-----------------------------------------------------------------------
; R_DrawFuzzColumn - draw one "fuzz" column, high detail.
; For every row the routine reads a byte that is already in the chunky
; buffer, at the row's own address plus the next longword entry of
; _fuzzoffset, looks it up in the table at _colormaps + 6*256 and stores
; the result at the row's address.
;
; In:   globals _dc_yl, _dc_yh, _dc_x, _viewheight, _colormaps,
;       _fuzzoffset, _fuzzpos, _ylookup, _columnofs
; Out:  screen bytes written; _fuzzpos updated (it is kept as a byte
;       offset into _fuzzoffset)
; Regs: d4/d6-d7/a2/a3 are saved and restored; d0-d1/a0-a1 are used
;       without being saved
; Note: the colormap address is formed by overwriting the low byte of
;       d4, so _colormaps has to start on a 256 byte boundary.
;-----------------------------------------------------------------------
_R_DrawFuzzColumn
@R_DrawFuzzColumn
movem.l d4/d6-d7/a2/a3,-(sp)
; Keep the column off the first and last view row: _dc_yh equal to
; _viewheight-1 becomes _viewheight-2, and _dc_yl equal to 0 becomes 1.
move.l (_viewheight),d1
subq.l #1,d1
move.l (_dc_yh),d7 ; count = _dc_yh - _dc_yl
cmp.l d1,d7
bne .skip_yh
subq.l #1,d1
move.l d1,d7
.skip_yh
move.l (_dc_yl),d0
bne .skip_yl
moveq #1,d0
.skip_yl
sub.l d0,d7
bmi .end
move.l (_dc_x),d1 ; dest = ylookup[_dc_yl] + columnofs[_dc_x]
lea (_ylookup),a0
move.l (a0,d0.l*4),a0
lea (_columnofs),a1
add.l (a1,d1.l*4),a0
move.l (_colormaps),d4
add.l #6*256,d4
lea (_fuzzoffset),a1
; Reduce the saved position to the range 0..199 by repeated subtraction.
; NOTE(review): 200 is presumably 50 table entries of 4 bytes, and the
; loop below reads on past that point without wrapping, so the table
; needs extra entries after the first 200 bytes - confirm against the
; definition of _fuzzoffset.
move.l (_fuzzpos),d0
.pos_loop sub.w #200,d0
bpl .pos_loop
add.w #200,d0
add.l d0,a1
lea (SCREENWIDTH*4).w,a3
; d7: cnt >> 2
; a0: chunky
; a1: current position in fuzzoffset (longword entries)
; d4: light table aligned to 256 byte boundary
; d6: loop entry selector
; a2: scratch address
; a3: SCREENWIDTH*4, the row advance per pass of the unrolled loop
;
; The loop is unrolled four times and entered part-way: (count & 3)
; picks the entry point, and .width_tab moves a0 back to match.
move.l d7,d6
and.w #3,d6
add.w (.width_tab,pc,d6.w*2),a0
lsr.w #2,d7
move.w (.tmap_tab,pc,d6.w*2),d6
jmp (.loop,pc,d6.w)
cnop 0,4
.width_tab
dc.w -3*SCREENWIDTH
dc.w -2*SCREENWIDTH
dc.w -1*SCREENWIDTH
dc.w 0
.tmap_tab
dc.w .0-.loop
dc.w .1-.loop
dc.w .2-.loop
dc.w .3-.loop
; Per row: a2 = row address + fuzz offset, read that screen byte into
; the low byte of d4 (making d4 the colormap entry address), then copy
; the entry to the row.
.loop
.3 move.l a0,a2 ; This is essentially
add.l (a1)+,a2 ; just moving memory around.
move.b (a2),d4
move.l d4,a2 ; Not 060 optimized but
move.b (a2),(a0) ; if you have hordes of
.2 lea (SCREENWIDTH,a0),a2 ; invisible monsters which
add.l (a1)+,a2 ; slow down the game too much,
move.b (a2),d4 ; do tell me.
move.l d4,a2
move.b (a2),(SCREENWIDTH,a0)
.1 lea (2*SCREENWIDTH,a0),a2
add.l (a1)+,a2
move.b (a2),d4
move.l d4,a2
move.b (a2),(2*SCREENWIDTH,a0)
.0 lea (3*SCREENWIDTH,a0),a2
add.l (a1)+,a2
move.b (a2),d4
move.l d4,a2
move.b (a2),(3*SCREENWIDTH,a0)
add.l a3,a0
.loop_end
dbf d7,.loop
; Save the new table position as a byte offset from _fuzzoffset.
sub.l #_fuzzoffset,a1
move.l a1,_fuzzpos
.end
movem.l (sp)+,d4/d6-d7/a2/a3
rts
cnop 0,4
;-----------------------------------------------------------------------
; R_DrawTranslatedColumn - draw one textured vertical column, high
; detail, passing every texel through a translation table and then a
; light table.
;
; In:   globals _dc_yl, _dc_yh, _dc_x, _dc_iscale, _centery,
;       _dc_texturemid, _dc_source, _dc_translation, _dc_colormap,
;       _ylookup, _columnofs
; Out:  (_dc_yh - _dc_yl + 1) bytes stored SCREENWIDTH bytes apart;
;       returns without drawing when _dc_yh < _dc_yl
; Regs: d2-d4/d6-d7/a2/a3 are saved and restored; d0-d1/a0-a1 are
;       used without being saved
; Note: table addresses are formed by overwriting the low byte of the
;       _dc_translation and _dc_colormap pointers, so both tables have
;       to start on a 256 byte boundary. The xdef lines for this
;       routine are commented out at the top of the file.
;-----------------------------------------------------------------------
_R_DrawTranslatedColumn ; no 060 version :(
@R_DrawTranslatedColumn
movem.l d2-d4/d6-d7/a2/a3,-(sp)
move.l (_dc_yh),d7 ; count = _dc_yh - _dc_yl
move.l (_dc_yl),d0
sub.l d0,d7
bmi .end
move.l (_dc_x),d1 ; dest = ylookup[_dc_yl] + columnofs[_dc_x]
lea (_ylookup),a0
move.l (a0,d0.l*4),a0
lea (_columnofs),a1
add.l (a1,d1.l*4),a0
move.l (_dc_translation),d2
move.l (_dc_colormap),d4
move.l (_dc_source),a1
move.l (_dc_iscale),d1 ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
sub.l (_centery),d0
muls.l d1,d0
add.l (_dc_texturemid),d0
moveq #$7f,d3
lea (SCREENWIDTH*4).w,a3
; d7: cnt >> 2
; a0: chunky
; a1: texture
; d0: frac (uuuu uuuu uuuu uuuu 0000 0000 0UUU UUUU)
; d1: dfrac (.......................................)
; d3: $7f
; d4: light table aligned to 256 byte boundary
; d2: translation table aligned to 256 byte boundary
; a3: SCREENWIDTH*4, the row advance per pass of the unrolled loop
;
; The loop is unrolled four times and entered part-way: (count & 3)
; picks the entry point, and .width_tab moves a0 back to match.
move.l d7,d6
and.w #3,d6
swap d0 ; swap decimals and fraction
swap d1
add.w (.width_tab,pc,d6.w*2),a0
lsr.w #2,d7
move.w (.tmap_tab,pc,d6.w*2),d6
and.w d3,d0
; sub.w/add.l leaves the integer word as it was and adds the step
; fraction to the fraction word; the carry of that add is left in X.
sub.w d1,d0
add.l d1,d0 ; setup the X flag
jmp (.loop,pc,d6.w)
cnop 0,4
.width_tab
dc.w -3*SCREENWIDTH
dc.w -2*SCREENWIDTH
dc.w -1*SCREENWIDTH
dc.w 0
.tmap_tab
dc.w .0-.loop
dc.w .1-.loop
dc.w .2-.loop
dc.w .3-.loop
; Per pixel: the texel replaces the low byte of d2 (translation entry
; address), the translated value replaces the low byte of d4 (light
; table entry address), and that entry is copied to the screen.
.loop
.3
move.b (a1,d0.w),d2
move.l d2,a2
addx.l d1,d0
move.b (a2),d4
and.w d3,d0
move.l d4,a2
move.b (a2),(a0)
.2
move.b (a1,d0.w),d2
move.l d2,a2
addx.l d1,d0
move.b (a2),d4
and.w d3,d0
move.l d4,a2
move.b (a2),(SCREENWIDTH,a0)
.1
move.b (a1,d0.w),d2
move.l d2,a2
addx.l d1,d0
move.b (a2),d4
and.w d3,d0
move.l d4,a2
move.b (a2),(SCREENWIDTH*2,a0)
.0
move.b (a1,d0.w),d2
move.l d2,a2
addx.l d1,d0
move.b (a2),d4
and.w d3,d0
move.l d4,a2
move.b (a2),(SCREENWIDTH*3,a0)
add.l a3,a0
.loop_end
dbf d7,.loop
.end
movem.l (sp)+,d2-d4/d6-d7/a2/a3
rts
cnop 0,4
;-----------------------------------------------------------------------
; R_DrawColumn_060 - draw one textured vertical column, high detail,
; scheduled for the 68060.
;
; In:   globals _dc_yl, _dc_yh, _dc_x, _dc_iscale, _centery,
;       _dc_texturemid, _dc_source, _dc_colormap, _ylookup, _columnofs
; Out:  (_dc_yh - _dc_yl + 1) bytes stored SCREENWIDTH bytes apart;
;       returns without drawing when _dc_yh < _dc_yl
; Regs: d2-d3/d5-d7/a2/a3 are saved and restored; d0-d1/a0-a1 are
;       used without being saved
;
; Layout: (count & 3) pixels are drawn one at a time, then count >> 2
; passes of a four pixel loop, then one final pixel.
;
; Fix:  when count >> 2 was zero the routine branched straight to .end
;       and skipped the final pixel, so columns with
;       _dc_yh - _dc_yl < 4 came out one pixel short (a one pixel
;       column drew nothing). R_DrawColumn_040 draws count+1 pixels in
;       every case. The short case now goes through .last_only.
;-----------------------------------------------------------------------
_R_DrawColumn_060
@R_DrawColumn_060
        movem.l d2-d3/d5-d7/a2/a3,-(sp)

        move.l  (_dc_yh),d7             ; count = _dc_yh - _dc_yl
        move.l  (_dc_yl),d0
        sub.l   d0,d7
        bmi     .end

        move.l  (_dc_x),d1              ; dest = ylookup[_dc_yl] + columnofs[_dc_x]
        lea     (_ylookup),a0
        move.l  (a0,d0.l*4),a0
        lea     (_columnofs),a1
        add.l   (a1,d1.l*4),a0

        move.l  (_dc_colormap),a2
        move.l  (_dc_source),a1

        move.l  (_dc_iscale),d1         ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
        sub.l   (_centery),d0
        muls.l  d1,d0
        add.l   (_dc_texturemid),d0

        moveq   #$7f,d3
        move.l  #SCREENWIDTH,a3

        move.l  d7,d6                   ; Do the leftover iterations in
        and.w   #3,d6                   ; this loop.
        beq     .skip
.skip_loop
        move.l  d0,d5
        swap    d5
        and.l   d3,d5                   ; d5 = texture row, 0..127
        move.b  (a1,d5.w),d5
        add.l   d1,d0
        move.b  (a2,d5.w),(a0)
        add.l   a3,a0
        subq.l  #1,d6
        bne     .skip_loop

; d7: cnt >> 2
; a0: chunky
; a1: texture
; a2: light_table
; d0: frac  (uuuu uuuu uuuu uuuu 0000 0000 0UUU UUUU)
; d1: dfrac*2 (.......................................)
; d2: frac+dfrac(.......................................)
; d3: $7f
; a3: SCREENWIDTH*4 once the two add.l a3,a3 below have run

.skip
        lsr.l   #2,d7
        subq.l  #1,d7
        bmi     .last_only              ; no four pixel pass, one pixel still to draw

        add.l   a3,a3
        move.l  d0,d2
        add.l   a3,a3
        add.l   d1,d2
        add.l   d1,d1

        eor.w   d0,d2                   ; swap the fraction part for addx
        eor.w   d2,d0                   ; assuming 16.16 fixed point
        eor.w   d0,d2

        swap    d0                      ; swap decimals and fraction
        swap    d1
        swap    d2

        moveq   #0,d5
        and.w   d3,d2
        and.w   d3,d0

        sub.w   d1,d0                   ; integer word unchanged afterwards,
        add.l   d1,d0                   ; setup the X flag

        move.b  (a1,d2.w),d5
.loop
        ; This should be reasonably scheduled for
        ; m68060. It should perform well on other processors
        ; too. That AGU stall still bothers me though.

        move.b  (a1,d0.w),d6            ; stall + pOEP but allows sOEP
        addx.l  d1,d2                   ; pOEP only
        move.b  (a2,d5.l),d5            ; pOEP but allows sOEP
        and.w   d3,d2                   ; sOEP
        move.b  (a2,d6.l),d6            ; pOEP but allows sOEP
        move.b  d5,(SCREENWIDTH,a0)     ; sOEP
        addx.l  d1,d0                   ; pOEP only
        move.b  (a1,d2.w),d5            ; pOEP but allows sOEP
        and.w   d3,d0                   ; sOEP
        move.b  d6,(a0)                 ; pOEP
                                        ; = ~4 cycles/pixel
                                        ; + cache misses

        ; The vertical writes are the true timehog of the loop
        ; because of the characteristics of the copyback cache
        ; operation.

        ; Better mark the chunky buffer as write through
        ; with the MMU and have all the horizontal writes
        ; be longs aligned to longword boundary.

        move.b  (a1,d0.w),d6
        addx.l  d1,d2
        move.b  (a2,d5.l),d5
        and.w   d3,d2
        move.b  (a2,d6.l),d6
        move.b  d5,(SCREENWIDTH*3,a0)
        addx.l  d1,d0
        move.b  (a1,d2.w),d5
        and.w   d3,d0
        move.b  d6,(SCREENWIDTH*2,a0)

        add.l   a3,a0
.loop_end
        dbf     d7,.loop

        ; it's faster to divide it to two lines on 060
        ; and shouldn't be slower on 040.

        move.b  (a1,d0.w),d6            ; final pixel; d0.w is already the
        move.b  (a2,d6.l),d6            ; masked texture row here
        move.b  d6,(a0)
.end
        movem.l (sp)+,d2-d3/d5-d7/a2/a3
        rts

; Short column: d0 is still 16.16 here because the main loop setup never
; ran, so the row is extracted the same way .skip_loop does it.
.last_only
        swap    d0
        and.l   d3,d0                   ; d0 = texture row, 0..127
        move.b  (a1,d0.w),d0
        move.b  (a2,d0.w),(a0)
        bra     .end
cnop 0,4
; 030/040 version
;-----------------------------------------------------------------------
; R_DrawColumn_040 - draw one textured vertical column, high detail.
;
; In:   globals _dc_yl, _dc_yh, _dc_x, _dc_iscale, _centery,
;       _dc_texturemid, _dc_source, _dc_colormap, _ylookup, _columnofs
; Out:  (_dc_yh - _dc_yl + 1) bytes stored SCREENWIDTH bytes apart;
;       returns without drawing when _dc_yh < _dc_yl
; Regs: d3-d4/d6-d7/a2/a3 are saved and restored; d0-d1/a0-a1 are
;       used without being saved
; Note: colormap addresses are formed by overwriting the low byte of the
;       _dc_colormap pointer, so the table has to start on a 256 byte
;       boundary.
;-----------------------------------------------------------------------
_R_DrawColumn_040
@R_DrawColumn_040
movem.l d3-d4/d6-d7/a2/a3,-(sp)
move.l (_dc_yh),d7 ; count = _dc_yh - _dc_yl
move.l (_dc_yl),d0
sub.l d0,d7
bmi .end
move.l (_dc_x),d1 ; dest = ylookup[_dc_yl] + columnofs[_dc_x]
lea (_ylookup),a0
move.l (a0,d0.l*4),a0
lea (_columnofs),a1
add.l (a1,d1.l*4),a0
move.l (_dc_colormap),d4
move.l (_dc_source),a1
move.l (_dc_iscale),d1 ; frac = _dc_texturemid + (_dc_yl-centery)*fracstep
sub.l (_centery),d0
muls.l d1,d0
add.l (_dc_texturemid),d0
moveq #$7f,d3
lea (SCREENWIDTH*4).w,a3
; d7: cnt >> 2
; a0: chunky
; a1: texture
; d0: frac (uuuu uuuu uuuu uuuu 0000 0000 0UUU UUUU)
; d1: dfrac (.......................................)
; d3: $7f
; d4: light table aligned to 256 byte boundary
; a3: SCREENWIDTH*4, the row advance per pass of the unrolled loop
;
; The loop is unrolled four times and entered part-way: (count & 3)
; picks the entry point from .tmap_tab, and .width_tab moves a0 back so
; that the first store of the first, partial pass hits the first row.
move.l d7,d6
and.w #3,d6
swap d0 ; swap decimals and fraction
swap d1
add.w (.width_tab,pc,d6.w*2),a0
lsr.w #2,d7
move.w (.tmap_tab,pc,d6.w*2),d6
and.w d3,d0
; sub.w/add.l leaves the integer word as it was and adds the step
; fraction to the fraction word; the carry of that add is left in X.
sub.w d1,d0
add.l d1,d0 ; setup the X flag
jmp (.loop,pc,d6.w)
cnop 0,4
; a0 adjustment for each value of (count & 3)
.width_tab
dc.w -3*SCREENWIDTH
dc.w -2*SCREENWIDTH
dc.w -1*SCREENWIDTH
dc.w 0
; loop entry offset for each value of (count & 3)
.tmap_tab
dc.w .0-.loop
dc.w .1-.loop
dc.w .2-.loop
dc.w .3-.loop
; Per pixel: the texel replaces the low byte of d4, which makes d4 the
; address of the colormap entry; addx.l steps the texture position and
; and.w wraps the row to 0..127; the entry is then copied to the screen.
.loop
.3
move.b (a1,d0.w),d4
addx.l d1,d0
move.l d4,a2
and.w d3,d0
move.b (a2),(a0)
.2
move.b (a1,d0.w),d4
addx.l d1,d0
move.l d4,a2
and.w d3,d0
move.b (a2),(SCREENWIDTH,a0)
.1
move.b (a1,d0.w),d4
addx.l d1,d0
move.l d4,a2
and.w d3,d0
move.b (a2),(SCREENWIDTH*2,a0)
.0
move.b (a1,d0.w),d4
addx.l d1,d0
move.l d4,a2
and.w d3,d0
move.b (a2),(SCREENWIDTH*3,a0)
add.l a3,a0
.loop_end
dbf d7,.loop
.end
movem.l (sp)+,d3-d4/d6-d7/a2/a3
rts
; This faster version by Aki M Laukkanen <amlaukka@cc.helsinki.fi>
cnop 0,4
;-----------------------------------------------------------------------
; R_DrawSpan_060 - draw one horizontal span of a 64x64 texture, high
; detail, scheduled for the 68060.
;
; In:   globals _ds_y, _ds_x1, _ds_x2, _ds_xfrac, _ds_yfrac, _ds_xstep,
;       _ds_ystep, _ds_source, _ds_colormap, _ylookup, _columnofs
; Out:  (_ds_x2 - _ds_x1 + 1) bytes written from the destination address
; Regs: d2-d7/a2/a3 are saved and restored; d0-d1/a0-a1 are used
;       without being saved
;
; Layout: up to three pixels bring the destination to a longword
; boundary, the main loop writes four pixels per longword store, and a
; byte loop finishes the (count & 3) pixels that are left.
;
; Fix:  after the main loop the code used to fall into the "setup the X
;       flag" sequence a second time. That added the x step fraction
;       once more and threw away the carry still pending in X, so the
;       tail pixels could be sampled up to one texel off. The tail loop
;       is now entered directly when the main loop ran.
;-----------------------------------------------------------------------
_R_DrawSpan_060
@R_DrawSpan_060
        movem.l d2-d7/a2/a3,-(sp)
        move.l  (_ds_y),d0
        move.l  (_ds_x1),d1             ; dest = ylookup[_ds_y] + columnofs[_ds_x1]
        lea     (_ylookup),a0
        move.l  (a0,d0.l*4),a0
        lea     (_columnofs),a1
        add.l   (a1,d1.l*4),a0
        move.l  (_ds_source),a1
        move.l  (_ds_colormap),a2
        move.l  (_ds_x2),d7             ; count = _ds_x2 - _ds_x1 + 1
        sub.l   d1,d7
        addq.l  #1,d7
        move.l  (_ds_xfrac),d0
        move.l  (_ds_yfrac),d1
        move.l  (_ds_xstep),d2
        move.l  (_ds_ystep),d3

        move.l  a0,d4
        btst    #0,d4
        beq     .skipb
        move.l  d0,d5                   ; do the unaligned pixels
        move.l  d1,d6                   ; so we can write to longword
        swap    d5                      ; boundary in the main loop
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6                   ; d6 = (y & 63)*64 + (x & 63)
        move.b  (a1,d6.w),d5
        add.l   d2,d0
        move.b  (a2,d5.w),(a0)+
        add.l   d3,d1
        move.l  a0,d4
        subq.l  #1,d7
.skipb
        btst    #1,d4
        beq     .skips
        moveq   #2,d4                   ; with two or fewer pixels left the main
        cmp.l   d4,d7                   ; loop will not run, so alignment does
        bls     .skips                  ; not matter and the tail loop does them
        move.l  d0,d5                   ; write two pixels
        move.l  d1,d6
        swap    d5
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6
        move.b  (a1,d6.w),d5
        move.w  (a2,d5.w),d4            ; first colour lands in the high byte of d4.w
        add.l   d2,d0
        add.l   d3,d1
        move.l  d0,d5
        move.l  d1,d6
        swap    d5
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6
        move.b  (a1,d6.w),d5
        move.b  (a2,d5.w),d4            ; second colour in the low byte
        add.l   d2,d0
        move.w  d4,(a0)+
        add.l   d3,d1
        subq.l  #2,d7

.skips
        move.l  d7,d6                   ; setup registers
        and.w   #3,d6
        move.l  d6,a3                   ; a3 = pixels left for the tail loop

        eor.w   d0,d1                   ; swap fraction parts for addx
        eor.w   d2,d3
        eor.w   d1,d0
        eor.w   d3,d2
        eor.w   d0,d1
        eor.w   d2,d3
        swap    d0                      ; d0 = y fraction : x integer
        swap    d1                      ; d1 = x fraction : y integer
        swap    d2                      ; d2 = y step fraction : x step integer
        swap    d3                      ; d3 = x step fraction : y step integer
        lsl.w   #6,d1                   ; y integer parts are kept pre-shifted by 6
        lsl.w   #6,d3
        moveq   #0,d6
        moveq   #0,d5
        sub.l   #$f000,a1               ; d6 holds $f000+offset as an unsigned index
        lsr.l   #2,d7
        beq     .skip_loop2
        subq.l  #1,d7
        sub.w   d3,d1                   ; integer word unchanged, x fraction
        add.l   d3,d1                   ; setup the X flag
        or.w    #$ffc0,d0
        or.w    #$f03f,d1
        move.w  d0,d6
        and.w   d1,d6
        bra     .start_loop2

        cnop    0,8
; Main loop: four pixels are collected in d4 and stored as one longword
; at the top of the next pass (or right after the loop for the last
; one). Only the addx.l instructions change X.
.loop2
        or.w    #$ffc0,d0               ; pOEP
        or.w    #$f03f,d1               ; sOEP
        move.b  (a2,d5.l),d4            ; pOEP but allows sOEP
        move.w  d0,d6                   ; sOEP
        and.w   d1,d6                   ; pOEP
        move.l  d4,(a0)+                ; sOEP
.start_loop2
        addx.l  d2,d0                   ; pOEP only
        addx.l  d3,d1                   ; pOEP only
        move.b  (a1,d6.l),d5            ; pOEP but allows sOEP
        or.w    #$ffc0,d0               ; sOEP
        or.w    #$f03f,d1               ; pOEP
        move.w  d0,d6                   ; sOEP
        move.w  (a2,d5.l),d4            ; pOEP but allows sOEP
        and.w   d1,d6                   ; sOEP
        addx.l  d2,d0                   ; pOEP only
        addx.l  d3,d1                   ; pOEP only
        move.b  (a1,d6.l),d5            ; pOEP but allows sOEP
        or.w    #$ffc0,d0               ; sOEP
        or.w    #$f03f,d1               ; pOEP
        move.w  d0,d6                   ; sOEP
        move.b  (a2,d5.l),d4            ; pOEP but allows sOEP
        and.w   d1,d6                   ; sOEP
        addx.l  d2,d0                   ; pOEP only
        addx.l  d3,d1                   ; pOEP only
        move.b  (a1,d6.l),d5            ; pOEP but allows sOEP
        or.w    #$ffc0,d0               ; sOEP
        or.w    #$f03f,d1               ; pOEP
        move.w  d0,d6                   ; sOEP
        swap    d4                      ; pOEP only
        move.w  (a2,d5.l),d4            ; pOEP but allows sOEP
        and.w   d1,d6                   ; sOEP
        addx.l  d2,d0                   ; pOEP only
        addx.l  d3,d1                   ; pOEP only
        move.b  (a1,d6.l),d5            ; pOEP but allows sOEP
        dbf     d7,.loop2               ; pOEP only = 7.75 cycles/pixel
        move.b  (a2,d5.l),d4
        move.l  d4,(a0)+
        move.l  a3,d7                   ; none of these touch X, which is
        bra     .loop_end2              ; still primed: do not step again

.skip_loop2
        sub.w   d3,d1                   ; main loop was skipped: prime X here
        add.l   d3,d1
        move.l  a3,d7
        bra     .loop_end2

; Tail loop: one pixel per pass, d7 passes.
.loop3
        or.w    #$ffc0,d0
        or.w    #$f03f,d1
        move.w  d0,d6
        and.w   d1,d6
        addx.l  d2,d0
        addx.l  d3,d1
        move.b  (a1,d6.l),d5
        move.b  (a2,d5.l),(a0)+
.loop_end2
        dbf     d7,.loop3
.end2
        movem.l (sp)+,d2-d7/a2/a3
        rts
cnop 0,4
; 030/040 version
;-----------------------------------------------------------------------
; R_DrawSpan_040 - draw one horizontal span of a 64x64 texture, high
; detail.
;
; In:   globals _ds_y, _ds_x1, _ds_x2, _ds_xfrac, _ds_yfrac, _ds_xstep,
;       _ds_ystep, _ds_source, _ds_colormap, _ylookup, _columnofs
; Out:  (_ds_x2 - _ds_x1 + 1) bytes written from the destination address
; Regs: d2-d7/a2-a4 are saved and restored; d0-d1/a0-a1 are used
;       without being saved
; Note: in the main and tail loops colormap addresses are formed by
;       overwriting the low byte of the _ds_colormap pointer, so the
;       table has to start on a 256 byte boundary.
;
; Layout: up to three pixels bring the destination to a longword
; boundary, the main loop writes four pixels per longword store, and a
; byte loop finishes whatever is left.
;
; Fix:  after the main loop the code used to fall into the "setup the X
;       flag" sequence a second time. That added the y step fraction
;       once more and threw away the carry still pending in X, so the
;       tail pixels could be sampled up to one texel off. The tail loop
;       is now entered directly when the main loop ran.
;-----------------------------------------------------------------------
_R_DrawSpan_040
@R_DrawSpan_040
        movem.l d2-d7/a2-a4,-(sp)
        move.l  (_ds_y),d0
        move.l  (_ds_x1),d1             ; dest = ylookup[_ds_y] + columnofs[_ds_x1]
        lea     (_ylookup),a0
        move.l  (a0,d0.l*4),a0
        lea     (_columnofs),a1
        add.l   (a1,d1.l*4),a0
        move.l  (_ds_source),a1
        move.l  (_ds_colormap),a2
        move.l  (_ds_x2),d7             ; count = _ds_x2 - _ds_x1 + 1
        sub.l   d1,d7
        addq.l  #1,d7
        move.l  (_ds_xfrac),d0
        move.l  (_ds_yfrac),d1
        move.l  (_ds_xstep),d2
        move.l  (_ds_ystep),d3

        move.l  a0,d4
        btst    #0,d4
        beq     .skipb
        move.l  d0,d5                   ; do the unaligned pixels
        move.l  d1,d6                   ; so we can write to longword
        swap    d5                      ; boundary in the main loop
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6                   ; d6 = (y & 63)*64 + (x & 63)
        move.b  (a1,d6.w),d5
        add.l   d2,d0
        move.b  (a2,d5.w),(a0)+
        add.l   d3,d1
        move.l  a0,d4
        subq.l  #1,d7
.skipb
        btst    #1,d4
        beq     .skips
        moveq   #2,d4                   ; with two or fewer pixels left the main
        cmp.l   d4,d7                   ; loop will not run, so alignment does
        bls     .skips                  ; not matter and the tail loop does them
        move.l  d0,d5                   ; write two pixels
        move.l  d1,d6
        swap    d5
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6
        move.b  (a1,d6.w),d5
        move.w  (a2,d5.w),d4            ; first colour lands in the high byte of d4.w
        add.l   d2,d0
        add.l   d3,d1
        move.l  d0,d5
        move.l  d1,d6
        swap    d5
        swap    d6
        and.w   #$3f,d5
        and.w   #$3f,d6
        lsl.w   #6,d6
        or.w    d5,d6
        move.b  (a1,d6.w),d5
        move.b  (a2,d5.w),d4            ; second colour in the low byte
        add.l   d2,d0
        move.w  d4,(a0)+
        add.l   d3,d1
        subq.l  #2,d7

.skips
        move.l  a2,d4                   ; d4 = colormap base, low byte replaced per texel
        add.l   #$1000,a1               ; the index word below is $f000+offset, i.e.
                                        ; offset-$1000 as a signed 16 bit index
        move.l  a0,a3
        add.l   d7,a3                   ; a3 = end of the span
        move.l  d7,d5
        and.b   #~3,d5
        move.l  a0,a4
        add.l   d5,a4                   ; a4 = end of the part done 4 bytes at a time

        eor.w   d0,d1                   ; swap fraction parts for addx
        eor.w   d2,d3
        eor.w   d1,d0
        eor.w   d3,d2
        eor.w   d0,d1
        eor.w   d2,d3
        swap    d0                      ; d0 = y fraction : x integer
        swap    d1                      ; d1 = x fraction : y integer
        swap    d2                      ; d2 = y step fraction : x step integer
        swap    d3                      ; d3 = x step fraction : y step integer
        lsl.w   #6,d1                   ; y integer parts are kept pre-shifted by 6
        lsl.w   #6,d3
        move.w  #$ffc0,d6               ; or-mask for the x word
        move.w  #$f03f,d7               ; or-mask for the y word
        lsr.w   #2,d5
        beq     .skip_loop2
        sub.w   d2,d0                   ; integer word unchanged, y fraction
        add.l   d2,d0                   ; setup the X flag

; Main loop: four pixels are collected in d5 and stored as one longword.
; Only the addx.l instructions change X, so the carry out of one
; fraction add reaches the integer add of the other coordinate.
.loop2
        or.w    d6,d0
        or.w    d7,d1
        and.w   d1,d0
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        addx.l  d2,d0
        move.l  d4,a2
        move.w  (a2),d5                 ; colour 0 lands in the high byte of d5.w
        or.w    d6,d0
        or.w    d7,d1
        and.w   d1,d0
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        addx.l  d2,d0
        move.l  d4,a2
        move.b  (a2),d5                 ; colour 1 in the low byte
        swap    d5
        or.w    d6,d0
        or.w    d7,d1
        and.w   d1,d0
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        addx.l  d2,d0
        move.l  d4,a2
        move.w  (a2),d5
        or.w    d6,d0
        or.w    d7,d1
        and.w   d1,d0
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        addx.l  d2,d0
        move.l  d4,a2
        move.b  (a2),d5
        move.l  d5,(a0)+
        cmp.l   a0,a4                   ; cmp leaves X alone
        bne     .loop2
        bra     .loop_end2              ; X is still primed, do not step again

.skip_loop2
        sub.w   d2,d0                   ; main loop was skipped: prime X here
        add.l   d2,d0
        bra     .loop_end2

; Tail loop: one pixel per pass.
.loop3
        or.w    d6,d0
        or.w    d7,d1
        and.w   d1,d0
        addx.l  d3,d1
        move.b  (a1,d0.w),d4
        addx.l  d2,d0
        move.l  d4,a2
        move.b  (a2),(a0)+
.loop_end2
        cmp.l   a0,a3
        bne     .loop3
.end2
        movem.l (sp)+,d2-d7/a2-a4
        rts
;***********************************************************************
end