core vdp, arm rendering speed optimisation

This commit is contained in:
kub 2024-07-18 21:36:43 +02:00
parent 1fad746a1f
commit 11a1966bf3
2 changed files with 85 additions and 96 deletions

View file

@ -304,34 +304,34 @@ TileFlipMakerAS(TileFlipSH_AS_and, pix_sh_as_and)
// -------------------------------------------- // --------------------------------------------
#ifndef _ASM_DRAW_C #ifndef _ASM_DRAW_C
#define DrawTile(mask) { \ #define DrawTile(mask) { \
if (code!=oldcode) { \ if (code!=oldcode) { \
oldcode = code; \ oldcode = code; \
\ \
pack = 0; \ pack = 0; \
if (code != blank) { \ if (code != blank) { \
/* Get tile address/2: */\ /* Get tile address/2: */ \
u32 addr = ((code&0x7ff)<<4) + ty; \ u32 addr = ((code&0x7ff)<<4) + ty; \
if (code & 0x1000) addr ^= 0xe; /* Y-flip */ \ if (code & 0x1000) addr ^= 0xe; /* Y-flip */ \
\ \
pal = ((code>>9)&0x30) | sh; /* shadow */ \ pal = ((code>>9)&0x30) | sh; /* shadow */ \
\ \
pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \ pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \
if (!pack) \ if (!pack) \
blank = code; \ blank = code; \
} \ } \
} \ } \
\ \
if (code & 0x8000) { /* (un-forced) high priority tile */ \ if (code & 0x8000) { /* (un-forced) high priority tile */ \
if (sh | (pack&mask)) { \ if (sh | (pack&mask)) { \
code |= (dx<<16) | (ty<<25); \ code |= (dx<<16) | (ty<<25); \
if (code & 0x1000) code ^= 0xe<<25; \ if (code & 0x1000) code ^= 0xe<<25; \
*hc++ = code, *hc++ = pack&mask; /* cache it */ \ *hc++ = code, *hc++ = pack&mask; /* cache it */ \
} \ } \
} else if (pack&mask) { \ } else if (pack&mask) { \
if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal); \ if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal); \
else TileNorm(pd + dx, pack&mask, pal); \ else TileNorm(pd + dx, pack&mask, pal); \
} \ } \
} }
static void DrawStrip(struct TileStrip *ts, int lflags, int cellskip) static void DrawStrip(struct TileStrip *ts, int lflags, int cellskip)
@ -478,34 +478,34 @@ static void DrawStripVSRam(struct TileStrip *ts, int plane_sh, int cellskip)
} }
#endif #endif
#define DrawTileInterlace(mask) { \ #define DrawTileInterlace(mask) { \
if (code!=oldcode) { \ if (code!=oldcode) { \
oldcode = code; \ oldcode = code; \
\ \
pack = 0; \ pack = 0; \
if (code != blank) { \ if (code != blank) { \
/* Get tile address/2: */ \ /* Get tile address/2: */ \
u32 addr = ((code&0x3ff)<<5) + ty; \ u32 addr = ((code&0x3ff)<<5) + ty; \
if (code & 0x1000) addr ^= 0x1e; /* Y-flip */ \ if (code & 0x1000) addr ^= 0x1e; /* Y-flip */ \
\ \
pal = ((code>>9)&0x30) | sh; /* shadow */ \ pal = ((code>>9)&0x30) | sh; /* shadow */ \
\ \
pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \ pack = CPU_LE2(*(u32 *)(PicoMem.vram + addr)); \
if (!pack) \ if (!pack) \
blank = code; \ blank = code; \
} \ } \
} \ } \
\ \
if (code & 0x8000) { /* high priority tile */ \ if (code & 0x8000) { /* high priority tile */ \
if (sh | (pack&mask)) { \ if (sh | (pack&mask)) { \
code = (code&0xfc00) | ((code&0x3ff)<<1) | (dx<<16) | (ty<<25); \ code = (code&0xfc00) | ((code&0x3ff)<<1) | (dx<<16) | (ty<<25); \
if (code & 0x1000) code ^= 0x1e<<25; \ if (code & 0x1000) code ^= 0x1e<<25; \
*hc++ = code, *hc++ = pack&mask; /* cache it */ \ *hc++ = code, *hc++ = pack&mask; /* cache it */ \
} \ } \
} else if (pack&mask) { \ } else if (pack&mask) { \
if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal); \ if (code & 0x0800) TileFlip(pd + dx, pack&mask, pal); \
else TileNorm(pd + dx, pack&mask, pal); \ else TileNorm(pd + dx, pack&mask, pal); \
} \ } \
} }
#ifndef _ASM_DRAW_C #ifndef _ASM_DRAW_C

View file

@ -433,7 +433,7 @@ DrawLayer:
movs r3, r9, lsl #1 @ (force[31]|sh[30]) << 1 movs r3, r9, lsl #1 @ (force[31]|sh[30]) << 1
mov r3, #0 mov r3, #0
orrmi r10,r10, #1<<23 @ r10=cells[31:24]|sh[23]|hi_not_empty[22] orrmi r10,r10, #1<<23 @ r10=cells[31:24]|sh[23]|hi_not_empty[22]
@ orrcc r10,r10, #1<<20 @ |had_output[21]|!force[20]|hscroll[19:17]|ty[15:0] @ orrcc r10,r10, #1<<20 @ |had_output[21]|!force[20]|hscroll[18:16]|ty[15:0]
movmi r3, #0x80 @ default to shadowed pal on sh mode movmi r3, #0x80 @ default to shadowed pal on sh mode
and r4, r7, #7 and r4, r7, #7
@ -452,7 +452,7 @@ DrawLayer:
mvn r9, #0 @ r9=prevcode=-1 mvn r9, #0 @ r9=prevcode=-1
add r1, r11, r7 @ r1=pdest add r1, r11, r7 @ r1=pdest
@ r10=cells[31:24]|sh[23]|hi_not_empty[22]|had_output[21]|!force[20]|hscroll[19:17]|ty[15:0] @ r10=cells[31:24]|sh[23]|hi_not_empty[22]|had_output[21]|!force[20]|hscroll[18:16]|ty[15:0]
@ r1=pd+dx r2=pack r3=pal r5=xmask r6=hc r8=tilex r9=prevcode r11=HighCol r12=nametab lr=vram @ r1=pd+dx r2=pack r3=pal r5=xmask r6=hc r8=tilex r9=prevcode r11=HighCol r12=nametab lr=vram
@ r4 & r7 are scratch in this loop @ r4 & r7 are scratch in this loop
@ -467,21 +467,19 @@ DrawLayer:
add r8, r8, #1 add r8, r8, #1
movs r2, r9, lsl #20 @ if (code&0x1000) tst r9, #0x1000 @ if (code&0x1000)
mov r2, r2, lsl #1 mov r2, r9, lsl #21
add r2, r2, r10, lsl #17 add r2, r2, r10, lsl #17
mov r2, r2, lsr #17 eorne r2, r2, #0xe<<17 @ if (code&0x1000) addr^=0xe;
eorcs r2, r2, #0x0e @ if (code&0x1000) addr^=0xe;
ldr r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels ldr r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
mvn r7, #0 mvn r7, #0
mov r4, r4, lsr #16-2 @ (dx&7)*4 mov r4, r4, lsr #16-2 @ (dx&7)*4
tst r9, #0x0800 tst r9, #0x0800
moveq r7, r7, lsl r4 @ mask = ~0 [shift] (dx&7)*4 moveq r7, r7, lsl r4 @ mask = ~0 [shift] (dx&7)*4
movne r7, r7, lsr r4 movne r7, r7, lsr r4
mvn r7, r7, ror #16 bic r2, r2, r7, ror #16 @ pack&~mask
and r2, r2, r7 @ pack&mask
orr r9, r9, #0x80000000 @ invalidate oldcode since pack is masked orr r9, r9, #0x80000000 @ invalidate oldcode since pack is masked
b .DrawStrip_samecode b .DrawStrip_samecode
@ -504,13 +502,12 @@ DrawLayer:
mov r9, r7 @ remember code mov r9, r7 @ remember code
movs r2, r9, lsl #20 @ if (code&0x1000) tst r9, #0x1000 @ if (code&0x1000)
mov r2, r2, lsl #1 mov r2, r9, lsl #21
add r2, r2, r10, lsl #17 add r2, r2, r10, lsl #17
mov r2, r2, lsr #17 eorne r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe;
eorcs r2, r2, #0x0e @ if (code&0x1000) addr^=0xe;
ldr r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels ldr r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
.DrawStrip_samecode: .DrawStrip_samecode:
tst r9, #0x8000 tst r9, #0x8000
@ tstne r10, #1<<20 @ !force[20] @ tstne r10, #1<<20 @ !force[20]
@ -577,21 +574,19 @@ DrawLayer:
add r1, r1, #8 add r1, r1, #8
movs r2, r9, lsl #20 @ if (code&0x1000) tst r9, #0x1000 @ if (code&0x1000)
mov r2, r2, lsl #1 mov r2, r9, lsl #21
add r2, r2, r10, lsl #17 add r2, r2, r10, lsl #17
mov r2, r2, lsr #17 eorne r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe;
eorcs r2, r2, #0x0e @ if (code&0x1000) addr^=0xe;
ldr r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels ldr r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
mvn r7, #0 mvn r7, #0
mov r4, r4, lsr #16-2 @ (dx&7)*4 mov r4, r4, lsr #16-2 @ (dx&7)*4
tst r9, #0x0800 tst r9, #0x0800
moveq r7, r7, lsl r4 @ mask = ~0 [shift] (dx&7)*4 moveq r7, r7, lsl r4 @ mask = ~0 [shift] (dx&7)*4
movne r7, r7, lsr r4 movne r7, r7, lsr r4
mov r7, r7, ror #16 and r2, r2, r7, ror #16 @ pack&mask
and r2, r2, r7 @ pack&mask
bic r10,r10, #7<<16 bic r10,r10, #7<<16
b .DrawStrip_samecode @ one last time, with last tile now masked b .DrawStrip_samecode @ one last time, with last tile now masked
@ -743,13 +738,12 @@ DrawLayer:
mov r9, r7 @ remember code mov r9, r7 @ remember code
movs r2, r9, lsl #20 @ if (code&0x1000) tst r9, #0x1000 @ if (code&0x1000)
mov r2, r2, lsl #1 mov r2, r9, lsl #21
add r2, r2, r10, lsl #17 add r2, r2, r10, lsl #17
mov r2, r2, lsr #17 eorne r2, r2, #0x0e<<17 @ if (code&0x1000) addr^=0xe;
eorcs r2, r2, #0x0e @ if (code&0x1000) addr^=0xe;
ldr r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels ldr r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels
.DrawStrip_vs_samecode: .DrawStrip_vs_samecode:
tst r9, #0x8000 tst r9, #0x8000
@ -1007,8 +1001,7 @@ DrawTilesFromCache:
tst r6, #0x0800 @ flipped? tst r6, #0x0800 @ flipped?
moveq r12,r12, lsl r4 moveq r12,r12, lsl r4
movne r12,r12, lsr r4 movne r12,r12, lsr r4
mov r12,r12, ror #16 and r2, r2, r12, ror #16
and r2, r2, r12
mov r12,#0xf mov r12,#0xf
tst r8, #1 tst r8, #1
bne .dtfc_shadow bne .dtfc_shadow
@ -1152,9 +1145,7 @@ DrawSpriteSHi:
cmp r0, #328 cmp r0, #328
bge DrawSpriteSHi bge DrawSpriteSHi
mov r8, r8, lsl #17 bic r8, r8, #0xf8000 @ tile&=0x7fff; // Clip tile address (clears bits 19:15 only; NOTE(review): assumes bits 31:20 of r8 are already 0 - confirm)
mov r8, r8, lsr #17 @ tile&=0x7fff; // Clip tile address
ldr r2, [lr, r8, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels ldr r2, [lr, r8, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels
add r1, r11, r0 @ r1=pdest add r1, r11, r0 @ r1=pdest
tst r2, r2 tst r2, r2
@ -1346,9 +1337,7 @@ DrawSprite:
cmp r0, #328 cmp r0, #328
bge DrawSprite bge DrawSprite
mov r8, r8, lsl #17 bic r8, r8, #0xf8000 @ tile&=0x7fff; // Clip tile address (clears bits 19:15 only; NOTE(review): assumes bits 31:20 of r8 are already 0 - confirm)
mov r8, r8, lsr #17 @ tile&=0x7fff; // Clip tile address
ldr r2, [lr, r8, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels ldr r2, [lr, r8, lsl #1] @ pack=*(unsigned int *)(PicoMem.vram+addr); // Get 8 pixels
add r1, r11, r0 @ r1=pdest add r1, r11, r0 @ r1=pdest
tst r2, r2 tst r2, r2
@ -1492,16 +1481,16 @@ DrawWindow:
mov r9, r7 @ remember code mov r9, r7 @ remember code
movs r2, r9, lsl #20 @ if (code&0x1000) tst r9, #0x1000 @ if (code&0x1000)
mov r2, r2, lsl #1 mov r2, r9, lsl #21
add r2, r10, r2, lsr #17 @ r2=addr=(code&0x7ff)<<4; addr+=ty add r2, r2, r10, lsl #17
eorcs r2, r2, #0x0e @ if (code&0x1000) addr^=0xe; eorne r2, r2, #0xe<<17 @ if (code&0x1000) addr^=0xe;
ldr r2, [lr, r2, lsr #16] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
and r3, r9, #0x6000 and r3, r9, #0x6000
mov r3, r3, lsr #9 @ r3=pal=((code&0x6000)>>9); mov r3, r3, lsr #9 @ r3=pal=((code&0x6000)>>9);
ldr r2, [lr, r2, lsl #1] @ pack=*(unsigned int *)(Pico.vram+addr); // Get 8 pixels
.dw_samecode: .dw_samecode:
tst r6, #0x100 tst r6, #0x100
bne .dw_shadow bne .dw_shadow