speed improvement and fixes for 32x ARM asm draw

This commit is contained in:
kub 2019-04-25 19:02:29 +02:00
parent 83bafe8e0b
commit 08626dab12
5 changed files with 118 additions and 60 deletions

View file

@ -311,11 +311,6 @@ void PicoDraw32xLayerMdOnly(int offs, int lines)
void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode)
{
#ifdef _ASM_32X_DRAW
extern void *Pico32xNativePal;
Pico32xNativePal = Pico32xMem->pal_native;
#endif
if (which == PDF_RGB555) {
// need CLUT pixels in PicoDraw2FB for layer transparency
PicoDrawSetInternalBuf(Pico.est.Draw2FB, 328);

View file

@ -13,12 +13,6 @@
.equiv P32XV_PRI, (1<< 7)
.bss
.align 2
.global Pico32xNativePal
Pico32xNativePal:
.word 0
.text
.align 2
@ -82,8 +76,8 @@ Pico32xNativePal:
mov r3, r3, lsl #26 @ mdbg << 26
mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data
tst r10,#P32XV_PRI
moveq r10,#0
movne r10,#0x8000 @ r10 = inv_bit
movne r10,#0
moveq r10,#0x8000 @ r10 = inv_bit
call_scan_prep \call_scan lr
mov r4, #0 @ line
@ -92,7 +86,6 @@ Pico32xNativePal:
0: @ loop_outer:
call_scan_end \call_scan
add r4, r4, #1
sub r11,r11,#1 @ adjust for prev read
cmp r4, r2, lsr #16
call_scan_fin_ge \call_scan
ldmgefd sp!, {r4-r11,pc}
@ -106,31 +99,86 @@ Pico32xNativePal:
add r5, r1, r12, lsl #1 @ p32x = dram + dram[l]
2: @ loop_inner:
ldrb r7, [r11], #1 @ MD pixel
subs r6, r6, #1
ldrh r8, [r5], #2
subs lr, r6, #1
blt 0b @ loop_outer
ldrh r8, [r5], #2 @ 32x pixel
cmp r3, r7, lsl #26 @ MD has bg pixel?
beq 3f @ draw32x
eor r12,r8, r10
ands r12,r12,#0x8000 @ !((t ^ inv) & 0x8000)
.if \do_md
mov r7, r7, lsl #1
ldreqh r12,[r9, r7]
streqh r12,[r0], #2 @ *dst++ = palmd[*pmd]
.else
addeq r0, r0, #2
.endif
beq 2b @ loop_inner
3: @ draw32x:
and r12,r8, #0x03e0
3: @ loop_innermost:
ldrh r7, [r5], #2 @ 32x pixel
subs lr, lr, #1
cmpge r7, r8
beq 3b @ loop_innermost
sub r5, r5, #2
add lr, lr, #1
sub lr, r6, lr
sub r6, r6, lr
eor r12,r8, r10
tst r12, #0x8000 @ !((t ^ inv) & 0x8000)
bne 5f @ draw_md
and r7 ,r8, #0x03e0
mov r8, r8, lsl #11
orr r8, r8, r8, lsr #(10+11)
orr r8, r8, r12,lsl #1
orr r8, r8, r7 ,lsl #1
bic r8, r8, #0x0020 @ kill prio bit
add r11,r11,lr
tst r0, #2 @ dst unaligned?
strneh r8, [r0], #2
subne lr, lr, #1
cmp lr, #0
beq 2b @ loop_inner
mov r8, r8, lsl #16
orr r12,r8, r8, lsr #16
mov r8 ,r12
4: @ draw_32x:
subs lr, lr, #4 @ store 4 pixels
stmgeia r0!, {r8, r12}
bgt 4b @ draw_32x
beq 2b @ loop_inner
adds lr, lr, #2 @ store 1-3 leftover pixels
strge r8, [r0], #4
strneh r8, [r0], #2
b 2b @ loop_inner
5: @ draw_md:
subs lr, lr, #1
ldrgeb r7, [r11], #1 @ MD pixel
blt 2b @ loop_inner
cmp r3, r7, lsl #26 @ MD has bg pixel?
.if \do_md
mov r7, r7, lsl #1
ldrneh r7 ,[r9, r7]
strneh r7 ,[r0], #2 @ *dst++ = palmd[*pmd]
.else
addne r0, r0, #2
.endif
bne 5b @ draw_md
and r7 ,r8, #0x03e0
mov r8, r8, lsl #11
orr r8, r8, r8, lsr #(10+11)
orr r8, r8, r7 ,lsl #1
bic r8, r8, #0x0020 @ kill prio bit
strh r8, [r0], #2 @ *dst++ = bgr2rgb(*p32x++)
b 2b @ loop_inner
6: @ draw_md_32x:
subs lr, lr, #1
ldrgeb r7, [r11], #1 @ MD pixel
blt 2b @ loop_inner
cmp r3, r7, lsl #26 @ MD has bg pixel?
.if \do_md
mov r7, r7, lsl #1
ldrneh r7 ,[r9, r7] @ *dst++ = palmd[*pmd]
moveq r7 ,r8 @ *dst++ = bgr2rgb(*p32x++)
strh r7 ,[r0], #2
.else
streqh r8, [r0] @ *dst++ = bgr2rgb(*p32x++)
add r0, r0, #2
.endif
b 6b @ draw_md_32x
.endm
@ -144,9 +192,11 @@ Pico32xNativePal:
stmfd sp!, {r4-r11,lr}
ldr lr,=Pico
ldr r10,=Pico32xNativePal
ldr r10,=Pico32xMem
ldr r9,=OFS_PMEM32x_pal_native
ldr r10, [r10]
ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB]
ldr r10,[r10]
add r10,r10,r9
add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd
and r4, r2, #0xff
@ -184,7 +234,7 @@ Pico32xNativePal:
ldrneb r8, [r5, #2]! @ r7,r8 - pixel 0,1 index
subs r6, r6, #1
blt 0b @ loop_outer
cmp r7, r8 @ is this really improving things?
cmp r7, r8
beq 5f @ check_fill @ +8
3: @ no_fill:
@ -204,11 +254,11 @@ Pico32xNativePal:
ldrneh r7, [r9, r12] @ t = palmd[pmd[0]]
tst lr, #0x20
ldrneb lr, [r11,#-1] @ MD pixel 1
strh r7, [r0], #2
cmpne r3, lr, lsl #26 @ MD has bg pixel?
mov lr, lr, lsl #1
ldrneh r8, [r9, lr] @ t = palmd[pmd[1]]
strh r8, [r0], #2
orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth
str r7, [r0], #4 @ (no write combining on ARM9)
.else
streqh r7, [r0]
tst lr, #0x20
@ -219,18 +269,21 @@ Pico32xNativePal:
.endif
b 2b @ loop_inner
5: @ check_fill
5: @ check_fill:
@ count pixels, align if needed
bic r12,r5, #1
ldrh lr ,[r12, #2] @ only do this for at least 4 pixels
ldrh r12,[r12]
orr r12,lr,r12, lsl #16
orr lr, r7, r7, lsl #8
orr lr, lr, lr, lsl #16
cmp r12,lr
bne 3b @ no_fill
tst r5, #1
sub lr, r5, #2 @ starting r5 (32x render data start)
addeq r5, r5, #2
addne r5, r5, #1 @ add for the check above
addeq r5, r5, #4
addne r5, r5, #3 @ add for the check above
add r6, r6, #1 @ restore from dec
orr r7, r7, r7, lsl #8
6:
@ -240,11 +293,12 @@ Pico32xNativePal:
ldrh r12,[r5], #2
bge 7f @ count_done
cmp r8, r7
subne r5, r5, #2 @ undo readahead
cmpeq r12,r7
beq 6b
7: @ count_done
sub r5, r5, #4 @ undo readahead
7: @ count_done:
sub r5, r5, #2 @ undo readahead
@ fix alignment and check type
sub r8, r5, lr
@ -262,11 +316,15 @@ Pico32xNativePal:
beq 9f @ bg_mode
add r11,r11,r8
8:
subs r8, r8, #2
strgeh r7, [r0], #2
strgeh r7, [r0], #2
bgt 8b
orr r12,r7, r7, lsl #16
mov r7 ,r12
8: @ 32x_loop:
subs r8, r8, #4 @ store 4 pixels
stmgeia r0!, {r7, r12}
bgt 8b @ 32x_loop
beq 2b @ loop_inner
adds r8, r8, #2
strge r7, [r0], #4 @ store 2 leftover pixels
b 2b @ loop_inner
9: @ bg_mode:
@ -281,8 +339,8 @@ Pico32xNativePal:
mov lr, lr, lsl #1
ldrneh lr, [r9, lr]
moveq lr, r7
strh r12,[r0], #2
strh lr, [r0], #2
orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth
str r12,[r0], #4 @ (no write combining on ARM9)
.else
streqh r7, [r0]
cmp r3, lr, lsl #26 @ MD pixel 1 has bg?
@ -303,9 +361,11 @@ Pico32xNativePal:
stmfd sp!, {r4-r11,lr}
ldr lr,=Pico
ldr r10,=Pico32xNativePal
ldr r10,=Pico32xMem
ldr r9,=OFS_PMEM32x_pal_native
ldr r10, [r10]
ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB]
ldr r10,[r10]
add r10,r10,r9
add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd
and r4, r2, #0xff
@ -320,7 +380,6 @@ Pico32xNativePal:
0: @ loop_outer:
call_scan_end \call_scan
add r4, r4, #1
sub r11,r11,#1 @ adjust for prev read
cmp r4, r2, lsr #16
call_scan_fin_ge \call_scan
ldmgefd sp!, {r4-r11,pc}
@ -341,13 +400,13 @@ Pico32xNativePal:
eor lr, lr, #0x20
3: @ loop_innermost:
ldrb r7, [r11], #1 @ MD pixel
subs r6, r6, #1
ldrgeb r7, [r11], #1 @ MD pixel
blt 0b @ loop_outer
cmp r3, r7, lsl #26 @ MD has bg pixel?
mov r7, r7, lsl #1
tstne lr, #0x20
tst lr, #0x20
cmpne r3, r7, lsl #26 @ MD has bg pixel?
.if \do_md
mov r7, r7, lsl #1
ldrneh r12,[r9, r7] @ t = palmd[*pmd]
streqh lr, [r0], #2
strneh r12,[r0], #2 @ *dst++ = t
@ -365,15 +424,18 @@ make_do_loop_dc do_loop_dc, 0, 0
@ Instantiate the draw-loop macros, one exported routine per line.
@ NOTE(review): the two numeric arguments appear to be the call_scan and do_md
@ flags tested inside the macro bodies (names containing "_scan" pass 1 first,
@ names ending in "_md" pass 1 second) - confirm against the .macro headers.
make_do_loop_dc do_loop_dc_md, 0, 1
make_do_loop_dc do_loop_dc_scan, 1, 0
make_do_loop_dc do_loop_dc_scan_md, 1, 1
@ Flush the literal pool after this group: the routines load constants with
@ "ldr rX, =..." (e.g. =Pico, =Pico32xMem, =OFS_PMEM32x_pal_native), and ARM
@ pc-relative literal loads only reach about 4 KB, so the pool has to be
@ emitted close to the code that uses it.
.pool
make_do_loop_pp do_loop_pp, 0, 0
make_do_loop_pp do_loop_pp_md, 0, 1
make_do_loop_pp do_loop_pp_scan, 1, 0
make_do_loop_pp do_loop_pp_scan_md, 1, 1
@ Literal pool for the make_do_loop_pp expansions above.
.pool
make_do_loop_rl do_loop_rl, 0, 0
make_do_loop_rl do_loop_rl_md, 0, 1
make_do_loop_rl do_loop_rl_scan, 1, 0
make_do_loop_rl do_loop_rl_scan_md, 1, 1
@ Literal pool for the make_do_loop_rl expansions above.
.pool
@ vim:filetype=armasm

View file

@ -1364,8 +1364,8 @@ static void FinalizeLine8bit(int sh, int line, struct PicoEState *est)
{
// a hack for mid-frame palette changes
if (!(est->rendstatus & PDRAW_SONIC_MODE) || line - dirty_line > 4) {
// store a maximum of 3 additional palettes in SonicPal
if (est->SonicPalCount < 3)
// store a maximum of 2 additional palettes in SonicPal
if (est->SonicPalCount < 2)
est->SonicPalCount ++;
dirty_line = line;
est->rendstatus |= PDRAW_SONIC_MODE;

View file

@ -84,6 +84,7 @@ get_define OFS_EST_ PicoEState HighPal ; echo "$line" >>$fn
# Each line calls get_define with a name prefix, a type/object name and a member
# name, then appends the resulting $line to the file named by $fn.
# NOTE(review): get_define is defined outside this view; presumably it sets
# $line to a "<prefix><member>" offset definition - confirm.
get_define OFS_PMEM_ PicoMem vram ; echo "$line" >>$fn
get_define OFS_PMEM_ PicoMem vsram ; echo "$line" >>$fn
# OFS_PMEM32x_pal_native is consumed by the 32x ARM draw code, which adds it to
# the pointer loaded from Pico32xMem to reach the pal_native table.
get_define OFS_PMEM32x_ Pico32xMem pal_native ; echo "$line" >>$fn
get_define OFS_SH2_ SH2_ is_slave ; echo "$line" >>$fn
get_define OFS_SH2_ SH2_ p_bios ; echo "$line" >>$fn