mirror of
https://github.com/RaySollium99/picodrive.git
synced 2025-09-05 15:27:46 -04:00
speed improvement and fixes for 32x ARM asm draw
This commit is contained in:
parent
83bafe8e0b
commit
08626dab12
5 changed files with 118 additions and 60 deletions
|
@ -311,11 +311,6 @@ void PicoDraw32xLayerMdOnly(int offs, int lines)
|
||||||
|
|
||||||
void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode)
|
void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode)
|
||||||
{
|
{
|
||||||
#ifdef _ASM_32X_DRAW
|
|
||||||
extern void *Pico32xNativePal;
|
|
||||||
Pico32xNativePal = Pico32xMem->pal_native;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (which == PDF_RGB555) {
|
if (which == PDF_RGB555) {
|
||||||
// need CLUT pixels in PicoDraw2FB for layer transparency
|
// need CLUT pixels in PicoDraw2FB for layer transparency
|
||||||
PicoDrawSetInternalBuf(Pico.est.Draw2FB, 328);
|
PicoDrawSetInternalBuf(Pico.est.Draw2FB, 328);
|
||||||
|
|
|
@ -13,12 +13,6 @@
|
||||||
|
|
||||||
.equiv P32XV_PRI, (1<< 7)
|
.equiv P32XV_PRI, (1<< 7)
|
||||||
|
|
||||||
.bss
|
|
||||||
.align 2
|
|
||||||
.global Pico32xNativePal
|
|
||||||
Pico32xNativePal:
|
|
||||||
.word 0
|
|
||||||
|
|
||||||
.text
|
.text
|
||||||
.align 2
|
.align 2
|
||||||
|
|
||||||
|
@ -82,8 +76,8 @@ Pico32xNativePal:
|
||||||
mov r3, r3, lsl #26 @ mdbg << 26
|
mov r3, r3, lsl #26 @ mdbg << 26
|
||||||
mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data
|
mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data
|
||||||
tst r10,#P32XV_PRI
|
tst r10,#P32XV_PRI
|
||||||
moveq r10,#0
|
movne r10,#0
|
||||||
movne r10,#0x8000 @ r10 = inv_bit
|
moveq r10,#0x8000 @ r10 = inv_bit
|
||||||
call_scan_prep \call_scan lr
|
call_scan_prep \call_scan lr
|
||||||
|
|
||||||
mov r4, #0 @ line
|
mov r4, #0 @ line
|
||||||
|
@ -92,7 +86,6 @@ Pico32xNativePal:
|
||||||
0: @ loop_outer:
|
0: @ loop_outer:
|
||||||
call_scan_end \call_scan
|
call_scan_end \call_scan
|
||||||
add r4, r4, #1
|
add r4, r4, #1
|
||||||
sub r11,r11,#1 @ adjust for prev read
|
|
||||||
cmp r4, r2, lsr #16
|
cmp r4, r2, lsr #16
|
||||||
call_scan_fin_ge \call_scan
|
call_scan_fin_ge \call_scan
|
||||||
ldmgefd sp!, {r4-r11,pc}
|
ldmgefd sp!, {r4-r11,pc}
|
||||||
|
@ -106,31 +99,86 @@ Pico32xNativePal:
|
||||||
add r5, r1, r12, lsl #1 @ p32x = dram + dram[l]
|
add r5, r1, r12, lsl #1 @ p32x = dram + dram[l]
|
||||||
|
|
||||||
2: @ loop_inner:
|
2: @ loop_inner:
|
||||||
ldrb r7, [r11], #1 @ MD pixel
|
ldrh r8, [r5], #2
|
||||||
subs r6, r6, #1
|
subs lr, r6, #1
|
||||||
blt 0b @ loop_outer
|
blt 0b @ loop_outer
|
||||||
ldrh r8, [r5], #2 @ 32x pixel
|
|
||||||
cmp r3, r7, lsl #26 @ MD has bg pixel?
|
|
||||||
beq 3f @ draw32x
|
|
||||||
eor r12,r8, r10
|
|
||||||
ands r12,r12,#0x8000 @ !((t ^ inv) & 0x8000)
|
|
||||||
.if \do_md
|
|
||||||
mov r7, r7, lsl #1
|
|
||||||
ldreqh r12,[r9, r7]
|
|
||||||
streqh r12,[r0], #2 @ *dst++ = palmd[*pmd]
|
|
||||||
.else
|
|
||||||
addeq r0, r0, #2
|
|
||||||
.endif
|
|
||||||
beq 2b @ loop_inner
|
|
||||||
|
|
||||||
3: @ draw32x:
|
3: @ loop_innermost:
|
||||||
and r12,r8, #0x03e0
|
ldrh r7, [r5], #2 @ 32x pixel
|
||||||
|
subs lr, lr, #1
|
||||||
|
cmpge r7, r8
|
||||||
|
beq 3b @ loop_innermost
|
||||||
|
|
||||||
|
sub r5, r5, #2
|
||||||
|
add lr, lr, #1
|
||||||
|
sub lr, r6, lr
|
||||||
|
sub r6, r6, lr
|
||||||
|
|
||||||
|
eor r12,r8, r10
|
||||||
|
tst r12, #0x8000 @ !((t ^ inv) & 0x8000)
|
||||||
|
bne 5f @ draw_md
|
||||||
|
|
||||||
|
and r7 ,r8, #0x03e0
|
||||||
mov r8, r8, lsl #11
|
mov r8, r8, lsl #11
|
||||||
orr r8, r8, r8, lsr #(10+11)
|
orr r8, r8, r8, lsr #(10+11)
|
||||||
orr r8, r8, r12,lsl #1
|
orr r8, r8, r7 ,lsl #1
|
||||||
|
bic r8, r8, #0x0020 @ kill prio bit
|
||||||
|
|
||||||
|
add r11,r11,lr
|
||||||
|
tst r0, #2 @ dst unaligned?
|
||||||
|
strneh r8, [r0], #2
|
||||||
|
subne lr, lr, #1
|
||||||
|
cmp lr, #0
|
||||||
|
beq 2b @ loop_inner
|
||||||
|
mov r8, r8, lsl #16
|
||||||
|
orr r12,r8, r8, lsr #16
|
||||||
|
mov r8 ,r12
|
||||||
|
4: @ draw_32x:
|
||||||
|
subs lr, lr, #4 @ store 4 pixels
|
||||||
|
stmgeia r0!, {r8, r12}
|
||||||
|
bgt 4b @ draw_32x
|
||||||
|
beq 2b @ loop_inner
|
||||||
|
adds lr, lr, #2 @ store 1-3 leftover pixels
|
||||||
|
strge r8, [r0], #4
|
||||||
|
strneh r8, [r0], #2
|
||||||
|
b 2b @ loop_inner
|
||||||
|
|
||||||
|
5: @ draw_md:
|
||||||
|
subs lr, lr, #1
|
||||||
|
ldrgeb r7, [r11], #1 @ MD pixel
|
||||||
|
blt 2b @ loop_inner
|
||||||
|
cmp r3, r7, lsl #26 @ MD has bg pixel?
|
||||||
|
.if \do_md
|
||||||
|
mov r7, r7, lsl #1
|
||||||
|
ldrneh r7 ,[r9, r7]
|
||||||
|
strneh r7 ,[r0], #2 @ *dst++ = palmd[*pmd]
|
||||||
|
.else
|
||||||
|
addne r0, r0, #2
|
||||||
|
.endif
|
||||||
|
bne 5b @ draw_md
|
||||||
|
|
||||||
|
and r7 ,r8, #0x03e0
|
||||||
|
mov r8, r8, lsl #11
|
||||||
|
orr r8, r8, r8, lsr #(10+11)
|
||||||
|
orr r8, r8, r7 ,lsl #1
|
||||||
bic r8, r8, #0x0020 @ kill prio bit
|
bic r8, r8, #0x0020 @ kill prio bit
|
||||||
strh r8, [r0], #2 @ *dst++ = bgr2rgb(*p32x++)
|
strh r8, [r0], #2 @ *dst++ = bgr2rgb(*p32x++)
|
||||||
b 2b @ loop_inner
|
|
||||||
|
6: @ draw_md_32x:
|
||||||
|
subs lr, lr, #1
|
||||||
|
ldrgeb r7, [r11], #1 @ MD pixel
|
||||||
|
blt 2b @ loop_inner
|
||||||
|
cmp r3, r7, lsl #26 @ MD has bg pixel?
|
||||||
|
.if \do_md
|
||||||
|
mov r7, r7, lsl #1
|
||||||
|
ldrneh r7 ,[r9, r7] @ *dst++ = palmd[*pmd]
|
||||||
|
moveq r7 ,r8 @ *dst++ = bgr2rgb(*p32x++)
|
||||||
|
strh r7 ,[r0], #2
|
||||||
|
.else
|
||||||
|
streqh r8, [r0] @ *dst++ = bgr2rgb(*p32x++)
|
||||||
|
add r0, r0, #2
|
||||||
|
.endif
|
||||||
|
b 6b @ draw_md_32x
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
@ -144,9 +192,11 @@ Pico32xNativePal:
|
||||||
stmfd sp!, {r4-r11,lr}
|
stmfd sp!, {r4-r11,lr}
|
||||||
|
|
||||||
ldr lr,=Pico
|
ldr lr,=Pico
|
||||||
ldr r10,=Pico32xNativePal
|
ldr r10,=Pico32xMem
|
||||||
|
ldr r9,=OFS_PMEM32x_pal_native
|
||||||
|
ldr r10, [r10]
|
||||||
ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB]
|
ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB]
|
||||||
ldr r10,[r10]
|
add r10,r10,r9
|
||||||
add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd
|
add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd
|
||||||
|
|
||||||
and r4, r2, #0xff
|
and r4, r2, #0xff
|
||||||
|
@ -184,7 +234,7 @@ Pico32xNativePal:
|
||||||
ldrneb r8, [r5, #2]! @ r7,r8 - pixel 0,1 index
|
ldrneb r8, [r5, #2]! @ r7,r8 - pixel 0,1 index
|
||||||
subs r6, r6, #1
|
subs r6, r6, #1
|
||||||
blt 0b @ loop_outer
|
blt 0b @ loop_outer
|
||||||
cmp r7, r8 @ is this really improving things?
|
cmp r7, r8
|
||||||
beq 5f @ check_fill @ +8
|
beq 5f @ check_fill @ +8
|
||||||
|
|
||||||
3: @ no_fill:
|
3: @ no_fill:
|
||||||
|
@ -204,11 +254,11 @@ Pico32xNativePal:
|
||||||
ldrneh r7, [r9, r12] @ t = palmd[pmd[0]]
|
ldrneh r7, [r9, r12] @ t = palmd[pmd[0]]
|
||||||
tst lr, #0x20
|
tst lr, #0x20
|
||||||
ldrneb lr, [r11,#-1] @ MD pixel 1
|
ldrneb lr, [r11,#-1] @ MD pixel 1
|
||||||
strh r7, [r0], #2
|
|
||||||
cmpne r3, lr, lsl #26 @ MD has bg pixel?
|
cmpne r3, lr, lsl #26 @ MD has bg pixel?
|
||||||
mov lr, lr, lsl #1
|
mov lr, lr, lsl #1
|
||||||
ldrneh r8, [r9, lr] @ t = palmd[pmd[1]]
|
ldrneh r8, [r9, lr] @ t = palmd[pmd[1]]
|
||||||
strh r8, [r0], #2
|
orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth
|
||||||
|
str r7, [r0], #4 @ (no write combining on ARM9)
|
||||||
.else
|
.else
|
||||||
streqh r7, [r0]
|
streqh r7, [r0]
|
||||||
tst lr, #0x20
|
tst lr, #0x20
|
||||||
|
@ -219,18 +269,21 @@ Pico32xNativePal:
|
||||||
.endif
|
.endif
|
||||||
b 2b @ loop_inner
|
b 2b @ loop_inner
|
||||||
|
|
||||||
5: @ check_fill
|
5: @ check_fill:
|
||||||
@ count pixels, align if needed
|
@ count pixels, align if needed
|
||||||
bic r12,r5, #1
|
bic r12,r5, #1
|
||||||
|
ldrh lr ,[r12, #2] @ only do this for at least 4 pixels
|
||||||
ldrh r12,[r12]
|
ldrh r12,[r12]
|
||||||
|
orr r12,lr,r12, lsl #16
|
||||||
orr lr, r7, r7, lsl #8
|
orr lr, r7, r7, lsl #8
|
||||||
|
orr lr, lr, lr, lsl #16
|
||||||
cmp r12,lr
|
cmp r12,lr
|
||||||
bne 3b @ no_fill
|
bne 3b @ no_fill
|
||||||
|
|
||||||
tst r5, #1
|
tst r5, #1
|
||||||
sub lr, r5, #2 @ starting r5 (32x render data start)
|
sub lr, r5, #2 @ starting r5 (32x render data start)
|
||||||
addeq r5, r5, #2
|
addeq r5, r5, #4
|
||||||
addne r5, r5, #1 @ add for the check above
|
addne r5, r5, #3 @ add for the check above
|
||||||
add r6, r6, #1 @ restore from dec
|
add r6, r6, #1 @ restore from dec
|
||||||
orr r7, r7, r7, lsl #8
|
orr r7, r7, r7, lsl #8
|
||||||
6:
|
6:
|
||||||
|
@ -240,11 +293,12 @@ Pico32xNativePal:
|
||||||
ldrh r12,[r5], #2
|
ldrh r12,[r5], #2
|
||||||
bge 7f @ count_done
|
bge 7f @ count_done
|
||||||
cmp r8, r7
|
cmp r8, r7
|
||||||
|
subne r5, r5, #2 @ undo readahead
|
||||||
cmpeq r12,r7
|
cmpeq r12,r7
|
||||||
beq 6b
|
beq 6b
|
||||||
|
|
||||||
7: @ count_done
|
7: @ count_done:
|
||||||
sub r5, r5, #4 @ undo readahead
|
sub r5, r5, #2 @ undo readahead
|
||||||
|
|
||||||
@ fix alignment and check type
|
@ fix alignment and check type
|
||||||
sub r8, r5, lr
|
sub r8, r5, lr
|
||||||
|
@ -262,11 +316,15 @@ Pico32xNativePal:
|
||||||
beq 9f @ bg_mode
|
beq 9f @ bg_mode
|
||||||
|
|
||||||
add r11,r11,r8
|
add r11,r11,r8
|
||||||
8:
|
orr r12,r7, r7, lsl #16
|
||||||
subs r8, r8, #2
|
mov r7 ,r12
|
||||||
strgeh r7, [r0], #2
|
8: @ 32x_loop:
|
||||||
strgeh r7, [r0], #2
|
subs r8, r8, #4 @ store 4 pixels
|
||||||
bgt 8b
|
stmgeia r0!, {r7, r12}
|
||||||
|
bgt 8b @ 32x_loop
|
||||||
|
beq 2b @ loop_inner
|
||||||
|
adds r8, r8, #2
|
||||||
|
strge r7, [r0], #4 @ store 2 leftover pixels
|
||||||
b 2b @ loop_inner
|
b 2b @ loop_inner
|
||||||
|
|
||||||
9: @ bg_mode:
|
9: @ bg_mode:
|
||||||
|
@ -281,8 +339,8 @@ Pico32xNativePal:
|
||||||
mov lr, lr, lsl #1
|
mov lr, lr, lsl #1
|
||||||
ldrneh lr, [r9, lr]
|
ldrneh lr, [r9, lr]
|
||||||
moveq lr, r7
|
moveq lr, r7
|
||||||
strh r12,[r0], #2
|
orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth
|
||||||
strh lr, [r0], #2
|
str r12,[r0], #4 @ (no write combining on ARM9)
|
||||||
.else
|
.else
|
||||||
streqh r7, [r0]
|
streqh r7, [r0]
|
||||||
cmp r3, lr, lsl #26 @ MD pixel 1 has bg?
|
cmp r3, lr, lsl #26 @ MD pixel 1 has bg?
|
||||||
|
@ -303,9 +361,11 @@ Pico32xNativePal:
|
||||||
stmfd sp!, {r4-r11,lr}
|
stmfd sp!, {r4-r11,lr}
|
||||||
|
|
||||||
ldr lr,=Pico
|
ldr lr,=Pico
|
||||||
ldr r10,=Pico32xNativePal
|
ldr r10,=Pico32xMem
|
||||||
|
ldr r9,=OFS_PMEM32x_pal_native
|
||||||
|
ldr r10, [r10]
|
||||||
ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB]
|
ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB]
|
||||||
ldr r10,[r10]
|
add r10,r10,r9
|
||||||
add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd
|
add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd
|
||||||
|
|
||||||
and r4, r2, #0xff
|
and r4, r2, #0xff
|
||||||
|
@ -320,7 +380,6 @@ Pico32xNativePal:
|
||||||
0: @ loop_outer:
|
0: @ loop_outer:
|
||||||
call_scan_end \call_scan
|
call_scan_end \call_scan
|
||||||
add r4, r4, #1
|
add r4, r4, #1
|
||||||
sub r11,r11,#1 @ adjust for prev read
|
|
||||||
cmp r4, r2, lsr #16
|
cmp r4, r2, lsr #16
|
||||||
call_scan_fin_ge \call_scan
|
call_scan_fin_ge \call_scan
|
||||||
ldmgefd sp!, {r4-r11,pc}
|
ldmgefd sp!, {r4-r11,pc}
|
||||||
|
@ -341,13 +400,13 @@ Pico32xNativePal:
|
||||||
eor lr, lr, #0x20
|
eor lr, lr, #0x20
|
||||||
|
|
||||||
3: @ loop_innermost:
|
3: @ loop_innermost:
|
||||||
ldrb r7, [r11], #1 @ MD pixel
|
|
||||||
subs r6, r6, #1
|
subs r6, r6, #1
|
||||||
|
ldrgeb r7, [r11], #1 @ MD pixel
|
||||||
blt 0b @ loop_outer
|
blt 0b @ loop_outer
|
||||||
cmp r3, r7, lsl #26 @ MD has bg pixel?
|
tst lr, #0x20
|
||||||
mov r7, r7, lsl #1
|
cmpne r3, r7, lsl #26 @ MD has bg pixel?
|
||||||
tstne lr, #0x20
|
|
||||||
.if \do_md
|
.if \do_md
|
||||||
|
mov r7, r7, lsl #1
|
||||||
ldrneh r12,[r9, r7] @ t = palmd[*pmd]
|
ldrneh r12,[r9, r7] @ t = palmd[*pmd]
|
||||||
streqh lr, [r0], #2
|
streqh lr, [r0], #2
|
||||||
strneh r12,[r0], #2 @ *dst++ = t
|
strneh r12,[r0], #2 @ *dst++ = t
|
||||||
|
@ -365,15 +424,18 @@ make_do_loop_dc do_loop_dc, 0, 0
|
||||||
make_do_loop_dc do_loop_dc_md, 0, 1
|
make_do_loop_dc do_loop_dc_md, 0, 1
|
||||||
make_do_loop_dc do_loop_dc_scan, 1, 0
|
make_do_loop_dc do_loop_dc_scan, 1, 0
|
||||||
make_do_loop_dc do_loop_dc_scan_md, 1, 1
|
make_do_loop_dc do_loop_dc_scan_md, 1, 1
|
||||||
|
.pool
|
||||||
|
|
||||||
make_do_loop_pp do_loop_pp, 0, 0
|
make_do_loop_pp do_loop_pp, 0, 0
|
||||||
make_do_loop_pp do_loop_pp_md, 0, 1
|
make_do_loop_pp do_loop_pp_md, 0, 1
|
||||||
make_do_loop_pp do_loop_pp_scan, 1, 0
|
make_do_loop_pp do_loop_pp_scan, 1, 0
|
||||||
make_do_loop_pp do_loop_pp_scan_md, 1, 1
|
make_do_loop_pp do_loop_pp_scan_md, 1, 1
|
||||||
|
.pool
|
||||||
|
|
||||||
make_do_loop_rl do_loop_rl, 0, 0
|
make_do_loop_rl do_loop_rl, 0, 0
|
||||||
make_do_loop_rl do_loop_rl_md, 0, 1
|
make_do_loop_rl do_loop_rl_md, 0, 1
|
||||||
make_do_loop_rl do_loop_rl_scan, 1, 0
|
make_do_loop_rl do_loop_rl_scan, 1, 0
|
||||||
make_do_loop_rl do_loop_rl_scan_md, 1, 1
|
make_do_loop_rl do_loop_rl_scan_md, 1, 1
|
||||||
|
.pool
|
||||||
|
|
||||||
@ vim:filetype=armasm
|
@ vim:filetype=armasm
|
||||||
|
|
|
@ -1364,8 +1364,8 @@ static void FinalizeLine8bit(int sh, int line, struct PicoEState *est)
|
||||||
{
|
{
|
||||||
// a hack for mid-frame palette changes
|
// a hack for mid-frame palette changes
|
||||||
if (!(est->rendstatus & PDRAW_SONIC_MODE) || line - dirty_line > 4) {
|
if (!(est->rendstatus & PDRAW_SONIC_MODE) || line - dirty_line > 4) {
|
||||||
// store a maximum of 3 additional palettes in SonicPal
|
// store a maximum of 2 additional palettes in SonicPal
|
||||||
if (est->SonicPalCount < 3)
|
if (est->SonicPalCount < 2)
|
||||||
est->SonicPalCount ++;
|
est->SonicPalCount ++;
|
||||||
dirty_line = line;
|
dirty_line = line;
|
||||||
est->rendstatus |= PDRAW_SONIC_MODE;
|
est->rendstatus |= PDRAW_SONIC_MODE;
|
||||||
|
|
|
@ -328,7 +328,7 @@ static int make_local_pal_md(int fast_mode)
|
||||||
localPal[0xe0] = 0x00000000; // reserved pixels for OSD
|
localPal[0xe0] = 0x00000000; // reserved pixels for OSD
|
||||||
localPal[0xf0] = 0x00ffffff;
|
localPal[0xf0] = 0x00ffffff;
|
||||||
|
|
||||||
if (Pico.m.dirtyPal == 2)
|
if (Pico.m.dirtyPal == 2)
|
||||||
Pico.m.dirtyPal = 0;
|
Pico.m.dirtyPal = 0;
|
||||||
return pallen;
|
return pallen;
|
||||||
}
|
}
|
||||||
|
|
|
@ -84,6 +84,7 @@ get_define OFS_EST_ PicoEState HighPal ; echo "$line" >>$fn
|
||||||
|
|
||||||
get_define OFS_PMEM_ PicoMem vram ; echo "$line" >>$fn
|
get_define OFS_PMEM_ PicoMem vram ; echo "$line" >>$fn
|
||||||
get_define OFS_PMEM_ PicoMem vsram ; echo "$line" >>$fn
|
get_define OFS_PMEM_ PicoMem vsram ; echo "$line" >>$fn
|
||||||
|
get_define OFS_PMEM32x_ Pico32xMem pal_native ; echo "$line" >>$fn
|
||||||
|
|
||||||
get_define OFS_SH2_ SH2_ is_slave ; echo "$line" >>$fn
|
get_define OFS_SH2_ SH2_ is_slave ; echo "$line" >>$fn
|
||||||
get_define OFS_SH2_ SH2_ p_bios ; echo "$line" >>$fn
|
get_define OFS_SH2_ SH2_ p_bios ; echo "$line" >>$fn
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue