32x, more ARM asm drawing optimisations for dc mode

This commit is contained in:
kub 2021-12-19 14:40:16 +01:00
parent 2a29ca852b
commit c3fcdf3f8d
2 changed files with 30 additions and 31 deletions

View file

@ -84,8 +84,8 @@
mov r3, r3, lsl #26 @ mdbg << 26 mov r3, r3, lsl #26 @ mdbg << 26
mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data
tst r10,#P32XV_PRI tst r10,#P32XV_PRI
moveq r10,#0 movne r10,#0
movne r10,#0x8000 @ r10 = inv_bit moveq r10,#0x8000 @ r10 = !inv_bit
call_scan_prep \call_scan lr call_scan_prep \call_scan lr
mov r4, #0 @ line mov r4, #0 @ line
@ -120,49 +120,48 @@
beq 5f @ check_fill beq 5f @ check_fill
3: @ no_fill: 3: @ no_fill:
ldrb r12,[r11], #1 @ MD pixel 0
eor r7, r7, r10 eor r7, r7, r10
and r12,r7, #0x03e0 @ convert BGR555 -> RGB565 and lr, r7, #0x03e0 @ convert BGR555 -> RGB565
mov r7, r7, ror #5 mov r7, r7, ror #5
orr r7, r7, r7, ror #10+11 orr r7, r7, r7, ror #10+11
orr r7, r7, r12,lsl #1+16 orr r7, r7, lr, lsl #1+16
eor r8, r8, r10 eor r8, r8, r10
and r12,r8, #0x03e0 and lr, r8, #0x03e0
mov r8, r8, ror #5 mov r8, r8, ror #5
orr r8, r8, r8, ror #10+11 orr r8, r8, r8, ror #10+11
orr r8, r8, r12,lsl #1+16 orr r8, r8, lr, lsl #1+16
ldrb r12,[r11], #1 @ MD pixel 0
ldrb lr, [r11], #1 @ MD pixel 1 ldrb lr, [r11], #1 @ MD pixel 1
lsr r7, #16
lsr r8, #16
.if \do_md .if \do_md
cmp r3, r12, lsl #26 cmp r3, r12, lsl #26
movne r12,r12, lsl #1 @ load MD color if not bg tstne r7, #0x20<<16
movne r12,r12, lsl #1 @ load MD color if no 32X prio and not bg
ldrneh r12,[r9, r12] ldrneh r12,[r9, r12]
orreq r7, r7, #0x20 @ accumulate MD bg info into prio bit moveq r12,r7, lsr #16 @ else replace with 32X color
cmp r3, lr, lsl #26
movne lr, lr, lsl #1
ldrneh lr, [r9, lr]
orreq r8, r8, #0x20
tst r7, #0x20 @ replace 32X with MD color if no prio and not bg cmp r3, lr, lsl #26
moveq r7, r12 tstne r8, #0x20<<16
tst r8, #0x20 movne lr, lr, lsl #1 @ load MD color if no 32X prio and not bg
moveq r8, lr ldrneh lr, [r9, lr]
orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth moveq lr, r8, lsr #16 @ else replace with 32X color
str r7, [r0], #4 @ (no write combining on ARM9)
orr r12,r12, lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth
str r12,[r0], #4 @ (no write combining on ARM9)
.else .else
cmp r3, r12, lsl #26 @ replace MD bg info into prio bit cmp r3, r12, lsl #26 @ replace MD bg info into prio bit
orreq r7, r7, #0x20 tstne r7, #0x20<<16
moveq r7, r7, lsr #16
streqh r7, [r0, #0]
cmp r3, lr, lsl #26 cmp r3, lr, lsl #26
orreq r8, r8, #0x20 tstne r8, #0x20<<16
moveq r8, r8, lsr #16
streqh r8, [r0, #2]
add r0, r0, #4 @ store 32x pixels if 32X prio or MD bg add r0, r0, #4 @ store 32x pixels if 32X prio or MD bg
tst r7, #0x20
strneh r7, [r0, #-4]
tst r8, #0x20
strneh r8, [r0, #-2]
.endif .endif
b 2b @ loop_inner b 2b @ loop_inner
@ -205,7 +204,7 @@
lsr r7, #16 lsr r7, #16
tst r7, #0x20 @ check for prio transfer tst r7, #0x20 @ check for prio transfer
beq 9f @ bg_loop bne 9f @ bg_loop
add r11,r11,r8 @ consume md pixels (not used) add r11,r11,r8 @ consume md pixels (not used)
orr r12,r7, r7, lsl #16 orr r12,r7, r7, lsl #16

View file

@ -207,12 +207,12 @@ static void apply_renderer(void)
{ {
PicoIn.opt &= ~(POPT_ALT_RENDERER|POPT_EN_SOFTSCALE|POPT_DIS_32C_BORDER); PicoIn.opt &= ~(POPT_ALT_RENDERER|POPT_EN_SOFTSCALE|POPT_DIS_32C_BORDER);
if (is_16bit_mode()) { if (is_16bit_mode()) {
if (currentConfig.scaling == EOPT_SCALE_SW) { if (currentConfig.scaling == EOPT_SCALE_SW)
PicoIn.opt |= POPT_EN_SOFTSCALE; PicoIn.opt |= POPT_EN_SOFTSCALE;
PicoIn.filter = currentConfig.filter; else if (currentConfig.scaling == EOPT_SCALE_HW)
} else if (currentConfig.scaling == EOPT_SCALE_HW)
// hw scaling, render without any padding // hw scaling, render without any padding
PicoIn.opt |= POPT_DIS_32C_BORDER; PicoIn.opt |= POPT_DIS_32C_BORDER;
PicoIn.filter = currentConfig.filter;
} else } else
PicoIn.opt |= POPT_DIS_32C_BORDER; PicoIn.opt |= POPT_DIS_32C_BORDER;