32x, more ARM asm drawing optimisations for dc mode

2025-10-26 16:29:37 -04:00 · 2021-12-19 14:40:16 +01:00 · 2021-12-19 14:40:16 +01:00 · c3fcdf3f8d
commit c3fcdf3f8d
parent 2a29ca852b
2 changed files with 30 additions and 31 deletions
--- a/pico/32x/draw_arm.S
+++ b/pico/32x/draw_arm.S
@@ -84,8 +84,8 @@
    mov     r3, r3, lsl #26  @ mdbg << 26
    mla     r11,r4,r5,r11    @ r11 = pmd = PicoDraw2FB + offs*328: md data
    tst     r10,#P32XV_PRI
-    moveq   r10,#0
+    movne   r10,#0
-    movne   r10,#0x8000      @ r10 = inv_bit
+    moveq   r10,#0x8000      @ r10 = !inv_bit
    call_scan_prep \call_scan lr
    mov     r4, #0           @ line
@@ -120,49 +120,48 @@
    beq     5f @ check_fill
 3: @ no_fill:
    ldrb    r12,[r11], #1    @ MD pixel 0
    eor     r7, r7, r10
-    and     r12,r7, #0x03e0  @ convert BGR555 -> RGB565
+    and     lr, r7, #0x03e0  @ convert BGR555 -> RGB565
    mov     r7, r7, ror #5
    orr     r7, r7, r7, ror #10+11
-    orr     r7, r7, r12,lsl #1+16
+    orr     r7, r7, lr, lsl #1+16
    eor     r8, r8, r10
-    and     r12,r8, #0x03e0
+    and     lr, r8, #0x03e0
    mov     r8, r8, ror #5
    orr     r8, r8, r8, ror #10+11
-    orr     r8, r8, r12,lsl #1+16
+    orr     r8, r8, lr, lsl #1+16
    ldrb    r12,[r11], #1    @ MD pixel 0
    ldrb    lr, [r11], #1    @ MD pixel 1
    lsr     r7, #16
    lsr     r8, #16
 .if \do_md
    cmp     r3, r12, lsl #26
-    movne   r12,r12, lsl #1  @ load MD color if not bg
+    tstne   r7, #0x20<<16
    movne   r12,r12, lsl #1  @ load MD color if no 32X prio and not bg
    ldrneh  r12,[r9, r12]
-    orreq   r7, r7, #0x20    @ accumulate MD bg info into prio bit
+    moveq   r12,r7, lsr #16  @ else replace with 32X color
    cmp     r3, lr,  lsl #26
    movne   lr, lr,  lsl #1
    ldrneh  lr, [r9, lr]
    orreq   r8, r8, #0x20
-    tst     r7, #0x20        @ replace 32X with MD color if no prio and not bg
+    cmp     r3, lr,  lsl #26
-    moveq   r7, r12
+    tstne   r8, #0x20<<16
-    tst     r8, #0x20
+    movne   lr, lr,  lsl #1  @ load MD color if no 32X prio and not bg
-    moveq   r8, lr
+    ldrneh  lr, [r9, lr]
-    orr     r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth
+    moveq   lr, r8, lsr #16  @ else replace with 32X color
-    str     r7, [r0], #4     @ (no write combining on ARM9)
+
    orr     r12,r12, lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth
    str     r12,[r0], #4     @ (no write combining on ARM9)
 .else
    cmp     r3, r12, lsl #26 @ replace MD bg info into prio bit
-    orreq   r7, r7, #0x20
+    tstne   r7, #0x20<<16
    moveq   r7, r7, lsr #16
    streqh  r7, [r0, #0]
    cmp     r3, lr,  lsl #26
-    orreq   r8, r8, #0x20
+    tstne   r8, #0x20<<16
    moveq   r8, r8, lsr #16
    streqh  r8, [r0, #2]
    add     r0, r0, #4       @ store 32x pixels if 32X prio or MD bg
    tst     r7, #0x20
    strneh  r7, [r0, #-4]
    tst     r8, #0x20
    strneh  r8, [r0, #-2]
 .endif
    b       2b @ loop_inner
@@ -205,7 +204,7 @@
    lsr     r7, #16
    tst     r7, #0x20        @ check for prio transfer
-    beq     9f @ bg_loop
+    bne     9f @ bg_loop
    add     r11,r11,r8       @ consume md pixels (not used)
    orr     r12,r7, r7, lsl #16
--- a/platform/linux/emu.c
+++ b/platform/linux/emu.c
@@ -207,12 +207,12 @@ static void apply_renderer(void)
 {
 	PicoIn.opt &= ~(POPT_ALT_RENDERER|POPT_EN_SOFTSCALE|POPT_DIS_32C_BORDER);
 	if (is_16bit_mode()) {
-		if (currentConfig.scaling == EOPT_SCALE_SW) {
+		if (currentConfig.scaling == EOPT_SCALE_SW)
 			PicoIn.opt |= POPT_EN_SOFTSCALE;
-			PicoIn.filter = currentConfig.filter;
+		else if (currentConfig.scaling == EOPT_SCALE_HW)
 		} else if (currentConfig.scaling == EOPT_SCALE_HW)
 			// hw scaling, render without any padding
 			PicoIn.opt |= POPT_DIS_32C_BORDER;
 		PicoIn.filter = currentConfig.filter;
 	} else
 		PicoIn.opt |= POPT_DIS_32C_BORDER;