32x, ARM asm drawing fixes and optimizations for dc,pp modes

This commit is contained in:
kub 2021-12-18 19:19:37 +01:00
parent a8acecdc08
commit 2a29ca852b

View file

@ -76,7 +76,6 @@
PIC_LDR(lr, r9, Pico) PIC_LDR(lr, r9, Pico)
PIC_LDR(r10,r9, Pico32x) PIC_LDR(r10,r9, Pico32x)
ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB] ldr r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB]
ldr r12, [lr, #OFS_Pico_est+OFS_EST_DrawLineDestIncr]
ldrh r10,[r10, #0x40] @ Pico32x.vdp_regs[0] ldrh r10,[r10, #0x40] @ Pico32x.vdp_regs[0]
add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd add r9, lr, #OFS_Pico_est+OFS_EST_HighPal @ palmd
@ -85,8 +84,8 @@
mov r3, r3, lsl #26 @ mdbg << 26 mov r3, r3, lsl #26 @ mdbg << 26
mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data mla r11,r4,r5,r11 @ r11 = pmd = PicoDraw2FB + offs*328: md data
tst r10,#P32XV_PRI tst r10,#P32XV_PRI
movne r10,#0 moveq r10,#0
moveq r10,#0x8000 @ r10 = inv_bit movne r10,#0x8000 @ r10 = inv_bit
call_scan_prep \call_scan lr call_scan_prep \call_scan lr
mov r4, #0 @ line mov r4, #0 @ line
@ -107,125 +106,143 @@
mov r12,r4, lsl #1 mov r12,r4, lsl #1
ldrh r12,[r1, r12] ldrh r12,[r1, r12]
add r11,r11,#8 add r11,r11,#8
mov r6, #320 mov r6, #320/2
add r5, r1, r12, lsl #1 @ p32x = dram + dram[l] add r5, r1, r12, lsl #1 @ p32x = dram + dram[l]
ldrh r7, [r5], #2
2: @ loop_inner: 2: @ loop_inner:
mov r8, r7 @ r4,r6 - counters; r5 - 32x data; r9 - md pal; r10 - inv_prio; r11 - md data
subs lr, r6, #1 @ r7,r8,r12,lr - temp
ldrh r7, [r5], #2
ldrh r8, [r5], #2
subs r6, r6, #1
blt 0b @ loop_outer blt 0b @ loop_outer
beq 7f @ single_pix cmp r7, r8
ldrh r7, [r5], #2 @ 32x pixel beq 5f @ check_fill
cmp r7, r8 @ do RLE only if we have at least 2 px
@ ldreqh r7, [r5]
@ cmpeq r7, r8
subeq lr, lr, #1
beq 3f @ loop_innermost
7: @ single_pix: 3: @ no_fill:
mov r6, lr eor r7, r7, r10
and r12,r7, #0x03e0 @ convert BGR555 -> RGB565
eor r12,r8, r10 mov r7, r7, ror #5
tst r12, #0x8000 @ !((t ^ inv) & 0x8000) orr r7, r7, r7, ror #10+11
addeq r11,r11,#1 orr r7, r7, r12,lsl #1+16
beq 8f @ single_pix_32x eor r8, r8, r10
ldrb r12,[r11], #1 @ MD pixel
cmp r3, r12,lsl #26 @ MD has bg pixel?
.if \do_md
movne r12,r12,lsl #1
ldrneh r12,[r9, r12]
strneh r12,[r0], #2 @ *dst++ = palmd[*pmd]
.else
addne r0, r0, #2
.endif
bne 2b @ loop_inner
8: @ single_pix_32x:
and r12,r8, #0x03e0 and r12,r8, #0x03e0
mov r8, r8, lsl #11 mov r8, r8, ror #5
orr r8, r8, r8, lsr #(10+11) orr r8, r8, r8, ror #10+11
orr r8, r8, r12,lsl #1 orr r8, r8, r12,lsl #1+16
bic r8, r8, #0x0020 @ kill prio bit
strh r8, [r0], #2 ldrb r12,[r11], #1 @ MD pixel 0
ldrb lr, [r11], #1 @ MD pixel 1
lsr r7, #16
lsr r8, #16
.if \do_md
cmp r3, r12, lsl #26
movne r12,r12, lsl #1 @ load MD color if not bg
ldrneh r12,[r9, r12]
orreq r7, r7, #0x20 @ accumulate MD bg info into prio bit
cmp r3, lr, lsl #26
movne lr, lr, lsl #1
ldrneh lr, [r9, lr]
orreq r8, r8, #0x20
tst r7, #0x20 @ replace 32X with MD color if no prio and not bg
moveq r7, r12
tst r8, #0x20
moveq r8, lr
orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth
str r7, [r0], #4 @ (no write combining on ARM9)
.else
cmp r3, r12, lsl #26 @ replace MD bg info into prio bit
orreq r7, r7, #0x20
cmp r3, lr, lsl #26
orreq r8, r8, #0x20
add r0, r0, #4 @ store 32x pixels if 32X prio or MD bg
tst r7, #0x20
strneh r7, [r0, #-4]
tst r8, #0x20
strneh r8, [r0, #-2]
.endif
b 2b @ loop_inner b 2b @ loop_inner
3: @ loop_innermost: 5: @ check_fill:
ldrh r7, [r5], #2 @ 32x pixel @ count pixels, align if needed
subs lr, lr, #1 ldrh r12,[r5, #0] @ only do this for at least 4 pixels
cmpge r7, r8 ldrh lr ,[r5, #2]
beq 3b @ loop_innermost cmp r12,r7
cmpeq lr ,r7
bne 3b @ no_fill
add r5, r5, #4 @ adjust for the check above
add lr, lr, #1 sub lr, r5, #4+4 @ starting r5 (32x render data start)
sub lr, r6, lr add r6, r6, #1 @ restore from dec
sub r6, r6, lr 6: @ count_loop:
sub r12,r5, lr @ loop checks 2 pixels
ldrh r8, [r5], #2
cmp r12,r6, lsl #2
ldrh r12,[r5], #2
bge 7f @ count_done
cmp r8, r7
cmpeq r12,r7
beq 6b
eor r12,r8, r10 7: @ count_done:
tst r12, #0x8000 @ !((t ^ inv) & 0x8000) sub r5, r5, #4 @ undo readahead
bne 5f @ draw_md
and r12,r8, #0x03e0 sub r8, r5, lr @ pixel count
mov r8, r8, lsl #11 mov r8, r8, lsr #1
orr r8, r8, r8, lsr #(10+11)
orr r8, r8, r12,lsl #1
bic r8, r8, #0x0020 @ kill prio bit
add r11,r11,lr cmp r8, r6, lsl #1 @ limit count to line length
tst r0, #2 @ dst unaligned? movgt r8, r6, lsl #1
strneh r8, [r0], #2 sub r6, r6, r8, lsr #1 @ consume pixels
subne lr, lr, #1
cmp lr, #0 eor r7, r7, r10
and r12,r7, #0x03e0 @ convert BGR555 -> RGB565
mov r7, r7, ror #5
orr r7, r7, r7, ror #10+11
orr r7, r7, r12,lsl #1+16
lsr r7, #16
tst r7, #0x20 @ check for prio transfer
beq 9f @ bg_loop
add r11,r11,r8 @ consume md pixels (not used)
orr r12,r7, r7, lsl #16
mov r7 ,r12
8: @ 32x_loop:
subs r8, r8, #4 @ store 4 pixels
stmgeia r0!, {r7, r12}
bgt 8b @ 32x_loop
beq 2b @ loop_inner beq 2b @ loop_inner
mov r8, r8, lsl #16 adds r8, r8, #2
orr r12,r8, r8, lsr #16 strge r7, [r0], #4 @ store 2 leftover pixels
mov r8 ,r12
4: @ draw_32x:
subs lr, lr, #4 @ store 4 pixels
stmgeia r0!, {r8, r12}
bgt 4b @ draw_32x
beq 2b @ loop_inner
adds lr, lr, #2 @ store 1-3 leftover pixels
strge r8, [r0], #4
strneh r8, [r0], #2
b 2b @ loop_inner b 2b @ loop_inner
5: @ draw_md: 9: @ bg_loop:
subs lr, lr, #1 ldrb r12,[r11],#1 @ MD pixel 0,1
ldrgeb r12,[r11], #1 @ MD pixel ldrb lr, [r11],#1
blt 2b @ loop_inner
cmp r3, r12,lsl #26 @ MD has bg pixel?
.if \do_md .if \do_md
cmp r3, r12,lsl #26 @ MD pixel 0 has bg?
mov r12,r12,lsl #1 mov r12,r12,lsl #1
ldrneh r12,[r9, r12] ldrneh r12,[r9, r12] @ t = palmd[*pmd]
strneh r12,[r0], #2 @ *dst++ = palmd[*pmd] moveq r12,r7
cmp r3, lr, lsl #26 @ MD pixel 1 has bg?
mov lr, lr, lsl #1
ldrneh lr, [r9, lr]
moveq lr, r7
orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth
str r12,[r0], #4 @ (no write combining on ARM9)
.else .else
addne r0, r0, #2 add r0, r0, #4
cmp r3, r12,lsl #26 @ MD pixel 0 has bg?
streqh r7, [r0, #-4]
cmp r3, lr, lsl #26 @ MD pixel 1 has bg?
streqh r7, [r0, #-2]
.endif .endif
bne 5b @ draw_md subs r8, r8, #2
bgt 9b @ bg_loop
and r12,r8, #0x03e0 b 2b @ loop_inner
mov r8, r8, lsl #11
orr r8, r8, r8, lsr #(10+11)
orr r8, r8, r12,lsl #1
bic r8, r8, #0x0020 @ kill prio bit
strh r8, [r0], #2 @ *dst++ = bgr2rgb(*p32x++)
6: @ draw_md_32x:
subs lr, lr, #1
ldrgeb r12,[r11], #1 @ MD pixel
blt 2b @ loop_inner
cmp r3, r12,lsl #26 @ MD has bg pixel?
.if \do_md
mov r12,r12,lsl #1
ldrneh r12,[r9, r12] @ *dst++ = palmd[*pmd]
moveq r12,r8 @ *dst++ = bgr2rgb(*p32x++)
strh r12,[r0], #2
.else
streqh r8, [r0] @ *dst++ = bgr2rgb(*p32x++)
add r0, r0, #2
.endif
b 6b @ draw_md_32x
.endm .endm
@ -281,91 +298,95 @@
tst r5, #1 tst r5, #1
ldreqb r8, [r5], #2 ldreqb r8, [r5], #2
ldrb r7, [r5, #-1] ldrb r7, [r5, #-1]
ldrneb r8, [r5, #2]! @ r7,r8 - pixel 0,1 index ldrneb r8, [r5, #2]! @ r7,r8 - 32X pixel 0,1
subs r6, r6, #1 subs r6, r6, #1
blt 0b @ loop_outer blt 0b @ loop_outer
cmp r7, r8 cmp r7, r8
beq 5f @ check_fill @ +8 beq 5f @ check_fill
3: @ no_fill: 3: @ no_fill:
mov r12,r7, lsl #1 ldrb r12,[r11], #1 @ MD pixel 0
mov lr, r8, lsl #1 ldrb lr, [r11], #1 @ MD pixel 1
ldrh r7, [r10,r12]
ldrh r8, [r10,lr] mov r7, r7, lsl #1
add r11,r11,#2 mov r8, r8, lsl #1
ldrh r7, [r10,r7] @ 32X color 0
ldrh r8, [r10,r8] @ 32X color 1
eor r12,r7, #0x20
tst r12,#0x20
ldrneb r12,[r11,#-2] @ MD pixel 0
eor lr, r8, #0x20
cmpne r3, r12, lsl #26 @ MD has bg pixel?
.if \do_md .if \do_md
mov r12,r12,lsl #1 cmp r3, r12, lsl #26
ldrneh r7, [r9, r12] @ t = palmd[pmd[0]] movne r12,r12, lsl #1 @ load MD color if not bg
tst lr, #0x20 ldrneh r12,[r9, r12]
ldrneb lr, [r11,#-1] @ MD pixel 1 orreq r7, r7, #0x20 @ accumulate MD bg info into prio bit
cmpne r3, lr, lsl #26 @ MD has bg pixel? cmp r3, lr, lsl #26
mov lr, lr, lsl #1 movne lr, lr, lsl #1
ldrneh r8, [r9, lr] @ t = palmd[pmd[1]] ldrneh lr, [r9, lr]
orreq r8, r8, #0x20
tst r7, #0x20 @ replace 32X with MD color if no prio and not bg
moveq r7, r12
tst r8, #0x20
moveq r8, lr
orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth orr r7, r7, r8, lsl #16 @ combine 2 pixels to optimize memory bandwidth
str r7, [r0], #4 @ (no write combining on ARM9) str r7, [r0], #4 @ (no write combining on ARM9)
.else .else
streqh r7, [r0] cmp r3, r12, lsl #26 @ replace MD bg info into prio bit
tst lr, #0x20 orreq r7, r7, #0x20
ldrneb lr, [r11,#-1] @ MD pixel 1 cmp r3, lr, lsl #26
add r0, r0, #4 orreq r8, r8, #0x20
cmpne r3, lr, lsl #26 @ MD has bg pixel?
streqh r8, [r0, #-2] add r0, r0, #4 @ store 32x pixels if 32X prio or MD bg
tst r7, #0x20
strneh r7, [r0, #-4]
tst r8, #0x20
strneh r8, [r0, #-2]
.endif .endif
b 2b @ loop_inner b 2b @ loop_inner
5: @ check_fill: 5: @ check_fill:
@ count pixels, align if needed @ count pixels, align if needed
bic r12,r5, #1 bic r12,r5, #1
ldrh lr ,[r12, #2] @ only do this for at least 4 pixels ldrh r12,[r12, #0] @ only do this for at least 4 pixels
ldrh r12,[r12]
orr r12,lr,r12, lsl #16
orr lr, r7, r7, lsl #8 orr lr, r7, r7, lsl #8
orr lr, lr, lr, lsl #16
cmp r12,lr cmp r12,lr
bne 3b @ no_fill bne 3b @ no_fill
add r5, r5, #2 @ adjust for the check above
tst r5, #1 sub lr, r5, #4 @ starting r5 (32x render data start)
sub lr, r5, #2 @ starting r5 (32x render data start) bic r5, r5, #1
addeq r5, r5, #4
addne r5, r5, #3 @ add for the check above
add r6, r6, #1 @ restore from dec add r6, r6, #1 @ restore from dec
orr r7, r7, r7, lsl #8 orr r7, r7, r7, lsl #8
6: 6: @ count_loop:
sub r12,r5, lr sub r12,r5, lr @ loop checks 4 pixels
ldrh r8, [r5], #2 ldrh r8, [r5], #2
cmp r12,r6, lsl #1 cmp r12,r6, lsl #1
ldrh r12,[r5], #2 ldrh r12,[r5], #2
bge 7f @ count_done bge 7f @ count_done
cmp r8, r7 cmp r8, r7
subne r5, r5, #2 @ undo readahead
cmpeq r12,r7 cmpeq r12,r7
beq 6b beq 6b
cmp r8, r7
addeq r5, r5, #2 @ adjust if 2 pixels where ok
7: @ count_done: 7: @ count_done:
sub r5, r5, #2 @ undo readahead sub r5, r5, #4 @ undo readahead
@ fix alignment and check type tst lr, #1 @ fix alignment and calculate count
sub r8, r5, lr
tst r8, #1
subne r5, r5, #1 subne r5, r5, #1
subne r8, r8, #1 sub r8, r5, lr
and r7, r7, #0xff and r7, r7, #0xff @ 32x pixel color
cmp r8, r6, lsl #1
mov r7, r7, lsl #1 mov r7, r7, lsl #1
movgt r8, r6, lsl #1 @ r8=count
ldrh r7, [r10,r7] ldrh r7, [r10,r7]
sub r6, r6, r8, lsr #1 @ adjust counter
tst r7, #0x20
beq 9f @ bg_mode
add r11,r11,r8 cmp r8, r6, lsl #1 @ limit count to line length
movgt r8, r6, lsl #1
sub r6, r6, r8, lsr #1 @ consume pixels
tst r7, #0x20 @ check for prio transfer
beq 9f @ bg_loop
add r11,r11,r8 @ consume md pixels (not used)
orr r12,r7, r7, lsl #16 orr r12,r7, r7, lsl #16
mov r7 ,r12 mov r7 ,r12
8: @ 32x_loop: 8: @ 32x_loop:
@ -377,11 +398,11 @@
strge r7, [r0], #4 @ store 2 leftover pixels strge r7, [r0], #4 @ store 2 leftover pixels
b 2b @ loop_inner b 2b @ loop_inner
9: @ bg_mode: 9: @ bg_loop:
ldrb r12,[r11],#1 @ MD pixel 0,1 ldrb r12,[r11],#1 @ MD pixel 0,1
ldrb lr, [r11],#1 ldrb lr, [r11],#1
cmp r3, r12,lsl #26 @ MD pixel 0 has bg?
.if \do_md .if \do_md
cmp r3, r12,lsl #26 @ MD pixel 0 has bg?
mov r12,r12,lsl #1 mov r12,r12,lsl #1
ldrneh r12,[r9, r12] @ t = palmd[*pmd] ldrneh r12,[r9, r12] @ t = palmd[*pmd]
moveq r12,r7 moveq r12,r7
@ -392,13 +413,14 @@
orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth orr r12,r12,lr, lsl #16 @ combine 2 pixels to optimize memory bandwidth
str r12,[r0], #4 @ (no write combining on ARM9) str r12,[r0], #4 @ (no write combining on ARM9)
.else .else
streqh r7, [r0]
cmp r3, lr, lsl #26 @ MD pixel 1 has bg?
streqh r7, [r0, #2]
add r0, r0, #4 add r0, r0, #4
cmp r3, r12,lsl #26 @ MD pixel 0 has bg?
streqh r7, [r0, #-4]
cmp r3, lr, lsl #26 @ MD pixel 1 has bg?
streqh r7, [r0, #-2]
.endif .endif
subs r8, r8, #2 subs r8, r8, #2
bgt 9b @ bg_mode bgt 9b @ bg_loop
b 2b @ loop_inner b 2b @ loop_inner
.endm .endm