mirror of
https://github.com/RaySollium99/picodrive.git
synced 2025-09-05 07:17:45 -04:00
515 lines
14 KiB
ArmAsm
515 lines
14 KiB
ArmAsm
@*
|
|
@* PicoDrive
|
|
@* (C) notaz, 2010
|
|
@* (C) kub, 2019
|
|
@*
|
|
@* This work is licensed under the terms of MAME license.
|
|
@* See COPYING file in the top-level directory.
|
|
@*
|
|
|
|
#include <pico/arm_features.h>
|
|
#include <pico/pico_int_offs.h>
|
|
|
|
@ data symbols referenced through the PIC load helper below
    .extern Pico32x
    .extern Pico

@ bit tested in Pico32x.vdp_regs[0] by the direct color loops
    .equiv  P32XV_PRI, (1 << 7)

    .text
    .align  2

    PIC_LDR_INIT()
|
|
|
|
@ Set up the per-line scan callbacks (emitted only when cond is nonzero).
@ pico: register holding &Pico
@ Leaves three words on the stack, lowest address first:
@   [sp, #0] value loaded from PicoScan32xBegin
@   [sp, #4] value loaded from PicoScan32xEnd
@   [sp, #8] &Pico.est
@ Clobbers r4, r5, r6. The matching stack cleanup is call_scan_fin_ge.
.macro call_scan_prep cond pico
.if \cond
    PIC_LDR(r4, r6, PicoScan32xBegin)
    PIC_LDR(r5, r6, PicoScan32xEnd)
    add     r6, \pico, #OFS_Pico_est    @ r6 = &Pico.est
    ldr     r4, [r4]                    @ fetch the callback pointers
    ldr     r5, [r5]
    stmfd   sp!, {r4-r6}
.endif
.endm
|
|
|
|
@ Drop the three words pushed by call_scan_prep, but only if the GE
@ condition holds (emitted only when cond is nonzero).
.macro call_scan_fin_ge cond
.if \cond
    addge   sp, sp, #3*4
.endif
.endm
|
|
|
|
@ Call the PicoScan32xBegin callback for the current line and reload dst
@ (emitted only when cond is nonzero).
@ in:  r2 = lines_sft_offs (bits 0-7: line offset), r4 = line counter,
@      stack laid out by call_scan_prep (Begin, End, &Pico.est)
@ out: r0 = Pico.est.DrawLineDest
@ r1-r3 and r12 are saved and restored; lr is overwritten.
@
@ Four words are pushed, not three: in the do_loop functions the prologue
@ pushes 9 words and call_scan_prep 3 more, so sp is 8-byte aligned when
@ this macro starts. An odd number of words here would hand the callback
@ a stack that is only 4-byte aligned. r12 is saved purely as padding.
.macro call_scan_begin cond
.if \cond
    stmfd   sp!, {r1-r3,r12}
    and     r0, r2, #0xff
    add     r0, r0, r4                  @ r0 = line offset + line
    mov     lr, pc                      @ return address = insn after the ldr pc
    ldr     pc, [sp, #(4+0)*4]          @ call the begin callback
    ldr     r0, [sp, #(4+2)*4]          @ &Pico.est
    ldmfd   sp!, {r1-r3,r12}
    ldr     r0, [r0, #OFS_EST_DrawLineDest]
.endif
.endm
|
|
|
|
@ Call the PicoScan32xEnd callback for the current line, then advance
@ Pico.est.DrawLineDest by Pico.est.DrawLineDestIncr
@ (emitted only when cond is nonzero).
@ in:  r2 = lines_sft_offs (bits 0-7: line offset), r4 = line counter,
@      stack laid out by call_scan_prep (Begin, End, &Pico.est)
@ r0-r3 are saved and restored; lr is overwritten.
.macro call_scan_end cond
.if \cond
    stmfd   sp!, {r0-r3}
    and     r0, r2, #0xff
    add     r0, r0, r4                  @ r0 = line offset + line
    mov     lr, pc                      @ return address = insn after the ldr pc
    ldr     pc, [sp, #(4+1)*4]          @ call the end callback
    ldr     r1, [sp, #(4+2)*4]          @ &Pico.est
    ldr     r2, [r1, #OFS_EST_DrawLineDestIncr]
    ldr     r0, [r1, #OFS_EST_DrawLineDest]
    add     r0, r0, r2
    str     r0, [r1, #OFS_EST_DrawLineDest]
    ldmfd   sp!, {r0-r3}
.endif
.endm
|
|
|
|
@ direct color
@ args: unsigned short *dst, unsigned short *dram, int lines_sft_offs, int mdbg
@   r0 = dst, r1 = dram, r2 = lines_sft_offs, r3 = mdbg
@   lines_sft_offs: bits 0-7 = line offset, bits 16 and up = line count
@ macro parameters:
@   name      - global symbol emitted for the function
@   call_scan - nonzero: wrap every line in the scan begin/end callbacks
@   do_md     - nonzero: store the MD palette color where 32X does not win
@               zero:    store only the 32X pixels, leave the rest of dst
.macro make_do_loop_dc name call_scan do_md
.global \name
\name:
    stmfd   sp!, {r4-r11,lr}

    PIC_LDR(lr, r9, Pico)
    PIC_LDR(r10,r9, Pico32x)
    ldr     r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB]
    ldrh    r10, [r10, #0x40]           @ Pico32x.vdp_regs[0]
    add     r9, lr, #OFS_Pico_est+OFS_EST_HighPal   @ r9 = palmd

    and     r4, r2, #0xff               @ offs
    mov     r5, #328                    @ MD line stride in bytes
    mov     r3, r3, lsl #26             @ keep mdbg as mdbg << 26
    mla     r11, r4, r5, r11            @ r11 = pmd = Draw2FB + offs*328
    tst     r10, #P32XV_PRI
    movne   r10, #0
    moveq   r10, #0x8000                @ r10 = !inv_bit
    call_scan_prep \call_scan lr

    mov     r4, #0                      @ line = 0
    b       1f                          @ enter at loop_outer_entry

0:  @ loop_outer: one line finished
    call_scan_end \call_scan
    ldr     r12, [r9, #OFS_EST_DrawLineDestIncr-OFS_EST_HighPal]
    sub     r0, r0, #320*2              @ back to the line start ...
    add     r0, r0, r12                 @ ... and on to the next line
    add     r4, r4, #1
    cmp     r4, r2, lsr #16             @ all lines done?
    call_scan_fin_ge \call_scan
    ldmgefd sp!, {r4-r11,pc}

1:  @ loop_outer_entry
    call_scan_begin \call_scan
    mov     r12, r4, lsl #1
    ldrh    r12, [r1, r12]              @ dram[line]
    add     r11, r11, #8                @ skip 8 bytes at the MD line start
    mov     r6, #320/2                  @ pixel pairs per line
    add     r5, r1, r12, lsl #1         @ p32x = dram + dram[line]

2:  @ loop_inner
    @ r4,r6 - counters; r5 - 32x data; r9 - md pal; r10 - inv_prio; r11 - md data
    @ r7,r8,r12,lr - temp
    ldrh    r7, [r5], #2                @ 32X pixel 0
    ldrh    r8, [r5], #2                @ 32X pixel 1
    subs    r6, r6, #1
    blt     0b                          @ line complete -> loop_outer
    cmp     r7, r8
    beq     5f                          @ equal pair -> check_fill

3:  @ no_fill: handle one pixel pair
    ldrb    r12, [r11], #1              @ MD pixel 0

    eor     r7, r7, r10
    and     lr, r7, #0x03e0             @ BGR555 -> RGB565, result in bits 16-31
    mov     r7, r7, ror #5
    orr     r7, r7, r7, ror #10+11
    orr     r7, r7, lr, lsl #1+16
    eor     r8, r8, r10
    and     lr, r8, #0x03e0
    mov     r8, r8, ror #5
    orr     r8, r8, r8, ror #10+11
    orr     r8, r8, lr, lsl #1+16

    ldrb    lr, [r11], #1               @ MD pixel 1

.if \do_md
    cmp     r3, r12, lsl #26            @ MD pixel 0 is bg?
    tstne   r7, #0x20<<16               @ if not: 32X without prio?
    movne   r12, r12, lsl #1            @ MD color when no 32X prio and not bg
    ldrneh  r12, [r9, r12]
    moveq   r12, r7, lsr #16            @ otherwise the 32X color

    cmp     r3, lr, lsl #26             @ same for pixel 1
    tstne   r8, #0x20<<16
    movne   lr, lr, lsl #1
    ldrneh  lr, [r9, lr]
    moveq   lr, r8, lsr #16

    orr     r12, r12, lr, lsl #16       @ one word store for both pixels
    str     r12, [r0], #4               @ (no write combining on ARM9)
.else
    cmp     r3, r12, lsl #26            @ MD pixel 0 is bg?
    tstne   r7, #0x20<<16               @ if not: 32X without prio?
    moveq   r7, r7, lsr #16
    streqh  r7, [r0]                    @ 32X pixel only if 32X prio or MD bg

    cmp     r3, lr, lsl #26             @ same for pixel 1
    tstne   r8, #0x20<<16
    moveq   r8, r8, lsr #16
    streqh  r8, [r0, #2]

    add     r0, r0, #4
.endif
    b       2b                          @ loop_inner

5:  @ check_fill: a run is only handled specially from 4 equal pixels on
    ldrh    r12, [r5]
    ldrh    lr, [r5, #2]
    cmp     r12, r7
    cmpeq   lr, r7
    bne     3b                          @ too short -> no_fill
    add     r5, r5, #4                  @ account for the two pixels just checked

    sub     lr, r5, #4+4                @ lr = 32X data pointer at the run start
    add     r6, r6, #1                  @ undo the decrement from loop_inner
6:  @ count_loop: checks 2 pixels per round
    sub     r12, r5, lr                 @ bytes covered so far
    ldrh    r8, [r5], #2
    cmp     r12, r6, lsl #2             @ compare against the bytes left in the line
    ldrh    r12, [r5], #2
    bge     7f                          @ count_done
    cmp     r8, r7
    cmpeq   r12, r7
    beq     6b

7:  @ count_done
    sub     r5, r5, #4                  @ undo readahead

    sub     r8, r5, lr
    mov     r8, r8, lsr #1              @ r8 = pixel count

    cmp     r8, r6, lsl #1              @ clamp the count to the line length
    movgt   r8, r6, lsl #1
    sub     r6, r6, r8, lsr #1          @ consume the pixel pairs

    eor     r7, r7, r10
    and     r12, r7, #0x03e0            @ BGR555 -> RGB565
    mov     r7, r7, ror #5
    orr     r7, r7, r7, ror #10+11
    orr     r7, r7, r12, lsl #1+16
    mov     r7, r7, lsr #16             @ color now in the low halfword

    tst     r7, #0x20                   @ check for prio transfer
    bne     9f                          @ bit set -> bg_loop

    add     r11, r11, r8                @ skip the MD pixels, they are not used
    orr     r12, r7, r7, lsl #16
    mov     r7, r12                     @ r7 = r12 = color pair
8:  @ 32x_loop
    subs    r8, r8, #4                  @ 4 pixels per round
    stmgeia r0!, {r7, r12}
    bgt     8b                          @ 32x_loop
    beq     2b                          @ nothing left over -> loop_inner
    adds    r8, r8, #2
    strge   r7, [r0], #4                @ 2 leftover pixels
    b       2b                          @ loop_inner

9:  @ bg_loop: the 32X color shows only where MD has a bg pixel
    ldrb    r12, [r11], #1              @ MD pixel 0,1
    ldrb    lr, [r11], #1
.if \do_md
    cmp     r3, r12, lsl #26            @ MD pixel 0 is bg?
    mov     r12, r12, lsl #1
    ldrneh  r12, [r9, r12]              @ no: palmd[pixel]
    moveq   r12, r7                     @ yes: 32X color
    cmp     r3, lr, lsl #26             @ MD pixel 1 is bg?
    mov     lr, lr, lsl #1
    ldrneh  lr, [r9, lr]
    moveq   lr, r7
    orr     r12, r12, lr, lsl #16       @ one word store for both pixels
    str     r12, [r0], #4               @ (no write combining on ARM9)
.else
    add     r0, r0, #4
    cmp     r3, r12, lsl #26            @ MD pixel 0 is bg?
    streqh  r7, [r0, #-4]
    cmp     r3, lr, lsl #26             @ MD pixel 1 is bg?
    streqh  r7, [r0, #-2]
.endif
    subs    r8, r8, #2
    bgt     9b                          @ bg_loop
    b       2b                          @ loop_inner
.endm
|
|
|
|
|
|
@ packed pixel
@ note: this may read a few bytes past the end of PicoDraw2FB and dram,
@ so both need a little more room allocated than strictly required.
@ args: unsigned short *dst, unsigned short *dram, int lines_sft_offs, int mdbg
@   r0 = dst, r1 = dram, r2 = lines_sft_offs, r3 = mdbg
@   lines_sft_offs: bits 0-7 = line offset, bit 8 = shift,
@                   bits 16 and up = line count
@ macro parameters:
@   name      - global symbol emitted for the function
@   call_scan - nonzero: wrap every line in the scan begin/end callbacks
@   do_md     - nonzero: store the MD palette color where 32X does not win
@               zero:    store only the 32X pixels, leave the rest of dst
.macro make_do_loop_pp name call_scan do_md
.global \name
\name:
    stmfd   sp!, {r4-r11,lr}

    PIC_LDR(lr, r9, Pico)
    PIC_LDR(r10,r9, Pico32xMem)
    ldr     r9, =OFS_PMEM32x_pal_native
    ldr     r10, [r10]                  @ value stored in Pico32xMem
    ldr     r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB]
    add     r10, r10, r9                @ r10 = native 32X palette
    add     r9, lr, #OFS_Pico_est+OFS_EST_HighPal   @ r9 = palmd

    and     r4, r2, #0xff               @ offs
    mov     r5, #328                    @ MD line stride in bytes
    mov     r3, r3, lsl #26             @ keep mdbg as mdbg << 26
    mla     r11, r4, r5, r11            @ r11 = pmd = Draw2FB + offs*328
    call_scan_prep \call_scan lr

    mov     r4, #0                      @ line = 0
    b       1f                          @ enter at loop_outer_entry

0:  @ loop_outer: one line finished
    call_scan_end \call_scan
    ldr     r12, [r9, #OFS_EST_DrawLineDestIncr-OFS_EST_HighPal]
    sub     r0, r0, #320*2              @ back to the line start ...
    add     r0, r0, r12                 @ ... and on to the next line
    add     r4, r4, #1
    cmp     r4, r2, lsr #16             @ all lines done?
    call_scan_fin_ge \call_scan
    ldmgefd sp!, {r4-r11,pc}

1:  @ loop_outer_entry
    call_scan_begin \call_scan
    mov     r12, r4, lsl #1
    ldrh    r12, [r1, r12]              @ dram[line]
    add     r11, r11, #8                @ skip 8 bytes at the MD line start
    mov     r6, #320/2                  @ pixel pairs per line
    add     r5, r1, r12, lsl #1         @ p32x = dram + dram[line]
    and     r12, r2, #0x100             @ shift bit
    add     r5, r5, r12, lsr #8         @ start one byte later when shifted

2:  @ loop_inner
    @ r4,r6 - counters; r5 - 32x data; r9,r10 - md,32x pal; r11 - md data
    @ r7,r8,r12,lr - temp
    tst     r5, #1                      @ the byte fetch order depends on alignment
    ldreqb  r8, [r5], #2
    ldrb    r7, [r5, #-1]
    ldrneb  r8, [r5, #2]!               @ r7,r8 - 32X pixel 0,1
    subs    r6, r6, #1
    blt     0b                          @ line complete -> loop_outer
    cmp     r7, r8
    beq     5f                          @ equal pair -> check_fill

3:  @ no_fill: handle one pixel pair
    ldrb    r12, [r11], #1              @ MD pixel 0
    ldrb    lr, [r11], #1               @ MD pixel 1

    mov     r7, r7, lsl #1
    mov     r8, r8, lsl #1
    ldrh    r7, [r10, r7]               @ 32X color 0
    ldrh    r8, [r10, r8]               @ 32X color 1

.if \do_md
    cmp     r3, r12, lsl #26            @ MD pixel 0 is bg?
    movne   r12, r12, lsl #1            @ no: fetch the MD color
    ldrneh  r12, [r9, r12]
    orreq   r7, r7, #0x20               @ yes: fold that into the prio bit
    cmp     r3, lr, lsl #26             @ same for pixel 1
    movne   lr, lr, lsl #1
    ldrneh  lr, [r9, lr]
    orreq   r8, r8, #0x20

    tst     r7, #0x20                   @ MD color replaces 32X if no prio and not bg
    moveq   r7, r12
    tst     r8, #0x20
    moveq   r8, lr
    orr     r7, r7, r8, lsl #16         @ one word store for both pixels
    str     r7, [r0], #4                @ (no write combining on ARM9)
.else
    cmp     r3, r12, lsl #26            @ fold MD bg info into the prio bit
    orreq   r7, r7, #0x20
    cmp     r3, lr, lsl #26
    orreq   r8, r8, #0x20

    add     r0, r0, #4                  @ 32X pixels stored only if 32X prio or MD bg
    tst     r7, #0x20
    strneh  r7, [r0, #-4]
    tst     r8, #0x20
    strneh  r8, [r0, #-2]
.endif
    b       2b                          @ loop_inner

5:  @ check_fill: a run is only handled specially from 4 equal pixels on
    bic     r12, r5, #1
    ldrh    r12, [r12]                  @ next two pixels as one halfword
    orr     lr, r7, r7, lsl #8          @ the pixel value in both bytes
    cmp     r12, lr
    bne     3b                          @ too short -> no_fill
    add     r5, r5, #2                  @ account for the two pixels just checked

    sub     lr, r5, #4                  @ lr = 32X data pointer at the run start
    bic     r5, r5, #1                  @ halfword align for the count loop
    add     r6, r6, #1                  @ undo the decrement from loop_inner
    orr     r7, r7, r7, lsl #8          @ pattern compared per halfword
6:  @ count_loop: checks 4 pixels per round
    sub     r12, r5, lr                 @ bytes covered so far
    ldrh    r8, [r5], #2
    cmp     r12, r6, lsl #1             @ compare against the bytes left in the line
    ldrh    r12, [r5], #2
    bge     7f                          @ count_done
    cmp     r8, r7
    cmpeq   r12, r7
    beq     6b
    cmp     r8, r7
    addeq   r5, r5, #2                  @ first halfword still matched: keep it

7:  @ count_done
    sub     r5, r5, #4                  @ undo readahead

    tst     lr, #1                      @ fix alignment and calculate count
    subne   r5, r5, #1
    sub     r8, r5, lr                  @ r8 = pixel count

    and     r7, r7, #0xff               @ back to the single pixel value
    mov     r7, r7, lsl #1
    ldrh    r7, [r10, r7]               @ 32X color of the run

    cmp     r8, r6, lsl #1              @ clamp the count to the line length
    movgt   r8, r6, lsl #1
    sub     r6, r6, r8, lsr #1          @ consume the pixel pairs

    tst     r7, #0x20                   @ check for prio transfer
    beq     9f                          @ bit clear -> bg_loop

    add     r11, r11, r8                @ skip the MD pixels, they are not used
    orr     r12, r7, r7, lsl #16
    mov     r7, r12                     @ r7 = r12 = color pair
8:  @ 32x_loop
    subs    r8, r8, #4                  @ 4 pixels per round
    stmgeia r0!, {r7, r12}
    bgt     8b                          @ 32x_loop
    beq     2b                          @ nothing left over -> loop_inner
    adds    r8, r8, #2
    strge   r7, [r0], #4                @ 2 leftover pixels
    b       2b                          @ loop_inner

9:  @ bg_loop: the 32X color shows only where MD has a bg pixel
    ldrb    r12, [r11], #1              @ MD pixel 0,1
    ldrb    lr, [r11], #1
.if \do_md
    cmp     r3, r12, lsl #26            @ MD pixel 0 is bg?
    mov     r12, r12, lsl #1
    ldrneh  r12, [r9, r12]              @ no: palmd[pixel]
    moveq   r12, r7                     @ yes: 32X color
    cmp     r3, lr, lsl #26             @ MD pixel 1 is bg?
    mov     lr, lr, lsl #1
    ldrneh  lr, [r9, lr]
    moveq   lr, r7
    orr     r12, r12, lr, lsl #16       @ one word store for both pixels
    str     r12, [r0], #4               @ (no write combining on ARM9)
.else
    add     r0, r0, #4
    cmp     r3, r12, lsl #26            @ MD pixel 0 is bg?
    streqh  r7, [r0, #-4]
    cmp     r3, lr, lsl #26             @ MD pixel 1 is bg?
    streqh  r7, [r0, #-2]
.endif
    subs    r8, r8, #2
    bgt     9b                          @ bg_loop
    b       2b                          @ loop_inner
.endm
|
|
|
|
|
|
@ run length
@ args: unsigned short *dst, unsigned short *dram, int lines_sft_offs, int mdbg
@   r0 = dst, r1 = dram, r2 = lines_sft_offs, r3 = mdbg
@   lines_sft_offs: bits 0-7 = line offset, bits 16 and up = line count
@ macro parameters:
@   name      - global symbol emitted for the function
@   call_scan - nonzero: wrap every line in the scan begin/end callbacks
@   do_md     - nonzero: store the MD palette color where 32X does not win
@               zero:    store only the 32X pixels, leave the rest of dst
.macro make_do_loop_rl name call_scan do_md
.global \name
\name:
    stmfd   sp!, {r4-r11,lr}

    PIC_LDR(lr, r9, Pico)
    PIC_LDR(r10,r9, Pico32xMem)
    ldr     r9, =OFS_PMEM32x_pal_native
    ldr     r10, [r10]                  @ value stored in Pico32xMem
    ldr     r11, [lr, #OFS_Pico_est+OFS_EST_Draw2FB]
    add     r10, r10, r9                @ r10 = native 32X palette
    add     r9, lr, #OFS_Pico_est+OFS_EST_HighPal   @ r9 = palmd

    and     r4, r2, #0xff               @ offs
    mov     r5, #328                    @ MD line stride in bytes
    mov     r3, r3, lsl #26             @ keep mdbg as mdbg << 26
    mla     r11, r4, r5, r11            @ r11 = pmd = Draw2FB + offs*328
    call_scan_prep \call_scan lr

    mov     r4, #0                      @ line = 0
    b       1f                          @ enter at loop_outer_entry

0:  @ loop_outer: one line finished
    call_scan_end \call_scan
    ldr     r12, [r9, #OFS_EST_DrawLineDestIncr-OFS_EST_HighPal]
    sub     r0, r0, #320*2              @ back to the line start ...
    add     r0, r0, r12                 @ ... and on to the next line
    add     r4, r4, #1
    cmp     r4, r2, lsr #16             @ all lines done?
    call_scan_fin_ge \call_scan
    ldmgefd sp!, {r4-r11,pc}

1:  @ loop_outer_entry
    call_scan_begin \call_scan
    mov     r12, r4, lsl #1
    ldrh    r12, [r1, r12]              @ dram[line]
    add     r11, r11, #8                @ skip 8 bytes at the MD line start
    mov     r6, #320                    @ pixels per line
    add     r5, r1, r12, lsl #1         @ p32x = dram + dram[line]

2:  @ loop_inner: fetch the next run
    ldrh    r8, [r5], #2                @ control word
    and     r12, r8, #0xff              @ low byte = palette index
    mov     r12, r12, lsl #1
    ldrh    lr, [r10, r12]              @ t = 32x pixel
    eor     lr, lr, #0x20               @ flip bit 0x20 for the test below

3:  @ loop_innermost: one pixel of the run
    subs    r6, r6, #1
    ldrgeb  r7, [r11], #1               @ MD pixel
    blt     0b                          @ line complete -> loop_outer
    tst     lr, #0x20
    cmpne   r3, r7, lsl #26             @ MD has bg pixel?
.if \do_md
    mov     r7, r7, lsl #1
    ldrneh  r12, [r9, r7]               @ t = palmd[md pixel]
    streqh  lr, [r0], #2                @ EQ: 32X pixel
    strneh  r12, [r0], #2               @ NE: MD pixel
.else
    streqh  lr, [r0]                    @ EQ: 32X pixel, else dst is kept
    add     r0, r0, #2
.endif
    subs    r8, r8, #0x100              @ count down in the upper byte
    bge     3b                          @ loop_innermost
    b       2b                          @ loop_inner
.endm
|
|
|
|
|
|
@ Instantiate every loop variant: name, call_scan, do_md.
@ A literal pool is placed after each group.

@ direct color
    make_do_loop_dc do_loop_dc,         0, 0
    make_do_loop_dc do_loop_dc_md,      0, 1
    make_do_loop_dc do_loop_dc_scan,    1, 0
    make_do_loop_dc do_loop_dc_scan_md, 1, 1
    .pool

@ packed pixel
    make_do_loop_pp do_loop_pp,         0, 0
    make_do_loop_pp do_loop_pp_md,      0, 1
    make_do_loop_pp do_loop_pp_scan,    1, 0
    make_do_loop_pp do_loop_pp_scan_md, 1, 1
    .pool

@ run length
    make_do_loop_rl do_loop_rl,         0, 0
    make_do_loop_rl do_loop_rl_md,      0, 1
    make_do_loop_rl do_loop_rl_scan,    1, 0
    make_do_loop_rl do_loop_rl_scan_md, 1, 1
    .pool
|
|
|
|
@ vim:filetype=armasm
|