ym2612 ARM optimisations

This commit is contained in:
kub 2020-04-07 22:07:38 +02:00
parent b061bc166c
commit c918379137

View file

@ -15,8 +15,8 @@
#include "../arm_features.h"
@ very simple adaption YM2612 output rate to sample rate (~1M cycles @44100)
//#define INTERPOL
@ very simple YM2612 output rate to sample rate adaption (~500k cycles @44100)
#define INTERPOL
.equiv SLOT1, 0
.equiv SLOT2, 2
@ -44,7 +44,7 @@
@ r5=slot, r1=eg_cnt, trashes: r0,r2,r3
@ writes output to routp, but only if vol_out changes
.macro update_eg_phase_slot slot
.macro update_eg_phase_slot
#if defined(INTERPOL)
ldrh r0, [r5,#0x34] @ vol_out
#endif
@ -190,21 +190,6 @@
ldrh r3, [r5,#0x18] @ tl
add r0, r0, r3 @ volume += tl
strh r0, [r5,#0x34] @ vol_out
.if \slot == SLOT1
mov r6, r6, lsr #16
orr r6, r0, r6, lsl #16
.elseif \slot == SLOT2
mov r6, r6, lsl #16
mov r0, r0, lsl #16
orr r6, r0, r6, lsr #16
.elseif \slot == SLOT3
mov r7, r7, lsr #16
orr r7, r0, r7, lsl #16
.elseif \slot == SLOT4
mov r7, r7, lsl #16
mov r0, r0, lsl #16
orr r7, r0, r7, lsr #16
.endif
0: @ EG_OFF
.endm
@ -672,24 +657,16 @@ chan_render_loop:
mov r11, r1
and r0, r0, #7
orr r4, r4, r0 @ (length<<8)|algo
add r0, lr, #0x44
ldmia r0, {r8,r9} @ eg_timer, eg_timer_add
ldr r8, [lr, #0x44] @ eg_timer
ldr r9, [lr, #0x48] @ eg_timer_add
ldr r10, [lr, #0x54] @ op1_out
@ ldmia lr, {r6,r7} @ load volumes
ldr r5, [lr, #0x40] @ CH
ldrh r6, [r5, #0x34] @ vol_out values for all slots
ldrh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2]
ldrh r7, [r5, #0x34+SLOT_STRUCT_SIZE]
ldrh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3]
orr r6, r6, r2, lsl #16
orr r7, r7, r3, lsl #16
tst r12, #8 @ lfo?
beq crl_loop
crl_loop_lfo:
add r0, lr, #0x30
ldmia r0, {r1,r2} @ lfo_cnt, lfo_inc
ldr r1, [lr, #0x30] @ lfo_cnt
ldr r2, [lr, #0x34] @ lfo_inc
subs r4, r4, #0x100
bmi crl_loop_end
@ -707,37 +684,48 @@ crl_loop:
bmi crl_loop_end
@ -- SSG --
add r0, lr, #0x3c
ldmia r0, {r1,r5} @ eg_cnt, CH
ldr r5, [lr, #0x40] @ CH
@ r5=slot, trashes: r0,r2,r3
mov r6, #4
ssg_upd_loop:
update_ssg_eg
add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT2 (2)
update_ssg_eg
sub r5, r5, #SLOT_STRUCT_SIZE @ SLOT3 (1)
update_ssg_eg
add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT4 (3)
#if 0
subs r6, r6, #1
addne r5, r5, #SLOT_STRUCT_SIZE
#else
add r5, r5, #SLOT_STRUCT_SIZE*2
update_ssg_eg
subs r6, r6, #2
subne r5, r5, #SLOT_STRUCT_SIZE
#endif
bne ssg_upd_loop
sub r5, r5, #SLOT_STRUCT_SIZE*3
@ -- EG --
add r8, r8, r9
cmp r8, #EG_TIMER_OVERFLOW
bcc eg_done
ldr r1, [lr, #0x3c] @ eg_cnt
eg_loop:
sub r8, r8, #EG_TIMER_OVERFLOW
add r1, r1, #1
cmp r1, #4096
movge r1, #1
@ SLOT1 (0)
@ r5=slot, r1=eg_cnt, trashes: r0,r2,r3
update_eg_phase_slot SLOT1
add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT2 (2)
update_eg_phase_slot SLOT2
sub r5, r5, #SLOT_STRUCT_SIZE @ SLOT3 (1)
update_eg_phase_slot SLOT3
add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT4 (3)
update_eg_phase_slot SLOT4
@ Run the EG phase update once for each of the 4 slot structs of the channel
@ at r5. r6 = slots still to do; the macro is documented as trashing only
@ r0,r2,r3, so r6 survives each expansion.
mov r6, #4
eg_upd_loop:
update_eg_phase_slot
#if 1
@ active variant: one slot per pass, stepping through the structs in memory order
subs r6, r6, #1
addne r5, r5, #SLOT_STRUCT_SIZE
#else
@ disabled variant: two slots per pass, visiting struct indexes 0,2 and then 1,3
add r5, r5, #SLOT_STRUCT_SIZE*2
update_eg_phase_slot
subs r6, r6, #2
subne r5, r5, #SLOT_STRUCT_SIZE
#endif
@ addne/subne do not set flags, so Z here is still the result of the subs
bne eg_upd_loop
@ the cmp result is left for code after this span; sub without S keeps the flags
cmp r8, #EG_TIMER_OVERFLOW
@ either variant leaves r5 on struct index 3: rewind it to index 0
sub r5, r5, #SLOT_STRUCT_SIZE*3
@ -754,64 +742,49 @@ eg_done:
beq crl_loop
@ output interpolation
#if 0 // too expensive on slow platforms
#if defined(INTERPOL)
#if 1 // possibly too expensive for slow platforms?
@ basic interpolator, interpolate in middle region, else use closer value
mov r3, r8, lsr #EG_SH @ eg_timer, [0..3<<EG_SH) after loop
cmp r3, #(EG_TIMER_OVERFLOW>>EG_SH)/2
bgt 0f @ mix is vol_out
bne 0f @ mix is vol_out
ldrh r0, [r5,#0x36] @ SLOT1 vol_ipol
lsleq r2, r6, #16
addeq r0, r0, r2, lsr #16
lsreq r0, r0, #1
mov r6, r6, lsr #16
orr r6, r0, r6, lsl #16
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol
addeq r0, r0, r6, lsr #16
lsreq r0, r0, #1
mov r6, r6, lsl #16
orr r6, r6, r0
ror r6, r6, #16
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol
lsleq r2, r7, #16
addeq r0, r0, r2, lsr #16
lsreq r0, r0, #1
mov r7, r7, lsr #16
orr r7, r0, r7, lsl #16
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol
addeq r0, r0, r7, lsr #16
lsreq r0, r0, #1
mov r7, r7, lsl #16
orr r7, r7, r0
ror r7, r7, #16
#elif defined(INTERPOL)
ldr r6, [r5, #0x34] @ vol_out, vol_ipol for all slots
ldr r2, [r5, #0x34+SLOT_STRUCT_SIZE*2]
ldr r7, [r5, #0x34+SLOT_STRUCT_SIZE]
ldr r3, [r5, #0x34+SLOT_STRUCT_SIZE*3]
add r6, r6, r6, lsl #16
lsr r6, r6, #17
add r2, r2, r2, lsl #16
lsr r2, r2, #17
add r7, r7, r7, lsl #16
lsr r7, r7, #17
add r3, r3, r3, lsl #16
lsr r3, r3, #17
b 1f
#else
@ super-basic... just take value closest to sample point
mov r3, r8, lsr #EG_SH-1 @ eg_timer, [0..3<<EG_SH) after loop
cmp r3, #(EG_TIMER_OVERFLOW>>EG_SH)
bge 0f @ mix is vol_out
ldrh r0, [r5,#0x36] @ SLOT1 vol_ipol
mov r6, r6, lsr #16
orr r6, r0, r6, lsl #16
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol
mov r6, r6, lsl #16
orr r6, r6, r0
ror r6, r6, #16
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol
mov r7, r7, lsr #16
orr r7, r0, r7, lsl #16
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol
mov r7, r7, lsl #16
orr r7, r7, r0
ror r7, r7, #16
#endif
0:
@ Select one envelope halfword per slot and pack the four of them into r6/r7.
@ The ge/lt conditions use the flags of the cmp r3 issued by the interpolation
@ variant compiled in above: ge reads vol_out (+0x34), lt reads vol_ipol (+0x36).
@ Exactly one load of each ge/lt pair executes.
0: ldrgeh r6, [r5, #0x34] @ vol_out values for all slots
ldrlth r6, [r5, #0x36] @ vol_ipol values for all slots
ldrgeh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2]
ldrlth r2, [r5, #0x36+SLOT_STRUCT_SIZE*2]
ldrgeh r7, [r5, #0x34+SLOT_STRUCT_SIZE]
ldrlth r7, [r5, #0x36+SLOT_STRUCT_SIZE]
ldrgeh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3]
ldrlth r3, [r5, #0x36+SLOT_STRUCT_SIZE*3]
#else
@ INTERPOL not defined: always take vol_out
ldrh r6, [r5, #0x34] @ vol_out values for all slots
ldrh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2]
ldrh r7, [r5, #0x34+SLOT_STRUCT_SIZE]
ldrh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3]
#endif
@ 1: is also the target of the b 1f in the averaging path, which gets here with
@ r6,r2,r7,r3 already holding (vol_out + vol_ipol) >> 1 for each slot.
@ Packing: low half = struct index 0 or 1, high half = struct index 2 or 3.
1: orr r6, r6, r2, lsl #16 @ r6 = SLOT1 | SLOT2 << 16 (struct indexes 0, 2)
orr r7, r7, r3, lsl #16 @ r7 = SLOT3 | SLOT4 << 16 (struct indexes 1, 3)
@ -- SLOT1 --
PIC_LDR(r3, r2, ym_tl_tab)
@ -893,34 +866,28 @@ crl_algo_done:
strne r1, [r11], #4
b crl_do_phase
ctl_sample_skip:
and r1, r12, #1
add r1, r1, #1
add r11,r11, r1, lsl #2
b crl_do_phase
@ mono: add r0 to the 32-bit word at [r11] and store it back, post-incrementing
@ r11 by 4 bytes.
ctl_sample_mono:
ldr r1, [r11]
add r1, r0, r1
str r1, [r11], #4
b crl_do_phase
@ skip: store nothing, only advance r11 by 4 bytes (r12 bit 0 clear) or by
@ 8 bytes (r12 bit 0 set).
@ NOTE(review): r12 bit 0 presumably marks stereo output, i.e. two words per
@ sample - confirm against the code that builds r12.
@ No branch follows the last add: execution falls through into crl_do_phase.
ctl_sample_skip:
and r1, r12, #1
add r1, r1, #1
add r11,r11, r1, lsl #2
crl_do_phase:
@ -- PHASE UPDATE --
add r5, lr, #0x10
ldmia r5, {r0-r1}
add r5, lr, #0x20
ldmia r5, {r2-r3}
add r5, lr, #0x10
add r0, r0, r2
add r1, r1, r3
stmia r5!,{r0-r1}
ldmia r5, {r0-r1}
add r5, lr, #0x28
ldmia r5, {r2-r3}
add r5, lr, #0x18
add r0, r0, r2
add r1, r1, r3
stmia r5, {r0-r1}
@ Single-pass phase update: add the four words at r5+0x10..r5+0x1c to the four
@ words at r5+0x00..r5+0x0c and write the sums back to r5+0x00..r5+0x0c.
@ NOTE(review): this only matches the PHASE UPDATE header if r5 = lr+0x10 on
@ entry (the add placed directly under that header). The two ldmia/stmia pairs
@ listed before this point perform the same additions and leave r5 = lr+0x18;
@ they look like the superseded form of this code - confirm that only one of
@ the two sequences is assembled.
ldmia r5, {r0-r3,r6-r7} @ r0-r3 = [r5+0x00..0x0c], r6 = [r5+0x10], r7 = [r5+0x14]
add r0, r0, r6
add r1, r1, r7
@ r6/r7 are plain scratch here; reuse them for the remaining two addends
ldr r6, [r5, #0x18]
ldr r7, [r5, #0x1c]
add r2, r2, r6
add r3, r3, r7
stmia r5, {r0-r3} @ no writeback: r5 still points at the first word
@ same r12 bit 3 test that is commented as the lfo check at the loop setup:
@ when set, continue at the loop head that reloads lfo_cnt and lfo_inc
tst r12, #8
bne crl_loop_lfo
@ -928,7 +895,6 @@ crl_do_phase:
crl_loop_end:
@ stmia lr, {r6,r7} @ save volumes (for debug)
str r8, [lr, #0x44] @ eg_timer
str r12, [lr, #0x4c] @ pack (for lfo_ampm)
str r4, [lr, #0x50] @ was_update