ym2612 ARM optimisations

This commit is contained in:
kub 2020-04-07 22:07:38 +02:00
parent b061bc166c
commit c918379137

View file

@ -15,8 +15,8 @@
#include "../arm_features.h" #include "../arm_features.h"
@ very simple adaption YM2612 output rate to sample rate (~1M cycles @44100) @ very simple YM2612 output rate to sample rate adaption (~500k cycles @44100)
//#define INTERPOL #define INTERPOL
.equiv SLOT1, 0 .equiv SLOT1, 0
.equiv SLOT2, 2 .equiv SLOT2, 2
@ -44,7 +44,7 @@
@ r5=slot, r1=eg_cnt, trashes: r0,r2,r3 @ r5=slot, r1=eg_cnt, trashes: r0,r2,r3
@ writes output to routp, but only if vol_out changes @ writes output to routp, but only if vol_out changes
.macro update_eg_phase_slot slot .macro update_eg_phase_slot
#if defined(INTERPOL) #if defined(INTERPOL)
ldrh r0, [r5,#0x34] @ vol_out ldrh r0, [r5,#0x34] @ vol_out
#endif #endif
@ -190,21 +190,6 @@
ldrh r3, [r5,#0x18] @ tl ldrh r3, [r5,#0x18] @ tl
add r0, r0, r3 @ volume += tl add r0, r0, r3 @ volume += tl
strh r0, [r5,#0x34] @ vol_out strh r0, [r5,#0x34] @ vol_out
.if \slot == SLOT1
mov r6, r6, lsr #16
orr r6, r0, r6, lsl #16
.elseif \slot == SLOT2
mov r6, r6, lsl #16
mov r0, r0, lsl #16
orr r6, r0, r6, lsr #16
.elseif \slot == SLOT3
mov r7, r7, lsr #16
orr r7, r0, r7, lsl #16
.elseif \slot == SLOT4
mov r7, r7, lsl #16
mov r0, r0, lsl #16
orr r7, r0, r7, lsr #16
.endif
0: @ EG_OFF 0: @ EG_OFF
.endm .endm
@ -672,24 +657,16 @@ chan_render_loop:
mov r11, r1 mov r11, r1
and r0, r0, #7 and r0, r0, #7
orr r4, r4, r0 @ (length<<8)|algo orr r4, r4, r0 @ (length<<8)|algo
add r0, lr, #0x44 ldr r8, [lr, #0x44] @ eg_timer
ldmia r0, {r8,r9} @ eg_timer, eg_timer_add ldr r9, [lr, #0x48] @ eg_timer_add
ldr r10, [lr, #0x54] @ op1_out ldr r10, [lr, #0x54] @ op1_out
@ ldmia lr, {r6,r7} @ load volumes
ldr r5, [lr, #0x40] @ CH
ldrh r6, [r5, #0x34] @ vol_out values for all slots
ldrh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2]
ldrh r7, [r5, #0x34+SLOT_STRUCT_SIZE]
ldrh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3]
orr r6, r6, r2, lsl #16
orr r7, r7, r3, lsl #16
tst r12, #8 @ lfo? tst r12, #8 @ lfo?
beq crl_loop beq crl_loop
crl_loop_lfo: crl_loop_lfo:
add r0, lr, #0x30 ldr r1, [lr, #0x30] @ lfo_cnt
ldmia r0, {r1,r2} @ lfo_cnt, lfo_inc ldr r2, [lr, #0x34] @ lfo_inc
subs r4, r4, #0x100 subs r4, r4, #0x100
bmi crl_loop_end bmi crl_loop_end
@ -707,37 +684,48 @@ crl_loop:
bmi crl_loop_end bmi crl_loop_end
@ -- SSG -- @ -- SSG --
add r0, lr, #0x3c ldr r5, [lr, #0x40] @ CH
ldmia r0, {r1,r5} @ eg_cnt, CH
@ r5=slot, trashes: r0,r2,r3 @ r5=slot, trashes: r0,r2,r3
mov r6, #4
ssg_upd_loop:
update_ssg_eg update_ssg_eg
add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT2 (2) #if 0
update_ssg_eg subs r6, r6, #1
sub r5, r5, #SLOT_STRUCT_SIZE @ SLOT3 (1) addne r5, r5, #SLOT_STRUCT_SIZE
update_ssg_eg #else
add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT4 (3) add r5, r5, #SLOT_STRUCT_SIZE*2
update_ssg_eg update_ssg_eg
subs r6, r6, #2
subne r5, r5, #SLOT_STRUCT_SIZE
#endif
bne ssg_upd_loop
sub r5, r5, #SLOT_STRUCT_SIZE*3 sub r5, r5, #SLOT_STRUCT_SIZE*3
@ -- EG -- @ -- EG --
add r8, r8, r9 add r8, r8, r9
cmp r8, #EG_TIMER_OVERFLOW cmp r8, #EG_TIMER_OVERFLOW
bcc eg_done bcc eg_done
ldr r1, [lr, #0x3c] @ eg_cnt
eg_loop: eg_loop:
sub r8, r8, #EG_TIMER_OVERFLOW sub r8, r8, #EG_TIMER_OVERFLOW
add r1, r1, #1 add r1, r1, #1
cmp r1, #4096 cmp r1, #4096
movge r1, #1 movge r1, #1
@ SLOT1 (0)
@ r5=slot, r1=eg_cnt, trashes: r0,r2,r3 mov r6, #4
update_eg_phase_slot SLOT1 eg_upd_loop:
add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT2 (2) update_eg_phase_slot
update_eg_phase_slot SLOT2 #if 1
sub r5, r5, #SLOT_STRUCT_SIZE @ SLOT3 (1) subs r6, r6, #1
update_eg_phase_slot SLOT3 addne r5, r5, #SLOT_STRUCT_SIZE
add r5, r5, #SLOT_STRUCT_SIZE*2 @ SLOT4 (3) #else
update_eg_phase_slot SLOT4 add r5, r5, #SLOT_STRUCT_SIZE*2
update_eg_phase_slot
subs r6, r6, #2
subne r5, r5, #SLOT_STRUCT_SIZE
#endif
bne eg_upd_loop
cmp r8, #EG_TIMER_OVERFLOW cmp r8, #EG_TIMER_OVERFLOW
sub r5, r5, #SLOT_STRUCT_SIZE*3 sub r5, r5, #SLOT_STRUCT_SIZE*3
@ -754,64 +742,49 @@ eg_done:
beq crl_loop beq crl_loop
@ output interpolation @ output interpolation
#if 0 // too expensive on slow platforms #if defined(INTERPOL)
#if 1 // possibly too expensive for slow platforms?
@ basic interpolator, interpolate in middle region, else use closer value @ basic interpolator, interpolate in middle region, else use closer value
mov r3, r8, lsr #EG_SH @ eg_timer, [0..3<<EG_SH) after loop mov r3, r8, lsr #EG_SH @ eg_timer, [0..3<<EG_SH) after loop
cmp r3, #(EG_TIMER_OVERFLOW>>EG_SH)/2 cmp r3, #(EG_TIMER_OVERFLOW>>EG_SH)/2
bgt 0f @ mix is vol_out bne 0f @ mix is vol_out
ldrh r0, [r5,#0x36] @ SLOT1 vol_ipol ldr r6, [r5, #0x34] @ vol_out, vol_ipol for all slots
lsleq r2, r6, #16 ldr r2, [r5, #0x34+SLOT_STRUCT_SIZE*2]
addeq r0, r0, r2, lsr #16 ldr r7, [r5, #0x34+SLOT_STRUCT_SIZE]
lsreq r0, r0, #1 ldr r3, [r5, #0x34+SLOT_STRUCT_SIZE*3]
mov r6, r6, lsr #16 add r6, r6, r6, lsl #16
orr r6, r0, r6, lsl #16 lsr r6, r6, #17
add r2, r2, r2, lsl #16
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol lsr r2, r2, #17
addeq r0, r0, r6, lsr #16 add r7, r7, r7, lsl #16
lsreq r0, r0, #1 lsr r7, r7, #17
mov r6, r6, lsl #16 add r3, r3, r3, lsl #16
orr r6, r6, r0 lsr r3, r3, #17
ror r6, r6, #16 b 1f
#else
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol
lsleq r2, r7, #16
addeq r0, r0, r2, lsr #16
lsreq r0, r0, #1
mov r7, r7, lsr #16
orr r7, r0, r7, lsl #16
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol
addeq r0, r0, r7, lsr #16
lsreq r0, r0, #1
mov r7, r7, lsl #16
orr r7, r7, r0
ror r7, r7, #16
#elif defined(INTERPOL)
@ super-basic... just take value closest to sample point @ super-basic... just take value closest to sample point
mov r3, r8, lsr #EG_SH-1 @ eg_timer, [0..3<<EG_SH) after loop mov r3, r8, lsr #EG_SH-1 @ eg_timer, [0..3<<EG_SH) after loop
cmp r3, #(EG_TIMER_OVERFLOW>>EG_SH) cmp r3, #(EG_TIMER_OVERFLOW>>EG_SH)
bge 0f @ mix is vol_out
ldrh r0, [r5,#0x36] @ SLOT1 vol_ipol
mov r6, r6, lsr #16
orr r6, r0, r6, lsl #16
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*2] @ SLOT2 vol_ipol
mov r6, r6, lsl #16
orr r6, r6, r0
ror r6, r6, #16
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE] @ SLOT3 vol_ipol
mov r7, r7, lsr #16
orr r7, r0, r7, lsl #16
ldrh r0, [r5,#0x36+SLOT_STRUCT_SIZE*3] @ SLOT4 vol_ipol
mov r7, r7, lsl #16
orr r7, r7, r0
ror r7, r7, #16
#endif #endif
0:
0: ldrgeh r6, [r5, #0x34] @ vol_out values for all slots
ldrlth r6, [r5, #0x36] @ vol_ipol values for all slots
ldrgeh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2]
ldrlth r2, [r5, #0x36+SLOT_STRUCT_SIZE*2]
ldrgeh r7, [r5, #0x34+SLOT_STRUCT_SIZE]
ldrlth r7, [r5, #0x36+SLOT_STRUCT_SIZE]
ldrgeh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3]
ldrlth r3, [r5, #0x36+SLOT_STRUCT_SIZE*3]
#else
ldrh r6, [r5, #0x34] @ vol_out values for all slots
ldrh r2, [r5, #0x34+SLOT_STRUCT_SIZE*2]
ldrh r7, [r5, #0x34+SLOT_STRUCT_SIZE]
ldrh r3, [r5, #0x34+SLOT_STRUCT_SIZE*3]
#endif
1: orr r6, r6, r2, lsl #16
orr r7, r7, r3, lsl #16
@ -- SLOT1 -- @ -- SLOT1 --
PIC_LDR(r3, r2, ym_tl_tab) PIC_LDR(r3, r2, ym_tl_tab)
@ -893,34 +866,28 @@ crl_algo_done:
strne r1, [r11], #4 strne r1, [r11], #4
b crl_do_phase b crl_do_phase
ctl_sample_skip:
and r1, r12, #1
add r1, r1, #1
add r11,r11, r1, lsl #2
b crl_do_phase
ctl_sample_mono: ctl_sample_mono:
ldr r1, [r11] ldr r1, [r11]
add r1, r0, r1 add r1, r0, r1
str r1, [r11], #4 str r1, [r11], #4
b crl_do_phase
ctl_sample_skip:
and r1, r12, #1
add r1, r1, #1
add r11,r11, r1, lsl #2
crl_do_phase: crl_do_phase:
@ -- PHASE UPDATE -- @ -- PHASE UPDATE --
add r5, lr, #0x10 add r5, lr, #0x10
ldmia r5, {r0-r1} ldmia r5, {r0-r3,r6-r7}
add r5, lr, #0x20 add r0, r0, r6
ldmia r5, {r2-r3} add r1, r1, r7
add r5, lr, #0x10 ldr r6, [r5, #0x18]
add r0, r0, r2 ldr r7, [r5, #0x1c]
add r1, r1, r3 add r2, r2, r6
stmia r5!,{r0-r1} add r3, r3, r7
ldmia r5, {r0-r1} stmia r5, {r0-r3}
add r5, lr, #0x28
ldmia r5, {r2-r3}
add r5, lr, #0x18
add r0, r0, r2
add r1, r1, r3
stmia r5, {r0-r1}
tst r12, #8 tst r12, #8
bne crl_loop_lfo bne crl_loop_lfo
@ -928,7 +895,6 @@ crl_do_phase:
crl_loop_end: crl_loop_end:
@ stmia lr, {r6,r7} @ save volumes (for debug)
str r8, [lr, #0x44] @ eg_timer str r8, [lr, #0x44] @ eg_timer
str r12, [lr, #0x4c] @ pack (for lfo_ampm) str r12, [lr, #0x4c] @ pack (for lfo_ampm)
str r4, [lr, #0x50] @ was_update str r4, [lr, #0x50] @ was_update