sh2: optimisations in drc

This commit is contained in:
kub 2020-05-06 23:06:10 +02:00
parent 39c5ec3f4c
commit 904fb98e6c
2 changed files with 72 additions and 51 deletions

View file

@ -17,18 +17,18 @@ void drc_cmn_cleanup(void);
// binary search approach, since we don't have CLZ on ARM920T // binary search approach, since we don't have CLZ on ARM920T
#define FOR_ALL_BITS_SET_DO(mask, bit, code) { \ #define FOR_ALL_BITS_SET_DO(mask, bit, code) { \
u32 __mask = mask; \ u32 __mask = mask; \
for (bit = 31; bit >= 0 && mask; bit--, __mask <<= 1) { \ for (bit = 0; bit < 32 && mask; bit++, __mask >>= 1) { \
if (!(__mask & (0xffff << 16))) \ if (!(__mask & 0xffff)) \
bit -= 16, __mask <<= 16; \ bit += 16,__mask >>= 16; \
if (!(__mask & (0xff << 24))) \ if (!(__mask & 0xff)) \
bit -= 8, __mask <<= 8; \ bit += 8, __mask >>= 8; \
if (!(__mask & (0xf << 28))) \ if (!(__mask & 0xf)) \
bit -= 4, __mask <<= 4; \ bit += 4, __mask >>= 4; \
if (!(__mask & (0x3 << 30))) \ if (!(__mask & 0x3)) \
bit -= 2, __mask <<= 2; \ bit += 2, __mask >>= 2; \
if (!(__mask & (0x1 << 31))) \ if (!(__mask & 0x1)) \
bit -= 1, __mask <<= 1; \ bit += 1, __mask >>= 1; \
if (__mask & (0x1 << 31)) { \ if (__mask & 0x1) { \
code; \ code; \
} \ } \
} \ } \

View file

@ -1549,22 +1549,31 @@ static u32 rcache_regs_clean; // regs needing cleaning
static void rcache_lock_vreg(int x) static void rcache_lock_vreg(int x)
{ {
if (x >= 0) { if (x >= 0) {
cache_regs[x].locked ++;
#if DRC_DEBUG & 64
if (cache_regs[x].type == HR_FREE) { if (cache_regs[x].type == HR_FREE) {
printf("locking free vreg %x, aborting\n", x); printf("locking free vreg %x, aborting\n", x);
exit(1); exit(1);
} }
cache_regs[x].locked ++; if (!cache_regs[x].locked) {
printf("locking overflow vreg %x, aborting\n", x);
exit(1);
}
#endif
} }
} }
static void rcache_unlock_vreg(int x) static void rcache_unlock_vreg(int x)
{ {
if (x >= 0) { if (x >= 0) {
#if DRC_DEBUG & 64
if (cache_regs[x].type == HR_FREE) { if (cache_regs[x].type == HR_FREE) {
printf("unlocking free vreg %x, aborting\n", x); printf("unlocking free vreg %x, aborting\n", x);
exit(1); exit(1);
} }
cache_regs[x].locked --; #endif
if (cache_regs[x].locked)
cache_regs[x].locked --;
} }
} }
@ -1582,7 +1591,7 @@ static void rcache_unmap_vreg(int x)
FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, i, FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, i,
if (guest_regs[i].flags & GRF_DIRTY) { if (guest_regs[i].flags & GRF_DIRTY) {
// if a dirty reg is unmapped save its value to context // if a dirty reg is unmapped save its value to context
if (~rcache_regs_discard & (1 << i)) if ((~rcache_regs_discard | rcache_regs_now) & (1 << i))
emith_ctx_write(cache_regs[x].hreg, i * 4); emith_ctx_write(cache_regs[x].hreg, i * 4);
guest_regs[i].flags &= ~GRF_DIRTY; guest_regs[i].flags &= ~GRF_DIRTY;
} }
@ -1700,26 +1709,28 @@ static int rcache_allocate(int what, int minprio)
continue; continue;
if (cache_regs[i].type == HR_FREE || cache_regs[i].type == HR_TEMP) { if (cache_regs[i].type == HR_FREE || cache_regs[i].type == HR_TEMP) {
// REG is free // REG is free
prio = 6; prio = 10;
oldest = i; oldest = i;
break; break;
} }
if (cache_regs[i].type == HR_CACHED) { if (cache_regs[i].type == HR_CACHED) {
if (rcache_regs_now & cache_regs[i].gregs) if (rcache_regs_now & cache_regs[i].gregs)
// REGs needed for the current insn // REGs needed for the current insn
i_prio = 1; i_prio = 0;
else if (rcache_regs_soon & cache_regs[i].gregs) else if (rcache_regs_soon & cache_regs[i].gregs)
// REGs needed in the next insns // REGs needed in the next insns
i_prio = 2; i_prio = 2;
else if (rcache_regs_late & cache_regs[i].gregs) else if (rcache_regs_late & cache_regs[i].gregs)
// REGs needed in some future insn // REGs needed in some future insn
i_prio = 3;
else if (!(~rcache_regs_discard & cache_regs[i].gregs))
// REGs not needed in the foreseeable future
i_prio = 4; i_prio = 4;
else if (~rcache_regs_discard & cache_regs[i].gregs)
// REGs not needed in the foreseeable future
i_prio = 6;
else else
// REGs soon overwritten anyway // REGs soon overwritten anyway
i_prio = 5; i_prio = 8;
if (!(cache_regs[i].flags & HRF_DIRTY)) i_prio ++;
if (prio < i_prio || (prio == i_prio && cache_regs[i].stamp < min_stamp)) { if (prio < i_prio || (prio == i_prio && cache_regs[i].stamp < min_stamp)) {
min_stamp = cache_regs[i].stamp; min_stamp = cache_regs[i].stamp;
oldest = i; oldest = i;
@ -1744,21 +1755,21 @@ static int rcache_allocate_vreg(int needed)
{ {
int x; int x;
x = rcache_allocate(1, needed ? 0 : 3); x = rcache_allocate(1, needed ? 0 : 4);
if (x < 0) if (x < 0)
x = rcache_allocate(-1, 1); x = rcache_allocate(-1, 0);
return x; return x;
} }
static int rcache_allocate_nontemp(void) static int rcache_allocate_nontemp(void)
{ {
int x = rcache_allocate(0, 3); int x = rcache_allocate(0, 4);
return x; return x;
} }
static int rcache_allocate_temp(void) static int rcache_allocate_temp(void)
{ {
int x = rcache_allocate(-1, 1); int x = rcache_allocate(-1, 0);
if (x < 0) if (x < 0)
x = rcache_allocate(0, 0); x = rcache_allocate(0, 0);
return x; return x;
@ -1821,20 +1832,25 @@ static void rcache_remap_vreg(int x)
int d; int d;
// x must be a cached vreg // x must be a cached vreg
if (cache_regs[x].type != HR_CACHED) if (cache_regs[x].type != HR_CACHED || cache_regs[x].locked)
return; return;
// don't do it if x is already a REG or isn't used or to be cleaned anyway // don't do it if x isn't used
if ((cache_regs[x].htype & HRT_REG) || if (!(rsl_d & cache_regs[x].gregs)) {
!(rsl_d & cache_regs[x].gregs)) {
// clean here to avoid data loss on invalidation // clean here to avoid data loss on invalidation
rcache_clean_vreg(x); rcache_clean_vreg(x);
return; return;
} }
if (cache_regs[x].locked) { FOR_ALL_BITS_SET_DO(cache_regs[x].gregs, d,
printf("remap vreg %d is locked\n", x); if ((guest_regs[d].flags & (GRF_STATIC|GRF_PINNED)) &&
exit(1); !cache_regs[guest_regs[d].sreg].locked &&
} !((rsl_d|rcache_regs_now) & cache_regs[guest_regs[d].sreg].gregs)) {
// STATIC not in its sreg and sreg is available
rcache_evict_vreg(guest_regs[d].sreg);
rcache_move_vreg(guest_regs[d].sreg, x);
return;
}
)
// allocate a non-TEMP vreg // allocate a non-TEMP vreg
rcache_lock_vreg(x); // lock to avoid evicting x rcache_lock_vreg(x); // lock to avoid evicting x
@ -1891,8 +1907,8 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr
{ {
int src, dst, ali; int src, dst, ali;
cache_reg_t *tr; cache_reg_t *tr;
u32 rsp_d = (rcache_regs_now | rcache_regs_soon | u32 rsp_d = (rcache_regs_soon | rcache_regs_static | rcache_regs_pinned) &
rcache_regs_static | rcache_regs_pinned) & ~rcache_regs_discard; ~rcache_regs_discard;
dst = src = guest_regs[r].vreg; dst = src = guest_regs[r].vreg;
@ -1901,7 +1917,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr
if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) &&
src != guest_regs[r].sreg && (src < 0 || mode != RC_GR_READ) && src != guest_regs[r].sreg && (src < 0 || mode != RC_GR_READ) &&
!cache_regs[guest_regs[r].sreg].locked && !cache_regs[guest_regs[r].sreg].locked &&
!(rsp_d & cache_regs[guest_regs[r].sreg].gregs)) { !((rsp_d|rcache_regs_now) & cache_regs[guest_regs[r].sreg].gregs)) {
dst = guest_regs[r].sreg; dst = guest_regs[r].sreg;
rcache_evict_vreg(dst); rcache_evict_vreg(dst);
} else if (dst < 0) { } else if (dst < 0) {
@ -1926,7 +1942,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr
ali = tr->gregs & ~(1 << r); ali = tr->gregs & ~(1 << r);
if (mode != RC_GR_READ && src == dst && ali) { if (mode != RC_GR_READ && src == dst && ali) {
int x = -1; int x = -1;
if (rsp_d & ali) { if ((rsp_d|rcache_regs_now) & ali) {
if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) && if ((guest_regs[r].flags & (GRF_STATIC|GRF_PINNED)) &&
guest_regs[r].sreg == dst && !tr->locked) { guest_regs[r].sreg == dst && !tr->locked) {
// split aliases if r is STATIC in sreg and dst isn't already locked // split aliases if r is STATIC in sreg and dst isn't already locked
@ -1935,7 +1951,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr
if ((guest_regs[t].flags & (GRF_STATIC|GRF_PINNED)) && if ((guest_regs[t].flags & (GRF_STATIC|GRF_PINNED)) &&
!(ali & ~(1 << t)) && !(ali & ~(1 << t)) &&
!cache_regs[guest_regs[t].sreg].locked && !cache_regs[guest_regs[t].sreg].locked &&
!(rsp_d & cache_regs[guest_regs[t].sreg].gregs)) { !((rsp_d|rcache_regs_now) & cache_regs[guest_regs[t].sreg].gregs)) {
// alias is a single STATIC and its sreg is available // alias is a single STATIC and its sreg is available
x = guest_regs[t].sreg; x = guest_regs[t].sreg;
rcache_evict_vreg(x); rcache_evict_vreg(x);
@ -1947,8 +1963,9 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr
break; break;
) )
if (x >= 0) { if (x >= 0) {
src = x; rcache_remove_vreg_alias(src, r);
rcache_move_vreg(src, dst); src = dst;
rcache_move_vreg(x, dst);
} }
} else { } else {
// split r // split r
@ -1956,6 +1973,7 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr
x = rcache_allocate_vreg(rsp_d & (1 << r)); x = rcache_allocate_vreg(rsp_d & (1 << r));
rcache_unlock_vreg(src); rcache_unlock_vreg(src);
if (x >= 0) { if (x >= 0) {
rcache_remove_vreg_alias(src, r);
dst = x; dst = x;
tr = &cache_regs[dst]; tr = &cache_regs[dst];
tr->stamp = rcache_counter; tr->stamp = rcache_counter;
@ -1965,8 +1983,6 @@ static int rcache_get_reg_(sh2_reg_e r, rc_gr_mode mode, int do_locking, int *hr
if (x < 0) if (x < 0)
// aliases not needed or no vreg available, remove them // aliases not needed or no vreg available, remove them
rcache_evict_vreg_aliases(dst, r); rcache_evict_vreg_aliases(dst, r);
else if (src != dst)
rcache_remove_vreg_alias(src, r);
} }
// assign r to dst // assign r to dst
@ -2342,13 +2358,16 @@ static void rcache_clean_tmp(void)
static void rcache_clean_masked(u32 mask) static void rcache_clean_masked(u32 mask)
{ {
int i, r, hr; int i, r, hr;
u32 m;
rcache_regs_clean |= mask; rcache_regs_clean |= mask;
mask = rcache_regs_clean; mask = rcache_regs_clean;
// clean constants where all aliases are covered by the mask // clean constants where all aliases are covered by the mask, exempt statics
// to avoid flushing them to context if sreg isn't available
m = mask & ~(rcache_regs_static | rcache_regs_pinned);
for (i = 0; i < ARRAY_SIZE(gconsts); i++) for (i = 0; i < ARRAY_SIZE(gconsts); i++)
if ((gconsts[i].gregs & mask) && !(gconsts[i].gregs & ~mask)) { if ((gconsts[i].gregs & m) && !(gconsts[i].gregs & ~mask)) {
FOR_ALL_BITS_SET_DO(gconsts[i].gregs, r, FOR_ALL_BITS_SET_DO(gconsts[i].gregs, r,
if (guest_regs[r].flags & GRF_CDIRTY) { if (guest_regs[r].flags & GRF_CDIRTY) {
hr = rcache_get_reg_(r, RC_GR_READ, 0, NULL); hr = rcache_get_reg_(r, RC_GR_READ, 0, NULL);
@ -2479,6 +2498,9 @@ static void rcache_create(void)
} }
// create static host register mapping for SH2 regs // create static host register mapping for SH2 regs
for (i = 0; i < ARRAY_SIZE(guest_regs); i++) {
guest_regs[i] = (guest_reg_t){.sreg = -1};
}
for (i = 0; i < ARRAY_SIZE(regs_static); i += 2) { for (i = 0; i < ARRAY_SIZE(regs_static); i += 2) {
for (x = ARRAY_SIZE(cache_regs)-1; x >= 0; x--) for (x = ARRAY_SIZE(cache_regs)-1; x >= 0; x--)
if (cache_regs[x].hreg == regs_static[i+1]) break; if (cache_regs[x].hreg == regs_static[i+1]) break;
@ -2486,8 +2508,7 @@ static void rcache_create(void)
guest_regs[regs_static[i]] = (guest_reg_t){.flags = GRF_STATIC,.sreg = x}; guest_regs[regs_static[i]] = (guest_reg_t){.flags = GRF_STATIC,.sreg = x};
rcache_regs_static |= (1 << regs_static[i]); rcache_regs_static |= (1 << regs_static[i]);
rcache_vregs_reg &= ~(1 << x); rcache_vregs_reg &= ~(1 << x);
} else }
guest_regs[regs_static[i]] = (guest_reg_t){.sreg = -1};
} }
printf("DRC registers created, %ld host regs (%d REG, %d STATIC, 1 CTX)\n", printf("DRC registers created, %ld host regs (%d REG, %d STATIC, 1 CTX)\n",
@ -3501,7 +3522,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
rcache_set_usage_now(opd[0].source); // current insn rcache_set_usage_now(opd[0].source); // current insn
rcache_set_usage_soon(soon); // insns 1-4 rcache_set_usage_soon(soon); // insns 1-4
rcache_set_usage_late(late & ~soon); // insns 5-9 rcache_set_usage_late(late & ~soon); // insns 5-9
rcache_set_usage_discard(write & ~(late|soon|opd[0].source)); rcache_set_usage_discard(write & ~(late|soon));
if (v <= 9) if (v <= 9)
// upcoming rcache_flush, start writing back unused dirty stuff // upcoming rcache_flush, start writing back unused dirty stuff
rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest)); rcache_clean_masked(rcache_dirty_mask() & ~(write|opd[0].dest));
@ -4717,7 +4738,7 @@ end_op:
// branch not taken, correct cycle count // branch not taken, correct cycle count
if (ctaken) if (ctaken)
emith_add_r_imm(sr, ctaken << 12); cycles -= ctaken;
// set T bit to reflect branch not taken for OP_BRANCH_CT/CF // set T bit to reflect branch not taken for OP_BRANCH_CT/CF
if (emith_get_t_cond() >= 0) // T is synced for all other cases if (emith_get_t_cond() >= 0) // T is synced for all other cases
emith_set_t(sr, opd_b->op == OP_BRANCH_CF); emith_set_t(sr, opd_b->op == OP_BRANCH_CF);
@ -5263,11 +5284,11 @@ static void sh2_smc_rm_blocks(u32 a, int len, int tcache_id, u32 shift)
start_lit = block->addr_lit & wtmask; start_lit = block->addr_lit & wtmask;
end_lit = start_lit + block->size_lit; end_lit = start_lit + block->size_lit;
// disable/delete block if it covers the modified address // disable/delete block if it covers the modified address
if ((start_addr <= a+len && a < end_addr) || if ((start_addr < a+len && a < end_addr) ||
(start_lit <= a+len && a < end_lit)) (start_lit < a+len && a < end_lit))
{ {
dbg(2, "smc remove @%08x", a); dbg(2, "smc remove @%08x", a);
end_addr = (start_lit <= a+len && block->size_lit ? a : 0); end_addr = (start_lit < a+len && block->size_lit ? a : 0);
dr_rm_block_entry(block, tcache_id, end_addr, 0); dr_rm_block_entry(block, tcache_id, end_addr, 0);
#if (DRC_DEBUG & 2) #if (DRC_DEBUG & 2)
removed = 1; removed = 1;