SH2 drc: bug fixing and small speed improvements

kub 2019-06-25 20:23:45 +02:00
parent 1891e649e5
commit 748b8187db
8 changed files with 254 additions and 203 deletions

View file

@@ -6,3 +6,39 @@ extern u8 *tcache;
void drc_cmn_init(void);
void drc_cmn_cleanup(void);
#define BITMASK1(v0) (1 << (v0))
#define BITMASK2(v0,v1) ((1 << (v0)) | (1 << (v1)))
#define BITMASK3(v0,v1,v2) (BITMASK2(v0,v1) | (1 << (v2)))
#define BITMASK4(v0,v1,v2,v3) (BITMASK3(v0,v1,v2) | (1 << (v3)))
#define BITMASK5(v0,v1,v2,v3,v4) (BITMASK4(v0,v1,v2,v3) | (1 << (v4)))
#define BITMASK6(v0,v1,v2,v3,v4,v5) (BITMASK5(v0,v1,v2,v3,v4) | (1 << (v5)))
#define BITRANGE(v0,v1) (BITMASK1(v1+1)-BITMASK1(v0)) // set with v0..v1
// binary search approach, since we don't have CLZ on ARM920T
#define FOR_ALL_BITS_SET_DO(mask, bit, code) { \
u32 __mask = mask; \
for (bit = 31; bit >= 0 && mask; bit--, __mask <<= 1) { \
if (!(__mask & (0xffff << 16))) \
bit -= 16, __mask <<= 16; \
if (!(__mask & (0xff << 24))) \
bit -= 8, __mask <<= 8; \
if (!(__mask & (0xf << 28))) \
bit -= 4, __mask <<= 4; \
if (!(__mask & (0x3 << 30))) \
bit -= 2, __mask <<= 2; \
if (!(__mask & (0x1 << 31))) \
bit -= 1, __mask <<= 1; \
if (__mask & (0x1 << 31)) { \
code; \
} \
} \
}
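FOR_ALL_BITS_SET_DO visits the set bits of a 32-bit mask from the most significant one down, halving the search window at each step instead of relying on a CLZ instruction. A minimal usage sketch (illustrative only, not part of the commit; u32 and the macro are assumed to come from this header):

// Collect the bit positions of a host register mask, MSB-first.
static int list_regs(u32 reg_mask, int out[32])
{
	int r, n = 0;
	/* visits set bits MSB-first: reg_mask = 0x11 yields out[] = { 4, 0 } */
	FOR_ALL_BITS_SET_DO(reg_mask, r, out[n++] = r);
	return n;
}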
// inspired by https://graphics.stanford.edu/~seander/bithacks.html
static inline int count_bits(unsigned val)
{
val = val - ((val >> 1) & 0x55555555);
val = (val & 0x33333333) + ((val >> 2) & 0x33333333);
return (((val + (val >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
}
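count_bits is the classic SWAR population count: the first line reduces each 2-bit group to its bit count, the second sums adjacent groups into 4-bit counts, and the final multiply by 0x01010101 adds all byte counts into the top byte. A quick self-check against a naive loop (illustrative test harness, not part of the commit):

#include <assert.h>

static int count_bits_naive(unsigned val)
{
	int n = 0;
	for (; val; val >>= 1)
		n += val & 1;
	return n;
}

static void count_bits_selftest(void)
{
	unsigned t[] = { 0, 1, 0x80000000u, 0xffffffffu, 0x12345678u };
	int i;
	for (i = 0; i < 5; i++)
		assert(count_bits(t[i]) == count_bits_naive(t[i]));
}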

View file

@@ -382,13 +382,6 @@ static void emith_flush(void)
#define EOP_MOVT(rd,imm) \
EMIT(0xe3400000 | ((rd)<<12) | (((imm)>>16)&0xfff) | (((imm)>>12)&0xf0000), M1(rd), NO)
static inline int count_bits(unsigned val)
{
val = val - ((val >> 1) & 0x55555555);
val = (val & 0x33333333) + ((val >> 2) & 0x33333333);
return (((val + (val >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
}
// host literal pool; must be significantly smaller than 1024 (max LDR offset = 4096)
#define MAX_HOST_LITERALS 128
static u32 literal_pool[MAX_HOST_LITERALS];
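The 128-entry limit keeps the pool well inside the reach of a PC-relative LDR, whose 12-bit offset allows at most 4096 bytes (1024 u32 slots) between the load and its literal; staying far below that leaves room for the code emitted in between. An illustrative compile-time check of that constraint (assumption-flagged, not present in the source):

_Static_assert(MAX_HOST_LITERALS * sizeof(u32) < 4096,
               "literal pool must stay within LDR's 12-bit offset range");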
@@ -429,18 +422,26 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int
// count insns needed for mov/orr #imm
for (v = imm, ror2 = 0; (v >> 24) && ror2 < 32/2; ror2++)
v = (v << 2) | (v >> 30);
#ifdef HAVE_ARMV7
for (i = 2; i > 0; i--, v >>= 8)
while (v > 0xff && !(v & 3))
v >>= 2;
if (v) { // 3+ insns needed...
if (op == A_OP_MVN)
imm = ~imm;
#ifdef HAVE_ARMV7
// ...prefer movw/movt
EOP_MOVW(rd, imm);
if (imm & 0xffff0000)
EOP_MOVT(rd, imm);
return;
}
#else
for (i = 3; i > 0; i--, v >>= 8)
while (v > 0xff && !(v & 3))
v >>= 2;
if (v) { // 4 insns needed...
if (op == A_OP_MVN)
imm = ~imm;
// ...emit literal load
int idx, o;
if (literal_iindex >= MAX_HOST_LITERALS) {
@@ -455,9 +456,9 @@ static void emith_op_imm2(int cond, int s, int op, int rd, int rn, unsigned int
EOP_C_DOP_IMM(cond, A_OP_ADD, 0, rd, rd, 0, o);
else if (o < 0)
EOP_C_DOP_IMM(cond, A_OP_SUB, 0, rd, rd, 0, -o);
#endif
return;
}
#endif
break;
case A_OP_AND:
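emith_op_imm2 rotates the constant so it can be peeled into the 8-bit-rotated immediates that ARM data-processing instructions accept; if more than two (ARMv7) or more than three (pre-ARMv7) chunks would be needed, it now switches to MOVW/MOVT or a literal-pool load instead of a long MOV/ORR chain. A standalone sketch of the underlying encodability test (helper name and sample values are illustrative, not taken from the source):

#include <stdint.h>

// Returns nonzero if v fits a single ARM data-processing immediate,
// i.e. an 8-bit value rotated right by an even amount.
static int arm_imm_encodable(uint32_t v)
{
	int ror;
	for (ror = 0; ror < 32; ror += 2) {
		uint32_t rotl = ror ? (v << ror) | (v >> (32 - ror)) : v;
		if (rotl <= 0xff)
			return 1;	// encodable as #imm8, ROR #ror
	}
	return 0;
}
// arm_imm_encodable(0x000000ff) -> 1, arm_imm_encodable(0x0003fc00) -> 1,
// arm_imm_encodable(0x00012345) -> 0 (needs MOVW/MOVT or a literal load)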
@@ -544,7 +545,7 @@ static int emith_xbranch(int cond, void *target, int is_call)
EMIT((u32)target,M1(PC),0);
#else
// should never happen
elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, "indirect jmp %08x->%08x", target, tcache_ptr);
elprintf(EL_STATUS|EL_SVP|EL_ANOMALY, "indirect jmp %8p->%8p", target, tcache_ptr);
exit(1);
#endif
}
@@ -633,9 +634,6 @@ static inline void emith_pool_adjust(int pool_index, int move_offs)
#define EMITH_NOTHING1(cond) \
(void)(cond)
#define EMITH_SJMP_DECL_()
#define EMITH_SJMP_START_(cond) EMITH_NOTHING1(cond)
#define EMITH_SJMP_END_(cond) EMITH_NOTHING1(cond)
#define EMITH_SJMP_START(cond) EMITH_NOTHING1(cond)
#define EMITH_SJMP_END(cond) EMITH_NOTHING1(cond)
#define EMITH_SJMP2_START(cond) EMITH_NOTHING1(cond)
@@ -806,6 +804,9 @@ static inline void emith_pool_adjust(int pool_index, int move_offs)
#define emith_eor_r_imm(r, imm) \
emith_op_imm(A_COND_AL, 0, A_OP_EOR, r, imm)
#define emith_eor_r_imm_ptr(r, imm) \
emith_eor_r_imm(r, imm)
// note: only use 8bit imm for these
#define emith_tst_r_imm(r, imm) \
emith_top_imm(A_COND_AL, A_OP_TST, r, imm)
@@ -837,6 +838,9 @@ static inline void emith_pool_adjust(int pool_index, int move_offs)
#define emith_eor_r_imm_c(cond, r, imm) \
emith_op_imm(cond, 0, A_OP_EOR, r, imm)
#define emith_eor_r_imm_ptr_c(cond, r, imm) \
emith_eor_r_imm_c(cond, r, imm)
#define emith_bic_r_imm_c(cond, r, imm) \
emith_op_imm(cond, 0, A_OP_BIC, r, imm)
@@ -1139,6 +1143,8 @@ static inline void emith_pool_adjust(int pool_index, int move_offs)
emith_jump(target); \
} while (0)
#define emith_call_cleanup() /**/
#define emith_ret_c(cond) \
emith_jump_reg_c(cond, LR)
@@ -1228,10 +1234,10 @@ static inline void emith_pool_adjust(int pool_index, int move_offs)
/* if (reg <= turns) turns = reg-1 */ \
t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \
emith_cmp_r_r(t3, t2); \
emith_sub_r_r_imm_c(DCOND_LE, t2, t3, 1); \
emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \
/* if (reg <= 1) turns = 0 */ \
emith_cmp_r_imm(t3, 1); \
emith_move_r_imm_c(DCOND_LE, t2, 0); \
emith_move_r_imm_c(DCOND_LS, t2, 0); \
/* reg -= turns */ \
emith_sub_r_r(t3, t2); \
} \
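The LE->LS and GT->HI changes switch these compares from signed to unsigned condition codes: the counters involved are unsigned 32-bit values, and a signed compare gives the wrong answer once the high bit is set. A small demonstration of the difference (values are arbitrary):

#include <stdint.h>
#include <stdio.h>

static void compare_demo(void)
{
	uint32_t reg = 0x80000010u, turns = 0x20u;
	printf("unsigned (LS): %d\n", reg <= turns);                    // 0
	printf("signed   (LE): %d\n", (int32_t)reg <= (int32_t)turns);  // 1 - wrong here
}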
@@ -1361,7 +1367,7 @@ static int tcond = -1;
#define emith_set_t(sr, val) \
tcond = ((val) ? A_COND_AL: A_COND_NV)
static void emith_sync_t(sr)
static void emith_sync_t(int sr)
{
if (tcond == A_COND_AL)
emith_or_r_imm(sr, T);

View file

@@ -396,6 +396,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
#define emith_cmp_r_imm(r, imm) \
emith_arith_r_imm(7, r, imm)
#define emith_eor_r_imm_ptr(r, imm) do { \
EMIT_REX_IF(1, 0, r); \
EMIT_OP_MODRM64(0x81, 3, 6, r); \
EMIT(imm, u32); \
} while (0)
#define emith_tst_r_imm(r, imm) do { \
EMIT_REX_IF(0, 0, r); \
EMIT_OP_MODRM64(0xf7, 3, 0, r); \
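The new emith_eor_r_imm_ptr performs the XOR at pointer width: EMIT_REX_IF(1, 0, r) adds a REX.W prefix, and opcode 0x81 with ModRM reg field 6 selects XOR from the group-1 immediate forms with a sign-extended 32-bit immediate. Written out by hand for one case (illustrative, not captured from the emitter):

#include <stdint.h>

static const uint8_t xor_rax_imm32[] = {
	// what emith_eor_r_imm_ptr(xAX, 0x12345678) should boil down to:
	// 48 = REX.W, 81 = group-1 op with imm32, f0 = ModRM mod=3 /6(XOR) rm=rax
	0x48, 0x81, 0xf0, 0x78, 0x56, 0x34, 0x12	// xor rax, 0x12345678
};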
@@ -417,6 +423,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
emith_or_r_imm(r, imm)
#define emith_eor_r_imm_c(cond, r, imm) \
emith_eor_r_imm(r, imm)
#define emith_eor_r_imm_ptr_c(cond, r, imm) \
emith_eor_r_imm_ptr(r, imm)
#define emith_bic_r_imm_c(cond, r, imm) \
emith_bic_r_imm(r, imm)
#define emith_tst_r_imm_c(cond, r, imm) \
@@ -589,9 +597,9 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
// XXX: stupid mess
#define emith_mul_(op, dlo, dhi, s1, s2) do { \
int rmr; \
if (dlo != xAX && dhi != xAX) \
if (dlo != xAX && dhi != xAX && rcache_is_hreg_used(xAX)) \
emith_push(xAX); \
if (dlo != xDX && dhi != xDX) \
if (dlo != xDX && dhi != xDX && rcache_is_hreg_used(xDX)) \
emith_push(xDX); \
if ((s1) == xAX) \
rmr = s2; \
@@ -609,9 +617,9 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
} \
if (dhi != xDX && dhi != -1 && !(dhi == xAX && dlo == xDX)) \
emith_move_r_r(dhi, (dlo == xDX ? xAX : xDX)); \
if (dlo != xDX && dhi != xDX) \
if (dlo != xDX && dhi != xDX && rcache_is_hreg_used(xDX)) \
emith_pop(xDX); \
if (dlo != xAX && dhi != xAX) \
if (dlo != xAX && dhi != xAX && rcache_is_hreg_used(xAX)) \
emith_pop(xAX); \
} while (0)
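One-operand MUL on x86 always writes its 64-bit product to EDX:EAX, so the helper used to save and restore both registers around every multiply; it now asks the register cache via rcache_is_hreg_used() and skips the push/pop when the register holds nothing live. For reference, the operation the emitted code performs is a widening 32x32->64 multiply (plain C equivalent, function name illustrative):

#include <stdint.h>

static void mul_u32_wide(uint32_t a, uint32_t b, uint32_t *lo, uint32_t *hi)
{
	uint64_t p = (uint64_t)a * b;	// MUL leaves this in EDX:EAX
	*lo = (uint32_t)p;		// dlo
	*hi = (uint32_t)(p >> 32);	// dhi (dropped if dhi == -1)
}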
@@ -898,6 +906,9 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
emith_jump(target); \
} while (0)
#define emith_call_cleanup() \
emith_add_r_r_ptr_imm(xSP, xSP, sizeof(void *)); // remove return addr
#define emith_ret() \
EMIT_OP(0xc3)
@@ -912,10 +923,12 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
#define emith_push_ret(r) do { \
int r_ = (r >= 0 ? r : xSI); \
emith_push(r_); /* always push to align */ \
emith_add_r_r_ptr_imm(xSP, xSP, -8*4); /* args shadow space */ \
} while (0)
#define emith_pop_and_ret(r) do { \
int r_ = (r >= 0 ? r : xSI); \
emith_add_r_r_ptr_imm(xSP, xSP, 8*4); /* args shadow space */ \
emith_pop(r_); \
emith_ret(); \
} while (0)
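The extra 8*4 = 32 bytes are the shadow (home) space the Windows x64 ABI requires the caller to reserve for the callee's RCX/RDX/R8/R9 just above the return address. A sketch of how the pair might bracket a call into a C handler (hypothetical wrapper, not taken from the compiler sources):

#define EMIT_C_CALL_THUNK(ret_reg, c_func) do { \
	emith_push_ret(ret_reg);	/* save/align, then reserve 32-byte shadow space */ \
	emith_call(c_func);		/* callee may spill its register args there */ \
	emith_pop_and_ret(ret_reg);	/* drop shadow space, restore, return */ \
} while (0)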
@@ -942,15 +955,6 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
// "simple" jump (no more then a few insns)
// ARM will use conditional instructions here
#define EMITH_SJMP_DECL_() \
u8 *cond_ptr
#define EMITH_SJMP_START_(cond) \
JMP8_POS(cond_ptr)
#define EMITH_SJMP_END_(cond) \
JMP8_EMIT(cond, cond_ptr)
#define EMITH_SJMP_START EMITH_JMP_START
#define EMITH_SJMP_END EMITH_JMP_END
@@ -1046,7 +1050,7 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
emith_push(xR15); \
emith_push(xSI); \
emith_push(xDI); \
emith_add_r_r_ptr_imm(xSP, xSP, -8*5); /* align + ABI param area */ \
emith_add_r_r_ptr_imm(xSP, xSP, -8*5); /* align + args shadow space */ \
} while (0)
#define emith_sh2_drc_exit() do { \
@ -1106,19 +1110,17 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
#endif
#define emith_save_caller_regs(mask) do { \
if ((mask) & (1 << xAX)) emith_push(xAX); \
if ((mask) & (1 << xCX)) emith_push(xCX); \
if ((mask) & (1 << xDX)) emith_push(xDX); \
if ((mask) & (1 << xSI)) emith_push(xSI); \
if ((mask) & (1 << xDI)) emith_push(xDI); \
int _c; u32 _m = mask & 0xfc7; /* AX, CX, DX, SI, DI, 8, 9, 10, 11 */ \
if (__builtin_parity(_m) == 1) _m |= 0x8; /* BX for ABI align */ \
for (_c = HOST_REGS; _m && _c >= 0; _m &= ~(1 << _c), _c--) \
if (_m & (1 << _c)) emith_push(_c); \
} while (0)
#define emith_restore_caller_regs(mask) do { \
if ((mask) & (1 << xDI)) emith_pop(xDI); \
if ((mask) & (1 << xSI)) emith_pop(xSI); \
if ((mask) & (1 << xDX)) emith_pop(xDX); \
if ((mask) & (1 << xCX)) emith_pop(xCX); \
if ((mask) & (1 << xAX)) emith_pop(xAX); \
int _c; u32 _m = mask & 0xfc7; \
if (__builtin_parity(_m) == 1) _m |= 0x8; /* BX for ABI align */ \
for (_c = 0; _m && _c < HOST_REGS; _m &= ~(1 << _c), _c++) \
if (_m & (1 << _c)) emith_pop(_c); \
} while (0)
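The rewritten save/restore walks the caller-saved mask in a loop (now covering r8-r11 as well) instead of one if per register. Each push is 8 bytes and the x86-64 ABIs expect 16-byte stack alignment at call sites, so an odd number of pushes would break alignment; __builtin_parity() is 1 exactly when the mask has an odd popcount, and the dummy BX push (bit 3) restores alignment. A small standalone demonstration of the parity trick (mask values are examples):

#include <stdio.h>

static void parity_demo(void)
{
	unsigned masks[] = { 0x005, 0x045, 0xfc7 };
	int i;
	for (i = 0; i < 3; i++) {
		unsigned m = masks[i];
		if (__builtin_parity(m) == 1)
			m |= 0x8;	/* same trick as above: add xBX as filler */
		printf("mask %03x: %d pushes -> %d pushes, stack stays 16-byte aligned\n",
		       masks[i], __builtin_popcount(masks[i]), __builtin_popcount(m));
	}
}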
#define emith_sh2_rcall(a, tab, func, mask) do { \
@@ -1192,14 +1194,14 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
/* if (reg <= turns) turns = reg-1 */ \
t3 = rcache_get_reg(reg, RC_GR_RMW, NULL); \
emith_cmp_r_r(t3, t2); \
EMITH_SJMP_START(DCOND_GT); \
emith_sub_r_r_imm_c(DCOND_LE, t2, t3, 1); \
EMITH_SJMP_END(DCOND_GT); \
EMITH_SJMP_START(DCOND_HI); \
emith_sub_r_r_imm_c(DCOND_LS, t2, t3, 1); \
EMITH_SJMP_END(DCOND_HI); \
/* if (reg <= 1) turns = 0 */ \
emith_cmp_r_imm(t3, 1); \
EMITH_SJMP_START(DCOND_GT); \
emith_move_r_imm_c(DCOND_LE, t2, 0); \
EMITH_SJMP_END(DCOND_GT); \
EMITH_SJMP_START(DCOND_HI); \
emith_move_r_imm_c(DCOND_LS, t2, 0); \
EMITH_SJMP_END(DCOND_HI); \
/* reg -= turns */ \
emith_sub_r_r(t3, t2); \
} \
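x86 has no predicated arithmetic, so the conditional emith_*_c ops are wrapped in EMITH_SJMP_START/END, which emit a short forward branch taken when the guard condition holds; the guard is therefore the inverse (DCOND_HI, unsigned greater) of the condition on the guarded op (DCOND_LS, unsigned lower or same). In C terms the guarded sequence above amounts to the following (helper name illustrative, semantics taken from the comments in the diff):

#include <stdint.h>

static void sh2_turns_equiv(uint32_t *reg, uint32_t *turns)
{
	if (*reg <= *turns)	/* EMITH_SJMP_START(DCOND_HI) skips when reg > turns */
		*turns = *reg - 1;
	if (*reg <= 1)		/* second SJMP block */
		*turns = 0;
	*reg -= *turns;
}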