sh2 drc: optimize T bit handling for A64

This commit is contained in:
kub 2019-12-21 16:33:52 +01:00
parent a5e51c16e6
commit 0e12269073
4 changed files with 58 additions and 38 deletions

View file

@ -36,10 +36,11 @@ endif
ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1"))
# very small caches, avoid optimization options making the binary much bigger
CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp
CFLAGS += -finline-limit=43 -fno-unroll-loops -fno-ipa-cp -ffast-math
# this gets you about 20% better execution speed on 32bit arm/mips
CFLAGS += -fno-common -fno-stack-protector -fno-guess-branch-probability -fno-caller-saves -fno-tree-loop-if-convert -ffast-math
CFLAGS += -fno-common -fno-stack-protector -fno-guess-branch-probability -fno-caller-saves -fno-tree-loop-if-convert -fno-regmove
endif
#OBJS += align.o
# default settings
ifeq "$(ARCH)" "arm"

View file

@ -44,10 +44,11 @@
#define A64_COND_LE 0xd
#define A64_COND_CS A64_COND_HS
#define A64_COND_CC A64_COND_LO
// "fake" conditions for T bit handling
#define A64_COND_AL 0xe
#define A64_COND_NV 0xf
/* unified conditions */
// DRC conditions
#define DCOND_EQ A64_COND_EQ
#define DCOND_NE A64_COND_NE
#define DCOND_MI A64_COND_MI
@ -261,6 +262,13 @@ enum { XT_UXTW=0x4, XT_UXTX=0x6, XT_LSL=0x7, XT_SXTW=0xc, XT_SXTX=0xe };
#define A64_BCOND(cond, offs19) \
A64_INSN(0xa,0x2,_,_,_,_,_,(offs19) >> 2,(cond))
// conditional select
// Conditional increment helper: encodes a CSINC that passes rm in both source
// slots together with the inverted condition ((cond)^1), and rn in the last slot.
// NOTE(review): presumably rn is the destination, giving rn = cond ? rm+1 : rm;
// confirm against A64_INSN's field order, which is not visible here.
#define A64_CINC(cond, rn, rm) \
A64_INSN(0xd,0x0,0x2,0,rm,(cond)^1,0x1,rm,rn) /* CSINC */
// Conditional set helper: A64_CINC with Z0 as the source register.
// NOTE(review): assuming Z0 names the zero register, this yields
// rn = cond ? 1 : 0; confirm against the definition of Z0.
#define A64_CSET(cond, rn) \
A64_CINC(cond, rn, Z0)
// load pc-relative
#define A64_LDRLIT_IMM(rd, offs19) \
@ -1356,38 +1364,52 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode)
#ifdef T
// T bit handling
// Condition code currently standing in for the T bit, or -1 when none is cached.
// Set by emith_set_t_cond()/emith_set_t(), reset to -1 by emith_invalidate_t()
// and at the end of emith_sync_t().
static int tcond = -1;
// Invert a condition code by flipping its lowest bit.
// Not meaningful for the A64_COND_AL/A64_COND_NV pseudo conditions, which is
// why emith_tst_t() syncs T into SR instead of inverting them.
#define emith_invert_cond(cond) \
((cond) ^ 1)
static void emith_clr_t_cond(int sr)
// Expands to a no-op that merely references sr; no code is emitted.
#define emith_clr_t_cond(sr) \
(void)sr
// Record cond in tcond instead of modifying sr (sr is not used by the expansion).
#define emith_set_t_cond(sr, cond) \
tcond = cond
// Yield the currently cached condition (tcond, -1 if none).
#define emith_get_t_cond() \
tcond
// Forget any cached condition by resetting tcond to -1.
#define emith_invalidate_t() \
tcond = -1
// Record a constant T value: A64_COND_AL for a non-zero val, A64_COND_NV for
// zero. sr is not used by the expansion; emith_sync_t() later writes the bit.
#define emith_set_t(sr, val) \
tcond = ((val) ? A64_COND_AL: A64_COND_NV)
// emith_sync_t: write the T value cached in tcond into the T bit of sr, then
// mark the cache as empty (tcond = -1). Nothing is emitted when tcond is
// already negative.
// NOTE(review): this view appears to interleave lines removed and added by the
// commit without markers; the description above covers only the tcond-based
// statements further down, not the intervening definitions.
static void emith_sync_t(int sr)
{
emith_bic_r_imm(sr, T);
}
static void emith_set_t_cond(int sr, int cond)
{
EMITH_SJMP_START(emith_invert_cond(cond));
emith_or_r_imm_c(cond, sr, T);
EMITH_SJMP_END(emith_invert_cond(cond));
}
#define emith_get_t_cond() -1
#define emith_sync_t(sr) ((void)sr)
#define emith_invalidate_t()
static void emith_set_t(int sr, int val)
{
if (val)
emith_or_r_imm(sr, T);
else
emith_bic_r_imm(sr, T);
// constant T recorded by emith_set_t(): AL sets the bit, NV clears it
if (tcond == A64_COND_AL)
emith_or_r_imm(sr, T);
else if (tcond == A64_COND_NV)
emith_bic_r_imm(sr, T);
else if (tcond >= 0) {
// any other cached condition: CSET into a temporary register, then
// bit-field insert that value into sr at position 0, width 1
int tmp = rcache_get_tmp();
EMIT(A64_CSET(tcond, tmp));
EMIT(A64_BFI_IMM(sr, tmp, 0, 1)); // assumes SR.T = bit 0
rcache_free_tmp(tmp);
}
// the cached condition is consumed in every case
tcond = -1;
}
// emith_tst_t: return the condition code to use for acting on the T bit.
// A non-zero tf asks for the condition that holds when T is set, a zero tf for
// the one that holds when T is clear.
//  - tcond < 0: test T in sr and return DCOND_NE/DCOND_EQ.
//  - tcond is A64_COND_AL/NV: sync the constant into sr first, then test as above.
//  - otherwise: return tcond (or its inverse) without emitting any code.
// NOTE(review): this view appears to interleave lines removed and added by the
// commit without markers; the first tst/return pair below looks like the
// pre-change body.
static int emith_tst_t(int sr, int tf)
{
emith_tst_r_imm(sr, T);
return tf ? DCOND_NE: DCOND_EQ;
if (tcond < 0) {
emith_tst_r_imm(sr, T);
return tf ? DCOND_NE: DCOND_EQ;
} else if (tcond >= A64_COND_AL) {
// MUST sync because A64_COND_AL/NV aren't real conditions
emith_sync_t(sr);
emith_tst_r_imm(sr, T);
return tf ? DCOND_NE: DCOND_EQ;
} else
return tf ? tcond : emith_invert_cond(tcond);
}
#endif

View file

@ -87,8 +87,6 @@ enum { F1_B, F1_H, F1_W, F1_D, F1_BU, F1_HU, F1_WU }; // LD/ST
// func7
enum { F2_ALT=0x20, F2_MULDIV=0x01 };
#define __(n) o##n // enum marker for "undefined"
#define R5_NOP R5_I_INSN(OP_IMM, F1_ADD, Z0, Z0, 0) // nop: ADDI r0, r0, #0
// arithmetic/logical
@ -687,9 +685,8 @@ static void emith_pool_check(void)
static void emith_move_imm(int r, uintptr_t imm)
{
u32 lui = imm + _CB(imm,1,11,12);
u32 lui = imm + _CB(imm,1,11,12); // compensate for ADDI sign extension
if (lui >> 12) {
// take out the effect of the sign extension of ADDI
EMIT(R5_MOVT_IMM(r, lui));
if (imm & 0xfff)
EMIT(R5_ADD_IMM(r, r, imm));

View file

@ -446,7 +446,6 @@ static void rcache_free_tmp(int hr);
// there must be at least 3 PARAM, and PARAM+TEMPORARY must be at least 4.
// SR must, and R0 should by all means, be statically mapped.
// XXX the static definition of SR MUST match that in compiler.h
// PC and PR must not be statically mapped (accessed in context by utils).
#ifdef __arm__
#include "../drc/emit_arm.c"
@ -3365,7 +3364,7 @@ static void REGPARM(2) *sh2_translate(SH2 *sh2, int tcache_id)
rcache_get_reg_arg(2, SHR_SR, NULL);
tmp2 = rcache_get_tmp_arg(0);
tmp3 = rcache_get_tmp_arg(1);
tmp4 = rcache_get_tmp_arg(3);
tmp4 = rcache_get_tmp();
emith_move_r_ptr_imm(tmp2, tcache_ptr);
emith_move_r_r_ptr(tmp3, CONTEXT_REG);
emith_move_r_imm(tmp4, pc);
@ -5049,11 +5048,12 @@ static void sh2_generate_utils(void)
emith_add_r_imm(arg2, (u32)(2*sizeof(void *)));
emith_and_r_imm(arg2, (ARRAY_SIZE(sh2s->rts_cache)-1) * 2*sizeof(void *));
emith_ctx_write(arg2, offsetof(SH2, rts_cache_idx));
emith_add_r_r_r_lsl_ptr(arg2, CONTEXT_REG, arg2, 0);
emith_ctx_read(arg3, SHR_PR * 4);
emith_add_r_r_r_lsl_ptr(arg3, CONTEXT_REG, arg2, 0);
rcache_get_reg_arg(2, SHR_PR, NULL);
emith_add_r_ret(arg1);
emith_write_r_r_offs_ptr(arg1, arg2, offsetof(SH2, rts_cache)+sizeof(void *));
emith_write_r_r_offs(arg3, arg2, offsetof(SH2, rts_cache));
emith_write_r_r_offs_ptr(arg1, arg3, offsetof(SH2, rts_cache)+sizeof(void *));
emith_write_r_r_offs(arg2, arg3, offsetof(SH2, rts_cache));
rcache_flush();
emith_ret();
emith_flush();