various small fixes and optimisations

This commit is contained in:
kub 2019-08-16 15:14:41 +02:00
parent b90e104fc9
commit 8284ab7107
9 changed files with 39 additions and 27 deletions

View file

@ -236,6 +236,14 @@ pico/cd/cd_file.o: CFLAGS += -fno-strict-aliasing
pico/cd/pcm.o: CFLAGS += -fno-strict-aliasing pico/cd/pcm.o: CFLAGS += -fno-strict-aliasing
pico/cd/LC89510.o: CFLAGS += -fno-strict-aliasing pico/cd/LC89510.o: CFLAGS += -fno-strict-aliasing
pico/cd/gfx_cd.o: CFLAGS += -fno-strict-aliasing pico/cd/gfx_cd.o: CFLAGS += -fno-strict-aliasing
# The override below applies only when the SH2 DRC is enabled (use_sh2drc is 1) ...
ifeq (1,$(use_sh2drc))
# ... and only when LTO was requested, i.e. -flto appears somewhere in CFLAGS.
ifneq (,$(findstring -flto,$(CFLAGS)))
# if using the DRC, memory and sh2soc use a global register variable to avoid
# saving and reloading the SH2 SR. However, this collides with the use of LTO.
# Therefore -fno-lto is appended to CFLAGS for just these two objects.
pico/32x/memory.o: CFLAGS += -fno-lto
pico/32x/sh2soc.o: CFLAGS += -fno-lto
endif
endif
# fame needs ~2GB of RAM to compile on gcc 4.8 # fame needs ~2GB of RAM to compile on gcc 4.8
# on x86, this is reduced by ~300MB when debug info is off (but not on ARM) # on x86, this is reduced by ~300MB when debug info is off (but not on ARM)

View file

@ -26,7 +26,7 @@ CFLAGS ?=
STATIC_LINKING:= 0 STATIC_LINKING:= 0
TARGET_NAME := picodrive TARGET_NAME := picodrive
LIBM := -lm LIBM := -lm
GIT_VERSION ?= " $(shell git rev-parse --short HEAD || echo unknown)" GIT_VERSION ?= $(shell git rev-parse --short HEAD || echo unknown)
ifneq ($(GIT_VERSION)," unknown") ifneq ($(GIT_VERSION)," unknown")
CFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\" CFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\"
endif endif
@ -427,6 +427,7 @@ else ifeq ($(platform), gcw0)
use_fame = 1 use_fame = 1
use_drz80 = 0 use_drz80 = 0
use_cz80 = 1 use_cz80 = 1
use_sh2drc = 1
# Windows # Windows
else else

View file

@ -1174,6 +1174,8 @@ static inline void emith_pool_adjust(int pool_index, int move_offs)
#define host_arg2reg(rd, arg) \ #define host_arg2reg(rd, arg) \
rd = arg rd = arg
#define emith_rw_offs_max() 0xff
/* SH2 drc specific */ /* SH2 drc specific */
/* pushes r12 for eabi alignment */ /* pushes r12 for eabi alignment */
#define emith_sh2_drc_entry() \ #define emith_sh2_drc_entry() \

View file

@ -1117,6 +1117,7 @@ static void emith_ldst_offs(int sz, int rd, int rn, int o9, int ld, int mode)
#define emith_flush() /**/ #define emith_flush() /**/
#define host_instructions_updated(base, end) __builtin___clear_cache(base, end) #define host_instructions_updated(base, end) __builtin___clear_cache(base, end)
#define emith_jump_patch_size() 8 #define emith_jump_patch_size() 8
#define emith_rw_offs_max() 0xff
// SH2 drc specific // SH2 drc specific

View file

@ -394,7 +394,7 @@ int emith_flg_noV; // V flag known not to be set
// NB: for adcf and sbcf, carry-in must be dealt with separately (see there) // NB: for adcf and sbcf, carry-in must be dealt with separately (see there)
static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub) static void emith_set_arith_flags(int rd, int rt, int rs, s32 imm, int sub)
{ {
if (sub && rd == FNZ && rt && rs) // is this cmp_r_r? if (sub && rd == FNZ && rt > AT && rs > AT) // is this cmp_r_r?
emith_flg_rs = rs, emith_flg_rt = rt; emith_flg_rs = rs, emith_flg_rt = rt;
else emith_flg_rs = emith_flg_rt = 0; else emith_flg_rs = emith_flg_rt = 0;
@ -858,7 +858,7 @@ static void emith_log_imm(int op, int rd, int rs, u32 imm)
// NB: mips32r2 has EXT and INS // NB: mips32r2 has EXT and INS
#define emith_clear_msb(d, s, count) /* bits to clear */ do { \ #define emith_clear_msb(d, s, count) /* bits to clear */ do { \
u32 t; \ u32 t; \
if ((count) > 16) { \ if ((count) >= 16) { \
t = (count) - 16; \ t = (count) - 16; \
t = 0xffff >> t; \ t = 0xffff >> t; \
emith_and_r_r_imm(d, s, t); \ emith_and_r_r_imm(d, s, t); \
@ -1262,6 +1262,7 @@ static int emith_cond_check(int cond, int *r)
// NB: mips32r2 has SYNCI // NB: mips32r2 has SYNCI
#define host_instructions_updated(base, end) __builtin___clear_cache(base, end) #define host_instructions_updated(base, end) __builtin___clear_cache(base, end)
#define emith_jump_patch_size() 4 #define emith_jump_patch_size() 4
#define emith_rw_offs_max() 0x7fff
// SH2 drc specific // SH2 drc specific
#define emith_sh2_drc_entry() do { \ #define emith_sh2_drc_entry() do { \

View file

@ -986,6 +986,8 @@ enum { xAX = 0, xCX, xDX, xBX, xSP, xBP, xSI, xDI, // x86-64,i386 common
#define host_instructions_updated(base, end) #define host_instructions_updated(base, end)
#define emith_rw_offs_max() 0xffffffff
#ifdef __x86_64__ #ifdef __x86_64__
#define HOST_REGS 16 #define HOST_REGS 16

View file

@ -419,8 +419,8 @@ typedef struct {
static int rcache_get_tmp(void); static int rcache_get_tmp(void);
static void rcache_free_tmp(int hr); static void rcache_free_tmp(int hr);
// Note: cache_regs[] must have at least the amount of REG and TEMP registers // Note: cache_regs[] must have at least the amount of HRF_REG registers used
// used by handlers in worst case (currently 4). // by handlers in worst case (currently 4).
// Register assignment goes by ABI convention. Caller save registers are TEMP, // Register assignment goes by ABI convention. Caller save registers are TEMP,
// the others are either static or REG. SR must be static, R0 very recommended. // the others are either static or REG. SR must be static, R0 very recommended.
// VBR, PC, PR must not be static (read from context in utils). // VBR, PC, PR must not be static (read from context in utils).
@ -2418,7 +2418,7 @@ static void rcache_init(void)
// NB may return either REG or TEMP // NB may return either REG or TEMP
static int emit_get_rbase_and_offs(SH2 *sh2, sh2_reg_e r, int rmode, u32 *offs) static int emit_get_rbase_and_offs(SH2 *sh2, sh2_reg_e r, int rmode, u32 *offs)
{ {
uptr omask = 0xff; // offset mask, XXX: ARM oriented.. uptr omask = emith_rw_offs_max(); // offset mask
u32 mask = 0; u32 mask = 0;
u32 a; u32 a;
int poffs; int poffs;
@ -4447,7 +4447,7 @@ end_op:
static void sh2_generate_utils(void) static void sh2_generate_utils(void)
{ {
int arg0, arg1, arg2, arg3, sr, tmp; int arg0, arg1, arg2, arg3, sr, tmp, tmp2;
host_arg2reg(arg0, 0); host_arg2reg(arg0, 0);
host_arg2reg(arg1, 1); host_arg2reg(arg1, 1);
@ -4689,18 +4689,18 @@ static void sh2_generate_utils(void)
emith_sub_r_imm(tmp, 4*2); emith_sub_r_imm(tmp, 4*2);
rcache_clean(); rcache_clean();
// push SR // push SR
tmp = rcache_get_reg_arg(0, SHR_SP, NULL); tmp = rcache_get_reg_arg(0, SHR_SP,&tmp2);
emith_add_r_imm(tmp, 4); emith_add_r_r_imm(tmp, tmp2, 4);
tmp = rcache_get_reg_arg(1, SHR_SR, NULL); tmp = rcache_get_reg_arg(1, SHR_SR, NULL);
emith_clear_msb(tmp, tmp, 22); emith_clear_msb(tmp, tmp, 22);
emith_move_r_r_ptr(arg2, CONTEXT_REG); emith_move_r_r_ptr(arg2, CONTEXT_REG);
rcache_invalidate(); rcache_invalidate_tmp();
emith_call(p32x_sh2_write32); // XXX: use sh2_drc_write32? emith_call(p32x_sh2_write32); // XXX: use sh2_drc_write32?
// push PC // push PC
rcache_get_reg_arg(0, SHR_SP, NULL); rcache_get_reg_arg(0, SHR_SP, NULL);
emith_ctx_read(arg1, SHR_PC * 4); emith_ctx_read(arg1, SHR_PC * 4);
emith_move_r_r_ptr(arg2, CONTEXT_REG); emith_move_r_r_ptr(arg2, CONTEXT_REG);
rcache_invalidate(); rcache_invalidate_tmp();
emith_call(p32x_sh2_write32); emith_call(p32x_sh2_write32);
// update I, cycles, do callback // update I, cycles, do callback
emith_ctx_read(arg1, offsetof(SH2, pending_level)); emith_ctx_read(arg1, offsetof(SH2, pending_level));

View file

@ -197,24 +197,19 @@ static NOINLINE u32 sh2_poll_read(u32 a, u32 d, unsigned int cycles, SH2* sh2)
// fetch oldest write to address from fifo, but stop when reaching the present // fetch oldest write to address from fifo, but stop when reaching the present
idx = sh2_poll_rd[hix]; idx = sh2_poll_rd[hix];
while (idx != sh2_poll_wr[hix] && CYCLES_GE(cycles, fifo[idx].cycles)) { while (idx != sh2_poll_wr[hix] && CYCLES_GE(cycles, fifo[idx].cycles)) {
// int oidx = idx;
p = &fifo[idx]; p = &fifo[idx];
idx = (idx+1) % PFIFO_SZ; idx = (idx+1) % PFIFO_SZ;
if (CYCLES_GT(cycles, p->cycles+80)) { if (cpu != p->cpu) {
// drop older fifo stores that may cause synchronisation problems. if (CYCLES_GT(cycles, p->cycles+80)) {
// NB unfortunately this cycle diff is quite sensitive: // drop older fifo stores that may cause synchronisation problems.
// observed in Brutal Unleashed: min 80, observed in Afterburner: max 110 sh2_poll_rd[hix] = idx;
sh2_poll_rd[hix] = idx; } else if (p->a == a) {
} else if (p->a == a) { // replace current data with fifo value and discard fifo entry
// replace current data with fifo value and discard fifo entry
if (cpu != p->cpu) {
d = p->d; d = p->d;
p->a = -1; p->a = -1;
// if (oidx == sh2_poll_rd[hix]) break;
// sh2_poll_rd[hix] = idx;
} }
break;
} }
} }
return d; return d;
@ -224,7 +219,6 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2)
{ {
int hix = (a >> 1) % PFIFO_CNT; int hix = (a >> 1) % PFIFO_CNT;
struct sh2_poll_fifo *fifo = sh2_poll_fifo[hix]; struct sh2_poll_fifo *fifo = sh2_poll_fifo[hix];
struct sh2_poll_fifo *p = &fifo[sh2_poll_wr[hix]];
struct sh2_poll_fifo *q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ]; struct sh2_poll_fifo *q = &fifo[(sh2_poll_wr[hix]-1) % PFIFO_SZ];
int cpu = sh2 ? sh2->is_slave+1 : 0; int cpu = sh2 ? sh2->is_slave+1 : 0;
@ -233,15 +227,16 @@ static NOINLINE void sh2_poll_write(u32 a, u32 d, unsigned int cycles, SH2 *sh2)
// intermediate values that may cause synchronisation problems. // intermediate values that may cause synchronisation problems.
// NB this can take an eternity on m68k: mov.b <addr1.l>,<addr2.l> needs // NB this can take an eternity on m68k: mov.b <addr1.l>,<addr2.l> needs
// 28 m68k-cycles (~80 sh2-cycles) to complete (observed in Metal Head) // 28 m68k-cycles (~80 sh2-cycles) to complete (observed in Metal Head)
if (q->a == a && !CYCLES_GT(cycles,q->cycles+30)) { if (q->a == a && sh2_poll_wr[hix] != sh2_poll_rd[hix] && !CYCLES_GT(cycles,q->cycles+30)) {
q->d = d; q->d = d;
} else { } else {
// store write to poll address in fifo // store write to poll address in fifo
fifo[sh2_poll_wr[hix]] =
(struct sh2_poll_fifo){ .cycles = cycles, .a = a, .d = d, .cpu = cpu };
sh2_poll_wr[hix] = (sh2_poll_wr[hix]+1) % PFIFO_SZ; sh2_poll_wr[hix] = (sh2_poll_wr[hix]+1) % PFIFO_SZ;
if (sh2_poll_wr[hix] == sh2_poll_rd[hix]) if (sh2_poll_wr[hix] == sh2_poll_rd[hix])
// fifo overflow, discard oldest value // fifo overflow, discard oldest value
sh2_poll_rd[hix] = (sh2_poll_rd[hix]+1) % PFIFO_SZ; sh2_poll_rd[hix] = (sh2_poll_rd[hix]+1) % PFIFO_SZ;
*p = (struct sh2_poll_fifo){ .cycles = cycles, .a = a, .d = d, .cpu = cpu };
} }
} }
@ -2369,6 +2364,8 @@ void PicoMemSetup32x(void)
sh2_drc_mem_setup(&msh2); sh2_drc_mem_setup(&msh2);
sh2_drc_mem_setup(&ssh2); sh2_drc_mem_setup(&ssh2);
memset(sh2_poll_rd, 0, sizeof(sh2_poll_rd));
memset(sh2_poll_wr, 0, sizeof(sh2_poll_wr));
// z80 hack // z80 hack
z80_map_set(z80_write_map, 0x8000, 0xffff, z80_md_bank_write_32x, 1); z80_map_set(z80_write_map, 0x8000, 0xffff, z80_md_bank_write_32x, 1);

View file

@ -11,7 +11,7 @@ ENDIAN=
# compile with target C compiler and extract value from .rodata section # compile with target C compiler and extract value from .rodata section
compile_rodata () compile_rodata ()
{ {
$CC $CFLAGS -I .. -c /tmp/getoffs.c -o /tmp/getoffs.o || exit 1 $CC $CFLAGS -I .. -shared /tmp/getoffs.c -o /tmp/getoffs.o || exit 1
# find the name of the .rodata section (in case -fdata-sections is used) # find the name of the .rodata section (in case -fdata-sections is used)
rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' | rosect=$(readelf -S /tmp/getoffs.o | grep '\.rodata' |
sed 's/^[^.]*././;s/ .*//') sed 's/^[^.]*././;s/ .*//')