32x DMA memory copy performance optimisation

This commit is contained in:
kub 2019-05-28 23:16:45 +02:00
parent 8141d75694
commit 346153e08e
5 changed files with 105 additions and 15 deletions

View file

@ -2261,7 +2261,7 @@ static int emit_get_rom_data(SH2 *sh2, sh2_reg_e r, u32 offs, int size, u32 *val
if (gconst_get(r, &a)) {
a += offs;
// check if rom is memory mapped (not bank switched), and address is in rom
if (dr_is_rom(a) && p32x_sh2_get_mem_ptr(a, &mask, sh2)) {
if (dr_is_rom(a) && p32x_sh2_get_mem_ptr(a, &mask, sh2) != (void *)-1) {
switch (size & MF_SIZEMASK) {
case 0: *val = (s8)p32x_sh2_read8(a, sh2s); break; // 8
case 1: *val = (s16)p32x_sh2_read16(a, sh2s); break; // 16
@ -4896,12 +4896,7 @@ void sh2_drc_flush_all(void)
// Set up the memory convenience pointers stored in an SH2 context.
//   sh2: the CPU context to initialise. Its is_slave field decides which
//        BIOS image and which data array DRC block table get used.
void sh2_drc_mem_setup(SH2 *sh2)
{
// fill the convenience pointers
sh2->p_bios = sh2->is_slave ? Pico32xMem->sh2_rom_s.w : Pico32xMem->sh2_rom_m.w;
sh2->p_da = sh2->data_array;
sh2->p_sdram = Pico32xMem->sdram;
sh2->p_rom = Pico.rom;
// sh2->p_dram filled in dram bank switching
// fill the DRC-only convenience pointers
// !! folds is_slave to 0 or 1 so it can index the per-CPU table
sh2->p_drcblk_da = Pico32xMem->drcblk_da[!!sh2->is_slave];
sh2->p_drcblk_ram = Pico32xMem->drcblk_ram;
}

View file

@ -1855,17 +1855,15 @@ void *p32x_sh2_get_mem_ptr(u32 a, u32 *mask, SH2 *sh2)
{
const sh2_memmap *mm = sh2->read8_map;
void *ret = (void *)-1;
u32 am;
mm += a >> SH2_READ_SHIFT;
am = a & ((1 << SH2_READ_SHIFT)-1);
if (!map_flag_set(mm->addr) && !(am & ~mm->mask)) {
mm += SH2MAP_ADDR2OFFS_R(a);
if (!map_flag_set(mm->addr)) {
// directly mapped memory (SDRAM, ROM, data array)
ret = (void *)(mm->addr << 1);
*mask = mm->mask;
} else if ((a & ~0x7ff) == 0) {
// BIOS, has handler function since it shares its segment with I/O
ret = sh2->is_slave ? Pico32xMem->sh2_rom_s.w : Pico32xMem->sh2_rom_m.w;
ret = sh2->p_bios;
*mask = 0x7ff;
} else if ((a & 0xc6000000) == 0x02000000) {
// banked ROM. Return bank address
@ -1877,6 +1875,75 @@ void *p32x_sh2_get_mem_ptr(u32 a, u32 *mask, SH2 *sh2)
return ret;
}
// Fast block copy between memory areas in the SH2 address space.
//
//   dst, src: SH2 addresses of destination and source
//   count:    number of units to copy
//   size:     size of one unit in bytes
//   sh2:      CPU context the accesses are made for
//
// The source is read directly from host memory through the pointer obtained
// from p32x_sh2_get_mem_ptr, while the destination is written through the
// regular p32x_sh2_write8/16/32 functions.
//
// Returns the number of units actually copied. This is 0 if there is nothing
// to copy or if either address is not backed by memory (get_mem_ptr returned
// (void *)-1), and it is less than count if the source range would extend
// beyond the offset mask of its region. The caller has to transfer whatever
// is left over by other means.
int p32x_sh2_memcpy(u32 dst, u32 src, int count, int size, SH2 *sh2)
{
  u32 mask, room;
  u8 *ps;
  void *pd;
  int len, i;

  // nothing to do; without this check an odd dst or the trailing byte
  // handling would still emit writes for a zero or negative length
  if (count <= 0 || size <= 0)
    return 0;

  // check if src and dst points to memory (rom/sdram/dram/da)
  if ((pd = p32x_sh2_get_mem_ptr(dst, &mask, sh2)) == (void *)-1)
    return 0;
  if ((ps = p32x_sh2_get_mem_ptr(src, &mask, sh2)) == (void *)-1)
    return 0;
  // from here on mask is the offset mask of the source region
  ps += src & mask;
  len = count * size;

  // do not read past the last offset the source mask can address; room is
  // the number of bytes following the first source byte within the region
  room = mask - (src & mask);
  if ((u32)len - 1 > room) {
    count = (room + 1) / (u32)size;
    if (count == 0)
      return 0;
    len = count * size;
  }

  // DRAM in byte access is always in overwrite mode
  if (pd == sh2->p_dram && size == 1)
    dst |= 0x20000;

  // align dst to halfword
  if (dst & 1) {
    // bytes are addressed with the lowest address bit flipped, since the
    // data is kept as host 16 bit words
    p32x_sh2_write8(dst, *(u8 *)((uptr)ps ^ 1), sh2);
    ps++;
    dst++;
    len--;
  }

  // copy data
  if ((uptr)ps & 1) {
    // unaligned, use halfword copy mode to reduce memory bandwidth
    u16 *sp = (u16 *)(ps - 1);
    u16 dl, dh = *sp++;

    // each halfword written is assembled from the low byte of the previous
    // source word and the high byte of the next one
    for (i = 0; i < (len & ~1); i += 2, dst += 2, sp++) {
      dl = dh, dh = *sp;
      p32x_sh2_write16(dst, (u16)((dh >> 8) | (dl << 8)), sh2);
    }
    if (len & 1)
      p32x_sh2_write8(dst, dh & 0xff, sh2);
  } else {
    // dst and src at least halfword aligned
    u16 *sp = (u16 *)ps;

    // align dst to word
    if ((dst & 2) && len >= 2) {
      p32x_sh2_write16(dst, *sp++, sh2);
      dst += 2, len -= 2;
    }

    if ((uptr)sp & 2) {
      // halfword copy, using word writes to reduce memory bandwidth
      u16 dl, dh;
      for (i = 0; i < (len & ~3); i += 4, dst += 4, sp += 2) {
        dl = sp[0], dh = sp[1];
        // widen before shifting, a promoted u16 would shift into the sign bit
        p32x_sh2_write32(dst, ((u32)dl << 16) | dh, sh2);
      }
    } else {
      // word copy
      u32 d;
      for (i = 0; i < (len & ~3); i += 4, dst += 4, sp += 2) {
        d = *(u32 *)sp;
        // swap the halfwords of the host word to get the SH2 word value
        p32x_sh2_write32(dst, (d << 16) | (d >> 16), sh2);
      }
    }

    // remaining halfword and byte
    if (len & 2) {
      p32x_sh2_write16(dst, *sp++, sh2);
      dst += 2;
    }
    if (len & 1)
      p32x_sh2_write8(dst, *sp >> 8, sh2);
  }

  return count;
}
// -----------------------------------------------------------------
static void z80_md_bank_write_32x(unsigned int a, unsigned char d)
@ -2107,8 +2174,12 @@ void Pico32xSwapDRAM(int b)
ssh2_read16_map[0x04/2].addr = ssh2_read16_map[0x24/2].addr =
ssh2_read32_map[0x04/2].addr = ssh2_read32_map[0x24/2].addr = MAP_MEMORY(Pico32xMem->dram[b]);
msh2.p_dram = ssh2.p_dram = Pico32xMem->dram[b]; // DRC conveniance ptr
// convenience ptrs
msh2.p_sdram = ssh2.p_sdram = Pico32xMem->sdram;
msh2.p_dram = ssh2.p_dram = Pico32xMem->dram[b];
msh2.p_rom = ssh2.p_rom = Pico.rom;
msh2.p_bios = Pico32xMem->sh2_rom_m.w; msh2.p_da = msh2.data_array;
ssh2.p_bios = Pico32xMem->sh2_rom_s.w; ssh2.p_da = ssh2.data_array;
}
static void bank_switch_rom_sh2(void)

View file

@ -129,6 +129,24 @@ static void dmac_transfer_one(SH2 *sh2, struct dma_chan *chan)
chan->sar += size;
}
// Fast path for SH2 DMA: hand the transfer to p32x_sh2_memcpy and advance
// the channel registers by the number of units it reports as copied.
// Nothing is done if CHCR bit 14 is clear or if TCR is below 4.
static void dmac_memcpy(struct dma_chan *chan, SH2 *sh2)
{
  u32 shift = (chan->chcr >> 10) & 3; // log2 of the transfer unit size
  u32 incr = chan->chcr & (1 << 14);
  int done;

  if (!incr)
    return;
  if (chan->tcr < 4)
    return;

  // 4-word xfer mode still counts in words
  if (shift == 3)
    shift = 2;

  // XXX check TCR being a multiple of 4 in 4-word xfer mode?
  // XXX check alignment of sar/dar, generating a bus error if unaligned?
  done = p32x_sh2_memcpy(chan->dar, chan->sar, chan->tcr, 1 << shift, sh2);

  // account for what has been copied; the rest stays in the registers
  chan->sar += done << shift;
  chan->dar += done << shift;
  chan->tcr -= done;
}
// DMA trigger by SH2 register write
static void dmac_trigger(SH2 *sh2, struct dma_chan *chan)
{
@ -139,6 +157,11 @@ static void dmac_trigger(SH2 *sh2, struct dma_chan *chan)
if (chan->chcr & DMA_AR) {
// auto-request transfer
sh2->state |= SH2_STATE_SLEEP;
if ((((chan->chcr >> 12) ^ (chan->chcr >> 14)) & 3) == 0 &&
(((chan->chcr >> 14) ^ (chan->chcr >> 15)) & 1) == 1) {
// SM == DM and either DM0 or DM1 are set. check for mem to mem copy
dmac_memcpy(chan, sh2);
}
while ((int)chan->tcr > 0)
dmac_transfer_one(sh2, chan);
dmac_transfer_complete(sh2, chan);

View file

@ -937,6 +937,7 @@ unsigned int REGPARM(3) p32x_sh2_poll_memory16(unsigned int a, unsigned int d, S
unsigned int REGPARM(3) p32x_sh2_poll_memory32(unsigned int a, unsigned int d, SH2 *sh2);
void *p32x_sh2_get_mem_ptr(unsigned int a, unsigned int *mask, SH2 *sh2);
void p32x_sh2_poll_event(SH2 *sh2, unsigned int flags, unsigned int m68k_cycles);
int p32x_sh2_memcpy(unsigned int dst, unsigned int src, int count, int size, SH2 *sh2);
// 32x/draw.c
void PicoDrawSetOutFormat32x(pdso_t which, int use_32x_line_mode);