core, improve 68k timing accuracy

This commit is contained in:
kub 2023-07-11 21:18:05 +00:00
parent a5aae2c39f
commit 7263343dc7
8 changed files with 76 additions and 63 deletions

View file

@ -197,10 +197,6 @@ void p32x_reset_sh2s(void)
void Pico32xInit(void)
{
if (msh2.mult_m68k_to_sh2 == 0 || msh2.mult_sh2_to_m68k == 0)
Pico32xSetClocks(PICO_MSH2_HZ, 0);
if (ssh2.mult_m68k_to_sh2 == 0 || ssh2.mult_sh2_to_m68k == 0)
Pico32xSetClocks(0, PICO_MSH2_HZ);
}
void PicoPower32x(void)
@ -284,8 +280,11 @@ static void p32x_end_blank(void)
Pico32x.vdp_regs[0x0a/2] &= ~P32XV_VBLK; // get out of vblank
if ((Pico32x.vdp_regs[0] & P32XV_Mx) != 0) // no forced blanking
Pico32x.vdp_regs[0x0a/2] &= ~P32XV_PEN; // no palette access
if (!(Pico32x.sh2_regs[0] & 0x80))
if (!(Pico32x.sh2_regs[0] & 0x80)) {
// NB must precede VInt per hw manual, min 4 SH-2 cycles to pass Mars Check
Pico32x.hint_counter = -0x18;
p32x_schedule_hint(NULL, Pico.t.m68c_aim);
}
p32x_sh2_poll_event(msh2.poll_addr, &msh2, SH2_STATE_VPOLL, Pico.t.m68c_aim);
p32x_sh2_poll_event(ssh2.poll_addr, &ssh2, SH2_STATE_VPOLL, Pico.t.m68c_aim);
@ -300,7 +299,9 @@ void p32x_schedule_hint(SH2 *sh2, unsigned int m68k_cycles)
if (!(Pico32x.sh2_regs[0] & 0x80) && (Pico.video.status & PVS_VB2))
return;
after = (Pico32x.sh2_regs[4 / 2] + 1) * 488;
Pico32x.hint_counter += (Pico32x.sh2_regs[4 / 2] + 1) * (int)(488.5*0x10);
after = Pico32x.hint_counter >> 4;
Pico32x.hint_counter &= 0xf;
if (sh2 != NULL)
p32x_event_schedule_sh2(sh2, P32X_EVENT_HINT, after);
else
@ -633,7 +634,8 @@ void Pico32xStateLoaded(int is_early)
return;
}
if (sh2s[0].m68krcycles_done == 0 && sh2s[1].m68krcycles_done == 0)
if (CYCLES_GE(sh2s[0].m68krcycles_done - Pico.t.m68c_aim, 500) ||
CYCLES_GE(sh2s[1].m68krcycles_done - Pico.t.m68c_aim, 500))
sh2s[0].m68krcycles_done = sh2s[1].m68krcycles_done = SekCyclesDone();
p32x_update_irls(NULL, SekCyclesDone());
p32x_timers_recalc();
@ -643,6 +645,11 @@ void Pico32xStateLoaded(int is_early)
// Prepare the 32X side before emulation runs: make sure both SH2s have
// their 68k<->SH2 cycle conversion multipliers set, then prepare both SH2
// cores for execution.
void Pico32xPrepare(void)
{
// a zero multiplier presumably means the clock was never configured, so
// fall back to the default master SH2 frequency for that CPU.
// NOTE(review): a 0 argument to Pico32xSetClocks looks like "leave this
// CPU's clock unchanged" - confirm against its definition
if (msh2.mult_m68k_to_sh2 == 0 || msh2.mult_sh2_to_m68k == 0)
Pico32xSetClocks(PICO_MSH2_HZ, 0);
if (ssh2.mult_m68k_to_sh2 == 0 || ssh2.mult_sh2_to_m68k == 0)
Pico32xSetClocks(0, PICO_MSH2_HZ);
// second argument is nonzero iff the DRC option bit is set in PicoIn.opt
sh2_execute_prepare(&msh2, PicoIn.opt & POPT_EN_DRC);
sh2_execute_prepare(&ssh2, PicoIn.opt & POPT_EN_DRC);
}

View file

@ -123,18 +123,13 @@ static void SekRunS68k(unsigned int to)
pprof_end(s68k);
}
static void pcd_set_cycle_mult(void)
void PicoMCDPrepare(void)
{
unsigned int div;
if (Pico.m.pal)
div = 50*313*488;
else
div = 60*262*488;
// ~1.63 for NTSC, ~1.645 for PAL; round to nearest, x/y+0.5 -> (x+y/2)/y
mcd_m68k_cycle_mult = ((12500000ull << 16) + div/2) / div;
mcd_s68k_cycle_mult = ((1ull*div << 16) + 6250000) / 12500000;
// ~1.63 for NTSC, ~1.645 for PAL
#define DIV_ROUND(x,y) ((x)+(y)/2) / (y) // round to nearest, x/y+0.5 -> (x+y/2)/y
unsigned int osc = (Pico.m.pal ? OSC_PAL : OSC_NTSC);
mcd_m68k_cycle_mult = DIV_ROUND(12500000ull << 16, osc / 7);
mcd_s68k_cycle_mult = DIV_ROUND(1ull * osc << 16, 7 * 12500000);
}
unsigned int pcd_cycles_m68k_to_s68k(unsigned int c)
@ -312,11 +307,13 @@ int pcd_sync_s68k(unsigned int m68k_target, int m68k_poll_sync)
#define pcd_run_cpus_normal pcd_run_cpus
//#define pcd_run_cpus_lockstep pcd_run_cpus
static void SekAimM68k(int cyc, int mult);
static int SekSyncM68k(int once);
void pcd_run_cpus_normal(int m68k_cycles)
{
Pico.t.m68c_aim += m68k_cycles;
// TODO this is suspicious. ~1 cycle refresh delay every 256 cycles?
SekAimM68k(m68k_cycles, 0x43); // Fhey area
while (CYCLES_GT(Pico.t.m68c_aim, Pico.t.m68c_cnt)) {
if (SekShouldInterrupt()) {
@ -376,8 +373,6 @@ void pcd_run_cpus_lockstep(int m68k_cycles)
void pcd_prepare_frame(void)
{
pcd_set_cycle_mult();
// need this because we can't have direct mapping between
// master<->slave cycle counters because of overflows
mcd_m68k_cycle_base = Pico.t.m68c_aim;
@ -397,7 +392,6 @@ void pcd_state_loaded(void)
unsigned int cycles;
int diff;
pcd_set_cycle_mult();
pcd_state_loaded_mem();
memset(Pico_mcd->pcm_mixbuf, 0, sizeof(Pico_mcd->pcm_mixbuf));
@ -407,8 +401,7 @@ void pcd_state_loaded(void)
// old savestates..
cycles = pcd_cycles_m68k_to_s68k(Pico.t.m68c_aim);
diff = cycles - SekCycleAimS68k;
if (diff < -1000 || diff > 1000) {
if (CYCLES_GE(cycles - SekCycleAimS68k, 1000)) {
SekCycleCntS68k = SekCycleAimS68k = cycles;
}
if (pcd_event_times[PCD_EVENT_CDC] == 0) {

View file

@ -1041,11 +1041,11 @@ static int get_scanline(int is_from_z80)
if (is_from_z80) {
// ugh... compute by dividing cycles since frame start by cycles per line
// need some fractional resolution here, else there may be an extra line
int cycles_line = cycles_68k_to_z80(488 << 8)+1; // cycles per line, as Q8
int cycles_line = cycles_68k_to_z80((unsigned)(488.5*256))+1; // cycles per line, Q8
int cycles_z80 = (z80_cyclesLeft<0 ? Pico.t.z80c_aim:z80_cyclesDone())<<8;
int cycles = cycles_line * Pico.t.z80_scanline;
// approximation by multiplying with inverse
if (cycles_z80 - cycles >= 2*cycles_line) {
if (cycles_z80 - cycles >= 4*cycles_line) {
// compute 1/cycles_line, storing the result to avoid future dividing
static int cycles_line_o, cycles_line_i;
if (cycles_line_o != cycles_line)
@ -1150,7 +1150,6 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80)
switch (addr)
{
// NB, OD2 A/V sync HACK: lower timer step by 1/4 z80 cycle (=64 in Q8)
case 0x24: // timer A High 8
case 0x25: { // timer A Low 2
int TAnew = (addr == 0x24) ? ((ym2612.OPN.ST.TA & 0x03)|(((int)d)<<2))
@ -1163,7 +1162,7 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80)
ym2612.OPN.ST.TA = TAnew;
//ym2612.OPN.ST.TAC = (1024-TAnew)*18;
//ym2612.OPN.ST.TAT = 0;
Pico.t.timer_a_step = TIMER_A_TICK_ZCYCLES * (1024 - TAnew) - 64;
Pico.t.timer_a_step = TIMER_A_TICK_ZCYCLES * (1024 - TAnew);
elprintf(EL_YMTIMER, "timer a set to %i, %i", 1024 - TAnew, Pico.t.timer_a_next_oflow>>8);
}
return 0;
@ -1176,7 +1175,7 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80)
ym2612.OPN.ST.TB = d;
//ym2612.OPN.ST.TBC = (256-d) * 288;
//ym2612.OPN.ST.TBT = 0;
Pico.t.timer_b_step = TIMER_B_TICK_ZCYCLES * (256 - d) - 64;
Pico.t.timer_b_step = TIMER_B_TICK_ZCYCLES * (256 - d);
elprintf(EL_YMTIMER, "timer b set to %i, %i", 256 - d, Pico.t.timer_b_next_oflow>>8);
}
return 0;
@ -1350,7 +1349,7 @@ static void access_68k_bus(int delay) // bus delay as Q8
// until an additional cycle is full. That is then added to the integer part.
Pico.t.z80_busdelay = (delay&0xff) + (Pico.t.z80_busdelay&0xff); // accumulate
z80_subCLeft((delay>>8) + (Pico.t.z80_busdelay>>8));
// don't use SekCyclesBurn(7) here since the Z80 doesn't run in cycle lock to
// don't use SekCyclesBurn() here since the Z80 doesn't run in cycle lock to
// the 68K. Count the stolen cycles to be accounted later in the 68k CPU runs
Pico.t.z80_buscycles += 7;
}
@ -1358,8 +1357,8 @@ static void access_68k_bus(int delay) // bus delay as Q8
static unsigned char z80_md_vdp_read(unsigned short a)
{
if ((a & 0xff00) == 0x7f00) {
// 68k bus access delay=3.3 per kabuto, for notaz picotest 2.4<=delay<2.55?
access_68k_bus(0x280); // Q8, picotest: 0x266(>=2.4) - 0x28b(<2.55)
// 68k bus access delay=3.3 per kabuto, for notaz picotest 2.42<delay<2.57?
access_68k_bus(0x280); // Q8, picotest: 0x26d(>2.42) - 0x292(<2.57)
switch (a & 0x0d)
{
@ -1383,8 +1382,8 @@ static unsigned char z80_md_bank_read(unsigned short a)
unsigned int addr68k;
unsigned char ret;
// 68k bus access delay=3.3 per kabuto, but for notaz picotest 3.0<delay<3.3
access_68k_bus(0x340); // // Q8, picotest: 0x301(>3.0)-0x34c(<3.3)
// 68k bus access delay=3.3 per kabuto, but for notaz picotest 3.02<delay<3.32
access_68k_bus(0x340); // Q8, picotest: 0x306(>3.02)-0x351(<3.32)
addr68k = Pico.m.z80_bank68k << 15;
addr68k |= a & 0x7fff;
@ -1425,8 +1424,8 @@ static void z80_md_bank_write(unsigned int a, unsigned char data)
{
unsigned int addr68k;
// 68k bus access delay=3.3 per kabuto, but for notaz picotest 3.0<delay<3.3
access_68k_bus(0x340); // // Q8, picotest: 0x301(>3.0)-0x34c(<3.3)
// 68k bus access delay=3.3 per kabuto, but for notaz picotest 3.02<delay<3.32
access_68k_bus(0x340); // Q8, picotest: 0x306(>3.02)-0x351(<3.32)
addr68k = Pico.m.z80_bank68k << 15;
addr68k += a & 0x7fff;

View file

@ -228,6 +228,8 @@ void PicoLoopPrepare(void)
Pico.m.dirtyPal = 1;
rendstatus_old = -1;
if (PicoIn.AHW & PAHW_MCD)
PicoMCDPrepare();
if (PicoIn.AHW & PAHW_32X)
Pico32xPrepare();
}

View file

@ -66,15 +66,22 @@ static int SekSyncM68k(int once)
return Pico.t.m68c_aim > Pico.t.m68c_cnt;
}
// Advance the 68k cycle target by cyc and charge the memory refresh slowdown.
// cyc:  number of 68k cycles to add to the target (Pico.t.m68c_aim)
// mult: refresh penalty per 68k cycle as a Q14 fraction, i.e. the delay is
//       cyc*mult/16384 cycles (0x10c is about 1 cycle in 61, 0x43 about 1 in 245)
static __inline void SekAimM68k(int cyc, int mult)
{
// refresh slowdown, for cart: 2 cycles every 128 - make this 1 every 64,
// for RAM: seems to be 0-3 every 128. Carts usually run from the cart
// area, but MCD games only use RAM, hence a different multiplier is needed.
// NB must be quite accurate, so handle fractions as well (c/f OutRunners)
// accumulate the penalty in Q14; the integer part is the delay to apply now
int delay = (Pico.t.refresh_delay += cyc*mult) >> 14;
// advance the cycle counter by the delay (presumably so that these cycles
// are treated as already spent and not executed by the CPU - confirm)
Pico.t.m68c_cnt += delay;
// keep only the fractional remainder for the next call
Pico.t.refresh_delay -= delay << 14;
Pico.t.m68c_aim += cyc;
}
static __inline void SekRunM68k(int cyc)
{
// refresh slowdown handling, 2 cycles every 128 - make this 1 every 64
// NB must be quite accurate, so handle fractions as well (c/f OutRunners)
static int refresh;
Pico.t.m68c_cnt += (cyc + refresh) >> 6;
refresh = (cyc + refresh) & 0x3f;
Pico.t.m68c_aim += cyc;
// TODO 0x100 would be 2 cycles/128, moreover far too sensitive
SekAimM68k(cyc, 0x10c); // OutRunners, testpico, VDPFIFOTesting
SekSyncM68k(0);
}
@ -108,10 +115,9 @@ static void do_timing_hacks_end(struct PicoVideo *pv)
PicoVideoFIFOSync(CYCLES_M68K_LINE);
// need rather tight Z80 sync for emulation of main bus cycle stealing
if (Pico.m.scanline&1) {
if (Pico.m.scanline&1)
if (Pico.m.z80Run && !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80))
PicoSyncZ80(Pico.t.m68c_aim);
}
}
static void do_timing_hacks_start(struct PicoVideo *pv)
@ -122,6 +128,8 @@ static void do_timing_hacks_start(struct PicoVideo *pv)
// XXX how to handle Z80 bus cycle stealing during DMA correctly?
if ((Pico.t.z80_buscycles -= cycles) < 0)
Pico.t.z80_buscycles = 0;
if (Pico.m.scanline&1)
Pico.t.m68c_aim += 1; // add cycle each other line for 488.5 cycles/line
}
static int PicoFrameHints(void)
@ -167,7 +175,7 @@ static int PicoFrameHints(void)
}
// decide if we draw this line
if (!skip && (PicoIn.opt & POPT_ALT_RENDERER))
if ((PicoIn.opt & POPT_ALT_RENDERER) && !skip)
{
// find the right moment for frame renderer, when display is no longer blanked
if ((pv->reg[1]&0x40) || y > 100) {

View file

@ -204,7 +204,7 @@ extern struct DrZ80 drZ80;
#define z80_cyclesDone() \
(Pico.t.z80c_aim - z80_cyclesLeft)
// 68k clock = OSC/7, z80 clock = OSC/15, 68k:z80 ratio = 7/15*8192=3822.9
// 68k clock = OSC/7, z80 clock = OSC/15, 68k:z80 ratio = 7/15 = 3822.9/8192
#define cycles_68k_to_z80(x) ((x) * 3823 >> 13)
// ----------------------- SH2 CPU -----------------------
@ -443,6 +443,7 @@ struct PicoTiming
unsigned int m68c_aim;
unsigned int m68c_frame_start; // m68k cycles
unsigned int m68c_line_start;
int refresh_delay;
unsigned int z80c_cnt; // z80 cycles done (this frame)
unsigned int z80c_aim;
@ -523,7 +524,7 @@ struct mcd_misc
unsigned int stopwatch_base_c;
unsigned short m68k_poll_a;
unsigned short m68k_poll_cnt;
unsigned short s68k_poll_a;
unsigned short s68k_poll_a; // 10
unsigned short s68k_poll_cnt;
unsigned int s68k_poll_clk;
unsigned char bcram_reg; // 18: battery-backed RAM cart register
@ -640,7 +641,8 @@ struct Pico32x
unsigned char pad1;
unsigned short pwm_p[2]; // pwm pos in fifo
unsigned int pwm_cycle_p; // pwm play cursor (32x cycles)
unsigned int reserved[6];
unsigned int hint_counter;
unsigned int reserved[5];
};
struct Pico32xMem
@ -803,6 +805,7 @@ PICO_INTERNAL void PicoExitMCD(void);
PICO_INTERNAL void PicoPowerMCD(void);
PICO_INTERNAL int PicoResetMCD(void);
PICO_INTERNAL void PicoFrameMCD(void);
PICO_INTERNAL void PicoMCDPrepare(void);
enum pcd_event {
PCD_EVENT_CDC,

View file

@ -177,7 +177,7 @@ void PsndRerate(int preserve_state)
// samples per line (Q16)
Pico.snd.smpl_mult = 65536LL * PicoIn.sndRate / (target_fps*target_lines);
// samples per z80 clock (Q20)
Pico.snd.clkl_mult = 16 * Pico.snd.smpl_mult * 15/7 / 488;
Pico.snd.clkl_mult = 16 * Pico.snd.smpl_mult * 15/7 / 488.5;
// samples per 44.1 KHz sample
Pico.snd.cdda_mult = 65536LL * 44100 / PicoIn.sndRate;
Pico.snd.cdda_div = 65536LL * PicoIn.sndRate / 44100;

View file

@ -21,25 +21,27 @@ enum { clkdiv = 2 }; // CPU clock granularity: one of 1,2,4,8
// Thank you very much for the great work, Nemesis, Kabuto!
// Slot clock is sysclock/20 for h32 and sysclock/16 for h40.
// One scanline is 63.7us/63.5us (h32/h40) long which is 488.6/487.4 68k cycles.
// Assume 488 for everything.
// One scanline is 63.7us/64.3us (ntsc/pal) long which is ~488.57 68k cycles.
// Approximate by 488 for VDP.
// 1 slot is 488/171 = 2.8538 68k cycles in h32, and 488/210 = 2.3238 in h40.
enum { slcpu = 488 };
// VDP has a slot counter running from 0x00 to 0xff every scanline, but it has
// a gap depending on the video mode. The slot in which a horizontal interrupt
// is generated also depends on the video mode.
// NB Kabuto says gapend40 is 0xe4. That's technically correct, since slots 0xb6
// and 0xe4 are only half slots. Ignore 0xe4 here and make 0xb6 a full slot.
enum { hint32 = 0x85, gapstart32 = 0x94, gapend32 = 0xe9};
enum { hint40 = 0xa5, gapstart40 = 0xb7, gapend40 = 0xe5};
// XXX Kabuto says gapend40 is 0xe4, but then a line would've 211 slots, while
// it's 210 in all other sources I looked at?
// The horizontal sync period (HBLANK) is 30/37 slots (h32/h40):
// h32: 4 slots front porch (1.49us), 13 HSYNC (4.84us), 13 back porch (4.84us)
// h40: 5 slots front porch (1.49us), 16 HSYNC (4.77us), 16 back porch (4.77us)
// HBLANK starts in slot 0x93/0xb3 and ends after slot 0x05 (from Kabuto's doc)
// HBLANK starts at slot 0x93/0xb4 and ends in the middle of slot 0x05/0x06,
// NB VDP slows down the h40 clock to h32 during HSYNC for 17 slots to get the
// right sync timing. Ignored in the slot calculation, but hblen40 is correct.
enum { hboff32 = 0x93-hint32, hblen32 = 0xf8-(gapend32-gapstart32)-hint32};//30
enum { hboff40 = 0xb3-hint40, hblen40 = 0xf8-(gapend40-gapstart40)-hint40};//37
enum { hboff40 = 0xb4-hint40, hblen40 = 0xf8-(gapend40-gapstart40)-hint40};//37
// number of slots in a scanline
#define slots32 (0x100-(gapend32-gapstart32)) // 171
@ -263,7 +265,7 @@ void PicoVideoFIFOSync(int cycles)
// calculate #slots since last executed slot
slots = Cyc2Sl(vf, cycles) - vf->fifo_slot;
if (!slots || !vf->fifo_ql) return;
if (slots <= 0 || !vf->fifo_ql) return;
// advance FIFO queue by #done slots
done = slots;
@ -308,7 +310,7 @@ static int PicoVideoFIFODrain(int level, int cycles, int bgdma)
}
}
if (vf->fifo_ql && ((vf->fifo_total > level) | bd))
cycles = 488; // not completed in this scanline
cycles = slcpu; // not completed in this scanline
if (cycles > ocyc)
burn = cycles - ocyc;
@ -430,7 +432,7 @@ void PicoVideoFIFOMode(int active, int h40)
vf->fifo_hcounts = vdphcounts[h40];
// recalculate FIFO slot for new mode
vf->fifo_slot = Cyc2Sl(vf, lc);
vf->fifo_maxslot = Cyc2Sl(vf, 488);
vf->fifo_maxslot = Cyc2Sl(vf, slcpu);
}
// VDP memory rd/wr
@ -1031,10 +1033,9 @@ update_irq:
static u32 VideoSr(const struct PicoVideo *pv)
{
unsigned int hp = pv->reg[12]&1 ? hboff40*488/slots40 : hboff32*488/slots32;
unsigned int hl = pv->reg[12]&1 ? hblen40*488/slots40 : hblen32*488/slots32;
// XXX -2 is to please notaz' testpico, but why is this?
unsigned int c = SekCyclesDone()-2 - Pico.t.m68c_line_start;
unsigned int hp = pv->reg[12]&1 ? hboff40*488.5/slots40 : hboff32*488.5/slots32;
unsigned int hl = pv->reg[12]&1 ? hblen40*488.5/slots40 : hblen32*488.5/slots32;
unsigned int c = SekCyclesDone() - Pico.t.m68c_line_start;
u32 d;
PicoVideoFIFOSync(c);