core, improve 68k timing accuracy

2025-10-26 08:19:38 -04:00 · 2023-07-11 21:18:05 +00:00 · 2023-07-11 21:18:05 +00:00 · 7263343dc7
commit 7263343dc7
parent a5aae2c39f
8 changed files with 76 additions and 63 deletions
--- a/pico/32x/32x.c
+++ b/pico/32x/32x.c
@ -197,10 +197,6 @@ void p32x_reset_sh2s(void)

 void Pico32xInit(void)
 {
-  if (msh2.mult_m68k_to_sh2 == 0 || msh2.mult_sh2_to_m68k == 0)
-    Pico32xSetClocks(PICO_MSH2_HZ, 0);
-  if (ssh2.mult_m68k_to_sh2 == 0 || ssh2.mult_sh2_to_m68k == 0)
-    Pico32xSetClocks(0, PICO_MSH2_HZ);
 }

 void PicoPower32x(void)
@ -284,8 +280,11 @@ static void p32x_end_blank(void)
  Pico32x.vdp_regs[0x0a/2] &= ~P32XV_VBLK; // get out of vblank
  if ((Pico32x.vdp_regs[0] & P32XV_Mx) != 0) // no forced blanking
    Pico32x.vdp_regs[0x0a/2] &= ~P32XV_PEN; // no palette access
-  if (!(Pico32x.sh2_regs[0] & 0x80))
+  if (!(Pico32x.sh2_regs[0] & 0x80)) {
+    // NB must precede VInt per hw manual, min 4 SH-2 cycles to pass Mars Check
+    Pico32x.hint_counter = -0x18;
    p32x_schedule_hint(NULL, Pico.t.m68c_aim);
+  }

  p32x_sh2_poll_event(msh2.poll_addr, &msh2, SH2_STATE_VPOLL, Pico.t.m68c_aim);
  p32x_sh2_poll_event(ssh2.poll_addr, &ssh2, SH2_STATE_VPOLL, Pico.t.m68c_aim);
@ -300,7 +299,9 @@ void p32x_schedule_hint(SH2 *sh2, unsigned int m68k_cycles)
  if (!(Pico32x.sh2_regs[0] & 0x80) && (Pico.video.status & PVS_VB2))
    return;

-  after = (Pico32x.sh2_regs[4 / 2] + 1) * 488;
+  Pico32x.hint_counter += (Pico32x.sh2_regs[4 / 2] + 1) * (int)(488.5*0x10);
+  after = Pico32x.hint_counter >> 4;
+  Pico32x.hint_counter &= 0xf;
  if (sh2 != NULL)
    p32x_event_schedule_sh2(sh2, P32X_EVENT_HINT, after);
  else
@ -633,7 +634,8 @@ void Pico32xStateLoaded(int is_early)
    return;
  }

-  if (sh2s[0].m68krcycles_done == 0 && sh2s[1].m68krcycles_done == 0)
+  if (CYCLES_GE(sh2s[0].m68krcycles_done - Pico.t.m68c_aim, 500) ||
+      CYCLES_GE(sh2s[1].m68krcycles_done - Pico.t.m68c_aim, 500))
    sh2s[0].m68krcycles_done = sh2s[1].m68krcycles_done = SekCyclesDone();
  p32x_update_irls(NULL, SekCyclesDone());
  p32x_timers_recalc();
@ -643,6 +645,11 @@ void Pico32xStateLoaded(int is_early)

 void Pico32xPrepare(void)
 {
+  if (msh2.mult_m68k_to_sh2 == 0 || msh2.mult_sh2_to_m68k == 0)
+    Pico32xSetClocks(PICO_MSH2_HZ, 0);
+  if (ssh2.mult_m68k_to_sh2 == 0 || ssh2.mult_sh2_to_m68k == 0)
+    Pico32xSetClocks(0, PICO_MSH2_HZ);
+
  sh2_execute_prepare(&msh2, PicoIn.opt & POPT_EN_DRC);
  sh2_execute_prepare(&ssh2, PicoIn.opt & POPT_EN_DRC);
 }
--- a/pico/cd/mcd.c
+++ b/pico/cd/mcd.c
@ -123,18 +123,13 @@ static void SekRunS68k(unsigned int to)
  pprof_end(s68k);
 }

-static void pcd_set_cycle_mult(void)
+void PicoMCDPrepare(void)
 {
-  unsigned int div;
-
-  if (Pico.m.pal)
-    div = 50*313*488;
-  else
-    div = 60*262*488;
-
-  // ~1.63 for NTSC, ~1.645 for PAL; round to nearest, x/y+0.5 -> (x+y/2)/y
-  mcd_m68k_cycle_mult = ((12500000ull << 16) + div/2) / div;
-  mcd_s68k_cycle_mult = ((1ull*div << 16)  + 6250000) / 12500000;
+  // ~1.63 for NTSC, ~1.645 for PAL
+#define DIV_ROUND(x,y) ((x)+(y)/2) / (y) // round to nearest, x/y+0.5 -> (x+y/2)/y
+  unsigned int osc = (Pico.m.pal ? OSC_PAL : OSC_NTSC);
+  mcd_m68k_cycle_mult = DIV_ROUND(12500000ull << 16, osc / 7);
+  mcd_s68k_cycle_mult = DIV_ROUND(1ull * osc << 16, 7 * 12500000);
 }

 unsigned int pcd_cycles_m68k_to_s68k(unsigned int c)
@ -312,11 +307,13 @@ int pcd_sync_s68k(unsigned int m68k_target, int m68k_poll_sync)
 #define pcd_run_cpus_normal pcd_run_cpus
 //#define pcd_run_cpus_lockstep pcd_run_cpus

+static void SekAimM68k(int cyc, int mult);
 static int SekSyncM68k(int once);

 void pcd_run_cpus_normal(int m68k_cycles)
 {
-  Pico.t.m68c_aim += m68k_cycles;
+  // TODO this is suspicious. ~1 cycle refresh delay every 256 cycles?
+  SekAimM68k(m68k_cycles, 0x43); // Fhey area

  while (CYCLES_GT(Pico.t.m68c_aim, Pico.t.m68c_cnt)) {
    if (SekShouldInterrupt()) {
@ -376,8 +373,6 @@ void pcd_run_cpus_lockstep(int m68k_cycles)

 void pcd_prepare_frame(void)
 {
-  pcd_set_cycle_mult();
-
  // need this because we can't have direct mapping between
  // master<->slave cycle counters because of overflows
  mcd_m68k_cycle_base = Pico.t.m68c_aim;
@ -397,7 +392,6 @@ void pcd_state_loaded(void)
  unsigned int cycles;
  int diff;

-  pcd_set_cycle_mult();
  pcd_state_loaded_mem();

  memset(Pico_mcd->pcm_mixbuf, 0, sizeof(Pico_mcd->pcm_mixbuf));
@ -407,8 +401,7 @@ void pcd_state_loaded(void)

  // old savestates..
  cycles = pcd_cycles_m68k_to_s68k(Pico.t.m68c_aim);
-  diff = cycles - SekCycleAimS68k;
-  if (diff < -1000 || diff > 1000) {
+  if (CYCLES_GE(cycles - SekCycleAimS68k, 1000)) {
    SekCycleCntS68k = SekCycleAimS68k = cycles;
  }
  if (pcd_event_times[PCD_EVENT_CDC] == 0) {
--- a/pico/memory.c
+++ b/pico/memory.c
@ -1041,11 +1041,11 @@ static int get_scanline(int is_from_z80)
  if (is_from_z80) {
    // ugh... compute by dividing cycles since frame start by cycles per line
    // need some fractional resolution here, else there may be an extra line
-    int cycles_line = cycles_68k_to_z80(488 << 8)+1; // cycles per line, as Q8
+    int cycles_line = cycles_68k_to_z80((unsigned)(488.5*256))+1; // cycles per line, Q8
    int cycles_z80 = (z80_cyclesLeft<0 ? Pico.t.z80c_aim:z80_cyclesDone())<<8;
    int cycles = cycles_line * Pico.t.z80_scanline;
    // approximation by multiplying with inverse
-    if (cycles_z80 - cycles >= 2*cycles_line) {
+    if (cycles_z80 - cycles >= 4*cycles_line) {
      // compute 1/cycles_line, storing the result to avoid future dividing
      static int cycles_line_o, cycles_line_i;
      if (cycles_line_o != cycles_line)
@ -1150,7 +1150,6 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80)

      switch (addr)
      {
-        // NB, OD2 A/V sync HACK: lower timer step by 1/4 z80 cycle (=64 in Q8)
        case 0x24: // timer A High 8
        case 0x25: { // timer A Low 2
          int TAnew = (addr == 0x24) ? ((ym2612.OPN.ST.TA & 0x03)|(((int)d)<<2))
@ -1163,7 +1162,7 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80)
            ym2612.OPN.ST.TA = TAnew;
            //ym2612.OPN.ST.TAC = (1024-TAnew)*18;
            //ym2612.OPN.ST.TAT = 0;
-            Pico.t.timer_a_step = TIMER_A_TICK_ZCYCLES * (1024 - TAnew) - 64;
+            Pico.t.timer_a_step = TIMER_A_TICK_ZCYCLES * (1024 - TAnew);
            elprintf(EL_YMTIMER, "timer a set to %i, %i", 1024 - TAnew, Pico.t.timer_a_next_oflow>>8);
          }
          return 0;
@ -1176,7 +1175,7 @@ static int ym2612_write_local(u32 a, u32 d, int is_from_z80)
            ym2612.OPN.ST.TB = d;
            //ym2612.OPN.ST.TBC = (256-d) * 288;
            //ym2612.OPN.ST.TBT  = 0;
-            Pico.t.timer_b_step = TIMER_B_TICK_ZCYCLES * (256 - d) - 64;
+            Pico.t.timer_b_step = TIMER_B_TICK_ZCYCLES * (256 - d);
            elprintf(EL_YMTIMER, "timer b set to %i, %i", 256 - d, Pico.t.timer_b_next_oflow>>8);
          }
          return 0;
@ -1350,7 +1349,7 @@ static void access_68k_bus(int delay) // bus delay as Q8
  // until an additional cycle is full. That is then added to the integer part.
  Pico.t.z80_busdelay = (delay&0xff) + (Pico.t.z80_busdelay&0xff); // accumulate
  z80_subCLeft((delay>>8) + (Pico.t.z80_busdelay>>8));
-  // don't use SekCyclesBurn(7) here since the Z80 doesn't run in cycle lock to
+  // don't use SekCyclesBurn() here since the Z80 doesn't run in cycle lock to
  // the 68K. Count the stolen cycles to be accounted later in the 68k CPU runs
  Pico.t.z80_buscycles += 7;
 }
@ -1358,8 +1357,8 @@ static void access_68k_bus(int delay) // bus delay as Q8
 static unsigned char z80_md_vdp_read(unsigned short a)
 {
  if ((a & 0xff00) == 0x7f00) {
-    // 68k bus access delay=3.3 per kabuto, for notaz picotest 2.4<=delay<2.55?
-    access_68k_bus(0x280); // Q8, picotest: 0x266(>=2.4) - 0x28b(<2.55)
+    // 68k bus access delay=3.3 per kabuto, for notaz picotest 2.42<delay<2.57?
+    access_68k_bus(0x280); // Q8, picotest: 0x26d(>2.42) - 0x292(<2.57)

    switch (a & 0x0d)
    {
@ -1383,8 +1382,8 @@ static unsigned char z80_md_bank_read(unsigned short a)
  unsigned int addr68k;
  unsigned char ret;

-  // 68k bus access delay=3.3 per kabuto, but for notaz picotest 3.0<delay<3.3
-  access_68k_bus(0x340); // // Q8, picotest: 0x301(>3.0)-0x34c(<3.3)
+  // 68k bus access delay=3.3 per kabuto, but for notaz picotest 3.02<delay<3.32
+  access_68k_bus(0x340); // Q8, picotest: 0x306(>3.02)-0x351(<3.32)

  addr68k = Pico.m.z80_bank68k << 15;
  addr68k |= a & 0x7fff;
@ -1425,8 +1424,8 @@ static void z80_md_bank_write(unsigned int a, unsigned char data)
 {
  unsigned int addr68k;

-  // 68k bus access delay=3.3 per kabuto, but for notaz picotest 3.0<delay<3.3
-  access_68k_bus(0x340); // // Q8, picotest: 0x301(>3.0)-0x34c(<3.3)
+  // 68k bus access delay=3.3 per kabuto, but for notaz picotest 3.02<delay<3.32
+  access_68k_bus(0x340); // Q8, picotest: 0x306(>3.02)-0x351(<3.32)

  addr68k = Pico.m.z80_bank68k << 15;
  addr68k += a & 0x7fff;
--- a/pico/pico.c
+++ b/pico/pico.c
@ -228,6 +228,8 @@ void PicoLoopPrepare(void)
  Pico.m.dirtyPal = 1;
  rendstatus_old = -1;

+  if (PicoIn.AHW & PAHW_MCD)
+    PicoMCDPrepare();
  if (PicoIn.AHW & PAHW_32X)
    Pico32xPrepare();
 }
--- a/pico/pico_cmn.c
+++ b/pico/pico_cmn.c
@ -66,15 +66,22 @@ static int SekSyncM68k(int once)
  return Pico.t.m68c_aim > Pico.t.m68c_cnt;
 }

+static __inline void SekAimM68k(int cyc, int mult)
+{
+  // refresh slowdown, for cart: 2 cycles every 128 - make this 1 every 64,
+  // for RAM: seems to be 0-3 every 128. Carts usually run from the cart
+  // area, but MCD games only use RAM, hence a different multiplier is needed.
+  // NB must be quite accurate, so handle fractions as well (c/f OutRunners)
+  int delay = (Pico.t.refresh_delay += cyc*mult) >> 14;
+  Pico.t.m68c_cnt += delay;
+  Pico.t.refresh_delay -= delay << 14;
+  Pico.t.m68c_aim += cyc;
+}
+
 static __inline void SekRunM68k(int cyc)
 {
-  // refresh slowdown handling, 2 cycles every 128 - make this 1 every 64
-  // NB must be quite accurate, so handle fractions as well (c/f OutRunners)
-  static int refresh;
-  Pico.t.m68c_cnt += (cyc + refresh) >> 6;
-  refresh = (cyc + refresh) & 0x3f;
-  Pico.t.m68c_aim += cyc;
-
+  // TODO 0x100 would be 2 cycles/128, moreover far too sensitive
+  SekAimM68k(cyc, 0x10c); // OutRunners, testpico, VDPFIFOTesting
  SekSyncM68k(0);
 }

@ -108,10 +115,9 @@ static void do_timing_hacks_end(struct PicoVideo *pv)
  PicoVideoFIFOSync(CYCLES_M68K_LINE);

  // need rather tight Z80 sync for emulation of main bus cycle stealing
-  if (Pico.m.scanline&1) {
+  if (Pico.m.scanline&1)
    if (Pico.m.z80Run && !Pico.m.z80_reset && (PicoIn.opt&POPT_EN_Z80))
      PicoSyncZ80(Pico.t.m68c_aim);
-  }
 }

 static void do_timing_hacks_start(struct PicoVideo *pv)
@ -122,6 +128,8 @@ static void do_timing_hacks_start(struct PicoVideo *pv)
  // XXX how to handle Z80 bus cycle stealing during DMA correctly?
  if ((Pico.t.z80_buscycles -= cycles) < 0)
    Pico.t.z80_buscycles = 0;
+  if (Pico.m.scanline&1)
+    Pico.t.m68c_aim += 1; // add cycle each other line for 488.5 cycles/line
 }

 static int PicoFrameHints(void)
@ -167,7 +175,7 @@ static int PicoFrameHints(void)
    }

    // decide if we draw this line
-    if (!skip && (PicoIn.opt & POPT_ALT_RENDERER))
+    if ((PicoIn.opt & POPT_ALT_RENDERER) && !skip)
    {
      // find the right moment for frame renderer, when display is no longer blanked
      if ((pv->reg[1]&0x40) || y > 100) {
--- a/pico/pico_int.h
+++ b/pico/pico_int.h
@ -204,7 +204,7 @@ extern struct DrZ80 drZ80;
 #define z80_cyclesDone() \
  (Pico.t.z80c_aim - z80_cyclesLeft)

-// 68k clock = OSC/7, z80 clock = OSC/15, 68k:z80 ratio = 7/15*8192=3822.9
+// 68k clock = OSC/7, z80 clock = OSC/15, 68k:z80 ratio = 7/15 = 3822.9/8192
 #define cycles_68k_to_z80(x) ((x) * 3823 >> 13)

 // ----------------------- SH2 CPU -----------------------
@ -443,6 +443,7 @@ struct PicoTiming
  unsigned int m68c_aim;
  unsigned int m68c_frame_start;        // m68k cycles
  unsigned int m68c_line_start;
+  int refresh_delay;

  unsigned int z80c_cnt;                // z80 cycles done (this frame)
  unsigned int z80c_aim;
@ -523,7 +524,7 @@ struct mcd_misc
  unsigned int   stopwatch_base_c;
  unsigned short m68k_poll_a;
  unsigned short m68k_poll_cnt;
-  unsigned short s68k_poll_a;
+  unsigned short s68k_poll_a;     // 10
  unsigned short s68k_poll_cnt;
  unsigned int   s68k_poll_clk;
  unsigned char  bcram_reg;       // 18: battery-backed RAM cart register
@ -640,7 +641,8 @@ struct Pico32x
  unsigned char pad1;
  unsigned short pwm_p[2];       // pwm pos in fifo
  unsigned int pwm_cycle_p;      // pwm play cursor (32x cycles)
-  unsigned int reserved[6];
+  unsigned int hint_counter;
+  unsigned int reserved[5];
 };

 struct Pico32xMem
@ -803,6 +805,7 @@ PICO_INTERNAL void PicoExitMCD(void);
 PICO_INTERNAL void PicoPowerMCD(void);
 PICO_INTERNAL int  PicoResetMCD(void);
 PICO_INTERNAL void PicoFrameMCD(void);
+PICO_INTERNAL void PicoMCDPrepare(void);

 enum pcd_event {
  PCD_EVENT_CDC,
--- a/pico/sound/sound.c
+++ b/pico/sound/sound.c
@ -177,7 +177,7 @@ void PsndRerate(int preserve_state)
  // samples per line (Q16)
  Pico.snd.smpl_mult = 65536LL * PicoIn.sndRate / (target_fps*target_lines);
  // samples per z80 clock (Q20)
-  Pico.snd.clkl_mult = 16 * Pico.snd.smpl_mult * 15/7 / 488;
+  Pico.snd.clkl_mult = 16 * Pico.snd.smpl_mult * 15/7 / 488.5;
  // samples per 44.1 KHz sample
  Pico.snd.cdda_mult = 65536LL * 44100 / PicoIn.sndRate;
  Pico.snd.cdda_div  = 65536LL * PicoIn.sndRate / 44100;
--- a/pico/videoport.c
+++ b/pico/videoport.c
@ -21,25 +21,27 @@ enum { clkdiv = 2 };    // CPU clock granularity: one of 1,2,4,8
 // Thank you very much for the great work, Nemesis, Kabuto!

 // Slot clock is sysclock/20 for h32 and sysclock/16 for h40.
-// One scanline is 63.7us/63.5us (h32/h40) long which is 488.6/487.4 68k cycles.
-// Assume 488 for everything.
+// One scanline is 63.7us/64.3us (ntsc/pal) long which is ~488.57 68k cycles.
+// Approximate by 488 for VDP.
 // 1 slot is 488/171 = 2.8538 68k cycles in h32, and 488/210 = 2.3238 in h40.
 enum { slcpu = 488 };

 // VDP has a slot counter running from 0x00 to 0xff every scanline, but it has
 // a gap depending on the video mode. The slot in which a horizontal interrupt
 // is generated also depends on the video mode.
+// NB Kabuto says gapend40 is 0xe4. That's technically correct, since slots 0xb6
+// and 0xe4 are only half slots. Ignore 0xe4 here and make 0xb6 a full slot.
 enum { hint32 = 0x85, gapstart32 = 0x94, gapend32 = 0xe9};
 enum { hint40 = 0xa5, gapstart40 = 0xb7, gapend40 = 0xe5};
-// XXX Kabuto says gapend40 is 0xe4, but then a line would've 211 slots, while
-// it's 210 in all other sources I looked at?

 // The horizontal sync period (HBLANK) is 30/37 slots (h32/h40):
 // h32: 4 slots front porch (1.49us), 13 HSYNC (4.84us), 13 back porch (4.84us)
 // h40: 5 slots front porch (1.49us), 16 HSYNC (4.77us), 16 back porch (4.77us)
-// HBLANK starts in slot 0x93/0xb3 and ends after slot 0x05 (from Kabuto's doc)
+// HBLANK starts at slot 0x93/0xb4 and ends in the middle of slot 0x05/0x06.
+// NB VDP slows down the h40 clock to h32 during HSYNC for 17 slots to get the
+// right sync timing. Ignored in the slot calculation, but hblen40 is correct.
 enum { hboff32 = 0x93-hint32, hblen32 = 0xf8-(gapend32-gapstart32)-hint32};//30
-enum { hboff40 = 0xb3-hint40, hblen40 = 0xf8-(gapend40-gapstart40)-hint40};//37
+enum { hboff40 = 0xb4-hint40, hblen40 = 0xf8-(gapend40-gapstart40)-hint40};//37

 // number of slots in a scanline
 #define slots32	(0x100-(gapend32-gapstart32)) // 171
@ -263,7 +265,7 @@ void PicoVideoFIFOSync(int cycles)

  // calculate #slots since last executed slot
  slots = Cyc2Sl(vf, cycles) - vf->fifo_slot;
-  if (!slots || !vf->fifo_ql) return;
+  if (slots <= 0 || !vf->fifo_ql) return;

  // advance FIFO queue by #done slots
  done = slots;
@ -308,7 +310,7 @@ static int PicoVideoFIFODrain(int level, int cycles, int bgdma)
    }
  }
  if (vf->fifo_ql && ((vf->fifo_total > level) | bd))
-    cycles = 488; // not completed in this scanline
+    cycles = slcpu; // not completed in this scanline
  if (cycles > ocyc)
    burn = cycles - ocyc;

@ -430,7 +432,7 @@ void PicoVideoFIFOMode(int active, int h40)
  vf->fifo_hcounts = vdphcounts[h40];
  // recalculate FIFO slot for new mode
  vf->fifo_slot = Cyc2Sl(vf, lc);
-  vf->fifo_maxslot = Cyc2Sl(vf, 488);
+  vf->fifo_maxslot = Cyc2Sl(vf, slcpu);
 }

 // VDP memory rd/wr
@ -1031,10 +1033,9 @@ update_irq:

 static u32 VideoSr(const struct PicoVideo *pv)
 {
-  unsigned int hp = pv->reg[12]&1 ? hboff40*488/slots40 : hboff32*488/slots32;
-  unsigned int hl = pv->reg[12]&1 ? hblen40*488/slots40 : hblen32*488/slots32;
-  // XXX -2 is to please notaz' testpico, but why is this?
-  unsigned int c = SekCyclesDone()-2 - Pico.t.m68c_line_start;
+  unsigned int hp = pv->reg[12]&1 ? hboff40*488.5/slots40 : hboff32*488.5/slots32;
+  unsigned int hl = pv->reg[12]&1 ? hblen40*488.5/slots40 : hblen32*488.5/slots32;
+  unsigned int c = SekCyclesDone() - Pico.t.m68c_line_start;
  u32 d;

  PicoVideoFIFOSync(c);