sdl, complete overhaul of hardware/software scaling

2025-10-26 16:29:37 -04:00 · 2021-09-10 18:46:05 +02:00 · 2021-09-10 18:46:05 +02:00 · d5d1778252
commit d5d1778252
parent 6651998e9f
21 changed files with 1233 additions and 547 deletions
--- a/platform/common/emu.c
+++ b/platform/common/emu.c
@ -1219,7 +1219,7 @@ void emu_cmn_forced_frame(int no_scale, int do_emu, void *buf)
 		memset32((short *)g_screen_ptr + g_screen_ppitch * y, 0,
 			 g_screen_width * 2 / 4);

-	PicoIn.opt &= ~POPT_ALT_RENDERER;
+	PicoIn.opt &= ~(POPT_ALT_RENDERER|POPT_EN_SOFTSCALE);
 	PicoIn.opt |= POPT_ACC_SPRITES;
 	if (!no_scale && currentConfig.scaling)
 		PicoIn.opt |= POPT_EN_SOFTSCALE;
--- a/platform/common/emu.h
+++ b/platform/common/emu.h
@ -37,10 +37,19 @@ extern int g_screen_ppitch; // pitch in pixels

 enum {
 	EOPT_SCALE_NONE = 0,
-	EOPT_SCALE_SW,
+	// linux, GP2X:
+	EOPT_SCALE_SW = 1,
 	EOPT_SCALE_HW,
 };

+enum {
+	EOPT_FILTER_NONE = 0,
+	// software scalers
+	EOPT_FILTER_SMOOTHER = 1,
+	EOPT_FILTER_BILINEAR1,
+	EOPT_FILTER_BILINEAR2,
+};
+
 enum {
 	EOPT_CONFIRM_NONE = 0,
 	EOPT_CONFIRM_SAVE = 1,
@ -63,7 +72,7 @@ typedef struct _currentConfig_t {
 	int CPUclock;
 	int volume;
 	int gamma;
-	int scaling;  // gp2x: EOPT_SCALE_*; psp: bilinear filtering
+	int scaling;  // EOPT_SCALE_*
 	int vscaling;
 	int rotation; // for UIQ
 	float scale; // psp: screen scale
@ -72,7 +81,7 @@ typedef struct _currentConfig_t {
 	int turbo_rate;
 	int renderer;
 	int renderer32x;
-	int filter; // pandora
+	int filter;  // EOPT_FILTER_* video filter
 	int analog_deadzone;
 	int msh2_khz;
 	int ssh2_khz;
@ -180,6 +189,7 @@ void plat_update_volume(int has_changed, int is_up);
 /* should be in libpicofe/plat.h */
 void plat_video_clear_status(void);
 void plat_video_clear_buffers(void);
+void plat_video_set_size(int w, int h);

 #ifdef __cplusplus
 } // extern "C"
--- a/platform/common/plat_sdl.c
+++ b/platform/common/plat_sdl.c
@ -22,6 +22,7 @@
 #include <pico/pico_int.h>

 static void *shadow_fb;
+static struct area { int w, h; } area;

 static struct in_pdata in_sdl_platform_data = {
 	.defbinds = in_sdl_defbinds,
@ -81,54 +82,77 @@ void bgr_to_uyvy_init(void)
  }
 }

-void rgb565_to_uyvy(void *d, const void *s, int pixels, int x2)
+void rgb565_to_uyvy(void *d, const void *s, int w, int h, int pitch, int x2)
 {
  uint32_t *dst = d;
  const uint16_t *src = s;
+  int i;

-  if (x2)
-  for (; pixels > 0; src += 4, dst += 4, pixels -= 4)
-  {
-    struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1];
-    struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3];
+  if (x2) while (h--) {
+    for (i = w; i > 0; src += 4, dst += 4, i -= 4)
+    {
+      struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1];
+      struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3];
 #if CPU_IS_LE
-    dst[0] = (uyvy0->y << 24) | uyvy0->vyu;
-    dst[1] = (uyvy1->y << 24) | uyvy1->vyu;
-    dst[2] = (uyvy2->y << 24) | uyvy2->vyu;
-    dst[3] = (uyvy3->y << 24) | uyvy3->vyu;
+      dst[0] = (uyvy0->y << 24) | uyvy0->vyu;
+      dst[1] = (uyvy1->y << 24) | uyvy1->vyu;
+      dst[2] = (uyvy2->y << 24) | uyvy2->vyu;
+      dst[3] = (uyvy3->y << 24) | uyvy3->vyu;
 #else
-    dst[0] = uyvy0->y | (uyvy0->vyu << 8);
-    dst[1] = uyvy1->y | (uyvy1->vyu << 8);
-    dst[2] = uyvy2->y | (uyvy2->vyu << 8);
-    dst[3] = uyvy3->y | (uyvy3->vyu << 8);
+      dst[0] = uyvy0->y | (uyvy0->vyu << 8);
+      dst[1] = uyvy1->y | (uyvy1->vyu << 8);
+      dst[2] = uyvy2->y | (uyvy2->vyu << 8);
+      dst[3] = uyvy3->y | (uyvy3->vyu << 8);
 #endif
-  } else 
-  for (; pixels > 0; src += 4, dst += 2, pixels -= 4)
-  {
-    struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1];
-    struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3];
+    }
+    src += pitch - w;
+  } else while (h--) {
+    for (i = w; i > 0; src += 4, dst += 2, i -= 4)
+    {
+      struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1];
+      struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3];
 #if CPU_IS_LE
-    dst[0] = (uyvy1->y << 24) | uyvy0->vyu;
-    dst[1] = (uyvy3->y << 24) | uyvy2->vyu;
+      dst[0] = (uyvy1->y << 24) | uyvy0->vyu;
+      dst[1] = (uyvy3->y << 24) | uyvy2->vyu;
 #else
-    dst[0] = uyvy1->y | (uyvy0->vyu << 8);
-    dst[1] = uyvy3->y | (uyvy2->vyu << 8);
+      dst[0] = uyvy1->y | (uyvy0->vyu << 8);
+      dst[1] = uyvy3->y | (uyvy2->vyu << 8);
 #endif
+    }
+    src += pitch - w;
  }
 }

 static int clear_buf_cnt, clear_stat_cnt;

+/* set output size to w x h, reverting to g_screen_* size if that fails */
+void plat_video_set_size(int w, int h)
+{
+	if (area.w == w && area.h == h)
+		return;
+	if (plat_sdl_change_video_mode(w, h, 0) < 0) {
+		// failed, revert to original resolution
+		w = g_screen_width, h = g_screen_height;
+		plat_sdl_change_video_mode(w, h, 0);
+	}
+	area = (struct area) { w, h }; // track the size actually in effect
+	if (!plat_sdl_overlay && !plat_sdl_gl_active) {
+		g_screen_width = w;
+		g_screen_height = h;
+		g_screen_ppitch = w;
+		g_screen_ptr = plat_sdl_screen->pixels;
+	}
+}
+
 void plat_video_flip(void)
 {
 	if (plat_sdl_overlay != NULL) {
 		SDL_Rect dstrect =
 			{ 0, 0, plat_sdl_screen->w, plat_sdl_screen->h };
-
 		SDL_LockYUVOverlay(plat_sdl_overlay);
 		rgb565_to_uyvy(plat_sdl_overlay->pixels[0], shadow_fb,
-				g_screen_ppitch * g_screen_height,
-				plat_sdl_overlay->w > 2*plat_sdl_overlay->h);
+				area.w, area.h, g_screen_ppitch,
+				plat_sdl_overlay->w >= 2*area.w);
 		SDL_UnlockYUVOverlay(plat_sdl_overlay);
 		SDL_DisplayYUVOverlay(plat_sdl_overlay, &dstrect);
 	}
@ -205,7 +229,7 @@ void plat_video_menu_end(void)

 		SDL_LockYUVOverlay(plat_sdl_overlay);
 		rgb565_to_uyvy(plat_sdl_overlay->pixels[0], shadow_fb,
-				g_menuscreen_pp * g_menuscreen_h, 0);
+			g_menuscreen_w, g_menuscreen_h, g_menuscreen_pp, 0);
 		SDL_UnlockYUVOverlay(plat_sdl_overlay);

 		SDL_DisplayYUVOverlay(plat_sdl_overlay, &dstrect);
@ -227,10 +251,10 @@ void plat_video_menu_leave(void)

 void plat_video_loop_prepare(void)
 {
-	// take over any new vout settings XXX ask plat_sdl for scaling instead!
+	// take over any new vout settings
 	plat_sdl_change_video_mode(g_menuscreen_w, g_menuscreen_h, 0);
 	// switch over to scaled output if available
-	if (plat_sdl_overlay != NULL || plat_sdl_gl_active || currentConfig.scaling != EOPT_SCALE_NONE) {
+	if (plat_sdl_overlay != NULL || plat_sdl_gl_active) {
 		g_screen_width = 320;
 		g_screen_height = 240;
 		g_screen_ppitch = g_screen_width;
@ -246,6 +270,7 @@ void plat_video_loop_prepare(void)
 		g_screen_ptr = plat_sdl_screen->pixels;
 	}
 	plat_video_set_buffer(g_screen_ptr);
+	plat_video_set_size(g_screen_width, g_screen_height);
 }

 void plat_early_init(void)
--- a/platform/common/upscale.c
+++ b/platform/common/upscale.c
@ -7,7 +7,7 @@
 * nn:	nearest neighbour
 * snn:	"smoothed" nearest neighbour (see below)
 * bln:	n-level-bilinear with n quantized weights
- *	quantization: 0: a<1/2*n, 1/n: 1/2*n<=a<3/2*n, etc
+ *	quantization: 0: a<1/(2*n), 1/n: 1/(2*n)<=a<3/(2*n), etc
 *	currently n=2, n=4 are implemented (there's n=8 mixing, but no filters)
 *	[NB this has been brought to my attn, which is probably the same as bl2:
 *	https://www.drdobbs.com/image-scaling-with-bresenham/184405045?pgno=1]
@ -18,490 +18,586 @@
 *	a sharper look than a bilinear filter, at the price of some visible jags
 *	on diagonal edges.
 * 
- * scaling modes:
- * 256x___ -> 320x___	only horizontal scaling. Produces an aspect error of
- *			~7% for NTSC 224 line modes, but is correct for PAL
- * 256/320x224/240
- *	-> 320x240	always produces 320x240 at DAR 4:3
- * 160x144 -> 320x240	game gear (currently unused)
+ * example scaling modes:
+ * 256x_Y_ -> 320x_Y_, H32/mode 4, PAR 5:4, for PAL DAR 4:3 (NTSC 7% aspect err)
+ * 256x224 -> 320x240, H32/mode 4, PAR 5:4, for NTSC DAR 4:3 (PAL 7% aspect err)
+ * 320x224 -> 320x240, PAR 1:1, for NTSC, DAR 4:3 (PAL 7% etc etc...)
+ * 160x144 -> 320x240: GG, PAR 6:5, scaling to 320x240 for DAR 4:3
 * 
 * (C) 2021 kub <derkub@gmail.com>
 */

 #include "upscale.h"

-/* 256x___ -> 320x___, H32/mode 4, PAR 5:4, for PAL DAR 4:3 (wrong for NTSC) */
-void upscale_clut_nn_256_320x___(u8 *__restrict di, int ds, u8 *__restrict si, int ss, int height)
+/* X x Y -> X*5/4 x Y */
+void upscale_clut_nn_x_4_5(u8 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height)
 {
 	int y;

 	for (y = 0; y < height; y++) {
-		h_upscale_nn_4_5(di, ds, si, ss, 256, f_nop);
+		h_upscale_nn_4_5(di, ds, si, ss, width, f_nop);
 	}
 }

-void upscale_rgb_nn_256_320x___(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int height, u16 *pal)
+void upscale_rgb_nn_x_4_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
 	int y;

 	for (y = 0; y < height; y++) {
-		h_upscale_nn_4_5(di, ds, si, ss, 256, f_pal);
+		h_upscale_nn_4_5(di, ds, si, ss, width, f_pal);
 	}
 }

-void upscale_rgb_snn_256_320x___(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int height, u16 *pal)
+void upscale_rgb_snn_x_4_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
 	int y;

 	for (y = 0; y < height; y++) {
-		h_upscale_snn_4_5(di, ds, si, ss, 256, f_pal);
+		h_upscale_snn_4_5(di, ds, si, ss, width, f_pal);
 	}
 }

-void upscale_rgb_bl2_256_320x___(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int height, u16 *pal)
+void upscale_rgb_bl2_x_4_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
 	int y;

 	for (y = 0; y < height; y++) {
-		h_upscale_bl2_4_5(di, ds, si, ss, 256, f_pal);
+		h_upscale_bl2_4_5(di, ds, si, ss, width, f_pal);
 	}
 }

-void upscale_rgb_bl4_256_320x___(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int height, u16 *pal)
+void upscale_rgb_bl4_x_4_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
 	int y;

 	for (y = 0; y < height; y++) {
-		h_upscale_bl4_4_5(di, ds, si, ss, 256, f_pal);
+		h_upscale_bl4_4_5(di, ds, si, ss, width, f_pal);
 	}
 }

-/* 256x224 -> 320x240, H32/mode 4, PAR 5:4, for NTSC DAR 4:3 (wrong for PAL) */
-void upscale_clut_nn_256_320x224_240(u8 *__restrict di, int ds, u8 *__restrict si, int ss)
+/* X x Y -> X*5/4 x Y*17/16 */
+void upscale_clut_nn_x_4_5_y_16_17(u8 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height)
 {
+	int swidth = width * 5/4;
 	int y, j;

-	/* 14:15, 0 1 2 3 4 5 6 6 7 8 9 10 11 12 13 */
-	for (y = 0; y < 224; y += 14) {
-		/* lines 0-6 */
-		for (j = 0; j < 7; j++) {
-			h_upscale_nn_4_5(di, ds, si, ss, 256, f_nop);
-		}
-		/* lines 8-14 */
-		di += ds;
-		for (j = 0; j < 7; j++) {
-			h_upscale_nn_4_5(di, ds, si, ss, 256, f_nop);
-		}
-		/* line 7 */
-		di -= 8*ds;
-		v_copy(&di[0], &di[-ds], 320, f_nop);
-		di += 8*ds;
-	}
-}
-
-void upscale_rgb_nn_256_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
-{
-	int y, j;
-
-	for (y = 0; y < 224; y += 14) {
-		for (j = 0; j < 7; j++) {
-			h_upscale_nn_4_5(di, ds, si, ss, 256, f_pal);
+	for (y = 0; y < height; y += 16) {
+		for (j = 0; j < 8; j++) {
+			h_upscale_nn_4_5(di, ds, si, ss, width, f_nop);
 		}
 		di +=  ds;
-		for (j = 0; j < 7; j++) {
-			h_upscale_nn_4_5(di, ds, si, ss, 256, f_pal);
+		for (j = 0; j < 8; j++) {
+			h_upscale_nn_4_5(di, ds, si, ss, width, f_nop);
 		}

-		di -= 8*ds;
-		v_copy(&di[0], &di[-ds], 320, f_nop);
-		di += 8*ds;
+		di -= 9*ds;
+		v_copy(&di[0], &di[-ds], swidth, f_nop);
+		di += 9*ds;
 	}
 }

-void upscale_rgb_snn_256_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
+void upscale_rgb_nn_x_4_5_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
+	int swidth = width * 5/4;
 	int y, j;

-	/* 14:15, 0 1 2 3 4 5 5+6 6+7 7+8 8 9 10 11 12 13 */
-	for (y = 0; y < 224; y += 14) {
-		for (j = 0; j < 7; j++) {
-			h_upscale_snn_4_5(di, ds, si, ss, 256, f_pal);
+	for (y = 0; y < height; y += 16) {
+		for (j = 0; j < 8; j++) {
+			h_upscale_nn_4_5(di, ds, si, ss, width, f_pal);
 		}
 		di +=  ds;
-		for (j = 0; j < 7; j++) {
-			h_upscale_snn_4_5(di, ds, si, ss, 256, f_pal);
+		for (j = 0; j < 8; j++) {
+			h_upscale_nn_4_5(di, ds, si, ss, width, f_pal);
+		}
+
+		di -= 9*ds;
+		v_copy(&di[0], &di[-ds], swidth, f_nop);
+		di += 9*ds;
+	}
+}
+
+void upscale_rgb_snn_x_4_5_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
+{
+	int swidth = width * 5/4;
+	int y, j;
+
+	for (y = 0; y < height; y += 16) {
+		for (j = 0; j < 8; j++) {
+			h_upscale_snn_4_5(di, ds, si, ss, width, f_pal);
+		}
+		di +=  ds;
+		for (j = 0; j < 8; j++) {
+			h_upscale_snn_4_5(di, ds, si, ss, width, f_pal);
 		}

-		/* mix lines 6-8 */
+		/* mix lines 7-9 */
-		di -= 8*ds;
-		v_mix(&di[0], &di[-ds], &di[ds], 320, p_05, f_nop);
-		v_mix(&di[-ds], &di[-2*ds], &di[-ds], 320, p_05, f_nop);
-		v_mix(&di[ ds], &di[ ds], &di[ 2*ds], 320, p_05, f_nop);
-		di += 8*ds;
+		di -= 9*ds;
+		v_mix(&di[0], &di[-ds], &di[ds], swidth, p_05, f_nop);
+		v_mix(&di[-ds], &di[-2*ds], &di[-ds], swidth, p_05, f_nop);
+		v_mix(&di[ ds], &di[ ds], &di[ 2*ds], swidth, p_05, f_nop);
+		di += 9*ds;
 	}
 }

-void upscale_rgb_bln_256_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
+void upscale_rgb_bl2_x_4_5_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
+	int swidth = width * 5/4;
 	int y, j;

-	/* 14:15, 0 1 2 2+3 3+4 4+5 5+6 6+7 7+8 8+9 9+10 10+11 11 12 13 */
-	for (y = 0; y < 224; y += 14) {
-		/* lines 0-2 */
-		for (j = 0; j < 3; j++) {
-			h_upscale_bln_4_5(di, ds, si, ss, 256, f_pal);
+	for (y = 0; y < height; y += 16) {
+		for (j = 0; j < 4; j++) {
+			h_upscale_bl2_4_5(di, ds, si, ss, width, f_pal);
 		}
-		/* lines 3-11 mixing prep */
-		di += ds;
-		for (j = 0; j < 11; j++) {
-			h_upscale_bln_4_5(di, ds, si, ss, 256, f_pal);
+		di +=  ds;
+		for (j = 0; j < 12; j++) {
+			h_upscale_bl2_4_5(di, ds, si, ss, width, f_pal);
 		}
-		di -= 12*ds;
-		/* mixing line 3: line 2 = -ds, line 3 = +ds */
-			v_mix(&di[0], &di[-ds], &di[ds], 320, p_025, f_nop);
+		/* mix lines 4-11 */
+		di -= 13*ds;
+			v_mix(&di[0], &di[-ds], &di[ds], swidth, p_05, f_nop);
+		for (j = 0; j < 7; j++) {
 			di += ds;
-		/* mixing lines 4-5: line n-1 = 0, line n = +ds */
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_05, f_nop);
+		}
+		di += 6*ds;
+	}
+}
+
+void upscale_rgb_bl4_x_4_5_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
+{
+	int swidth = width * 5/4;
+	int y, j;
+
+	for (y = 0; y < height; y += 16) {
 		for (j = 0; j < 2; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_025, f_nop);
+			h_upscale_bl4_4_5(di, ds, si, ss, width, f_pal);
+		}
+		di += ds;
+		for (j = 0; j < 14; j++) {
+			h_upscale_bl4_4_5(di, ds, si, ss, width, f_pal);
+		}
+		di -= 15*ds;
+		/* mixing line 2: line 1 = -ds, line 2 = +ds */
+			v_mix(&di[0], &di[-ds], &di[ds], swidth, p_025, f_nop);
+			di += ds;
+		/* mixing lines 3-5: line n-1 = 0, line n = +ds */
+		for (j = 0; j < 3; j++) {
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_025, f_nop);
 			di += ds;
 			}
-		/* mixing line 6-8 */
-		for (j = 0; j < 3; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_05, f_nop);
+		/* mixing lines 6-9 */
+		for (j = 0; j < 4; j++) {
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_05, f_nop);
 			di += ds;
 		}
-		/* mixing lines 9-11 */
-		for (j = 0; j < 3; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_075, f_nop);
+		/* mixing lines 10-13 */
+		for (j = 0; j < 4; j++) {
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_075, f_nop);
 			di += ds;
 		}
-		/* lines 12-14, already in place */
+		/* lines 14-16, already in place */
 		di += 3*ds;
 	}
 }

-void upscale_rgb_bl2_256_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
+/* "classic" upscaler as found in several emulators. It's really more like a
+ * x*4/3, y*16/15 upscaler, with an additional 5th row/17th line just inserted
+ * from the source image. That gives nice n/4,n/16 alpha values plus better
+ * symmetry in each block and avoids "borrowing" a row/line between blocks.
+ */
+void upscale_rgb_bln_x_4_5_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
+	int swidth = width * 5/4;
 	int y, j;

-	/* 14:15, 0 1 2 2+3 3+4 4+5 5+6 6+7 7+8 8+9 9+10 10 11 12 13 */
-	for (y = 0; y < 224; y += 14) {
-		for (j = 0; j < 3; j++) {
-			h_upscale_bl2_4_5(di, ds, si, ss, 256, f_pal);
+	for (y = 0; y < height; y += 16) {
+		for (j = 0; j < 4; j++) {
+			h_upscale_bln_4_5(di, ds, si, ss, width, f_pal);
 		}
-		di +=  ds;
-		for (j = 0; j < 11; j++) {
-			h_upscale_bl2_4_5(di, ds, si, ss, 256, f_pal);
-		}
-		/* mix lines 3-10 */
-		di -= 12*ds;
-			v_mix(&di[0], &di[-ds], &di[ds], 320, p_05, f_nop);
-		for (j = 0; j < 7; j++) {
-			di += ds;
-			v_mix(&di[0], &di[0], &di[ds], 320, p_05, f_nop);
-		}
-		di += 5*ds;
-	}
-}
-
-void upscale_rgb_bl4_256_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
-{
-	int y, j;
-
-	/* 14:15, 0 0+1 1+2 2+3 3+4 4+5 5+6 6+7 7+8 8+9 9+10 10+11 11+12 12 13 */
-	for (y = 0; y < 224; y += 14) {
-		/* line 0 */
-			h_upscale_bl4_4_5(di, ds, si, ss, 256, f_pal);
-		/* lines 1-14 mixing prep */
 		di += ds;
-		for (j = 0; j < 13; j++) {
-			h_upscale_bl4_4_5(di, ds, si, ss, 256, f_pal);
+		for (j = 0; j < 12; j++) {
+			h_upscale_bln_4_5(di, ds, si, ss, width, f_pal);
 		}
-		di -= 14*ds;
-		/* mixing line 1: line 0 = -ds, line 1 = +ds */
-			v_mix(&di[0], &di[-ds], &di[ds], 320, p_025, f_nop);
+		di -= 13*ds;
+		/* mixing line 4: line 3 = -ds, line 4 = +ds */
+			v_mix(&di[0], &di[-ds], &di[ds], swidth, p_025, f_nop);
 			di += ds;
-		/* mixing lines 2-4: line n-1 = 0, line n = +ds */
-		for (j = 0; j < 3; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_025, f_nop);
+		/* mixing lines 5-6: line n-1 = 0, line n = +ds */
+		for (j = 0; j < 2; j++) {
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_025, f_nop);
 			di += ds;
 			}
-		/* mixing lines 5-8 */
-		for (j = 0; j < 4; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_05, f_nop);
+		/* mixing lines 7-9 */
+		for (j = 0; j < 3; j++) {
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_05, f_nop);
 			di += ds;
 		}
-		/* mixing lines 9-12 */
-		for (j = 0; j < 4; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_075, f_nop);
+		/* mixing lines 10-12 */
+		for (j = 0; j < 3; j++) {
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_075, f_nop);
 			di += ds;
 		}
-		/* lines 13-14, already in place */
-		di += 2*ds;
+		/* lines 13-16, already in place */
+		di += 4*ds;
 	}
 }

-void upscale_rgb_bl8_256_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
+/* experimental 8 level bilinear for quality assessment, X x Y -> X*5/4 x Y*17/16 */
+void upscale_rgb_bl8_x_4_5_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
-	int y, j, d;
+	int swidth = width * 5/4;
+	int y, j;

-	/* 14:15, -1+0 0+1 1+2 2+3 3+4 4+5 5+6 6+7 7+8 8+9 9+10 10+11 11+12 12+13 13 */
-	for (y = 0, d = ds; y < 224; y += 14, d = -ds) {
-		/* lines 0-14 mixing prep */
+	for (y = 0; y < height; y += 16) {
+		for (j = 0; j < 2; j++) {
+			h_upscale_bl8_4_5(di, ds, si, ss, width, f_pal);
+		}
 		di += ds;
 		for (j = 0; j < 14; j++) {
-			h_upscale_bl8_4_5(di, ds, si, ss, 256, f_pal);
+			h_upscale_bl8_4_5(di, ds, si, ss, width, f_pal);
 		}
 		di -= 15*ds;
-		/* mixing line 0: line 0 = -ds, line 1 = +ds */
-			v_mix(&di[0], &di[d], &di[ds], 320, p_0125, f_nop);
+		/* mixing line 2: line 1 = -ds, line 2 = +ds */
+			v_mix(&di[0], &di[-ds], &di[ds], swidth, p_0125, f_nop);
 			di += ds;
-		/* mixing line 1: line 1 = 0, line 2 = +ds */
-			v_mix(&di[0], &di[0], &di[ds], 320, p_0125, f_nop);
+		/* mixing line 3: line 2 = 0, line 3 = +ds */
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_0125, f_nop);
 			di += ds;
-		/* mixing lines 2-3: line n-1 = 0, line n = +ds */
+		/* mixing lines 4-5: line n-1 = 0, line n = +ds */
 		for (j = 0; j < 2; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_025, f_nop);
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_025, f_nop);
 			di += ds;
 			}
-		/* mixing lines 4-5 */
+		/* mixing lines 6-7 */
 		for (j = 0; j < 2; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_0375, f_nop);
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_0375, f_nop);
 			di += ds;
 		}
-		/* mixing lines 6-7 */
+		/* mixing lines 8-9 */
 		for (j = 0; j < 2; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_05, f_nop);
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_05, f_nop);
 			di += ds;
 		}
-		/* mixing lines 8-9 */
+		/* mixing lines 10-11 */
 		for (j = 0; j < 2; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_0625, f_nop);
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_0625, f_nop);
 			di += ds;
 		}
-		/* mixing lines 10-11 */
+		/* mixing lines 12-13 */
 		for (j = 0; j < 2; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_075, f_nop);
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_075, f_nop);
 			di += ds;
 		}
-		/* mixing lines 12-13 */
+		/* mixing lines 14-15 */
 		for (j = 0; j < 2; j++) {
-			v_mix(&di[0], &di[0], &di[ds], 320, p_0875, f_nop);
+			v_mix(&di[0], &di[0], &di[ds], swidth, p_0875, f_nop);
 			di += ds;
 		}
-		/* line 14, already in place */
+		/* line 16, already in place */
 		di += ds;
 	}
 }

-/* 320x224 -> 320x240, PAR 1:1, for NTSC, DAR 4:3 (wrong for PAL) */
-void upscale_clut_nn_320x224_240(u8 *__restrict di, int ds, u8 *__restrict si, int ss)
+/* X x Y -> X x Y*17/16 */
+void upscale_clut_nn_y_16_17(u8 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height)
 {
 	int y, j;

-	for (y = 0; y < 224; y += 14) {
-		for (j = 0; j < 7; j++) {
-			h_copy(di, ds, si, ss, 320, f_nop);
-		}
-		di += ds;
-		for (j = 0; j < 7; j++) {
-			h_copy(di, ds, si, ss, 320, f_nop);
-		}
-
-		di -= 8*ds;
-		v_copy(&di[0], &di[-ds], 320, f_nop);
-		di += 8*ds;
-
-	}
-}
-
-void upscale_rgb_nn_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
-{
-	int y, j;
-
-	for (y = 0; y < 224; y += 14) {
-		for (j = 0; j < 7; j++) {
-			h_copy(di, ds, si, ss, 320, f_pal);
-		}
-		di +=  ds;
-		for (j = 0; j < 7; j++) {
-			h_copy(di, ds, si, ss, 320, f_pal);
-		}
-
-		di -= 8*ds;
-		v_copy(&di[0], &di[-ds], 320, f_nop);
-		di += 8*ds;
-	}
-}
-
-void upscale_rgb_snn_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
-{
-	int y, j;
-
-	for (y = 0; y < 224; y += 14) {
-		for (j = 0; j < 7; j++) {
-			h_copy(di, ds, si, ss, 320, f_pal);
-		}
-		di +=  ds;
-		for (j = 0; j < 7; j++) {
-			h_copy(di, ds, si, ss, 320, f_pal);
-		}
-
-		di -= 8*ds;
-		v_mix(&di[  0], &di[-ds], &di[ds], 320, p_05, f_nop);
-		v_mix(&di[-ds], &di[-2*ds], &di[-ds], 320, p_05, f_nop);
-		v_mix(&di[ ds], &di[ ds], &di[ 2*ds], 320, p_05, f_nop);
-		di += 8*ds;
-	}
-}
-
-void upscale_rgb_bl2_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
-{
-	int y, j;
-
-	for (y = 0; y < 224; y += 14) {
-		for (j = 0; j < 3; j++) {
-			h_copy(di, ds, si, ss, 320, f_pal);
-		}
+	for (y = 0; y < height; y += 16) {
 		for (j = 0; j < 8; j++) {
-			v_mix(&di[0], &si[-ss], &si[0], 320, p_05, f_pal);
-			di += ds;
-			si += ss;
+			h_copy(di, ds, si, ss, width, f_nop);
 		}
-		si -= ss;
-		for (j = 0; j < 4; j++) {
-			h_copy(di, ds, si, ss, 320, f_pal);
+		di +=  ds;
+		for (j = 0; j < 8; j++) {
+			h_copy(di, ds, si, ss, width, f_nop);
 		}
+
+		di -= 9*ds;
+		v_copy(&di[0], &di[-ds], width, f_nop);
+		di += 9*ds;
 	}
 }

-void upscale_rgb_bl4_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
+void upscale_rgb_nn_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
 	int y, j;

-	for (y = 0; y < 224; y += 14) {
-			h_copy(di, ds, si, ss, 320, f_pal);
-		for (j = 0; j < 4; j++) {
-			v_mix(&di[0], &si[-ss], &si[0], 320, p_025, f_pal);
-			di += ds;
-			si += ss;
+	for (y = 0; y < height; y += 16) {
+		for (j = 0; j < 8; j++) {
+			h_copy(di, ds, si, ss, width, f_pal);
 		}
-		for (j = 0; j < 4; j++) {
-			v_mix(&di[0], &si[-ss], &si[0], 320, p_05, f_pal);
-			di += ds;
-			si += ss;
+		di +=  ds;
+		for (j = 0; j < 8; j++) {
+			h_copy(di, ds, si, ss, width, f_pal);
 		}
-		for (j = 0; j < 4; j++) {
-			v_mix(&di[0], &si[-ss], &si[0], 320, p_075, f_pal);
-			di += ds;
-			si += ss;
+
+		di -= 9*ds;
+		v_copy(&di[0], &di[-ds], width, f_nop);
+		di += 9*ds;
+	}
+}
+
+void upscale_rgb_snn_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
+{
+	int y, j;
+
+	for (y = 0; y < height; y += 16) {
+		for (j = 0; j < 8; j++) {
+			h_copy(di, ds, si, ss, width, f_pal);
 		}
-		si -= ss;
+		di +=  ds;
+		for (j = 0; j < 8; j++) {
+			h_copy(di, ds, si, ss, width, f_pal);
+		}
+
+		/* mix lines 7-9 */
+		di -= 9*ds;
+		v_mix(&di[0], &di[-ds], &di[ds], width, p_05, f_nop);
+		v_mix(&di[-ds], &di[-2*ds], &di[-ds], width, p_05, f_nop);
+		v_mix(&di[ ds], &di[ ds], &di[ 2*ds], width, p_05, f_nop);
+		di += 9*ds;
+	}
+}
+
+void upscale_rgb_bl2_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
+{
+	int y, j;
+
+	for (y = 0; y < height; y += 16) {
+		for (j = 0; j < 4; j++) {
+			h_copy(di, ds, si, ss, width, f_pal);
+		}
+		di +=  ds;
+		for (j = 0; j < 12; j++) {
+			h_copy(di, ds, si, ss, width, f_pal);
+		}
+		/* mix lines 4-11 */
+		di -= 13*ds;
+			v_mix(&di[0], &di[-ds], &di[ds], width, p_05, f_nop);
+		for (j = 0; j < 7; j++) {
+			di += ds;
+			v_mix(&di[0], &di[0], &di[ds], width, p_05, f_nop);
+		}
+		di += 6*ds;
+	}
+}
+
+void upscale_rgb_bl4_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
+{
+	int y, j;
+
+	for (y = 0; y < height; y += 16) {
 		for (j = 0; j < 2; j++) {
-			h_copy(di, ds, si, ss, 320, f_pal);
+			h_copy(di, ds, si, ss, width, f_pal);
 		}
+		di += ds;
+		for (j = 0; j < 14; j++) {
+			h_copy(di, ds, si, ss, width, f_pal);
+		}
+		di -= 15*ds;
+		/* mixing line 2: line 1 = -ds, line 2 = +ds */
+			v_mix(&di[0], &di[-ds], &di[ds], width, p_025, f_nop);
+			di += ds;
+		/* mixing lines 3-5: line n-1 = 0, line n = +ds */
+		for (j = 0; j < 3; j++) {
+			v_mix(&di[0], &di[0], &di[ds], width, p_025, f_nop);
+			di += ds;
+			}
+		/* mixing lines 6-9 */
+		for (j = 0; j < 4; j++) {
+			v_mix(&di[0], &di[0], &di[ds], width, p_05, f_nop);
+			di += ds;
+		}
+		/* mixing lines 10-13 */
+		for (j = 0; j < 4; j++) {
+			v_mix(&di[0], &di[0], &di[ds], width, p_075, f_nop);
+			di += ds;
+		}
+		/* lines 14-16, already in place */
+		di += 3*ds;
 	}
 }

-/* 160x144 -> 320x240: GG, PAR 6:5, scaling to 320x240 for DAR 4:3 */
-/* NB for smoother image could scale to 288x216, x*9/5, y*3/2 ?
- *	 h: 11111 11112 22222 22233 33333 33444 44444 45555 55555
- *            1     1     2    2+3    3    3+4    4     5     5
- *       v: 11  12  22
- *          1   1+2 2
- */
-void upscale_clut_nn_160_320x144_240(u8 *__restrict di, int ds, u8 *__restrict si, int ss)
+/* X x Y -> X*2/1 x Y, e.g. for X 160->320 (GG) */
+void upscale_clut_nn_x_1_2(u8 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height)
 {
+	int y;
+
+	for (y = 0; y < height; y++) {
+		h_upscale_nn_1_2(di, ds, si, ss, width, f_nop);
+	}
+}
+
+void upscale_rgb_nn_x_1_2(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
+{
+	int y;
+
+	for (y = 0; y < height; y++) {
+		h_upscale_nn_1_2(di, ds, si, ss, width, f_pal);
+	}
+}
+
+void upscale_rgb_bl2_x_1_2(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
+{
+	int y;
+
+	for (y = 0; y < height; y++) {
+		h_upscale_bl2_1_2(di, ds, si, ss, width, f_pal);
+	}
+}
+
+/* X x Y -> X*2/1 x Y*5/3 (GG) */
+void upscale_clut_nn_x_1_2_y_3_5(u8 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height)
+{
+	int swidth = width * 2;
 	int y, j;

-	/* 3:5, 0 0 1 1 2 */
-	for (y = 0; y < 144; y += 3) {
+	for (y = 0; y < height; y += 3) {
 		/* lines 0,2,4 */
 		for (j = 0; j < 3; j++) {
-			h_upscale_nn_1_2(di, ds, si, ss, 160, f_nop);
+			h_upscale_nn_1_2(di, ds, si, ss, width, f_nop);
 			di += ds;
 		}
 		/* lines 1,3 */
 		di -= 5*ds;
 		for (j = 0; j < 2; j++) {
-			v_copy(&di[0], &di[-ds], 320, f_nop);
+			v_copy(&di[0], &di[-ds], swidth, f_nop);
 			di += 2*ds;
 		}
 	}
 }

-void upscale_rgb_nn_160_320x144_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
+void upscale_rgb_nn_x_1_2_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
+	int swidth = width * 2;
 	int y, j;

-	for (y = 0; y < 144; y += 3) {
+	for (y = 0; y < height; y += 3) {
 		for (j = 0; j < 3; j++) {
-			h_upscale_nn_1_2(di, ds, si, ss, 160, f_pal);
+			h_upscale_nn_1_2(di, ds, si, ss, width, f_pal);
 			di += ds;
 		}
 		di -= 5*ds;
 		for (j = 0; j < 2; j++) {
-			v_copy(&di[0], &di[-ds], 320, f_nop);
+			v_copy(&di[0], &di[-ds], swidth, f_nop);
 			di += 2*ds;
 		}
 	}
 }

-void upscale_rgb_snn_160_320x144_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
+void upscale_rgb_bl2_x_1_2_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
+	int swidth = width * 2;
 	int y, j;

-	/* 3:5, 0 0+1 1 1+2 2 */
-	for (y = 0; y < 144; y += 3) {
+	for (y = 0; y < height; y += 3) {
 		for (j = 0; j < 3; j++) {
-			h_upscale_nn_1_2(di, ds, si, ss, 160, f_pal);
+			h_upscale_bl2_1_2(di, ds, si, ss, width, f_pal);
 			di += ds;
 		}
 		di -= 5*ds;
 		for (j = 0; j < 2; j++) {
-			v_mix(&di[0], &di[-ds], &di[ds], 320, p_05, f_nop);
+			v_mix(&di[0], &di[-ds], &di[ds], swidth, p_05, f_nop);
 			di += 2*ds;
 		}
 	}
 }

-void upscale_rgb_bl2_160_320x144_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
+void upscale_rgb_bl4_x_1_2_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
+{
+	int swidth = width * 2;
+	int y, j, d;
+
+	/* for 1st block backwards reference virtually duplicate source line 0 */
+	for (y = 0, d = 2*ds; y < height; y += 3, d = -ds) {
+		di += 2*ds;
+		for (j = 0; j < 3; j++) {
+			h_upscale_bl2_1_2(di, ds, si, ss, width, f_pal);
+		}
+		di -= 5*ds;
+		v_mix(&di[0], &di[d ], &di[2*ds], swidth, p_05, f_nop); /*-1+0 */
+		di += ds;
+		v_mix(&di[0], &di[ds], &di[2*ds], swidth, p_075, f_nop);/* 0+1 */
+		di += ds;
+		v_mix(&di[0], &di[ 0], &di[  ds], swidth, p_025, f_nop);/* 0+1 */
+		di += ds;
+		v_mix(&di[0], &di[ 0], &di[  ds], swidth, p_05, f_nop); /* 1+2 */
+		di += 2*ds;
+	}
+}
+
+/* X x Y -> X x Y*5/3, e.g. for Y 144->240 (GG) */
+void upscale_clut_nn_y_3_5(u8 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height)
 {
 	int y, j;

-	/* 3:5, 0 0+1 1 1+2 2 */
-	for (y = 0; y < 144; y += 3) {
+	for (y = 0; y < height; y += 3) {
+		/* lines 0,2,4 */
 		for (j = 0; j < 3; j++) {
-			h_upscale_bl2_1_2(di, ds, si, ss, 160, f_pal);
+			h_copy(di, ds, si, ss, width, f_nop);
+			di += ds;
+		}
+		/* lines 1,3 */
+		di -= 5*ds;
+		for (j = 0; j < 2; j++) {
+			v_copy(&di[0], &di[-ds], width, f_nop);
+			di += 2*ds;
+		}
+	}
+}
+
+void upscale_rgb_nn_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
+{
+	int y, j;
+
+	for (y = 0; y < height; y += 3) {
+		for (j = 0; j < 3; j++) {
+			h_copy(di, ds, si, ss, width, f_pal);
 			di += ds;
 		}
 		di -= 5*ds;
 		for (j = 0; j < 2; j++) {
-			v_mix(&di[0], &di[-ds], &di[ds], 320, p_05, f_nop);
+			v_copy(&di[0], &di[-ds], width, f_nop);
 			di += 2*ds;
 		}
 	}
 }

-void upscale_rgb_bl4_160_320x144_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal)
+void upscale_rgb_bl2_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
+{
+	int y, j;
+
+	for (y = 0; y < height; y += 3) {
+		for (j = 0; j < 3; j++) {
+			h_copy(di, ds, si, ss, width, f_pal);
+			di += ds;
+		}
+		di -= 5*ds;
+		for (j = 0; j < 2; j++) {
+			v_mix(&di[0], &di[-ds], &di[ds], width, p_05, f_nop);
+			di += 2*ds;
+		}
+	}
+}
+
+void upscale_rgb_bl4_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal)
 {
 	int y, j, d;

-	/* 3:5, -1+0, 0+1 0+1 1+2 2
-	 * for 1st block backwards reference virtually duplicate source line 0 */
-	for (y = 0, d = 2*ds; y < 144; y += 3, d = -ds) {
+	/* for 1st block backwards reference virtually duplicate source line 0 */
+	for (y = 0, d = 2*ds; y < height; y += 3, d = -ds) {
 		di += 2*ds;
 		for (j = 0; j < 3; j++) {
-			h_upscale_bl2_1_2(di, ds, si, ss, 160, f_pal);
+			h_copy(di, ds, si, ss, width, f_pal);
 		}
 		di -= 5*ds;
-		v_mix(&di[0], &di[d ], &di[2*ds], 320, p_05, f_nop);	/*-1+0 */
+		v_mix(&di[0], &di[d ], &di[2*ds], width, p_05, f_nop); /*-1+0 */
 		di += ds;
-		v_mix(&di[0], &di[ds], &di[2*ds], 320, p_075, f_nop);	/* 0+1 */
+		v_mix(&di[0], &di[ds], &di[2*ds], width, p_075, f_nop);/* 0+1 */
 		di += ds;
-		v_mix(&di[0], &di[ 0], &di[  ds], 320, p_025, f_nop);	/* 0+1 */
+		v_mix(&di[0], &di[ 0], &di[  ds], width, p_025, f_nop);/* 0+1 */
 		di += ds;
-		v_mix(&di[0], &di[ 0], &di[  ds], 320, p_05, f_nop);	/* 1+2 */
+		v_mix(&di[0], &di[ 0], &di[  ds], width, p_05, f_nop); /* 1+2 */
 		di += 2*ds;
 	}
 }
--- a/platform/common/upscale.h
+++ b/platform/common/upscale.h
@ -7,7 +7,7 @@
 * nn:	nearest neighbour
 * snn:	"smoothed" nearest neighbour (see below)
 * bln:	n-level-bilinear with n quantized weights
- *	quantization: 0: a<1/2*n, 1/n: 1/2*n<=a<3/2*n, etc
+ *	quantization: 0: a<1/(2*n), 1/n: 1/(2*n)<=a<3/(2*n), etc
 *	currently n=2, n=4 are implemented (there's n=8 mixing, but no filters)
 *	[NB this has been brought to my attn, which is probably the same as bl2:
 *	https://www.drdobbs.com/image-scaling-with-bresenham/184405045?pgno=1]
@ -18,34 +18,37 @@
 *	a sharper look than a bilinear filter, at the price of some visible jags
 *	on diagonal edges.
 * 
- * scaling modes:
- * 256x___ -> 320x___	only horizontal scaling. Produces an aspect error of
- *			~7% for NTSC 224 line modes, but is correct for PAL
- * 256/320x224/240
- *	-> 320x240	always produces 320x240 at DAR 4:3
-* 160x144 -> 320x240	game gear (currently unused)
-* 
+ * example scaling modes:
+ * 256x_Y_ -> 320x_Y_, H32/mode 4, PAR 5:4, for PAL DAR 4:3 (NTSC 7% aspect err)
+ * 256x224 -> 320x240, H32/mode 4, PAR 5:4, for NTSC DAR 4:3 (PAL 7% aspect err)
+ * 320x224 -> 320x240, PAR 1:1, for NTSC DAR 4:3 (PAL 7% aspect err)
+ * 160x144 -> 320x240: GG, PAR 6:5, scaling to 320x240 for DAR 4:3
+ *
+ *
 * (C) 2021 kub <derkub@gmail.com>
+ *
+ * This work is licensed under the terms of MAME license.
+ * See COPYING file in the top-level directory.
 */
 #include <pico/pico_types.h>

 /* RGB565 pixel mixing, see https://www.compuphase.com/graphic/scale3.htm and
  			    http://blargg.8bitalley.com/info/rgb_mixing.html */
 /* 2-level mixing */
-//#define p_05(p1,p2)	(((p1)+(p2)  + ( ((p1)^(p2))&0x0821))>>1) // round up
-//#define p_05(p1,p2)	(((p1)+(p2)  - ( ((p1)^(p2))&0x0821))>>1) // round down
-#define p_05(p1,p2)	(((p1)&(p2)) + ((((p1)^(p2))&~0x0821)>>1))
+//#define p_05(d,p1,p2)	d=(((p1)+(p2)  + ( ((p1)^(p2))&0x0821))>>1) // round up
+//#define p_05(d,p1,p2)	d=(((p1)+(p2)  - ( ((p1)^(p2))&0x0821))>>1) // round down
+#define p_05(d,p1,p2)	d=(((p1)&(p2)) + ((((p1)^(p2))&~0x0821)>>1))
 /* 4-level mixing, 2 times slower */
 // 1/4*p1 + 3/4*p2 = 1/2*(1/2*(p1+p2) + p2)
-#define p_025(p1,p2)	(t=p_05(p1, p2),   p_05( t, p2))
-#define p_075(p1,p2)	p_025(p2,p1)
+#define p_025(d,p1,p2)	do { p_05(t, p1, p2); p_05( d, t, p2); } while (0) // needs u16 t in scope
+#define p_075(d,p1,p2)	p_025(d,p2,p1)
 /* 8-level mixing, 3 times slower */
 // 1/8*p1 + 7/8*p2 = 1/2*(1/2*(1/2*(p1+p2) + p2) + p2)
-#define p_0125(p1,p2)	(t=p_05(p1, p2), u=p_05( t, p2), p_05( u, p2))
+#define p_0125(d,p1,p2)	do { p_05(t, p1, p2); p_05( u, t, p2); p_05( d, u, p2); } while (0) // needs u16 t, u
 // 3/8*p1 + 5/8*p2 = 1/2*(1/2*(1/2*(p1+p2) + p2) + 1/2*(p1+p2))
-#define p_0375(p1,p2)	(t=p_05(p1, p2), u=p_05( t, p2), p_05( u,  t))
-#define p_0625(p1,p2)	p_0375(p2,p1)
-#define p_0875(p1,p2)	p_0125(p2,p1)
+#define p_0375(d,p1,p2)	do { p_05(t, p1, p2); p_05( u, t, p2); p_05( d, u,  t); } while (0) // needs u16 t, u
+#define p_0625(d,p1,p2)	p_0375(d,p2,p1)
+#define p_0875(d,p1,p2)	p_0125(d,p2,p1)

 /* pixel transforms */
 #define	f_pal(v)	pal[v]	// convert CLUT index -> RGB565
@ -55,7 +58,7 @@
 /*
 scalers h:
 256->320:       - (4:5)         (256x224/240 -> 320x224/240)
-256->299:	- (6:7)		(256x224 -> 299x224, DAR 4:3, 10.5 px border )
+256->299:	- (6:7)		(256x224 -> 299x224, alt?)
 160->320:       - (1:2) 2x      (160x144 -> 320x240, GG)
 160->288:	- (5:9)		(160x144 -> 288x216, GG alt?)
 */
@ -95,7 +98,7 @@ scalers h:
 	for (i = w/4; i > 0; i--, si += 4, di += 5) {	\
 		di[0] = f(si[0]);			\
 		di[1] = f(si[1]);			\
-		di[2] = p_05(f(si[1]),f(si[2]));	\
+		p_05(di[2], f(si[1]),f(si[2]));		\
 		di[3] = f(si[2]);			\
 		di[4] = f(si[3]);			\
 	}						\
@ -104,12 +107,12 @@ scalers h:
 } while (0)

 #define h_upscale_bln_4_5(di,ds,si,ss,w,f) do {		\
-	int i, t; 					\
+	int i; u16 t; 					\
 	for (i = w/4; i > 0; i--, si += 4, di += 5) {	\
 		di[0] = f(si[0]);			\
-		di[1] = p_025(f(si[0]),f(si[1]));	\
-		di[2] = p_05 (f(si[1]),f(si[2]));	\
-		di[3] = p_075(f(si[2]),f(si[3]));	\
+		p_025(di[1], f(si[0]),f(si[1]));	\
+		p_05 (di[2], f(si[1]),f(si[2]));	\
+		p_075(di[3], f(si[2]),f(si[3]));	\
 		di[4] = f(si[3]);			\
 	}						\
 	di += ds - w/4*5;				\
@ -120,8 +123,8 @@ scalers h:
 	int i;						\
 	for (i = w/4; i > 0; i--, si += 4, di += 5) {	\
 		di[0] = f(si[0]);			\
-		di[1] = p_05(f(si[0]),f(si[1]));	\
-		di[2] = p_05(f(si[1]),f(si[2]));	\
+		p_05(di[1], f(si[0]),f(si[1]));		\
+		p_05(di[2], f(si[1]),f(si[2]));		\
 		di[3] = f(si[2]);			\
 		di[4] = f(si[3]);			\
 	}						\
@ -130,12 +133,12 @@ scalers h:
 } while (0)

 #define h_upscale_bl4_4_5(di,ds,si,ss,w,f) do {		\
-	int i, t; uint p = f(si[0]);			\
+	int i; u16 t, p = f(si[0]);			\
 	for (i = w/4; i > 0; i--, si += 4, di += 5) {	\
-		di[0] = p_025(p, f(si[0]));		\
-		di[1] = p_05 (f(si[0]),f(si[1]));	\
-		di[2] = p_05 (f(si[1]),f(si[2]));	\
-		di[3] = p_075(f(si[2]),f(si[3]));	\
+		p_025(di[0], p,       f(si[0]));	\
+		p_05 (di[1], f(si[0]),f(si[1]));	\
+		p_05 (di[2], f(si[1]),f(si[2]));	\
+		p_075(di[3], f(si[2]),f(si[3]));	\
 		di[4] = p = f(si[3]);			\
 	}						\
 	di += ds - w/4*5;				\
@ -143,12 +146,12 @@ scalers h:
 } while (0)

 #define h_upscale_bl8_4_5(di,ds,si,ss,w,f) do {		\
-	int i, t, u; uint p = f(si[0]);			\
+	int i; u16 t, u, p = f(si[0]);			\
 	for (i = w/4; i > 0; i--, si += 4, di += 5) {	\
-		di[0] = p_025(p, f(si[0]));		\
-		di[1] = p_0375(f(si[0]),f(si[1]));	\
-		di[2] = p_0625(f(si[1]),f(si[2]));	\
-		di[3] = p_075(f(si[2]),f(si[3]));	\
+		p_025 (di[0], p,       f(si[0]));	\
+		p_0375(di[1], f(si[0]),f(si[1]));	\
+		p_0625(di[2], f(si[1]),f(si[2]));	\
+		p_075 (di[3], f(si[2]),f(si[3]));	\
 		di[4] = p = f(si[3]);			\
 	}						\
 	di += ds - w/4*5;				\
@ -195,7 +198,7 @@ scalers h:
 		di[0] = f(si[0]);			\
 		di[1] = f(si[1]);			\
 		di[2] = f(si[2]);			\
-		di[3] = p_05(f(si[2]),f(si[3]));	\
+		p_05(di[3], f(si[2]),f(si[3]));		\
 		di[4] = f(si[3]);			\
 		di[5] = f(si[4]);			\
 		di[6] = f(si[5]);			\
@ -208,10 +211,10 @@ scalers h:
 	int i;						\
 	for (i = w/6; i > 0; i--, si += 6, di += 7) {	\
 		di[0] = f(si[0]);			\
-		di[1] = p_05(f(si[0]),f(si[1]));	\
-		di[2] = p_05(f(si[1]),f(si[2]));	\
-		di[3] = p_05(f(si[2]),f(si[3]));	\
-		di[4] = p_05(f(si[3]),f(si[4]));	\
+		p_05(di[1], f(si[0]),f(si[1]));		\
+		p_05(di[2], f(si[1]),f(si[2]));		\
+		p_05(di[3], f(si[2]),f(si[3]));		\
+		p_05(di[4], f(si[3]),f(si[4]));		\
 		di[5] = f(si[4]);			\
 		di[6] = f(si[5]);			\
 	}						\
@ -220,14 +223,14 @@ scalers h:
 } while (0)

 #define h_upscale_bl4_6_7(di,ds,si,ss,w,f) do {		\
-	int i, t; uint p = f(si[0]);			\
+	int i; u16 t, p = f(si[0]);			\
 	for (i = w/6; i > 0; i--, si += 6, di += 7) {	\
-		di[0] = p_025(p,f(si[0]));		\
-		di[1] = p_025(f(si[0]),f(si[1]));	\
-		di[2] = p_05 (f(si[1]),f(si[2]));	\
-		di[3] = p_05 (f(si[2]),f(si[3]));	\
-		di[4] = p_075(f(si[3]),f(si[4]));	\
-		di[5] = p_075(f(si[4]),f(si[5]));	\
+		p_025(di[0], p,       f(si[0]));	\
+		p_025(di[1], f(si[0]),f(si[1]));	\
+		p_05 (di[2], f(si[1]),f(si[2]));	\
+		p_05 (di[3], f(si[2]),f(si[3]));	\
+		p_075(di[4], f(si[3]),f(si[4]));	\
+		p_075(di[5], f(si[4]),f(si[5]));	\
 		di[6] = p = f(si[5]);			\
 	}						\
 	di += ds - w/6*7;				\
@ -258,9 +261,9 @@ scalers h:
 		di[0] = f(si[0]);			\
 		di[1] = f(si[0]);			\
 		di[2] = f(si[1]);			\
-		di[3] = p_05(f(si[1]),f(si[2]));	\
+		p_05(di[3], f(si[1]),f(si[2]));		\
 		di[4] = f(si[2]);			\
-		di[5] = p_05(f(si[2]),f(si[3]));	\
+		p_05(di[5], f(si[2]),f(si[3]));		\
 		di[6] = f(si[3]);			\
 		di[7] = f(si[4]);			\
 		di[8] = f(si[4]);			\
@ -273,13 +276,13 @@ scalers h:
 	int i;						\
 	for (i = w/5; i > 0; i--, si += 5, di += 9) {	\
 		di[0] = f(si[0]);			\
-		di[1] = p_05(f(si[0]),f(si[1]));	\
+		p_05(di[1], f(si[0]),f(si[1]));		\
 		di[2] = f(si[1]);			\
-		di[3] = p_05(f(si[1]),f(si[2]));	\
+		p_05(di[3], f(si[1]),f(si[2]));		\
 		di[4] = f(si[2]);			\
-		di[5] = p_05(f(si[2]),f(si[3]));	\
+		p_05(di[5], f(si[2]),f(si[3]));		\
 		di[6] = f(si[3]);			\
-		di[7] = p_05(f(si[3]),f(si[4]));	\
+		p_05(di[7], f(si[3]),f(si[4]));		\
 		di[8] = f(si[4]);			\
 	}						\
 	di += ds - w/5*9;				\
@ -287,16 +290,16 @@ scalers h:
 } while (0)

 #define h_upscale_bl4_5_9(di,ds,si,ss,w,f) do {		\
-	int i, t; uint p = f(si[0]);			\
+	int i; u16 t, p = f(si[0]);			\
 	for (i = w/5; i > 0; i--, si += 5, di += 9) {	\
-		di[0] = p_05 (p,f(si[0]));		\
+		p_05 (di[0], p,       f(si[0]));	\
 		di[1] = f(si[0]);			\
-		di[2] = p_025(f(si[0]),f(si[1]));	\
-		di[3] = p_075(f(si[1]),f(si[2]));	\
-		di[4] = p_025(f(si[1]),f(si[2]));	\
-		di[5] = p_075(f(si[2]),f(si[3]));	\
+		p_025(di[2], f(si[0]),f(si[1]));	\
+		p_075(di[3], f(si[1]),f(si[2]));	\
+		p_025(di[4], f(si[1]),f(si[2]));	\
+		p_075(di[5], f(si[2]),f(si[3]));	\
 		di[6] = f(si[3]);			\
-		di[7] = p_05 (f(si[3]),f(si[4]));	\
+		p_05 (di[7], f(si[3]),f(si[4]));	\
 		di[8] = p = f(si[4]);			\
 	}						\
 	di += ds - w/5*9;				\
@ -319,9 +322,9 @@ scalers h:
 #define h_upscale_bl2_1_2(di,ds,si,ss,w,f) do {		\
 	int i; uint p = f(si[0]);			\
 	for (i = w/2; i > 0; i--, si += 2, di += 4) {	\
-		di[0] = p_05 (p, f(si[0]));		\
+		p_05 (di[0], p,       f(si[0]));	\
 		di[1] = f(si[0]);			\
-		di[2] = p_05 (f(si[0]), f(si[1]));	\
+		p_05 (di[2], f(si[0]),f(si[1]));	\
 		di[3] = p = f(si[1]);			\
 	}						\
 	di += ds - w*2;					\
@ -350,12 +353,12 @@ scalers v:
 */

 #define v_mix(di,li,ri,w,p_mix,f) do {			\
-	int i, t, u; (void)t, (void)u;			\
+	int i; u16 t, u; (void)t, (void)u;		\
 	for (i = 0; i < w; i += 4) {			\
-		(di)[i  ] = p_mix(f((li)[i  ]), f((ri)[i  ])); \
-		(di)[i+1] = p_mix(f((li)[i+1]), f((ri)[i+1])); \
-		(di)[i+2] = p_mix(f((li)[i+2]), f((ri)[i+2])); \
-		(di)[i+3] = p_mix(f((li)[i+3]), f((ri)[i+3])); \
+		p_mix((di)[i  ], f((li)[i  ]),f((ri)[i  ])); \
+		p_mix((di)[i+1], f((li)[i+1]),f((ri)[i+1])); \
+		p_mix((di)[i+2], f((li)[i+2]),f((ri)[i+2])); \
+		p_mix((di)[i+3], f((li)[i+3]),f((ri)[i+3])); \
 	}						\
 } while (0)

@ -369,32 +372,222 @@ scalers v:
 	}						\
 } while (0)

+/* scale 14:15 */
+#define v_upscale_nn_14_15(di,ds,w,l) do {		\
+	if (++l == 7) {					\
+		di += ds; 				\
+	} else if (l >= 14) {				\
+		l = 0;					\
+		di -= 7*ds;				\
+		v_copy(&di[0], &di[-ds], w, f_nop);	\
+		di += 7*ds;				\
+	}						\
+} while (0)
+
+#define v_upscale_snn_14_15(di,ds,w,l) do {		\
+	if (++l == 7) {					\
+		di += ds; 				\
+	} else if (l >= 14) {				\
+		l = 0;					\
+		di -= 7*ds;				\
+		v_mix(&di[0], &di[-ds], &di[ds], w, p_05, f_nop); \
+		v_mix(&di[-ds], &di[-2*ds], &di[-ds], w, p_05, f_nop); \
+		v_mix(&di[ ds], &di[ ds], &di[ 2*ds], w, p_05, f_nop); \
+		di += 7*ds;				\
+	}						\
+} while (0)
+
+#define v_upscale_bl2_14_15(di,ds,w,l) do {		\
+	if (++l == 3) {					\
+		di += ds; 				\
+	} else if (l >= 14) {				\
+		int j;					\
+		l = 0;					\
+		di -= 11*ds;				\
+			v_mix(&di[0], &di[-ds], &di[ds], w, p_05, f_nop); \
+		for (j = 0; j < 7; j++)	{		\
+			di += ds;			\
+			v_mix(&di[0], &di[0], &di[ds], w, p_05, f_nop); \
+		}					\
+		di += 4*ds;				\
+	}						\
+} while (0)
+
+#define v_upscale_bl4_14_15(di,ds,w,l) do {		\
+	if (++l == 1) {					\
+		di += ds; 				\
+	} else if (l >= 14) {				\
+		int j;					\
+		l = 0;					\
+		di -= 13*ds;				\
+			v_mix(&di[0], &di[-ds], &di[ds], w, p_025, f_nop); \
+			di += ds;			\
+		for (j = 0; j < 3; j++) {		\
+			v_mix(&di[0], &di[0], &di[ds], w, p_025, f_nop); \
+			di += ds;			\
+			}				\
+		for (j = 0; j < 4; j++) {		\
+			v_mix(&di[0], &di[0], &di[ds], w, p_05, f_nop); \
+			di += ds;			\
+		}					\
+		for (j = 0; j < 4; j++) {		\
+			v_mix(&di[0], &di[0], &di[ds], w, p_075, f_nop); \
+			di += ds;			\
+		}					\
+		di += 1*ds;				\
+	}						\
+} while (0)
+
+/* scale 16:17 */
+#define v_upscale_nn_16_17(di,ds,w,l) do {		\
+	if (++l == 8) {					\
+		di += ds; 				\
+	} else if (l >= 16) {				\
+		l = 0;					\
+		di -= 8*ds;				\
+		v_copy(&di[0], &di[-ds], w, f_nop);	\
+		di += 8*ds;				\
+	}						\
+} while (0)
+
+#define v_upscale_snn_16_17(di,ds,w,l) do {		\
+	if (++l == 8) {					\
+		di += ds; 				\
+	} else if (l >= 16) {				\
+		l = 0;					\
+		di -= 8*ds;				\
+		v_mix(&di[0], &di[-ds], &di[ds], w, p_05, f_nop); \
+		v_mix(&di[-ds], &di[-2*ds], &di[-ds], w, p_05, f_nop); \
+		v_mix(&di[ ds], &di[ ds], &di[ 2*ds], w, p_05, f_nop); \
+		di += 8*ds;				\
+	}						\
+} while (0)
+
+#define v_upscale_bl2_16_17(di,ds,w,l) do {		\
+	if (++l == 4) {					\
+		di += ds; 				\
+	} else if (l >= 16) {				\
+		int j;					\
+		l = 0;					\
+		di -= 12*ds;				\
+			v_mix(&di[0], &di[-ds], &di[ds], w, p_05, f_nop); \
+		for (j = 0; j < 7; j++)	{		\
+			di += ds;			\
+			v_mix(&di[0], &di[0], &di[ds], w, p_05, f_nop); \
+		}					\
+		di += 5*ds;				\
+	}						\
+} while (0)
+
+#define v_upscale_bl4_16_17(di,ds,w,l) do {		\
+	if (++l == 2) {					\
+		di += ds; 				\
+	} else if (l >= 16) {				\
+		int j;					\
+		l = 0;					\
+		di -= 14*ds;				\
+			v_mix(&di[0], &di[-ds], &di[ds], w, p_025, f_nop); \
+			di += ds;			\
+		for (j = 0; j < 3; j++) {		\
+			v_mix(&di[0], &di[0], &di[ds], w, p_025, f_nop); \
+			di += ds;			\
+			}				\
+		for (j = 0; j < 4; j++) {		\
+			v_mix(&di[0], &di[0], &di[ds], w, p_05, f_nop); \
+			di += ds;			\
+		}					\
+		for (j = 0; j < 4; j++) {		\
+			v_mix(&di[0], &di[0], &di[ds], w, p_075, f_nop); \
+			di += ds;			\
+		}					\
+		di += 2*ds;				\
+	}						\
+} while (0)
+
+/* scale 3:5 */
+#define v_upscale_nn_3_5(di,ds,w,l) do {		\
+	if (++l < 3) {					\
+		di += ds; 				\
+	} else  {					\
+		int j;					\
+		l = 0;					\
+		di -= 4*ds;				\
+		for (j = 0; j < 2; j++) {		\
+			v_copy(&di[0], &di[-ds], w, f_nop); \
+			di += 2*ds;			\
+		}					\
+	}						\
+} while (0)
+
+#define v_upscale_snn_3_5(di,ds,w,l) do {		\
+	if (++l < 3) {					\
+		di += ds; 				\
+	} else  {					\
+		int j;					\
+		l = 0;					\
+		di -= 4*ds;				\
+		for (j = 0; j < 2; j++) {		\
+			v_mix(&di[0], &di[-ds], &di[ds], w, p_05, f_nop); \
+			di += 2*ds;			\
+		}					\
+	}						\
+} while (0)
+
+/* scale 2:3 */
+#define v_upscale_nn_2_3(di,ds,w,l) do {		\
+	if (++l < 2) {					\
+		di += ds; 				\
+	} else  {					\
+		/* no locals here: one named like the l argument would shadow it */ \
+		l = 0;					\
+		di -= 2*ds;				\
+		v_copy(&di[0], &di[-ds], w, f_nop);	\
+		di += 2*ds;				\
+	}						\
+} while (0)
+
+#define v_upscale_snn_2_3(di,ds,w,l) do {		\
+	if (++l < 2) {					\
+		di += ds; 				\
+	} else  {					\
+		/* no locals here: one named like the l argument would shadow it */ \
+		l = 0;					\
+		di -= 2*ds;				\
+		v_mix(&di[0], &di[-ds], &di[ds], w, p_05, f_nop); \
+		di += 2*ds;				\
+	}						\
+} while (0)


-/* 256x___ -> 320x___, H32/mode 4, PAR 5:4, for PAL DAR 4:3 (wrong for NTSC) */
-void upscale_clut_nn_256_320x___(u8 *__restrict di, int ds, u8 *__restrict si, int ss, int height);
-void upscale_rgb_nn_256_320x___(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int height, u16 *pal);
-void upscale_rgb_snn_256_320x___(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int height, u16 *pal);
-void upscale_rgb_bl2_256_320x___(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int height, u16 *pal);
-void upscale_rgb_bl4_256_320x___(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int height, u16 *pal);
+/* X x Y -> X*5/4 x Y, for X 256->320 */
+void upscale_rgb_nn_x_4_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_snn_x_4_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_bl2_x_4_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_bl4_x_4_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);

-/* 256x224 -> 320x240, H32/mode 4, PAR 5:4, for NTSC DAR 4:3 (wrong for PAL) */
-void upscale_clut_nn_256_320x224_240(u8 *__restrict di, int ds, u8 *__restrict si, int ss);
-void upscale_rgb_nn_256_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);
-void upscale_rgb_snn_256_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);
-void upscale_rgb_bl2_256_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);
-void upscale_rgb_bl4_256_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);
+/* X x Y -> X x Y*17/16, for Y 224->238 or 192->204 (SMS) */
+void upscale_rgb_nn_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_snn_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_bl2_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_bl4_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);

-/* 320x224 -> 320x240, PAR 1:1, for NTSC, DAR 4:3 (wrong for PAL) */
-void upscale_clut_nn_320x224_240(u8 *__restrict di, int ds, u8 *__restrict si, int ss);
-void upscale_rgb_nn_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);
-void upscale_rgb_snn_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);
-void upscale_rgb_bl2_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);
-void upscale_rgb_bl4_320x224_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);
+/* X x Y -> X*5/4 x Y*17/16 */
+void upscale_rgb_nn_x_4_5_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_snn_x_4_5_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_bl2_x_4_5_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_bl4_x_4_5_y_16_17(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+
+/* X x Y -> X*2/1 x Y, e.g. for X 160->320 (GG) */
+void upscale_rgb_nn_x_1_2(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_bl2_x_1_2(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+
+/* X x Y -> X x Y*5/3, e.g. for Y 144->240 (GG) */
+void upscale_rgb_nn_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_bl2_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_bl4_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+
+/* X x Y -> X*2/1 x Y*5/3 (GG) */
+void upscale_rgb_nn_x_1_2_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_bl2_x_1_2_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);
+void upscale_rgb_bl4_x_1_2_y_3_5(u16 *__restrict di, int ds, u8 *__restrict si, int ss, int width, int height, u16 *pal);

-/* 160x144 -> 320x240: GG, PAR 6:5, scaling to 320x240 for DAR 4:3 */
-void upscale_clut_nn_160_320x144_240(u8 *__restrict di, int ds, u8 *__restrict si, int ss);
-void upscale_rgb_nn_160_320x144_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);
-void upscale_rgb_snn_160_320x144_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);
-void upscale_rgb_bl2_160_320x144_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);
-void upscale_rgb_bl4_160_320x144_240(u16 *__restrict di, int ds, u8 *__restrict si, int ss, u16 *pal);