some drawing code C optimisations

This commit is contained in:
kub 2019-08-25 17:33:13 +02:00
parent f6b4a9ca53
commit f740428b81
5 changed files with 54 additions and 40 deletions

View file

@ -5,6 +5,10 @@ CFLAGS += -I.
ifeq "$(DEBUG)" "0"
CFLAGS += -O3 -DNDEBUG
endif
ifeq ("$(PLATFORM)",$(filter "$(PLATFORM)","gp2x" "opendingux" "rpi1"))
# very small caches, avoid optimization options making the binary much bigger
CFLAGS += -finline-limit=42 -fno-unroll-loops -fno-ipa-cp-clone # -fno-ipa-cp
endif
# This is actually needed, believe me.
# If you really have to disable this, set NO_ALIGN_FUNCTIONS elsewhere.

View file

@ -29,8 +29,8 @@ assuming $TC points to the appropriate cross compile toolchain directory:
platform|toolchain|configure command
--------|---------|-----------------
gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -finline-limit=42 -fno-unroll-loops -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
gp2x,wiz,caanoo|open2x|CROSS_COMPILE=arm-open2x-linux- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="--sysroot $TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
gp2x,wiz,caanoo|open2x with ubuntu arm gcc 4.7|CROSS_COMPILE=arm-linux-gnueabi- CFLAGS="-I$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/include -fno-stack-protector -fno-common" LDFLAGS="-B$TC/gcc-4.1.1-glibc-2.3.6/lib/gcc/arm-open2x-linux/4.1.1 -B$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib -L$TC/gcc-4.1.1-glibc-2.3.6/arm-open2x-linux/lib" ./configure --platform=gp2x
opendingux|opendingux|CROSS_COMPILE=mipsel-linux- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="--sysroot $TC -L$TC/lib" ./configure --platform=opendingux
opendingux|opendingux with ubuntu mips gcc 5.4|CROSS_COMPILE=mipsel-linux-gnu- CFLAGS="-I$TC/usr/include -I$TC/usr/include/SDL" LDFLAGS="-B$TC/usr/lib -B$TC/lib -Wl,-rpath-link=$TC/usr/lib -Wl,-rpath-link=$TC/lib" ./configure --platform=opendingux
gcw0|gcw0|CROSS_COMPILE=mipsel-gcw0-linux-uclibc- CFLAGS="-I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include -I$TC/usr/mipsel-gcw0-linux-uclibc/sysroot/usr/include/SDL" LDFLAGS="--sysroot $TC/usr/mipsel-gcw0-linux-uclibc/sysroot" ./configure --platform=gcw0

View file

@ -42,16 +42,21 @@ static void convert_pal555(int invert_prio)
const unsigned int m1 = 0x001f; \
const unsigned int m2 = 0x03e0; \
const unsigned int m3 = 0x7c00; \
int i; \
unsigned short t; \
int i = 320; \
\
for (i = 320; i > 0; i--, pd++, p32x++, pmd++) { \
unsigned short t = *p32x; \
if ((*pmd & 0x3f) != mdbg && !((t ^ inv) & 0x8000)) { \
pmd_draw_code; \
continue; \
while (i > 0) { \
for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) { \
t = *p32x++; \
*pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10); \
} \
for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) { \
t = *p32x++; \
if ((t ^ inv) & 0x8000) \
*pd = ((t&m1) << 11) | ((t&m2) << 1) | ((t&m3) >> 10); \
else \
pmd_draw_code; \
} \
\
*pd = ((t & m1) << 11) | ((t & m2) << 1) | ((t & m3) >> 10); \
} \
}
@ -59,14 +64,20 @@ static void convert_pal555(int invert_prio)
// Render one 320 pixel line of palette-indexed source data into pd.
//
//   pd            - output pixel pointer (16 bit), advanced by 320
//   p32x          - source byte pointer, advanced by 320; the byte actually
//                   read is the one at address (p32x ^ 1), i.e. the two bytes
//                   of every aligned pair are swapped
//   pmd           - per-pixel index of the other layer, advanced by 320
//   pmd_draw_code - statement run instead of the palette write when the other
//                   layer is to be shown
//
// `pal` and `mdbg` are taken from the scope where the macro is expanded.
// The line is walked as alternating runs, split on whether (*pmd & 0x3f)
// equals mdbg, so the run test is kept out of the per-pixel work.
// NOTE(review): bit 0x20 of a palette entry appears to act as a priority
// flag and mdbg as the background index of the pmd layer - confirm.
#define do_line_pp(pd, p32x, pmd, pmd_draw_code)                    \
{                                                                   \
  unsigned short t;                                                 \
  int i = 320;                                                      \
  while (i > 0) {                                                   \
    /* run where pmd holds mdbg: always write the palette colour */ \
    for (; i > 0 && (*pmd & 0x3f) == mdbg; pd++, pmd++, i--) {      \
      t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 1)];         \
      *pd = t;                                                      \
    }                                                               \
    /* run where pmd differs from mdbg: bit 0x20 decides */         \
    for (; i > 0 && (*pmd & 0x3f) != mdbg; pd++, pmd++, i--) {      \
      t = pal[*(unsigned char *)((uintptr_t)(p32x++) ^ 1)];         \
      if (t & 0x20) {                                               \
        *pd = t;                                                    \
      } else {                                                      \
        pmd_draw_code;                                              \
      }                                                             \
    }                                                               \
  }                                                                 \
}
// run length mode

View file

@ -1341,8 +1341,14 @@ void FinalizeLine555(int sh, int line, struct PicoEState *est)
#if 1
int i;
for (i = 0; i < len; i++)
pd[i] = pal[ps[i]];
for (i = len; i > 0; i-=4) {
*pd++ = pal[*ps++];
*pd++ = pal[*ps++];
*pd++ = pal[*ps++];
*pd++ = pal[*ps++];
}
// for (i = 0; i < len; i++)
// pd[i] = pal[ps[i]];
#else
extern void amips_clut(unsigned short *dst, unsigned char *src, unsigned short *pal, int count);
extern void amips_clut_6bit(unsigned short *dst, unsigned char *src, unsigned short *pal, int count);

View file

@ -89,7 +89,8 @@ static const struct in_pdata in_sdl_platform_data = {
/* YUV stuff */
/* per-component luma contributions, indexed by a 5 bit colour component */
static int yuv_ry[32], yuv_gy[32], yuv_by[32];
/* chroma lookups; users index them with (component - luma + 32) */
static unsigned char yuv_u[32 * 2], yuv_v[32 * 2];
/* luma range mapping, filled with 16 + 219 * i / 32 by bgr_to_uyvy_init() */
static unsigned char yuv_y[256];
/*
 * Combined table indexed directly by a 16 bit source pixel:
 *   y   - mapped luma of that pixel
 *   vyu - (v << 16) | (luma << 8) | u of that pixel
 * so an output word for a pixel pair is (odd.y << 24) | even.vyu.
 */
static struct uyvy { unsigned int y:8; unsigned int vyu:24; } yuv_uyvy[65536];
void bgr_to_uyvy_init(void)
{
@ -124,34 +125,26 @@ void bgr_to_uyvy_init(void)
for (i = 0; i < 256; i++) {
yuv_y[i] = 16 + 219 * i / 32;
}
// everything combined into one large array for speed
for (i = 0; i < 65536; i++) {
int r = (i >> 11) & 0x1f, g = (i >> 6) & 0x1f, b = (i >> 0) & 0x1f;
int y = (yuv_ry[r] + yuv_gy[g] + yuv_by[b]) >> 16;
yuv_uyvy[i].y = yuv_y[y];
yuv_uyvy[i].vyu = (yuv_v[r-y + 32] << 16) | (yuv_y[y] << 8) | yuv_u[b-y + 32];
}
}
void rgb565_to_uyvy(void *d, const void *s, int pixels)
{
unsigned int *dst = d;
const unsigned short *src = s;
const unsigned char *yu = yuv_u + 32;
const unsigned char *yv = yuv_v + 32;
int r0, g0, b0, r1, g1, b1;
int y0, y1, u, v;
for (; pixels > 0; src += 2, dst++, pixels -= 2)
for (; pixels > 0; src += 4, dst += 2, pixels -= 4)
{
r0 = (src[0] >> 11) & 0x1f;
g0 = (src[0] >> 6) & 0x1f;
b0 = src[0] & 0x1f;
r1 = (src[1] >> 11) & 0x1f;
g1 = (src[1] >> 6) & 0x1f;
b1 = src[1] & 0x1f;
y0 = (yuv_ry[r0] + yuv_gy[g0] + yuv_by[b0]) >> 16;
y1 = (yuv_ry[r1] + yuv_gy[g1] + yuv_by[b1]) >> 16;
u = yu[b0 - y0];
v = yv[r0 - y0];
// valid Y range seems to be 16..235
y0 = yuv_y[y0];
y1 = yuv_y[y1];
*dst = (y1 << 24) | (v << 16) | (y0 << 8) | u;
struct uyvy *uyvy0 = yuv_uyvy + src[0], *uyvy1 = yuv_uyvy + src[1];
struct uyvy *uyvy2 = yuv_uyvy + src[2], *uyvy3 = yuv_uyvy + src[3];
dst[0] = (uyvy1->y << 24) | uyvy0->vyu;
dst[1] = (uyvy3->y << 24) | uyvy2->vyu;
}
}