code940 now plays mp3s

git-svn-id: file:///home/notaz/opt/svn/PicoDrive@22 be3aeb3a-fb24-0410-a615-afba39da0efa
2025-10-26 16:29:37 -04:00 · 2007-01-20 23:11:02 +00:00 · 2007-01-20 23:11:02 +00:00 · 42c7b14797
commit 42c7b14797
parent b837b69b3f
26 changed files with 1061 additions and 75 deletions
--- a/platform/gp2x/code940/940.c
+++ b/platform/gp2x/code940/940.c
@ -0,0 +1,134 @@
+#include "940shared.h"
+
+/*
+ * Fixed addresses of the shared areas, as seen from the 940 core.
+ * The host side (mp3test.c) maps physical 0x2000000 as the start of this
+ * address space, so these are offsets into that bank:
+ *   0x00100000  sample/decoder state (_940_data_t)
+ *   0x00200000  control block (_940_ctl_t)
+ *   0x01000000  raw mp3 data uploaded by the host
+ */
+static _940_data_t *shared_data = (_940_data_t *)   0x00100000;
+static _940_ctl_t  *shared_ctl  = (_940_ctl_t *)    0x00200000;
+static unsigned char *mp3_data  = (unsigned char *) 0x01000000;
+/* set up by Main940() to point into shared_data */
+YM2612 *ym2612_940;
+int *mix_buffer;
+
+// from 940init.s
+void wait_irq(void);		/* enables IRQs and sleeps; does not return to the caller */
+void spend_cycles(int c);	/* busy-wait for roughly c cycles */
+void cache_clean(void);		/* write back all data cache lines */
+void cache_clean_flush(void);	/* write back and invalidate all data cache lines */
+
+//	asm volatile ("mov r0, #0" ::: "r0");
+//	asm volatile ("mcr p15, 0, r0, c7, c6,  0" ::: "r0"); /* flush dcache */
+//	asm volatile ("mcr p15, 0, r0, c7, c10, 4" ::: "r0"); /* drain write buffer */
+
+/*
+ * C entry point of the 940 core, entered from 940init.s.
+ *
+ * startvector is the index of the exception vector that got us here
+ * (0 = reset ... 6 = IRQ, 7 = FIQ); it is only used for the debug counters.
+ *
+ * The function never returns. Each pass of the outer loop runs every job
+ * queued in shared_ctl->jobs[], writes the data cache back so the host can
+ * see the results, and then clears shared_ctl->busy. When nothing is queued
+ * it calls wait_irq(), which does not return: the next IRQ restarts
+ * Main940() through the IRQ vector.
+ */
+void Main940(int startvector)
+{
+	ym2612_940 = &shared_data->ym2612;
+	mix_buffer = shared_data->mix_buffer;
+
+	// debug
+	shared_ctl->vstarts[startvector]++;
+	// asm volatile ("mcr p15, 0, r0, c7, c10, 4" ::: "r0");
+
+
+	for (;;)
+	{
+		int job_num = 0;
+/*
+		while (!shared_ctl->busy)
+		{
+			//shared_ctl->waitc++;
+			spend_cycles(8*1024);
+		}
+*/
+		if (!shared_ctl->busy)
+		{
+			wait_irq();
+		}
+
+		for (job_num = 0; job_num < MAX_940JOBS; job_num++)
+		{
+			switch (shared_ctl->jobs[job_num])
+			{
+				case JOB940_INITALL:
+					/* ym2612 */
+					shared_ctl->writebuff0[0] = shared_ctl->writebuff1[0] = 0xffff;
+					YM2612Init_(shared_ctl->baseclock, shared_ctl->rate);
+					/* Helix mp3 decoder */
+					shared_data->mp3dec = MP3InitDecoder();
+					break;
+
+				case JOB940_YM2612RESETCHIP:
+					YM2612ResetChip_();
+					break;
+
+				case JOB940_PICOSTATELOAD:
+					YM2612PicoStateLoad_();
+					break;
+
+				case JOB940_YM2612UPDATEONE: {
+					int i;
+					UINT16 *wbuff;
+					if (shared_ctl->writebuffsel == 1) {
+						wbuff = shared_ctl->writebuff1;
+					} else {
+						wbuff = shared_ctl->writebuff0;
+					}
+
+					/* playback all writes: the list ends at the first 0xffff
+					 * entry or at the end of the buffer. Entries are read as
+					 * UINT16 (not two at a time through an int pointer) so the
+					 * access stays valid under -fstrict-aliasing; the order is
+					 * the same as the old low-half/high-half walk on this
+					 * little-endian core. */
+					for (i = sizeof(shared_ctl->writebuff0) / sizeof(shared_ctl->writebuff0[0]); i > 0; i--) {
+						UINT16 d = *wbuff++;
+						if (d == 0xffff) break;
+						YM2612Write_(d >> 8, d);
+					}
+
+					YM2612UpdateOne_(0, shared_ctl->length, shared_ctl->stereo);
+					break;
+				}
+
+				case JOB940_MP3DECODE: {
+					int mp3_offs = shared_ctl->mp3_offs;
+					unsigned char *readPtr = mp3_data + mp3_offs;
+					int bytesLeft = shared_ctl->mp3_len - mp3_offs;
+					int offset; // frame offset from readPtr
+					int err;
+
+					// EOF (or a bogus offset from the host), nothing to do
+					if (mp3_offs < 0 || bytesLeft <= 0) break;
+
+					offset = MP3FindSyncWord(readPtr, bytesLeft);
+					if (offset < 0) {
+						shared_ctl->mp3_offs = shared_ctl->mp3_len;
+						break; // EOF
+					}
+					readPtr += offset;
+					bytesLeft -= offset;
+
+					/* there are only two output buffers; mask the selector so
+					 * a stray value in shared memory cannot index past them */
+					err = MP3Decode(shared_data->mp3dec, &readPtr, &bytesLeft,
+						shared_data->mp3_buffer[shared_ctl->mp3_buffsel & 1], 0);
+					if (err) {
+						if (err == ERR_MP3_INDATA_UNDERFLOW) {
+							shared_ctl->mp3_offs = shared_ctl->mp3_len; // EOF
+							break;
+						} else if (err <= -6 && err >= -12) {
+							// ERR_MP3_INVALID_FRAMEHEADER, ERR_MP3_INVALID_*
+							// just try to skip the offending frame..
+							readPtr++;
+						}
+						shared_ctl->mp3_errors++;
+						shared_ctl->mp3_lasterr = err;
+					}
+					shared_ctl->mp3_offs = readPtr - mp3_data;
+					break;
+				}
+
+				default:
+					/* 0 (empty slot) or an unknown job number: nothing to do */
+					break;
+			}
+		}
+
+		/* push everything we produced out of the data cache before telling
+		 * the host we are done */
+		cache_clean();
+//		asm volatile ("mov r0, #0" ::: "r0");
+//		asm volatile ("mcr p15, 0, r0, c7, c10, 4" ::: "r0"); /* drain write buffer, should be done on nonbuffered write */
+//		cache_clean_flush();
+
+		shared_ctl->loopc++;
+		shared_ctl->busy = 0;
+	}
+}
+
--- a/platform/gp2x/code940/940init.s
+++ b/platform/gp2x/code940/940init.s
@ -0,0 +1,185 @@
+.global code940
+
+@ Exception vector table of the 940 core (linked at address 0).
+@ Every handler stores the index of its vector in r12; that value is passed
+@ to Main940() as its argument. All vectors except IRQ run the full hardware
+@ setup at .Begin; the IRQ handler only resets the stack, acknowledges the
+@ interrupt and re-enters Main940() through .Enter.
+code940:                          @ interrupt table:
+    b .b_reset                    @ reset
+    b .b_undef                    @ undefined instructions
+    b .b_swi                      @ software interrupt
+    b .b_pabort                   @ prefetch abort
+    b .b_dabort                   @ data abort
+    b .b_reserved                 @ reserved
+    b .b_irq                      @ IRQ
+    b .b_fiq                      @ FIQ
+
+@ test
+.b_reset:
+    mov     r12, #0
+    b       .Begin
+.b_undef:
+    mov     r12, #1
+    b       .Begin
+.b_swi:
+    mov     r12, #2
+    b       .Begin
+.b_pabort:
+    mov     r12, #3
+    b       .Begin
+.b_dabort:
+    mov     r12, #4
+    b       .Begin
+.b_reserved:
+    mov     r12, #5
+    b       .Begin
+.b_irq:
+    mov     r12, #6
+    mov     sp, #0x100000       @ reset stack
+    sub     sp, sp, #4
+    mov     r1, #0xbe000000     @ assume we live @ 0x2000000 bank
+    orr     r2, r1, #0x3B00
+    orr     r2, r2, #0x0046     @ r2 = 0xbe003B46
+    mvn     r3, #0              @ r3 = 0xffffffff
+    strh    r3, [r2]            @ clear any pending interrupts from the DUALCPU unit
+    orr     r2, r1, #0x4500     @ r2 = 0xbe004500
+    str     r3, [r2]            @ clear all pending interrupts in irq controller's SRCPND register
+    orr     r2, r2, #0x0010     @ r2 = 0xbe004510
+    str     r3, [r2]            @ clear all pending interrupts in irq controller's INTPND register
+    b       .Enter              @ skip the hardware setup, go straight to Main940
+.b_fiq:
+    mov     r12, #7
+    b       .Begin
+
+@ Full startup path: set the stack, program the protection unit regions and
+@ their cache/buffer/access attributes, enable the caches, then fall through
+@ to .Enter which calls Main940(r12).
+.Begin:
+    mov sp, #0x100000           @ set the stack top (1M)
+    sub sp, sp, #4              @ minus 4
+
+    @ set up memory region 0 -- the whole 4GB address space
+    mov r0, #(0x1f<<1)|1        @ region data
+    mcr p15, 0, r0, c6, c0, 0   @ opcode2 ~ data/instr
+    mcr p15, 0, r0, c6, c0, 1
+
+    @ set up region 1 which is the first 2 megabytes.
+    mov r0, #(0x14<<1)|1        @ region data
+    mcr p15, 0, r0, c6, c1, 0
+    mcr p15, 0, r0, c6, c1, 1
+
+    @ set up region 2: 64k 0x200000-0x210000
+    mov r0, #(0x0f<<1)|1
+    orr r0, r0, #0x200000
+    mcr p15, 0, r0, c6, c2, 0
+    mcr p15, 0, r0, c6, c2, 1
+
+    @ set up region 3: 64k 0xbe000000-0xbe010000 (hw control registers)
+    mov r0, #(0x0f<<1)|1
+    orr r0, r0, #0xbe000000
+    mcr p15, 0, r0, c6, c3, 0
+    mcr p15, 0, r0, c6, c3, 1
+
+    @ set up region 4: 16M 0x01000000-0x02000000 (mp3 area)
+    mov r0, #(0x17<<1)|1
+    orr r0, r0, #0x01000000
+    mcr p15, 0, r0, c6, c4, 0
+    mcr p15, 0, r0, c6, c4, 1
+
+    @ set regions 1 and 4 to be cacheable (so the first 2M and mp3 area will be cacheable)
+    mov r0, #(1<<1)|(1<<4)
+    mcr p15, 0, r0, c2, c0, 0
+    mcr p15, 0, r0, c2, c0, 1
+
+    @ set region 1 to be bufferable too (only data)
+    mov r0, #(1<<1)
+    mcr p15, 0, r0, c3, c0, 0
+
+    @ set protection: data access is allowed in regions 1-4,
+    @ instruction fetch only in region 1; region 0 (everything else) is off limits
+    mov r0, #(3<<8)|(3<<6)|(3<<4)|(3<<2)|(0)  @ data: [full, full, full, full, no access] for regions [4 3 2 1 0]
+    mcr p15, 0, r0, c5, c0, 0
+    mov r0, #(0<<8)|(0<<6)|(0<<4)|(3<<2)|(0)  @ instructions: [no access, no, no, full, no]
+    mcr p15, 0, r0, c5, c0, 1
+
+    mrc p15, 0, r0, c1, c0, 0   @ fetch current control reg
+    orr r0, r0, #1              @ 0x00000001: enable protection unit
+    orr r0, r0, #4              @ 0x00000004: enable D cache
+    orr r0, r0, #0x1000         @ 0x00001000: enable I cache
+    bic r0, r0, #0xC0000000     @ clear both clocking mode bits first
+    orr r0, r0, #0x40000000     @ 0x40000000: synchronous, faster?
+@    orr r0, r0, #0xC0000000     @ 0xC0000000: async
+    mcr p15, 0, r0, c1, c0, 0   @ set control reg
+
+    @ flush (invalidate) the cache (just in case)
+    mov r0, #0
+    mcr p15, 0, r0, c7, c6, 0
+
+.Enter:
+    mov r0, r12                 @ argument: index of the vector we came from
+    bl Main940
+
+    @ we should never get here
+.b_deadloop:
+    b .b_deadloop
+
+
+
+@ so asm utils are also defined here:
+.global spend_cycles @ c
+
+@ void spend_cycles(int c): busy-wait for roughly c cycles.
+@ r0 = c. The count is divided by 4 (the loop is assumed to take 4 cycles per
+@ iteration) and reduced by 2 to account for call overhead. For small c the
+@ counter is already negative and the loop body runs once.
+spend_cycles:
+    mov     r0, r0, lsr #2  @ 4 cycles/iteration
+    sub     r0, r0, #2      @ entry/exit/init
+.sc_loop:
+    subs    r0, r0, #1
+    bpl     .sc_loop        @ keep going while the counter is not negative
+
+    bx      lr
+
+
+@ clean-flush function from ARM940T technical reference manual
+@ void cache_clean_flush(void): walks every (segment, line) pair of the data
+@ cache and issues "clean and flush" on it. Clobbers r0-r2.
+.global cache_clean_flush
+
+cache_clean_flush:
+    mov     r1, #0                  @ init line counter
+ccf_outer_loop:
+    mov     r0, #0                  @ segment counter
+ccf_inner_loop:
+    orr     r2, r1, r0              @ make segment and line address
+    mcr     p15, 0, r2, c7, c14, 2  @ clean and flush that line
+    add     r0, r0, #0x10           @ increment segment counter
+    cmp     r0, #0x40               @ complete all 4 segments?
+    bne     ccf_inner_loop
+    add     r1, r1, #0x04000000     @ increment line counter (wraps to 0 after 64 steps)
+    cmp     r1, #0                  @ complete all lines?
+    bne     ccf_outer_loop
+    bx      lr
+
+
+@ clean-only version
+@ void cache_clean(void): same walk as cache_clean_flush, but the lines are
+@ only cleaned (c7, c10) instead of cleaned and flushed. Clobbers r0-r2.
+.global cache_clean
+
+cache_clean:
+    mov     r1, #0                  @ init line counter
+cf_outer_loop:
+    mov     r0, #0                  @ segment counter
+cf_inner_loop:
+    orr     r2, r1, r0              @ make segment and line address
+    mcr     p15, 0, r2, c7, c10, 2  @ clean that line
+    add     r0, r0, #0x10           @ increment segment counter
+    cmp     r0, #0x40               @ complete all 4 segments?
+    bne     cf_inner_loop
+    add     r1, r1, #0x04000000     @ increment line counter (wraps to 0 after 64 steps)
+    cmp     r1, #0                  @ complete all lines?
+    bne     cf_outer_loop
+    bx      lr
+
+
+@ void wait_irq(void): unmask IRQs and put the core to sleep until one arrives.
+@ This function does not return to its caller: the IRQ handler (.b_irq)
+@ resets the stack and calls Main940() again. Should execution ever continue
+@ past the wait instruction, it branches to .b_reserved, which redoes the
+@ whole startup.
+.global wait_irq
+
+wait_irq:
+    mrs     r0, cpsr
+    bic     r0, r0, #0x80            @ clear the IRQ disable bit
+    msr     cpsr_c, r0               @ enable interrupts
+
+    mov     r0, #0
+    mcr     p15, 0, r0, c7, c0, 4   @ wait for IRQ
+@    mcr     p15, 0, r0, c15, c8, 2
+    b       .b_reserved
+
+.pool
+
+@ vim:filetype=armasm:
--- a/platform/gp2x/code940/940shared.h
+++ b/platform/gp2x/code940/940shared.h
@ -0,0 +1,42 @@
+#include "../../../Pico/sound/ym2612.h"
+#include "../helix/pub/mp3dec.h"
+
+/*
+ * Job numbers the host places into _940_ctl_t.jobs[] for the 940 core.
+ * 0 is not a job and marks an empty slot. The values are part of the
+ * interface between the two cores, so the order must not change.
+ */
+enum _940_job_t {
+	JOB940_INITALL = 1,		/* init the ym2612 emulation and the mp3 decoder */
+	JOB940_YM2612RESETCHIP,
+	JOB940_YM2612UPDATEONE,		/* replay queued register writes, then render samples */
+	JOB940_PICOSTATELOAD,
+	JOB940_MP3DECODE,		/* decode one mp3 frame into the selected output buffer */
+	JOB940_NUMJOBS
+};
+
+/* number of job slots processed per wakeup of the 940 */
+#define MAX_940JOBS	2
+
+/*
+ * Bulk data shared between the cores. The 940 accesses it at 0x00100000,
+ * the host at offset 0x100000 of its mapping of the 940 bank.
+ * Both sides use this layout, so fields must not be reordered.
+ */
+typedef struct
+{
+	YM2612  ym2612;					/* current state of the emulated YM2612 */
+	HMP3Decoder mp3dec;			/* mp3 decoder's handle */
+	int     mix_buffer[44100/50*2];		/* this is where the YM2612 samples will be mixed to */
+	short   mp3_buffer[2][1152*2];		/* buffers for mp3 decoder's output */
+} _940_data_t;
+
+
+/*
+ * Control block shared between the cores. The 940 accesses it at 0x00200000,
+ * the host at offset 0x200000 of its mapping of the 940 bank.
+ * Both sides use this layout, so fields must not be reordered.
+ */
+typedef struct
+{
+	int		jobs[MAX_940JOBS];			/* jobs for second core */
+	int		busy;					/* busy status of the 940 core: set by the host when queueing jobs, cleared by the 940 when done */
+	int		length;					/* number of samples to mix (882 max) */
+	int		stereo;					/* mix samples as stereo, doubles sample count automatically */
+	int		baseclock;				/* ym2612 settings */
+	int		rate;
+	int		writebuffsel;			/* which write buffer to use (from 940 side): 1 selects writebuff1, anything else writebuff0 */
+	UINT16  writebuff0[2048];			/* list of writes to ym2612, 1024 for savestates, 1024 extra; a 0xffff entry ends the list */
+	UINT16  writebuff1[2048];
+	int		mp3_len;			/* data len of loaded mp3 */
+	int		mp3_offs;			/* current playback offset (just after last decoded frame) */
+	int		mp3_buffsel;			/* which output buffer to decode to */
+	int		vstarts[8];				/* debug: number of starts from each of 8 vectors */
+	int		loopc;					/* debug: main loop counter */
+	int		mp3_errors;			/* debug: mp3 decoder's error counter */
+	int		mp3_lasterr;			/* debug: mp3 decoder's last error */
+} _940_ctl_t;
--- a/platform/gp2x/code940/Makefile
+++ b/platform/gp2x/code940/Makefile
@ -0,0 +1,87 @@
+
+# you may or may not need to change this
+#devkit_path = x:/stuff/dev/devkitgp2x/
+devkit_path = /usr/local/devkitPro/devkitGP2X/
+lgcc_path = $(devkit_path)lib/gcc/arm-linux/4.0.3/
+CROSS = arm-linux-
+#CROSS = $(devkit_path)bin/arm-linux-
+
+# settings
+#up = 1
+
+DEFINC = -I../.. -I. -D__GP2X__ -DARM # -DBENCHMARK
+COPT_COMMON = -static -s -O3 -ftracer -fstrength-reduce -Wall -funroll-loops -fomit-frame-pointer -fstrict-aliasing -ffast-math
+COPT = $(COPT_COMMON) -mtune=arm920t
+GCC = $(CROSS)gcc
+STRIP = $(CROSS)strip
+AS = $(CROSS)as
+LD = $(CROSS)ld
+OBJCOPY = $(CROSS)objcopy
+
+# 'all' and 'up940' are commands, not files: keep a stray file with the same
+# name from turning them into no-ops
+.PHONY: all up940
+
+all: code940.bin
+
+# copy the built binary to the mounted device
+up940:
+	@cp -v code940.bin /mnt/gp2x/mnt/sd/games/PicoDrive/
+
+#	@cmd //C copy code940.bin \\\\10.0.1.2\\gp2x\\mnt\\sd\\games\\PicoDrive\\
+
+.c.o:
+	@echo $<
+	$(GCC) $(COPT) $(DEFINC) -c $< -o $@
+.s.o:
+	@echo $<
+	$(GCC) $(COPT) $(DEFINC) -c $< -o $@
+
+
+# stuff for 940 core
+
+# init, emu_control, emu
+OBJS940 += 940init.o 940.o 940ym2612.o memcpy.o
+# the asm code seems to be faster when run on 920, but not on 940 for some reason
+# OBJS940 += ../../Pico/sound/ym2612_asm.o
+
+# uClibc library code
+OBJS940 += uClibc/memset.o uClibc/s_floor.o uClibc/e_pow.o uClibc/e_sqrt.o uClibc/s_fabs.o
+OBJS940 += uClibc/s_scalbn.o uClibc/s_copysign.o uClibc/k_sin.o uClibc/k_cos.o uClibc/s_sin.o
+OBJS940 += uClibc/e_rem_pio2.o uClibc/k_rem_pio2.o uClibc/e_log.o uClibc/wrappers.o
+
+code940.bin : code940.gpe
+	@echo $@
+	@$(OBJCOPY) -O binary $< $@
+
+code940.gpe : $(OBJS940) ../helix/helix_mp3.a
+	@echo $@
+	@$(LD) -static -e code940 -Ttext 0x0 $^ -L$(lgcc_path) -lgcc -o $@
+
+940ym2612.o : ../../../Pico/sound/ym2612.c
+	@echo $@
+	@$(GCC) $(COPT_COMMON) -mtune=arm940t $(DEFINC) -DEXTERNAL_YM2612 -c $< -o $@
+
+# build the Helix decoder library in its own directory; $(MAKE) (instead of a
+# literal 'make') passes command line flags and the jobserver to the sub-make
+../helix/helix_mp3.a:
+	@$(MAKE) -C ../helix/
+
+
+# cleanup
+# tidy removes intermediate files, clean additionally removes the final binary
+.PHONY: clean tidy
+clean: tidy
+	@$(RM) code940.bin
+tidy:
+	@$(RM) code940.gpe $(OBJS940)
+
+
+OBJSMP3T = mp3test.o ../gp2x.o ../asmutils.o ../usbjoy.o
+
+mp3test.gpe : $(OBJSMP3T) ../helix/helix_mp3.a
+	$(GCC) -static -o $@ $^
+	$(STRIP) $@
+	@cp -v $@ /mnt/gp2x/mnt/sd
+
+cleanmp3test:
+	$(RM) $(OBJSMP3T) mp3test.gpe
+
+# uClibc/e_pow.o : uClibc/e_pow.c
+#	@echo $<
+#	@$(GCC) $(COPT) $(DEFINC) -fno-profile-generate -c $< -o $@
+
+# uClibc/e_sqrt.o : uClibc/e_sqrt.c
+#	@echo $<
+#	@$(GCC) $(COPT) $(DEFINC) -fno-profile-generate -c $< -o $@
--- a/platform/gp2x/code940/memcpy.s
+++ b/platform/gp2x/code940/memcpy.s
@ -0,0 +1,500 @@
+/* $NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $ */
+
+/*-
+* Copyright (c) 1997 The NetBSD Foundation, Inc.
+* All rights reserved.
+*
+* This code is derived from software contributed to The NetBSD Foundation
+* by Neil A. Carson and Mark Brinicombe
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+* 1. Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in the
+*    documentation and/or other materials provided with the distribution.
+* 3. All advertising materials mentioning features or use of this software
+*    must display the following acknowledgement:
+* This product includes software developed by the NetBSD
+* Foundation, Inc. and its contributors.
+* 4. Neither the name of The NetBSD Foundation nor the names of its
+*    contributors may be used to endorse or promote products derived
+*    from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+* ``AS IS\'\' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+* POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* This was modified by Jay Monkman <jmonkman@smoothsmoothie.com> to
+*   save and restore r12. This is necessary for RTEMS.
+*/
+/* #include <machine/asm.h>*/
+
+#define ENTRY(_LABEL) \
+ .global _LABEL; _LABEL:
+
+@ void *memcpy(void *dst, const void *src, size_t n)
+@ Thin wrapper around _memcpy: saves dst (r0), r12 and lr, and on the way out
+@ reloads r0 so that the original dst is returned whatever _memcpy left there.
+.globl memcpy
+memcpy:
+
+@ ENTRY(gp2x_memcpy)
+stmfd sp!, {r0, r12, lr}
+@ bl _gp2x_memcpy
+bl _memcpy
+ldmfd sp!, {r0, r12, pc}
+
+
+
+@ void *memmove(void *dst, const void *src, size_t n)
+@ Same wrapper as memcpy: _memcpy picks the copy direction itself by
+@ comparing src with dst, so both entry points share it.
+.globl memmove
+memmove:
+
+@ ENTRY(gp2x_memmove)
+stmfd sp!, {r0, r12, lr}
+@ bl _gp2x_memcpy
+bl _memcpy
+ldmfd sp!, {r0, r12, pc}
+
+
+
+/*
+* This is one fun bit of code ...
+* Some easy listening music is suggested while trying to understand this
+* code e.g. Iron Maiden
+*
+* For anyone attempting to understand it :
+*
+* The core code is implemented here with simple stubs for memcpy()
+* memmove() and bcopy().
+*
+* All local labels are prefixed with Lmemcpy_
+* Following the prefix a label starting f is used in the forward copy code
+* while a label using b is used in the backwards copy code
+* The source and destination addresses determine whether a forward or
+* backward copy is performed.
+* Separate bits of code are used to deal with the following situations
+* for both the forward and backwards copy.
+* unaligned source address
+* unaligned destination address
+* Separate copy routines are used to produce an optimised result for each
+* of these cases.
+* The copy code will use LDM/STM instructions to copy up to 32 bytes at
+* a time where possible.
+*
+* Note: r12 (aka ip) can be trashed during the function along with
+* r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
+* Additional registers are preserved prior to use i.e. r4, r5 & lr
+*
+* Apologies for the state of the comments;-)
+*/
+
+
+
+@ Core copy routine: r0 = dst, r1 = src, r2 = byte count.
+@ Copies forwards when src is above dst and backwards when src is below dst.
+_memcpy:
+
+@ ENTRY(_gp2x_memcpy)
+/* Determine copy direction */
+cmp r1, r0
+bcc Lmemcpy_backwards  /* src < dst (unsigned): copy from the end */
+
+moveq r0, #0   /* src == dst (flags still from the cmp above): nothing to copy */
+moveq pc, lr
+
+stmdb sp!, {r0, lr}  /* memcpy() returns dest addr */
+subs r2, r2, #4
+blt Lmemcpy_fl4  /* less than 4 bytes */
+ands r12, r0, #3
+bne Lmemcpy_fdestul  /* oh unaligned destination addr */
+ands r12, r1, #3
+bne Lmemcpy_fsrcul  /* oh unaligned source addr */
+
+Lmemcpy_ft8:
+/* We have aligned source and destination */
+subs r2, r2, #8
+blt Lmemcpy_fl12  /* less than 12 bytes (4 from above) */
+subs r2, r2, #0x14
+blt Lmemcpy_fl32  /* less than 32 bytes (12 from above) */
+stmdb sp!, {r4, r7, r8, r9, r10}  /* borrow r4, r7-r10 */
+
+/* blat 32 bytes at a time */
+/* r2 is (bytes left - 32) here, so only 32 bytes are known to be available
+ * on each pass. The loop used to move 64 bytes per pass while being entered
+ * with just 32 guaranteed, which read and wrote past the end of the buffers
+ * for lengths such as 32..63. Moving one 32 byte block per pass keeps the
+ * counter and the data in step. */
+Lmemcpy_floop32:
+ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
+stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
+subs r2, r2, #0x20
+bge Lmemcpy_floop32
+
+/* r2 is now in [-32, -1]: 0..31 bytes are left */
+cmn r2, #0x10
+ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
+stmgeia r0!, {r3, r4, r12, lr}
+subge r2, r2, #0x10
+ldmia sp!, {r4, r7, r8, r9, r10}  /* return r4, r7-r10 */
+
+Lmemcpy_fl32:
+adds r2, r2, #0x14        
+
+/* blat 12 bytes at a time */
+Lmemcpy_floop12:
+ldmgeia r1!, {r3, r12, lr}
+stmgeia r0!, {r3, r12, lr}
+subges r2, r2, #0x0c        
+bge Lmemcpy_floop12
+
+Lmemcpy_fl12:
+adds r2, r2, #8
+blt Lmemcpy_fl4
+
+subs r2, r2, #4
+ldrlt r3, [r1], #4
+strlt r3, [r0], #4
+ldmgeia r1!, {r3, r12}
+stmgeia r0!, {r3, r12}
+subge r2, r2, #4
+
+Lmemcpy_fl4:
+/* less than 4 bytes to go */
+adds r2, r2, #4
+ldmeqia sp!, {r0, pc}  /* done */
+
+/* copy the crud byte at a time */
+cmp r2, #2
+ldrb r3, [r1], #1
+strb r3, [r0], #1
+ldrgeb r3, [r1], #1
+strgeb r3, [r0], #1
+ldrgtb r3, [r1], #1
+strgtb r3, [r0], #1
+ldmia sp!, {r0, pc}
+
+/* erg - unaligned destination */
+Lmemcpy_fdestul:
+rsb r12, r12, #4
+cmp r12, #2
+
+/* align destination with byte copies */
+ldrb r3, [r1], #1
+strb r3, [r0], #1
+ldrgeb r3, [r1], #1
+strgeb r3, [r0], #1
+ldrgtb r3, [r1], #1
+strgtb r3, [r0], #1
+subs r2, r2, r12
+blt Lmemcpy_fl4  /* less the 4 bytes */
+
+ands r12, r1, #3
+beq Lmemcpy_ft8  /* we have an aligned source */
+
+/* erg - unaligned source */
+/* This is where it gets nasty ... */
+Lmemcpy_fsrcul:
+bic r1, r1, #3
+ldr lr, [r1], #4
+cmp r12, #2
+bgt Lmemcpy_fsrcul3
+beq Lmemcpy_fsrcul2
+cmp r2, #0x0c            
+blt Lmemcpy_fsrcul1loop4
+sub r2, r2, #0x0c        
+stmdb sp!, {r4, r5}
+
+Lmemcpy_fsrcul1loop16:
+mov r3, lr, lsr #8
+ldmia r1!, {r4, r5, r12, lr}
+orr r3, r3, r4, lsl #24
+mov r4, r4, lsr #8
+orr r4, r4, r5, lsl #24
+mov r5, r5, lsr #8
+orr r5, r5, r12, lsl #24
+mov r12, r12, lsr #8
+orr r12, r12, lr, lsl #24
+stmia r0!, {r3-r5, r12}
+subs r2, r2, #0x10        
+bge Lmemcpy_fsrcul1loop16
+ldmia sp!, {r4, r5}
+adds r2, r2, #0x0c        
+blt Lmemcpy_fsrcul1l4
+
+Lmemcpy_fsrcul1loop4:
+mov r12, lr, lsr #8
+ldr lr, [r1], #4
+orr r12, r12, lr, lsl #24
+str r12, [r0], #4
+subs r2, r2, #4
+bge Lmemcpy_fsrcul1loop4
+
+Lmemcpy_fsrcul1l4:
+sub r1, r1, #3
+b Lmemcpy_fl4
+
+Lmemcpy_fsrcul2:
+cmp r2, #0x0c            
+blt Lmemcpy_fsrcul2loop4
+sub r2, r2, #0x0c        
+stmdb sp!, {r4, r5}
+
+Lmemcpy_fsrcul2loop16:
+mov r3, lr, lsr #16
+ldmia r1!, {r4, r5, r12, lr}
+orr r3, r3, r4, lsl #16
+mov r4, r4, lsr #16
+orr r4, r4, r5, lsl #16
+mov r5, r5, lsr #16
+orr r5, r5, r12, lsl #16
+mov r12, r12, lsr #16
+orr r12, r12, lr, lsl #16
+stmia r0!, {r3-r5, r12}
+subs r2, r2, #0x10        
+bge Lmemcpy_fsrcul2loop16
+ldmia sp!, {r4, r5}
+adds r2, r2, #0x0c        
+blt Lmemcpy_fsrcul2l4
+
+Lmemcpy_fsrcul2loop4:
+mov r12, lr, lsr #16
+ldr lr, [r1], #4
+orr r12, r12, lr, lsl #16
+str r12, [r0], #4
+subs r2, r2, #4
+bge Lmemcpy_fsrcul2loop4
+
+Lmemcpy_fsrcul2l4:
+sub r1, r1, #2
+b Lmemcpy_fl4
+
+Lmemcpy_fsrcul3:
+cmp r2, #0x0c            
+blt Lmemcpy_fsrcul3loop4
+sub r2, r2, #0x0c        
+stmdb sp!, {r4, r5}
+
+Lmemcpy_fsrcul3loop16:
+mov r3, lr, lsr #24
+ldmia r1!, {r4, r5, r12, lr}
+orr r3, r3, r4, lsl #8
+mov r4, r4, lsr #24
+orr r4, r4, r5, lsl #8
+mov r5, r5, lsr #24
+orr r5, r5, r12, lsl #8
+mov r12, r12, lsr #24
+orr r12, r12, lr, lsl #8
+stmia r0!, {r3-r5, r12}
+subs r2, r2, #0x10        
+bge Lmemcpy_fsrcul3loop16
+ldmia sp!, {r4, r5}
+adds r2, r2, #0x0c        
+blt Lmemcpy_fsrcul3l4
+
+Lmemcpy_fsrcul3loop4:
+mov r12, lr, lsr #24
+ldr lr, [r1], #4
+orr r12, r12, lr, lsl #8
+str r12, [r0], #4
+subs r2, r2, #4
+bge Lmemcpy_fsrcul3loop4
+
+Lmemcpy_fsrcul3l4:
+sub r1, r1, #1
+b Lmemcpy_fl4
+
+Lmemcpy_backwards:
+add r1, r1, r2
+add r0, r0, r2
+subs r2, r2, #4
+blt Lmemcpy_bl4  /* less than 4 bytes */
+ands r12, r0, #3
+bne Lmemcpy_bdestul  /* oh unaligned destination addr */
+ands r12, r1, #3
+bne Lmemcpy_bsrcul  /* oh unaligned source addr */
+
+Lmemcpy_bt8:
+/* We have aligned source and destination */
+subs r2, r2, #8
+blt Lmemcpy_bl12  /* less than 12 bytes (4 from above) */
+stmdb sp!, {r4, r7, r8, r9, r10, lr}
+subs r2, r2, #0x14  /* less than 32 bytes (12 from above) */
+blt Lmemcpy_bl32
+
+/* blat 32 bytes at a time */
+/* r2 is (bytes left - 32) here, so only 32 bytes are known to be available
+ * on each pass. The loop used to move 64 bytes per pass while being entered
+ * with just 32 guaranteed, which read and wrote before the start of the
+ * buffers for lengths such as 32..63. Moving one 32 byte block per pass
+ * keeps the counter and the data in step. */
+Lmemcpy_bloop32:
+ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
+stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
+subs r2, r2, #0x20
+bge Lmemcpy_bloop32
+
+Lmemcpy_bl32:
+cmn r2, #0x10
+ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
+stmgedb r0!, {r3, r4, r12, lr}
+subge r2, r2, #0x10
+adds r2, r2, #0x14
+ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
+stmgedb r0!, {r3, r12, lr}
+subge r2, r2, #0x0c
+ldmia sp!, {r4, r7, r8, r9, r10, lr}
+
+Lmemcpy_bl12:
+adds r2, r2, #8
+blt Lmemcpy_bl4
+subs r2, r2, #4
+ldrlt r3, [r1, #-4]!
+strlt r3, [r0, #-4]!
+ldmgedb r1!, {r3, r12}
+stmgedb r0!, {r3, r12}
+subge r2, r2, #4
+
+Lmemcpy_bl4:
+/* less than 4 bytes to go */
+adds r2, r2, #4
+moveq pc, lr   /* done */
+
+/* copy the crud byte at a time */
+cmp r2, #2
+ldrb r3, [r1, #-1]!
+strb r3, [r0, #-1]!
+ldrgeb r3, [r1, #-1]!
+strgeb r3, [r0, #-1]!
+ldrgtb r3, [r1, #-1]!
+strgtb r3, [r0, #-1]!
+mov pc, lr
+
+/* erg - unaligned destination */
+Lmemcpy_bdestul:
+cmp r12, #2
+
+/* align destination with byte copies */
+ldrb r3, [r1, #-1]!
+strb r3, [r0, #-1]!
+ldrgeb r3, [r1, #-1]!
+strgeb r3, [r0, #-1]!
+ldrgtb r3, [r1, #-1]!
+strgtb r3, [r0, #-1]!
+subs r2, r2, r12
+blt Lmemcpy_bl4  /* less than 4 bytes to go */
+ands r12, r1, #3
+beq Lmemcpy_bt8  /* we have an aligned source */
+
+/* erg - unaligned source */
+/* This is where it gets nasty ... */
+Lmemcpy_bsrcul:
+bic r1, r1, #3
+ldr r3, [r1, #0]
+cmp r12, #2
+blt Lmemcpy_bsrcul1
+beq Lmemcpy_bsrcul2
+cmp r2, #0x0c            
+blt Lmemcpy_bsrcul3loop4
+sub r2, r2, #0x0c        
+stmdb sp!, {r4, r5, lr}
+
+Lmemcpy_bsrcul3loop16:
+mov lr, r3, lsl #8
+ldmdb r1!, {r3-r5, r12}
+orr lr, lr, r12, lsr #24
+mov r12, r12, lsl #8
+orr r12, r12, r5, lsr #24
+mov r5, r5, lsl #8
+orr r5, r5, r4, lsr #24
+mov r4, r4, lsl #8
+orr r4, r4, r3, lsr #24
+stmdb r0!, {r4, r5, r12, lr}
+subs r2, r2, #0x10        
+bge Lmemcpy_bsrcul3loop16
+ldmia sp!, {r4, r5, lr}
+adds r2, r2, #0x0c        
+blt Lmemcpy_bsrcul3l4
+
+Lmemcpy_bsrcul3loop4:
+mov r12, r3, lsl #8
+ldr r3, [r1, #-4]!
+orr r12, r12, r3, lsr #24
+str r12, [r0, #-4]!
+subs r2, r2, #4
+bge Lmemcpy_bsrcul3loop4
+
+Lmemcpy_bsrcul3l4:
+add r1, r1, #3
+b Lmemcpy_bl4
+
+Lmemcpy_bsrcul2:
+cmp r2, #0x0c            
+blt Lmemcpy_bsrcul2loop4
+sub r2, r2, #0x0c        
+stmdb sp!, {r4, r5, lr}
+
+Lmemcpy_bsrcul2loop16:
+mov lr, r3, lsl #16
+ldmdb r1!, {r3-r5, r12}
+orr lr, lr, r12, lsr #16
+mov r12, r12, lsl #16
+orr r12, r12, r5, lsr #16
+mov r5, r5, lsl #16
+orr r5, r5, r4, lsr #16
+mov r4, r4, lsl #16
+orr r4, r4, r3, lsr #16
+stmdb r0!, {r4, r5, r12, lr}
+subs r2, r2, #0x10        
+bge Lmemcpy_bsrcul2loop16
+ldmia sp!, {r4, r5, lr}
+adds r2, r2, #0x0c        
+blt Lmemcpy_bsrcul2l4
+
+Lmemcpy_bsrcul2loop4:
+mov r12, r3, lsl #16
+ldr r3, [r1, #-4]!
+orr r12, r12, r3, lsr #16
+str r12, [r0, #-4]!
+subs r2, r2, #4
+bge Lmemcpy_bsrcul2loop4
+
+Lmemcpy_bsrcul2l4:
+add r1, r1, #2
+b Lmemcpy_bl4
+
+Lmemcpy_bsrcul1:
+cmp r2, #0x0c            
+blt Lmemcpy_bsrcul1loop4
+sub r2, r2, #0x0c        
+stmdb sp!, {r4, r5, lr}
+
+Lmemcpy_bsrcul1loop32:
+mov lr, r3, lsl #24
+ldmdb r1!, {r3-r5, r12}
+orr lr, lr, r12, lsr #8
+mov r12, r12, lsl #24
+orr r12, r12, r5, lsr #8
+mov r5, r5, lsl #24
+orr r5, r5, r4, lsr #8
+mov r4, r4, lsl #24
+orr r4, r4, r3, lsr #8
+stmdb r0!, {r4, r5, r12, lr}
+subs r2, r2, #0x10        
+bge Lmemcpy_bsrcul1loop32
+ldmia sp!, {r4, r5, lr}
+adds r2, r2, #0x0c        
+blt Lmemcpy_bsrcul1l4
+
+Lmemcpy_bsrcul1loop4:
+mov r12, r3, lsl #24
+ldr r3, [r1, #-4]!
+orr r12, r12, r3, lsr #8
+str r12, [r0, #-4]!
+subs r2, r2, #4
+bge Lmemcpy_bsrcul1loop4
+
+Lmemcpy_bsrcul1l4:
+add r1, r1, #1
+b Lmemcpy_bl4
--- a/platform/gp2x/code940/mp3test.c
+++ b/platform/gp2x/code940/mp3test.c
@ -0,0 +1,383 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include "940shared.h"
+#include "../gp2x.h"
+//#include "emu.h"
+//#include "menu.h"
+#include "../asmutils.h"
+#include "../helix/pub/mp3dec.h"
+
+/* we will need some gp2x internals here */
+extern volatile unsigned short *gp2x_memregs; /* from minimal library rlyeh */
+extern volatile unsigned long  *gp2x_memregl;
+
+static unsigned char *shared_mem = 0;
+static _940_data_t *shared_data = 0;
+static _940_ctl_t *shared_ctl = 0;
+static unsigned char *mp3_mem = 0;
+
+#define MP3_SIZE_MAX (0x1000000 - 4*640*480)
+
+int crashed_940 = 0;
+
+
+/***********************************************************/
+
+#define MAXOUT		(+32767)
+#define MINOUT		(-32768)
+
+/* limitter */
+#define Limit(val, max,min) { \
+	if ( val > max )      val = max; \
+	else if ( val < min ) val = min; \
+}
+
+
+/*
+ * Block until the 940 clears its busy flag.
+ * Gives up after 0x10000 polls; in that case the debug counters are printed
+ * and crashed_940 is set so that the next init reloads the 940 code.
+ */
+void wait_busy_940(void)
+{
+#if 0
+	int i;
+	printf("940 busy, entering wait loop.. (cnt: %i, wc: %i, ve: ", shared_ctl->loopc, shared_ctl->waitc);
+	for (i = 0; i < 8; i++)
+		printf("%i ", shared_ctl->vstarts[i]);
+	printf(")\n");
+
+	for (i = 0; shared_ctl->busy; i++)
+	{
+		spend_cycles(1024); /* needs tuning */
+	}
+	printf("wait iterations: %i\n", i);
+#else
+	int polls = 0;
+	int vec;
+
+	while (shared_ctl->busy && polls < 0x10000)
+	{
+		spend_cycles(8*1024);
+		polls++;
+	}
+
+	if (polls >= 0x10000)
+	{
+		/* ran out of patience: the 940 is considered crashed */
+		printf("940 crashed (cnt: %i, ve: ", shared_ctl->loopc);
+		for (vec = 0; vec != 8; vec++)
+			printf("%i ", shared_ctl->vstarts[vec]);
+		printf(")\n");
+		crashed_940 = 1;
+	}
+#endif
+}
+
+
+/*
+ * Queue up to two jobs (JOB940_* values, 0 for an empty slot) and wake the
+ * 940. The order of the stores matters: the job slots are filled first, then
+ * busy is raised, and only then is the interrupt triggered.
+ * Callers wait for the previous batch with wait_busy_940() before queueing.
+ */
+void add_job_940(int job0, int job1)
+{
+	shared_ctl->jobs[0] = job0;
+	shared_ctl->jobs[1] = job1;
+	shared_ctl->busy = 1;
+	gp2x_memregs[0x3B3E>>1] = 0xffff; // cause an IRQ for 940
+}
+
+
+/*
+ * Copy the rest of stream f to dest, going through the bounce buffer tmpbuf
+ * (tmpsize bytes) one chunk at a time.
+ * Returns the total number of bytes stored at dest. The caller has to make
+ * sure dest is large enough; no limit is enforced here.
+ */
+static int read_to_upper(void *dest, void *tmpbuf, int tmpsize, FILE *f)
+{
+	unsigned char *out = (unsigned char *)dest;
+	int total = 0;
+	int chunk;
+
+	/* a short or failed read (<= 0) ends the transfer */
+	while ((chunk = fread(tmpbuf, 1, tmpsize, f)) > 0)
+	{
+		memcpy(out + total, tmpbuf, chunk);
+		total += chunk;
+	}
+
+	return total;
+}
+
+/*
+ * Spin until the microsecond counter reaches lim_time.
+ * Time is measured relative to the start of second 'thissec': once the wall
+ * clock's second differs from it, 1000000 is added to the microsecond value.
+ * At least one delay/clock read is always performed.
+ */
+static void simpleWait(int thissec, int lim_time)
+{
+	struct timeval now;
+
+	do
+	{
+		spend_cycles(1024);
+		gettimeofday(&now, 0);
+		if (now.tv_sec != thissec)
+			now.tv_usec += 1000000;
+	}
+	while (now.tv_usec < lim_time);
+}
+
+
+char **g_argv;
+
+/* none of the functions in this file should be called before this one */
+/*
+ * Bring up the 940 core and make it initialize its YM2612 and mp3 decoder.
+ *
+ * baseclock, rate: ym2612 settings handed to the 940 through shared_ctl.
+ *
+ * On the first call the shared memory and the mp3 upload area are mapped.
+ * Whenever the 940 is marked as crashed (which includes the first call),
+ * code940.bin is loaded from the directory of the running executable to the
+ * start of the 940 bank. The shared areas are then cleared, a
+ * JOB940_INITALL job is queued and the 940 is released from reset.
+ * Exits the process if mapping or loading fails.
+ */
+void YM2612Init_940(int baseclock, int rate)
+{
+	printf("YM2612Init_940()\n");
+	/* sizeof yields size_t, cast it to match the %i conversions */
+	printf("Mem usage: shared_data: %i, shared_ctl: %i\n", (int) sizeof(*shared_data), (int) sizeof(*shared_ctl));
+
+	/* hold the 940 in reset while its memory is being set up */
+	Reset940(1, 2);
+	Pause940(1);
+
+	gp2x_memregs[0x3B46>>1] = 0xffff; // clear pending DUALCPU interrupts for 940
+	gp2x_memregs[0x3B42>>1] = 0xffff; // enable DUALCPU interrupts for 940
+
+	gp2x_memregl[0x4508>>2] = ~(1<<26); // unmask DUALCPU ints in the undocumented 940's interrupt controller
+
+	if (shared_mem == NULL)
+	{
+		shared_mem = (unsigned char *) mmap(0, 0x210000, PROT_READ|PROT_WRITE, MAP_SHARED, memdev, 0x2000000);
+		if(shared_mem == MAP_FAILED)
+		{
+			printf("mmap(shared_data) failed with %i\n", errno);
+			exit(1);
+		}
+		shared_data = (_940_data_t *) (shared_mem+0x100000);
+		/* this area must not get buffered on either side */
+		shared_ctl =  (_940_ctl_t *)  (shared_mem+0x200000);
+		mp3_mem = (unsigned char *) mmap(0, MP3_SIZE_MAX, PROT_READ|PROT_WRITE, MAP_SHARED, memdev, 0x3000000);
+		if (mp3_mem == MAP_FAILED)
+		{
+			printf("mmap(mp3_mem) failed with %i\n", errno);
+			exit(1);
+		}
+		/* nothing has been uploaded yet: force the code load below */
+		crashed_940 = 1;
+	}
+
+	if (crashed_940)
+	{
+		unsigned char ucData[1024];
+		char binpath[1024];
+		const char *self = g_argv[0];
+		const char *slash = strrchr(self, '/');
+		int len;
+		FILE *fp;
+
+		/* code940.bin lives next to the executable. Build the path with a
+		 * bounded formatter: the old strncpy+strcat pair could write past
+		 * binpath, missed a slash at index 0 and produced
+		 * "<progname>/code940.bin" when argv[0] had no directory part. */
+		if (slash != NULL)
+			len = snprintf(binpath, sizeof(binpath), "%.*s/code940.bin", (int) (slash - self), self);
+		else
+			len = snprintf(binpath, sizeof(binpath), "code940.bin");
+		if (len < 0 || len >= (int) sizeof(binpath))
+		{
+			printf("path to code940.bin is too long\n");
+			exit(1);
+		}
+
+		fp = fopen(binpath, "rb");
+		if(!fp)
+		{
+			printf("failed to open %s\n", binpath);
+			exit(1);
+		}
+
+		/* starting the 940 without any code uploaded would be pointless */
+		if (read_to_upper(shared_mem, ucData, sizeof(ucData), fp) <= 0)
+		{
+			printf("failed to read %s\n", binpath);
+			fclose(fp);
+			exit(1);
+		}
+		fclose(fp);
+		crashed_940 = 0;
+	}
+
+	memset(shared_data, 0, sizeof(*shared_data));
+	memset(shared_ctl,  0, sizeof(*shared_ctl));
+
+	/* now cause 940 to init it's ym2612 stuff */
+	shared_ctl->baseclock = baseclock;
+	shared_ctl->rate = rate;
+	shared_ctl->jobs[0] = JOB940_INITALL;
+	shared_ctl->jobs[1] = 0;
+	shared_ctl->busy = 1;
+
+	/* start the 940 */
+	Reset940(0, 2);
+	Pause940(0);
+}
+
+
+unsigned char *mp3_data = 0;
+
+/*
+ * Host-side counterpart of the 940's JOB940_MP3DECODE: decode the next mp3
+ * frame from mp3_data into the output buffer selected by
+ * shared_ctl->mp3_buffsel and advance shared_ctl->mp3_offs past it.
+ * At end of data mp3_offs is set to mp3_len. Decoder errors are counted in
+ * mp3_errors and the last one is kept in mp3_lasterr.
+ */
+void local_decode(void)
+{
+	int start = shared_ctl->mp3_offs;
+	int remaining = shared_ctl->mp3_len - start;
+	unsigned char *cursor = mp3_data + start;
+	int sync;	// distance from cursor to the next frame header
+	int status = 0;
+
+	// EOF, nothing to do
+	if (remaining <= 0)
+		return;
+
+	sync = MP3FindSyncWord(cursor, remaining);
+	if (sync < 0)
+	{
+		// no more frames: EOF
+		shared_ctl->mp3_offs = shared_ctl->mp3_len;
+		return;
+	}
+	cursor += sync;
+	remaining -= sync;
+
+	status = MP3Decode(shared_data->mp3dec, &cursor, &remaining,
+			shared_data->mp3_buffer[shared_ctl->mp3_buffsel], 0);
+	if (status != 0)
+	{
+		if (status == ERR_MP3_INDATA_UNDERFLOW)
+		{
+			// truncated last frame: EOF
+			shared_ctl->mp3_offs = shared_ctl->mp3_len;
+			return;
+		}
+		if (status >= -12 && status <= -6)
+		{
+			// ERR_MP3_INVALID_FRAMEHEADER, ERR_MP3_INVALID_*
+			// step over one byte so the next sync search skips the bad frame
+			cursor++;
+		}
+		shared_ctl->mp3_errors++;
+		shared_ctl->mp3_lasterr = status;
+	}
+
+	shared_ctl->mp3_offs = cursor - mp3_data;
+}
+
+
+void gp2x_sound_sync(void);
+
+#define USE_LOCAL 0
+#define BENCHMARK 0
+
+/*
+ * mp3 playback test: loads the file given on the command line into the mp3
+ * area shared with the 940, then loops at DESIRED_FPS frames per second,
+ * having the 940 decode frames into the two mp3 buffers and feeding the
+ * sound device one video frame worth of samples per iteration.
+ * The loop never ends on its own. Returns 1 for usage, open or size errors.
+ */
+int main(int argc, char *argv[])
+{
+	FILE *f;
+	int size;
+	struct timeval tval; // timing
+	int thissec = 0, fps = 0;
+	int target_frametime, frame_samples, samples_ready, mp3_buffer_offs, play_bufsel;
+	unsigned char play_buffer[44100/50*2*2];
+
+	if (argc != 2) {
+		printf("usage: %s <mp3file>\n", argv[0]);
+		return 1;
+	}
+
+	g_argv = argv;
+
+	gp2x_init();
+	YM2612Init_940(123, 44100);
+
+	// load a mp3
+	f = fopen(argv[1], "rb");
+	if (!f) {
+		printf("can't open %s\n", argv[1]);
+		return 1;
+	}
+
+	/* ftell() reports failure as -1; without this check a negative size
+	 * would be handed to fread() as a huge unsigned count */
+	if (fseek(f, 0, SEEK_END) != 0 || (size = (int) ftell(f)) < 0) {
+		printf("can't get size of %s, errno=%i\n", argv[1], errno);
+		fclose(f);
+		return 1;
+	}
+	if (size > MP3_SIZE_MAX) {
+		printf("size %i > %i\n", size, MP3_SIZE_MAX);
+		size = MP3_SIZE_MAX;
+	}
+
+	fseek(f, 0, SEEK_SET);
+	if (fread(mp3_mem, 1, size, f) != (size_t) size) {
+		printf("read failed, errno=%i\n", errno);
+		fclose(f);
+		exit(1);
+	}
+	fclose(f);
+	shared_ctl->mp3_len = size;
+
+#if USE_LOCAL
+	shared_data->mp3dec = MP3InitDecoder();
+	mp3_data = malloc(size);
+	printf("init: dec: %p ptr: %p\n", shared_data->mp3dec, mp3_data);
+	if (!mp3_data) {
+		printf("low mem\n");
+		exit(1);
+	}
+	memcpy(mp3_data, mp3_mem, size);
+#else
+	//printf("YM2612UpdateOne_940()\n");
+	/* let the init job queued by YM2612Init_940() finish first */
+	if (shared_ctl->busy) wait_busy_940();
+#endif
+
+	gp2x_start_sound(44100, 16, 1);
+
+	#define DESIRED_FPS 50
+	target_frametime = 1000000/DESIRED_FPS;
+	frame_samples = 44100/DESIRED_FPS;
+	samples_ready = mp3_buffer_offs = 0;
+	play_bufsel = 1;
+
+	for (;; fps++)
+	{
+		int lim_time;
+
+		gettimeofday(&tval, 0);
+		if (tval.tv_sec != thissec)
+		{
+			printf("fps: %i\n", fps);
+			thissec = tval.tv_sec;
+			fps = 0;
+#if BENCHMARK
+			shared_ctl->mp3_offs = 0;
+#endif
+		}
+
+#if 0
+		// decode
+#if USE_LOCAL
+		shared_ctl->mp3_buffsel ^= 1;
+		local_decode();
+#else
+		wait_busy_940();
+		shared_ctl->mp3_buffsel ^= 1;
+		add_job_940(JOB940_MP3DECODE, 0);
+#endif
+
+		if (shared_ctl->mp3_lasterr) {
+			printf("mp3_lasterr #%i: %i size: %i offs: %i\n", shared_ctl->mp3_errors, shared_ctl->mp3_lasterr,
+				shared_ctl->mp3_len, shared_ctl->mp3_offs);
+			printf("loopc: %i bytes: %08x\n",
+				shared_ctl->loopc, *(int *)(mp3_mem+shared_ctl->mp3_offs));
+			shared_ctl->mp3_lasterr = 0;
+		}
+
+#if !BENCHMARK
+		// play
+		gp2x_sound_sync();
+		gp2x_sound_write(shared_data->mp3_buffer[shared_ctl->mp3_buffsel^1], 1152*2*2);
+#endif
+#else
+		/* time (in us since the start of this second) at which the
+		 * current frame should end */
+		lim_time = (fps+1) * target_frametime;
+
+		wait_busy_940();
+
+		// decode, play
+		if (samples_ready >= frame_samples) {
+			/* offsets are in stereo sample pairs: 2 shorts = 4 bytes each */
+			if (1152 - mp3_buffer_offs >= frame_samples) {
+				memcpy(play_buffer, shared_data->mp3_buffer[play_bufsel] + mp3_buffer_offs*2,
+					frame_samples*2*2);
+				mp3_buffer_offs += frame_samples;
+			} else {
+				// collect from both buffers..
+				int left = 1152 - mp3_buffer_offs;
+				memcpy(play_buffer, shared_data->mp3_buffer[play_bufsel] + mp3_buffer_offs*2,
+					left*2*2);
+				play_bufsel ^= 1;
+				mp3_buffer_offs = frame_samples - left;
+				memcpy(play_buffer + left*2*2, shared_data->mp3_buffer[play_bufsel],
+					mp3_buffer_offs*2*2);
+			}
+			gp2x_sound_write(play_buffer, frame_samples*2*2);
+			samples_ready -= frame_samples;
+		}
+
+		// make sure we will have enough samples next frame
+		if (samples_ready < frame_samples) {
+//			wait_busy_940();
+			shared_ctl->mp3_buffsel ^= 1;
+			add_job_940(JOB940_MP3DECODE, 0);
+			samples_ready += 1152;
+		}
+
+		gettimeofday(&tval, 0);
+		if(thissec != tval.tv_sec) tval.tv_usec+=1000000;
+		if(tval.tv_usec < lim_time)
+		{
+			// we are too fast
+			simpleWait(thissec, lim_time);
+		}
+#endif
+	}
+
+	return 0;
+}
+
--- a/platform/gp2x/code940/uClibc/README
+++ b/platform/gp2x/code940/uClibc/README
@ -0,0 +1,16 @@
+The routines included in this math library are derived from
+Apple's MacOS X/Darwin math library, which was
+itself swiped from FreeBSD.  The original copyright information
+is as follows:
+
+	Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+
+	Developed at SunPro, a Sun Microsystems, Inc. business.
+	Permission to use, copy, modify, and distribute this
+	software is freely granted, provided that this notice
+	is preserved.
+
+It has been ported to work with uClibc and generally behave
+by Erik Andersen <andersen@codepoet.org>
+  22 May, 2001
+
--- a/platform/gp2x/code940/uClibc/e_log.c
+++ b/platform/gp2x/code940/uClibc/e_log.c
@ -0,0 +1,147 @@
+/* @(#)e_log.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: e_log.c,v 1.8 1995/05/10 20:45:49 jtc Exp $";
+#endif
+
+/* __ieee754_log(x)
+ * Return the logarithm of x
+ *
+ * Method :
+ *   1. Argument Reduction: find k and f such that
+ *			x = 2^k * (1+f),
+ *	   where  sqrt(2)/2 < 1+f < sqrt(2) .
+ *
+ *   2. Approximation of log(1+f).
+ *	Let s = f/(2+f) ; based on log(1+f) = log(1+s) - log(1-s)
+ *		 = 2s + 2/3 s**3 + 2/5 s**5 + .....,
+ *	     	 = 2s + s*R
+ *      We use a special Remez algorithm on [0,0.1716] to generate
+ * 	a polynomial of degree 14 to approximate R. The maximum error
+ *	of this polynomial approximation is bounded by 2**-58.45. In
+ *	other words,
+ *		        2      4      6      8      10      12      14
+ *	    R(z) ~ Lg1*s +Lg2*s +Lg3*s +Lg4*s +Lg5*s  +Lg6*s  +Lg7*s
+ *  	(the values of Lg1 to Lg7 are listed in the program)
+ *	and
+ *	    |      2          14          |     -58.45
+ *	    | Lg1*s +...+Lg7*s    -  R(z) | <= 2
+ *	    |                             |
+ *	Note that 2s = f - s*f = f - hfsq + s*hfsq, where hfsq = f*f/2.
+ *	In order to guarantee error in log below 1ulp, we compute log
+ *	by
+ *		log(1+f) = f - s*(f - R)	(if f is not too large)
+ *		log(1+f) = f - (hfsq - s*(hfsq+R)).	(better accuracy)
+ *
+ *	3. Finally,  log(x) = k*ln2 + log(1+f).
+ *			    = k*ln2_hi+(f-(hfsq-(s*(hfsq+R)+k*ln2_lo)))
+ *	   Here ln2 is split into two floating point numbers:
+ *			ln2_hi + ln2_lo,
+ *	   where n*ln2_hi is always exact for |n| < 2000.
+ *
+ * Special cases:
+ *	log(x) is NaN with signal if x < 0 (including -INF) ;
+ *	log(+INF) is +INF; log(0) is -INF with signal;
+ *	log(NaN) is that NaN with no signal.
+ *
+ * Accuracy:
+ *	according to an error analysis, the error is always less than
+ *	1 ulp (unit in the last place).
+ *
+ * Constants:
+ * The hexadecimal values are the intended ones for the following
+ * constants. The decimal values may be used, provided that the
+ * compiler will convert from decimal to binary accurately enough
+ * to produce the hexadecimal values shown.
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+/*
+ * Constants used by __ieee754_log (see the method description above):
+ *   ln2_hi, ln2_lo  the two-piece split of ln2 used in step 3
+ *   two54           2**54, used to scale subnormal arguments up
+ *   Lg1..Lg7        coefficients of the polynomial approximating R
+ * Without __STDC__ they are declared as plain (non-const) statics.
+ */
+#ifdef __STDC__
+static const double
+#else
+static double
+#endif
+ln2_hi  =  6.93147180369123816490e-01,	/* 3fe62e42 fee00000 */
+ln2_lo  =  1.90821492927058770002e-10,	/* 3dea39ef 35793c76 */
+two54   =  1.80143985094819840000e+16,  /* 43500000 00000000 */
+Lg1 = 6.666666666666735130e-01,  /* 3FE55555 55555593 */
+Lg2 = 3.999999999940941908e-01,  /* 3FD99999 9997FA04 */
+Lg3 = 2.857142874366239149e-01,  /* 3FD24924 94229359 */
+Lg4 = 2.222219843214978396e-01,  /* 3FCC71C5 1D8E78AF */
+Lg5 = 1.818357216161805012e-01,  /* 3FC74664 96CB03DE */
+Lg6 = 1.531383769920937332e-01,  /* 3FC39A09 D078C69F */
+Lg7 = 1.479819860511658591e-01;  /* 3FC2F112 DF3E5244 */
+
+/* zero is used as a divisor below to produce -inf and NaN results */
+#ifdef __STDC__
+static const double zero   =  0.0;
+#else
+static double zero   =  0.0;
+#endif
+
+/*
+ * __ieee754_log(x): natural logarithm of x, computed with the argument
+ * reduction and polynomial approximation described at the top of this
+ * file.
+ *
+ * Returns -inf for x == +-0, NaN for x < 0, and x+x when x is +inf or
+ * NaN; otherwise k*ln2 + log(1+f), where x = 2^k * (1+f).
+ */
+#ifdef __STDC__
+	double __ieee754_log(double x)
+#else
+	double __ieee754_log(x)
+	double x;
+#endif
+{
+	double hfsq,f,s,z,R,w,t1,t2,dk;
+	int32_t k,hx,i,j;
+	u_int32_t lx;
+
+	/* hx, lx: high and low 32-bit words of x (macro from math_private.h) */
+	EXTRACT_WORDS(hx,lx,x);
+
+	k=0;
+	/* this branch also catches every x with the sign bit set (hx < 0) */
+	if (hx < 0x00100000) {			/* x < 2**-1022  */
+	    if (((hx&0x7fffffff)|lx)==0)
+		return -two54/zero;		/* log(+-0)=-inf */
+	    if (hx<0) return (x-x)/zero;	/* log(-#) = NaN */
+	    k -= 54; x *= two54; /* subnormal number, scale up x */
+	    GET_HIGH_WORD(hx,x);
+	}
+	/* exponent field all ones: x is +inf or NaN */
+	if (hx >= 0x7ff00000) return x+x;
+	k += (hx>>20)-1023;		/* add the unbiased exponent */
+	hx &= 0x000fffff;		/* keep only the mantissa bits of the high word */
+	/* i is 0x100000 when hx+0x95f64 carries into bit 20, else 0 */
+	i = (hx+0x95f64)&0x100000;
+	/* exponent field becomes 0x3ff when i==0 and 0x3fe when i==0x100000 */
+	SET_HIGH_WORD(x,hx|(i^0x3ff00000));	/* normalize x or x/2 */
+	k += (i>>20);			/* compensate k when x was halved */
+	f = x-1.0;
+	if((0x000fffff&(2+hx))<3) {	/* |f| < 2**-20 */
+	    /* f == 0: the result is exactly k*ln2 */
+	    if(f==zero) {if(k==0) return zero;  else {dk=(double)k;
+				 return dk*ln2_hi+dk*ln2_lo;}
+	    }
+	    /* tiny f: use the short series f - f*f/2 + f*f*f/3 */
+	    R = f*f*(0.5-0.33333333333333333*f);
+	    if(k==0) return f-R; else {dk=(double)k;
+	    	     return dk*ln2_hi-((R-dk*ln2_lo)-f);}
+	}
+ 	s = f/(2.0+f);
+	dk = (double)k;
+	z = s*s;
+	i = hx-0x6147a;
+	w = z*z;
+	j = 0x6b851-hx;
+	/* R is evaluated as two interleaved halves: t1 holds the Lg2/Lg4/Lg6
+	   terms and t2 the Lg1/Lg3/Lg5/Lg7 terms */
+	t1= w*(Lg2+w*(Lg4+w*Lg6));
+	t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+	/* i|j is positive only when both hx-0x6147a and 0x6b851-hx are
+	   non-negative, i.e. hx lies in [0x6147a, 0x6b851] */
+	i |= j;
+	R = t2+t1;
+	if(i>0) {
+	    /* the "better accuracy" form from the header, using hfsq = f*f/2 */
+	    hfsq=0.5*f*f;
+	    if(k==0) return f-(hfsq-s*(hfsq+R)); else
+		     return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
+	} else {
+	    /* the simpler form log(1+f) = f - s*(f-R) */
+	    if(k==0) return f-s*(f-R); else
+		     return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
+	}
+}
--- a/platform/gp2x/code940/uClibc/e_pow.c
+++ b/platform/gp2x/code940/uClibc/e_pow.c
@ -0,0 +1,308 @@
+/* @(#)e_pow.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: e_pow.c,v 1.9 1995/05/12 04:57:32 jtc Exp $";
+#endif
+
+/* __ieee754_pow(x,y) return x**y
+ *
+ *		      n
+ * Method:  Let x =  2   * (1+f)
+ *	1. Compute and return log2(x) in two pieces:
+ *		log2(x) = w1 + w2,
+ *	   where w1 has 53-24 = 29 bit trailing zeros.
+ *	2. Perform y*log2(x) = n+y' by simulating multi-precision
+ *	   arithmetic, where |y'|<=0.5.
+ *	3. Return x**y = 2**n*exp(y'*log2)
+ *
+ * Special cases:
+ *	1.  (anything) ** 0  is 1
+ *	2.  (anything) ** 1  is itself
+ *	3.  (anything) ** NAN is NAN
+ *	4.  NAN ** (anything except 0) is NAN
+ *	5.  +-(|x| > 1) **  +INF is +INF
+ *	6.  +-(|x| > 1) **  -INF is +0
+ *	7.  +-(|x| < 1) **  +INF is +0
+ *	8.  +-(|x| < 1) **  -INF is +INF
+ *	9.  +-1         ** +-INF is NAN
+ *	10. +0 ** (+anything except 0, NAN)               is +0
+ *	11. -0 ** (+anything except 0, NAN, odd integer)  is +0
+ *	12. +0 ** (-anything except 0, NAN)               is +INF
+ *	13. -0 ** (-anything except 0, NAN, odd integer)  is +INF
+ *	14. -0 ** (odd integer) = -( +0 ** (odd integer) )
+ *	15. +INF ** (+anything except 0,NAN) is +INF
+ *	16. +INF ** (-anything except 0,NAN) is +0
+ *	17. -INF ** (anything)  = -0 ** (-anything)
+ *	18. (-anything) ** (integer) is (-1)**(integer)*(+anything**integer)
+ *	19. (-anything except 0 and inf) ** (non-integer) is NAN
+ *
+ * Accuracy:
+ *	pow(x,y) returns x**y nearly rounded. In particular
+ *			pow(integer,integer)
+ *	always returns the correct integer provided it is
+ *	representable.
+ *
+ * Constants :
+ * The hexadecimal values are the intended ones for the following
+ * constants. The decimal values may be used, provided that the
+ * compiler will convert from decimal to binary accurately enough
+ * to produce the hexadecimal values shown.
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+/*
+ * Constants used by __ieee754_pow:
+ *   bp[k]              the two breakpoints 1.0 and 1.5 used for s = (x-bp)/(x+bp)
+ *   dp_h[k], dp_l[k]   high/low parts added to log2(ax) for breakpoint k
+ *                      (dp_h[1]+dp_l[1] is approximately log2(1.5))
+ *   two53              2**53, used to scale subnormal arguments up
+ *   huge, tiny         multiplied together to produce overflow/underflow results
+ *   L1..L6             polynomial coefficients for the log computation
+ *   P1..P5             polynomial coefficients used in the final 2**z step
+ *   lg2, lg2_h, lg2_l  ln2 and its high/low split
+ *   cp, cp_h, cp_l     2/(3ln2) and its high/low split
+ *   ivln2, ivln2_h/_l  1/ln2 and its high/low split
+ * Without __STDC__ they are declared as plain (non-const) statics.
+ */
+#ifdef __STDC__
+static const double
+#else
+static double
+#endif
+bp[] = {1.0, 1.5,},
+dp_h[] = { 0.0, 5.84962487220764160156e-01,}, /* 0x3FE2B803, 0x40000000 */
+dp_l[] = { 0.0, 1.35003920212974897128e-08,}, /* 0x3E4CFDEB, 0x43CFD006 */
+zero    =  0.0,
+one	=  1.0,
+two	=  2.0,
+two53	=  9007199254740992.0,	/* 0x43400000, 0x00000000 */
+huge	=  1.0e300,
+tiny    =  1.0e-300,
+	/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3) */
+L1  =  5.99999999999994648725e-01, /* 0x3FE33333, 0x33333303 */
+L2  =  4.28571428578550184252e-01, /* 0x3FDB6DB6, 0xDB6FABFF */
+L3  =  3.33333329818377432918e-01, /* 0x3FD55555, 0x518F264D */
+L4  =  2.72728123808534006489e-01, /* 0x3FD17460, 0xA91D4101 */
+L5  =  2.30660745775561754067e-01, /* 0x3FCD864A, 0x93C9DB65 */
+L6  =  2.06975017800338417784e-01, /* 0x3FCA7E28, 0x4A454EEF */
+P1   =  1.66666666666666019037e-01, /* 0x3FC55555, 0x5555553E */
+P2   = -2.77777777770155933842e-03, /* 0xBF66C16C, 0x16BEBD93 */
+P3   =  6.61375632143793436117e-05, /* 0x3F11566A, 0xAF25DE2C */
+P4   = -1.65339022054652515390e-06, /* 0xBEBBBD41, 0xC5D26BF1 */
+P5   =  4.13813679705723846039e-08, /* 0x3E663769, 0x72BEA4D0 */
+lg2  =  6.93147180559945286227e-01, /* 0x3FE62E42, 0xFEFA39EF */
+lg2_h  =  6.93147182464599609375e-01, /* 0x3FE62E43, 0x00000000 */
+lg2_l  = -1.90465429995776804525e-09, /* 0xBE205C61, 0x0CA86C39 */
+ovt =  8.0085662595372944372e-0017, /* -(1024-log2(ovfl+.5ulp)) */
+cp    =  9.61796693925975554329e-01, /* 0x3FEEC709, 0xDC3A03FD =2/(3ln2) */
+cp_h  =  9.61796700954437255859e-01, /* 0x3FEEC709, 0xE0000000 =(float)cp */
+cp_l  = -7.02846165095275826516e-09, /* 0xBE3E2FE0, 0x145B01F5 =tail of cp_h*/
+ivln2    =  1.44269504088896338700e+00, /* 0x3FF71547, 0x652B82FE =1/ln2 */
+ivln2_h  =  1.44269502162933349609e+00, /* 0x3FF71547, 0x60000000 =24b 1/ln2*/
+ivln2_l  =  1.92596299112661746887e-08; /* 0x3E54AE0B, 0xF85DDF44 =1/ln2 tail*/
+
+/*
+ * __ieee754_pow(x,y): return x raised to the power y.
+ *
+ * The special cases listed at the top of this file are handled first.
+ * The general path computes log2(|x|) as t1+t2, multiplies it by y in
+ * two pieces (p_h+p_l), and then evaluates 2**(p_h+p_l). The result is
+ * negated when x < 0 and y is an odd integer.
+ */
+#ifdef __STDC__
+	double __ieee754_pow(double x, double y)
+#else
+	double __ieee754_pow(x,y)
+	double x, y;
+#endif
+{
+	double z,ax,z_h,z_l,p_h,p_l;
+	double y1,t1,t2,r,s,t,u,v,w;
+	int32_t i,j,k,yisint,n;
+	int32_t hx,hy,ix,iy;
+	u_int32_t lx,ly;
+
+	/* hx/lx and hy/ly: high and low 32-bit words of x and y;
+	   ix, iy: the high words with the sign bit cleared */
+	EXTRACT_WORDS(hx,lx,x);
+	EXTRACT_WORDS(hy,ly,y);
+	ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
+
+    /* y==zero: x**0 = 1 */
+	if((iy|ly)==0) return one;
+
+    /* +-NaN return x+y */
+	if(ix > 0x7ff00000 || ((ix==0x7ff00000)&&(lx!=0)) ||
+	   iy > 0x7ff00000 || ((iy==0x7ff00000)&&(ly!=0)))
+		return x+y;
+
+    /* determine if y is an odd int when x < 0
+     * yisint = 0	... y is not an integer
+     * yisint = 1	... y is an odd int
+     * yisint = 2	... y is an even int
+     */
+	yisint  = 0;
+	if(hx<0) {
+	    if(iy>=0x43400000) yisint = 2; /* even integer y */
+	    else if(iy>=0x3ff00000) {
+		k = (iy>>20)-0x3ff;	   /* exponent */
+		if(k>20) {
+		    /* fraction bits reach into the low word: y is an integer
+		       when the low 52-k bits of ly are all zero */
+		    j = ly>>(52-k);
+		    if((j<<(52-k))==ly) yisint = 2-(j&1);
+		} else if(ly==0) {
+		    /* all fraction bits are in the high word */
+		    j = iy>>(20-k);
+		    if((j<<(20-k))==iy) yisint = 2-(j&1);
+		}
+	    }
+	}
+
+    /* special value of y */
+	if(ly==0) {
+	    if (iy==0x7ff00000) {	/* y is +-inf */
+	        if(((ix-0x3ff00000)|lx)==0)
+		    return  y - y;	/* (+-1)**+-inf is NaN */
+	        else if (ix >= 0x3ff00000)/* (|x|>1)**+-inf = inf,0 */
+		    return (hy>=0)? y: zero;
+	        else			/* (|x|<1)**-,+inf = inf,0 */
+		    return (hy<0)?-y: zero;
+	    }
+	    if(iy==0x3ff00000) {	/* y is  +-1 */
+		if(hy<0) return one/x; else return x;
+	    }
+	    if(hy==0x40000000) return x*x; /* y is  2 */
+	    if(hy==0x3fe00000) {	/* y is  0.5 */
+		if(hx>=0)	/* x >= +0 */
+		return __ieee754_sqrt(x);
+	    }
+	}
+
+	ax   = fabs(x);
+    /* special value of x */
+	if(lx==0) {
+	    if(ix==0x7ff00000||ix==0||ix==0x3ff00000){
+		z = ax;			/*x is +-0,+-inf,+-1*/
+		if(hy<0) z = one/z;	/* z = (1/|x|) */
+		if(hx<0) {
+		    if(((ix-0x3ff00000)|yisint)==0) {
+			z = (z-z)/(z-z); /* (-1)**non-int is NaN */
+		    } else if(yisint==1)
+			z = -z;		/* (x<0)**odd = -(|x|**odd) */
+		}
+		return z;
+	    }
+	}
+
+    /* (x<0)**(non-int) is NaN */
+	/* ((u_int32_t)hx>>31)-1 is 0 only when the sign bit of x is set,
+	   so the test is true only for x < 0 with yisint == 0 */
+	if(((((u_int32_t)hx>>31)-1)|yisint)==0) return (x-x)/(x-x);
+
+    /* |y| is huge */
+	if(iy>0x41e00000) { /* if |y| > 2**31 */
+	    if(iy>0x43f00000){	/* if |y| > 2**64, must o/uflow */
+		if(ix<=0x3fefffff) return (hy<0)? huge*huge:tiny*tiny;
+		if(ix>=0x3ff00000) return (hy>0)? huge*huge:tiny*tiny;
+	    }
+	/* over/underflow if x is not close to one */
+	    if(ix<0x3fefffff) return (hy<0)? huge*huge:tiny*tiny;
+	    if(ix>0x3ff00000) return (hy>0)? huge*huge:tiny*tiny;
+	/* now |1-x| is tiny <= 2**-20, suffice to compute
+	   log(x) by x-x^2/2+x^3/3-x^4/4 */
+	    t = x-1;		/* t has 20 trailing zeros */
+	    w = (t*t)*(0.5-t*(0.3333333333333333333333-t*0.25));
+	    u = ivln2_h*t;	/* ivln2_h has 21 sig. bits */
+	    v = t*ivln2_l-w*ivln2;
+	    /* t1 = high part (low word cleared), t2 = remaining low part */
+	    t1 = u+v;
+	    SET_LOW_WORD(t1,0);
+	    t2 = v-(t1-u);
+	} else {
+	    double s2,s_h,s_l,t_h,t_l;
+	    n = 0;
+	/* take care subnormal number */
+	    if(ix<0x00100000)
+		{ax *= two53; n -= 53; GET_HIGH_WORD(ix,ax); }
+	    n  += ((ix)>>20)-0x3ff;	/* n = unbiased exponent of ax */
+	    j  = ix&0x000fffff;		/* j = mantissa bits of the high word */
+	/* determine interval */
+	    ix = j|0x3ff00000;		/* normalize ix */
+	    if(j<=0x3988E) k=0;		/* |x|<sqrt(3/2) */
+	    else if(j<0xBB67A) k=1;	/* |x|<sqrt(3)   */
+	    else {k=0;n+=1;ix -= 0x00100000;}
+	    SET_HIGH_WORD(ax,ix);
+
+	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+	    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
+	    v = one/(ax+bp[k]);
+	    s = u*v;
+	    s_h = s;
+	    SET_LOW_WORD(s_h,0);
+	/* t_h=ax+bp[k] High */
+	    t_h = zero;
+	    SET_HIGH_WORD(t_h,((ix>>1)|0x20000000)+0x00080000+(k<<18));
+	    t_l = ax - (t_h-bp[k]);
+	    s_l = v*((u-s_h*t_h)-s_h*t_l);
+	/* compute log(ax) */
+	    s2 = s*s;
+	    r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+	    r += s_l*(s_h+s);
+	    s2  = s_h*s_h;
+	    t_h = 3.0+s2+r;
+	    SET_LOW_WORD(t_h,0);
+	    t_l = r-((t_h-3.0)-s2);
+	/* u+v = s*(1+...) */
+	    u = s_h*t_h;
+	    v = s_l*t_h+t_l*s;
+	/* 2/(3log2)*(s+...) */
+	    p_h = u+v;
+	    SET_LOW_WORD(p_h,0);
+	    p_l = v-(p_h-u);
+	    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
+	    z_l = cp_l*p_h+p_l*cp+dp_l[k];
+	/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+	    t = (double)n;
+	    t1 = (((z_h+z_l)+dp_h[k])+t);
+	    SET_LOW_WORD(t1,0);
+	    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+	}
+
+	s = one; /* s (sign of result -ve**odd) = -1 else = 1 */
+	/* true only when the sign bit of x is set and yisint == 1 */
+	if(((((u_int32_t)hx>>31)-1)|(yisint-1))==0)
+	    s = -one;/* (-ve)**(odd int) */
+
+    /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
+	y1  = y;
+	SET_LOW_WORD(y1,0);
+	p_l = (y-y1)*t1+y*t2;
+	p_h = y1*t1;
+	z = p_l+p_h;
+	/* here j and i receive the high and low words of z */
+	EXTRACT_WORDS(j,i,z);
+	if (j>=0x40900000) {				/* z >= 1024 */
+	    if(((j-0x40900000)|i)!=0)			/* if z > 1024 */
+		return s*huge*huge;			/* overflow */
+	    else {
+		if(p_l+ovt>z-p_h) return s*huge*huge;	/* overflow */
+	    }
+	} else if((j&0x7fffffff)>=0x4090cc00 ) {	/* z <= -1075 */
+	    if(((j-0xc090cc00)|i)!=0) 		/* z < -1075 */
+		return s*tiny*tiny;		/* underflow */
+	    else {
+		if(p_l<=z-p_h) return s*tiny*tiny;	/* underflow */
+	    }
+	}
+    /*
+     * compute 2**(p_h+p_l)
+     */
+	i = j&0x7fffffff;
+	k = (i>>20)-0x3ff;
+	n = 0;
+	if(i>0x3fe00000) {		/* if |z| > 0.5, set n = [z+0.5] */
+	    n = j+(0x00100000>>(k+1));
+	    k = ((n&0x7fffffff)>>20)-0x3ff;	/* new k for n */
+	    t = zero;
+	    SET_HIGH_WORD(t,n&~(0x000fffff>>k));
+	    n = ((n&0x000fffff)|0x00100000)>>(20-k);
+	    if(j<0) n = -n;
+	    p_h -= t;		/* remove the integer part t from p_h */
+	}
+	t = p_l+p_h;
+	SET_LOW_WORD(t,0);
+	u = t*lg2_h;
+	v = (p_l-(t-p_h))*lg2+t*lg2_l;
+	z = u+v;
+	w = v-(z-u);
+	t  = z*z;
+	t1  = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+	r  = (z*t1)/(t1-two)-(w+z*w);
+	z  = one-(r-z);
+	/* scale z by 2**n by adding n to the exponent field of its high word */
+	GET_HIGH_WORD(j,z);
+	j += (n<<20);
+	if((j>>20)<=0) z = scalbn(z,n);	/* subnormal output */
+	else SET_HIGH_WORD(z,j);
+	return s*z;
+}
--- a/platform/gp2x/code940/uClibc/e_rem_pio2.c
+++ b/platform/gp2x/code940/uClibc/e_rem_pio2.c
@ -0,0 +1,183 @@
+/* @(#)e_rem_pio2.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: e_rem_pio2.c,v 1.8 1995/05/10 20:46:02 jtc Exp $";
+#endif
+
+/* __ieee754_rem_pio2(x,y)
+ *
+ * return the remainder of x rem pi/2 in y[0]+y[1]
+ * use __kernel_rem_pio2()
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+/*
+ * Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
+ */
+/* two_over_pi: 66 entries of six hex digits each; passed to
+   __kernel_rem_pio2() for large arguments */
+#ifdef __STDC__
+static const int32_t two_over_pi[] = {
+#else
+static int32_t two_over_pi[] = {
+#endif
+0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
+0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
+0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
+0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
+0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
+0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
+0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
+0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
+0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
+0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
+0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
+};
+
+/* npio2_hw: 32 high words, indexed by n-1 in the medium-size path of
+   __ieee754_rem_pio2 to decide whether the quick first-round result can
+   be used. NOTE(review): entries appear to be the high words of n*pi/2
+   for n = 1..32 (the first matches pio2_1's high word) -- confirm. */
+#ifdef __STDC__
+static const int32_t npio2_hw[] = {
+#else
+static int32_t npio2_hw[] = {
+#endif
+0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C,
+0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
+0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A,
+0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
+0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB,
+0x404858EB, 0x404921FB,
+};
+
+/*
+ * zero:     0.0, used to detect zero terms in tx[]
+ * half:     0.5, added before truncation to round to the nearest integer
+ * two24:    2**24, used to peel 24-bit pieces off large arguments
+ * invpio2:  53 bits of 2/pi
+ * pio2_1:   first  33 bit of pi/2
+ * pio2_1t:  pi/2 - pio2_1
+ * pio2_2:   second 33 bit of pi/2
+ * pio2_2t:  pi/2 - (pio2_1+pio2_2)
+ * pio2_3:   third  33 bit of pi/2
+ * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
+ */
+
+#ifdef __STDC__
+static const double
+#else
+static double
+#endif
+zero =  0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
+half =  5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
+two24 =  1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
+invpio2 =  6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
+pio2_1  =  1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
+pio2_1t =  6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
+pio2_2  =  6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
+pio2_2t =  2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
+pio2_3  =  2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
+pio2_3t =  8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
+
+/*
+ * __ieee754_rem_pio2(x,y): reduce x modulo pi/2.
+ *
+ * x: the argument to reduce.
+ * y: output array of two doubles; the remainder is stored as y[0]+y[1].
+ *
+ * Returns the integer n obtained during the reduction (negated for
+ * negative x). Returns 0 when |x| is small enough that no reduction is
+ * done, and also when x is inf or NaN (y[0] and y[1] are then x-x).
+ */
+#ifdef __STDC__
+	int32_t __ieee754_rem_pio2(double x, double *y)
+#else
+	int32_t __ieee754_rem_pio2(x,y)
+	double x,y[];
+#endif
+{
+	double z=0.0,w,t,r,fn;
+	double tx[3];
+	int32_t e0,i,j,nx,n,ix,hx;
+	u_int32_t low;
+
+	GET_HIGH_WORD(hx,x);		/* high word of x */
+	ix = hx&0x7fffffff;		/* high word of |x| */
+	if(ix<=0x3fe921fb)   /* |x| ~<= pi/4 , no need for reduction */
+	    {y[0] = x; y[1] = 0; return 0;}
+	if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
+	    if(hx>0) {
+		z = x - pio2_1;
+		if(ix!=0x3ff921fb) { 	/* 33+53 bit pi is good enough */
+		    y[0] = z - pio2_1t;
+		    y[1] = (z-y[0])-pio2_1t;
+		} else {		/* near pi/2, use 33+33+53 bit pi */
+		    z -= pio2_2;
+		    y[0] = z - pio2_2t;
+		    y[1] = (z-y[0])-pio2_2t;
+		}
+		return 1;
+	    } else {	/* negative x */
+		z = x + pio2_1;
+		if(ix!=0x3ff921fb) { 	/* 33+53 bit pi is good enough */
+		    y[0] = z + pio2_1t;
+		    y[1] = (z-y[0])+pio2_1t;
+		} else {		/* near pi/2, use 33+33+53 bit pi */
+		    z += pio2_2;
+		    y[0] = z + pio2_2t;
+		    y[1] = (z-y[0])+pio2_2t;
+		}
+		return -1;
+	    }
+	}
+	if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
+	    /* work on |x|; the sign is restored at the end of this branch */
+	    t  = fabs(x);
+	    n  = (int32_t) (t*invpio2+half);	/* nearest integer to |x|*(2/pi) */
+	    fn = (double)n;
+	    r  = t-fn*pio2_1;
+	    w  = fn*pio2_1t;	/* 1st round good to 85 bit */
+	    if(n<32&&ix!=npio2_hw[n-1]) {
+		y[0] = r-w;	/* quick check no cancellation */
+	    } else {
+	        u_int32_t high;
+	        /* i = difference between the exponent field of |x| and that
+	           of the candidate y[0]; a large i triggers another round */
+	        j  = ix>>20;
+	        y[0] = r-w;
+		GET_HIGH_WORD(high,y[0]);
+	        i = j-((high>>20)&0x7ff);
+	        if(i>16) {  /* 2nd iteration needed, good to 118 */
+		    t  = r;
+		    w  = fn*pio2_2;
+		    r  = t-w;
+		    w  = fn*pio2_2t-((t-r)-w);
+		    y[0] = r-w;
+		    GET_HIGH_WORD(high,y[0]);
+		    i = j-((high>>20)&0x7ff);
+		    if(i>49)  {	/* 3rd iteration need, 151 bits acc */
+		    	t  = r;	/* will cover all possible cases */
+		    	w  = fn*pio2_3;
+		    	r  = t-w;
+		    	w  = fn*pio2_3t-((t-r)-w);
+		    	y[0] = r-w;
+		    }
+		}
+	    }
+	    y[1] = (r-y[0])-w;	/* tail of the remainder */
+	    if(hx<0) 	{y[0] = -y[0]; y[1] = -y[1]; return -n;}
+	    else	 return n;
+	}
+    /*
+     * all other (large) arguments
+     */
+	if(ix>=0x7ff00000) {		/* x is inf or NaN */
+	    y[0]=y[1]=x-x; return 0;
+	}
+    /* set z = scalbn(|x|,ilogb(x)-23) */
+	GET_LOW_WORD(low,x);
+	SET_LOW_WORD(z,low);
+	e0 	= (ix>>20)-1046;	/* e0 = ilogb(z)-23; */
+	SET_HIGH_WORD(z, ix - ((int32_t)(e0<<20)));
+	/* split z into three pieces: tx[0] and tx[1] are integer parts taken
+	   off in turn (scaling the rest by 2**24), tx[2] is what remains */
+	for(i=0;i<2;i++) {
+		tx[i] = (double)((int32_t)(z));
+		z     = (z-tx[i])*two24;
+	}
+	tx[2] = z;
+	nx = 3;
+	while(tx[nx-1]==zero) nx--;	/* skip zero term */
+	n  =  __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
+	if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+	return n;
+}
--- a/platform/gp2x/code940/uClibc/e_sqrt.c
+++ b/platform/gp2x/code940/uClibc/e_sqrt.c
@ -0,0 +1,453 @@
+/* @(#)e_sqrt.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: e_sqrt.c,v 1.8 1995/05/10 20:46:17 jtc Exp $";
+#endif
+
+/* __ieee754_sqrt(x)
+ * Return correctly rounded sqrt.
+ *           ------------------------------------------
+ *	     |  Use the hardware sqrt if you have one |
+ *           ------------------------------------------
+ * Method:
+ *   Bit by bit method using integer arithmetic. (Slow, but portable)
+ *   1. Normalization
+ *	Scale x to y in [1,4) with even powers of 2:
+ *	find an integer k such that  1 <= (y=x*2^(2k)) < 4, then
+ *		sqrt(x) = 2^k * sqrt(y)
+ *   2. Bit by bit computation
+ *	Let q  = sqrt(y) truncated to i bit after binary point (q = 1),
+ *	     i							 0
+ *                                     i+1         2
+ *	    s  = 2*q , and	y  =  2   * ( y - q  ).		(1)
+ *	     i      i            i                 i
+ *
+ *	To compute q    from q , one checks whether
+ *		    i+1       i
+ *
+ *			      -(i+1) 2
+ *			(q + 2      ) <= y.			(2)
+ *     			  i
+ *							      -(i+1)
+ *	If (2) is false, then q   = q ; otherwise q   = q  + 2      .
+ *		 	       i+1   i             i+1   i
+ *
+ *	With some algebraic manipulation, it is not difficult to see
+ *	that (2) is equivalent to
+ *                             -(i+1)
+ *			s  +  2       <= y			(3)
+ *			 i                i
+ *
+ *	The advantage of (3) is that s  and y  can be computed by
+ *				      i      i
+ *	the following recurrence formula:
+ *	    if (3) is false
+ *
+ *	    s     =  s  ,	y    = y   ;			(4)
+ *	     i+1      i		 i+1    i
+ *
+ *	    otherwise,
+ *                         -i                     -(i+1)
+ *	    s	  =  s  + 2  ,  y    = y  -  s  - 2  		(5)
+ *           i+1      i          i+1    i     i
+ *
+ *	One may easily use induction to prove (4) and (5).
+ *	Note. Since the left hand side of (3) contains only i+2 bits,
+ *	      it is not necessary to do a full (53-bit) comparison
+ *	      in (3).
+ *   3. Final rounding
+ *	After generating the 53 bits result, we compute one more bit.
+ *	Together with the remainder, we can decide whether the
+ *	result is exact, bigger than 1/2ulp, or less than 1/2ulp
+ *	(it will never be equal to 1/2ulp).
+ *	The rounding mode can be detected by checking whether
+ *	huge + tiny is equal to huge, and whether huge - tiny is
+ *	equal to huge for some floating point number "huge" and "tiny".
+ *
+ * Special cases:
+ *	sqrt(+-0) = +-0 	... exact
+ *	sqrt(inf) = inf
+ *	sqrt(-ve) = NaN		... with invalid signal
+ *	sqrt(NaN) = NaN		... with invalid signal for signaling NaN
+ *
+ * Other methods : see the appended file at the end of the program below.
+ *---------------
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+/* one and tiny are used by __ieee754_sqrt in its final rounding step
+   (one-tiny and one+tiny are compared against one) */
+#ifdef __STDC__
+static	const double	one	= 1.0, tiny=1.0e-300;
+#else
+static	double	one	= 1.0, tiny=1.0e-300;
+#endif
+
+/*
+ * __ieee754_sqrt(x): square root of x, computed bit by bit with integer
+ * arithmetic as described at the top of this file.
+ *
+ * Returns x itself for +-0, x*x+x for inf/NaN inputs, NaN for negative
+ * x, and otherwise the root assembled from the integer words [q,q1].
+ */
+#ifdef __STDC__
+	double __ieee754_sqrt(double x)
+#else
+	double __ieee754_sqrt(x)
+	double x;
+#endif
+{
+	double z;
+	int32_t sign = (int)0x80000000;	/* mask for the top bit of a 32-bit word */
+	int32_t ix0,s0,q,m,t,i;
+	u_int32_t r,t1,s1,ix1,q1;
+
+	/* ix0, ix1: high and low 32-bit words of x */
+	EXTRACT_WORDS(ix0,ix1,x);
+
+    /* take care of Inf and NaN */
+	if((ix0&0x7ff00000)==0x7ff00000) {
+	    return x*x+x;		/* sqrt(NaN)=NaN, sqrt(+inf)=+inf
+					   sqrt(-inf)=sNaN */
+	}
+    /* take care of zero */
+	if(ix0<=0) {
+	    if(((ix0&(~sign))|ix1)==0) return x;/* sqrt(+-0) = +-0 */
+	    else if(ix0<0)
+		return (x-x)/(x-x);		/* sqrt(-ve) = sNaN */
+	}
+    /* normalize x */
+	m = (ix0>>20);			/* biased exponent */
+	if(m==0) {				/* subnormal x */
+	    /* move bits up from the low word until the high word is non-zero */
+	    while(ix0==0) {
+		m -= 21;
+		ix0 |= (ix1>>11); ix1 <<= 21;
+	    }
+	    /* then shift left until bit 20 (0x00100000) is set */
+	    for(i=0;(ix0&0x00100000)==0;i++) ix0<<=1;
+	    m -= i-1;
+	    ix0 |= (ix1>>(32-i));
+	    ix1 <<= i;
+	}
+	m -= 1023;	/* unbias exponent */
+	ix0 = (ix0&0x000fffff)|0x00100000;	/* mantissa with bit 20 set */
+	if(m&1){	/* odd m, double x to make it even */
+	    ix0 += ix0 + ((ix1&sign)>>31);
+	    ix1 += ix1;
+	}
+	m >>= 1;	/* m = [m/2] */
+
+    /* generate sqrt(x) bit by bit */
+	/* [ix0,ix1] is doubled as one 64-bit value: the top bit of ix1 is
+	   carried into ix0 */
+	ix0 += ix0 + ((ix1&sign)>>31);
+	ix1 += ix1;
+	q = q1 = s0 = s1 = 0;	/* [q,q1] = sqrt(x) */
+	r = 0x00200000;		/* r = moving bit from right to left */
+
+	/* first pass: produces q, the high word of the result */
+	while(r!=0) {
+	    t = s0+r;
+	    if(t<=ix0) {
+		s0   = t+r;
+		ix0 -= t;
+		q   += r;
+	    }
+	    ix0 += ix0 + ((ix1&sign)>>31);
+	    ix1 += ix1;
+	    r>>=1;
+	}
+
+	/* second pass: produces q1, the low word; [s0,s1] and [ix0,ix1] are
+	   handled as two-word values with explicit carry and borrow */
+	r = sign;
+	while(r!=0) {
+	    t1 = s1+r;
+	    t  = s0;
+	    if((t<ix0)||((t==ix0)&&(t1<=ix1))) {
+		s1  = t1+r;
+		if(((t1&sign)==sign)&&(s1&sign)==0) s0 += 1;	/* carry into s0 */
+		ix0 -= t;
+		if (ix1 < t1) ix0 -= 1;			/* borrow from ix0 */
+		ix1 -= t1;
+		q1  += r;
+	    }
+	    ix0 += ix0 + ((ix1&sign)>>31);
+	    ix1 += ix1;
+	    r>>=1;
+	}
+
+    /* use floating add to find out rounding direction */
+	/* a non-zero remainder means the result is inexact */
+	if((ix0|ix1)!=0) {
+	    z = one-tiny; /* trigger inexact flag */
+	    if (z>=one) {
+	        z = one+tiny;
+	        if (q1==(u_int32_t)0xffffffff) { q1=0; q += 1;}
+		else if (z>one) {
+		    /* one+tiny rounded above one: bump q1 by 2, carrying into q */
+		    if (q1==(u_int32_t)0xfffffffe) q+=1;
+		    q1+=2;
+		} else
+	            q1 += (q1&1);	/* otherwise add 1 only when q1 is odd */
+	    }
+	}
+	/* assemble the result: halve [q,q1], add the exponent bias and m */
+	ix0 = (q>>1)+0x3fe00000;
+	ix1 =  q1>>1;
+	if ((q&1)==1) ix1 |= sign;	/* bit shifted out of q goes to the top of ix1 */
+	ix0 += (m <<20);
+	INSERT_WORDS(z,ix0,ix1);
+	return z;
+}
+
+/*
+Other methods  (use floating-point arithmetic)
+-------------
+(This is a copy of a drafted paper by Prof W. Kahan
+and K.C. Ng, written in May, 1986)
+
+	Two algorithms are given here to implement sqrt(x)
+	(IEEE double precision arithmetic) in software.
+	Both supply sqrt(x) correctly rounded. The first algorithm (in
+	Section A) uses newton iterations and involves four divisions.
+	The second one uses reciproot iterations to avoid division, but
+	requires more multiplications. Both algorithms need the ability
+	to chop results of arithmetic operations instead of round them,
+	and the INEXACT flag to indicate when an arithmetic operation
+	is executed exactly with no roundoff error, all part of the
+	standard (IEEE 754-1985). The ability to perform shift, add,
+	subtract and logical AND operations upon 32-bit words is needed
+	too, though not part of the standard.
+
+A.  sqrt(x) by Newton Iteration
+
+   (1)	Initial approximation
+
+	Let x0 and x1 be the leading and the trailing 32-bit words of
+	a floating point number x (in IEEE double format) respectively
+
+	    1    11		     52				  ...widths
+	   ------------------------------------------------------
+	x: |s|	  e     |	      f				|
+	   ------------------------------------------------------
+	      msb    lsb  msb				      lsb ...order
+
+
+	     ------------------------  	     ------------------------
+	x0:  |s|   e    |    f1     |	 x1: |          f2           |
+	     ------------------------  	     ------------------------
+
+	By performing shifts and subtracts on x0 and x1 (both regarded
+	as integers), we obtain an 8-bit approximation of sqrt(x) as
+	follows.
+
+		k  := (x0>>1) + 0x1ff80000;
+		y0 := k - T1[31&(k>>15)].	... y ~ sqrt(x) to 8 bits
+	Here k is a 32-bit integer and T1[] is an integer array containing
+	correction terms. Now magically the floating value of y (y's
+	leading 32-bit word is y0, the value of its trailing word is 0)
+	approximates sqrt(x) to almost 8-bit.
+
+	Value of T1:
+	static int T1[32]= {
+	0,	1024,	3062,	5746,	9193,	13348,	18162,	23592,
+	29598,	36145,	43202,	50740,	58733,	67158,	75992,	85215,
+	83599,	71378,	60428,	50647,	41945,	34246,	27478,	21581,
+	16499,	12183,	8588,	5674,	3403,	1742,	661,	130,};
+
+    (2)	Iterative refinement
+
+	Apply Heron's rule three times to y, we have y approximates
+	sqrt(x) to within 1 ulp (Unit in the Last Place):
+
+		y := (y+x/y)/2		... almost 17 sig. bits
+		y := (y+x/y)/2		... almost 35 sig. bits
+		y := y-(y-x/y)/2	... within 1 ulp
+
+
+	Remark 1.
+	    Another way to improve y to within 1 ulp is:
+
+		y := (y+x/y)		... almost 17 sig. bits to 2*sqrt(x)
+		y := y - 0x00100006	... almost 18 sig. bits to sqrt(x)
+
+				2
+			    (x-y )*y
+		y := y + 2* ----------	...within 1 ulp
+			       2
+			     3y  + x
+
+
+	This formula has one division fewer than the one above; however,
+	it requires more multiplications and additions. Also x must be
+	scaled in advance to avoid spurious overflow in evaluating the
+	expression 3y*y+x. Hence it is not recommended unless division
+	is slow. If division is very slow, then one should use the
+	reciproot algorithm given in section B.
+
+    (3) Final adjustment
+
+	By twiddling y's last bit it is possible to force y to be
+	correctly rounded according to the prevailing rounding mode
+	as follows. Let r and i be copies of the rounding mode and
+	inexact flag before entering the square root program. Also we
+	use the expression y+-ulp for the next representable floating
+	numbers (up and down) of y. Note that y+-ulp = either fixed
+	point y+-1, or multiply y by nextafter(1,+-inf) in chopped
+	mode.
+
+		I := FALSE;	... reset INEXACT flag I
+		R := RZ;	... set rounding mode to round-toward-zero
+		z := x/y;	... chopped quotient, possibly inexact
+		If(not I) then {	... if the quotient is exact
+		    if(z=y) {
+		        I := i;	 ... restore inexact flag
+		        R := r;  ... restore rounded mode
+		        return sqrt(x):=y.
+		    } else {
+			z := z - ulp;	... special rounding
+		    }
+		}
+		i := TRUE;		... sqrt(x) is inexact
+		If (r=RN) then z=z+ulp	... rounded-to-nearest
+		If (r=RP) then {	... round-toward-+inf
+		    y = y+ulp; z=z+ulp;
+		}
+		y := y+z;		... chopped sum
+		y0:=y0-0x00100000;	... y := y/2 is correctly rounded.
+	        I := i;	 		... restore inexact flag
+	        R := r;  		... restore rounded mode
+	        return sqrt(x):=y.
+
+    (4)	Special cases
+
+	Square root of +inf, +-0, or NaN is itself;
+	Square root of a negative number is NaN with invalid signal.
+
+
+B.  sqrt(x) by Reciproot Iteration
+
+   (1)	Initial approximation
+
+	Let x0 and x1 be the leading and the trailing 32-bit words of
+	a floating point number x (in IEEE double format) respectively
+	(see section A). By performing shifts and subtracts on x0 and y0,
+	we obtain a 7.8-bit approximation of 1/sqrt(x) as follows.
+
+	    k := 0x5fe80000 - (x0>>1);
+	    y0:= k - T2[63&(k>>14)].	... y ~ 1/sqrt(x) to 7.8 bits
+
+	Here k is a 32-bit integer and T2[] is an integer array
+	containing correction terms. Now magically the floating
+	value of y (y's leading 32-bit word is y0, the value of
+	its trailing word y1 is set to zero) approximates 1/sqrt(x)
+	to almost 7.8-bit.
+
+	Value of T2:
+	static int T2[64]= {
+	0x1500,	0x2ef8,	0x4d67,	0x6b02,	0x87be,	0xa395,	0xbe7a,	0xd866,
+	0xf14a,	0x1091b,0x11fcd,0x13552,0x14999,0x15c98,0x16e34,0x17e5f,
+	0x18d03,0x19a01,0x1a545,0x1ae8a,0x1b5c4,0x1bb01,0x1bfde,0x1c28d,
+	0x1c2de,0x1c0db,0x1ba73,0x1b11c,0x1a4b5,0x1953d,0x18266,0x16be0,
+	0x1683e,0x179d8,0x18a4d,0x19992,0x1a789,0x1b445,0x1bf61,0x1c989,
+	0x1d16d,0x1d77b,0x1dddf,0x1e2ad,0x1e5bf,0x1e6e8,0x1e654,0x1e3cd,
+	0x1df2a,0x1d635,0x1cb16,0x1be2c,0x1ae4e,0x19bde,0x1868e,0x16e2e,
+	0x1527f,0x1334a,0x11051,0xe951,	0xbe01,	0x8e0d,	0x5924,	0x1edd,};
+
+    (2)	Iterative refinement
+
+	Apply Reciproot iteration three times to y and multiply the
+	result by x to get an approximation z that matches sqrt(x)
+	to about 1 ulp. To be exact, we will have
+		-1ulp < sqrt(x)-z<1.0625ulp.
+
+	... set rounding mode to Round-to-nearest
+	   y := y*(1.5-0.5*x*y*y)	... almost 15 sig. bits to 1/sqrt(x)
+	   y := y*((1.5-2^-30)+0.5*x*y*y)... about 29 sig. bits to 1/sqrt(x)
+	... special arrangement for better accuracy
+	   z := x*y			... 29 bits to sqrt(x), with z*y<1
+	   z := z + 0.5*z*(1-z*y)	... about 1 ulp to sqrt(x)
+
+	Remark 2. The constant 1.5-2^-30 is chosen to bias the error so that
+	(a) the term z*y in the final iteration is always less than 1;
+	(b) the error in the final result is biased upward so that
+		-1 ulp < sqrt(x) - z < 1.0625 ulp
+	    instead of |sqrt(x)-z|<1.03125ulp.
+
+    (3)	Final adjustment
+
+	By twiddling y's last bit it is possible to force y to be
+	correctly rounded according to the prevailing rounding mode
+	as follows. Let r and i be copies of the rounding mode and
+	inexact flag before entering the square root program. Also we
+	use the expression y+-ulp for the next representable floating
+	numbers (up and down) of y. Note that y+-ulp = either fixed
+	point y+-1, or multiply y by nextafter(1,+-inf) in chopped
+	mode.
+
+	R := RZ;		... set rounding mode to round-toward-zero
+	switch(r) {
+	    case RN:		... round-to-nearest
+	       if(x<= z*(z-ulp)...chopped) z = z - ulp; else
+	       if(x<= z*(z+ulp)...chopped) z = z; else z = z+ulp;
+	       break;
+	    case RZ:case RM:	... round-to-zero or round-to--inf
+	       R:=RP;		... reset rounding mode to round-to-+inf
+	       if(x<z*z ... rounded up) z = z - ulp; else
+	       if(x>=(z+ulp)*(z+ulp) ...rounded up) z = z+ulp;
+	       break;
+	    case RP:		... round-to-+inf
+	       if(x>(z+ulp)*(z+ulp)...chopped) z = z+2*ulp; else
+	       if(x>z*z ...chopped) z = z+ulp;
+	       break;
+	}
+
+	Remark 3. The above comparisons can be done in fixed point. For
+	example, to compare x and w=z*z chopped, it suffices to compare
+	x1 and w1 (the trailing parts of x and w), regarding them as
+	two's complement integers.
+
+	...Is z an exact square root?
+	To determine whether z is an exact square root of x, let z1 be the
+	trailing part of z, and also let x0 and x1 be the leading and
+	trailing parts of x.
+
+	If ((z1&0x03ffffff)!=0)	... not exact if trailing 26 bits of z!=0
+	    I := 1;		... Raise Inexact flag: z is not exact
+	else {
+	    j := 1 - [(x0>>20)&1]	... j = logb(x) mod 2
+	    k := z1 >> 26;		... get z's 25-th and 26-th
+					    fraction bits
+	    I := i or (k&j) or ((k&(j+j+1))!=(x1&3));
+	}
+	R:= r		... restore rounded mode
+	return sqrt(x):=z.
+
+	If multiplication is cheaper than the foregoing red tape, the
+	Inexact flag can be evaluated by
+
+	    I := i;
+	    I := (z*z!=x) or I.
+
+	Note that z*z can overwrite I; this value must be sensed if it is
+	True.
+
+	Remark 4. If z*z = x exactly, then bit 25 to bit 0 of z1 must be
+	zero.
+
+		    --------------------
+		z1: |        f2        |
+		    --------------------
+		bit 31		   bit 0
+
+	Furthermore, bit 27 and 26 of z1, bit 0 and 1 of x1, and the odd
+	or even of logb(x) have the following relations:
+
+	-------------------------------------------------
+	bit 27,26 of z1		bit 1,0 of x1	logb(x)
+	-------------------------------------------------
+	00			00		odd and even
+	01			01		even
+	10			10		odd
+	10			00		even
+	11			01		even
+	-------------------------------------------------
+
+    (4)	Special cases (see (4) of Section A).
+
+ */
+
--- a/platform/gp2x/code940/uClibc/k_cos.c
+++ b/platform/gp2x/code940/uClibc/k_cos.c
@ -0,0 +1,96 @@
+/* @(#)k_cos.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)	/* ident string is opt-in and hidden from lint */
+static char rcsid[] = "$NetBSD: k_cos.c,v 1.8 1995/05/10 20:46:22 jtc Exp $";
+#endif /* LIBM_SCCS && !lint */
+
+/*
+ * __kernel_cos( x,  y )
+ * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
+ * Input x is assumed to be bounded by ~pi/4 in magnitude.
+ * Input y is the tail of x.
+ *
+ * Algorithm
+ *	1. Since cos(-x) = cos(x), we need only to consider positive x.
+ *	2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
+ *	3. cos(x) is approximated by a polynomial of degree 14 on
+ *	   [0,pi/4]
+ *		  	                 4            14
+ *	   	cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
+ *	   where the remez error is
+ *
+ * 	|              2     4     6     8     10    12     14 |     -58
+ * 	|cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
+ * 	|    					               |
+ *
+ * 	               4     6     8     10    12     14
+ *	4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
+ *	       cos(x) = 1 - x*x/2 + r
+ *	   since cos(x+y) ~ cos(x) - sin(x)*y
+ *			  ~ cos(x) - x*y,
+ *	   a correction term is necessary in cos(x) and hence
+ *		cos(x+y) = 1 - (x*x/2 - (r - x*y))
+ *	   For better accuracy when x > 0.3, let qx = |x|/4 with
+ *	   the last 32 bits masked off, and if x > 0.78125, let qx = 0.28125.
+ *	   Then
+ *		cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)).
+ *	   Note that 1-qx and (x*x/2-qx) is EXACT here, and the
+ *	   magnitude of the latter is at least a quarter of x*x/2,
+ *	   thus, reducing the rounding error in the subtraction.
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+#ifdef __STDC__				/* ANSI C: the coefficients can be const */
+static const double
+#else
+static double
+#endif
+one =  1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
+C1  =  4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C  coefficient of x^4  */
+C2  = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177  coefficient of x^6  */
+C3  =  2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590  coefficient of x^8  */
+C4  = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD  coefficient of x^10 */
+C5  =  2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4  coefficient of x^12 */
+C6  = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4  coefficient of x^14 */
+
+#ifdef __STDC__
+	double __kernel_cos(double x, double y)
+#else
+	double __kernel_cos(x, y)
+	double x,y;
+#endif
+{
+	double a,hz,z,r,qx;
+	int32_t ix;
+	GET_HIGH_WORD(ix,x);
+	ix &= 0x7fffffff;			/* ix = |x|'s high word*/
+	if(ix<0x3e400000) {			/* if |x| < 2**-27 */
+	    if(((int)x)==0) return one;		/* generate inexact */
+	}
+	z  = x*x;
+	r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));	/* Horner form; z*r below is C1*x^4+...+C6*x^14 */
+	if(ix < 0x3FD33333) 			/* if |x| < 0.3 */
+	    return one - (0.5*z - (z*r - x*y));	/* x*y corrects for the tail y */
+	else {
+	    if(ix > 0x3fe90000) {		/* x > 0.78125 */
+		qx = 0.28125;
+	    } else {
+	        INSERT_WORDS(qx,ix-0x00200000,0);	/* x/4: exponent lowered by 2, low word cleared */
+	    }
+	    hz = 0.5*z-qx;
+	    a  = one-qx;
+	    return a - (hz - (z*r-x*y));	/* (1-qx) - ((x*x/2-qx) - (poly - x*y)) */
+	}
+}
--- a/platform/gp2x/code940/uClibc/k_rem_pio2.c
+++ b/platform/gp2x/code940/uClibc/k_rem_pio2.c
@ -0,0 +1,320 @@
+/* @(#)k_rem_pio2.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)	/* ident string is opt-in and hidden from lint */
+static char rcsid[] = "$NetBSD: k_rem_pio2.c,v 1.7 1995/05/10 20:46:25 jtc Exp $";
+#endif /* LIBM_SCCS && !lint */
+
+/*
+ * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
+ * double x[],y[]; int e0,nx,prec; int ipio2[];
+ *
+ * __kernel_rem_pio2 returns the last three binary digits of N with
+ *		y = x - N*pi/2
+ * so that |y| < pi/2.
+ *
+ * The method is to compute the integer (mod 8) and fraction parts of
+ * (2/pi)*x without doing the full multiplication. In general we
+ * skip the part of the product that are known to be a huge integer (
+ * more accurately, = 0 mod 8 ). Thus the number of operations are
+ * independent of the exponent of the input.
+ *
+ * (2/pi) is represented by an array of 24-bit integers in ipio2[].
+ *
+ * Input parameters:
+ * 	x[]	The input value (must be positive) is broken into nx
+ *		pieces of 24-bit integers in double precision format.
+ *		x[i] will be the i-th 24 bit of x. The scaled exponent
+ *		of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
+ *		matches x up to 24 bits).
+ *
+ *		Example of breaking a double positive z into x[0]+x[1]+x[2]:
+ *			e0 = ilogb(z)-23
+ *			z  = scalbn(z,-e0)
+ *		for i = 0,1,2
+ *			x[i] = floor(z)
+ *			z    = (z-x[i])*2**24
+ *
+ *
+ *	y[]	output result in an array of double precision numbers.
+ *		The dimension of y[] is:
+ *			24-bit  precision	1
+ *			53-bit  precision	2
+ *			64-bit  precision	2
+ *			113-bit precision	3
+ *		The actual value is the sum of them. Thus for 113-bit
+ *		precision, one may have to do something like:
+ *
+ *		long double t,w,r_head, r_tail;
+ *		t = (long double)y[2] + (long double)y[1];
+ *		w = (long double)y[0];
+ *		r_head = t+w;
+ *		r_tail = w - (r_head - t);
+ *
+ *	e0	The exponent of x[0]
+ *
+ *	nx	dimension of x[]
+ *
+ *  	prec	an integer indicating the precision:
+ *			0	24  bits (single)
+ *			1	53  bits (double)
+ *			2	64  bits (extended)
+ *			3	113 bits (quad)
+ *
+ *	ipio2[]
+ *		integer array, contains the (24*i)-th to (24*i+23)-th
+ *		bit of 2/pi after binary point. The corresponding
+ *		floating value is
+ *
+ *			ipio2[i] * 2^(-24(i+1)).
+ *
+ * External function:
+ *	double scalbn(), floor();
+ *
+ *
+ * Here is the description of some local variables:
+ *
+ * 	jk	jk+1 is the initial number of terms of ipio2[] needed
+ *		in the computation. The recommended value is 2,3,4,
+ *		6 for single, double, extended,and quad.
+ *
+ * 	jz	local integer variable indicating the number of
+ *		terms of ipio2[] used.
+ *
+ *	jx	nx - 1
+ *
+ *	jv	index for pointing to the suitable ipio2[] for the
+ *		computation. In general, we want
+ *			( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
+ *		is an integer. Thus
+ *			e0-3-24*jv >= 0 or (e0-3)/24 >= jv
+ *		Hence jv = max(0,(e0-3)/24).
+ *
+ *	jp	jp+1 is the number of terms in PIo2[] needed, jp = jk.
+ *
+ * 	q[]	double array with integral value, representing the
+ *		24-bits chunk of the product of x and 2/pi.
+ *
+ *	q0	the corresponding exponent of q[0]. Note that the
+ *		exponent for q[i] would be q0-24*i.
+ *
+ *	PIo2[]	double precision array, obtained by cutting pi/2
+ *		into 24 bits chunks.
+ *
+ *	f[]	ipio2[] in floating point
+ *
+ *	iq[]	integer array by breaking up q[] in 24-bits chunk.
+ *
+ *	fq[]	final product of x*(2/pi) in fq[0],..,fq[jk]
+ *
+ *	ih	integer. If >0 it indicates q[] is >= 0.5, hence
+ *		it also indicates the *sign* of the result.
+ *
+ */
+
+
+/*
+ * Constants:
+ * The hexadecimal values are the intended ones for the following
+ * constants. The decimal values may be used, provided that the
+ * compiler will convert from decimal to binary accurately enough
+ * to produce the hexadecimal values shown.
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+#ifdef __STDC__
+static const int init_jk[] = {2,3,4,6}; /* initial value for jk, indexed by prec (0..3) */
+#else
+static int init_jk[] = {2,3,4,6};	/* same table, non-const for pre-ANSI compilers */
+#endif
+
+#ifdef __STDC__
+static const double PIo2[] = {	/* pi/2 cut into 24-bit chunks, leading chunk first */
+#else
+static double PIo2[] = {	/* same table, non-const for pre-ANSI compilers */
+#endif
+  1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
+  7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
+  5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
+  3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
+  1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
+  1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
+  2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
+  2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
+};
+
+#ifdef __STDC__
+static const double
+#else
+static double
+#endif
+zero   = 0.0,
+one    = 1.0,
+two24   =  1.67772160000000000000e+07, /* 0x41700000, 0x00000000  = 2^24  */
+twon24  =  5.96046447753906250000e-08; /* 0x3E700000, 0x00000000  = 2^-24 */
+
+#ifdef __STDC__
+	int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int32_t *ipio2)
+#else
+	int __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
+	double x[], y[]; int e0,nx,prec; int32_t ipio2[];
+#endif
+{
+	int32_t jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
+	double z,fw,f[20],fq[20],q[20];	/* NOTE(review): fixed 20-entry work arrays, no bounds checks in this function */
+
+    /* initialize jk*/
+	jk = init_jk[prec];		/* prec is used unchecked as a table index */
+	jp = jk;
+
+    /* determine jx,jv,q0, note that 3>q0 */
+	jx =  nx-1;
+	jv = (e0-3)/24; if(jv<0) jv=0;
+	q0 =  e0-24*(jv+1);
+
+    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
+	j = jv-jx; m = jx+jk;
+	for(i=0;i<=m;i++,j++) f[i] = (j<0)? zero : (double) ipio2[j];
+
+    /* compute q[0],q[1],...q[jk] */
+	for (i=0;i<=jk;i++) {
+	    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;	/* q[i] = fw is outside the inner for: once per i */
+	}
+
+	jz = jk;
+recompute:
+    /* distill q[] into iq[] reversingly */
+	for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
+	    fw    =  (double)((int32_t)(twon24* z));	/* part of z above 24 bits, carried into the next term */
+	    iq[i] =  (int32_t)(z-two24*fw);		/* low 24 bits of z */
+	    z     =  q[j-1]+fw;
+	}
+
+    /* compute n */
+	z  = scalbn(z,q0);		/* actual value of z */
+	z -= 8.0*floor(z*0.125);		/* trim off integer >= 8 */
+	n  = (int32_t) z;
+	z -= (double)n;
+	ih = 0;
+	if(q0>0) {	/* need iq[jz-1] to determine n */
+	    i  = (iq[jz-1]>>(24-q0)); n += i;
+	    iq[jz-1] -= i<<(24-q0);
+	    ih = iq[jz-1]>>(23-q0);
+	}
+	else if(q0==0) ih = iq[jz-1]>>23;
+	else if(z>=0.5) ih=2;
+
+	if(ih>0) {	/* q > 0.5 */
+	    n += 1; carry = 0;
+	    for(i=0;i<jz ;i++) {	/* compute 1-q */
+		j = iq[i];
+		if(carry==0) {
+		    if(j!=0) {
+			carry = 1; iq[i] = 0x1000000- j;
+		    }
+		} else  iq[i] = 0xffffff - j;
+	    }
+	    if(q0>0) {		/* rare case: chance is 1 in 12 */
+	        switch(q0) {
+	        case 1:
+	    	   iq[jz-1] &= 0x7fffff; break;
+	    	case 2:
+	    	   iq[jz-1] &= 0x3fffff; break;
+	        }
+	    }
+	    if(ih==2) {
+		z = one - z;
+		if(carry!=0) z -= scalbn(one,q0);
+	    }
+	}
+
+    /* check if recomputation is needed */
+	if(z==zero) {
+	    j = 0;
+	    for (i=jz-1;i>=jk;i--) j |= iq[i];
+	    if(j==0) { /* need recomputation */
+		for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
+
+		for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
+		    f[jx+i] = (double) ipio2[jv+i];
+		    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
+		    q[i] = fw;
+		}
+		jz += k;
+		goto recompute;		/* redo the distillation with the extra terms */
+	    }
+	}
+
+    /* chop off zero terms */
+	if(z==0.0) {
+	    jz -= 1; q0 -= 24;
+	    while(iq[jz]==0) { jz--; q0-=24;}
+	} else { /* break z into 24-bit if necessary */
+	    z = scalbn(z,-q0);
+	    if(z>=two24) {
+		fw = (double)((int32_t)(twon24*z));
+		iq[jz] = (int32_t)(z-two24*fw);
+		jz += 1; q0 += 24;
+		iq[jz] = (int32_t) fw;
+	    } else iq[jz] = (int32_t) z ;
+	}
+
+    /* convert integer "bit" chunk to floating-point value */
+	fw = scalbn(one,q0);
+	for(i=jz;i>=0;i--) {
+	    q[i] = fw*(double)iq[i]; fw*=twon24;
+	}
+
+    /* compute PIo2[0,...,jp]*q[jz,...,0] */
+	for(i=jz;i>=0;i--) {
+	    for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
+	    fq[jz-i] = fw;
+	}
+
+    /* compress fq[] into y[] */
+	switch(prec) {		/* no default case: y[] is written only for prec 0..3 */
+	    case 0:
+		fw = 0.0;
+		for (i=jz;i>=0;i--) fw += fq[i];
+		y[0] = (ih==0)? fw: -fw;
+		break;
+	    case 1:
+	    case 2:
+		fw = 0.0;
+		for (i=jz;i>=0;i--) fw += fq[i];
+		y[0] = (ih==0)? fw: -fw;
+		fw = fq[0]-fw;			/* what the first sum dropped from fq[0] ... */
+		for (i=1;i<=jz;i++) fw += fq[i];	/* ... plus the remaining terms = tail */
+		y[1] = (ih==0)? fw: -fw;
+		break;
+	    case 3:	/* painful */
+		for (i=jz;i>0;i--) {
+		    fw      = fq[i-1]+fq[i];
+		    fq[i]  += fq[i-1]-fw;
+		    fq[i-1] = fw;
+		}
+		for (i=jz;i>1;i--) {
+		    fw      = fq[i-1]+fq[i];
+		    fq[i]  += fq[i-1]-fw;
+		    fq[i-1] = fw;
+		}
+		for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
+		if(ih==0) {
+		    y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
+		} else {
+		    y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
+		}
+	}
+	return n&7;		/* only the low three bits of n are reported */
+}
--- a/platform/gp2x/code940/uClibc/k_sin.c
+++ b/platform/gp2x/code940/uClibc/k_sin.c
@ -0,0 +1,79 @@
+/* @(#)k_sin.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)	/* ident string is opt-in and hidden from lint */
+static char rcsid[] = "$NetBSD: k_sin.c,v 1.8 1995/05/10 20:46:31 jtc Exp $";
+#endif /* LIBM_SCCS && !lint */
+
+/* __kernel_sin( x, y, iy)
+ * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854
+ * Input x is assumed to be bounded by ~pi/4 in magnitude.
+ * Input y is the tail of x.
+ * Input iy indicates whether y is 0. (if iy=0, y is assumed to be 0).
+ *
+ * Algorithm
+ *	1. Since sin(-x) = -sin(x), we need only to consider positive x.
+ *	2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0.
+ *	3. sin(x) is approximated by a polynomial of degree 13 on
+ *	   [0,pi/4]
+ *		  	         3            13
+ *	   	sin(x) ~ x + S1*x + ... + S6*x
+ *	   where
+ *
+ * 	|sin(x)         2     4     6     8     10     12  |     -58
+ * 	|----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
+ * 	|  x 					           |
+ *
+ *	4. sin(x+y) = sin(x) + sin'(x')*y
+ *		    ~ sin(x) + (1-x*x/2)*y
+ *	   For better accuracy, let
+ *		     3      2      2      2      2
+ *		r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
+ *	   then                   3    2
+ *		sin(x) = x + (S1*x + (x *(r-y/2)+y))
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+#ifdef __STDC__				/* ANSI C: the coefficients can be const */
+static const double
+#else
+static double
+#endif
+half =  5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
+S1  = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549  coefficient of x^3  */
+S2  =  8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6  coefficient of x^5  */
+S3  = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5  coefficient of x^7  */
+S4  =  2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D  coefficient of x^9  */
+S5  = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB  coefficient of x^11 */
+S6  =  1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C  coefficient of x^13 */
+
+#ifdef __STDC__
+	double __kernel_sin(double x, double y, int iy)
+#else
+	double __kernel_sin(x, y, iy)
+	double x,y; int iy;		/* iy=0 if y is zero */
+#endif
+{
+	double z,r,v;
+	int32_t ix;
+	GET_HIGH_WORD(ix,x);
+	ix &= 0x7fffffff;			/* high word of x */
+	if(ix<0x3e400000)			/* |x| < 2**-27 */
+	   {if((int)x==0) return x;}		/* generate inexact */
+	z	=  x*x;
+	v	=  z*x;				/* x^3 */
+	r	=  S2+z*(S3+z*(S4+z*(S5+z*S6)));	/* Horner form of the terms above S1 */
+	if(iy==0) return x+v*(S1+z*r);		/* no tail: plain odd polynomial */
+	else      return x-((z*(half*y-v*r)-y)-v*S1);	/* same polynomial with the tail y folded in */
+}
--- a/platform/gp2x/code940/uClibc/math.h
+++ b/platform/gp2x/code940/uClibc/math.h
@ -0,0 +1,15 @@
+#include <sys/types.h>
+
+int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int32_t *ipio2);	/* returns n&7, remainder in y[] */
+double __kernel_sin(double x, double y, int iy);	/* sin for |x| up to ~pi/4; y = tail of x, unused when iy==0 */
+double __kernel_cos(double x, double y);		/* cos for |x| up to ~pi/4; y = tail of x */
+
+double __ieee754_pow(double x, double y);
+double __ieee754_sqrt(double x);
+double __ieee754_log(double x);
+int32_t __ieee754_rem_pio2(double x, double *y);	/* argument reduction routine used by sin() */
+
+double fabs(double x);			/* absolute value of x */
+double scalbn(double x, int n);		/* x * 2**n */
+double copysign(double x, double y);	/* magnitude of x with the sign bit of y */
+double floor(double x);			/* x rounded toward -inf */
--- a/platform/gp2x/code940/uClibc/math_private.h
+++ b/platform/gp2x/code940/uClibc/math_private.h
@ -0,0 +1,129 @@
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/*
+ * from: @(#)fdlibm.h 5.1 93/09/24
+ * $Id: math_private.h,v 1.3 2004/02/09 07:10:38 andersen Exp $
+ */
+
+#ifndef _MATH_PRIVATE_H_
+#define _MATH_PRIVATE_H_
+
+#include <endian.h>
+#include <sys/types.h>
+
+/* The original fdlibm code used statements like:
+	n0 = ((*(int*)&one)>>29)^1;		* index of high word *
+	ix0 = *(n0+(int*)&x);			* high word of x *
+	ix1 = *((1-n0)+(int*)&x);		* low word of x *
+   to dig two 32 bit words out of the 64 bit IEEE floating point
+   value.  That is non-ANSI, and, moreover, the gcc instruction
+   scheduler gets it wrong.  We instead use the following macros.
+   Unlike the original code, we determine the endianness at compile
+   time, not at run time; I don't see much benefit to selecting
+   endianness at run time.  */
+
+/* A union which permits us to convert between a double and two 32 bit
+   ints.  */
+
+/*
+ * Math on arm is special:
+ * For FPA, float words are always big-endian.
+ * For VFP, floats words follow the memory system mode.
+ */
+
+#if (__BYTE_ORDER == __BIG_ENDIAN) || \
+    (!defined(__VFP_FP__) && (defined(__arm__) || defined(__thumb__)))	/* big-endian, or ARM without VFP */
+
+typedef union
+{
+  double value;
+  struct
+  {
+    u_int32_t msw;	/* most significant word is first in memory */
+    u_int32_t lsw;
+  } parts;
+} ieee_double_shape_type;
+
+#else
+
+typedef union
+{
+  double value;
+  struct
+  {
+    u_int32_t lsw;	/* least significant word is first in memory */
+    u_int32_t msw;
+  } parts;
+} ieee_double_shape_type;
+
+#endif /* word order of a double */
+
+/* Get two 32 bit ints from a double: ix0 = high word, ix1 = low word.  */
+
+#define EXTRACT_WORDS(ix0,ix1,d)				\
+do {								\
+  ieee_double_shape_type ew_u;					\
+  ew_u.value = (d);						\
+  (ix0) = ew_u.parts.msw;					\
+  (ix1) = ew_u.parts.lsw;					\
+} while (0)
+
+/* Get the more significant 32 bit int from a double; (d) is only read.  */
+
+#define GET_HIGH_WORD(i,d)					\
+do {								\
+  ieee_double_shape_type gh_u;					\
+  gh_u.value = (d);						\
+  (i) = gh_u.parts.msw;						\
+} while (0)
+
+/* Get the less significant 32 bit int from a double; (d) is only read.  */
+
+#define GET_LOW_WORD(i,d)					\
+do {								\
+  ieee_double_shape_type gl_u;					\
+  gl_u.value = (d);						\
+  (i) = gl_u.parts.lsw;						\
+} while (0)
+
+/* Set a double from two 32 bit ints: ix0 = high word, ix1 = low word.  */
+
+#define INSERT_WORDS(d,ix0,ix1)					\
+do {								\
+  ieee_double_shape_type iw_u;					\
+  iw_u.parts.msw = (ix0);					\
+  iw_u.parts.lsw = (ix1);					\
+  (d) = iw_u.value;						\
+} while (0)
+
+/* Set the more significant 32 bits of a double from an int; the low word is kept. (d) is evaluated twice.  */
+
+#define SET_HIGH_WORD(d,v)					\
+do {								\
+  ieee_double_shape_type sh_u;					\
+  sh_u.value = (d);						\
+  sh_u.parts.msw = (v);						\
+  (d) = sh_u.value;						\
+} while (0)
+
+/* Set the less significant 32 bits of a double from an int; the high word is kept. (d) is evaluated twice.  */
+
+#define SET_LOW_WORD(d,v)					\
+do {								\
+  ieee_double_shape_type sl_u;					\
+  sl_u.value = (d);						\
+  sl_u.parts.lsw = (v);						\
+  (d) = sl_u.value;						\
+} while (0)
+
+
+#endif /* _MATH_PRIVATE_H_ */
--- a/platform/gp2x/code940/uClibc/memset.s
+++ b/platform/gp2x/code940/uClibc/memset.s
@ -0,0 +1,72 @@
+/* Copyright (C) 1998 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Philip Blundell <philb@gnu.org>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+@ #include <sys/syscall.h>
+
+		.text
+		.global memset
+		.type memset,%function
+		.align 4
+memset:				@ a1 = dest, a2 = fill value, a3 = byte count; returns a1
+	mov	a4, a1		@ a4 is the write cursor, a1 is left alone as the return value
+	cmp	a3, $8		@ at least 8 bytes to do?
+	blt	2f
+	and	a2, a2, $255	@ fill value is an unsigned char: drop any higher bits before replicating
+	orr	a2, a2, a2, lsl $8	@ replicate the byte ...
+	orr	a2, a2, a2, lsl $16	@ ... into all four byte lanes
+1:
+	tst	a4, $3		@ aligned yet?
+	strneb	a2, [a4], $1
+	subne	a3, a3, $1
+	bne	1b
+	mov	ip, a2		@ second register so each stmia writes 8 bytes
+1:
+	cmp	a3, $8		@ 8 bytes still to do?
+	blt	2f
+	stmia	a4!, {a2, ip}
+	sub	a3, a3, $8
+	cmp	a3, $8		@ 8 bytes still to do?
+	blt	2f
+	stmia	a4!, {a2, ip}
+	sub	a3, a3, $8
+	cmp	a3, $8		@ 8 bytes still to do?
+	blt	2f
+	stmia	a4!, {a2, ip}
+	sub	a3, a3, $8
+	cmp	a3, $8		@ 8 bytes still to do?
+	stmgeia	a4!, {a2, ip}
+	subge	a3, a3, $8
+	bge	1b
+2:
+	movs	a3, a3		@ anything left?
+	moveq	pc, lr		@ nope
+	rsb	a3, a3, $7	@ a3 = 7 - remaining = number of byte stores to skip
+	add	pc, pc, a3, lsl $2	@ computed jump into the strb ladder (4 bytes per instruction)
+	mov	r0, r0		@ filler: pc reads two instructions ahead of the add
+	strb	a2, [a4], $1
+	strb	a2, [a4], $1
+	strb	a2, [a4], $1
+	strb	a2, [a4], $1
+	strb	a2, [a4], $1
+	strb	a2, [a4], $1
+	strb	a2, [a4], $1
+	mov	pc, lr
+
+.size memset,.-memset;
+
--- a/platform/gp2x/code940/uClibc/s_copysign.c
+++ b/platform/gp2x/code940/uClibc/s_copysign.c
@ -0,0 +1,39 @@
+/* @(#)s_copysign.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)	/* ident string is opt-in and hidden from lint */
+static char rcsid[] = "$NetBSD: s_copysign.c,v 1.8 1995/05/10 20:46:57 jtc Exp $";
+#endif /* LIBM_SCCS && !lint */
+
+/*
+ * copysign(double x, double y)
+ * copysign(x,y) returns a value with the magnitude of x and
+ * with the sign bit of y.
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+#ifdef __STDC__
+	double copysign(double x, double y)
+#else
+	double copysign(x,y)
+	double x,y;
+#endif
+{
+	u_int32_t mag,sgn;			/* high words of x and of y */
+	GET_HIGH_WORD(sgn,y); sgn &= 0x80000000;	/* only y's sign bit survives */
+	GET_HIGH_WORD(mag,x); mag &= 0x7fffffff;	/* x's high word without its sign */
+	SET_HIGH_WORD(x,mag|sgn);		/* low word of x is untouched */
+        return x;
+}
+
--- a/platform/gp2x/code940/uClibc/s_fabs.c
+++ b/platform/gp2x/code940/uClibc/s_fabs.c
@ -0,0 +1,35 @@
+/* @(#)s_fabs.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)	/* ident string is opt-in and hidden from lint */
+static char rcsid[] = "$NetBSD: s_fabs.c,v 1.7 1995/05/10 20:47:13 jtc Exp $";
+#endif /* LIBM_SCCS && !lint */
+
+/*
+ * fabs(x) returns the absolute value of x.
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+#ifdef __STDC__
+	double fabs(double x)
+#else
+	double fabs(x)
+	double x;
+#endif
+{
+	u_int32_t top;				/* x's high word, which holds the sign bit */
+	GET_HIGH_WORD(top,x); top &= 0x7fffffff;	/* bit 31 is the sign: clear it */
+	SET_HIGH_WORD(x,top);			/* low word of x is untouched */
+        return x;
+}
--- a/platform/gp2x/code940/uClibc/s_floor.c
+++ b/platform/gp2x/code940/uClibc/s_floor.c
@ -0,0 +1,81 @@
+/* @(#)s_floor.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)	/* ident string is opt-in and hidden from lint */
+static char rcsid[] = "$NetBSD: s_floor.c,v 1.8 1995/05/10 20:47:20 jtc Exp $";
+#endif /* LIBM_SCCS && !lint */
+
+/*
+ * floor(x)
+ * Return x rounded toward -inf to integral value
+ * Method:
+ *	Bit twiddling.
+ * Exception:
+ *	Inexact flag raised if x not equal to floor(x).
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+#ifdef __STDC__
+static const double huge = 1.0e300;	/* floor() evaluates huge+x only to raise the inexact flag */
+#else
+static double huge = 1.0e300;		/* same value, non-const for pre-ANSI compilers */
+#endif
+
+#ifdef __STDC__
+	double floor(double x)
+#else
+	double floor(x)
+	double x;
+#endif
+{
+	int32_t i0,i1,j0;
+	u_int32_t i,j;
+	EXTRACT_WORDS(i0,i1,x);
+	j0 = ((i0>>20)&0x7ff)-0x3ff;		/* unbiased exponent of x */
+	if(j0<20) {				/* any fraction bits reach into the high word */
+	    if(j0<0) { 	/* raise inexact if x != 0 */
+		if(huge+x>0.0) {/* return 0*sign(x) if |x|<1 */
+		    if(i0>=0) {i0=i1=0;}
+		    else if(((i0&0x7fffffff)|i1)!=0)
+			{ i0=0xbff00000;i1=0;}	/* negative and non-zero: result is -1.0 */
+		}
+	    } else {
+		i = (0x000fffff)>>j0;		/* mask of the fraction bits in the high word */
+		if(((i0&i)|i1)==0) return x; /* x is integral */
+		if(huge+x>0.0) {	/* raise inexact flag */
+		    if(i0<0) i0 += (0x00100000)>>j0;	/* negative: step one integer further down */
+		    i0 &= (~i); i1=0;
+		}
+	    }
+	} else if (j0>51) {
+	    if(j0==0x400) return x+x;	/* inf or NaN */
+	    else return x;		/* x is integral */
+	} else {
+	    i = ((u_int32_t)(0xffffffff))>>(j0-20);	/* mask of the fraction bits in the low word */
+	    if((i1&i)==0) return x;	/* x is integral */
+	    if(huge+x>0.0) { 		/* raise inexact flag */
+		if(i0<0) {
+		    if(j0==20) i0+=1;
+		    else {
+			j = (u_int32_t)i1+((u_int32_t)1<<(52-j0));	/* unsigned: shift may be 31 and the sum must wrap */
+			if(j<(u_int32_t)i1) i0 +=1 ; 	/* got a carry */
+			i1=j;
+		    }
+		}
+		i1 &= (~i);
+	    }
+	}
+	INSERT_WORDS(x,i0,i1);
+	return x;
+}
--- a/platform/gp2x/code940/uClibc/s_scalbn.c
+++ b/platform/gp2x/code940/uClibc/s_scalbn.c
@ -0,0 +1,67 @@
+/* @(#)s_scalbn.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)	/* ident string is opt-in and hidden from lint */
+static char rcsid[] = "$NetBSD: s_scalbn.c,v 1.8 1995/05/10 20:48:08 jtc Exp $";
+#endif /* LIBM_SCCS && !lint */
+
+/*
+ * scalbn (double x, int n)
+ * scalbn(x,n) returns x* 2**n  computed by  exponent
+ * manipulation rather than by actually performing an
+ * exponentiation or a multiplication.
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+#ifdef __STDC__
+static const double
+#else
+static double
+#endif
+two54   =  1.80143985094819840000e+16, /* 0x43500000, 0x00000000  = 2^54  */
+twom54  =  5.55111512312578270212e-17, /* 0x3C900000, 0x00000000  = 2^-54 */
+huge   = 1.0e+300,			/* multiplied by +-huge to produce an overflow  */
+tiny   = 1.0e-300;			/* multiplied by +-tiny to produce an underflow */
+
+#ifdef __STDC__
+	double scalbn (double x, int n)
+#else
+	double scalbn (x,n)
+	double x; int n;
+#endif
+{
+	int32_t k,hx,lx;
+	EXTRACT_WORDS(hx,lx,x);
+        k = (hx&0x7ff00000)>>20;		/* extract exponent */
+        if (k==0) {				/* 0 or subnormal x */
+            if ((lx|(hx&0x7fffffff))==0) return x; /* +-0 */
+	    x *= two54;				/* normalise x, compensated in k below */
+	    GET_HIGH_WORD(hx,x);
+	    k = ((hx&0x7ff00000)>>20) - 54;
+            if (n< -50000) return tiny*x; 	/*underflow*/
+	    }
+        if (k==0x7ff) return x+x;		/* NaN or Inf */
+	/* Reject a huge n BEFORE forming k+n: signed overflow is undefined
+	 * behaviour, so the sum must never be allowed to wrap.  Any finite
+	 * non-zero x scaled by more than 2**50000 overflows anyway. */
+        if (n > 50000) return huge*copysign(huge,x); /* overflow  */
+        k = k+n;				/* in range: k<=0x7fe and n<=50000 */
+        if (k >  0x7fe) return huge*copysign(huge,x); /* overflow  */
+        if (k > 0) 				/* normal result */
+	    {SET_HIGH_WORD(x,(hx&0x800fffff)|(k<<20)); return x;}
+        if (k <= -54) return tiny*copysign(tiny,x); 	/*underflow*/
+        k += 54;				/* subnormal result */
+	SET_HIGH_WORD(x,(hx&0x800fffff)|(k<<20));
+        return x*twom54;
+}
--- a/platform/gp2x/code940/uClibc/s_sin.c
+++ b/platform/gp2x/code940/uClibc/s_sin.c
@ -0,0 +1,82 @@
+/* @(#)s_sin.c 5.1 93/09/24 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: s_sin.c,v 1.7 1995/05/10 20:48:15 jtc Exp $";
+#endif
+
+/* sin(x)
+ * Return sine function of x.
+ *
+ * kernel function:
+ *	__kernel_sin		... sine function on [-pi/4,pi/4]
+ *	__kernel_cos		... cosine function on [-pi/4,pi/4]
+ *	__ieee754_rem_pio2	... argument reduction routine
+ *
+ * Method.
+ *      Let S,C and T denote the sin, cos and tan respectively on
+ *	[-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
+ *	in [-pi/4 , +pi/4], and let n = k mod 4.
+ *	We have
+ *
+ *          n        sin(x)      cos(x)        tan(x)
+ *     ----------------------------------------------------------
+ *	    0	       S	   C		 T
+ *	    1	       C	  -S		-1/T
+ *	    2	      -S	  -C		 T
+ *	    3	      -C	   S		-1/T
+ *     ----------------------------------------------------------
+ *
+ * Special cases:
+ *      Let trig be any of sin, cos, or tan.
+ *      trig(+-INF)  is NaN, with signals;
+ *      trig(NaN)    is that NaN;
+ *
+ * Accuracy:
+ *	TRIG(x) returns trig(x) nearly rounded
+ */
+
+#include "math.h"
+#include "math_private.h"
+
+/*
+ * sin(x): sine of x.
+ *
+ * Small arguments go straight to __kernel_sin(); Inf and NaN yield NaN;
+ * everything else is reduced by __ieee754_rem_pio2() and dispatched on
+ * the low two bits of the value it returns.
+ */
+#ifdef __STDC__
+	double sin(double x)
+#else
+	double sin(x)
+	double x;
+#endif
+{
+	double reduced[2];	/* filled in by __ieee754_rem_pio2() */
+	int32_t abs_hi, quadrant;
+
+	/* High word of x with the sign bit cleared. */
+	GET_HIGH_WORD(abs_hi,x);
+	abs_hi &= 0x7fffffff;
+
+	/* |x| ~< pi/4: no reduction needed, the tail argument is zero. */
+	if (abs_hi <= 0x3fe921fb)
+		return __kernel_sin(x, 0.0, 0);
+
+	/* sin(Inf or NaN) is NaN */
+	if (abs_hi >= 0x7ff00000)
+		return x - x;
+
+	/* Argument reduction needed: pick the kernel and sign from n mod 4. */
+	quadrant = __ieee754_rem_pio2(x, reduced) & 3;
+
+	if (quadrant == 0)
+		return  __kernel_sin(reduced[0], reduced[1], 1);
+	if (quadrant == 1)
+		return  __kernel_cos(reduced[0], reduced[1]);
+	if (quadrant == 2)
+		return -__kernel_sin(reduced[0], reduced[1], 1);
+
+	return -__kernel_cos(reduced[0], reduced[1]);
+}
--- a/platform/gp2x/code940/uClibc/wrappers.c
+++ b/platform/gp2x/code940/uClibc/wrappers.c
@ -0,0 +1,12 @@
+#include "math.h"
+
+/*
+ * pow(x, y): public entry point that passes both arguments unchanged to
+ * __ieee754_pow() and returns whatever it returns.
+ *
+ * NOTE(review): this file includes only "math.h"; confirm that header
+ * declares __ieee754_pow(), otherwise the call is implicitly declared.
+ */
+double pow(double x, double y)
+{
+	double result = __ieee754_pow(x, y);
+
+	return result;
+}
+
+
+/*
+ * log(x): public entry point that passes its argument unchanged to
+ * __ieee754_log() and returns whatever it returns.
+ *
+ * NOTE(review): this file includes only "math.h"; confirm that header
+ * declares __ieee754_log(), otherwise the call is implicitly declared.
+ */
+double log(double x)
+{
+	double result = __ieee754_log(x);
+
+	return result;
+}