code940 now plays mp3s

git-svn-id: file:///home/notaz/opt/svn/PicoDrive@22 be3aeb3a-fb24-0410-a615-afba39da0efa
This commit is contained in:
notaz 2007-01-20 23:11:02 +00:00
parent b837b69b3f
commit 42c7b14797
26 changed files with 1061 additions and 75 deletions

134
platform/gp2x/code940/940.c Normal file
View file

@ -0,0 +1,134 @@
#include "940shared.h"
static _940_data_t *shared_data = (_940_data_t *) 0x00100000;
static _940_ctl_t *shared_ctl = (_940_ctl_t *) 0x00200000;
static unsigned char *mp3_data = (unsigned char *) 0x01000000;
YM2612 *ym2612_940;
int *mix_buffer;
// from init.s
void wait_irq(void);
void spend_cycles(int c);
void cache_clean(void);
void cache_clean_flush(void);
// asm volatile ("mov r0, #0" ::: "r0");
// asm volatile ("mcr p15, 0, r0, c7, c6, 0" ::: "r0"); /* flush dcache */
// asm volatile ("mcr p15, 0, r0, c7, c10, 4" ::: "r0"); /* drain write buffer */
void Main940(int startvector)
{
ym2612_940 = &shared_data->ym2612;
mix_buffer = shared_data->mix_buffer;
// debug
shared_ctl->vstarts[startvector]++;
// asm volatile ("mcr p15, 0, r0, c7, c10, 4" ::: "r0");
for (;;)
{
int job_num = 0;
/*
while (!shared_ctl->busy)
{
//shared_ctl->waitc++;
spend_cycles(8*1024);
}
*/
if (!shared_ctl->busy)
{
wait_irq();
}
for (job_num = 0; job_num < MAX_940JOBS; job_num++)
{
switch (shared_ctl->jobs[job_num])
{
case JOB940_INITALL:
/* ym2612 */
shared_ctl->writebuff0[0] = shared_ctl->writebuff1[0] = 0xffff;
YM2612Init_(shared_ctl->baseclock, shared_ctl->rate);
/* Helix mp3 decoder */
shared_data->mp3dec = MP3InitDecoder();
break;
case JOB940_YM2612RESETCHIP:
YM2612ResetChip_();
break;
case JOB940_PICOSTATELOAD:
YM2612PicoStateLoad_();
break;
case JOB940_YM2612UPDATEONE: {
int i, dw, *wbuff;
if (shared_ctl->writebuffsel == 1) {
wbuff = (int *) shared_ctl->writebuff1;
} else {
wbuff = (int *) shared_ctl->writebuff0;
}
/* playback all writes */
for (i = 2048/2; i > 0; i--) {
UINT16 d;
dw = *wbuff++;
d = dw;
if (d == 0xffff) break;
YM2612Write_(d >> 8, d);
d = (dw>>16);
if (d == 0xffff) break;
YM2612Write_(d >> 8, d);
}
YM2612UpdateOne_(0, shared_ctl->length, shared_ctl->stereo);
break;
}
case JOB940_MP3DECODE: {
int mp3_offs = shared_ctl->mp3_offs;
unsigned char *readPtr = mp3_data + mp3_offs;
int bytesLeft = shared_ctl->mp3_len - mp3_offs;
int offset; // frame offset from readPtr
int err;
if (bytesLeft <= 0) break; // EOF, nothing to do
offset = MP3FindSyncWord(readPtr, bytesLeft);
if (offset < 0) {
shared_ctl->mp3_offs = shared_ctl->mp3_len;
break; // EOF
}
readPtr += offset;
bytesLeft -= offset;
err = MP3Decode(shared_data->mp3dec, &readPtr, &bytesLeft,
shared_data->mp3_buffer[shared_ctl->mp3_buffsel], 0);
if (err) {
if (err == ERR_MP3_INDATA_UNDERFLOW) {
shared_ctl->mp3_offs = shared_ctl->mp3_len; // EOF
break;
} else if (err <= -6 && err >= -12) {
// ERR_MP3_INVALID_FRAMEHEADER, ERR_MP3_INVALID_*
// just try to skip the offending frame..
readPtr++;
}
shared_ctl->mp3_errors++;
shared_ctl->mp3_lasterr = err;
}
shared_ctl->mp3_offs = readPtr - mp3_data;
break;
}
}
}
cache_clean();
// asm volatile ("mov r0, #0" ::: "r0");
// asm volatile ("mcr p15, 0, r0, c7, c10, 4" ::: "r0"); /* drain write buffer, should be done on nonbuffered write */
// cache_clean_flush();
shared_ctl->loopc++;
shared_ctl->busy = 0;
}
}

View file

@ -0,0 +1,185 @@
.global code940
code940: @ interrupt table:
b .b_reset @ reset
b .b_undef @ undefined instructions
b .b_swi @ software interrupt
b .b_pabort @ prefetch abort
b .b_dabort @ data abort
b .b_reserved @ reserved
b .b_irq @ IRQ
b .b_fiq @ FIQ
@ test
.b_reset:
mov r12, #0
b .Begin
.b_undef:
mov r12, #1
b .Begin
.b_swi:
mov r12, #2
b .Begin
.b_pabort:
mov r12, #3
b .Begin
.b_dabort:
mov r12, #4
b .Begin
.b_reserved:
mov r12, #5
b .Begin
.b_irq:
mov r12, #6
mov sp, #0x100000 @ reset stack
sub sp, sp, #4
mov r1, #0xbe000000 @ assume we live @ 0x2000000 bank
orr r2, r1, #0x3B00
orr r2, r2, #0x0046
mvn r3, #0
strh r3, [r2] @ clear any pending interrupts from the DUALCPU unit
orr r2, r1, #0x4500
str r3, [r2] @ clear all pending interrupts in irq controller's SRCPND register
orr r2, r2, #0x0010
str r3, [r2] @ clear all pending interrupts in irq controller's INTPND register
b .Enter
.b_fiq:
mov r12, #7
b .Begin
.Begin:
mov sp, #0x100000 @ set the stack top (1M)
sub sp, sp, #4 @ minus 4
@ set up memory region 0 -- the whole 4GB address space
mov r0, #(0x1f<<1)|1 @ region data
mcr p15, 0, r0, c6, c0, 0 @ opcode2 ~ data/instr
mcr p15, 0, r0, c6, c0, 1
@ set up region 1 which is the first 2 megabytes.
mov r0, #(0x14<<1)|1 @ region data
mcr p15, 0, r0, c6, c1, 0
mcr p15, 0, r0, c6, c1, 1
@ set up region 2: 64k 0x200000-0x210000
mov r0, #(0x0f<<1)|1
orr r0, r0, #0x200000
mcr p15, 0, r0, c6, c2, 0
mcr p15, 0, r0, c6, c2, 1
@ set up region 3: 64k 0xbe000000-0xbe010000 (hw control registers)
mov r0, #(0x0f<<1)|1
orr r0, r0, #0xbe000000
mcr p15, 0, r0, c6, c3, 0
mcr p15, 0, r0, c6, c3, 1
@ set up region 4: 16M 0x01000000-0x02000000 (mp3 area)
mov r0, #(0x17<<1)|1
orr r0, r0, #0x01000000
mcr p15, 0, r0, c6, c4, 0
mcr p15, 0, r0, c6, c4, 1
@ set regions 1 and 4 to be cacheable (so the first 2M and mp3 area will be cacheable)
mov r0, #(1<<1)|(1<<4)
mcr p15, 0, r0, c2, c0, 0
mcr p15, 0, r0, c2, c0, 1
@ set region 1 to be bufferable too (only data)
mov r0, #(1<<1)
mcr p15, 0, r0, c3, c0, 0
@ set protection, allow accsess only to regions 1 and 2
mov r0, #(3<<8)|(3<<6)|(3<<4)|(3<<2)|(0) @ data: [full, full, full, full, no access] for regions [4 3 2 1 0]
mcr p15, 0, r0, c5, c0, 0
mov r0, #(0<<8)|(0<<6)|(0<<4)|(3<<2)|(0) @ instructions: [no access, no, no, full, no]
mcr p15, 0, r0, c5, c0, 1
mrc p15, 0, r0, c1, c0, 0 @ fetch current control reg
orr r0, r0, #1 @ 0x00000001: enable protection unit
orr r0, r0, #4 @ 0x00000004: enable D cache
orr r0, r0, #0x1000 @ 0x00001000: enable I cache
bic r0, r0, #0xC0000000
orr r0, r0, #0x40000000 @ 0x40000000: synchronous, faster?
@ orr r0, r0, #0xC0000000 @ 0xC0000000: async
mcr p15, 0, r0, c1, c0, 0 @ set control reg
@ flush (invalidate) the cache (just in case)
mov r0, #0
mcr p15, 0, r0, c7, c6, 0
.Enter:
mov r0, r12
bl Main940
@ we should never get here
.b_deadloop:
b .b_deadloop
@ so asm utils are also defined here:
.global spend_cycles @ c
spend_cycles:
mov r0, r0, lsr #2 @ 4 cycles/iteration
sub r0, r0, #2 @ entry/exit/init
.sc_loop:
subs r0, r0, #1
bpl .sc_loop
bx lr
@ clean-flush function from ARM940T technical reference manual
.global cache_clean_flush
cache_clean_flush:
mov r1, #0 @ init line counter
ccf_outer_loop:
mov r0, #0 @ segment counter
ccf_inner_loop:
orr r2, r1, r0 @ make segment and line address
mcr p15, 0, r2, c7, c14, 2 @ clean and flush that line
add r0, r0, #0x10 @ incremet secment counter
cmp r0, #0x40 @ complete all 4 segments?
bne ccf_inner_loop
add r1, r1, #0x04000000 @ increment line counter
cmp r1, #0 @ complete all lines?
bne ccf_outer_loop
bx lr
@ clean-only version
.global cache_clean
cache_clean:
mov r1, #0 @ init line counter
cf_outer_loop:
mov r0, #0 @ segment counter
cf_inner_loop:
orr r2, r1, r0 @ make segment and line address
mcr p15, 0, r2, c7, c10, 2 @ clean that line
add r0, r0, #0x10 @ incremet secment counter
cmp r0, #0x40 @ complete all 4 segments?
bne cf_inner_loop
add r1, r1, #0x04000000 @ increment line counter
cmp r1, #0 @ complete all lines?
bne cf_outer_loop
bx lr
.global wait_irq
wait_irq:
mrs r0, cpsr
bic r0, r0, #0x80
msr cpsr_c, r0 @ enable interrupts
mov r0, #0
mcr p15, 0, r0, c7, c0, 4 @ wait for IRQ
@ mcr p15, 0, r0, c15, c8, 2
b .b_reserved
.pool
@ vim:filetype=armasm:

View file

@ -0,0 +1,42 @@
#include "../../../Pico/sound/ym2612.h"
#include "../helix/pub/mp3dec.h"
enum _940_job_t {
JOB940_INITALL = 1,
JOB940_YM2612RESETCHIP,
JOB940_YM2612UPDATEONE,
JOB940_PICOSTATELOAD,
JOB940_MP3DECODE,
JOB940_NUMJOBS
};
#define MAX_940JOBS 2
typedef struct
{
YM2612 ym2612; /* current state of the emulated YM2612 */
HMP3Decoder mp3dec; /* mp3 decoder's handle */
int mix_buffer[44100/50*2]; /* this is where the YM2612 samples will be mixed to */
short mp3_buffer[2][1152*2]; /* buffers for mp3 decoder's output */
} _940_data_t;
typedef struct
{
int jobs[MAX_940JOBS]; /* jobs for second core */
int busy; /* busy status of the 940 core */
int length; /* number of samples to mix (882 max) */
int stereo; /* mix samples as stereo, doubles sample count automatically */
int baseclock; /* ym2612 settings */
int rate;
int writebuffsel; /* which write buffer to use (from 940 side) */
UINT16 writebuff0[2048]; /* list of writes to ym2612, 1024 for savestates, 1024 extra */
UINT16 writebuff1[2048];
int mp3_len; /* data len of loaded mp3 */
int mp3_offs; /* current playback offset (just after last decoded frame) */
int mp3_buffsel; /* which output buffer to decode to */
int vstarts[8]; /* debug: number of starts from each of 8 vectors */
int loopc; /* debug: main loop counter */
int mp3_errors; /* debug: mp3 decoder's error counter */
int mp3_lasterr; /* debug: mp3 decoder's last error */
} _940_ctl_t;

View file

@ -0,0 +1,87 @@
# you may or may not need to change this
#devkit_path = x:/stuff/dev/devkitgp2x/
devkit_path = /usr/local/devkitPro/devkitGP2X/
lgcc_path = $(devkit_path)lib/gcc/arm-linux/4.0.3/
CROSS = arm-linux-
#CROSS = $(devkit_path)bin/arm-linux-
# settings
#up = 1
DEFINC = -I../.. -I. -D__GP2X__ -DARM # -DBENCHMARK
COPT_COMMON = -static -s -O3 -ftracer -fstrength-reduce -Wall -funroll-loops -fomit-frame-pointer -fstrict-aliasing -ffast-math
COPT = $(COPT_COMMON) -mtune=arm920t
GCC = $(CROSS)gcc
STRIP = $(CROSS)strip
AS = $(CROSS)as
LD = $(CROSS)ld
OBJCOPY = $(CROSS)objcopy
all: code940.bin
up940:
@cp -v code940.bin /mnt/gp2x/mnt/sd/games/PicoDrive/
# @cmd //C copy code940.bin \\\\10.0.1.2\\gp2x\\mnt\\sd\\games\\PicoDrive\\
.c.o:
@echo $<
$(GCC) $(COPT) $(DEFINC) -c $< -o $@
.s.o:
@echo $<
$(GCC) $(COPT) $(DEFINC) -c $< -o $@
# stuff for 940 core
# init, emu_control, emu
OBJS940 += 940init.o 940.o 940ym2612.o memcpy.o
# the asm code seems to be faster when run on 920, but not on 940 for some reason
# OBJS940 += ../../Pico/sound/ym2612_asm.o
# uClibc library code
OBJS940 += uClibc/memset.o uClibc/s_floor.o uClibc/e_pow.o uClibc/e_sqrt.o uClibc/s_fabs.o
OBJS940 += uClibc/s_scalbn.o uClibc/s_copysign.o uClibc/k_sin.o uClibc/k_cos.o uClibc/s_sin.o
OBJS940 += uClibc/e_rem_pio2.o uClibc/k_rem_pio2.o uClibc/e_log.o uClibc/wrappers.o
code940.bin : code940.gpe
@echo $@
@$(OBJCOPY) -O binary $< $@
code940.gpe : $(OBJS940) ../helix/helix_mp3.a
@echo $@
@$(LD) -static -e code940 -Ttext 0x0 $^ -L$(lgcc_path) -lgcc -o $@
940ym2612.o : ../../../Pico/sound/ym2612.c
@echo $@
@$(GCC) $(COPT_COMMON) -mtune=arm940t $(DEFINC) -DEXTERNAL_YM2612 -c $< -o $@
../helix/helix_mp3.a:
@make -C ../helix/
# cleanup
clean: tidy
@$(RM) code940.bin
tidy:
@$(RM) code940.gpe $(OBJS940)
OBJSMP3T = mp3test.o ../gp2x.o ../asmutils.o ../usbjoy.o
mp3test.gpe : $(OBJSMP3T) ../helix/helix_mp3.a
$(GCC) -static -o $@ $^
$(STRIP) $@
@cp -v $@ /mnt/gp2x/mnt/sd
cleanmp3test:
$(RM) $(OBJSMP3T) mp3test.gpe
# uClibc/e_pow.o : uClibc/e_pow.c
# @echo $<
# @$(GCC) $(COPT) $(DEFINC) -fno-profile-generate -c $< -o $@
# uClibc/e_sqrt.o : uClibc/e_sqrt.c
# @echo $<
# @$(GCC) $(COPT) $(DEFINC) -fno-profile-generate -c $< -o $@

View file

@ -0,0 +1,500 @@
/* $NetBSD: memcpy.S,v 1.3 1997/11/22 03:27:12 mark Exp $ */
/*-
* Copyright (c) 1997 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Neil A. Carson and Mark Brinicombe
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS\'\' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/* This was modified by Jay Monkman <jmonkman@smoothsmoothie.com> to
* save and restore r12. This is necessary for RTEMS.
*/
/* #include <machine/asm.h>*/
#define ENTRY(_LABEL) \
.global _LABEL; _LABEL:
.globl memcpy
memcpy:
@ ENTRY(gp2x_memcpy)
stmfd sp!, {r0, r12, lr}
@ bl _gp2x_memcpy
bl _memcpy
ldmfd sp!, {r0, r12, pc}
.globl memmove
memmove:
@ ENTRY(gp2x_memmove)
stmfd sp!, {r0, r12, lr}
@ bl _gp2x_memcpy
bl _memcpy
ldmfd sp!, {r0, r12, pc}
/*
* This is one fun bit of code ...
* Some easy listening music is suggested while trying to understand this
* code e.g. Iron Maiden
*
* For anyone attempting to understand it :
*
* The core code is implemented here with simple stubs for memcpy()
* memmove() and bcopy().
*
* All local labels are prefixed with Lmemcpy_
* Following the prefix a label starting f is used in the forward copy code
* while a label using b is used in the backwards copy code
* The source and destination addresses determine whether a forward or
* backward copy is performed.
* Separate bits of code are used to deal with the following situations
* for both the forward and backwards copy.
* unaligned source address
* unaligned destination address
* Separate copy routines are used to produce an optimised result for each
* of these cases.
* The copy code will use LDM/STM instructions to copy up to 32 bytes at
* a time where possible.
*
* Note: r12 (aka ip) can be trashed during the function along with
* r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
* Additional registers are preserved prior to use i.e. r4, r5 & lr
*
* Apologies for the state of the comments;-)
*/
_memcpy:
@ ENTRY(_gp2x_memcpy)
/* Determine copy direction */
cmp r1, r0
bcc Lmemcpy_backwards
moveq r0, #0 /* Quick abort for len=0 */
moveq pc, lr
stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
subs r2, r2, #4
blt Lmemcpy_fl4 /* less than 4 bytes */
ands r12, r0, #3
bne Lmemcpy_fdestul /* oh unaligned destination addr */
ands r12, r1, #3
bne Lmemcpy_fsrcul /* oh unaligned source addr */
Lmemcpy_ft8:
/* We have aligned source and destination */
subs r2, r2, #8
blt Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
subs r2, r2, #0x14
blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
stmdb sp!, {r4, r7, r8, r9, r10} /* borrow r4 */
/* blat 64 bytes at a time */
/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
ldmia r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
stmia r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
subs r2, r2, #0x40
bge Lmemcpy_floop32
cmn r2, #0x10
ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
stmgeia r0!, {r3, r4, r12, lr}
subge r2, r2, #0x10
ldmia sp!, {r4, r7, r8, r9, r10} /* return r4 */
Lmemcpy_fl32:
adds r2, r2, #0x14
/* blat 12 bytes at a time */
Lmemcpy_floop12:
ldmgeia r1!, {r3, r12, lr}
stmgeia r0!, {r3, r12, lr}
subges r2, r2, #0x0c
bge Lmemcpy_floop12
Lmemcpy_fl12:
adds r2, r2, #8
blt Lmemcpy_fl4
subs r2, r2, #4
ldrlt r3, [r1], #4
strlt r3, [r0], #4
ldmgeia r1!, {r3, r12}
stmgeia r0!, {r3, r12}
subge r2, r2, #4
Lmemcpy_fl4:
/* less than 4 bytes to go */
adds r2, r2, #4
ldmeqia sp!, {r0, pc} /* done */
/* copy the crud byte at a time */
cmp r2, #2
ldrb r3, [r1], #1
strb r3, [r0], #1
ldrgeb r3, [r1], #1
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1
strgtb r3, [r0], #1
ldmia sp!, {r0, pc}
/* erg - unaligned destination */
Lmemcpy_fdestul:
rsb r12, r12, #4
cmp r12, #2
/* align destination with byte copies */
ldrb r3, [r1], #1
strb r3, [r0], #1
ldrgeb r3, [r1], #1
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1
strgtb r3, [r0], #1
subs r2, r2, r12
blt Lmemcpy_fl4 /* less the 4 bytes */
ands r12, r1, #3
beq Lmemcpy_ft8 /* we have an aligned source */
/* erg - unaligned source */
/* This is where it gets nasty ... */
Lmemcpy_fsrcul:
bic r1, r1, #3
ldr lr, [r1], #4
cmp r12, #2
bgt Lmemcpy_fsrcul3
beq Lmemcpy_fsrcul2
cmp r2, #0x0c
blt Lmemcpy_fsrcul1loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
Lmemcpy_fsrcul1loop16:
mov r3, lr, lsr #8
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #24
mov r4, r4, lsr #8
orr r4, r4, r5, lsl #24
mov r5, r5, lsr #8
orr r5, r5, r12, lsl #24
mov r12, r12, lsr #8
orr r12, r12, lr, lsl #24
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge Lmemcpy_fsrcul1loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt Lmemcpy_fsrcul1l4
Lmemcpy_fsrcul1loop4:
mov r12, lr, lsr #8
ldr lr, [r1], #4
orr r12, r12, lr, lsl #24
str r12, [r0], #4
subs r2, r2, #4
bge Lmemcpy_fsrcul1loop4
Lmemcpy_fsrcul1l4:
sub r1, r1, #3
b Lmemcpy_fl4
Lmemcpy_fsrcul2:
cmp r2, #0x0c
blt Lmemcpy_fsrcul2loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
Lmemcpy_fsrcul2loop16:
mov r3, lr, lsr #16
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #16
mov r4, r4, lsr #16
orr r4, r4, r5, lsl #16
mov r5, r5, lsr #16
orr r5, r5, r12, lsl #16
mov r12, r12, lsr #16
orr r12, r12, lr, lsl #16
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge Lmemcpy_fsrcul2loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt Lmemcpy_fsrcul2l4
Lmemcpy_fsrcul2loop4:
mov r12, lr, lsr #16
ldr lr, [r1], #4
orr r12, r12, lr, lsl #16
str r12, [r0], #4
subs r2, r2, #4
bge Lmemcpy_fsrcul2loop4
Lmemcpy_fsrcul2l4:
sub r1, r1, #2
b Lmemcpy_fl4
Lmemcpy_fsrcul3:
cmp r2, #0x0c
blt Lmemcpy_fsrcul3loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
Lmemcpy_fsrcul3loop16:
mov r3, lr, lsr #24
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #8
mov r4, r4, lsr #24
orr r4, r4, r5, lsl #8
mov r5, r5, lsr #24
orr r5, r5, r12, lsl #8
mov r12, r12, lsr #24
orr r12, r12, lr, lsl #8
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge Lmemcpy_fsrcul3loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt Lmemcpy_fsrcul3l4
Lmemcpy_fsrcul3loop4:
mov r12, lr, lsr #24
ldr lr, [r1], #4
orr r12, r12, lr, lsl #8
str r12, [r0], #4
subs r2, r2, #4
bge Lmemcpy_fsrcul3loop4
Lmemcpy_fsrcul3l4:
sub r1, r1, #1
b Lmemcpy_fl4
Lmemcpy_backwards:
add r1, r1, r2
add r0, r0, r2
subs r2, r2, #4
blt Lmemcpy_bl4 /* less than 4 bytes */
ands r12, r0, #3
bne Lmemcpy_bdestul /* oh unaligned destination addr */
ands r12, r1, #3
bne Lmemcpy_bsrcul /* oh unaligned source addr */
Lmemcpy_bt8:
/* We have aligned source and destination */
subs r2, r2, #8
blt Lmemcpy_bl12 /* less than 12 bytes (4 from above) */
stmdb sp!, {r4, r7, r8, r9, r10, lr}
subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
blt Lmemcpy_bl32
/* blat 64 bytes at a time */
/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
ldmdb r1!, {r3, r4, r7, r8, r9, r10, r12, lr}
stmdb r0!, {r3, r4, r7, r8, r9, r10, r12, lr}
subs r2, r2, #0x40
bge Lmemcpy_bloop32
Lmemcpy_bl32:
cmn r2, #0x10
ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
stmgedb r0!, {r3, r4, r12, lr}
subge r2, r2, #0x10
adds r2, r2, #0x14
ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
stmgedb r0!, {r3, r12, lr}
subge r2, r2, #0x0c
ldmia sp!, {r4, r7, r8, r9, r10, lr}
Lmemcpy_bl12:
adds r2, r2, #8
blt Lmemcpy_bl4
subs r2, r2, #4
ldrlt r3, [r1, #-4]!
strlt r3, [r0, #-4]!
ldmgedb r1!, {r3, r12}
stmgedb r0!, {r3, r12}
subge r2, r2, #4
Lmemcpy_bl4:
/* less than 4 bytes to go */
adds r2, r2, #4
moveq pc, lr /* done */
/* copy the crud byte at a time */
cmp r2, #2
ldrb r3, [r1, #-1]!
strb r3, [r0, #-1]!
ldrgeb r3, [r1, #-1]!
strgeb r3, [r0, #-1]!
ldrgtb r3, [r1, #-1]!
strgtb r3, [r0, #-1]!
mov pc, lr
/* erg - unaligned destination */
Lmemcpy_bdestul:
cmp r12, #2
/* align destination with byte copies */
ldrb r3, [r1, #-1]!
strb r3, [r0, #-1]!
ldrgeb r3, [r1, #-1]!
strgeb r3, [r0, #-1]!
ldrgtb r3, [r1, #-1]!
strgtb r3, [r0, #-1]!
subs r2, r2, r12
blt Lmemcpy_bl4 /* less than 4 bytes to go */
ands r12, r1, #3
beq Lmemcpy_bt8 /* we have an aligned source */
/* erg - unaligned source */
/* This is where it gets nasty ... */
Lmemcpy_bsrcul:
bic r1, r1, #3
ldr r3, [r1, #0]
cmp r12, #2
blt Lmemcpy_bsrcul1
beq Lmemcpy_bsrcul2
cmp r2, #0x0c
blt Lmemcpy_bsrcul3loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5, lr}
Lmemcpy_bsrcul3loop16:
mov lr, r3, lsl #8
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsr #24
mov r12, r12, lsl #8
orr r12, r12, r5, lsr #24
mov r5, r5, lsl #8
orr r5, r5, r4, lsr #24
mov r4, r4, lsl #8
orr r4, r4, r3, lsr #24
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge Lmemcpy_bsrcul3loop16
ldmia sp!, {r4, r5, lr}
adds r2, r2, #0x0c
blt Lmemcpy_bsrcul3l4
Lmemcpy_bsrcul3loop4:
mov r12, r3, lsl #8
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsr #24
str r12, [r0, #-4]!
subs r2, r2, #4
bge Lmemcpy_bsrcul3loop4
Lmemcpy_bsrcul3l4:
add r1, r1, #3
b Lmemcpy_bl4
Lmemcpy_bsrcul2:
cmp r2, #0x0c
blt Lmemcpy_bsrcul2loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5, lr}
Lmemcpy_bsrcul2loop16:
mov lr, r3, lsl #16
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsr #16
mov r12, r12, lsl #16
orr r12, r12, r5, lsr #16
mov r5, r5, lsl #16
orr r5, r5, r4, lsr #16
mov r4, r4, lsl #16
orr r4, r4, r3, lsr #16
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge Lmemcpy_bsrcul2loop16
ldmia sp!, {r4, r5, lr}
adds r2, r2, #0x0c
blt Lmemcpy_bsrcul2l4
Lmemcpy_bsrcul2loop4:
mov r12, r3, lsl #16
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsr #16
str r12, [r0, #-4]!
subs r2, r2, #4
bge Lmemcpy_bsrcul2loop4
Lmemcpy_bsrcul2l4:
add r1, r1, #2
b Lmemcpy_bl4
Lmemcpy_bsrcul1:
cmp r2, #0x0c
blt Lmemcpy_bsrcul1loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5, lr}
Lmemcpy_bsrcul1loop32:
mov lr, r3, lsl #24
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsr #8
mov r12, r12, lsl #24
orr r12, r12, r5, lsr #8
mov r5, r5, lsl #24
orr r5, r5, r4, lsr #8
mov r4, r4, lsl #24
orr r4, r4, r3, lsr #8
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge Lmemcpy_bsrcul1loop32
ldmia sp!, {r4, r5, lr}
adds r2, r2, #0x0c
blt Lmemcpy_bsrcul1l4
Lmemcpy_bsrcul1loop4:
mov r12, r3, lsl #24
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsr #8
str r12, [r0, #-4]!
subs r2, r2, #4
bge Lmemcpy_bsrcul1loop4
Lmemcpy_bsrcul1l4:
add r1, r1, #1
b Lmemcpy_bl4

View file

@ -0,0 +1,383 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <fcntl.h>
#include <errno.h>
#include "940shared.h"
#include "../gp2x.h"
//#include "emu.h"
//#include "menu.h"
#include "../asmutils.h"
#include "../helix/pub/mp3dec.h"
/* we will need some gp2x internals here */
extern volatile unsigned short *gp2x_memregs; /* from minimal library rlyeh */
extern volatile unsigned long *gp2x_memregl;
static unsigned char *shared_mem = 0;
static _940_data_t *shared_data = 0;
static _940_ctl_t *shared_ctl = 0;
static unsigned char *mp3_mem = 0;
#define MP3_SIZE_MAX (0x1000000 - 4*640*480)
int crashed_940 = 0;
/***********************************************************/
#define MAXOUT (+32767)
#define MINOUT (-32768)
/* limitter */
#define Limit(val, max,min) { \
if ( val > max ) val = max; \
else if ( val < min ) val = min; \
}
void wait_busy_940(void)
{
int i;
#if 0
printf("940 busy, entering wait loop.. (cnt: %i, wc: %i, ve: ", shared_ctl->loopc, shared_ctl->waitc);
for (i = 0; i < 8; i++)
printf("%i ", shared_ctl->vstarts[i]);
printf(")\n");
for (i = 0; shared_ctl->busy; i++)
{
spend_cycles(1024); /* needs tuning */
}
printf("wait iterations: %i\n", i);
#else
for (i = 0; shared_ctl->busy && i < 0x10000; i++)
spend_cycles(8*1024);
if (i < 0x10000) return;
/* 940 crashed */
printf("940 crashed (cnt: %i, ve: ", shared_ctl->loopc);
for (i = 0; i < 8; i++)
printf("%i ", shared_ctl->vstarts[i]);
printf(")\n");
crashed_940 = 1;
#endif
}
void add_job_940(int job0, int job1)
{
shared_ctl->jobs[0] = job0;
shared_ctl->jobs[1] = job1;
shared_ctl->busy = 1;
gp2x_memregs[0x3B3E>>1] = 0xffff; // cause an IRQ for 940
}
static int read_to_upper(void *dest, void *tmpbuf, int tmpsize, FILE *f)
{
int nRead, nLen = 0;
while(1)
{
nRead = fread(tmpbuf, 1, tmpsize, f);
if(nRead <= 0)
break;
memcpy((unsigned char *)dest + nLen, tmpbuf, nRead);
nLen += nRead;
}
return nLen;
}
static void simpleWait(int thissec, int lim_time)
{
struct timeval tval;
spend_cycles(1024);
gettimeofday(&tval, 0);
if(thissec != tval.tv_sec) tval.tv_usec+=1000000;
while(tval.tv_usec < lim_time)
{
spend_cycles(1024);
gettimeofday(&tval, 0);
if(thissec != tval.tv_sec) tval.tv_usec+=1000000;
}
}
char **g_argv;
/* none of the functions in this file should be called before this one */
void YM2612Init_940(int baseclock, int rate)
{
printf("YM2612Init_940()\n");
printf("Mem usage: shared_data: %i, shared_ctl: %i\n", sizeof(*shared_data), sizeof(*shared_ctl));
Reset940(1, 2);
Pause940(1);
gp2x_memregs[0x3B46>>1] = 0xffff; // clear pending DUALCPU interrupts for 940
gp2x_memregs[0x3B42>>1] = 0xffff; // enable DUALCPU interrupts for 940
gp2x_memregl[0x4508>>2] = ~(1<<26); // unmask DUALCPU ints in the undocumented 940's interrupt controller
if (shared_mem == NULL)
{
shared_mem = (unsigned char *) mmap(0, 0x210000, PROT_READ|PROT_WRITE, MAP_SHARED, memdev, 0x2000000);
if(shared_mem == MAP_FAILED)
{
printf("mmap(shared_data) failed with %i\n", errno);
exit(1);
}
shared_data = (_940_data_t *) (shared_mem+0x100000);
/* this area must not get buffered on either side */
shared_ctl = (_940_ctl_t *) (shared_mem+0x200000);
mp3_mem = (unsigned char *) mmap(0, MP3_SIZE_MAX, PROT_READ|PROT_WRITE, MAP_SHARED, memdev, 0x3000000);
if (mp3_mem == MAP_FAILED)
{
printf("mmap(mp3_mem) failed with %i\n", errno);
exit(1);
}
crashed_940 = 1;
}
if (crashed_940)
{
unsigned char ucData[1024];
int i;
char binpath[1024];
FILE *fp;
strncpy(binpath, g_argv[0], 1023);
binpath[1023] = 0;
for (i = strlen(binpath); i > 0; i--)
if (binpath[i] == '/') { binpath[i] = 0; break; }
strcat(binpath, "/code940.bin");
fp = fopen(binpath, "rb");
if(!fp)
{
printf("failed to open %s\n", binpath);
exit(1);
}
read_to_upper(shared_mem, ucData, sizeof(ucData), fp);
fclose(fp);
crashed_940 = 0;
}
memset(shared_data, 0, sizeof(*shared_data));
memset(shared_ctl, 0, sizeof(*shared_ctl));
/* now cause 940 to init it's ym2612 stuff */
shared_ctl->baseclock = baseclock;
shared_ctl->rate = rate;
shared_ctl->jobs[0] = JOB940_INITALL;
shared_ctl->jobs[1] = 0;
shared_ctl->busy = 1;
/* start the 940 */
Reset940(0, 2);
Pause940(0);
}
unsigned char *mp3_data = 0;
void local_decode(void)
{
int mp3_offs = shared_ctl->mp3_offs;
unsigned char *readPtr = mp3_data + mp3_offs;
int bytesLeft = shared_ctl->mp3_len - mp3_offs;
int offset; // frame offset from readPtr
int err = 0;
if (bytesLeft <= 0) return; // EOF, nothing to do
offset = MP3FindSyncWord(readPtr, bytesLeft);
if (offset < 0) {
shared_ctl->mp3_offs = shared_ctl->mp3_len;
return; // EOF
}
readPtr += offset;
bytesLeft -= offset;
err = MP3Decode(shared_data->mp3dec, &readPtr, &bytesLeft,
shared_data->mp3_buffer[shared_ctl->mp3_buffsel], 0);
if (err) {
if (err == ERR_MP3_INDATA_UNDERFLOW) {
shared_ctl->mp3_offs = shared_ctl->mp3_len; // EOF
return;
} else if (err <= -6 && err >= -12) {
// ERR_MP3_INVALID_FRAMEHEADER, ERR_MP3_INVALID_*
// just try to skip the offending frame..
readPtr++;
}
shared_ctl->mp3_errors++;
shared_ctl->mp3_lasterr = err;
}
shared_ctl->mp3_offs = readPtr - mp3_data;
}
void gp2x_sound_sync(void);
#define USE_LOCAL 0
#define BENCHMARK 0
int main(int argc, char *argv[])
{
FILE *f;
int size;
struct timeval tval; // timing
int thissec = 0, fps = 0;
int target_frametime, frame_samples, samples_ready, mp3_buffer_offs, play_bufsel;
unsigned char play_buffer[44100/50*2*2];
if (argc != 2) {
printf("usage: %s <mp3file>\n", argv[0]);
return 1;
}
g_argv = argv;
gp2x_init();
YM2612Init_940(123, 44100);
// load a mp3
f = fopen(argv[1], "rb");
if (!f) {
printf("can't open %s\n", argv[1]);
return 1;
}
fseek(f, 0, SEEK_END);
size = (int) ftell(f);
if (size > MP3_SIZE_MAX) {
printf("size %i > %i\n", size, MP3_SIZE_MAX);
size = MP3_SIZE_MAX;
}
fseek(f, 0, SEEK_SET);
if (fread(mp3_mem, 1, size, f) != size) {
printf("read failed, errno=%i\n", errno);
fclose(f);
exit(1);
}
fclose(f);
shared_ctl->mp3_len = size;
#if USE_LOCAL
shared_data->mp3dec = MP3InitDecoder();
mp3_data = malloc(size);
printf("init: dec: %p ptr: %p\n", shared_data->mp3dec, mp3_data);
if (!mp3_data) {
printf("low mem\n");
exit(1);
}
memcpy(mp3_data, mp3_mem, size);
#else
//printf("YM2612UpdateOne_940()\n");
if (shared_ctl->busy) wait_busy_940();
#endif
gp2x_start_sound(44100, 16, 1);
#define DESIRED_FPS 50
target_frametime = 1000000/DESIRED_FPS;
frame_samples = 44100/DESIRED_FPS;
samples_ready = mp3_buffer_offs = 0;
play_bufsel = 1;
for (;; fps++)
{
int lim_time;
gettimeofday(&tval, 0);
if (tval.tv_sec != thissec)
{
printf("fps: %i\n", fps);
thissec = tval.tv_sec;
fps = 0;
#if BENCHMARK
shared_ctl->mp3_offs = 0;
#endif
}
#if 0
// decode
#if USE_LOCAL
shared_ctl->mp3_buffsel ^= 1;
local_decode();
#else
wait_busy_940();
shared_ctl->mp3_buffsel ^= 1;
add_job_940(JOB940_MP3DECODE, 0);
#endif
if (shared_ctl->mp3_lasterr) {
printf("mp3_lasterr #%i: %i size: %i offs: %i\n", shared_ctl->mp3_errors, shared_ctl->mp3_lasterr,
shared_ctl->mp3_len, shared_ctl->mp3_offs);
printf("loopc: %i bytes: %08x\n",
shared_ctl->loopc, *(int *)(mp3_mem+shared_ctl->mp3_offs));
shared_ctl->mp3_lasterr = 0;
}
#if !BENCHMARK
// play
gp2x_sound_sync();
gp2x_sound_write(shared_data->mp3_buffer[shared_ctl->mp3_buffsel^1], 1152*2*2);
#endif
#else
lim_time = (fps+1) * target_frametime;
wait_busy_940();
// decode, play
if (samples_ready >= frame_samples) {
if (1152 - mp3_buffer_offs >= frame_samples) {
memcpy(play_buffer, shared_data->mp3_buffer[play_bufsel] + mp3_buffer_offs*2,
frame_samples*2*2);
mp3_buffer_offs += frame_samples;
} else {
// collect from both buffers..
int left = 1152 - mp3_buffer_offs;
memcpy(play_buffer, shared_data->mp3_buffer[play_bufsel] + mp3_buffer_offs*2,
left*2*2);
play_bufsel ^= 1;
mp3_buffer_offs = frame_samples - left;
memcpy(play_buffer + left*2*2, shared_data->mp3_buffer[play_bufsel],
mp3_buffer_offs*2*2);
}
gp2x_sound_write(play_buffer, frame_samples*2*2);
samples_ready -= frame_samples;
}
// make sure we will have enough samples next frame
if (samples_ready < frame_samples) {
// wait_busy_940();
shared_ctl->mp3_buffsel ^= 1;
add_job_940(JOB940_MP3DECODE, 0);
samples_ready += 1152;
}
gettimeofday(&tval, 0);
if(thissec != tval.tv_sec) tval.tv_usec+=1000000;
if(tval.tv_usec < lim_time)
{
// we are too fast
simpleWait(thissec, lim_time);
}
#endif
}
return 0;
}

View file

@ -0,0 +1,16 @@
The routines included in this math library are derived from the
math library for Apple's MacOS X/Darwin math library, which was
itself swiped from FreeBSD. The original copyright information
is as follows:
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
It has been ported to work with uClibc and generally behave
by Erik Andersen <andersen@codepoet.org>
22 May, 2001

View file

@ -0,0 +1,147 @@
/* @(#)e_log.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: e_log.c,v 1.8 1995/05/10 20:45:49 jtc Exp $";
#endif
/* __ieee754_log(x)
* Return the logrithm of x
*
* Method :
* 1. Argument Reduction: find k and f such that
* x = 2^k * (1+f),
* where sqrt(2)/2 < 1+f < sqrt(2) .
*
* 2. Approximation of log(1+f).
* Let s = f/(2+f) ; based on log(1+f) = log(1+s) - log(1-s)
* = 2s + 2/3 s**3 + 2/5 s**5 + .....,
* = 2s + s*R
* We use a special Reme algorithm on [0,0.1716] to generate
* a polynomial of degree 14 to approximate R The maximum error
* of this polynomial approximation is bounded by 2**-58.45. In
* other words,
* 2 4 6 8 10 12 14
* R(z) ~ Lg1*s +Lg2*s +Lg3*s +Lg4*s +Lg5*s +Lg6*s +Lg7*s
* (the values of Lg1 to Lg7 are listed in the program)
* and
* | 2 14 | -58.45
* | Lg1*s +...+Lg7*s - R(z) | <= 2
* | |
* Note that 2s = f - s*f = f - hfsq + s*hfsq, where hfsq = f*f/2.
* In order to guarantee error in log below 1ulp, we compute log
* by
* log(1+f) = f - s*(f - R) (if f is not too large)
* log(1+f) = f - (hfsq - s*(hfsq+R)). (better accuracy)
*
* 3. Finally, log(x) = k*ln2 + log(1+f).
* = k*ln2_hi+(f-(hfsq-(s*(hfsq+R)+k*ln2_lo)))
* Here ln2 is split into two floating point number:
* ln2_hi + ln2_lo,
* where n*ln2_hi is always exact for |n| < 2000.
*
* Special cases:
* log(x) is NaN with signal if x < 0 (including -INF) ;
* log(+INF) is +INF; log(0) is -INF with signal;
* log(NaN) is that NaN with no signal.
*
* Accuracy:
* according to an error analysis, the error is always less than
* 1 ulp (unit in the last place).
*
* Constants:
* The hexadecimal values are the intended ones for the following
* constants. The decimal values may be used, provided that the
* compiler will convert from decimal to binary accurately enough
* to produce the hexadecimal values shown.
*/
#include "math.h"
#include "math_private.h"
#ifdef __STDC__
static const double
#else
static double
#endif
ln2_hi = 6.93147180369123816490e-01, /* 3fe62e42 fee00000 */
ln2_lo = 1.90821492927058770002e-10, /* 3dea39ef 35793c76 */
two54 = 1.80143985094819840000e+16, /* 43500000 00000000 */
Lg1 = 6.666666666666735130e-01, /* 3FE55555 55555593 */
Lg2 = 3.999999999940941908e-01, /* 3FD99999 9997FA04 */
Lg3 = 2.857142874366239149e-01, /* 3FD24924 94229359 */
Lg4 = 2.222219843214978396e-01, /* 3FCC71C5 1D8E78AF */
Lg5 = 1.818357216161805012e-01, /* 3FC74664 96CB03DE */
Lg6 = 1.531383769920937332e-01, /* 3FC39A09 D078C69F */
Lg7 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
#ifdef __STDC__
static const double zero = 0.0;
#else
static double zero = 0.0;
#endif
#ifdef __STDC__
double __ieee754_log(double x)
#else
double __ieee754_log(x)
double x;
#endif
{
double hfsq,f,s,z,R,w,t1,t2,dk;
int32_t k,hx,i,j;
u_int32_t lx;
EXTRACT_WORDS(hx,lx,x);
k=0;
if (hx < 0x00100000) { /* x < 2**-1022 */
if (((hx&0x7fffffff)|lx)==0)
return -two54/zero; /* log(+-0)=-inf */
if (hx<0) return (x-x)/zero; /* log(-#) = NaN */
k -= 54; x *= two54; /* subnormal number, scale up x */
GET_HIGH_WORD(hx,x);
}
if (hx >= 0x7ff00000) return x+x;
k += (hx>>20)-1023;
hx &= 0x000fffff;
i = (hx+0x95f64)&0x100000;
SET_HIGH_WORD(x,hx|(i^0x3ff00000)); /* normalize x or x/2 */
k += (i>>20);
f = x-1.0;
if((0x000fffff&(2+hx))<3) { /* |f| < 2**-20 */
if(f==zero) {if(k==0) return zero; else {dk=(double)k;
return dk*ln2_hi+dk*ln2_lo;}
}
R = f*f*(0.5-0.33333333333333333*f);
if(k==0) return f-R; else {dk=(double)k;
return dk*ln2_hi-((R-dk*ln2_lo)-f);}
}
s = f/(2.0+f);
dk = (double)k;
z = s*s;
i = hx-0x6147a;
w = z*z;
j = 0x6b851-hx;
t1= w*(Lg2+w*(Lg4+w*Lg6));
t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
i |= j;
R = t2+t1;
if(i>0) {
hfsq=0.5*f*f;
if(k==0) return f-(hfsq-s*(hfsq+R)); else
return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
} else {
if(k==0) return f-s*(f-R); else
return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
}
}

View file

@ -0,0 +1,308 @@
/* @(#)e_pow.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: e_pow.c,v 1.9 1995/05/12 04:57:32 jtc Exp $";
#endif
/* __ieee754_pow(x,y) return x**y
*
* n
* Method: Let x = 2 * (1+f)
* 1. Compute and return log2(x) in two pieces:
* log2(x) = w1 + w2,
* where w1 has 53-24 = 29 bit trailing zeros.
* 2. Perform y*log2(x) = n+y' by simulating muti-precision
* arithmetic, where |y'|<=0.5.
* 3. Return x**y = 2**n*exp(y'*log2)
*
* Special cases:
* 1. (anything) ** 0 is 1
* 2. (anything) ** 1 is itself
* 3. (anything) ** NAN is NAN
* 4. NAN ** (anything except 0) is NAN
* 5. +-(|x| > 1) ** +INF is +INF
* 6. +-(|x| > 1) ** -INF is +0
* 7. +-(|x| < 1) ** +INF is +0
* 8. +-(|x| < 1) ** -INF is +INF
* 9. +-1 ** +-INF is NAN
* 10. +0 ** (+anything except 0, NAN) is +0
* 11. -0 ** (+anything except 0, NAN, odd integer) is +0
* 12. +0 ** (-anything except 0, NAN) is +INF
* 13. -0 ** (-anything except 0, NAN, odd integer) is +INF
* 14. -0 ** (odd integer) = -( +0 ** (odd integer) )
* 15. +INF ** (+anything except 0,NAN) is +INF
* 16. +INF ** (-anything except 0,NAN) is +0
* 17. -INF ** (anything) = -0 ** (-anything)
* 18. (-anything) ** (integer) is (-1)**(integer)*(+anything**integer)
* 19. (-anything except 0 and inf) ** (non-integer) is NAN
*
* Accuracy:
* pow(x,y) returns x**y nearly rounded. In particular
* pow(integer,integer)
* always returns the correct integer provided it is
* representable.
*
* Constants :
* The hexadecimal values are the intended ones for the following
* constants. The decimal values may be used, provided that the
* compiler will convert from decimal to binary accurately enough
* to produce the hexadecimal values shown.
*/
#include "math.h"
#include "math_private.h"
#ifdef __STDC__
static const double
#else
static double
#endif
bp[] = {1.0, 1.5,},
dp_h[] = { 0.0, 5.84962487220764160156e-01,}, /* 0x3FE2B803, 0x40000000 */
dp_l[] = { 0.0, 1.35003920212974897128e-08,}, /* 0x3E4CFDEB, 0x43CFD006 */
zero = 0.0,
one = 1.0,
two = 2.0,
two53 = 9007199254740992.0, /* 0x43400000, 0x00000000 */
huge = 1.0e300,
tiny = 1.0e-300,
/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
L1 = 5.99999999999994648725e-01, /* 0x3FE33333, 0x33333303 */
L2 = 4.28571428578550184252e-01, /* 0x3FDB6DB6, 0xDB6FABFF */
L3 = 3.33333329818377432918e-01, /* 0x3FD55555, 0x518F264D */
L4 = 2.72728123808534006489e-01, /* 0x3FD17460, 0xA91D4101 */
L5 = 2.30660745775561754067e-01, /* 0x3FCD864A, 0x93C9DB65 */
L6 = 2.06975017800338417784e-01, /* 0x3FCA7E28, 0x4A454EEF */
P1 = 1.66666666666666019037e-01, /* 0x3FC55555, 0x5555553E */
P2 = -2.77777777770155933842e-03, /* 0xBF66C16C, 0x16BEBD93 */
P3 = 6.61375632143793436117e-05, /* 0x3F11566A, 0xAF25DE2C */
P4 = -1.65339022054652515390e-06, /* 0xBEBBBD41, 0xC5D26BF1 */
P5 = 4.13813679705723846039e-08, /* 0x3E663769, 0x72BEA4D0 */
lg2 = 6.93147180559945286227e-01, /* 0x3FE62E42, 0xFEFA39EF */
lg2_h = 6.93147182464599609375e-01, /* 0x3FE62E43, 0x00000000 */
lg2_l = -1.90465429995776804525e-09, /* 0xBE205C61, 0x0CA86C39 */
ovt = 8.0085662595372944372e-0017, /* -(1024-log2(ovfl+.5ulp)) */
cp = 9.61796693925975554329e-01, /* 0x3FEEC709, 0xDC3A03FD =2/(3ln2) */
cp_h = 9.61796700954437255859e-01, /* 0x3FEEC709, 0xE0000000 =(float)cp */
cp_l = -7.02846165095275826516e-09, /* 0xBE3E2FE0, 0x145B01F5 =tail of cp_h*/
ivln2 = 1.44269504088896338700e+00, /* 0x3FF71547, 0x652B82FE =1/ln2 */
ivln2_h = 1.44269502162933349609e+00, /* 0x3FF71547, 0x60000000 =24b 1/ln2*/
ivln2_l = 1.92596299112661746887e-08; /* 0x3E54AE0B, 0xF85DDF44 =1/ln2 tail*/
#ifdef __STDC__
double __ieee754_pow(double x, double y)
#else
double __ieee754_pow(x,y)
double x, y;
#endif
{
double z,ax,z_h,z_l,p_h,p_l;
double y1,t1,t2,r,s,t,u,v,w;
int32_t i,j,k,yisint,n;
int32_t hx,hy,ix,iy;
u_int32_t lx,ly;
EXTRACT_WORDS(hx,lx,x);
EXTRACT_WORDS(hy,ly,y);
ix = hx&0x7fffffff; iy = hy&0x7fffffff;
/* y==zero: x**0 = 1 */
if((iy|ly)==0) return one;
/* +-NaN return x+y */
if(ix > 0x7ff00000 || ((ix==0x7ff00000)&&(lx!=0)) ||
iy > 0x7ff00000 || ((iy==0x7ff00000)&&(ly!=0)))
return x+y;
/* determine if y is an odd int when x < 0
* yisint = 0 ... y is not an integer
* yisint = 1 ... y is an odd int
* yisint = 2 ... y is an even int
*/
yisint = 0;
if(hx<0) {
if(iy>=0x43400000) yisint = 2; /* even integer y */
else if(iy>=0x3ff00000) {
k = (iy>>20)-0x3ff; /* exponent */
if(k>20) {
j = ly>>(52-k);
if((j<<(52-k))==ly) yisint = 2-(j&1);
} else if(ly==0) {
j = iy>>(20-k);
if((j<<(20-k))==iy) yisint = 2-(j&1);
}
}
}
/* special value of y */
if(ly==0) {
if (iy==0x7ff00000) { /* y is +-inf */
if(((ix-0x3ff00000)|lx)==0)
return y - y; /* inf**+-1 is NaN */
else if (ix >= 0x3ff00000)/* (|x|>1)**+-inf = inf,0 */
return (hy>=0)? y: zero;
else /* (|x|<1)**-,+inf = inf,0 */
return (hy<0)?-y: zero;
}
if(iy==0x3ff00000) { /* y is +-1 */
if(hy<0) return one/x; else return x;
}
if(hy==0x40000000) return x*x; /* y is 2 */
if(hy==0x3fe00000) { /* y is 0.5 */
if(hx>=0) /* x >= +0 */
return __ieee754_sqrt(x);
}
}
ax = fabs(x);
/* special value of x */
if(lx==0) {
if(ix==0x7ff00000||ix==0||ix==0x3ff00000){
z = ax; /*x is +-0,+-inf,+-1*/
if(hy<0) z = one/z; /* z = (1/|x|) */
if(hx<0) {
if(((ix-0x3ff00000)|yisint)==0) {
z = (z-z)/(z-z); /* (-1)**non-int is NaN */
} else if(yisint==1)
z = -z; /* (x<0)**odd = -(|x|**odd) */
}
return z;
}
}
/* (x<0)**(non-int) is NaN */
if(((((u_int32_t)hx>>31)-1)|yisint)==0) return (x-x)/(x-x);
/* |y| is huge */
if(iy>0x41e00000) { /* if |y| > 2**31 */
if(iy>0x43f00000){ /* if |y| > 2**64, must o/uflow */
if(ix<=0x3fefffff) return (hy<0)? huge*huge:tiny*tiny;
if(ix>=0x3ff00000) return (hy>0)? huge*huge:tiny*tiny;
}
/* over/underflow if x is not close to one */
if(ix<0x3fefffff) return (hy<0)? huge*huge:tiny*tiny;
if(ix>0x3ff00000) return (hy>0)? huge*huge:tiny*tiny;
/* now |1-x| is tiny <= 2**-20, suffice to compute
log(x) by x-x^2/2+x^3/3-x^4/4 */
t = x-1; /* t has 20 trailing zeros */
w = (t*t)*(0.5-t*(0.3333333333333333333333-t*0.25));
u = ivln2_h*t; /* ivln2_h has 21 sig. bits */
v = t*ivln2_l-w*ivln2;
t1 = u+v;
SET_LOW_WORD(t1,0);
t2 = v-(t1-u);
} else {
double s2,s_h,s_l,t_h,t_l;
n = 0;
/* take care subnormal number */
if(ix<0x00100000)
{ax *= two53; n -= 53; GET_HIGH_WORD(ix,ax); }
n += ((ix)>>20)-0x3ff;
j = ix&0x000fffff;
/* determine interval */
ix = j|0x3ff00000; /* normalize ix */
if(j<=0x3988E) k=0; /* |x|<sqrt(3/2) */
else if(j<0xBB67A) k=1; /* |x|<sqrt(3) */
else {k=0;n+=1;ix -= 0x00100000;}
SET_HIGH_WORD(ax,ix);
/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
u = ax-bp[k]; /* bp[0]=1.0, bp[1]=1.5 */
v = one/(ax+bp[k]);
s = u*v;
s_h = s;
SET_LOW_WORD(s_h,0);
/* t_h=ax+bp[k] High */
t_h = zero;
SET_HIGH_WORD(t_h,((ix>>1)|0x20000000)+0x00080000+(k<<18));
t_l = ax - (t_h-bp[k]);
s_l = v*((u-s_h*t_h)-s_h*t_l);
/* compute log(ax) */
s2 = s*s;
r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
r += s_l*(s_h+s);
s2 = s_h*s_h;
t_h = 3.0+s2+r;
SET_LOW_WORD(t_h,0);
t_l = r-((t_h-3.0)-s2);
/* u+v = s*(1+...) */
u = s_h*t_h;
v = s_l*t_h+t_l*s;
/* 2/(3log2)*(s+...) */
p_h = u+v;
SET_LOW_WORD(p_h,0);
p_l = v-(p_h-u);
z_h = cp_h*p_h; /* cp_h+cp_l = 2/(3*log2) */
z_l = cp_l*p_h+p_l*cp+dp_l[k];
/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
t = (double)n;
t1 = (((z_h+z_l)+dp_h[k])+t);
SET_LOW_WORD(t1,0);
t2 = z_l-(((t1-t)-dp_h[k])-z_h);
}
s = one; /* s (sign of result -ve**odd) = -1 else = 1 */
if(((((u_int32_t)hx>>31)-1)|(yisint-1))==0)
s = -one;/* (-ve)**(odd int) */
/* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
y1 = y;
SET_LOW_WORD(y1,0);
p_l = (y-y1)*t1+y*t2;
p_h = y1*t1;
z = p_l+p_h;
EXTRACT_WORDS(j,i,z);
if (j>=0x40900000) { /* z >= 1024 */
if(((j-0x40900000)|i)!=0) /* if z > 1024 */
return s*huge*huge; /* overflow */
else {
if(p_l+ovt>z-p_h) return s*huge*huge; /* overflow */
}
} else if((j&0x7fffffff)>=0x4090cc00 ) { /* z <= -1075 */
if(((j-0xc090cc00)|i)!=0) /* z < -1075 */
return s*tiny*tiny; /* underflow */
else {
if(p_l<=z-p_h) return s*tiny*tiny; /* underflow */
}
}
/*
* compute 2**(p_h+p_l)
*/
i = j&0x7fffffff;
k = (i>>20)-0x3ff;
n = 0;
if(i>0x3fe00000) { /* if |z| > 0.5, set n = [z+0.5] */
n = j+(0x00100000>>(k+1));
k = ((n&0x7fffffff)>>20)-0x3ff; /* new k for n */
t = zero;
SET_HIGH_WORD(t,n&~(0x000fffff>>k));
n = ((n&0x000fffff)|0x00100000)>>(20-k);
if(j<0) n = -n;
p_h -= t;
}
t = p_l+p_h;
SET_LOW_WORD(t,0);
u = t*lg2_h;
v = (p_l-(t-p_h))*lg2+t*lg2_l;
z = u+v;
w = v-(z-u);
t = z*z;
t1 = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
r = (z*t1)/(t1-two)-(w+z*w);
z = one-(r-z);
GET_HIGH_WORD(j,z);
j += (n<<20);
if((j>>20)<=0) z = scalbn(z,n); /* subnormal output */
else SET_HIGH_WORD(z,j);
return s*z;
}

View file

@ -0,0 +1,183 @@
/* @(#)e_rem_pio2.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: e_rem_pio2.c,v 1.8 1995/05/10 20:46:02 jtc Exp $";
#endif
/* __ieee754_rem_pio2(x,y)
*
* return the remainder of x rem pi/2 in y[0]+y[1]
* use __kernel_rem_pio2()
*/
#include "math.h"
#include "math_private.h"
/*
* Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
*/
#ifdef __STDC__
static const int32_t two_over_pi[] = {
#else
static int32_t two_over_pi[] = {
#endif
0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
};
#ifdef __STDC__
static const int32_t npio2_hw[] = {
#else
static int32_t npio2_hw[] = {
#endif
0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C,
0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A,
0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB,
0x404858EB, 0x404921FB,
};
/*
* invpio2: 53 bits of 2/pi
* pio2_1: first 33 bit of pi/2
* pio2_1t: pi/2 - pio2_1
* pio2_2: second 33 bit of pi/2
* pio2_2t: pi/2 - (pio2_1+pio2_2)
* pio2_3: third 33 bit of pi/2
* pio2_3t: pi/2 - (pio2_1+pio2_2+pio2_3)
*/
#ifdef __STDC__
static const double
#else
static double
#endif
zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
pio2_2 = 6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
pio2_3 = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
#ifdef __STDC__
int32_t __ieee754_rem_pio2(double x, double *y)
#else
int32_t __ieee754_rem_pio2(x,y)
double x,y[];
#endif
{
double z=0.0,w,t,r,fn;
double tx[3];
int32_t e0,i,j,nx,n,ix,hx;
u_int32_t low;
GET_HIGH_WORD(hx,x); /* high word of x */
ix = hx&0x7fffffff;
if(ix<=0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */
{y[0] = x; y[1] = 0; return 0;}
if(ix<0x4002d97c) { /* |x| < 3pi/4, special case with n=+-1 */
if(hx>0) {
z = x - pio2_1;
if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */
y[0] = z - pio2_1t;
y[1] = (z-y[0])-pio2_1t;
} else { /* near pi/2, use 33+33+53 bit pi */
z -= pio2_2;
y[0] = z - pio2_2t;
y[1] = (z-y[0])-pio2_2t;
}
return 1;
} else { /* negative x */
z = x + pio2_1;
if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */
y[0] = z + pio2_1t;
y[1] = (z-y[0])+pio2_1t;
} else { /* near pi/2, use 33+33+53 bit pi */
z += pio2_2;
y[0] = z + pio2_2t;
y[1] = (z-y[0])+pio2_2t;
}
return -1;
}
}
if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
t = fabs(x);
n = (int32_t) (t*invpio2+half);
fn = (double)n;
r = t-fn*pio2_1;
w = fn*pio2_1t; /* 1st round good to 85 bit */
if(n<32&&ix!=npio2_hw[n-1]) {
y[0] = r-w; /* quick check no cancellation */
} else {
u_int32_t high;
j = ix>>20;
y[0] = r-w;
GET_HIGH_WORD(high,y[0]);
i = j-((high>>20)&0x7ff);
if(i>16) { /* 2nd iteration needed, good to 118 */
t = r;
w = fn*pio2_2;
r = t-w;
w = fn*pio2_2t-((t-r)-w);
y[0] = r-w;
GET_HIGH_WORD(high,y[0]);
i = j-((high>>20)&0x7ff);
if(i>49) { /* 3rd iteration need, 151 bits acc */
t = r; /* will cover all possible cases */
w = fn*pio2_3;
r = t-w;
w = fn*pio2_3t-((t-r)-w);
y[0] = r-w;
}
}
}
y[1] = (r-y[0])-w;
if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
else return n;
}
/*
* all other (large) arguments
*/
if(ix>=0x7ff00000) { /* x is inf or NaN */
y[0]=y[1]=x-x; return 0;
}
/* set z = scalbn(|x|,ilogb(x)-23) */
GET_LOW_WORD(low,x);
SET_LOW_WORD(z,low);
e0 = (ix>>20)-1046; /* e0 = ilogb(z)-23; */
SET_HIGH_WORD(z, ix - ((int32_t)(e0<<20)));
for(i=0;i<2;i++) {
tx[i] = (double)((int32_t)(z));
z = (z-tx[i])*two24;
}
tx[2] = z;
nx = 3;
while(tx[nx-1]==zero) nx--; /* skip zero term */
n = __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
return n;
}

View file

@ -0,0 +1,453 @@
/* @(#)e_sqrt.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: e_sqrt.c,v 1.8 1995/05/10 20:46:17 jtc Exp $";
#endif
/* __ieee754_sqrt(x)
* Return correctly rounded sqrt.
* ------------------------------------------
* | Use the hardware sqrt if you have one |
* ------------------------------------------
* Method:
* Bit by bit method using integer arithmetic. (Slow, but portable)
* 1. Normalization
* Scale x to y in [1,4) with even powers of 2:
* find an integer k such that 1 <= (y=x*2^(2k)) < 4, then
* sqrt(x) = 2^k * sqrt(y)
* 2. Bit by bit computation
* Let q = sqrt(y) truncated to i bit after binary point (q = 1),
* i 0
* i+1 2
* s = 2*q , and y = 2 * ( y - q ). (1)
* i i i i
*
* To compute q from q , one checks whether
* i+1 i
*
* -(i+1) 2
* (q + 2 ) <= y. (2)
* i
* -(i+1)
* If (2) is false, then q = q ; otherwise q = q + 2 .
* i+1 i i+1 i
*
* With some algebric manipulation, it is not difficult to see
* that (2) is equivalent to
* -(i+1)
* s + 2 <= y (3)
* i i
*
* The advantage of (3) is that s and y can be computed by
* i i
* the following recurrence formula:
* if (3) is false
*
* s = s , y = y ; (4)
* i+1 i i+1 i
*
* otherwise,
* -i -(i+1)
* s = s + 2 , y = y - s - 2 (5)
* i+1 i i+1 i i
*
* One may easily use induction to prove (4) and (5).
* Note. Since the left hand side of (3) contain only i+2 bits,
* it does not necessary to do a full (53-bit) comparison
* in (3).
* 3. Final rounding
* After generating the 53 bits result, we compute one more bit.
* Together with the remainder, we can decide whether the
* result is exact, bigger than 1/2ulp, or less than 1/2ulp
* (it will never equal to 1/2ulp).
* The rounding mode can be detected by checking whether
* huge + tiny is equal to huge, and whether huge - tiny is
* equal to huge for some floating point number "huge" and "tiny".
*
* Special cases:
* sqrt(+-0) = +-0 ... exact
* sqrt(inf) = inf
* sqrt(-ve) = NaN ... with invalid signal
* sqrt(NaN) = NaN ... with invalid signal for signaling NaN
*
* Other methods : see the appended file at the end of the program below.
*---------------
*/
#include "math.h"
#include "math_private.h"
#ifdef __STDC__
static const double one = 1.0, tiny=1.0e-300;
#else
static double one = 1.0, tiny=1.0e-300;
#endif
#ifdef __STDC__
double __ieee754_sqrt(double x)
#else
double __ieee754_sqrt(x)
double x;
#endif
{
double z;
int32_t sign = (int)0x80000000;
int32_t ix0,s0,q,m,t,i;
u_int32_t r,t1,s1,ix1,q1;
EXTRACT_WORDS(ix0,ix1,x);
/* take care of Inf and NaN */
if((ix0&0x7ff00000)==0x7ff00000) {
return x*x+x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf
sqrt(-inf)=sNaN */
}
/* take care of zero */
if(ix0<=0) {
if(((ix0&(~sign))|ix1)==0) return x;/* sqrt(+-0) = +-0 */
else if(ix0<0)
return (x-x)/(x-x); /* sqrt(-ve) = sNaN */
}
/* normalize x */
m = (ix0>>20);
if(m==0) { /* subnormal x */
while(ix0==0) {
m -= 21;
ix0 |= (ix1>>11); ix1 <<= 21;
}
for(i=0;(ix0&0x00100000)==0;i++) ix0<<=1;
m -= i-1;
ix0 |= (ix1>>(32-i));
ix1 <<= i;
}
m -= 1023; /* unbias exponent */
ix0 = (ix0&0x000fffff)|0x00100000;
if(m&1){ /* odd m, double x to make it even */
ix0 += ix0 + ((ix1&sign)>>31);
ix1 += ix1;
}
m >>= 1; /* m = [m/2] */
/* generate sqrt(x) bit by bit */
ix0 += ix0 + ((ix1&sign)>>31);
ix1 += ix1;
q = q1 = s0 = s1 = 0; /* [q,q1] = sqrt(x) */
r = 0x00200000; /* r = moving bit from right to left */
while(r!=0) {
t = s0+r;
if(t<=ix0) {
s0 = t+r;
ix0 -= t;
q += r;
}
ix0 += ix0 + ((ix1&sign)>>31);
ix1 += ix1;
r>>=1;
}
r = sign;
while(r!=0) {
t1 = s1+r;
t = s0;
if((t<ix0)||((t==ix0)&&(t1<=ix1))) {
s1 = t1+r;
if(((t1&sign)==sign)&&(s1&sign)==0) s0 += 1;
ix0 -= t;
if (ix1 < t1) ix0 -= 1;
ix1 -= t1;
q1 += r;
}
ix0 += ix0 + ((ix1&sign)>>31);
ix1 += ix1;
r>>=1;
}
/* use floating add to find out rounding direction */
if((ix0|ix1)!=0) {
z = one-tiny; /* trigger inexact flag */
if (z>=one) {
z = one+tiny;
if (q1==(u_int32_t)0xffffffff) { q1=0; q += 1;}
else if (z>one) {
if (q1==(u_int32_t)0xfffffffe) q+=1;
q1+=2;
} else
q1 += (q1&1);
}
}
ix0 = (q>>1)+0x3fe00000;
ix1 = q1>>1;
if ((q&1)==1) ix1 |= sign;
ix0 += (m <<20);
INSERT_WORDS(z,ix0,ix1);
return z;
}
/*
Other methods (use floating-point arithmetic)
-------------
(This is a copy of a drafted paper by Prof W. Kahan
and K.C. Ng, written in May, 1986)
Two algorithms are given here to implement sqrt(x)
(IEEE double precision arithmetic) in software.
Both supply sqrt(x) correctly rounded. The first algorithm (in
Section A) uses newton iterations and involves four divisions.
The second one uses reciproot iterations to avoid division, but
requires more multiplications. Both algorithms need the ability
to chop results of arithmetic operations instead of round them,
and the INEXACT flag to indicate when an arithmetic operation
is executed exactly with no roundoff error, all part of the
standard (IEEE 754-1985). The ability to perform shift, add,
subtract and logical AND operations upon 32-bit words is needed
too, though not part of the standard.
A. sqrt(x) by Newton Iteration
(1) Initial approximation
Let x0 and x1 be the leading and the trailing 32-bit words of
a floating point number x (in IEEE double format) respectively
1 11 52 ...widths
------------------------------------------------------
x: |s| e | f |
------------------------------------------------------
msb lsb msb lsb ...order
------------------------ ------------------------
x0: |s| e | f1 | x1: | f2 |
------------------------ ------------------------
By performing shifts and subtracts on x0 and x1 (both regarded
as integers), we obtain an 8-bit approximation of sqrt(x) as
follows.
k := (x0>>1) + 0x1ff80000;
y0 := k - T1[31&(k>>15)]. ... y ~ sqrt(x) to 8 bits
Here k is a 32-bit integer and T1[] is an integer array containing
correction terms. Now magically the floating value of y (y's
leading 32-bit word is y0, the value of its trailing word is 0)
approximates sqrt(x) to almost 8-bit.
Value of T1:
static int T1[32]= {
0, 1024, 3062, 5746, 9193, 13348, 18162, 23592,
29598, 36145, 43202, 50740, 58733, 67158, 75992, 85215,
83599, 71378, 60428, 50647, 41945, 34246, 27478, 21581,
16499, 12183, 8588, 5674, 3403, 1742, 661, 130,};
(2) Iterative refinement
Apply Heron's rule three times to y, we have y approximates
sqrt(x) to within 1 ulp (Unit in the Last Place):
y := (y+x/y)/2 ... almost 17 sig. bits
y := (y+x/y)/2 ... almost 35 sig. bits
y := y-(y-x/y)/2 ... within 1 ulp
Remark 1.
Another way to improve y to within 1 ulp is:
y := (y+x/y) ... almost 17 sig. bits to 2*sqrt(x)
y := y - 0x00100006 ... almost 18 sig. bits to sqrt(x)
2
(x-y )*y
y := y + 2* ---------- ...within 1 ulp
2
3y + x
This formula has one division fewer than the one above; however,
it requires more multiplications and additions. Also x must be
scaled in advance to avoid spurious overflow in evaluating the
expression 3y*y+x. Hence it is not recommended uless division
is slow. If division is very slow, then one should use the
reciproot algorithm given in section B.
(3) Final adjustment
By twiddling y's last bit it is possible to force y to be
correctly rounded according to the prevailing rounding mode
as follows. Let r and i be copies of the rounding mode and
inexact flag before entering the square root program. Also we
use the expression y+-ulp for the next representable floating
numbers (up and down) of y. Note that y+-ulp = either fixed
point y+-1, or multiply y by nextafter(1,+-inf) in chopped
mode.
I := FALSE; ... reset INEXACT flag I
R := RZ; ... set rounding mode to round-toward-zero
z := x/y; ... chopped quotient, possibly inexact
If(not I) then { ... if the quotient is exact
if(z=y) {
I := i; ... restore inexact flag
R := r; ... restore rounded mode
return sqrt(x):=y.
} else {
z := z - ulp; ... special rounding
}
}
i := TRUE; ... sqrt(x) is inexact
If (r=RN) then z=z+ulp ... rounded-to-nearest
If (r=RP) then { ... round-toward-+inf
y = y+ulp; z=z+ulp;
}
y := y+z; ... chopped sum
y0:=y0-0x00100000; ... y := y/2 is correctly rounded.
I := i; ... restore inexact flag
R := r; ... restore rounded mode
return sqrt(x):=y.
(4) Special cases
Square root of +inf, +-0, or NaN is itself;
Square root of a negative number is NaN with invalid signal.
B. sqrt(x) by Reciproot Iteration
(1) Initial approximation
Let x0 and x1 be the leading and the trailing 32-bit words of
a floating point number x (in IEEE double format) respectively
(see section A). By performing shifs and subtracts on x0 and y0,
we obtain a 7.8-bit approximation of 1/sqrt(x) as follows.
k := 0x5fe80000 - (x0>>1);
y0:= k - T2[63&(k>>14)]. ... y ~ 1/sqrt(x) to 7.8 bits
Here k is a 32-bit integer and T2[] is an integer array
containing correction terms. Now magically the floating
value of y (y's leading 32-bit word is y0, the value of
its trailing word y1 is set to zero) approximates 1/sqrt(x)
to almost 7.8-bit.
Value of T2:
static int T2[64]= {
0x1500, 0x2ef8, 0x4d67, 0x6b02, 0x87be, 0xa395, 0xbe7a, 0xd866,
0xf14a, 0x1091b,0x11fcd,0x13552,0x14999,0x15c98,0x16e34,0x17e5f,
0x18d03,0x19a01,0x1a545,0x1ae8a,0x1b5c4,0x1bb01,0x1bfde,0x1c28d,
0x1c2de,0x1c0db,0x1ba73,0x1b11c,0x1a4b5,0x1953d,0x18266,0x16be0,
0x1683e,0x179d8,0x18a4d,0x19992,0x1a789,0x1b445,0x1bf61,0x1c989,
0x1d16d,0x1d77b,0x1dddf,0x1e2ad,0x1e5bf,0x1e6e8,0x1e654,0x1e3cd,
0x1df2a,0x1d635,0x1cb16,0x1be2c,0x1ae4e,0x19bde,0x1868e,0x16e2e,
0x1527f,0x1334a,0x11051,0xe951, 0xbe01, 0x8e0d, 0x5924, 0x1edd,};
(2) Iterative refinement
Apply Reciproot iteration three times to y and multiply the
result by x to get an approximation z that matches sqrt(x)
to about 1 ulp. To be exact, we will have
-1ulp < sqrt(x)-z<1.0625ulp.
... set rounding mode to Round-to-nearest
y := y*(1.5-0.5*x*y*y) ... almost 15 sig. bits to 1/sqrt(x)
y := y*((1.5-2^-30)+0.5*x*y*y)... about 29 sig. bits to 1/sqrt(x)
... special arrangement for better accuracy
z := x*y ... 29 bits to sqrt(x), with z*y<1
z := z + 0.5*z*(1-z*y) ... about 1 ulp to sqrt(x)
Remark 2. The constant 1.5-2^-30 is chosen to bias the error so that
(a) the term z*y in the final iteration is always less than 1;
(b) the error in the final result is biased upward so that
-1 ulp < sqrt(x) - z < 1.0625 ulp
instead of |sqrt(x)-z|<1.03125ulp.
(3) Final adjustment
By twiddling y's last bit it is possible to force y to be
correctly rounded according to the prevailing rounding mode
as follows. Let r and i be copies of the rounding mode and
inexact flag before entering the square root program. Also we
use the expression y+-ulp for the next representable floating
numbers (up and down) of y. Note that y+-ulp = either fixed
point y+-1, or multiply y by nextafter(1,+-inf) in chopped
mode.
R := RZ; ... set rounding mode to round-toward-zero
switch(r) {
case RN: ... round-to-nearest
if(x<= z*(z-ulp)...chopped) z = z - ulp; else
if(x<= z*(z+ulp)...chopped) z = z; else z = z+ulp;
break;
case RZ:case RM: ... round-to-zero or round-to--inf
R:=RP; ... reset rounding mod to round-to-+inf
if(x<z*z ... rounded up) z = z - ulp; else
if(x>=(z+ulp)*(z+ulp) ...rounded up) z = z+ulp;
break;
case RP: ... round-to-+inf
if(x>(z+ulp)*(z+ulp)...chopped) z = z+2*ulp; else
if(x>z*z ...chopped) z = z+ulp;
break;
}
Remark 3. The above comparisons can be done in fixed point. For
example, to compare x and w=z*z chopped, it suffices to compare
x1 and w1 (the trailing parts of x and w), regarding them as
two's complement integers.
...Is z an exact square root?
To determine whether z is an exact square root of x, let z1 be the
trailing part of z, and also let x0 and x1 be the leading and
trailing parts of x.
If ((z1&0x03ffffff)!=0) ... not exact if trailing 26 bits of z!=0
I := 1; ... Raise Inexact flag: z is not exact
else {
j := 1 - [(x0>>20)&1] ... j = logb(x) mod 2
k := z1 >> 26; ... get z's 25-th and 26-th
fraction bits
I := i or (k&j) or ((k&(j+j+1))!=(x1&3));
}
R:= r ... restore rounded mode
return sqrt(x):=z.
If multiplication is cheaper then the foregoing red tape, the
Inexact flag can be evaluated by
I := i;
I := (z*z!=x) or I.
Note that z*z can overwrite I; this value must be sensed if it is
True.
Remark 4. If z*z = x exactly, then bit 25 to bit 0 of z1 must be
zero.
--------------------
z1: | f2 |
--------------------
bit 31 bit 0
Further more, bit 27 and 26 of z1, bit 0 and 1 of x1, and the odd
or even of logb(x) have the following relations:
-------------------------------------------------
bit 27,26 of z1 bit 1,0 of x1 logb(x)
-------------------------------------------------
00 00 odd and even
01 01 even
10 10 odd
10 00 even
11 01 even
-------------------------------------------------
(4) Special cases (see (4) of Section A).
*/

View file

@ -0,0 +1,96 @@
/* @(#)k_cos.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: k_cos.c,v 1.8 1995/05/10 20:46:22 jtc Exp $";
#endif
/*
* __kernel_cos( x, y )
* kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
* Input x is assumed to be bounded by ~pi/4 in magnitude.
* Input y is the tail of x.
*
* Algorithm
* 1. Since cos(-x) = cos(x), we need only to consider positive x.
* 2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
* 3. cos(x) is approximated by a polynomial of degree 14 on
* [0,pi/4]
* 4 14
* cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
* where the remez error is
*
* | 2 4 6 8 10 12 14 | -58
* |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x +C6*x )| <= 2
* | |
*
* 4 6 8 10 12 14
* 4. let r = C1*x +C2*x +C3*x +C4*x +C5*x +C6*x , then
* cos(x) = 1 - x*x/2 + r
* since cos(x+y) ~ cos(x) - sin(x)*y
* ~ cos(x) - x*y,
* a correction term is necessary in cos(x) and hence
* cos(x+y) = 1 - (x*x/2 - (r - x*y))
* For better accuracy when x > 0.3, let qx = |x|/4 with
* the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125.
* Then
* cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)).
* Note that 1-qx and (x*x/2-qx) is EXACT here, and the
* magnitude of the latter is at least a quarter of x*x/2,
* thus, reducing the rounding error in the subtraction.
*/
#include "math.h"
#include "math_private.h"
#ifdef __STDC__
static const double
#else
static double
#endif
one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
C1 = 4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */
C2 = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */
C3 = 2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */
C4 = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */
C5 = 2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */
C6 = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
#ifdef __STDC__
double __kernel_cos(double x, double y)
#else
double __kernel_cos(x, y)
double x,y;
#endif
{
double a,hz,z,r,qx;
int32_t ix;
GET_HIGH_WORD(ix,x);
ix &= 0x7fffffff; /* ix = |x|'s high word*/
if(ix<0x3e400000) { /* if x < 2**27 */
if(((int)x)==0) return one; /* generate inexact */
}
z = x*x;
r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
if(ix < 0x3FD33333) /* if |x| < 0.3 */
return one - (0.5*z - (z*r - x*y));
else {
if(ix > 0x3fe90000) { /* x > 0.78125 */
qx = 0.28125;
} else {
INSERT_WORDS(qx,ix-0x00200000,0); /* x/4 */
}
hz = 0.5*z-qx;
a = one-qx;
return a - (hz - (z*r-x*y));
}
}

View file

@ -0,0 +1,320 @@
/* @(#)k_rem_pio2.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: k_rem_pio2.c,v 1.7 1995/05/10 20:46:25 jtc Exp $";
#endif
/*
* __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
* double x[],y[]; int e0,nx,prec; int ipio2[];
*
* __kernel_rem_pio2 return the last three digits of N with
* y = x - N*pi/2
* so that |y| < pi/2.
*
* The method is to compute the integer (mod 8) and fraction parts of
* (2/pi)*x without doing the full multiplication. In general we
* skip the part of the product that are known to be a huge integer (
* more accurately, = 0 mod 8 ). Thus the number of operations are
* independent of the exponent of the input.
*
* (2/pi) is represented by an array of 24-bit integers in ipio2[].
*
* Input parameters:
* x[] The input value (must be positive) is broken into nx
* pieces of 24-bit integers in double precision format.
* x[i] will be the i-th 24 bit of x. The scaled exponent
* of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
* match x's up to 24 bits.
*
* Example of breaking a double positive z into x[0]+x[1]+x[2]:
* e0 = ilogb(z)-23
* z = scalbn(z,-e0)
* for i = 0,1,2
* x[i] = floor(z)
* z = (z-x[i])*2**24
*
*
* y[] ouput result in an array of double precision numbers.
* The dimension of y[] is:
* 24-bit precision 1
* 53-bit precision 2
* 64-bit precision 2
* 113-bit precision 3
* The actual value is the sum of them. Thus for 113-bit
* precison, one may have to do something like:
*
* long double t,w,r_head, r_tail;
* t = (long double)y[2] + (long double)y[1];
* w = (long double)y[0];
* r_head = t+w;
* r_tail = w - (r_head - t);
*
* e0 The exponent of x[0]
*
* nx dimension of x[]
*
* prec an integer indicating the precision:
* 0 24 bits (single)
* 1 53 bits (double)
* 2 64 bits (extended)
* 3 113 bits (quad)
*
* ipio2[]
* integer array, contains the (24*i)-th to (24*i+23)-th
* bit of 2/pi after binary point. The corresponding
* floating value is
*
* ipio2[i] * 2^(-24(i+1)).
*
* External function:
* double scalbn(), floor();
*
*
* Here is the description of some local variables:
*
* jk jk+1 is the initial number of terms of ipio2[] needed
* in the computation. The recommended value is 2,3,4,
* 6 for single, double, extended,and quad.
*
* jz local integer variable indicating the number of
* terms of ipio2[] used.
*
* jx nx - 1
*
* jv index for pointing to the suitable ipio2[] for the
* computation. In general, we want
* ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
* is an integer. Thus
* e0-3-24*jv >= 0 or (e0-3)/24 >= jv
* Hence jv = max(0,(e0-3)/24).
*
* jp jp+1 is the number of terms in PIo2[] needed, jp = jk.
*
* q[] double array with integral value, representing the
* 24-bits chunk of the product of x and 2/pi.
*
* q0 the corresponding exponent of q[0]. Note that the
* exponent for q[i] would be q0-24*i.
*
* PIo2[] double precision array, obtained by cutting pi/2
* into 24 bits chunks.
*
* f[] ipio2[] in floating point
*
* iq[] integer array by breaking up q[] in 24-bits chunk.
*
* fq[] final product of x*(2/pi) in fq[0],..,fq[jk]
*
* ih integer. If >0 it indicates q[] is >= 0.5, hence
* it also indicates the *sign* of the result.
*
*/
/*
* Constants:
* The hexadecimal values are the intended ones for the following
* constants. The decimal values may be used, provided that the
* compiler will convert from decimal to binary accurately enough
* to produce the hexadecimal values shown.
*/
#include "math.h"
#include "math_private.h"
#ifdef __STDC__
static const int init_jk[] = {2,3,4,6}; /* initial value for jk */
#else
static int init_jk[] = {2,3,4,6};
#endif
#ifdef __STDC__
static const double PIo2[] = {
#else
static double PIo2[] = {
#endif
1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
};
#ifdef __STDC__
static const double
#else
static double
#endif
zero = 0.0,
one = 1.0,
two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
#ifdef __STDC__
int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int32_t *ipio2)
#else
int __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
double x[], y[]; int e0,nx,prec; int32_t ipio2[];
#endif
{
int32_t jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
double z,fw,f[20],fq[20],q[20];
/* initialize jk*/
jk = init_jk[prec];
jp = jk;
/* determine jx,jv,q0, note that 3>q0 */
jx = nx-1;
jv = (e0-3)/24; if(jv<0) jv=0;
q0 = e0-24*(jv+1);
/* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
j = jv-jx; m = jx+jk;
for(i=0;i<=m;i++,j++) f[i] = (j<0)? zero : (double) ipio2[j];
/* compute q[0],q[1],...q[jk] */
for (i=0;i<=jk;i++) {
for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
}
jz = jk;
recompute:
/* distill q[] into iq[] reversingly */
for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
fw = (double)((int32_t)(twon24* z));
iq[i] = (int32_t)(z-two24*fw);
z = q[j-1]+fw;
}
/* compute n */
z = scalbn(z,q0); /* actual value of z */
z -= 8.0*floor(z*0.125); /* trim off integer >= 8 */
n = (int32_t) z;
z -= (double)n;
ih = 0;
if(q0>0) { /* need iq[jz-1] to determine n */
i = (iq[jz-1]>>(24-q0)); n += i;
iq[jz-1] -= i<<(24-q0);
ih = iq[jz-1]>>(23-q0);
}
else if(q0==0) ih = iq[jz-1]>>23;
else if(z>=0.5) ih=2;
if(ih>0) { /* q > 0.5 */
n += 1; carry = 0;
for(i=0;i<jz ;i++) { /* compute 1-q */
j = iq[i];
if(carry==0) {
if(j!=0) {
carry = 1; iq[i] = 0x1000000- j;
}
} else iq[i] = 0xffffff - j;
}
if(q0>0) { /* rare case: chance is 1 in 12 */
switch(q0) {
case 1:
iq[jz-1] &= 0x7fffff; break;
case 2:
iq[jz-1] &= 0x3fffff; break;
}
}
if(ih==2) {
z = one - z;
if(carry!=0) z -= scalbn(one,q0);
}
}
/* check if recomputation is needed */
if(z==zero) {
j = 0;
for (i=jz-1;i>=jk;i--) j |= iq[i];
if(j==0) { /* need recomputation */
for(k=1;iq[jk-k]==0;k++); /* k = no. of terms needed */
for(i=jz+1;i<=jz+k;i++) { /* add q[jz+1] to q[jz+k] */
f[jx+i] = (double) ipio2[jv+i];
for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
q[i] = fw;
}
jz += k;
goto recompute;
}
}
/* chop off zero terms */
if(z==0.0) {
jz -= 1; q0 -= 24;
while(iq[jz]==0) { jz--; q0-=24;}
} else { /* break z into 24-bit if necessary */
z = scalbn(z,-q0);
if(z>=two24) {
fw = (double)((int32_t)(twon24*z));
iq[jz] = (int32_t)(z-two24*fw);
jz += 1; q0 += 24;
iq[jz] = (int32_t) fw;
} else iq[jz] = (int32_t) z ;
}
/* convert integer "bit" chunk to floating-point value */
fw = scalbn(one,q0);
for(i=jz;i>=0;i--) {
q[i] = fw*(double)iq[i]; fw*=twon24;
}
/* compute PIo2[0,...,jp]*q[jz,...,0] */
for(i=jz;i>=0;i--) {
for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
fq[jz-i] = fw;
}
/* compress fq[] into y[] */
switch(prec) {
case 0:
fw = 0.0;
for (i=jz;i>=0;i--) fw += fq[i];
y[0] = (ih==0)? fw: -fw;
break;
case 1:
case 2:
fw = 0.0;
for (i=jz;i>=0;i--) fw += fq[i];
y[0] = (ih==0)? fw: -fw;
fw = fq[0]-fw;
for (i=1;i<=jz;i++) fw += fq[i];
y[1] = (ih==0)? fw: -fw;
break;
case 3: /* painful */
for (i=jz;i>0;i--) {
fw = fq[i-1]+fq[i];
fq[i] += fq[i-1]-fw;
fq[i-1] = fw;
}
for (i=jz;i>1;i--) {
fw = fq[i-1]+fq[i];
fq[i] += fq[i-1]-fw;
fq[i-1] = fw;
}
for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
if(ih==0) {
y[0] = fq[0]; y[1] = fq[1]; y[2] = fw;
} else {
y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
}
}
return n&7;
}

View file

@ -0,0 +1,79 @@
/* @(#)k_sin.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: k_sin.c,v 1.8 1995/05/10 20:46:31 jtc Exp $";
#endif
/* __kernel_sin( x, y, iy)
* kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854
* Input x is assumed to be bounded by ~pi/4 in magnitude.
* Input y is the tail of x.
* Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
*
* Algorithm
* 1. Since sin(-x) = -sin(x), we need only to consider positive x.
* 2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0.
* 3. sin(x) is approximated by a polynomial of degree 13 on
* [0,pi/4]
* 3 13
* sin(x) ~ x + S1*x + ... + S6*x
* where
*
* |sin(x) 2 4 6 8 10 12 | -58
* |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x +S6*x )| <= 2
* | x |
*
* 4. sin(x+y) = sin(x) + sin'(x')*y
* ~ sin(x) + (1-x*x/2)*y
* For better accuracy, let
* 3 2 2 2 2
* r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
* then 3 2
* sin(x) = x + (S1*x + (x *(r-y/2)+y))
*/
#include "math.h"
#include "math_private.h"
#ifdef __STDC__
static const double
#else
static double
#endif
half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
S1 = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */
S2 = 8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */
S3 = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */
S4 = 2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */
S5 = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */
S6 = 1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
#ifdef __STDC__
double __kernel_sin(double x, double y, int iy)
#else
double __kernel_sin(x, y, iy)
double x,y; int iy; /* iy=0 if y is zero */
#endif
{
double z,r,v;
int32_t ix;
GET_HIGH_WORD(ix,x);
ix &= 0x7fffffff; /* high word of x */
if(ix<0x3e400000) /* |x| < 2**-27 */
{if((int)x==0) return x;} /* generate inexact */
z = x*x;
v = z*x;
r = S2+z*(S3+z*(S4+z*(S5+z*S6)));
if(iy==0) return x+v*(S1+z*r);
else return x-((z*(half*y-v*r)-y)-v*S1);
}

View file

@ -0,0 +1,15 @@
#include <sys/types.h>
int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int32_t *ipio2);
double __kernel_sin(double x, double y, int iy);
double __kernel_cos(double x, double y);
double __ieee754_pow(double x, double y);
double __ieee754_sqrt(double x);
double __ieee754_log(double x);
int32_t __ieee754_rem_pio2(double x, double *y);
double fabs(double x);
double scalbn(double x, int n);
double copysign(double x, double y);
double floor(double x);

View file

@ -0,0 +1,129 @@
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
/*
* from: @(#)fdlibm.h 5.1 93/09/24
* $Id: math_private.h,v 1.3 2004/02/09 07:10:38 andersen Exp $
*/
#ifndef _MATH_PRIVATE_H_
#define _MATH_PRIVATE_H_
#include <endian.h>
#include <sys/types.h>
/* The original fdlibm code used statements like:
n0 = ((*(int*)&one)>>29)^1; * index of high word *
ix0 = *(n0+(int*)&x); * high word of x *
ix1 = *((1-n0)+(int*)&x); * low word of x *
to dig two 32 bit words out of the 64 bit IEEE floating point
value. That is non-ANSI, and, moreover, the gcc instruction
scheduler gets it wrong. We instead use the following macros.
Unlike the original code, we determine the endianness at compile
time, not at run time; I don't see much benefit to selecting
endianness at run time. */
/* A union which permits us to convert between a double and two 32 bit
ints. */
/*
* Math on arm is special:
* For FPA, float words are always big-endian.
* For VFP, floats words follow the memory system mode.
*/
#if (__BYTE_ORDER == __BIG_ENDIAN) || \
(!defined(__VFP_FP__) && (defined(__arm__) || defined(__thumb__)))
typedef union
{
double value;
struct
{
u_int32_t msw;
u_int32_t lsw;
} parts;
} ieee_double_shape_type;
#else
typedef union
{
double value;
struct
{
u_int32_t lsw;
u_int32_t msw;
} parts;
} ieee_double_shape_type;
#endif
/* Get two 32 bit ints from a double. */
#define EXTRACT_WORDS(ix0,ix1,d) \
do { \
ieee_double_shape_type ew_u; \
ew_u.value = (d); \
(ix0) = ew_u.parts.msw; \
(ix1) = ew_u.parts.lsw; \
} while (0)
/* Get the more significant 32 bit int from a double. */
#define GET_HIGH_WORD(i,d) \
do { \
ieee_double_shape_type gh_u; \
gh_u.value = (d); \
(i) = gh_u.parts.msw; \
} while (0)
/* Get the less significant 32 bit int from a double. */
#define GET_LOW_WORD(i,d) \
do { \
ieee_double_shape_type gl_u; \
gl_u.value = (d); \
(i) = gl_u.parts.lsw; \
} while (0)
/* Set a double from two 32 bit ints. */
#define INSERT_WORDS(d,ix0,ix1) \
do { \
ieee_double_shape_type iw_u; \
iw_u.parts.msw = (ix0); \
iw_u.parts.lsw = (ix1); \
(d) = iw_u.value; \
} while (0)
/* Set the more significant 32 bits of a double from an int. */
#define SET_HIGH_WORD(d,v) \
do { \
ieee_double_shape_type sh_u; \
sh_u.value = (d); \
sh_u.parts.msw = (v); \
(d) = sh_u.value; \
} while (0)
/* Set the less significant 32 bits of a double from an int. */
#define SET_LOW_WORD(d,v) \
do { \
ieee_double_shape_type sl_u; \
sl_u.value = (d); \
sl_u.parts.lsw = (v); \
(d) = sl_u.value; \
} while (0)
#endif /* _MATH_PRIVATE_H_ */

View file

@ -0,0 +1,72 @@
/* Copyright (C) 1998 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Philip Blundell <philb@gnu.org>
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
@ #include <sys/syscall.h>
.text
.global memset
.type memset,%function
.align 4
memset:
mov a4, a1
cmp a3, $8 @ at least 8 bytes to do?
blt 2f
orr a2, a2, a2, lsl $8
orr a2, a2, a2, lsl $16
1:
tst a4, $3 @ aligned yet?
strneb a2, [a4], $1
subne a3, a3, $1
bne 1b
mov ip, a2
1:
cmp a3, $8 @ 8 bytes still to do?
blt 2f
stmia a4!, {a2, ip}
sub a3, a3, $8
cmp a3, $8 @ 8 bytes still to do?
blt 2f
stmia a4!, {a2, ip}
sub a3, a3, $8
cmp a3, $8 @ 8 bytes still to do?
blt 2f
stmia a4!, {a2, ip}
sub a3, a3, $8
cmp a3, $8 @ 8 bytes still to do?
stmgeia a4!, {a2, ip}
subge a3, a3, $8
bge 1b
2:
movs a3, a3 @ anything left?
moveq pc, lr @ nope
rsb a3, a3, $7
add pc, pc, a3, lsl $2
mov r0, r0
strb a2, [a4], $1
strb a2, [a4], $1
strb a2, [a4], $1
strb a2, [a4], $1
strb a2, [a4], $1
strb a2, [a4], $1
strb a2, [a4], $1
mov pc, lr
.size memset,.-memset;

View file

@ -0,0 +1,39 @@
/* @(#)s_copysign.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: s_copysign.c,v 1.8 1995/05/10 20:46:57 jtc Exp $";
#endif
/*
* copysign(double x, double y)
* copysign(x,y) returns a value with the magnitude of x and
* with the sign bit of y.
*/
#include "math.h"
#include "math_private.h"
#ifdef __STDC__
double copysign(double x, double y)
#else
double copysign(x,y)
double x,y;
#endif
{
u_int32_t hx,hy;
GET_HIGH_WORD(hx,x);
GET_HIGH_WORD(hy,y);
SET_HIGH_WORD(x,(hx&0x7fffffff)|(hy&0x80000000));
return x;
}

View file

@ -0,0 +1,35 @@
/* @(#)s_fabs.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: s_fabs.c,v 1.7 1995/05/10 20:47:13 jtc Exp $";
#endif
/*
* fabs(x) returns the absolute value of x.
*/
#include "math.h"
#include "math_private.h"
#ifdef __STDC__
double fabs(double x)
#else
double fabs(x)
double x;
#endif
{
u_int32_t high;
GET_HIGH_WORD(high,x);
SET_HIGH_WORD(x,high&0x7fffffff);
return x;
}

View file

@ -0,0 +1,81 @@
/* @(#)s_floor.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: s_floor.c,v 1.8 1995/05/10 20:47:20 jtc Exp $";
#endif
/*
* floor(x)
* Return x rounded toward -inf to integral value
* Method:
* Bit twiddling.
* Exception:
* Inexact flag raised if x not equal to floor(x).
*/
#include "math.h"
#include "math_private.h"
#ifdef __STDC__
static const double huge = 1.0e300;
#else
static double huge = 1.0e300;
#endif
#ifdef __STDC__
double floor(double x)
#else
double floor(x)
double x;
#endif
{
int32_t i0,i1,j0;
u_int32_t i,j;
EXTRACT_WORDS(i0,i1,x);
j0 = ((i0>>20)&0x7ff)-0x3ff;
if(j0<20) {
if(j0<0) { /* raise inexact if x != 0 */
if(huge+x>0.0) {/* return 0*sign(x) if |x|<1 */
if(i0>=0) {i0=i1=0;}
else if(((i0&0x7fffffff)|i1)!=0)
{ i0=0xbff00000;i1=0;}
}
} else {
i = (0x000fffff)>>j0;
if(((i0&i)|i1)==0) return x; /* x is integral */
if(huge+x>0.0) { /* raise inexact flag */
if(i0<0) i0 += (0x00100000)>>j0;
i0 &= (~i); i1=0;
}
}
} else if (j0>51) {
if(j0==0x400) return x+x; /* inf or NaN */
else return x; /* x is integral */
} else {
i = ((u_int32_t)(0xffffffff))>>(j0-20);
if((i1&i)==0) return x; /* x is integral */
if(huge+x>0.0) { /* raise inexact flag */
if(i0<0) {
if(j0==20) i0+=1;
else {
j = i1+(1<<(52-j0));
if(j<i1) i0 +=1 ; /* got a carry */
i1=j;
}
}
i1 &= (~i);
}
}
INSERT_WORDS(x,i0,i1);
return x;
}

View file

@ -0,0 +1,67 @@
/* @(#)s_scalbn.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: s_scalbn.c,v 1.8 1995/05/10 20:48:08 jtc Exp $";
#endif
/*
* scalbn (double x, int n)
* scalbn(x,n) returns x* 2**n computed by exponent
* manipulation rather than by actually performing an
* exponentiation or a multiplication.
*/
#include "math.h"
#include "math_private.h"
#ifdef __STDC__
static const double
#else
static double
#endif
two54 = 1.80143985094819840000e+16, /* 0x43500000, 0x00000000 */
twom54 = 5.55111512312578270212e-17, /* 0x3C900000, 0x00000000 */
huge = 1.0e+300,
tiny = 1.0e-300;
#ifdef __STDC__
double scalbn (double x, int n)
#else
double scalbn (x,n)
double x; int n;
#endif
{
int32_t k,hx,lx;
EXTRACT_WORDS(hx,lx,x);
k = (hx&0x7ff00000)>>20; /* extract exponent */
if (k==0) { /* 0 or subnormal x */
if ((lx|(hx&0x7fffffff))==0) return x; /* +-0 */
x *= two54;
GET_HIGH_WORD(hx,x);
k = ((hx&0x7ff00000)>>20) - 54;
if (n< -50000) return tiny*x; /*underflow*/
}
if (k==0x7ff) return x+x; /* NaN or Inf */
k = k+n;
if (k > 0x7fe) return huge*copysign(huge,x); /* overflow */
if (k > 0) /* normal result */
{SET_HIGH_WORD(x,(hx&0x800fffff)|(k<<20)); return x;}
if (k <= -54) {
if (n > 50000) /* in case integer overflow in n+k */
return huge*copysign(huge,x); /*overflow*/
else return tiny*copysign(tiny,x); /*underflow*/
}
k += 54; /* subnormal result */
SET_HIGH_WORD(x,(hx&0x800fffff)|(k<<20));
return x*twom54;
}

View file

@ -0,0 +1,82 @@
/* @(#)s_sin.c 5.1 93/09/24 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: s_sin.c,v 1.7 1995/05/10 20:48:15 jtc Exp $";
#endif
/* sin(x)
* Return sine function of x.
*
* kernel function:
* __kernel_sin ... sine function on [-pi/4,pi/4]
* __kernel_cos ... cose function on [-pi/4,pi/4]
* __ieee754_rem_pio2 ... argument reduction routine
*
* Method.
* Let S,C and T denote the sin, cos and tan respectively on
* [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
* in [-pi/4 , +pi/4], and let n = k mod 4.
* We have
*
* n sin(x) cos(x) tan(x)
* ----------------------------------------------------------
* 0 S C T
* 1 C -S -1/T
* 2 -S -C T
* 3 -C S -1/T
* ----------------------------------------------------------
*
* Special cases:
* Let trig be any of sin, cos, or tan.
* trig(+-INF) is NaN, with signals;
* trig(NaN) is that NaN;
*
* Accuracy:
* TRIG(x) returns trig(x) nearly rounded
*/
#include "math.h"
#include "math_private.h"
#ifdef __STDC__
double sin(double x)
#else
double sin(x)
double x;
#endif
{
double y[2],z=0.0;
int32_t n, ix;
/* High word of x. */
GET_HIGH_WORD(ix,x);
/* |x| ~< pi/4 */
ix &= 0x7fffffff;
if(ix <= 0x3fe921fb) return __kernel_sin(x,z,0);
/* sin(Inf or NaN) is NaN */
else if (ix>=0x7ff00000) return x-x;
/* argument reduction needed */
else {
n = __ieee754_rem_pio2(x,y);
switch(n&3) {
case 0: return __kernel_sin(y[0],y[1],1);
case 1: return __kernel_cos(y[0],y[1]);
case 2: return -__kernel_sin(y[0],y[1],1);
default:
return -__kernel_cos(y[0],y[1]);
}
}
}

View file

@ -0,0 +1,12 @@
#include "math.h"
double pow(double x, double y)
{
return __ieee754_pow(x, y);
}
double log(double x)
{
return __ieee754_log(x);
}