Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

17
arch/sh/lib64/Makefile Normal file
View file

@ -0,0 +1,17 @@
#
# Makefile for the SH-5 specific library files.
#
# Copyright (C) 2000, 2001 Paolo Alberelli
# Copyright (C) 2003 - 2008 Paul Mundt
#
# This file is subject to the terms and conditions of the GNU General Public
# License. See the file "COPYING" in the main directory of this archive
# for more details.
#
# Objects added to lib-y: the delay and panic helpers (built from C) and the
# hand-written SHmedia memory/string routines (built from .S files).
# Panic should really be compiled as PIC
lib-y := udelay.o panic.o memcpy.o memset.o \
copy_user_memcpy.o copy_page.o strcpy.o strlen.o
# Integer division helpers, extracted from libgcc
lib-y += udivsi3.o udivdi3.o sdivsi3.o

89
arch/sh/lib64/copy_page.S Normal file
View file

@ -0,0 +1,89 @@
/*
Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
This file is subject to the terms and conditions of the GNU General Public
License. See the file "COPYING" in the main directory of this archive
for more details.
Tight version of memcpy for the case of just copying a page.
Prefetch strategy empirically optimised against RTL simulations
of SH5-101 cut2 eval chip with Cayman board DDR memory.
Parameters:
r2 : destination effective address (start of page)
r3 : source effective address (start of page)
Always copies 4096 bytes.
Points to review.
* Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
It seems like the prefetch needs to be at least 4 lines ahead to get
the data into the cache in time, and the allocos contend with outstanding
prefetches for the same cache set, so it's better to have the numbers
different.
*/
/*
 * copy_page: copy one 4096-byte page, 32 bytes per loop iteration.
 *
 * In:   r2 = destination page, r3 = source page (per the file header),
 *       r18 = return address (moved into tr0 below).
 * Out:  r2 has been advanced to destination + 4096 when the loop exits.
 * Uses: r6-r8, r22, r23, r36-r39, r60-r62 and tr0-tr3 as scratch.
 */
.section .text..SHmedia32,"ax"
.little
.balign 8
.global copy_page
copy_page:
/* Copy 4096 bytes worth of data from r3 to r2.
Do prefetches 4 lines ahead.
Do alloco 2 lines ahead */
/* tr1 = loop head (1:), tr2 and tr3 = the skip targets (2:, 3:),
   tr0 = return address. */
pta 1f, tr1
pta 2f, tr2
pta 3f, tr3
ptabs r18, tr0
#if 0
/* TAKum03020 */
/* Disabled: initial source prefetches (loads whose destination is r63). */
ld.q r3, 0x00, r63
ld.q r3, 0x20, r63
ld.q r3, 0x40, r63
ld.q r3, 0x60, r63
#endif
/* Allocate the first two destination lines (offsets 0x00 and 0x20); every
   alloco is paired with a synco as the TAKum03020 workaround. */
alloco r2, 0x00
synco ! TAKum03020
alloco r2, 0x20
synco ! TAKum03020
/* Loop bounds, all derived from the destination pointer r2:
     r6 = dst + 3968 (4096 - 128; only read by the disabled prefetch code)
     r7 = dst + 4032 (4096 - 64; no alloco once r2 reaches this)
     r8 = dst + 4096 (end of page; loop exit)
   Source is addressed as r2 + offset so only r2 has to be advanced:
     r60 = src - dst, r61 = r60 + 8, r62 = r60 + 16, r23 = r60 + 24
     r22 = r60 + 0x80 (prefetch offset; only read by the disabled code) */
movi 3968, r6
add r2, r6, r6
addi r6, 64, r7
addi r7, 64, r8
sub r3, r2, r60
addi r60, 8, r61
addi r61, 8, r62
addi r62, 8, r23
addi r60, 0x80, r22
/* Minimal code size. The extra branches inside the loop don't cost much
because they overlap with the time spent waiting for prefetches to
complete. */
1:
#if 0
/* TAKum03020 */
bge/u r2, r6, tr2 ! skip prefetch for last 4 lines
ldx.q r2, r22, r63 ! prefetch 4 lines hence
#endif
2:
bge/u r2, r7, tr3 ! skip alloco for last 2 lines
alloco r2, 0x40 ! alloc destination line 2 lines ahead
synco ! TAKum03020
3:
/* Move one 32-byte line: four quadword loads from r2 + (src - dst) + 0/8/16/24,
   then four quadword stores to r2 + 0/8/16/24. */
ldx.q r2, r60, r36
ldx.q r2, r61, r37
ldx.q r2, r62, r38
ldx.q r2, r23, r39
st.q r2, 0, r36
st.q r2, 8, r37
st.q r2, 16, r38
st.q r2, 24, r39
addi r2, 32, r2
/* Loop while the end of the page (r8) is still above r2. */
bgt/l r8, r2, tr1
blink tr0, r63 ! return

View file

@ -0,0 +1,217 @@
!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
! SH5 code Copyright 2002 SuperH Ltd.
!
! Entry: ARG0: destination pointer
! ARG1: source pointer
! ARG2: byte count
!
! Exit: RESULT: destination pointer
! any other registers in the range r0-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
! unfortunately it is difficult in some cases to concatenate bytes
! into a longword on the SH, so this does a longword read and small
! writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
! copied is unsigned greater than the address of the first byte to
! be copied. This could be easily swapped for a signed comparison,
! but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
! bytes memory chunk to be copied, the rest of the word can be read
! without side effects.
! This could be easily changed by increasing the minimum size of
! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
! however, this would cost a few extra cycles on average.
! For SHmedia, the assumption is that any quadword can be read in its
! entirety if at least one byte is included in the copy.
/* Imported into Linux kernel by Richard Curnow. This is used to implement the
__copy_user function in the general case, so it has to be a distinct
function from intra-kernel memcpy to allow for exception fix-ups in the
event that the user pointer is bad somewhere in the copy (e.g. due to
running off the end of the vma).
Note, this algorithm will be slightly wasteful in the case where the source
and destination pointers are equally aligned, because the stlo/sthi pairs
could then be merged back into single stores. If there are a lot of cache
misses, this is probably offset by the stall lengths on the preloads.
*/
/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
* erratum. The first two prefetches are nop-ed out to avoid upsetting the
* instruction counts used in the jump address calculation.
* */
/*
 * copy_user_memcpy
 *   In:  r2 = destination, r3 = source, r4 = byte count (ARG0-ARG2 in the
 *        file header), r18 = return address.
 *   Counts below 25 are dispatched through a computed jump into a table of
 *   32-byte handlers; counts of 25 or more are handled at Large.
 *   copy_user_memcpy_end labels the first address after the routine.
 *   NOTE(review): the end label presumably bounds the code range used for
 *   user-access exception fix-ups - confirm against the fault handling code.
 */
.section .text..SHmedia32,"ax"
.little
.balign 32
.global copy_user_memcpy
.global copy_user_memcpy_end
copy_user_memcpy:
/* Unaligned access helpers: a lo/hi instruction pair covering a misaligned
   quadword (.q, 8 bytes) or longword (.l, 4 bytes). Only LDUAQ and LDUAL
   are used below. */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
nop ! ld.b r3,0,r63 ! TAKum03020
/* Counts of 25 or more branch to Large. */
pta/l Large,tr0
movi 25,r0
bgeu/u r4,r0,tr0
/* Small copy: computed jump.
     target = L0 + (L1 - L0 + 63*32 + 1) - 32 * nsb(r4)
   A count of 0 is expected to give nsb = 63 and so lands on L1; each larger
   size class (1, 2-3, 4-7, 8-15, 16-24) lands a further 32 bytes (8
   instructions) on. Spare slots inside an entry hold the "cntd." tail of
   another handler, so no instruction may be added or removed in the table.
   NOTE(review): the "+ 1" presumably flags the target as SHmedia code for
   ptrel - confirm against the ISA manual. */
nsb r4,r0
shlli r0,5,r0
movi (L1-L0+63*32 + 1) & 0xffff,r1
sub r1, r0, r0
L0: ptrel r0,tr0
/* r5 = dst + count, r6 = src + count, tr1 = return address. */
add r2,r4,r5
ptabs r18,tr1
add r3,r4,r6
blink tr0,r63
/* Rearranged to make cut2 safe */
.balign 8
/* Tail of the 4..7 byte handler. On arrival r0 holds the first four source
   bytes and r7/r6 the lo/hi parts of the last four source bytes. */
L4_7: /* 4..7 byte memcpy cntd. */
stlo.l r2, 0, r0
or r6, r7, r6
sthi.l r5, -1, r6
stlo.l r5, -4, r6
blink tr1,r63
.balign 8
/* Start of the jump table. The nops pad this entry so that L2_3 occupies its
   last two slots. */
L1: /* 0 byte memcpy */
nop
blink tr1,r63
nop
nop
nop
nop
L2_3: /* 2 or 3 byte memcpy cntd. */
st.b r5,-1,r6
blink tr1,r63
/* 1 byte memcpy */
ld.b r3,0,r0
st.b r2,0,r0
blink tr1,r63
L8_15: /* 8..15 byte memcpy cntd. */
stlo.q r2, 0, r0
or r6, r7, r6
sthi.q r5, -1, r6
stlo.q r5, -8, r6
blink tr1,r63
/* 2 or 3 byte memcpy */
/* Copies bytes 0 and 1 here; the last byte (at src + count - 1) is loaded
   into r6 and stored at L2_3. */
ld.b r3,0,r0
nop ! ld.b r2,0,r63 ! TAKum03020
ld.b r3,1,r1
st.b r2,0,r0
pta/l L2_3,tr0
ld.b r6,-1,r6
st.b r2,1,r1
blink tr0, r63
/* 4 .. 7 byte memcpy */
/* First and last longword are copied with unaligned accesses; for counts
   below 8 the two stores overlap. Finishes at L4_7. */
LDUAL (r3, 0, r0, r1)
pta L4_7, tr0
ldlo.l r6, -4, r7
or r0, r1, r0
sthi.l r2, 3, r0
ldhi.l r6, -1, r6
blink tr0, r63
/* 8 .. 15 byte memcpy */
/* Same scheme with quadwords. Finishes at L8_15. */
LDUAQ (r3, 0, r0, r1)
pta L8_15, tr0
ldlo.q r6, -8, r7
or r0, r1, r0
sthi.q r2, 7, r0
ldhi.q r6, -1, r6
blink tr0, r63
/* 16 .. 24 byte memcpy */
/* Last table entry, so it may run past 8 instructions: copies the first two
   quadwords and the final quadword (which may overlap the second). */
LDUAQ (r3, 0, r0, r1)
LDUAQ (r3, 8, r8, r9)
or r0, r1, r0
sthi.q r2, 7, r0
or r8, r9, r8
sthi.q r2, 15, r8
ldlo.q r6, -8, r7
ldhi.q r6, -1, r6
stlo.q r2, 8, r8
stlo.q r2, 0, r0
or r6, r7, r6
sthi.q r5, -1, r6
stlo.q r5, -8, r6
blink tr1,r63
/* Large copy (count >= 25). Register setup:
     r7  = r3 | -8           = (src & 7) - 8
     r22 = r2 - r7           = dst + 8 - (src & 7)
     r6  = src - dst, so r22 + r6 is the next 8-byte aligned source address
     r5  = dst + count - 16  (bound for Loop_ua)
   Source quadwords are therefore read aligned (ldx.q r22, r6) and written to
   the destination with sthi.q/stlo.q pairs. */
Large:
! ld.b r2, 0, r63 ! TAKum03020
pta/l Loop_ua, tr1
ori r3, -8, r7
sub r2, r7, r22
sub r3, r2, r6
add r2, r4, r5
ldlo.q r3, 0, r0
addi r5, -16, r5
movi 64+8, r27 ! could subtract r7 from that.
/* Store the head taken from the first (possibly partial) source quadword. */
stlo.q r2, 0, r0
sthi.q r2, 7, r0
ldx.q r22, r6, r0
/* Counts below 72 skip the 32-byte loop and go straight to Loop_ua. */
bgtu/l r27, r4, tr1
/* r27 = loop bound for Loop_line; r19/r20/r21 = source offsets for the three
   quadwords preceding the one at r22 + r6. r36 is only read by the
   commented-out prefetch. */
addi r5, -48, r27
pta/l Loop_line, tr0
addi r6, 64, r36
addi r6, -24, r19
addi r6, -16, r20
addi r6, -8, r21
/* 32 bytes per iteration. r0 carries one source quadword from the previous
   iteration; r23-r25 hold the next three. */
Loop_line:
! ldx.q r22, r36, r63 ! TAKum03020
alloco r22, 32
synco
addi r22, 32, r22
ldx.q r22, r19, r23
sthi.q r22, -25, r0
ldx.q r22, r20, r24
ldx.q r22, r21, r25
stlo.q r22, -32, r0
ldx.q r22, r6, r0
sthi.q r22, -17, r23
sthi.q r22, -9, r24
sthi.q r22, -1, r25
stlo.q r22, -24, r23
stlo.q r22, -16, r24
stlo.q r22, -8, r25
bgeu r27, r22, tr0
/* 8 bytes per iteration: store the quadword held in r0, load the next one,
   repeat while r5 is above r22. */
Loop_ua:
addi r22, 8, r22
sthi.q r22, -1, r0
stlo.q r22, -8, r0
ldx.q r22, r6, r0
bgtu/l r5, r22, tr1
/* Tail: store the pending quadword at r22, then copy the final eight bytes
   (src + count - 8 .. src + count - 1) to dst + count - 8 with unaligned
   accesses; r5 + 8 and r5 + 15 address that final quadword. */
add r3, r4, r7
ldlo.q r7, -8, r1
sthi.q r22, 7, r0
ldhi.q r7, -1, r7
ptabs r18,tr1
stlo.q r22, 0, r0
or r1, r7, r1
sthi.q r5, 15, r1
stlo.q r5, 8, r1
blink tr1, r63
copy_user_memcpy_end:
nop

201
arch/sh/lib64/memcpy.S Normal file
View file

@ -0,0 +1,201 @@
/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
/* Modified by SuperH, Inc. September 2003 */
!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
! SH5 code Copyright 2002 SuperH Ltd.
!
! Entry: ARG0: destination pointer
! ARG1: source pointer
! ARG2: byte count
!
! Exit: RESULT: destination pointer
! any other registers in the range r0-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
! unfortunately it is difficult in some cases to concatenate bytes
! into a longword on the SH, so this does a longword read and small
! writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
! copied is unsigned greater than the address of the first byte to
! be copied. This could be easily swapped for a signed comparison,
! but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
! bytes memory chunk to be copied, the rest of the word can be read
! without side effects.
! This could be easily changed by increasing the minimum size of
! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
! however, this would cost a few extra cycles on average.
! For SHmedia, the assumption is that any quadword can be read in its
! entirety if at least one byte is included in the copy.
!
/*
 * memcpy
 *   In:  r2 = destination, r3 = source, r4 = byte count (ARG0-ARG2 in the
 *        file header), r18 = return address.
 *   r2 is never written, so the destination pointer is still in r2 on return.
 *   Counts below 25 are dispatched through a computed jump into a table of
 *   32-byte handlers; counts of 25 or more are handled at Large.
 */
.section .text..SHmedia32,"ax"
.globl memcpy
.type memcpy, @function
.align 5
memcpy:
/* Unaligned access helpers: a lo/hi instruction pair covering a misaligned
   quadword (.q, 8 bytes) or longword (.l, 4 bytes). Only LDUAQ and LDUAL
   are used below. */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
/* Load with destination r63: the value is discarded, so this only touches
   the first source byte (used as a prefetch). */
ld.b r3,0,r63
/* Counts of 25 or more branch to Large. */
pta/l Large,tr0
movi 25,r0
bgeu/u r4,r0,tr0
/* Small copy: computed jump.
     target = L0 + (L1 - L0 + 63*32 + 1) - 32 * nsb(r4)
   A count of 0 is expected to give nsb = 63 and so lands on L1; each larger
   size class (1, 2-3, 4-7, 8-15, 16-24) lands a further 32 bytes (8
   instructions) on. Spare slots inside an entry hold the "cntd." tail of
   another handler, so no instruction may be added or removed in the table.
   NOTE(review): the "+ 1" presumably flags the target as SHmedia code for
   ptrel - confirm against the ISA manual. */
nsb r4,r0
shlli r0,5,r0
movi (L1-L0+63*32 + 1) & 0xffff,r1
sub r1, r0, r0
L0: ptrel r0,tr0
/* r5 = dst + count, r6 = src + count, tr1 = return address. */
add r2,r4,r5
ptabs r18,tr1
add r3,r4,r6
blink tr0,r63
/* Rearranged to make cut2 safe */
.balign 8
/* Tail of the 4..7 byte handler. On arrival r0 holds the first four source
   bytes and r7/r6 the lo/hi parts of the last four source bytes. */
L4_7: /* 4..7 byte memcpy cntd. */
stlo.l r2, 0, r0
or r6, r7, r6
sthi.l r5, -1, r6
stlo.l r5, -4, r6
blink tr1,r63
.balign 8
/* Start of the jump table. The nops pad this entry so that L2_3 occupies its
   last two slots. */
L1: /* 0 byte memcpy */
nop
blink tr1,r63
nop
nop
nop
nop
L2_3: /* 2 or 3 byte memcpy cntd. */
st.b r5,-1,r6
blink tr1,r63
/* 1 byte memcpy */
ld.b r3,0,r0
st.b r2,0,r0
blink tr1,r63
L8_15: /* 8..15 byte memcpy cntd. */
stlo.q r2, 0, r0
or r6, r7, r6
sthi.q r5, -1, r6
stlo.q r5, -8, r6
blink tr1,r63
/* 2 or 3 byte memcpy */
/* Copies bytes 0 and 1 here; the last byte (at src + count - 1) is loaded
   into r6 and stored at L2_3. The load to r63 touches the destination. */
ld.b r3,0,r0
ld.b r2,0,r63
ld.b r3,1,r1
st.b r2,0,r0
pta/l L2_3,tr0
ld.b r6,-1,r6
st.b r2,1,r1
blink tr0, r63
/* 4 .. 7 byte memcpy */
/* First and last longword are copied with unaligned accesses; for counts
   below 8 the two stores overlap. Finishes at L4_7. */
LDUAL (r3, 0, r0, r1)
pta L4_7, tr0
ldlo.l r6, -4, r7
or r0, r1, r0
sthi.l r2, 3, r0
ldhi.l r6, -1, r6
blink tr0, r63
/* 8 .. 15 byte memcpy */
/* Same scheme with quadwords. Finishes at L8_15. */
LDUAQ (r3, 0, r0, r1)
pta L8_15, tr0
ldlo.q r6, -8, r7
or r0, r1, r0
sthi.q r2, 7, r0
ldhi.q r6, -1, r6
blink tr0, r63
/* 16 .. 24 byte memcpy */
/* Last table entry, so it may run past 8 instructions: copies the first two
   quadwords and the final quadword (which may overlap the second). */
LDUAQ (r3, 0, r0, r1)
LDUAQ (r3, 8, r8, r9)
or r0, r1, r0
sthi.q r2, 7, r0
or r8, r9, r8
sthi.q r2, 15, r8
ldlo.q r6, -8, r7
ldhi.q r6, -1, r6
stlo.q r2, 8, r8
stlo.q r2, 0, r0
or r6, r7, r6
sthi.q r5, -1, r6
stlo.q r5, -8, r6
blink tr1,r63
/* Large copy (count >= 25). Register setup:
     r7  = r3 | -8           = (src & 7) - 8
     r22 = r2 - r7           = dst + 8 - (src & 7)
     r6  = src - dst, so r22 + r6 is the next 8-byte aligned source address
     r5  = dst + count - 16  (bound for Loop_ua)
   Source quadwords are therefore read aligned (ldx.q r22, r6) and written to
   the destination with sthi.q/stlo.q pairs. */
Large:
ld.b r2, 0, r63
pta/l Loop_ua, tr1
ori r3, -8, r7
sub r2, r7, r22
sub r3, r2, r6
add r2, r4, r5
ldlo.q r3, 0, r0
addi r5, -16, r5
movi 64+8, r27 // could subtract r7 from that.
/* Store the head taken from the first (possibly partial) source quadword. */
stlo.q r2, 0, r0
sthi.q r2, 7, r0
ldx.q r22, r6, r0
/* Counts below 72 skip the 32-byte loop and go straight to Loop_ua. */
bgtu/l r27, r4, tr1
/* r27 = loop bound for Loop_line; r36 = source offset 64 bytes ahead (used
   by the prefetch load); r19/r20/r21 = source offsets for the three
   quadwords preceding the one at r22 + r6. */
addi r5, -48, r27
pta/l Loop_line, tr0
addi r6, 64, r36
addi r6, -24, r19
addi r6, -16, r20
addi r6, -8, r21
/* 32 bytes per iteration. r0 carries one source quadword from the previous
   iteration; r23-r25 hold the next three.
   NOTE(review): unlike copy_page in this directory, the alloco here is not
   followed by synco (the TAKum03020 workaround) and the prefetch loads are
   still enabled - confirm whether that is intentional. */
Loop_line:
ldx.q r22, r36, r63
alloco r22, 32
addi r22, 32, r22
ldx.q r22, r19, r23
sthi.q r22, -25, r0
ldx.q r22, r20, r24
ldx.q r22, r21, r25
stlo.q r22, -32, r0
ldx.q r22, r6, r0
sthi.q r22, -17, r23
sthi.q r22, -9, r24
sthi.q r22, -1, r25
stlo.q r22, -24, r23
stlo.q r22, -16, r24
stlo.q r22, -8, r25
bgeu r27, r22, tr0
/* 8 bytes per iteration: store the quadword held in r0, load the next one,
   repeat while r5 is above r22. */
Loop_ua:
addi r22, 8, r22
sthi.q r22, -1, r0
stlo.q r22, -8, r0
ldx.q r22, r6, r0
bgtu/l r5, r22, tr1
/* Tail: store the pending quadword at r22, then copy the final eight bytes
   (src + count - 8 .. src + count - 1) to dst + count - 8 with unaligned
   accesses; r5 + 8 and r5 + 15 address that final quadword. */
add r3, r4, r7
ldlo.q r7, -8, r1
sthi.q r22, 7, r0
ldhi.q r7, -1, r7
ptabs r18,tr1
stlo.q r22, 0, r0
or r1, r7, r1
sthi.q r5, 15, r1
stlo.q r5, 8, r1
blink tr1, r63
.size memcpy,.-memcpy

91
arch/sh/lib64/memset.S Normal file
View file

@ -0,0 +1,91 @@
/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
/* Modified by SuperH, Inc. September 2003 */
!
! Fast SH memset
!
! by Toshiyasu Morita (tm@netcom.com)
!
! SH5 code by J"orn Rennecke (joern.rennecke@superh.com)
! Copyright 2002 SuperH Ltd.
!
/*
 * SHHI / SHLO select a 64-bit left or right shift according to byte order.
 * NOTE(review): this test relies on __BYTE_ORDER and __LITTLE_ENDIAN being
 * defined by the compiler or an included header; no header is included in
 * the part of the file shown here. If both are undefined the comparison is
 * 0 == 0 and the little-endian variant is selected - confirm for
 * big-endian builds.
 */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define SHHI shlld
#define SHLO shlrd
#else
#define SHHI shlrd
#define SHLO shlld
#endif
/*
 * memset
 *   In:  r2 = destination, r3 = fill value, r4 = byte count,
 *        r18 = return address (moved to tr2).
 *   r2 is never written, so the destination pointer is still in r2 on return.
 *   Scratch: r5, r7-r9, r20, r22-r25, tr0-tr2; r3 and r4 are modified.
 */
.section .text..SHmedia32,"ax"
.globl memset
.type memset, @function
.align 5
memset:
pta/l multiquad, tr0
/* r22 = dst & 7 (misalignment), r23 = count + misalignment. */
andi r2, 7, r22
ptabs r18, tr2
mshflo.b r3,r3,r3
add r4, r22, r23
mperm.w r3, r63, r3 // Fill pattern now in every byte of r3
movi 8, r9
/* More than 8 means the fill extends past the first aligned quadword. */
bgtu/u r23, r9, tr0 // multiquad
beqi/u r4, 0, tr2 // Return with size 0 - ensures no mem accesses
/* Whole fill lies inside one aligned quadword: read the existing bytes,
   build a mask by shifting -1 by count * 8 bits (done as two shifts of
   count * 4), and merge the old bytes into r3 under that mask so that bytes
   beyond the count keep their value.
   NOTE(review): the shift is presumably split in two because count == 8
   would need a shift of 64 - confirm against the ISA manual. */
ldlo.q r2, 0, r7
shlli r4, 2, r4
movi -1, r8
SHHI r8, r4, r8
SHHI r8, r4, r8
mcmv r7, r8, r3
stlo.q r2, 0, r3
blink tr2, r63
multiquad:
pta/l lastquad, tr0
/* Fill from dst up to the end of its aligned quadword. */
stlo.q r2, 0, r3
/* r24 = (count + misalignment) / 8, r5 = dst + count (end address). */
shlri r23, 3, r24
add r2, r4, r5
beqi/u r24, 1, tr0 // lastquad
pta/l loop, tr1
/* r25 = dst rounded down to 8, r20 = end rounded down to 8. */
sub r2, r22, r25
andi r5, -8, r20 // calculate end address and
addi r20, -7*8, r8 // loop end address; This might overflow, so we need
// to use a different test before we start the loop
bge/u r24, r9, tr1 // loop
/* Fewer than 8 quadwords: store aligned quadwords pairwise, working inwards
   from both ends. */
st.q r25, 8, r3
st.q r20, -8, r3
shlri r24, 1, r24
beqi/u r24, 1, tr0 // lastquad
st.q r25, 16, r3
st.q r20, -16, r3
beqi/u r24, 2, tr0 // lastquad
st.q r25, 24, r3
st.q r20, -24, r3
/* Final (possibly partial) quadword ending at the last byte, r5 - 1. */
lastquad:
sthi.q r5, -1, r3
blink tr2,r63
/* Main loop: four aligned quadwords (32 bytes) per iteration while r8
   (r20 - 56) is not below r25. */
loop:
!!! alloco r25, 32 // QQQ comment out for short-term fix to SHUK #3895.
// QQQ commenting out is logically correct, but sub-optimal
// QQQ Sean McGoogan - 4th April 2003.
st.q r25, 8, r3
st.q r25, 16, r3
st.q r25, 24, r3
st.q r25, 32, r3
addi r25, 32, r25
bgeu/l r8, r25, tr1 // loop
/* After the loop: store the last five aligned quadwords below r20 (these may
   overlap what the loop already wrote), then the final partial quadword. */
st.q r20, -40, r3
st.q r20, -32, r3
st.q r20, -24, r3
st.q r20, -16, r3
st.q r20, -8, r3
sthi.q r5, -1, r3
blink tr2,r63
.size memset,.-memset

15
arch/sh/lib64/panic.c Normal file
View file

@ -0,0 +1,15 @@
/*
* Copyright (C) 2003 Richard Curnow, SuperH UK Limited
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*/
/*
 * Low-level panic hook.
 *
 * All three arguments (panicPC, panicSSR, panicEXPEVT) are accepted but not
 * examined; the function simply spins forever and never returns.
 */
void
panic_handler(unsigned long panicPC, unsigned long panicSSR,
	      unsigned long panicEXPEVT)
{
	/* Park here for good: returning from a panic is not allowed. */
	while (1)
		;
}

135
arch/sh/lib64/sdivsi3.S Normal file
View file

@ -0,0 +1,135 @@
/*
 * __sdivsi3 / __sdivsi3_1 / __sdivsi3_2: signed 32-bit division helper
 * (name as used by libgcc), built on a table-driven reciprocal estimate
 * that is refined in fixed point; the sN.M notes on each line give the
 * fixed-point format of the value produced.
 *
 * __sdivsi3 and __sdivsi3_1 are the same address and first load the address
 * of __div_table into r20. __sdivsi3_2 skips that load.
 * NOTE(review): callers of __sdivsi3_2 presumably must already have the
 * table address in r20 - confirm against the compiler's calling sequence.
 *
 * r18 holds the return address on entry; it is moved into tr0 before r18 is
 * reused as scratch.
 */
.global __sdivsi3
.global __sdivsi3_1
.global __sdivsi3_2
.section .text..SHmedia32,"ax"
.align 2
/* inputs: r4,r5 */
/* clobbered: r1,r18,r19,r20,r21,r25,tr0 */
/* result in r0 */
__sdivsi3:
__sdivsi3_1:
/* r20 = address of __div_table (tr0 is only a temporary here). */
ptb __div_table,tr0
gettr tr0,r20
__sdivsi3_2:
nsb r5, r1
shlld r5, r1, r25 /* normalize; [-2 ..1, 1..2) in s2.62 */
shari r25, 58, r21 /* extract 5(6) bit index (s2.4 with hole -1..1) */
/* bubble */
/* r21 is a signed index: byte factors are read at r20 + r21 and, after the
   index is doubled, word constants at r20 + 2 * r21. */
ldx.ub r20, r21, r19 /* u0.8 */
shari r25, 32, r25 /* normalize to s2.30 */
shlli r21, 1, r21
muls.l r25, r19, r19 /* s2.38 */
ldx.w r20, r21, r21 /* s2.14 */
ptabs r18, tr0
shari r19, 24, r19 /* truncate to s2.14 */
sub r21, r19, r19 /* some 11 bit inverse in s1.14 */
muls.l r19, r19, r21 /* u0.28 */
/* r1 = 92 - nsb(r5): shift count for the final shard. */
sub r63, r1, r1
addi r1, 92, r1
muls.l r25, r21, r18 /* s2.58 */
shlli r19, 45, r19 /* multiply by two and convert to s2.58 */
/* bubble */
sub r19, r18, r18
shari r18, 28, r18 /* some 22 bit inverse in s1.30 */
muls.l r18, r25, r0 /* s2.60 */
muls.l r18, r4, r25 /* s32.30 */
/* bubble */
shari r0, 16, r19 /* s-16.44 */
muls.l r19, r18, r19 /* s-16.74 */
shari r25, 63, r0
shari r4, 14, r18 /* s19.-14 */
shari r19, 30, r19 /* s-16.44 */
muls.l r19, r18, r19 /* s15.30 */
xor r21, r0, r21 /* You could also use the constant 1 << 27. */
add r21, r25, r21
sub r21, r19, r21
shard r21, r1, r21
sub r21, r0, r0
blink tr0, r63
/* This table has been generated by divtab.c .
Defects for bias -330:
Max defect: 6.081536e-07 at -1.000000e+00
Min defect: 2.849516e-08 at 1.030651e+00
Max 2nd step defect: 9.606539e-12 at -1.000000e+00
Min 2nd step defect: 0.000000e+00 at 0.000000e+00
Defect at 1: 1.238659e-07
Defect at -2: 1.061708e-07 */
/* Layout (128 bytes, __div_table labels the middle so that negative indices
   reach the entries before it):
     -64 .. -33  negative division constants (16 words)
     -32 .. -17  negative division factors   (16 bytes)
     -16 .. -1   unused padding
       0 ..  15  unused padding
      16 ..  31  positive division factors   (16 bytes)
      32 ..  63  positive division constants (16 words) */
.balign 2
.type __div_table,@object
.size __div_table,128
/* negative division constants */
.word -16638
.word -17135
.word -17737
.word -18433
.word -19103
.word -19751
.word -20583
.word -21383
.word -22343
.word -23353
.word -24407
.word -25582
.word -26863
.word -28382
.word -29965
.word -31800
/* negative division factors */
.byte 66
.byte 70
.byte 75
.byte 81
.byte 87
.byte 93
.byte 101
.byte 109
.byte 119
.byte 130
.byte 142
.byte 156
.byte 172
.byte 192
.byte 214
.byte 241
.skip 16
.global __div_table
__div_table:
.skip 16
/* positive division factors */
.byte 241
.byte 214
.byte 192
.byte 172
.byte 156
.byte 142
.byte 130
.byte 119
.byte 109
.byte 101
.byte 93
.byte 87
.byte 81
.byte 75
.byte 70
.byte 66
/* positive division constants */
.word 31801
.word 29966
.word 28383
.word 26864
.word 25583
.word 24408
.word 23354
.word 22344
.word 21384
.word 20584
.word 19752
.word 19104
.word 18434
.word 17738
.word 17136
.word 16639

97
arch/sh/lib64/strcpy.S Normal file
View file

@ -0,0 +1,97 @@
/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
/* Modified by SuperH, Inc. September 2003 */
! Entry: arg0: destination
! arg1: source
! Exit: result: destination
!
! SH5 code Copyright 2002 SuperH Ltd.
/*
 * SHHI / SHLO select a 64-bit left or right shift according to byte order.
 * NOTE(review): this test relies on __BYTE_ORDER and __LITTLE_ENDIAN being
 * defined by the compiler or an included header; no header is included in
 * the part of the file shown here. If both are undefined the comparison is
 * 0 == 0 and the little-endian variant is selected - confirm for
 * big-endian builds.
 */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define SHHI shlld
#define SHLO shlrd
#else
#define SHHI shlrd
#define SHLO shlld
#endif
/*
 * strcpy
 *   In:  r2 = destination, r3 = source (arg0/arg1 in the file header),
 *        r18 = return address (moved to tr4).
 *   r2 is never written, so the destination pointer is still in r2 on return.
 *   The source is read one quadword at a time; mcmpeq.b against r63 yields a
 *   non-zero result when the quadword contains a NUL byte, in which case the
 *   remaining bytes are copied one at a time at shortstring.
 */
.section .text..SHmedia32,"ax"
.globl strcpy
.type strcpy, @function
.align 5
strcpy:
pta/l shortstring,tr1
/* r4 = first source quadword, r7 = src * 8 (used as a bit shift count),
   r0 = dst + 8 (shortstring stores at r0 - 8). */
ldlo.q r3,0,r4
ptabs r18,tr4
shlli r3,3,r7
addi r2, 8, r0
mcmpeq.b r4,r63,r6
SHHI r6,r7,r6
bnei/u r6,0,tr1 // shortstring
pta/l no_lddst, tr2
/* r23 = (src & 7) - 8, r0 = dst - r23, r21 = src - dst, r20 = r21 + 8,
   so r0 + r21 is the next 8-byte aligned source address; r5 = the quadword
   there. r22 = (dst & 7) - 8. */
ori r3,-8,r23
sub r2, r23, r0
sub r3, r2, r21
addi r21, 8, r20
ldx.q r0, r21, r5
pta/l loop, tr0
ori r2,-8,r22
mcmpeq.b r5, r63, r6
bgt/u r22, r23, tr2 // no_lddst
// r22 < r23 : Need to do a load from the destination.
// r22 == r23 : Doesn't actually need to load from destination,
// but still can be handled here.
/* Merge the existing destination quadword (r9) with the first source bytes
   (r4) under mask r8, then store the result back. */
ldlo.q r2, 0, r9
movi -1, r8
SHLO r8, r7, r8
mcmv r4, r8, r9
stlo.q r2, 0, r9
beqi/l r6, 0, tr0 // loop
/* NUL found in r5: finish it byte by byte (add with r63 is a register move). */
add r5, r63, r4
addi r0, 8, r0
blink tr1, r63 // shortstring
no_lddst:
// r22 > r23: note that for r22 == r23 the sthi.q would clobber
// bytes before the destination region.
stlo.q r2, 0, r4
SHHI r4, r7, r4
sthi.q r0, -1, r4
beqi/l r6, 0, tr0 // loop
add r5, r63, r4
addi r0, 8, r0
/* Byte-at-a-time copy of the quadword in r4: store the low byte at r0 - 8,
   shift r4 right by 8, advance r0, and repeat until the byte just stored
   was NUL. On big-endian builds r4 is byte-reversed once first. */
shortstring:
#if __BYTE_ORDER != __LITTLE_ENDIAN
pta/l shortstring2,tr1
byterev r4,r4
#endif
shortstring2:
st.b r0,-8,r4
andi r4,0xff,r5
shlri r4,8,r4
addi r0,1,r0
bnei/l r5,0,tr1
blink tr4,r63 // return
.balign 8
/* Main loop, two quadwords per iteration: store r5 (unaligned) while
   fetching r4, then store r4 while fetching the next r5; leave for
   shortstring as soon as a fetched quadword contains a NUL. */
loop:
stlo.q r0, 0, r5
ldx.q r0, r20, r4
addi r0, 16, r0
sthi.q r0, -9, r5
mcmpeq.b r4, r63, r6
bnei/u r6, 0, tr1 // shortstring
ldx.q r0, r21, r5
stlo.q r0, -8, r4
sthi.q r0, -1, r4
mcmpeq.b r5, r63, r6
beqi/l r6, 0, tr0 // loop
add r5, r63, r4
addi r0, 8, r0
blink tr1, r63 // shortstring
.size strcpy,.-strcpy

33
arch/sh/lib64/strlen.S Normal file
View file

@ -0,0 +1,33 @@
/*
* Simplistic strlen() implementation for SHmedia.
*
* Copyright (C) 2003 Paul Mundt <lethal@linux-sh.org>
*/
/*
 * strlen
 *   In:  r2 = string pointer, r18 = return address (moved to tr4).
 *   Out: r2 = number of bytes before the terminating NUL.
 *   Scratch: r0 (running count), r1 (current byte), tr0 (loop target).
 */
.section .text..SHmedia32,"ax"
.globl strlen
.type strlen,@function
.balign 16
strlen:
ptabs r18, tr4
/*
* Note: We could easily deal with the NULL case here with a simple
* sanity check, though it seems that the behavior we want is to fault
* in the event that r2 == NULL, so we don't bother.
*/
/* beqi r2, 0, tr4 */ ! Sanity check
/* The count starts at -1 because it is incremented once for every byte
   read, including the terminating NUL. */
movi -1, r0
pta/l loop, tr0
loop:
ld.b r2, 0, r1
addi r2, 1, r2
addi r0, 1, r0
bnei/l r1, 0, tr0
/* Move the count into r2 (or with r63 copies the register) and return. */
or r0, r63, r2
blink tr4, r63
.size strlen,.-strlen

49
arch/sh/lib64/udelay.c Normal file
View file

@ -0,0 +1,49 @@
/*
* arch/sh/lib64/udelay.c
*
* Delay routines, using a pre-computed "loops_per_jiffy" value.
*
* Copyright (C) 2000, 2001 Paolo Alberelli
* Copyright (C) 2003, 2004 Paul Mundt
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*/
#include <linux/sched.h>
#include <asm/param.h>
/*
* Use only for very small delays (< 1 msec).
*
* The active part of our cycle counter is only 32-bits wide, and
* we're treating the difference between two marks as signed. On
* a 1GHz box, that's about 2 seconds.
*/
/*
 * Spin for @loops iterations of a two-instruction decrement-and-branch loop.
 *
 * The asm saves tr0 in a scratch register (gettr into %1), points tr0 at the
 * addi that follows the pta, decrements the counter and branches back while
 * it differs from r63, then restores tr0 (ptabs from %1).
 *
 * The counter is decremented before it is tested, so @loops == 0 does not
 * return immediately; callers should pass at least 1.
 */
void __delay(unsigned long loops)
{
long long dummy;
__asm__ __volatile__("gettr tr0, %1\n\t"
"pta $+4, tr0\n\t"
"addi %0, -1, %0\n\t"
"bne %0, r63, tr0\n\t"
"ptabs %1, tr0\n\t":"=r"(loops),
"=r"(dummy)
:"0"(loops));
}
/*
 * Busy-wait for a delay given as a 2**32-scaled fraction of a second.
 *
 * @xloops: delay in seconds multiplied by 2**32, as produced by __udelay()
 *          (usecs * 2**32 / 1000000) and __ndelay().
 *
 * The number of delay-loop iterations is
 *
 *     xloops * (HZ * loops_per_jiffy) / 2**32
 *
 * The product is formed in 64 bits and its upper part is used. The previous
 * code passed the unscaled product straight to __delay(), i.e. it never
 * divided by 2**32, so the iteration count was either truncated garbage
 * (32-bit unsigned long) or 2**32 times too large (64-bit unsigned long).
 *
 * One is added to the result because __delay() decrements its counter
 * before testing it and must therefore never be handed zero; this also
 * rounds the delay up rather than down.
 */
void __const_udelay(unsigned long xloops)
{
	unsigned long long loops_per_sec;
	unsigned long long loops;

	loops_per_sec = (unsigned long long)HZ *
			cpu_data[raw_smp_processor_id()].loops_per_jiffy;
	loops = ((unsigned long long)xloops * loops_per_sec) >> 32;

	__delay((unsigned long)loops + 1);
}
/*
 * Delay for @usecs microseconds.
 *
 * Scales the microsecond count by 0x000010c6 (2**32 / 1000000) to obtain the
 * fixed-point value expected by __const_udelay().
 */
void __udelay(unsigned long usecs)
{
	const unsigned long xloops = usecs * 0x000010c6; /* 2**32 / 1000000 */

	__const_udelay(xloops);
}
/*
 * Delay for @nsecs nanoseconds.
 *
 * Scales the nanosecond count by 0x00000005 before handing it to
 * __const_udelay().
 * NOTE(review): the factor is presumably 2**32 / 1000000000 (about 4.29)
 * rounded up - confirm.
 */
void __ndelay(unsigned long nsecs)
{
	const unsigned long xloops = nsecs * 0x00000005;

	__const_udelay(xloops);
}

120
arch/sh/lib64/udivdi3.S Normal file
View file

@ -0,0 +1,120 @@
/*
 * __udivdi3: unsigned 64-bit division helper (name as used by libgcc).
 *
 * NOTE(review): operands are presumably dividend in r2 and divisor in r3,
 * with the quotient returned in r2 - inferred from the final writes to r2
 * and from the divide-by-zero remark about r3; confirm against the ABI.
 *
 * Registers written: r0, r1, r2, r4-r9, r19-r22, r25, tr0.
 * r18 supplies the return address (ptabs r18,tr0 on both exit paths).
 *
 * Outline, following the existing comments: normalise the divisor, build a
 * reciprocal estimate in r1, then either run the small-divisor path
 * (fall-through, three divide steps) or branch to large_divisor.
 */
.section .text..SHmedia32,"ax"
.align 2
.global __udivdi3
__udivdi3:
/* r22 = normalisation shift derived from r3 >> 1, r6 = divisor shifted left
   by r22. */
shlri r3,1,r4
nsb r4,r22
shlld r3,r22,r6
shlri r6,49,r5
movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */
sub r21,r5,r1
mmulfx.w r1,r1,r4
mshflo.w r1,r63,r1
sub r63,r22,r20 // r63 == 64 % 64
mmulfx.w r5,r4,r4
pta large_divisor,tr0
addi r20,32,r9
msub.w r1,r4,r1
madd.w r1,r1,r1
mmulfx.w r1,r1,r4
shlri r6,32,r7
/* r9 = 32 - r22; a positive value selects the large-divisor path. */
bgt/u r9,r63,tr0 // large_divisor
mmulfx.w r5,r4,r4
shlri r2,32+14,r19
addi r22,-31,r0
msub.w r1,r4,r1
mulu.l r1,r7,r4
addi r1,-3,r5
mulu.l r5,r19,r5
sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
the case may be, %0000000000000000 000.11111111111, still */
muls.l r1,r4,r4 /* leaving at least one sign bit. */
mulu.l r5,r3,r8
mshalds.l r1,r21,r1
shari r4,26,r4
shlld r8,r0,r8
add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
sub r2,r8,r2
/* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */
shlri r2,22,r21
mulu.l r21,r1,r21
shlld r5,r0,r8
addi r20,30-22,r0
shlrd r21,r0,r21
mulu.l r21,r3,r5
add r8,r21,r8
mcmpgt.l r21,r63,r21 // See Note 1
addi r20,30,r0
mshfhi.l r63,r21,r21
sub r2,r5,r2
andc r2,r21,r2
/* small divisor: need a third divide step */
mulu.l r2,r1,r7
ptabs r18,tr0
addi r2,1,r2
shlrd r7,r0,r7
mulu.l r7,r3,r5
add r8,r7,r8
sub r2,r3,r2
cmpgt r2,r5,r5
/* r8 accumulates the partial quotients; r5 is the final 0/1 correction. */
add r8,r5,r2
/* could test r3 here to check for divide by zero. */
blink tr0,r63
large_divisor:
mmulfx.w r5,r4,r4
shlrd r2,r9,r25
shlri r25,32,r8
msub.w r1,r4,r1
mulu.l r1,r7,r4
addi r1,-3,r5
mulu.l r5,r8,r5
sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
the case may be, %0000000000000000 000.11111111111, still */
muls.l r1,r4,r4 /* leaving at least one sign bit. */
shlri r5,14-1,r8
mulu.l r8,r7,r5
mshalds.l r1,r21,r1
shari r4,26,r4
add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
sub r25,r5,r25
/* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */
shlri r25,22,r21
mulu.l r21,r1,r21
pta no_lo_adj,tr0
addi r22,32,r0
shlri r21,40,r21
mulu.l r21,r7,r5
add r8,r21,r8
shlld r2,r0,r2
sub r25,r5,r25
/* If the remainder in r25 is still at least r7, bump the quotient once. */
bgtu/u r7,r25,tr0 // no_lo_adj
addi r8,1,r8
sub r25,r7,r25
no_lo_adj:
mextr4 r2,r25,r2
/* large_divisor: only needs a few adjustments. */
mulu.l r8,r6,r5
ptabs r18,tr0
/* bubble */
cmpgtu r5,r2,r5
/* Quotient = r8 minus the 0/1 correction in r5. */
sub r8,r5,r2
blink tr0,r63
/* Note 1: To shift the result of the second divide stage so that the result
always fits into 32 bits, yet we still reduce the rest sufficiently
would require a lot of instructions to do the shifts just right. Using
the full 64 bit shift result to multiply with the divisor would require
four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
Fortunately, if the upper 32 bits of the shift result are nonzero, we
know that the rest after taking this partial result into account will
fit into 32 bits. So we just clear the upper 32 bits of the rest if the
upper 32 bits of the partial result are nonzero. */

59
arch/sh/lib64/udivsi3.S Normal file
View file

@ -0,0 +1,59 @@
/*
 * __udivsi3: unsigned 32-bit division helper (name as used by libgcc).
 * Inputs, result and clobbers are listed in the comment below. r18 holds
 * the return address on entry and is moved into tr0 before r18 is reused
 * as the quotient accumulator.
 */
.global __udivsi3
.section .text..SHmedia32,"ax"
.align 2
/*
inputs: r4,r5
clobbered: r18,r19,r20,r21,r22,r25,tr0
result in r0.
*/
__udivsi3:
/* r22 = r5 zero-extended from 32 bits (adding r63 adds zero); r0 = its
   normalisation shift; r25 = top 16 bits of the normalised divisor. */
addz.l r5,r63,r22
nsb r22,r0
shlld r22,r0,r25
shlri r25,48,r25
movi 0xffffffffffffbb0c,r20 /* shift count equiv 76 */
sub r20,r25,r21
mmulfx.w r21,r21,r19
mshflo.w r21,r63,r21
ptabs r18,tr0
mmulfx.w r25,r19,r19
sub r20,r0,r0
/* bubble */
msub.w r21,r19,r19
/*
* It would be nice for scheduling to do this add to r21 before
* the msub.w, but we need a different value for r19 to keep
* errors under control.
*/
addi r19,-2,r21
/* First quotient estimate: r18 = (r4 * reciprocal) >> r0. */
mulu.l r4,r21,r18
mmulfx.w r19,r19,r19
shlli r21,15,r21
shlrd r18,r0,r18
mulu.l r18,r22,r20
mmacnfx.wl r25,r19,r21
/* bubble */
/* r25 = remainder after the first estimate; refine with a second step. */
sub r4,r20,r25
mulu.l r25,r21,r19
addi r0,14,r0
/* bubble */
shlrd r19,r0,r19
mulu.l r19,r22,r20
add r18,r19,r18
/* bubble */
/* Third step on the new remainder, then a final 0/1 correction from the
   comparison in r25. */
sub.l r25,r20,r25
mulu.l r25,r21,r19
addz.l r25,r63,r25
sub r25,r22,r25
shlrd r19,r0,r19
mulu.l r19,r22,r20
addi r25,1,r25
add r18,r19,r18
cmpgt r25,r20,r25
add.l r18,r25,r0
blink tr0,r63