Fixed MTP to work with TWRP

2025-10-28 23:08:52 +01:00 · 2018-06-19 23:16:04 +02:00 · 2018-06-19 23:16:04 +02:00 · f6dfaef42e
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions
--- a/arch/sh/lib/Makefile
+++ b/arch/sh/lib/Makefile
@ -0,0 +1,33 @@
+#
+# Makefile for SuperH-specific library files.
+#
+
+lib-y  = delay.o memmove.o memchr.o \
+	 checksum.o strlen.o div64.o div64-generic.o
+
+# Extracted from libgcc
+obj-y += movmem.o ashldi3.o ashrdi3.o lshrdi3.o \
+	 ashlsi3.o ashrsi3.o ashiftrt.o lshrsi3.o \
+	 udiv_qrnnd.o
+# Unsigned 32-bit divide: the -Os object is the default choice.
+udivsi3-y			:= udivsi3_i4i-Os.o
+# When not optimizing for size, SH3/SH4 replace it with udivsi3_i4i.o.
+ifneq ($(CONFIG_CC_OPTIMIZE_FOR_SIZE),y)
+udivsi3-$(CONFIG_CPU_SH3)	:= udivsi3_i4i.o
+udivsi3-$(CONFIG_CPU_SH4)	:= udivsi3_i4i.o
+endif
+udivsi3-y			+= udivsi3.o
+
+obj-y				+= io.o
+# memcpy/memset: the generic object unless CONFIG_CPU_SH4 replaces it.
+memcpy-y			:= memcpy.o
+memcpy-$(CONFIG_CPU_SH4)	:= memcpy-sh4.o
+
+memset-y			:= memset.o
+memset-$(CONFIG_CPU_SH4)	:= memset-sh4.o
+# Optional objects, followed by the variants chosen above.
+lib-$(CONFIG_MMU)		+= copy_page.o __clear_user.o
+lib-$(CONFIG_MCOUNT)		+= mcount.o
+lib-y				+= $(memcpy-y) $(memset-y) $(udivsi3-y)
+# Build this directory with compiler warnings treated as errors.
+ccflags-y := -Werror
--- a/arch/sh/lib/__clear_user.S
+++ b/arch/sh/lib/__clear_user.S
@ -0,0 +1,108 @@
+/*
+ * __clear_user_page, __clear_user, clear_page implementation of SuperH
+ *
+ * Copyright (C) 2001  Kaz Kojima
+ * Copyright (C) 2001, 2002  Niibe Yutaka
+ * Copyright (C) 2006  Paul Mundt
+ */
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+ENTRY(__clear_user)
+	! In: r4 = start, r5 = byte count.  Out: r0 = 0, or bytes left on a fault.
+	mov	#0, r0			! r0 = 0, the value written by every store
+	mov	#0xffffffe0, r1		! r1 = ~31, the 32-byte alignment mask
+	!
+	! r4..(r4+31)&~31 	   -------- not aligned	[ Area 0 ]
+	! (r4+31)&~31..(r4+r5)&~31 -------- aligned	[ Area 1 ]
+	! (r4+r5)&~31..r4+r5       -------- not aligned	[ Area 2 ]
+	!
+	! Clear area 0
+	mov	r4, r2			! r2 = running store pointer
+	!
+	tst	r1, r5		! length < 32
+	bt	.Larea2		! skip to remainder
+	!
+	add	#31, r2
+	and	r1, r2			! r2 = start rounded up to 32 bytes
+	cmp/eq	r4, r2
+	bt	.Larea1			! start already aligned: area 0 is empty
+	mov	r2, r3
+	sub	r4, r3			! r3 = number of bytes in area 0
+	mov	r3, r7			! r7 = same count, used to adjust r5 below
+	mov	r4, r2
+	! Numeric labels 0..9 mark the stores listed in __ex_table below.
+.L0:	dt	r3
+0:	mov.b	r0, @r2
+	bf/s	.L0
+	 add	#1, r2
+	!
+	sub	r7, r5			! r5 = bytes left after area 0
+	mov	r2, r4			! r4 = new, aligned start
+.Larea1:
+	mov	r4, r3
+	add	r5, r3
+	and	r1, r3			! r3 = end address rounded down to 32 bytes
+	cmp/hi	r2, r3
+	bf	.Larea2			! no whole 32-byte line to clear
+	!
+	! Clear area 1
+#if defined(CONFIG_CPU_SH4)
+1:	movca.l	r0, @r2
+#else
+1:	mov.l	r0, @r2
+#endif
+	add	#4, r2
+2:	mov.l	r0, @r2
+	add	#4, r2
+3:	mov.l	r0, @r2
+	add	#4, r2
+4:	mov.l	r0, @r2
+	add	#4, r2
+5:	mov.l	r0, @r2
+	add	#4, r2
+6:	mov.l	r0, @r2
+	add	#4, r2
+7:	mov.l	r0, @r2
+	add	#4, r2
+8:	mov.l	r0, @r2
+	add	#4, r2
+	cmp/hi	r2, r3			! eight longwords stored; more lines left?
+	bt/s	1b
+	 nop
+	!
+	! Clear area 2
+.Larea2:
+	mov	r4, r3
+	add	r5, r3			! r3 = end address
+	cmp/hs	r3, r2
+	bt/s	.Ldone			! r2 has reached the end: nothing left
+	 sub	r2, r3			! delay slot: r3 = trailing byte count
+.L2:	dt	r3
+9:	mov.b	r0, @r2
+	bf/s	.L2
+	 add	#1, r2
+	!
+.Ldone:	rts
+	 mov	#0, r0	! return 0 as normal return
+
+	! return the number of bytes remaining (end address - r2)
+.Lbad_clear_user:
+	mov	r4, r0
+	add	r5, r0
+	rts
+	 sub	r2, r0
+
+.section __ex_table,"a"
+	.align 2
+	.long	0b, .Lbad_clear_user
+	.long	1b, .Lbad_clear_user
+	.long	2b, .Lbad_clear_user
+	.long	3b, .Lbad_clear_user
+	.long	4b, .Lbad_clear_user
+	.long	5b, .Lbad_clear_user
+	.long	6b, .Lbad_clear_user
+	.long	7b, .Lbad_clear_user
+	.long	8b, .Lbad_clear_user
+	.long	9b, .Lbad_clear_user
+.previous
--- a/arch/sh/lib/ashiftrt.S
+++ b/arch/sh/lib/ashiftrt.S
@ -0,0 +1,149 @@
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+   2004, 2005, 2006
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+!! libgcc routines for the Renesas / SuperH SH CPUs.
+!! Contributed by Steve Chamberlain.
+!! sac@cygnus.com
+
+!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines
+!! recoded in assembly by Toshiyasu Morita
+!! tm@netcom.com
+
+/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and
+   ELF local label prefixes by J"orn Rennecke
+   amylaar@cygnus.com  */
+
+	.global	__ashiftrt_r4_0		! __ashiftrt_r4_N: r4 = r4 >> N (arithmetic)
+	.global	__ashiftrt_r4_1
+	.global	__ashiftrt_r4_2
+	.global	__ashiftrt_r4_3
+	.global	__ashiftrt_r4_4
+	.global	__ashiftrt_r4_5
+	.global	__ashiftrt_r4_6
+	.global	__ashiftrt_r4_7
+	.global	__ashiftrt_r4_8
+	.global	__ashiftrt_r4_9
+	.global	__ashiftrt_r4_10
+	.global	__ashiftrt_r4_11
+	.global	__ashiftrt_r4_12
+	.global	__ashiftrt_r4_13
+	.global	__ashiftrt_r4_14
+	.global	__ashiftrt_r4_15
+	.global	__ashiftrt_r4_16
+	.global	__ashiftrt_r4_17
+	.global	__ashiftrt_r4_18
+	.global	__ashiftrt_r4_19
+	.global	__ashiftrt_r4_20
+	.global	__ashiftrt_r4_21
+	.global	__ashiftrt_r4_22
+	.global	__ashiftrt_r4_23
+	.global	__ashiftrt_r4_24
+	.global	__ashiftrt_r4_25
+	.global	__ashiftrt_r4_26
+	.global	__ashiftrt_r4_27
+	.global	__ashiftrt_r4_28
+	.global	__ashiftrt_r4_29
+	.global	__ashiftrt_r4_30
+	.global	__ashiftrt_r4_31
+	.global	__ashiftrt_r4_32
+	! Each entry falls through the shifts below it until the next rts.
+	.align	1
+__ashiftrt_r4_32:
+__ashiftrt_r4_31:
+	rotcl	r4			! T = sign bit of r4
+	rts
+	subc	r4,r4			! delay slot: r4 = -T, i.e. 0 or -1
+__ashiftrt_r4_30:
+	shar	r4
+__ashiftrt_r4_29:
+	shar	r4
+__ashiftrt_r4_28:
+	shar	r4
+__ashiftrt_r4_27:
+	shar	r4
+__ashiftrt_r4_26:
+	shar	r4
+__ashiftrt_r4_25:
+	shar	r4
+__ashiftrt_r4_24:
+	shlr16	r4			! logical shift by 16 + 8 = 24 ...
+	shlr8	r4
+	rts
+	exts.b	r4,r4			! delay slot: ... then sign-extend the low byte
+__ashiftrt_r4_23:
+	shar	r4
+__ashiftrt_r4_22:
+	shar	r4
+__ashiftrt_r4_21:
+	shar	r4
+__ashiftrt_r4_20:
+	shar	r4
+__ashiftrt_r4_19:
+	shar	r4
+__ashiftrt_r4_18:
+	shar	r4
+__ashiftrt_r4_17:
+	shar	r4
+__ashiftrt_r4_16:
+	shlr16	r4			! logical shift by 16 ...
+	rts
+	exts.w	r4,r4			! delay slot: ... then sign-extend the low word
+__ashiftrt_r4_15:
+	shar	r4
+__ashiftrt_r4_14:
+	shar	r4
+__ashiftrt_r4_13:
+	shar	r4
+__ashiftrt_r4_12:
+	shar	r4
+__ashiftrt_r4_11:
+	shar	r4
+__ashiftrt_r4_10:
+	shar	r4
+__ashiftrt_r4_9:
+	shar	r4
+__ashiftrt_r4_8:
+	shar	r4
+__ashiftrt_r4_7:
+	shar	r4
+__ashiftrt_r4_6:
+	shar	r4
+__ashiftrt_r4_5:
+	shar	r4
+__ashiftrt_r4_4:
+	shar	r4
+__ashiftrt_r4_3:
+	shar	r4
+__ashiftrt_r4_2:
+	shar	r4
+__ashiftrt_r4_1:
+	rts
+	shar	r4			! delay slot: the final one-bit shift
+__ashiftrt_r4_0:
+	rts
+	nop				! shift by 0: r4 is returned unchanged
--- a/arch/sh/lib/ashldi3.c
+++ b/arch/sh/lib/ashldi3.c
@ -0,0 +1,29 @@
+#include <linux/module.h>
+
+#include "libgcc.h"
+
+/* 64-bit left shift helper (libgcc __ashldi3): returns u << b. */
+long long __ashldi3(long long u, word_type b)
+{
+	DWunion in, out;
+	word_type spill = 32 - b;	/* <= 0 when b >= 32 */
+
+	if (b == 0)
+		return u;
+	in.ll = u;
+	if (spill > 0) {
+		/* bits leaving the top of the low word enter the high word */
+		unsigned int lo = (unsigned int) in.s.low;
+		unsigned int hi = (unsigned int) in.s.high;
+
+		out.s.low = lo << b;
+		out.s.high = (hi << b) | (lo >> spill);
+	} else {
+		/* the whole low word moves up; the low word becomes zero */
+		out.s.low = 0;
+		out.s.high = (unsigned int) in.s.low << -spill;
+	}
+
+	return out.ll;
+}
+EXPORT_SYMBOL(__ashldi3);
--- a/arch/sh/lib/ashlsi3.S
+++ b/arch/sh/lib/ashlsi3.S
@ -0,0 +1,193 @@
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+   2004, 2005, 2006
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+!! libgcc routines for the Renesas / SuperH SH CPUs.
+!! Contributed by Steve Chamberlain.
+!! sac@cygnus.com
+
+!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines
+!! recoded in assembly by Toshiyasu Morita
+!! tm@netcom.com
+
+/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and
+   ELF local label prefixes by J"orn Rennecke
+   amylaar@cygnus.com  */
+
+!
+! __ashlsi3
+!
+! Entry:
+!
+! r4: Value to shift
+! r5: Shift count (only the low 5 bits are used)
+!
+! Exit:
+!
+! r0: Result
+!
+! Destroys:
+!
+! r5 (overwritten with the table offset); T bit on the paths that use shll
+!
+	.global	__ashlsi3
+	
+	.align	2
+__ashlsi3:
+	mov	#31,r0
+	and	r0,r5			! r5 = shift count masked to 0..31
+	mova	ashlsi3_table,r0	! r0 = address of the offset table
+	mov.b	@(r0,r5),r5		! r5 = handler offset from ashlsi3_table
+#ifdef __sh1__
+	add	r5,r0
+	jmp	@r0
+#else
+	braf	r5
+#endif
+	mov	r4,r0			! delay slot: r0 = value to shift
+
+	.align	2
+ashlsi3_table:
+	.byte		ashlsi3_0-ashlsi3_table
+	.byte		ashlsi3_1-ashlsi3_table
+	.byte		ashlsi3_2-ashlsi3_table
+	.byte		ashlsi3_3-ashlsi3_table
+	.byte		ashlsi3_4-ashlsi3_table
+	.byte		ashlsi3_5-ashlsi3_table
+	.byte		ashlsi3_6-ashlsi3_table
+	.byte		ashlsi3_7-ashlsi3_table
+	.byte		ashlsi3_8-ashlsi3_table
+	.byte		ashlsi3_9-ashlsi3_table
+	.byte		ashlsi3_10-ashlsi3_table
+	.byte		ashlsi3_11-ashlsi3_table
+	.byte		ashlsi3_12-ashlsi3_table
+	.byte		ashlsi3_13-ashlsi3_table
+	.byte		ashlsi3_14-ashlsi3_table
+	.byte		ashlsi3_15-ashlsi3_table
+	.byte		ashlsi3_16-ashlsi3_table
+	.byte		ashlsi3_17-ashlsi3_table
+	.byte		ashlsi3_18-ashlsi3_table
+	.byte		ashlsi3_19-ashlsi3_table
+	.byte		ashlsi3_20-ashlsi3_table
+	.byte		ashlsi3_21-ashlsi3_table
+	.byte		ashlsi3_22-ashlsi3_table
+	.byte		ashlsi3_23-ashlsi3_table
+	.byte		ashlsi3_24-ashlsi3_table
+	.byte		ashlsi3_25-ashlsi3_table
+	.byte		ashlsi3_26-ashlsi3_table
+	.byte		ashlsi3_27-ashlsi3_table
+	.byte		ashlsi3_28-ashlsi3_table
+	.byte		ashlsi3_29-ashlsi3_table
+	.byte		ashlsi3_30-ashlsi3_table
+	.byte		ashlsi3_31-ashlsi3_table
+	! Handlers: each label falls through 2-bit shifts to a shared tail.
+ashlsi3_6:
+	shll2	r0
+ashlsi3_4:
+	shll2	r0
+ashlsi3_2:
+	rts
+	shll2	r0			! delay slot: last 2 bits
+
+ashlsi3_7:
+	shll2	r0
+ashlsi3_5:
+	shll2	r0
+ashlsi3_3:
+	shll2	r0
+ashlsi3_1:
+	rts
+	shll	r0			! delay slot: last 1 bit
+
+ashlsi3_14:
+	shll2	r0
+ashlsi3_12:
+	shll2	r0
+ashlsi3_10:
+	shll2	r0
+ashlsi3_8:
+	rts
+	shll8	r0			! delay slot: last 8 bits
+
+ashlsi3_15:
+	shll2	r0
+ashlsi3_13:
+	shll2	r0
+ashlsi3_11:
+	shll2	r0
+ashlsi3_9:
+	shll8	r0
+	rts
+	shll	r0			! delay slot: 8 + 1 = 9
+
+ashlsi3_22:
+	shll2	r0
+ashlsi3_20:
+	shll2	r0
+ashlsi3_18:
+	shll2	r0
+ashlsi3_16:
+	rts
+	shll16	r0			! delay slot: last 16 bits
+
+ashlsi3_23:
+	shll2	r0
+ashlsi3_21:
+	shll2	r0
+ashlsi3_19:
+	shll2	r0
+ashlsi3_17:
+	shll16	r0
+	rts
+	shll	r0			! delay slot: 16 + 1 = 17
+
+ashlsi3_30:
+	shll2	r0
+ashlsi3_28:
+	shll2	r0
+ashlsi3_26:
+	shll2	r0
+ashlsi3_24:
+	shll16	r0
+	rts
+	shll8	r0			! delay slot: 16 + 8 = 24
+
+ashlsi3_31:
+	shll2	r0
+ashlsi3_29:
+	shll2	r0
+ashlsi3_27:
+	shll2	r0
+ashlsi3_25:
+	shll16	r0
+	shll8	r0
+	rts
+	shll	r0			! delay slot: 16 + 8 + 1 = 25
+
+ashlsi3_0:
+	rts
+	nop				! shift by 0: r0 = r4 unchanged
--- a/arch/sh/lib/ashrdi3.c
+++ b/arch/sh/lib/ashrdi3.c
@ -0,0 +1,31 @@
+#include <linux/module.h>
+
+#include "libgcc.h"
+
+/* 64-bit arithmetic right shift helper (libgcc __ashrdi3): returns u >> b. */
+long long __ashrdi3(long long u, word_type b)
+{
+	DWunion in, out;
+	word_type spill = 32 - b;	/* <= 0 when b >= 32 */
+
+	if (b == 0)
+		return u;
+	in.ll = u;
+
+	if (spill > 0) {
+		/* bits leaving the high word enter the top of the low word */
+		unsigned int from_high = (unsigned int) in.s.high << spill;
+		unsigned int low_part = (unsigned int) in.s.low >> b;
+
+		out.s.high = in.s.high >> b;
+		out.s.low = low_part | from_high;
+	} else {
+		/* the high word becomes pure sign fill: 1..1 or 0..0 */
+		out.s.high = in.s.high >> 31;
+		/* the low word is the old high word, shifted arithmetically */
+		out.s.low = in.s.high >> -spill;
+	}
+
+	return out.ll;
+}
+EXPORT_SYMBOL(__ashrdi3);
--- a/arch/sh/lib/ashrsi3.S
+++ b/arch/sh/lib/ashrsi3.S
@ -0,0 +1,185 @@
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+   2004, 2005, 2006
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+!! libgcc routines for the Renesas / SuperH SH CPUs.
+!! Contributed by Steve Chamberlain.
+!! sac@cygnus.com
+
+!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines
+!! recoded in assembly by Toshiyasu Morita
+!! tm@netcom.com
+
+/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and
+   ELF local label prefixes by J"orn Rennecke
+   amylaar@cygnus.com  */
+
+!
+! __ashrsi3
+!
+! Entry:
+!
+! r4: Value to shift
+! r5: Shift count (only the low 5 bits are used)
+!
+! Exit:
+!
+! r0: Result
+!
+! Destroys:
+!
+! r5 (overwritten with the table offset); T bit on the paths that shift
+!
+
+	.global	__ashrsi3
+	
+	.align	2
+__ashrsi3:
+	mov	#31,r0
+	and	r0,r5			! r5 = shift count masked to 0..31
+	mova	ashrsi3_table,r0	! r0 = address of the offset table
+	mov.b	@(r0,r5),r5		! r5 = handler offset from ashrsi3_table
+#ifdef __sh1__
+	add	r5,r0
+	jmp	@r0
+#else
+	braf	r5
+#endif
+	mov	r4,r0			! delay slot: r0 = value to shift
+
+	.align	2
+ashrsi3_table:
+	.byte		ashrsi3_0-ashrsi3_table
+	.byte		ashrsi3_1-ashrsi3_table
+	.byte		ashrsi3_2-ashrsi3_table
+	.byte		ashrsi3_3-ashrsi3_table
+	.byte		ashrsi3_4-ashrsi3_table
+	.byte		ashrsi3_5-ashrsi3_table
+	.byte		ashrsi3_6-ashrsi3_table
+	.byte		ashrsi3_7-ashrsi3_table
+	.byte		ashrsi3_8-ashrsi3_table
+	.byte		ashrsi3_9-ashrsi3_table
+	.byte		ashrsi3_10-ashrsi3_table
+	.byte		ashrsi3_11-ashrsi3_table
+	.byte		ashrsi3_12-ashrsi3_table
+	.byte		ashrsi3_13-ashrsi3_table
+	.byte		ashrsi3_14-ashrsi3_table
+	.byte		ashrsi3_15-ashrsi3_table
+	.byte		ashrsi3_16-ashrsi3_table
+	.byte		ashrsi3_17-ashrsi3_table
+	.byte		ashrsi3_18-ashrsi3_table
+	.byte		ashrsi3_19-ashrsi3_table
+	.byte		ashrsi3_20-ashrsi3_table
+	.byte		ashrsi3_21-ashrsi3_table
+	.byte		ashrsi3_22-ashrsi3_table
+	.byte		ashrsi3_23-ashrsi3_table
+	.byte		ashrsi3_24-ashrsi3_table
+	.byte		ashrsi3_25-ashrsi3_table
+	.byte		ashrsi3_26-ashrsi3_table
+	.byte		ashrsi3_27-ashrsi3_table
+	.byte		ashrsi3_28-ashrsi3_table
+	.byte		ashrsi3_29-ashrsi3_table
+	.byte		ashrsi3_30-ashrsi3_table
+	.byte		ashrsi3_31-ashrsi3_table
+	! Handlers: each label falls through one-bit shifts to a shared tail.
+ashrsi3_31:
+	rotcl	r0			! T = sign bit of r0
+	rts
+	subc	r0,r0			! delay slot: r0 = -T, i.e. 0 or -1
+
+ashrsi3_30:
+	shar	r0
+ashrsi3_29:
+	shar	r0
+ashrsi3_28:
+	shar	r0
+ashrsi3_27:
+	shar	r0
+ashrsi3_26:
+	shar	r0
+ashrsi3_25:
+	shar	r0
+ashrsi3_24:
+	shlr16	r0			! logical shift by 16 + 8 = 24 ...
+	shlr8	r0
+	rts
+	exts.b	r0,r0			! delay slot: ... then sign-extend the low byte
+
+ashrsi3_23:
+	shar	r0
+ashrsi3_22:
+	shar	r0
+ashrsi3_21:
+	shar	r0
+ashrsi3_20:
+	shar	r0
+ashrsi3_19:
+	shar	r0
+ashrsi3_18:
+	shar	r0
+ashrsi3_17:
+	shar	r0
+ashrsi3_16:
+	shlr16	r0			! logical shift by 16 ...
+	rts
+	exts.w	r0,r0			! delay slot: ... then sign-extend the low word
+
+ashrsi3_15:
+	shar	r0
+ashrsi3_14:
+	shar	r0
+ashrsi3_13:
+	shar	r0
+ashrsi3_12:
+	shar	r0
+ashrsi3_11:
+	shar	r0
+ashrsi3_10:
+	shar	r0
+ashrsi3_9:
+	shar	r0
+ashrsi3_8:
+	shar	r0
+ashrsi3_7:
+	shar	r0
+ashrsi3_6:
+	shar	r0
+ashrsi3_5:
+	shar	r0
+ashrsi3_4:
+	shar	r0
+ashrsi3_3:
+	shar	r0
+ashrsi3_2:
+	shar	r0
+ashrsi3_1:
+	rts
+	shar	r0			! delay slot: the final one-bit shift
+
+ashrsi3_0:
+	rts
+	nop				! shift by 0: r0 = r4 unchanged
--- a/arch/sh/lib/checksum.S
+++ b/arch/sh/lib/checksum.S
@ -0,0 +1,417 @@
+/* $Id: checksum.S,v 1.10 2001/07/06 13:11:32 gniibe Exp $
+ *
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IP/TCP/UDP checksumming routines
+ *
+ * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Tom May, <ftom@netcom.com>
+ *              Pentium Pro/II routines:
+ *              Alexander Kjeldaas <astor@guardian.no>
+ *              Finn Arne Gangstad <finnag@guardian.no>
+ *		Lots of code moved from tcp.c and ip.c; see those files
+ *		for more names.
+ *
+ * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
+ *			     handling.
+ *		Andi Kleen,  add zeroing on error
+ *                   converted to pure assembler
+ *
+ * SuperH version:  Copyright (C) 1999  Niibe Yutaka
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/errno.h>
+#include <linux/linkage.h>
+
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+
+/*	
+ * asmlinkage __wsum csum_partial(const void *buf, int len, __wsum sum);
+ */
+
+.text
+ENTRY(csum_partial)
+	  /*
+	   * In: r4 = buf, r5 = len, r6 = sum.  Out: r0 = sum (r7 = saved buf).
+	   * Experiments with Ethernet and SLIP connections show that buff
+	   * is aligned on either a 2-byte or 4-byte boundary (the twofold
+	   * speedup for 4-byte alignment was measured on 486 and Pentium).
+	   * It is easy to convert 2-byte alignment to 4-byte alignment.
+	   */
+	mov	r4, r0
+	tst	#3, r0		! Check alignment.
+	bt/s	2f		! Jump if alignment is ok.
+	 mov	r4, r7		! Keep a copy to check for alignment
+	!
+	tst	#1, r0		! Check alignment.
+	bt	21f		! Jump if alignment is boundary of 2bytes.
+
+	! buf is odd
+	tst	r5, r5
+	add	#-1, r5
+	bt	9f
+	mov.b	@r4+, r0
+	extu.b	r0, r0
+	addc	r0, r6		! t=0 from previous tst
+	mov	r6, r0		! rotate the sum left by 8 bits (undone at 9:)
+	shll8	r6
+	shlr16	r0
+	shlr8	r0
+	or	r0, r6
+	mov	r4, r0
+	tst	#2, r0
+	bt	2f
+21:
+	! buf is 2 byte aligned (len could be 0)
+	add	#-2, r5		! Alignment uses up two bytes.
+	cmp/pz	r5		! T = (r5 >= 0)
+	bt/s	1f		! Jump if we had at least two bytes.
+	 clrt
+	bra	6f
+	 add	#2, r5		! r5 was < 2.  Deal with it.
+1:
+	mov.w	@r4+, r0
+	extu.w	r0, r0
+	addc	r0, r6
+	bf	2f
+	add	#1, r6		! fold the carry back into the sum
+2:
+	! buf is 4 byte aligned (len could be 0)
+	mov	r5, r1
+	mov	#-5, r0
+	shld	r0, r1		! r1 = len / 32 = number of 32-byte blocks
+	tst	r1, r1
+	bt/s	4f		! if it's =0, go to 4f
+	 clrt
+	.align	2
+3:
+	mov.l	@r4+, r0
+	mov.l	@r4+, r2
+	mov.l	@r4+, r3
+	addc	r0, r6
+	mov.l	@r4+, r0
+	addc	r2, r6
+	mov.l	@r4+, r2
+	addc	r3, r6
+	mov.l	@r4+, r3
+	addc	r0, r6
+	mov.l	@r4+, r0
+	addc	r2, r6
+	mov.l	@r4+, r2
+	addc	r3, r6
+	addc	r0, r6
+	addc	r2, r6
+	movt	r0		! save the carry: dt below overwrites T
+	dt	r1
+	bf/s	3b
+	 cmp/eq	#1, r0		! delay slot: T = saved carry again
+	! here, we know r1==0
+	addc	r1, r6			! add carry to r6
+4:
+	mov	r5, r0
+	and	#0x1c, r0	! r0 = bytes left in whole longwords (0..28)
+	tst	r0, r0
+	bt	6f
+	! 4 bytes or more remaining
+	mov	r0, r1
+	shlr2	r1		! r1 = number of longwords left
+	mov	#0, r2
+5:
+	addc	r2, r6
+	mov.l	@r4+, r2
+	movt	r0		! save the carry across dt, as in loop 3
+	dt	r1
+	bf/s	5b
+	 cmp/eq	#1, r0
+	addc	r2, r6
+	addc	r1, r6		! r1==0 here, so it means add carry-bit
+6:
+	! 3 bytes or less remaining
+	mov	#3, r0
+	and	r0, r5
+	tst	r5, r5
+	bt	9f		! if it's =0 go to 9f
+	mov	#2, r1
+	cmp/hs  r1, r5
+	bf	7f		! exactly one byte left
+	mov.w	@r4+, r0
+	extu.w	r0, r0
+	cmp/eq	r1, r5
+	bt/s	8f		! exactly two bytes left
+	 clrt
+	shll16	r0
+	addc	r0, r6
+7:
+	mov.b	@r4+, r0
+	extu.b	r0, r0
+#ifndef	__LITTLE_ENDIAN__
+	shll8	r0
+#endif
+8:
+	addc	r0, r6
+	mov	#0, r0
+	addc	r0, r6		! add the final carry
+9:
+	! Check if the buffer was misaligned, if so realign sum
+	mov	r7, r0
+	tst	#1, r0
+	bt	10f
+	mov	r6, r0		! odd buf: rotate the sum left by 8 bits again
+	shll8	r6
+	shlr16	r0
+	shlr8	r0
+	or	r0, r6
+10:
+	rts
+	 mov	r6, r0		! delay slot: return the sum in r0
+
+/*
+unsigned int csum_partial_copy_generic (const char *src, char *dst, int len, 
+					int sum, int *src_err_ptr, int *dst_err_ptr)
+ */ 
+
+/*
+ * Copy from ds while checksumming, otherwise like csum_partial
+ *
+ * The macros SRC and DST specify the type of access for the instruction.
+ * thus we can call a custom exception handler for all access types.
+ *
+ * FIXME: could someone double-check whether I haven't mixed up some SRC and
+ *	  DST definitions? It's damn hard to trigger all cases.  I hope I got
+ *	  them all but there's no guarantee.
+ */
+
+#define SRC(...)			\
+	9999: __VA_ARGS__ ;		\
+	.section __ex_table, "a";	\
+	.long 9999b, 6001f	;	\
+	.previous
+/* SRC()/DST() emit one instruction plus an __ex_table entry: fixup 6001/6002 */
+#define DST(...)			\
+	9999: __VA_ARGS__ ;		\
+	.section __ex_table, "a";	\
+	.long 9999b, 6002f	;	\
+	.previous
+
+!
+! r4:	const char *SRC
+! r5:	char *DST
+! r6:	int LEN
+! r7:	int SUM
+!
+! on stack:
+! int *SRC_ERR_PTR
+! int *DST_ERR_PTR
+!
+ENTRY(csum_partial_copy_generic)
+	mov.l	r5,@-r15	! save dst; fixup 6001 reloads it from @(4,r15)
+	mov.l	r6,@-r15	! save len; fixup 6001 reloads it from @r15
+
+	mov	#3,r0		! Check src and dest are equally aligned
+	mov	r4,r1
+	and	r0,r1
+	and	r5,r0
+	cmp/eq	r1,r0
+	bf	3f		! Different alignments, use slow version
+	tst	#1,r0		! Check dest word aligned
+	bf	3f		! If not, do it the slow way
+
+	mov	#2,r0
+	tst	r0,r5		! Check dest alignment. 
+	bt	2f		! Jump if alignment is ok.
+	add	#-2,r6		! Alignment uses up two bytes.
+	cmp/pz	r6		! Jump if we had at least two bytes.
+	bt/s	1f
+	 clrt
+	add	#2,r6		! r6 was < 2.	Deal with it.
+	bra	4f
+	 mov	r6,r2
+
+3:	! Handle different src and dest alignments.
+	! This is not common, so simple byte by byte copy will do.
+	mov	r6,r2		! r2 = len, kept for the odd-byte test below
+	shlr	r6		! r6 = number of byte pairs
+	tst	r6,r6
+	bt	4f
+	clrt
+	.align	2
+5:
+SRC(	mov.b	@r4+,r1 	)
+SRC(	mov.b	@r4+,r0		)
+	extu.b	r1,r1
+DST(	mov.b	r1,@r5		)
+DST(	mov.b	r0,@(1,r5)	)
+	extu.b	r0,r0
+	add	#2,r5
+
+#ifdef	__LITTLE_ENDIAN__
+	shll8	r0
+#else
+	shll8	r1
+#endif
+	or	r1,r0		! r0 = the two bytes as one 16-bit word
+
+	addc	r0,r7
+	movt	r0		! save the carry: dt below overwrites T
+	dt	r6
+	bf/s	5b
+	 cmp/eq	#1,r0		! delay slot: T = saved carry again
+	mov	#0,r0
+	addc	r0, r7
+
+	mov	r2, r0
+	tst	#1, r0
+	bt	7f		! even length: done
+	bra	5f		! odd length: copy the last byte
+	 clrt
+
+	! src and dest equally aligned, but to a two byte boundary.
+	! Handle first two bytes as a special case
+	.align	2
+1:	
+SRC(	mov.w	@r4+,r0		)
+DST(	mov.w	r0,@r5		)
+	add	#2,r5
+	extu.w	r0,r0
+	addc	r0,r7
+	mov	#0,r0
+	addc	r0,r7
+2:
+	mov	r6,r2		! r2 = remaining length, reused for the tails
+	mov	#-5,r0
+	shld	r0,r6		! r6 = number of 32-byte blocks
+	tst	r6,r6
+	bt/s	2f
+	 clrt
+	.align	2
+1:	
+SRC(	mov.l	@r4+,r0		)
+SRC(	mov.l	@r4+,r1		)
+	addc	r0,r7
+DST(	mov.l	r0,@r5		)
+DST(	mov.l	r1,@(4,r5)	)
+	addc	r1,r7
+
+SRC(	mov.l	@r4+,r0		)
+SRC(	mov.l	@r4+,r1		)
+	addc	r0,r7
+DST(	mov.l	r0,@(8,r5)	)
+DST(	mov.l	r1,@(12,r5)	)
+	addc	r1,r7
+
+SRC(	mov.l	@r4+,r0 	)
+SRC(	mov.l	@r4+,r1		)
+	addc	r0,r7
+DST(	mov.l	r0,@(16,r5)	)
+DST(	mov.l	r1,@(20,r5)	)
+	addc	r1,r7
+
+SRC(	mov.l	@r4+,r0		)
+SRC(	mov.l	@r4+,r1		)
+	addc	r0,r7
+DST(	mov.l	r0,@(24,r5)	)
+DST(	mov.l	r1,@(28,r5)	)
+	addc	r1,r7
+	add	#32,r5
+	movt	r0		! save the carry across dt
+	dt	r6
+	bf/s	1b
+	 cmp/eq	#1,r0
+	mov	#0,r0
+	addc	r0,r7
+
+2:	mov	r2,r6
+	mov	#0x1c,r0
+	and	r0,r6		! r6 = bytes left in whole longwords (0..28)
+	cmp/pl	r6
+	bf/s	4f
+	 clrt
+	shlr2	r6		! r6 = number of longwords left
+3:	
+SRC(	mov.l	@r4+,r0	)
+	addc	r0,r7
+DST(	mov.l	r0,@r5	)
+	add	#4,r5
+	movt	r0		! save the carry across dt
+	dt	r6
+	bf/s	3b
+	 cmp/eq	#1,r0
+	mov	#0,r0
+	addc	r0,r7
+4:	mov	r2,r6
+	mov	#3,r0
+	and	r0,r6		! r6 = trailing bytes (0..3)
+	cmp/pl	r6
+	bf	7f
+	mov	#2,r1
+	cmp/hs	r1,r6
+	bf	5f		! exactly one byte left
+SRC(	mov.w	@r4+,r0	)
+DST(	mov.w	r0,@r5	)
+	extu.w	r0,r0
+	add	#2,r5
+	cmp/eq	r1,r6
+	bt/s	6f		! exactly two bytes left
+	 clrt
+	shll16	r0
+	addc	r0,r7
+5:	
+SRC(	mov.b	@r4+,r0	)
+DST(	mov.b	r0,@r5	)
+	extu.b	r0,r0
+#ifndef	__LITTLE_ENDIAN__
+	shll8	r0
+#endif
+6:	addc	r0,r7
+	mov	#0,r0
+	addc	r0,r7
+7:
+5000:
+	! 5000 is the common exit; both fixup handlers jump back to it.
+# Exception handler:
+.section .fixup, "ax"
+
+6001:
+	mov.l	@(8,r15),r0			! src_err_ptr
+	mov	#-EFAULT,r1
+	mov.l	r1,@r0
+
+	! zero the complete destination - computing the rest
+	! is too much work 
+	mov.l	@(4,r15),r5		! dst
+	mov.l	@r15,r6			! len
+	mov	#0,r7			! r7 = 0: fill byte and returned sum
+1:	mov.b	r7,@r5
+	dt	r6
+	bf/s	1b
+	 add	#1,r5
+	mov.l	8000f,r0
+	jmp	@r0
+	 nop
+	.align	2
+8000:	.long	5000b
+
+6002:
+	mov.l	@(12,r15),r0			! dst_err_ptr
+	mov	#-EFAULT,r1
+	mov.l	r1,@r0
+	mov.l	8001f,r0
+	jmp	@r0
+	 nop
+	.align	2
+8001:	.long	5000b
+
+.previous
+	add	#8,r15		! drop the saved dst and len
+	rts
+	 mov	r7,r0		! delay slot: return the checksum
--- a/arch/sh/lib/copy_page.S
+++ b/arch/sh/lib/copy_page.S
@ -0,0 +1,389 @@
+/*
+ * copy_page, __copy_user_page, __copy_user implementation of SuperH
+ *
+ * Copyright (C) 2001  Niibe Yutaka & Kaz Kojima
+ * Copyright (C) 2002  Toshinobu Sugioka
+ * Copyright (C) 2006  Paul Mundt
+ */
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/*
+ * copy_page
+ * @to: P1 address
+ * @from: P1 address
+ *
+ * void copy_page(void *to, void *from)
+ */
+
+/*
+ * r0, r1, r2, r3, r4, r5, r6, r7 --- scratch 
+ * r8 --- from + PAGE_SIZE
+ * r9 --- not used
+ * r10 --- to
+ * r11 --- from
+ */
+ENTRY(copy_page)
+	mov.l	r8,@-r15		! save the callee-saved registers used
+	mov.l	r10,@-r15
+	mov.l	r11,@-r15
+	mov	r4,r10			! r10 = to
+	mov	r5,r11			! r11 = from
+	mov	r5,r8
+	mov	#(PAGE_SIZE >> 10), r0
+	shll8	r0
+	shll2	r0			! r0 = PAGE_SIZE (shifted back up by 10)
+	add	r0,r8			! r8 = from + PAGE_SIZE, the loop end
+	! Each iteration copies 32 bytes: eight loads, then eight stores.
+1:	mov.l	@r11+,r0
+	mov.l	@r11+,r1
+	mov.l	@r11+,r2
+	mov.l	@r11+,r3
+	mov.l	@r11+,r4
+	mov.l	@r11+,r5
+	mov.l	@r11+,r6
+	mov.l	@r11+,r7
+#if defined(CONFIG_CPU_SH4)
+	movca.l	r0,@r10
+#else
+	mov.l	r0,@r10
+#endif
+	add	#32,r10			! store the other seven words downwards
+	mov.l	r7,@-r10
+	mov.l	r6,@-r10
+	mov.l	r5,@-r10
+	mov.l	r4,@-r10
+	mov.l	r3,@-r10
+	mov.l	r2,@-r10
+	mov.l	r1,@-r10		! r10 is now block start + 4
+	cmp/eq	r11,r8
+	bf/s	1b
+	 add	#28,r10			! delay slot: r10 = start of the next block
+	!
+	mov.l	@r15+,r11		! restore in reverse order of the saves
+	mov.l	@r15+,r10
+	mov.l	@r15+,r8
+	rts
+	 nop
+
+/*
+ * __kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n);
+ * Return the number of bytes NOT copied.  EX()/EX_NO_POP() wrap one access
+ * and add an __ex_table entry for it (fixup 6000 or 6005 respectively). */
+#define EX(...)			\
+	9999: __VA_ARGS__ ;		\
+	.section __ex_table, "a";	\
+	.long 9999b, 6000f	;	\
+	.previous
+#define EX_NO_POP(...)			\
+	9999: __VA_ARGS__ ;		\
+	.section __ex_table, "a";	\
+	.long 9999b, 6005f	;	\
+	.previous
+ENTRY(__copy_user)
+	! Check if small number of bytes
+	mov	#11,r0
+	mov	r4,r3
+	cmp/gt	r0,r6		! r6 (len) > r0 (11)
+	bf/s	.L_cleanup_loop_no_pop
+	 add	r6,r3		! last destination address
+
+	! Calculate bytes needed to align to src
+	mov.l	r11,@-r15	! r8-r11 are saved here and restored at the exit
+	neg	r5,r0
+	mov.l	r10,@-r15
+	add	#4,r0
+	mov.l	r9,@-r15
+	and	#3,r0		! r0 = (4 - src) & 3
+	mov.l	r8,@-r15
+	tst	r0,r0
+	bt	2f
+
+1:
+	! Copy bytes to long word align src
+EX(	mov.b	@r5+,r1		)
+	dt	r0
+	add	#-1,r6
+EX(	mov.b	r1,@r4		)
+	bf/s	1b
+	 add	#1,r4
+
+	! Jump to appropriate routine depending on dest
+2:	mov	#3,r1
+	mov	r6, r2
+	and	r4,r1		! r1 = dest & 3
+	shlr2	r2		! r2 = remaining length in longwords
+	shll2	r1		! r1 = byte offset into the jump table
+	mova	.L_jump_tbl,r0
+	mov.l	@(r0,r1),r1
+	jmp	@r1
+	 nop
+
+	.align 2
+.L_jump_tbl:
+	.long	.L_dest00
+	.long	.L_dest01
+	.long	.L_dest10
+	.long	.L_dest11
+
+/*
+ * Come here if there are less than 12 bytes to copy
+ *
+ * Keep the branch target close, so the bf/s callee doesn't overflow
+ * and result in a more expensive branch being inserted. This is the
+ * fast-path for small copies, the jump via the jump table will hit the
+ * default slow-path cleanup. -PFM.
+ */
+.L_cleanup_loop_no_pop:
+	tst	r6,r6		! Check explicitly for zero
+	bt	1f
+
+2:
+EX_NO_POP(	mov.b	@r5+,r0		)
+	dt	r6
+EX_NO_POP(	mov.b	r0,@r4		)
+	bf/s	2b
+	 add	#1,r4
+
+1:	mov	#0,r0		! normal return
+5000:
+	! Fixup 6005 rejoins at 5000; nothing was pushed on this path.
+# Exception handler:
+.section .fixup, "ax"
+6005:
+	mov.l	8000f,r1
+	mov	r3,r0
+	jmp	@r1
+	 sub	r4,r0		! delay slot: r0 = last dest - current dest
+	.align	2
+8000:	.long	5000b
+
+.previous
+	rts
+	 nop
+
+! Destination = 00 (dest & 3 == 0; reached through .L_jump_tbl)
+
+.L_dest00:
+	! Skip the large copy for small transfers
+	mov	#(32+32-4), r0
+	cmp/gt	r6, r0		! r0 (60) > r6 (len)
+	bt	1f
+
+	! Align dest to a 32 byte boundary
+	neg	r4,r0
+	add	#0x20, r0
+	and	#0x1f, r0	! r0 = bytes up to the next 32-byte boundary
+	tst	r0, r0
+	bt	2f
+
+	sub	r0, r6
+	shlr2	r0		! r0 = the same distance in longwords
+3:
+EX(	mov.l	@r5+,r1		)
+	dt	r0
+EX(	mov.l	r1,@r4		)
+	bf/s	3b
+	 add	#4,r4
+	! Main loop: eight longword loads, then eight stores, per 32 bytes.
+2:
+EX(	mov.l	@r5+,r0		)
+EX(	mov.l	@r5+,r1		)
+EX(	mov.l	@r5+,r2		)
+EX(	mov.l	@r5+,r7		)
+EX(	mov.l	@r5+,r8		)
+EX(	mov.l	@r5+,r9		)
+EX(	mov.l	@r5+,r10	)
+EX(	mov.l	@r5+,r11	)
+#ifdef CONFIG_CPU_SH4
+EX(	movca.l	r0,@r4		)
+#else
+EX(	mov.l	r0,@r4		)
+#endif
+	add	#-32, r6
+EX(	mov.l	r1,@(4,r4)	)
+	mov	#32, r0
+EX(	mov.l	r2,@(8,r4)	)
+	cmp/gt	r6, r0		! r0 (32) > r6 (len)
+EX(	mov.l	r7,@(12,r4)	)
+EX(	mov.l	r8,@(16,r4)	)
+EX(	mov.l	r9,@(20,r4)	)
+EX(	mov.l	r10,@(24,r4)	)
+EX(	mov.l	r11,@(28,r4)	)
+	bf/s	2b		! repeat while at least 32 bytes remain
+	 add	#32,r4
+
+1:	mov	r6, r0
+	shlr2	r0		! r0 = whole longwords still to copy
+	tst	r0, r0
+	bt	.L_cleanup
+1:
+EX(	mov.l	@r5+,r1		)
+	dt	r0
+EX(	mov.l	r1,@r4		)
+	bf/s	1b
+	 add	#4,r4
+
+	bra	.L_cleanup	! the last 0..3 bytes are copied there
+	 nop
+
+! Destination = 10 (dest & 3 == 2; reached through .L_jump_tbl)
+
+.L_dest10:
+	mov	r2,r7
+	shlr2	r7
+	shlr	r7		! r7 = longwords / 8 = number of 32-byte chunks
+	tst	r7,r7
+	mov	#7,r0
+	bt/s	1f		! no full chunk: go to the longword loop
+	 and	r0,r2		! delay slot: r2 = leftover longwords (0..7)
+2:
+	dt	r7		! T is tested by the bf/s that ends the chunk
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+EX(	mov.l	@r5+,r0		)
+EX(	mov.l	@r5+,r1		)
+EX(	mov.l	@r5+,r8		)
+EX(	mov.l	@r5+,r9		)
+EX(	mov.l	@r5+,r10	)
+EX(	mov.w	r0,@r4		)
+	add	#2,r4		! dest is longword aligned from here on
+	xtrct	r1,r0
+	xtrct	r8,r1
+	xtrct	r9,r8
+	xtrct	r10,r9
+
+EX(	mov.l	r0,@r4		)
+EX(	mov.l	r1,@(4,r4)	)
+EX(	mov.l	r8,@(8,r4)	)
+EX(	mov.l	r9,@(12,r4)	)
+
+EX(	mov.l	@r5+,r1		)
+EX(	mov.l	@r5+,r8		)
+EX(	mov.l	@r5+,r0		)
+	xtrct	r1,r10
+	xtrct	r8,r1
+	xtrct	r0,r8
+	shlr16	r0
+EX(	mov.l	r10,@(16,r4)	)
+EX(	mov.l	r1,@(20,r4)	)
+EX(	mov.l	r8,@(24,r4)	)
+EX(	mov.w	r0,@(28,r4)	)
+	bf/s	2b
+	 add	#30,r4		! delay slot: 2 + 30 = 32 bytes per chunk
+#else
+EX(	mov.l	@(28,r5),r0	)
+EX(	mov.l	@(24,r5),r8	)
+EX(	mov.l	@(20,r5),r9	)
+EX(	mov.l	@(16,r5),r10	)
+EX(	mov.w	r0,@(30,r4)	)
+	add	#-2,r4		! r4 = longword-aligned address below dest
+	xtrct	r8,r0
+	xtrct	r9,r8
+	xtrct	r10,r9
+EX(	mov.l	r0,@(28,r4)	)
+EX(	mov.l	r8,@(24,r4)	)
+EX(	mov.l	r9,@(20,r4)	)
+	! Every access in this chunk may fault, so each one is wrapped in EX().
+EX(	mov.l	@(12,r5),r0	)
+EX(	mov.l	@(8,r5),r8	)
+	xtrct	r0,r10
+EX(	mov.l	@(4,r5),r9	)
+EX(	mov.l	r10,@(16,r4)	)
+EX(	mov.l	@r5,r10		)
+	xtrct	r8,r0
+	xtrct	r9,r8
+	xtrct	r10,r9
+EX(	mov.l	r0,@(12,r4)	)
+EX(	mov.l	r8,@(8,r4)	)
+	swap.w	r10,r0
+EX(	mov.l	r9,@(4,r4)	)
+EX(	mov.w	r0,@(2,r4)	)
+
+	add	#32,r5
+	bf/s	2b
+	 add	#34,r4		! delay slot: -2 + 34 = 32 bytes per chunk
+#endif
+	tst	r2,r2
+	bt	.L_cleanup
+
+1:	! Read longword, write two words per iteration
+EX(	mov.l	@r5+,r0		)
+	dt	r2
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+EX(	mov.w	r0,@r4		)
+	shlr16	r0
+EX(	mov.w 	r0,@(2,r4)	)
+#else
+EX(	mov.w	r0,@(2,r4)	)
+	shlr16	r0
+EX(	mov.w	r0,@r4		)
+#endif
+	bf/s	1b
+	 add	#4,r4
+
+	bra	.L_cleanup	! the last 0..3 bytes are copied there
+	 nop
+
+! Destination = 01 or 11 (dest is odd; reached through .L_jump_tbl)
+
+.L_dest01:
+.L_dest11:
+	! Read longword, write byte, word, byte per iteration
+EX(	mov.l	@r5+,r0		)
+	dt	r2		! r2 = longwords left; T is tested by bf/s below
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+EX(	mov.b	r0,@r4		)
+	shlr8	r0
+	add	#1,r4
+EX(	mov.w	r0,@r4		)
+	shlr16	r0
+EX(	mov.b	r0,@(2,r4)	)
+	bf/s	.L_dest01
+	 add	#3,r4		! delay slot: 1 + 3 = 4 bytes per iteration
+#else
+EX(	mov.b	r0,@(3,r4)	)
+	shlr8	r0
+	swap.w	r0,r7		! the low byte of r7 is now the first byte to store
+EX(	mov.b	r7,@r4		)
+	add	#1,r4
+EX(	mov.w	r0,@r4		)
+	bf/s	.L_dest01
+	 add	#3,r4		! delay slot: 1 + 3 = 4 bytes per iteration
+#endif
+
+! Cleanup last few bytes
+.L_cleanup:
+	mov	r6,r0
+	and	#3,r0		! r0 = remaining length & 3
+	tst	r0,r0
+	bt	.L_exit
+	mov	r0,r6
+
+.L_cleanup_loop:
+EX(	mov.b	@r5+,r0		)
+	dt	r6
+EX(	mov.b	r0,@r4		)
+	bf/s	.L_cleanup_loop
+	 add	#1,r4
+
+.L_exit:
+	mov	#0,r0		! normal return
+
+5000:
+	! Fixup 6000 rejoins at 5000 so that r8-r11 are still restored.
+# Exception handler:
+.section .fixup, "ax"
+6000:
+	mov.l	8000f,r1
+	mov	r3,r0
+	jmp	@r1
+	 sub	r4,r0		! delay slot: r0 = last dest - current dest
+	.align	2
+8000:	.long	5000b
+
+.previous
+	mov.l	@r15+,r8	! restore in reverse order of the saves at entry
+	mov.l	@r15+,r9
+	mov.l	@r15+,r10
+	rts
+	 mov.l	@r15+,r11
--- a/arch/sh/lib/delay.c
+++ b/arch/sh/lib/delay.c
@ -0,0 +1,53 @@
+/*
+ *	Precise Delay Loops for SuperH
+ *
+ *	Copyright (C) 1999 Niibe Yutaka & Kaz Kojima
+ */
+
+#include <linux/sched.h>
+#include <linux/delay.h>
+
+void __delay(unsigned long loops)
+{
+	__asm__ __volatile__(
+		/*
+		 * Count loops down to zero with a dt/bf loop.  ST40-300
+		 * appears to have an issue with this code, normally taking
+		 * two cycles each loop, as with all other SH variants: if
+		 * the branch and the delay slot straddle an 8 byte boundary,
+		 * this increases to 3 cycles.
+		 * This align directive ensures this doesn't occur.
+		 */
+		".balign 8\n\t"
+
+		"tst	%0, %0\n\t"	/* T = (loops == 0) */
+		"1:\t"
+		"bf/s	1b\n\t"		/* loop while T is clear */
+		" dt	%0"		/* delay slot: decrement, T = (result == 0) */
+		: "=r" (loops)
+		: "0" (loops)
+		: "t");
+}
+
+inline void __const_udelay(unsigned long xloops)
+{
+	xloops *= 4;	/* pairs with the HZ/4 factor in the multiplier below */
+	__asm__("dmulu.l	%0, %2\n\t"	/* 32x32 -> 64-bit unsigned multiply */
+		"sts	mach, %0"		/* keep only the upper 32 bits */
+		: "=r" (xloops)
+		: "0" (xloops),
+		  "r" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))
+		: "macl", "mach");
+	__delay(++xloops);	/* always spin for at least one loop */
+}
+
+void __udelay(unsigned long usecs)
+{
+	__const_udelay(0x10c6UL * usecs);	/* 0x10c6 = 4294, about 2**32 / 10**6 */
+}
+
+void __ndelay(unsigned long nsecs)
+{
+	__const_udelay(5UL * nsecs);	/* 5 stands in for 2**32 / 10**9 (about 4.29) */
+}
+
--- a/arch/sh/lib/div64-generic.c
+++ b/arch/sh/lib/div64-generic.c
@ -0,0 +1,19 @@
+/*
+ * Generic __div64_32 wrapper for __xdiv64_32.
+ */
+
+#include <linux/types.h>
+#include <asm/div64.h>
+
+extern uint64_t __xdiv64_32(u64 n, u32 d);
+
+/*
+ * Divide the 64-bit value at @xp by @y: the quotient is written back
+ * through @xp and the 32-bit remainder is returned.
+ */
+uint32_t __div64_32(u64 *xp, u32 y)
+{
+	const u64 dividend = *xp;
+	const uint64_t quotient = __xdiv64_32(dividend, y);
+
+	*xp = quotient;
+
+	/* remainder = dividend - quotient * divisor, truncated to 32 bits */
+	return dividend - quotient * y;
+}
--- a/arch/sh/lib/div64.S
+++ b/arch/sh/lib/div64.S
@ -0,0 +1,46 @@
+/*
+ * unsigned long long __xdiv64_32(unsigned long long n, unsigned long d);
+ *
+ * Divides the 64-bit value n by the 32-bit value d and returns the
+ * 64-bit quotient. n arrives in r4/r5 and d in r6; the quotient leaves
+ * in r0/r1. Which register of each pair holds the high word depends on
+ * CONFIG_CPU_LITTLE_ENDIAN. r0-r3, MACL and the T bit are used as
+ * scratch.
+ */
+
+#include <linux/linkage.h>
+
+.text
+ENTRY(__xdiv64_32)
+	! Normalise the argument halves: r0 = low word of n, r1 = high word
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+	mov	r4, r0
+	mov	r5, r1
+#else
+	mov	r4, r1
+	mov	r5, r0
+#endif
+	cmp/hs	r6, r1		! T = (high word >= d), unsigned
+	bf.s	1f		! high < d: the high quotient word stays 0
+	 mov	#0, r2		! delay slot: r2 = 0 on both paths
+
+	! r2 = high / d, one quotient bit per rotcl/div1 step
+	mov	r1, r2
+	mov	#0, r3
+	div0u
+	.rept	32
+	rotcl	r2
+	div1	r6, r3
+	.endr
+	rotcl	r2
+	! r1 = high - (high / d) * d, the remainder carried into the low divide
+	mul.l	r6, r2
+	sts	macl, r3
+	sub	r3, r1
+1:
+	! Divide r1:r0 by d; the low quotient word accumulates in r0
+	div0u
+	.rept	32
+	rotcl	r0
+	div1	r6, r1
+	.endr
+	! Return r2 as the high quotient word and r0 as the low one
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+	mov	r2, r1
+	rts
+	 rotcl	r0
+#else
+	rotcl	r0
+	mov	r0, r1
+	rts
+	 mov	r2, r0
+#endif
--- a/arch/sh/lib/io.c
+++ b/arch/sh/lib/io.c
@ -0,0 +1,82 @@
+/*
+ * arch/sh/lib/io.c - SH32 optimized I/O routines
+ *
+ * Copyright (C) 2000  Stuart Menefy
+ * Copyright (C) 2005  Paul Mundt
+ *
+ * Provide real functions which expand to whatever the header file defined.
+ * Also definitions of machine independent IO functions.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/module.h>
+#include <linux/io.h>
+
+/*
+ * __raw_readsl - read @len 32-bit words from one I/O address into memory.
+ * @addr:  I/O address; every word is read from this same location.
+ * @datap: destination buffer.
+ * @len:   number of 32-bit words (not bytes) to transfer.
+ *
+ * Works in three phases: single-word reads until the destination is
+ * 32-byte aligned, an unrolled asm loop moving eight words (0x20 bytes)
+ * per pass while at least eight words remain, then single-word reads for
+ * whatever is left.
+ *
+ * Inside the asm, the note "T if 32 > len" refers to 32 bytes: the
+ * register compared against len holds 0x20 >> 2 = 8 words. r0 and r7 are
+ * used as extra temporaries and are listed as clobbers.
+ */
+void __raw_readsl(const void __iomem *addr, void *datap, int len)
+{
+	u32 *data;
+
+	/* Head: one word at a time until data is 32-byte aligned */
+	for (data = datap; (len != 0) && (((u32)data & 0x1f) != 0); len--)
+		*data++ = __raw_readl(addr);
+
+	/* Bulk: eight reads from addr, eight stores to data, per iteration */
+	if (likely(len >= (0x20 >> 2))) {
+		int tmp2, tmp3, tmp4, tmp5, tmp6;
+
+		__asm__ __volatile__(
+			"1:			\n\t"
+			"mov.l	@%7, r0		\n\t"
+			"mov.l	@%7, %2		\n\t"
+#ifdef CONFIG_CPU_SH4
+			"movca.l r0, @%0	\n\t"
+#else
+			"mov.l	r0, @%0		\n\t"
+#endif
+			"mov.l	@%7, %3		\n\t"
+			"mov.l	@%7, %4		\n\t"
+			"mov.l	@%7, %5		\n\t"
+			"mov.l	@%7, %6		\n\t"
+			"mov.l	@%7, r7		\n\t"
+			"mov.l	@%7, r0		\n\t"
+			"mov.l	%2, @(0x04,%0)	\n\t"
+			"mov	#0x20>>2, %2	\n\t"
+			"mov.l	%3, @(0x08,%0)	\n\t"
+			"sub	%2, %1		\n\t"
+			"mov.l	%4, @(0x0c,%0)	\n\t"
+			"cmp/hi	%1, %2		! T if 32 > len	\n\t"
+			"mov.l	%5, @(0x10,%0)	\n\t"
+			"mov.l	%6, @(0x14,%0)	\n\t"
+			"mov.l	r7, @(0x18,%0)	\n\t"
+			"mov.l	r0, @(0x1c,%0)	\n\t"
+			"bf.s	1b		\n\t"
+			" add	#0x20, %0	\n\t"
+			: "=&r" (data), "=&r" (len),
+			  "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4),
+			  "=&r" (tmp5), "=&r" (tmp6)
+			: "r"(addr), "0" (data), "1" (len)
+			: "r0", "r7", "t", "memory");
+	}
+
+	/* Tail: fewer than eight words left */
+	for (; len != 0; len--)
+		*data++ = __raw_readl(addr);
+}
+EXPORT_SYMBOL(__raw_readsl);
+
+/*
+ * __raw_writesl - write @len 32-bit words from memory to one I/O address.
+ * @addr: I/O address; every word is written to this same location.
+ * @data: source buffer, read with post-increment.
+ * @len:  number of 32-bit words (not bytes) to transfer; 0 is a no-op.
+ */
+void __raw_writesl(void __iomem *addr, const void *data, int len)
+{
+	if (likely(len != 0)) {
+		int tmp1;
+
+		/*
+		 * "dt" decrements the word counter in place, so len must be
+		 * a read-write operand: an asm statement is not allowed to
+		 * modify an input-only operand. The store to addr sits in
+		 * the delay slot of "bf.s" and so runs on every pass,
+		 * including the last one.
+		 */
+		__asm__ __volatile__ (
+			"1:				\n\t"
+			"mov.l	@%0+, %1	\n\t"
+			"dt		%2		\n\t"
+			"bf.s		1b		\n\t"
+			" mov.l	%1, @%4		\n\t"
+			: "=&r" (data), "=&r" (tmp1), "+&r" (len)
+			: "0" (data), "r"(addr)
+			: "t", "memory");
+	}
+}
+EXPORT_SYMBOL(__raw_writesl);
--- a/arch/sh/lib/libgcc.h
+++ b/arch/sh/lib/libgcc.h
@ -0,0 +1,25 @@
+#ifndef __ASM_LIBGCC_H
+#define __ASM_LIBGCC_H
+
+#include <asm/byteorder.h>
+
+/* Integer type whose width is the machine word (GCC mode __word__). */
+typedef int word_type __attribute__ ((mode (__word__)));
+
+/*
+ * The two int halves of a long long, declared in memory order for the
+ * configured endianness so that they line up with DWunion.ll.
+ */
+#ifdef __BIG_ENDIAN
+struct DWstruct {
+	int high, low;
+};
+#elif defined(__LITTLE_ENDIAN)
+struct DWstruct {
+	int low, high;
+};
+#else
+#error I feel sick.
+#endif
+
+/* Access a long long either whole (ll) or as its high/low halves (s). */
+typedef union {
+	struct DWstruct s;
+	long long ll;
+} DWunion;
+
+#endif /* __ASM_LIBGCC_H */
--- a/arch/sh/lib/lshrdi3.c
+++ b/arch/sh/lib/lshrdi3.c
@ -0,0 +1,29 @@
+#include <linux/module.h>
+
+#include "libgcc.h"
+
+/*
+ * Logical (zero-filling) right shift of the 64-bit value @u by @b bits,
+ * built from 32-bit operations on its two halves.
+ */
+long long __lshrdi3(long long u, word_type b)
+{
+	DWunion in, out;
+	word_type spare;
+
+	if (!b)
+		return u;
+
+	in.ll = u;
+	spare = 32 - b;		/* bits of the high half that stay in it */
+
+	if (spare > 0) {
+		/* Bits leaving the high half land at the top of the low half */
+		const unsigned int spill = (unsigned int) in.s.high << spare;
+
+		out.s.high = (unsigned int) in.s.high >> b;
+		out.s.low = ((unsigned int) in.s.low >> b) | spill;
+	} else {
+		/* Shift of 32 or more: only the high half contributes */
+		out.s.high = 0;
+		out.s.low = (unsigned int) in.s.high >> -spare;
+	}
+
+	return out.ll;
+}
+
+EXPORT_SYMBOL(__lshrdi3);
--- a/arch/sh/lib/lshrsi3.S
+++ b/arch/sh/lib/lshrsi3.S
@ -0,0 +1,193 @@
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+   2004, 2005, 2006
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+!! libgcc routines for the Renesas / SuperH SH CPUs.
+!! Contributed by Steve Chamberlain.
+!! sac@cygnus.com
+
+!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines
+!! recoded in assembly by Toshiyasu Morita
+!! tm@netcom.com
+
+/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and
+   ELF local label prefixes by J"orn Rennecke
+   amylaar@cygnus.com  */
+
+!
+! __lshrsi3
+!
+! Logical shift right of a 32-bit value. The count is reduced to its
+! low five bits and used to index a table of byte offsets, each of which
+! selects a short chain of shlr/shlr2/shlr8/shlr16 instructions.
+!
+! Entry:
+!
+! r4: Value to shift
+! r5: Shifts
+!
+! Exit:
+!
+! r0: Result
+!
+! Destroys:
+!
+! r5 (overwritten with the masked count, then with the table offset)
+! T bit (written by the shlr that finishes odd shift counts)
+!
+	.global	__lshrsi3
+	
+	.align	2
+__lshrsi3:
+	mov	#31,r0
+	and	r0,r5			! r5 = count & 31
+	mova	lshrsi3_table,r0	! r0 = address of the offset table
+	mov.b	@(r0,r5),r5		! r5 = offset of the chain from the table
+#ifdef __sh1__
+	add	r5,r0			! absolute address of the chain
+	jmp	@r0
+#else
+	braf	r5			! table offsets double as braf displacements
+#endif
+	mov	r4,r0			! delay slot: r0 = value to shift
+
+	.align	2
+lshrsi3_table:
+	.byte		lshrsi3_0-lshrsi3_table
+	.byte		lshrsi3_1-lshrsi3_table
+	.byte		lshrsi3_2-lshrsi3_table
+	.byte		lshrsi3_3-lshrsi3_table
+	.byte		lshrsi3_4-lshrsi3_table
+	.byte		lshrsi3_5-lshrsi3_table
+	.byte		lshrsi3_6-lshrsi3_table
+	.byte		lshrsi3_7-lshrsi3_table
+	.byte		lshrsi3_8-lshrsi3_table
+	.byte		lshrsi3_9-lshrsi3_table
+	.byte		lshrsi3_10-lshrsi3_table
+	.byte		lshrsi3_11-lshrsi3_table
+	.byte		lshrsi3_12-lshrsi3_table
+	.byte		lshrsi3_13-lshrsi3_table
+	.byte		lshrsi3_14-lshrsi3_table
+	.byte		lshrsi3_15-lshrsi3_table
+	.byte		lshrsi3_16-lshrsi3_table
+	.byte		lshrsi3_17-lshrsi3_table
+	.byte		lshrsi3_18-lshrsi3_table
+	.byte		lshrsi3_19-lshrsi3_table
+	.byte		lshrsi3_20-lshrsi3_table
+	.byte		lshrsi3_21-lshrsi3_table
+	.byte		lshrsi3_22-lshrsi3_table
+	.byte		lshrsi3_23-lshrsi3_table
+	.byte		lshrsi3_24-lshrsi3_table
+	.byte		lshrsi3_25-lshrsi3_table
+	.byte		lshrsi3_26-lshrsi3_table
+	.byte		lshrsi3_27-lshrsi3_table
+	.byte		lshrsi3_28-lshrsi3_table
+	.byte		lshrsi3_29-lshrsi3_table
+	.byte		lshrsi3_30-lshrsi3_table
+	.byte		lshrsi3_31-lshrsi3_table
+
+lshrsi3_6:
+	shlr2	r0
+lshrsi3_4:
+	shlr2	r0
+lshrsi3_2:
+	rts
+	shlr2	r0
+
+lshrsi3_7:
+	shlr2	r0
+lshrsi3_5:
+	shlr2	r0
+lshrsi3_3:
+	shlr2	r0
+lshrsi3_1:
+	rts
+	shlr	r0
+
+lshrsi3_14:
+	shlr2	r0
+lshrsi3_12:
+	shlr2	r0
+lshrsi3_10:
+	shlr2	r0
+lshrsi3_8:
+	rts
+	shlr8	r0
+
+lshrsi3_15:
+	shlr2	r0
+lshrsi3_13:
+	shlr2	r0
+lshrsi3_11:
+	shlr2	r0
+lshrsi3_9:
+	shlr8	r0
+	rts
+	shlr	r0
+
+lshrsi3_22:
+	shlr2	r0
+lshrsi3_20:
+	shlr2	r0
+lshrsi3_18:
+	shlr2	r0
+lshrsi3_16:
+	rts
+	shlr16	r0
+
+lshrsi3_23:
+	shlr2	r0
+lshrsi3_21:
+	shlr2	r0
+lshrsi3_19:
+	shlr2	r0
+lshrsi3_17:
+	shlr16	r0
+	rts
+	shlr	r0
+
+lshrsi3_30:
+	shlr2	r0
+lshrsi3_28:
+	shlr2	r0
+lshrsi3_26:
+	shlr2	r0
+lshrsi3_24:
+	shlr16	r0
+	rts
+	shlr8	r0
+
+lshrsi3_31:
+	shlr2	r0
+lshrsi3_29:
+	shlr2	r0
+lshrsi3_27:
+	shlr2	r0
+lshrsi3_25:
+	shlr16	r0
+	shlr8	r0
+	rts
+	shlr	r0
+
+lshrsi3_0:
+	rts
+	nop
--- a/arch/sh/lib/mcount.S
+++ b/arch/sh/lib/mcount.S
@ -0,0 +1,290 @@
+/*
+ * arch/sh/lib/mcount.S
+ *
+ *  Copyright (C) 2008, 2009  Paul Mundt
+ *  Copyright (C) 2008, 2009  Matt Fleming
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <asm/ftrace.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+
+#define MCOUNT_ENTER()		\
+	mov.l	r4, @-r15;	\
+	mov.l	r5, @-r15;	\
+	mov.l	r6, @-r15;	\
+	mov.l	r7, @-r15;	\
+	sts.l	pr, @-r15;	\
+				\
+	mov.l	@(20,r15),r4;	\
+	sts	pr, r5
+
+#define MCOUNT_LEAVE()		\
+	lds.l	@r15+, pr;	\
+	mov.l	@r15+, r7;	\
+	mov.l	@r15+, r6;	\
+	mov.l	@r15+, r5;	\
+	rts;			\
+	 mov.l	@r15+, r4
+
+#ifdef CONFIG_STACK_DEBUG
+/*
+ * Perform diagnostic checks on the state of the kernel stack.
+ *
+ * Check for stack overflow: branch to stack_panic when the offset of
+ * the stack pointer within its THREAD_SIZE-sized area is no larger than
+ * TI_SIZE plus STACK_WARN (STACK_WARN rounded down to a multiple of 256).
+ *
+ * Make sure the stack pointer contains a valid address. Valid
+ * addresses for kernel stacks are anywhere after the bss
+ * (after __bss_stop) and anywhere in init_thread_union (init_stack).
+ *
+ * Uses r0-r3 as scratch and changes the T bit.
+ */
+#define STACK_CHECK()					\
+	/* r0 = (THREAD_SIZE >> 10) << 10 */		\
+	mov	#(THREAD_SIZE >> 10), r0;		\
+	shll8	r0;					\
+	shll2	r0;					\
+							\
+	/* r1 = sp & (THREAD_SIZE - 1) */		\
+	mov	#-1, r1;				\
+	add	r0, r1;					\
+	and	r15, r1;				\
+							\
+	/* r2 = ((STACK_WARN >> 8) << 8) + TI_SIZE */	\
+	mov	#TI_SIZE, r3;				\
+	mov	#(STACK_WARN >> 8), r2;			\
+	shll8	r2;					\
+	add	r3, r2;					\
+							\
+	/* Is the stack overflowing? */			\
+	cmp/hi	r2, r1;					\
+	bf	stack_panic;				\
+							\
+	/* If sp > __bss_stop then we're OK. */		\
+	mov.l	.L_ebss, r1;				\
+	cmp/hi	r1, r15;				\
+	bt	1f;					\
+							\
+	/* If sp < init_stack, we're not OK. */		\
+	mov.l	.L_init_thread_union, r1;		\
+	cmp/hs	r1, r15;				\
+	bf	stack_panic;				\
+							\
+	/* If sp >= init_stack + THREAD_SIZE (and sp <= __bss_stop), not OK. */	\
+	add	r0, r1;					\
+	cmp/hs	r1, r15;				\
+	bt	stack_panic;				\
+1:
+#else
+#define STACK_CHECK()
+#endif /* CONFIG_STACK_DEBUG */
+
+/*
+ * mcount / _mcount - profiling hook, exported under both names.
+ *
+ * Runs STACK_CHECK() first. Without CONFIG_FUNCTION_TRACER it then simply
+ * returns. Otherwise MCOUNT_ENTER() saves r4-r7 and pr, loads r4 from the
+ * word that was on top of the stack at entry and r5 from pr, and a tracer
+ * is called through r6. skip_trace undoes MCOUNT_ENTER() and returns.
+ */
+	.align 2
+	.globl	_mcount
+	.type	_mcount,@function
+	.globl	mcount
+	.type	mcount,@function
+_mcount:
+mcount:
+	STACK_CHECK()
+
+#ifndef CONFIG_FUNCTION_TRACER
+	rts
+	 nop
+#else
+	MCOUNT_ENTER()
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	.globl	mcount_call
+mcount_call:
+	mov.l	.Lftrace_stub, r6
+#else
+	/*
+	 * NOTE(review): r6 holds the address of ftrace_trace_function here
+	 * (it is only dereferenced after the branch), while r7 is loaded
+	 * with the 32-bit word stored at ftrace_stub rather than with the
+	 * address of ftrace_stub. The compare therefore looks like it can
+	 * never match - confirm the intent before changing it, because
+	 * taking the branch would also bypass the graph tracer checks
+	 * further down.
+	 */
+	mov.l	.Lftrace_trace_function, r6
+	mov.l	ftrace_stub, r7
+	cmp/eq	r6, r7
+	bt	skip_trace
+	mov.l	@r6, r6
+#endif
+
+	jsr	@r6
+	 nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	/*
+	 * NOTE(review): both compares below use the values held in the
+	 * literal pool without dereferencing them. If ftrace_graph_return
+	 * and ftrace_graph_entry are function-pointer variables (not
+	 * visible from this file), this compares their addresses against
+	 * the stub addresses rather than their current values - confirm.
+	 */
+	mov.l   .Lftrace_graph_return, r6
+	mov.l   .Lftrace_stub, r7
+	cmp/eq  r6, r7
+	bt      1f
+
+	mov.l   .Lftrace_graph_caller, r0
+	jmp     @r0
+	 nop
+
+1:
+	mov.l	.Lftrace_graph_entry, r6
+	mov.l	.Lftrace_graph_entry_stub, r7
+	cmp/eq	r6, r7
+	bt	skip_trace
+
+	mov.l   .Lftrace_graph_caller, r0
+	jmp	@r0
+	 nop
+
+	/* Literal pool for the graph tracer checks above */
+	.align 2
+.Lftrace_graph_return:
+	.long   ftrace_graph_return
+.Lftrace_graph_entry:
+	.long   ftrace_graph_entry
+.Lftrace_graph_entry_stub:
+	.long   ftrace_graph_entry_stub
+.Lftrace_graph_caller:
+	.long   ftrace_graph_caller
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+	/* Common exit: restore what MCOUNT_ENTER() saved and return */
+	.globl skip_trace
+skip_trace:
+	MCOUNT_LEAVE()
+
+	.align 2
+.Lftrace_trace_function:
+	.long   ftrace_trace_function
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/*
+ * NOTE: Do not move either ftrace_graph_call or ftrace_caller
+ * as this will affect the calculation of GRAPH_INSN_OFFSET.
+ */
+	.globl ftrace_graph_call
+ftrace_graph_call:
+	mov.l	.Lskip_trace, r0
+	jmp	@r0
+	 nop
+
+	.align 2
+.Lskip_trace:
+	.long	skip_trace
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+	.globl ftrace_caller
+ftrace_caller:
+	MCOUNT_ENTER()
+
+	.globl ftrace_call
+ftrace_call:
+	mov.l	.Lftrace_stub, r6
+	jsr	@r6
+	 nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	bra	ftrace_graph_call
+	 nop
+#else
+	MCOUNT_LEAVE()
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+	.align 2
+
+/*
+ * NOTE: From here on the locations of the .Lftrace_stub label and
+ * ftrace_stub itself are fixed. Adding additional data here will skew
+ * the displacement for the memory table and break the block replacement.
+ * Place new labels either after the ftrace_stub body, or before
+ * ftrace_caller. You have been warned.
+ */
+.Lftrace_stub:
+	.long	ftrace_stub
+
+	.globl	ftrace_stub
+ftrace_stub:
+	rts
+	 nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	.globl	ftrace_graph_caller
+ftrace_graph_caller:
+	mov.l	2f, r1
+	jmp	@r1
+	 nop
+1:
+	/*
+	 * MCOUNT_ENTER() pushed 5 registers onto the stack, so
+	 * the stack address containing our return address is
+	 * r15 + 20.
+	 */
+	mov	#20, r0
+	add	r15, r0
+	mov	r0, r4
+
+	mov.l	.Lprepare_ftrace_return, r0
+	jsr	@r0
+	 nop
+
+	MCOUNT_LEAVE()
+
+	.align 2
+2:	.long	skip_trace
+.Lprepare_ftrace_return:
+	.long	prepare_ftrace_return
+
+	.globl	return_to_handler
+return_to_handler:
+	/*
+	 * Save the return values.
+	 */
+	mov.l	r0, @-r15
+	mov.l	r1, @-r15
+
+	mov	#0, r4
+
+	mov.l	.Lftrace_return_to_handler, r0
+	jsr	@r0
+	 nop
+
+	/*
+	 * The return value from ftrace_return_to_handler has the real
+	 * address that we should return to.
+	 */
+	lds	r0, pr
+	mov.l	@r15+, r1
+	rts
+	 mov.l	@r15+, r0
+
+
+	.align 2
+.Lftrace_return_to_handler:
+	.long	ftrace_return_to_handler
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_STACK_DEBUG
+	.globl	stack_panic
+stack_panic:
+	mov.l	.Ldump_stack, r0
+	jsr	@r0
+	 nop
+
+	mov.l	.Lpanic, r0
+	jsr	@r0
+	 mov.l	.Lpanic_s, r4
+
+	rts
+	 nop
+
+	.align 2
+.L_init_thread_union:
+	.long	init_thread_union
+.L_ebss:
+	.long	__bss_stop
+.Lpanic:
+	.long	panic
+.Lpanic_s:
+	.long	.Lpanic_str
+.Ldump_stack:
+	.long	dump_stack
+
+	.section	.rodata
+	.align 2
+.Lpanic_str:
+	.string "Stack error"
+#endif /* CONFIG_STACK_DEBUG */
--- a/arch/sh/lib/memchr.S
+++ b/arch/sh/lib/memchr.S
@ -0,0 +1,26 @@
+/* $Id: memchr.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
+ *
+ * "memchr" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+
+/*
+ * void *memchr(const void *s, int c, size_t n);
+ */
+
+#include <linux/linkage.h>
+/*
+ * In:  r4 = s, r5 = c, r6 = n
+ * Out: r0 = address of the first of the n bytes at s that equals the low
+ *      byte of c, or 0 when none matches (including n == 0)
+ * Changes r1, r4, r5, r6 and the T bit.
+ */
+ENTRY(memchr)
+	tst	r6,r6
+	bt/s	2f		! n == 0: nothing to search
+	 exts.b	r5,r5		! sign-extend c, as mov.b does for each byte
+1:	mov.b	@r4,r1
+	cmp/eq	r1,r5
+	bt/s	3f		! match: r4 still points at the byte
+	 dt	r6		! delay slot: count down, T = (count reached 0)
+	bf/s	1b		! bytes left: keep scanning
+	 add	#1,r4
+2:	mov	#0,r4		! not found
+3:	rts
+	 mov	r4,r0
--- a/arch/sh/lib/memcpy-sh4.S
+++ b/arch/sh/lib/memcpy-sh4.S
@ -0,0 +1,799 @@
+/*
+ * "memcpy" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ * Copyright (c) 2002  STMicroelectronics Ltd
+ *   Modified from memcpy.S and micro-optimised for SH4
+ *   Stuart Menefy (stuart.menefy@st.com)
+ *
+ */
+#include <linux/linkage.h>
+
+/*
+ * void *memcpy(void *dst, const void *src, size_t n);
+ *
+ * It is assumed that there is no overlap between src and dst.
+ * If there is an overlap, then the results are undefined.
+ */
+
+	!
+	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
+	!
+
+	! Size is 16 or greater, and may have trailing bytes
+
+	.balign	32
+.Lcase1:
+	! Read a long word and write a long word at once
+	! At the start of each iteration, r7 contains last long load
+	add	#-1,r5		!  79 EX
+	mov	r4,r2		!   5 MT (0 cycles latency)
+
+	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
+	add	#-4,r5		!  50 EX
+
+	add	#7,r2		!  79 EX
+	!
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+	! 6 cycles, 4 bytes per iteration
+3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
+	mov	r7, r3		!   5 MT (latency=0)	! RQPO
+
+	cmp/hi	r2,r0		!  57 MT
+	shll16	r3		! 103 EX
+
+	mov	r1,r6		!   5 MT (latency=0)
+	shll8	r3		! 102 EX		! Oxxx
+
+	shlr8	r6		! 106 EX		! xNML
+	mov	r1, r7		!   5 MT (latency=0)
+
+	or	r6,r3		!  82 EX		! ONML
+	bt/s	3b		! 109 BR
+
+	 mov.l	r3,@-r0		!  30 LS
+#else
+3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
+	mov	r7,r3		!   5 MT (latency=0)	! OPQR
+
+	cmp/hi	r2,r0		!  57 MT
+	shlr16	r3		! 107 EX
+
+	shlr8	r3		! 106 EX		! xxxO
+	mov	r1,r6		!   5 MT (latency=0)
+
+	shll8	r6		! 102 EX		! LMNx
+	mov	r1,r7		!   5 MT (latency=0)
+
+	or	r6,r3		!  82 EX		! LMNO
+	bt/s	3b		! 109 BR
+
+	 mov.l	r3,@-r0		!  30 LS
+#endif
+	! Finally, copy a byte at once, if necessary
+
+	add	#4,r5		!  50 EX
+	cmp/eq	r4,r0		!  54 MT
+
+	add	#-6,r2		!  50 EX
+	bt	9f		! 109 BR
+
+8:	cmp/hi	r2,r0		!  57 MT
+	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+
+	bt/s	8b		! 109 BR
+
+	 mov.b	r1,@-r0		!  29 LS
+
+9:	rts
+	 nop
+
+
+	!
+	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
+	!
+
+	! Size is 16 or greater, and may have trailing bytes
+
+	.balign	32
+.Lcase3:
+	! Read a long word and write a long word at once
+	! At the start of each iteration, r7 contains last long load
+	add	#-3,r5		! 79 EX
+	mov	r4,r2		!  5 MT (0 cycles latency)
+
+	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
+	add	#-4,r5		! 50 EX
+
+	add	#7,r2		!  79 EX
+	!
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+	! 6 cycles, 4 bytes per iteration
+3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
+	mov	r7, r3		!   5 MT (latency=0)	! RQPO
+
+	cmp/hi	r2,r0		!  57 MT
+	shll8	r3		! 102 EX		! QPOx
+
+	mov	r1,r6		!   5 MT (latency=0)
+	shlr16	r6		! 107 EX
+
+	shlr8	r6		! 106 EX		! xxxN
+	mov	r1, r7		!   5 MT (latency=0)
+
+	or	r6,r3		!  82 EX		! QPON
+	bt/s	3b		! 109 BR
+
+	 mov.l	r3,@-r0		!  30 LS
+#else
+3:	mov	r7,r3		! OPQR
+	shlr8	r3		! xOPQ
+	mov.l	@(r0,r5),r7	! KLMN
+	mov	r7,r6
+	shll16	r6
+	shll8	r6		! Nxxx
+	or	r6,r3		! NOPQ
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r3,@-r0
+#endif
+
+	! Finally, copy a byte at once, if necessary
+
+	add	#6,r5		!  50 EX
+	cmp/eq	r4,r0		!  54 MT
+
+	add	#-6,r2		!  50 EX
+	bt	9f		! 109 BR
+
+8:	cmp/hi	r2,r0		!  57 MT
+	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+
+	bt/s	8b		! 109 BR
+
+	 mov.b	r1,@-r0		!  29 LS
+
+9:	rts
+	 nop
+
+ENTRY(memcpy)
+
+	! Calculate the invariants which will be used in the remainder
+	! of the code:
+	!
+	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
+	!	         [ ...  ]                 [ ...  ]
+	!	           :                        :
+	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
+	!
+	!
+
+	! Short circuit the common case of src, dst and len being 32 bit aligned
+	! and test for zero length move
+
+	mov	r6, r0		!   5 MT (0 cycle latency)
+	or	r4, r0		!  82 EX
+
+	or	r5, r0		!  82 EX
+	tst	r6, r6		!  86 MT
+
+	bt/s	99f		! 111 BR		(zero len)
+	 tst	#3, r0		!  87 MT
+
+	mov	r4, r0		!   5 MT (0 cycle latency)
+	add	r6, r0		!  49 EX
+
+	mov	#16, r1		!   6 EX
+	bt/s	.Lcase00	! 111 BR		(aligned)
+
+	 sub	r4, r5		!  75 EX
+
+	! Arguments are not nicely long word aligned or zero len.
+	! Check for small copies, and if so do a simple byte at a time copy.
+	!
+	! Deciding on an exact value of 'small' is not easy, as the point at which
+	! using the optimised routines become worthwhile varies (these are the
+	! cycle counts for different sizes using byte-at-a-time vs. optimised):
+	!	size	byte-at-time	long	word	byte
+	!	16	42		39-40	46-50	50-55
+	!	24	58		43-44	54-58	62-67
+	!	36	82		49-50	66-70	80-85
+	! However the penalty for getting it 'wrong' is much higher for long word
+	! aligned data (and this is more common), so use a value of 16.
+
+	cmp/gt	r6,r1		!  56 MT
+
+	add	#-1,r5		!  50 EX
+	bf/s	6f		! 108 BR		(not small)
+
+	 mov	r5, r3		!   5 MT (latency=0)
+	shlr	r6		! 104 EX
+
+	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+	bf/s	4f		! 111 BR
+
+	 add	#-1,r3		!  50 EX
+	tst	r6, r6		!  86 MT
+
+	bt/s	98f		! 110 BR
+	 mov.b	r1,@-r0		!  29 LS
+
+	! 4 cycles, 2 bytes per iteration
+3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+
+4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
+	dt	r6		!  67 EX
+
+	mov.b	r1,@-r0		!  29 LS
+	bf/s	3b		! 111 BR
+
+	 mov.b	r2,@-r0		!  29 LS
+98:
+	rts
+	 nop
+
+99:	rts
+	 mov	r4, r0
+
+	! Size is not small, so it is worthwhile looking for optimisations.
+	! First align destination to a long word boundary.
+	!
+	! r5 = normal value -1
+
+6:	tst	#3, r0		!  87 MT
+        mov	#3, r3		!   6 EX
+
+	bt/s	2f		! 111 BR
+	 and	r0,r3		!  78 EX
+
+	! 3 cycles, 1 byte per iteration
+1:	dt	r3		!  67 EX
+	mov.b	@(r0,r5),r1	!  19 LS (latency=2)
+
+	add	#-1, r6		!  79 EX
+	bf/s	1b		! 109 BR
+
+	 mov.b	r1,@-r0		!  28 LS
+
+2:	add	#1, r5		!  79 EX
+
+	! Now select the appropriate bulk transfer code based on relative
+	! alignment of src and dst.
+
+	mov	r0, r3		!   5 MT (latency=0)
+
+	mov	r5, r0		!   5 MT (latency=0)
+	tst	#1, r0		!  87 MT
+
+	bf/s	1f		! 111 BR
+	 mov	#64, r7		!   6 EX
+
+	! bit 0 clear
+
+	cmp/ge	r7, r6		!  55 MT
+
+	bt/s	2f		! 111 BR
+	 tst	#2, r0		!  87 MT
+
+	! small
+	bt/s	.Lcase0
+	 mov	r3, r0
+
+	bra	.Lcase2
+	 nop
+
+	! big
+2:	bt/s	.Lcase0b
+	 mov	r3, r0
+
+	bra	.Lcase2b
+	 nop
+
+	! bit 0 set
+1:	tst	#2, r0		! 87 MT
+
+	bt/s	.Lcase1
+	 mov	r3, r0
+
+	bra	.Lcase3
+	 nop
+
+
+	!
+	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
+	!
+
+	! src, dst and size are all long word aligned
+	! size is non-zero
+
+	.balign	32
+.Lcase00:
+	mov	#64, r1		!   6 EX
+	mov	r5, r3		!   5 MT (latency=0)
+
+	cmp/gt	r6, r1		!  56 MT
+	add	#-4, r5		!  50 EX
+
+	bf	.Lcase00b	! 108 BR		(big loop)
+	shlr2	r6		! 105 EX
+
+	shlr	r6		! 104 EX
+	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+	bf/s	4f		! 111 BR
+	 add	#-8, r3		!  50 EX
+
+	tst	r6, r6		!  86 MT
+	bt/s	5f		! 110 BR
+
+	 mov.l	r1,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
+	dt	r6		!  67 EX
+
+	mov.l	r1, @-r0	!  30 LS
+	bf/s	3b		! 109 BR
+
+	 mov.l	r2, @-r0	!  30 LS
+
+5:	rts
+	 nop
+
+
+	! Size is 16 or greater and less than 64, but may have trailing bytes
+
+	.balign	32
+.Lcase0:
+	add	#-4, r5		!  50 EX
+	mov	r4, r7		!   5 MT (latency=0)
+
+	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+	mov	#4, r2		!   6 EX
+
+	add	#11, r7		!  50 EX
+	tst	r2, r6		!  86 MT
+
+	mov	r5, r3		!   5 MT (latency=0)
+	bt/s	4f		! 111 BR
+
+	 add	#-4, r3		!  50 EX
+	mov.l	r1,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
+	cmp/hi	r7, r0
+
+	mov.l	r1, @-r0	!  30 LS
+	bt/s	3b		! 109 BR
+
+	 mov.l	r2, @-r0	!  30 LS
+
+	! Copy the final 0-3 bytes
+
+	add	#3,r5		!  50 EX
+
+	cmp/eq	r0, r4		!  54 MT
+	add	#-10, r7	!  50 EX
+
+	bt	9f		! 110 BR
+
+	! 3 cycles, 1 byte per iteration
+1:	mov.b	@(r0,r5),r1	!  19 LS
+	cmp/hi	r7,r0		!  57 MT
+
+	bt/s	1b		! 111 BR
+	 mov.b	r1,@-r0		!  28 LS
+
+9:	rts
+	 nop
+
+	! Size is at least 64 bytes, so will be going round the big loop at least once.
+	!
+	!   r2 = rounded up r4
+	!   r3 = rounded down r0
+
+	.balign	32
+.Lcase0b:
+	add	#-4, r5		!  50 EX
+
+.Lcase00b:
+	mov	r0, r3		!   5 MT (latency=0)
+	mov	#(~0x1f), r1	!   6 EX
+
+	and	r1, r3		!  78 EX
+	mov	r4, r2		!   5 MT (latency=0)
+
+	cmp/eq	r3, r0		!  54 MT
+	add	#0x1f, r2	!  50 EX
+
+	bt/s	1f		! 110 BR
+	 and	r1, r2		!  78 EX
+
+	! copy initial words until cache line aligned
+
+	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+	tst	#4, r0		!  87 MT
+
+	mov	r5, r6		!   5 MT (latency=0)
+	add	#-4, r6		!  50 EX
+
+	bt/s	4f		! 111 BR
+	 add	#8, r3		!  50 EX
+
+	tst	#0x18, r0	!  87 MT
+
+	bt/s	1f		! 109 BR
+	 mov.l	r1,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
+	cmp/eq	r3, r0		!  54 MT
+
+	mov.l	r1, @-r0	!  30 LS
+	bf/s	3b		! 109 BR
+
+	 mov.l	r7, @-r0	!  30 LS
+
+	! Copy the cache line aligned blocks
+	!
+	! In use: r0, r2, r4, r5
+	! Scratch: r1, r3, r6, r7
+	!
+	! We could do this with the four scratch registers, but if src
+	! and dest hit the same cache line, this will thrash, so make
+	! use of additional registers.
+	!
+	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+	!   r5:	 src (was r0+r5)
+	!   r1:	 dest (was r0)
+	! this can be reversed at the end, so we don't need to save any extra
+	! state.
+	!
+1:	mov.l	r8, @-r15	!  30 LS
+	add	r0, r5		!  49 EX
+
+	mov.l	r9, @-r15	!  30 LS
+	mov	r0, r1		!   5 MT (latency=0)
+
+	mov.l	r10, @-r15	!  30 LS
+	add	#-0x1c, r5	!  50 EX
+
+	mov.l	r11, @-r15	!  30 LS
+
+	! 16 cycles, 32 bytes per iteration
+2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
+	add	#-0x20, r1	! 50 EX
+	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
+	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
+	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
+	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
+	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
+	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
+	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
+	movca.l	r0,@r1		! 40 LS (latency=3-7)
+	mov.l	r3,@(0x04,r1)	! 33 LS
+	mov.l	r6,@(0x08,r1)	! 33 LS
+	mov.l	r7,@(0x0c,r1)	! 33 LS
+
+	mov.l	r8,@(0x10,r1)	! 33 LS
+	add	#-0x20, r5	! 50 EX
+
+	mov.l	r9,@(0x14,r1)	! 33 LS
+	cmp/eq	r2,r1		! 54 MT
+
+	mov.l	r10,@(0x18,r1)	!  33 LS
+	bf/s	2b		! 109 BR
+
+	 mov.l	r11,@(0x1c,r1)	!  33 LS
+
+	mov	r1, r0		!   5 MT (latency=0)
+
+	mov.l	@r15+, r11	!  15 LS
+	sub	r1, r5		!  75 EX
+
+	mov.l	@r15+, r10	!  15 LS
+	cmp/eq	r4, r0		!  54 MT
+
+	bf/s	1f		! 109 BR
+	 mov.l	 @r15+, r9	!  15 LS
+
+	rts
+1:	 mov.l	@r15+, r8	!  15 LS
+	sub	r4, r1		!  75 EX		(len remaining)
+
+	! number of trailing bytes is non-zero
+	!
+	! invariants restored (r5 already decremented by 4)
+	! also r1=num bytes remaining
+
+	mov	#4, r2		!   6 EX
+	mov	r4, r7		!   5 MT (latency=0)
+
+	add	#0x1c, r5	!  50 EX		(back to -4)
+	cmp/hs	r2, r1		!  58 MT
+
+	bf/s	5f		! 108 BR
+	 add	 #11, r7	!  50 EX
+
+	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
+	tst	r2, r1		!  86 MT
+
+	mov	r5, r3		!   5 MT (latency=0)
+	bt/s	4f		! 111 BR
+
+	 add	#-4, r3		!  50 EX
+	cmp/hs	r2, r1		!  58 MT
+
+	bt/s	5f		! 111 BR
+	 mov.l	r6,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
+	cmp/hi	r7, r0
+
+	mov.l	r6, @-r0	!  30 LS
+	bt/s	3b		! 109 BR
+
+	 mov.l	r2, @-r0	!  30 LS
+
+	! Copy the final 0-3 bytes
+
+5:	cmp/eq	r0, r4		!  54 MT
+	add	#-10, r7	!  50 EX
+
+	bt	9f		! 110 BR
+	add	#3,r5		!  50 EX
+
+	! 3 cycles, 1 byte per iteration
+1:	mov.b	@(r0,r5),r1	!  19 LS
+	cmp/hi	r7,r0		!  57 MT
+
+	bt/s	1b		! 111 BR
+	 mov.b	r1,@-r0		!  28 LS
+
+9:	rts
+	 nop
+
+	!
+	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
+	!
+
+	.balign	32
+.Lcase2:
+	! Size is 16 or greater and less than 64, but may have trailing bytes
+
+2:	mov	r5, r6		!   5 MT (latency=0)
+	add	#-2,r5		!  50 EX
+
+	mov	r4,r2		!   5 MT (latency=0)
+	add	#-4,r6		!  50 EX
+
+	add	#7,r2		!  50 EX
+3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
+
+	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
+	cmp/hi	r2,r0		!  57 MT
+
+	mov.w	r1,@-r0		!  29 LS
+	bt/s	3b		! 111 BR
+
+	 mov.w	r3,@-r0		!  29 LS
+
+	bra	10f
+	 nop
+
+
+	.balign	32
+.Lcase2b:
+	! Size is at least 64 bytes, so will be going round the big loop at least once.
+	!
+	!   r2 = rounded up r4
+	!   r3 = rounded down r0
+
+	mov	r0, r3		!   5 MT (latency=0)
+	mov	#(~0x1f), r1	!   6 EX
+
+	and	r1, r3		!  78 EX
+	mov	r4, r2		!   5 MT (latency=0)
+
+	cmp/eq	r3, r0		!  54 MT
+	add	#0x1f, r2	!  50 EX
+
+	add	#-2, r5		!  50 EX
+	bt/s	1f		! 110 BR
+	 and	r1, r2		!  78 EX
+
+	! Copy a short word one at a time until we are cache line aligned
+	!   Normal values: r0, r2, r3, r4
+	!   Unused: r1, r6, r7
+	!   Mod: r5 (=r5-2)
+	!
+	add	#2, r3		!  50 EX
+
+2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
+	cmp/eq	r3,r0		!  54 MT
+
+	bf/s	2b		! 111 BR
+
+	 mov.w	r1,@-r0		!  29 LS
+
+	! Copy the cache line aligned blocks
+	!
+	! In use: r0, r2, r4, r5 (=r5-2)
+	! Scratch: r1, r3, r6, r7
+	!
+	! We could do this with the four scratch registers, but if src
+	! and dest hit the same cache line, this will thrash, so make
+	! use of additional registers.
+	!
+	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+	!   r5:	 src (was r0+r5)
+	!   r1:	 dest (was r0)
+	! this can be reversed at the end, so we don't need to save any extra
+	! state.
+	!
+1:	mov.l	r8, @-r15	!  30 LS
+	add	r0, r5		!  49 EX
+
+	mov.l	r9, @-r15	!  30 LS
+	mov	r0, r1		!   5 MT (latency=0)
+
+	mov.l	r10, @-r15	!  30 LS
+	add	#-0x1e, r5	!  50 EX
+
+	mov.l	r11, @-r15	!  30 LS
+
+	mov.l	r12, @-r15	!  30 LS
+
+	! 17 cycles, 32 bytes per iteration
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
+	add	#-0x20, r1	!  50 EX
+
+	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK
+
+	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
+	shll16	r0		! 103 EX			JI..
+
+	mov.l	@r5+, r7	!  15 LS (latency=2)
+	xtrct	r3, r0		!  48 EX			LKJI
+
+	mov.l	@r5+, r8	!  15 LS (latency=2)
+	xtrct	r6, r3		!  48 EX			PONM
+
+	mov.l	@r5+, r9	!  15 LS (latency=2)
+	xtrct	r7, r6		!  48 EX
+
+	mov.l	@r5+, r10	!  15 LS (latency=2)
+	xtrct	r8, r7		!  48 EX
+
+	mov.l	@r5+, r11	!  15 LS (latency=2)
+	xtrct	r9, r8		!  48 EX
+
+	mov.w	@r5+, r12	!  15 LS (latency=2)
+	xtrct	r10, r9		!  48 EX
+
+	movca.l	r0,@r1		!  40 LS (latency=3-7)
+	xtrct	r11, r10	!  48 EX
+
+	mov.l	r3, @(0x04,r1)	!  33 LS
+	xtrct	r12, r11	!  48 EX
+
+	mov.l	r6, @(0x08,r1)	!  33 LS
+
+	mov.l	r7, @(0x0c,r1)	!  33 LS
+
+	mov.l	r8, @(0x10,r1)	!  33 LS
+	add	#-0x40, r5	!  50 EX
+
+	mov.l	r9, @(0x14,r1)	!  33 LS
+	cmp/eq	r2,r1		!  54 MT
+
+	mov.l	r10, @(0x18,r1)	!  33 LS
+	bf/s	2b		! 109 BR
+
+	 mov.l	r11, @(0x1c,r1)	!  33 LS
+#else
+2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
+	add	#-2, r5		!  50 EX
+
+	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
+	add	#-4, r1		!  50 EX
+
+	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
+	shll16	r0		! 103 EX
+
+	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
+	xtrct	r3, r0		!  48 EX
+
+	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
+	xtrct	r6, r3		!  48 EX
+
+	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
+	xtrct	r7, r6		!  48 EX
+
+	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
+	xtrct	r8, r7		!  48 EX
+
+	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
+	xtrct	r9, r8		!  48 EX
+
+	mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
+    	xtrct	r10, r9		!  48 EX
+
+	movca.l	r0,@r1		!  40 LS (latency=3-7)
+	add	#-0x1c, r1	!  50 EX
+
+	mov.l	r3, @(0x18,r1)	!  33 LS
+	xtrct	r11, r10	!  48 EX
+
+	mov.l	r6, @(0x14,r1)	!  33 LS
+	xtrct	r12, r11	!  48 EX
+
+	mov.l	r7, @(0x10,r1)	!  33 LS
+
+	mov.l	r8, @(0x0c,r1)	!  33 LS
+	add	#-0x1e, r5	!  50 EX
+
+	mov.l	r9, @(0x08,r1)	!  33 LS
+	cmp/eq	r2,r1		!  54 MT
+
+	mov.l	r10, @(0x04,r1)	!  33 LS
+	bf/s	2b		! 109 BR
+
+	 mov.l	r11, @(0x00,r1)	!  33 LS
+#endif
+
+	mov.l	@r15+, r12
+	mov	r1, r0		!   5 MT (latency=0)
+
+	mov.l	@r15+, r11	!  15 LS
+	sub	r1, r5		!  75 EX
+
+	mov.l	@r15+, r10	!  15 LS
+	cmp/eq	r4, r0		!  54 MT
+
+	bf/s	1f		! 109 BR
+	 mov.l	 @r15+, r9	!  15 LS
+
+	rts
+1:	 mov.l	@r15+, r8	!  15 LS
+
+	add	#0x1e, r5	!  50 EX
+
+	! Finish off a short word at a time
+	! r5 must be invariant - 2
+10:	mov	r4,r2		!   5 MT (latency=0)
+	add	#1,r2		!  50 EX
+
+	cmp/hi	r2, r0		!  57 MT
+	bf/s	1f		! 109 BR
+
+	 add	#2, r2		!  50 EX
+
+3:	mov.w	@(r0,r5),r1	!  20 LS
+	cmp/hi	r2,r0		!  57 MT
+
+	bt/s	3b		! 109 BR
+
+	 mov.w	r1,@-r0		!  29 LS
+1:
+
+	!
+	! Finally, copy the last byte if necessary
+	cmp/eq	r4,r0		!  54 MT
+	bt/s	9b
+	 add	#1,r5
+	mov.b	@(r0,r5),r1
+	rts
+	 mov.b	r1,@-r0
+
--- a/arch/sh/lib/memcpy.S
+++ b/arch/sh/lib/memcpy.S
@ -0,0 +1,227 @@
+/* $Id: memcpy.S,v 1.3 2001/07/27 11:50:52 gniibe Exp $
+ *
+ * "memcpy" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+
+/*
+ * void *memcpy(void *dst, const void *src, size_t n);
+ * The memory areas of DST and SRC are assumed not to overlap.
+ */
+
+#include <linux/linkage.h>
+ENTRY(memcpy)			/* in: r4 = dst, r5 = src, r6 = n; out: r0 = dst. Copies from the end of the buffers downwards. */
+	tst	r6,r6
+	bt/s	9f		! if n=0, do nothing
+	 mov	r4,r0
+	sub	r4,r5		! From here, r5 has the distance to r0
+	add	r6,r0		! From here, r0 points the end of copying point
+	mov	#12,r1
+	cmp/gt	r6,r1		/* T = (12 > n), signed compare */
+	bt/s	7f		! if it's too small, copy a byte at once
+	 add	#-1,r5		/* delay slot (always executed): the byte loop loads @(r0,r5) before pre-decrementing r0, so it needs r5 = (src - dst) - 1 */
+	add	#1,r5		/* branch not taken: undo that bias */
+	!			From here, r6 is free
+	!
+	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
+	!	         [ ...  ]                 [ ...  ]
+	!	           :                        :
+	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
+	!
+	!
+	mov	r5,r1
+	mov	#3,r2		/* r2 = 3 stays live: case0, case1 and case3 reuse it as the alignment mask */
+	and	r2,r1		/* r1 = (src - dst) & 3 */
+	shll2	r1		/* scale to a byte offset into jmptable */
+	mov	r0,r3		! Save the value on R0 to R3
+	mova	jmptable,r0	/* mova writes r0, hence the save/restore of r0 through r3 */
+	add	r1,r0
+	mov.l	@r0,r1		/* r1 = address of the selected caseN handler */
+	jmp	@r1
+	 mov	r3,r0		! and back to R0
+	.balign	4
+jmptable:
+	.long	case0
+	.long	case1
+	.long	case2
+	.long	case3
+
+	! copy a byte at once
+7:	mov	r4,r2
+	add	#1,r2		/* r2 = dst + 1, the loop bound */
+8:
+	cmp/hi	r2,r0		/* T = (r0 > r2), unsigned; evaluated before the store in the delay slot */
+	mov.b	@(r0,r5),r1
+	bt/s	8b			! while (r0>r2)
+	 mov.b	r1,@-r0
+9:
+	rts
+	 nop
+
+case0:				/* reached through memcpy's jmptable when (src - dst) & 3 == 0 */
+	!
+	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3		/* r3 = r0 & 3 = bytes to peel until r0 is long-word aligned (r2 still holds 3 from the dispatch code) */
+	tst	r3,r3
+	bt/s	2f
+	 add	#-4,r5		/* delay slot (always executed): r5 = (src - dst) - 4 for the long-word loop */
+	add	#3,r5		/* r5 = (src - dst) - 1 for the byte-peeling loop */
+1:	dt	r3
+	mov.b	@(r0,r5),r1
+	bf/s	1b
+	 mov.b	r1,@-r0
+	!
+	add	#-3,r5
+2:	! Second, copy a long word at once
+	mov	r4,r2
+	add	#7,r2		/* r2 = dst + 7: keep looping while at least 4 bytes remain after the store */
+3:	mov.l	@(r0,r5),r1
+	cmp/hi	r2,r0		/* T = (r0 > r2), unsigned; evaluated before the store in the delay slot */
+	bt/s	3b
+	 mov.l	r1,@-r0
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r4,r0
+	bt/s	9b
+	 add	#3,r5		/* delay slot: r5 = (src - dst) - 1, as the byte loop at 8: expects */
+	bra	8b
+	 add	#-6,r2		/* delay slot: r2 = dst + 1, the byte loop's bound */
+
+case1:				/* reached through memcpy's jmptable when (src - dst) & 3 == 1 */
+	!
+	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3		/* r3 = r0 & 3 = bytes to peel until r0 is long-word aligned (r2 still holds 3 from the dispatch code) */
+	tst	r3,r3
+	bt/s	2f
+	 add	#-1,r5		/* delay slot (always executed): r5 = (src - dst) - 1, a multiple of 4 in this case */
+1:	dt	r3
+	mov.b	@(r0,r5),r1
+	bf/s	1b
+	 mov.b	r1,@-r0
+	!
+2:	! Second, read a long word and write a long word at once
+	mov.l	@(r0,r5),r1	/* prime r1 with the aligned source word whose lowest-addressed byte belongs at dst byte r0 - 1 */
+	add	#-4,r5		/* r5 = (src - dst) - 5: each loop load fetches the next lower source word */
+	mov	r4,r2
+	add	#7,r2		/* r2 = dst + 7: keep looping while at least 4 bytes remain after the store */
+	!
+#ifdef __LITTLE_ENDIAN__
+3:	mov	r1,r3		! RQPO
+	shll16	r3
+	shll8	r3		! Oxxx
+	mov.l	@(r0,r5),r1	! NMLK
+	mov	r1,r6
+	shlr8	r6		! xNML
+	or	r6,r3		! ONML
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r3,@-r0
+#else
+3:	mov	r1,r3		! OPQR
+	shlr16	r3
+	shlr8	r3		! xxxO
+	mov.l	@(r0,r5),r1	! KLMN
+	mov	r1,r6
+	shll8	r6		! LMNx
+	or	r6,r3		! LMNO
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r3,@-r0
+#endif
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r4,r0
+	bt/s	9b
+	 add	#4,r5		/* delay slot: r5 = (src - dst) - 1, as the byte loop at 8: expects */
+	bra	8b
+	 add	#-6,r2		/* delay slot: r2 = dst + 1, the byte loop's bound */
+
+case2:				/* reached through memcpy's jmptable when (src - dst) & 3 == 2 */
+	!
+	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
+	!
+	! First, align to word boundary
+	tst	#1,r0		/* T = (r0 is even) */
+	bt/s	2f
+	 add	#-1,r5		/* delay slot (always executed): r5 = (src - dst) - 1 for the single-byte copy */
+	mov.b	@(r0,r5),r1
+	mov.b	r1,@-r0
+	!
+2:	! Second, read a word and write a word at once
+	add	#-1,r5		/* r5 = (src - dst) - 2 for the 16-bit loop */
+	mov	r4,r2
+	add	#3,r2		/* r2 = dst + 3: keep looping while at least 2 bytes remain after the store */
+	!
+3:	mov.w	@(r0,r5),r1
+	cmp/hi	r2,r0		/* T = (r0 > r2), unsigned; evaluated before the store in the delay slot */
+	bt/s	3b
+	 mov.w	r1,@-r0
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r4,r0
+	bt/s	9b
+	 add	#1,r5		/* delay slot: r5 = (src - dst) - 1 for the final byte */
+	mov.b	@(r0,r5),r1	/* r0 != dst here: copy the remaining byte and return */
+	rts
+	 mov.b	r1,@-r0
+
+case3:				/* reached through memcpy's jmptable when (src - dst) & 3 == 3 */
+	!
+	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3		/* r3 = r0 & 3 = bytes to peel until r0 is long-word aligned (r2 still holds 3 from the dispatch code) */
+	tst	r3,r3
+	bt/s	2f
+	 add	#-1,r5		/* delay slot (always executed): r5 = (src - dst) - 1 for the byte-peeling loop */
+1:	dt	r3
+	mov.b	@(r0,r5),r1
+	bf/s	1b
+	 mov.b	r1,@-r0
+	!
+2:	! Second, read a long word and write a long word at once
+	add	#-2,r5		/* r5 = (src - dst) - 3, a multiple of 4 in this case */
+	mov.l	@(r0,r5),r1	/* prime r1 with the aligned source word that holds the top three bytes of the next word to store */
+	add	#-4,r5		/* r5 = (src - dst) - 7: each loop load fetches the next lower source word */
+	mov	r4,r2
+	add	#7,r2		/* r2 = dst + 7: keep looping while at least 4 bytes remain after the store */
+	!
+#ifdef __LITTLE_ENDIAN__
+3:	mov	r1,r3		! RQPO
+	shll8	r3		! QPOx
+	mov.l	@(r0,r5),r1	! NMLK
+	mov	r1,r6
+	shlr16	r6
+	shlr8	r6		! xxxN
+	or	r6,r3		! QPON
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r3,@-r0
+#else
+3:	mov	r1,r3		! OPQR
+	shlr8	r3		! xOPQ
+	mov.l	@(r0,r5),r1	! KLMN
+	mov	r1,r6
+	shll16	r6
+	shll8	r6		! Nxxx
+	or	r6,r3		! NOPQ
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r3,@-r0
+#endif
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r4,r0
+	bt/s	9b
+	 add	#6,r5		/* delay slot: r5 = (src - dst) - 1, as the byte loop at 8: expects */
+	bra	8b
+	 add	#-6,r2		/* delay slot: r2 = dst + 1, the byte loop's bound */
--- a/arch/sh/lib/memmove.S
+++ b/arch/sh/lib/memmove.S
@ -0,0 +1,254 @@
+/* $Id: memmove.S,v 1.2 2001/07/27 11:51:09 gniibe Exp $
+ *
+ * "memmove" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+
+/*
+ * void *memmove(void *dst, const void *src, size_t n);
+ * The memory areas may overlap.
+ */
+
+#include <linux/linkage.h>
+ENTRY(memmove)			/* in: r4 = dst, r5 = src, r6 = n; out: r0 = dst. Copies upwards itself when dst <= src. */
+	! if dest > src, call memcpy (it copies in decreasing order)
+	cmp/hi	r5,r4		/* T = (dst > src), unsigned */
+	bf	1f
+	mov.l	2f,r0		/* r0 = address of memcpy, from the literal at 2: */
+	jmp	@r0		/* tail jump: pr is untouched, so memcpy returns to our caller */
+	 nop
+	.balign 4
+2:	.long	memcpy
+1:
+	sub	r5,r4		! From here, r4 has the distance to r0
+	tst	r6,r6
+	bt/s	9f		! if n=0, do nothing
+	 mov	r5,r0		/* delay slot (always executed): r0 = src, the ascending read pointer */
+	add	r6,r5		/* r5 = src + n, one past the last source byte */
+	mov	#12,r1
+	cmp/gt	r6,r1		/* T = (12 > n), signed compare */
+	bt/s	8f		! if it's too small, copy a byte at once
+	 add	#-1,r4		/* delay slot (always executed): the byte loop stores after post-incrementing r0, so it needs r4 = (dst - src) - 1 */
+	add	#1,r4		/* branch not taken: undo that bias */
+	!
+	!                [ ...  ] DST             [ ...  ] SRC
+	!	         [ ...  ]                 [ ...  ]
+	!	           :                        :
+	!      r0+r4-->  [ ...  ]       r0    --> [ ...  ]
+	!	           :                        :
+	!	         [ ...  ]                 [ ...  ]
+	!			        r5    -->
+	!
+	mov	r4,r1
+	mov	#3,r2		/* r2 = 3 stays live: case0, case1 and case3 reuse it as the alignment mask */
+	and	r2,r1		/* r1 = (dst - src) & 3 */
+	shll2	r1		/* scale to a byte offset into jmptable */
+	mov	r0,r3		! Save the value on R0 to R3
+	mova	jmptable,r0	/* mova writes r0, hence the save/restore of r0 through r3 */
+	add	r1,r0
+	mov.l	@r0,r1		/* r1 = address of the selected caseN handler */
+	jmp	@r1
+	 mov	r3,r0		! and back to R0
+	.balign	4
+jmptable:
+	.long	case0
+	.long	case1
+	.long	case2
+	.long	case3
+
+	! copy a byte at once
+8:	mov.b	@r0+,r1
+	cmp/hs	r5,r0		/* T = (r0 >= r5), unsigned; evaluated before the store in the delay slot */
+	bf/s	8b			! while (r0<r5)
+	 mov.b	r1,@(r0,r4)
+	add	#1,r4		/* r4 = dst - src again */
+9:
+	add	r4,r0		/* r0 = current source position + (dst - src) */
+	rts
+	 sub	r6,r0		/* delay slot: r0 -= n, which yields the return value dst */
+
+case_none:			/* no reference to this label appears in the code shown for this file */
+	bra	8b
+	 add	#-1,r4
+
+case0:				/* reached through memmove's jmptable when (dst - src) & 3 == 0 */
+	!
+	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3		/* r3 = r0 & 3 (r2 still holds 3 from the dispatch code) */
+	tst	r3,r3
+	bt/s	2f
+	 add	#-1,r4		/* delay slot (always executed): r4 = (dst - src) - 1 for the post-increment byte copy */
+	mov	#4,r2
+	sub	r3,r2		/* r2 = 4 - (r0 & 3) = bytes to copy until r0 is long-word aligned */
+1:	dt	r2
+	mov.b	@r0+,r1
+	bf/s	1b
+	 mov.b	r1,@(r0,r4)
+	!
+2:	! Second, copy a long word at once
+	add	#-3,r4		/* r4 = (dst - src) - 4: the store follows a post-increment of r0 by 4 */
+	add	#-3,r5		/* r5 = (src + n) - 3: keep looping while at least 4 source bytes remain */
+3:	mov.l	@r0+,r1
+	cmp/hs	r5,r0		/* T = (r0 >= r5), unsigned; evaluated before the store in the delay slot */
+	bf/s	3b
+	 mov.l	r1,@(r0,r4)
+	add	#3,r5		/* r5 = src + n again */
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r5,r0
+	bt/s	9b
+	 add	#4,r4		/* delay slot: r4 = dst - src, as the return code at 9: expects */
+	bra	8b
+	 add	#-1,r4		/* delay slot: r4 = (dst - src) - 1, as the byte loop at 8: expects */
+
+case3:				/* reached through memmove's jmptable when (dst - src) & 3 == 3 */
+	!
+	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3		/* r3 = r0 & 3 (r2 still holds 3 from the dispatch code) */
+	tst	r3,r3
+	bt/s	2f
+	 add	#-1,r4		/* delay slot (always executed): r4 = (dst - src) - 1 for the post-increment byte copy */
+	mov	#4,r2
+	sub	r3,r2		/* r2 = 4 - (r0 & 3) = bytes to copy until r0 is long-word aligned */
+1:	dt	r2
+	mov.b	@r0+,r1
+	bf/s	1b
+	 mov.b	r1,@(r0,r4)
+	!
+2:	! Second, read a long word and write a long word at once
+	add	#-2,r4		/* r4 = (dst - src) - 3, a multiple of 4 in this case */
+	mov.l	@(r0,r4),r1	/* r1 = aligned DESTINATION word at r0 + r4; its three lowest-addressed bytes are merged into the first store. NOTE(review): some of them can lie below dst and are written back unchanged - confirm that is acceptable */
+	add	#-7,r5		/* r5 = (src + n) - 7 while the long-word loop runs */
+	add	#-4,r4		/* r4 = (dst - src) - 7: the store follows a post-increment of r0 by 4 */
+	!
+#ifdef __LITTLE_ENDIAN__
+	shll8	r1
+3:	mov	r1,r3		! JIHG
+	shlr8	r3		! xJIH
+	mov.l	@r0+,r1		! NMLK
+	mov	r1,r2
+	shll16	r2
+	shll8	r2		! Kxxx
+	or	r2,r3		! KJIH
+	cmp/hs	r5,r0
+	bf/s	3b
+	 mov.l	r3,@(r0,r4)
+#else
+	shlr8	r1
+3:	mov	r1,r3		! GHIJ
+	shll8	r3		! HIJx
+	mov.l	@r0+,r1		! KLMN
+	mov	r1,r2
+	shlr16	r2
+	shlr8	r2		! xxxK
+	or	r2,r3		! HIJK
+	cmp/hs	r5,r0
+	bf/s	3b
+	 mov.l	r3,@(r0,r4)
+#endif
+	add	#7,r5		/* r5 = src + n again */
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r5,r0
+	bt/s	9b
+	 add	#7,r4		/* delay slot (always executed): r4 = dst - src */
+	add	#-3,r0		/* only the first byte of the last source word loaded has been stored: step back over the other three */
+	bra	8b
+	 add	#-1,r4		/* delay slot: r4 = (dst - src) - 1, as the byte loop at 8: expects */
+
+case2:				/* reached through memmove's jmptable when (dst - src) & 3 == 2 */
+	!
+	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
+	!
+	! First, align to word boundary
+	tst	#1,r0		/* T = (r0 is even) */
+	bt/s	2f
+	 add	#-1,r4		/* delay slot (always executed): r4 = (dst - src) - 1 for the single-byte copy */
+	mov.b	@r0+,r1
+	mov.b	r1,@(r0,r4)
+	!
+2:	! Second, read a word and write a word at once
+	add	#-1,r4		/* r4 = (dst - src) - 2: the store follows a post-increment of r0 by 2 */
+	add	#-1,r5		/* r5 = (src + n) - 1: keep looping while at least 2 source bytes remain */
+	!
+3:	mov.w	@r0+,r1
+	cmp/hs	r5,r0		/* T = (r0 >= r5), unsigned; evaluated before the store in the delay slot */
+	bf/s	3b
+	 mov.w	r1,@(r0,r4)
+	add	#1,r5		/* r5 = src + n again */
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r5,r0
+	bt/s	9b
+	 add	#2,r4		/* delay slot (always executed): r4 = dst - src */
+	mov.b	@r0,r1		/* r0 != src + n here: copy the remaining byte */
+	mov.b	r1,@(r0,r4)
+	bra	9b
+	 add	#1,r0		/* delay slot: step r0 past the byte just copied before the return code at 9: */
+
+case1:				/* reached through memmove's jmptable when (dst - src) & 3 == 1 */
+	!
+	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3		/* r3 = r0 & 3 (r2 still holds 3 from the dispatch code) */
+	tst	r3,r3
+	bt/s	2f
+	 add	#-1,r4		/* delay slot (always executed): r4 = (dst - src) - 1, a multiple of 4 in this case */
+	mov	#4,r2
+	sub	r3,r2		/* r2 = 4 - (r0 & 3) = bytes to copy until r0 is long-word aligned */
+1:	dt	r2
+	mov.b	@r0+,r1
+	bf/s	1b
+	 mov.b	r1,@(r0,r4)
+	!
+2:	! Second, read a long word and write a long word at once
+	mov.l	@(r0,r4),r1	/* r1 = aligned DESTINATION word at r0 + r4; its lowest-addressed byte is merged into the first store. NOTE(review): that byte can lie below dst and is written back unchanged - confirm that is acceptable */
+	add	#-7,r5		/* r5 = (src + n) - 7 while the long-word loop runs */
+	add	#-4,r4		/* r4 = (dst - src) - 5: the store follows a post-increment of r0 by 4 */
+	!
+#ifdef __LITTLE_ENDIAN__
+	shll16	r1
+	shll8	r1
+3:	mov	r1,r3		! JIHG
+	shlr16	r3
+	shlr8	r3		! xxxJ
+	mov.l	@r0+,r1		! NMLK
+	mov	r1,r2
+	shll8	r2		! MLKx
+	or	r2,r3		! MLKJ
+	cmp/hs	r5,r0
+	bf/s	3b
+	 mov.l	r3,@(r0,r4)
+#else
+	shlr16	r1
+	shlr8	r1
+3:	mov	r1,r3		! GHIJ
+	shll16	r3
+	shll8	r3		! Jxxx
+	mov.l	@r0+,r1		! KLMN
+	mov	r1,r2
+	shlr8	r2		! xKLM
+	or	r2,r3		! JKLM
+	cmp/hs	r5,r0
+	bf/s	3b		! while(r0<r5)
+	 mov.l	r3,@(r0,r4)
+#endif
+	add	#7,r5		/* r5 = src + n again */
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r5,r0
+	bt/s	9b
+	 add	#5,r4		/* delay slot (always executed): r4 = dst - src */
+	add	#-3,r0		/* NOTE(review): only the last byte of the final source word looks unstored here, so this step back appears to re-copy two bytes - harmless, but confirm */
+	bra	8b
+	 add	#-1,r4		/* delay slot: r4 = (dst - src) - 1, as the byte loop at 8: expects */
--- a/arch/sh/lib/memset-sh4.S
+++ b/arch/sh/lib/memset-sh4.S
@ -0,0 +1,107 @@
+/*
+ * "memset" implementation for SH4
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ * Copyright (c) 2009  STMicroelectronics Limited
+ * Author: Stuart Menefy <stuart.menefy:st.com>
+ */
+
+/*
+ *            void *memset(void *s, int c, size_t n);
+ */
+
+#include <linux/linkage.h>
+
+ENTRY(memset)			/* in: r4 = s, r5 = c, r6 = n; out: r0 = s. Fills from the end of the buffer downwards. */
+	mov	#12,r0
+	add	r6,r4		/* r4 = s + n, one past the last byte to set */
+	cmp/gt	r6,r0		/* T = (12 > n), signed compare */
+	bt/s	40f		! if it's too small, set a byte at once
+	 mov	r4,r0
+	and	#3,r0		/* r0 = (s + n) & 3 = bytes above the last long-word boundary */
+	cmp/eq	#0,r0
+	bt/s	2f		! It's aligned
+	 sub	r0,r6		/* delay slot (always executed): n -= r0; r0 is 0 when the branch is taken */
+1:
+	dt	r0
+	bf/s	1b
+	 mov.b	r5,@-r4
+2:				! make VVVV
+	extu.b	r5,r5
+	swap.b	r5,r0		!   V0
+	or	r0,r5		!   VV
+	swap.w	r5,r0		! VV00
+	or	r0,r5		! VVVV
+
+	! Check if enough bytes need to be copied to be worth the big loop
+	mov	#0x40, r0	! (MT)
+	cmp/gt	r6,r0		! (MT)  64 > len => slow loop
+
+	bt/s	22f
+	 mov	r6,r0		/* delay slot (always executed): r0 = remaining length, as the code at 22: expects */
+
+	! align the dst to the cache block size if necessary
+	mov	r4, r3
+	mov	#~(0x1f), r1
+
+	and	r3, r1		/* r1 = r4 rounded down to a 32-byte boundary */
+	cmp/eq	r3, r1
+
+	bt/s	11f		! dst is already aligned
+	 sub	r1, r3		! r3-r1 -> r3
+	shlr2	r3		! number of loops
+
+10:	mov.l	r5,@-r4
+	dt	r3
+	bf/s	10b
+	 add	#-4, r6
+
+11:	! dst is 32byte aligned
+	mov	r6,r2
+	mov	#-5,r0		/* negative shld count: logical right shift by 5 */
+	shld	r0,r2		! number of loops
+
+	add	#-32, r4	/* r4 = base of the first 32-byte block to fill */
+	mov	r5, r0		/* the movca.l below stores from r0 */
+12:
+	movca.l	r0,@r4
+	mov.l	r5,@(4, r4)
+	mov.l	r5,@(8, r4)
+	mov.l	r5,@(12,r4)
+	mov.l	r5,@(16,r4)
+	mov.l	r5,@(20,r4)
+	add	#-0x20, r6
+	mov.l	r5,@(24,r4)
+	dt	r2
+	mov.l	r5,@(28,r4)
+	bf/s	12b
+	 add	#-32, r4
+
+	add	#32, r4		/* undo the extra step down made by the final delay slot */
+	mov	#8, r0
+	cmp/ge	r0, r6		/* T = (r6 >= 8), signed compare */
+	bf	40f
+
+	mov	r6,r0
+22:
+	shlr2	r0
+	shlr	r0		! r0 = r6 >> 3
+3:
+	dt	r0
+	mov.l	r5,@-r4		! set 8-byte at once
+	bf/s	3b
+	 mov.l	r5,@-r4
+	!
+	mov	#7,r0
+	and	r0,r6		/* r6 = bytes left over after the 8-byte loop */
+
+	! fill bytes (length may be zero)
+40:	tst	r6,r6
+	bt	5f
+4:
+	dt	r6
+	bf/s	4b
+	 mov.b	r5,@-r4
+5:
+	rts
+	 mov	r4,r0		/* delay slot: return r4, which has stepped back down to s */
--- a/arch/sh/lib/memset.S
+++ b/arch/sh/lib/memset.S
@ -0,0 +1,58 @@
+/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
+ *
+ * "memset" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+
+/*
+ *            void *memset(void *s, int c, size_t n);
+ */
+
+#include <linux/linkage.h>
+
+ENTRY(memset)			/* in: r4 = s, r5 = c, r6 = n; out: r0 = s. Fills from the end of the buffer downwards. */
+	tst	r6,r6
+	bt/s	5f		! if n=0, do nothing
+	 add	r6,r4		/* delay slot (always executed): r4 = s + n, one past the last byte to set */
+	mov	#12,r0
+	cmp/gt	r6,r0		/* T = (12 > n), signed compare */
+	bt/s	4f		! if it's too small, set a byte at once
+	 mov	r4,r0
+	and	#3,r0		/* r0 = (s + n) & 3 = bytes above the last long-word boundary */
+	cmp/eq	#0,r0
+	bt/s	2f		! It's aligned
+	 sub	r0,r6		/* delay slot (always executed): n -= r0; r0 is 0 when the branch is taken */
+1:
+	dt	r0
+	bf/s	1b
+	 mov.b	r5,@-r4
+2:				! make VVVV
+	extu.b	r5,r5
+	swap.b	r5,r0		!   V0
+	or	r0,r5		!   VV
+	swap.w	r5,r0		! VV00
+	or	r0,r5		! VVVV
+	!
+	mov	r6,r0		/* r6 >= 9 here (n >= 12, at most 3 bytes peeled), so the loop count below is at least 1 */
+	shlr2	r0
+	shlr	r0		! r0 = r6 >> 3
+3:
+	dt	r0
+	mov.l	r5,@-r4		! set 8-byte at once
+	bf/s	3b
+	 mov.l	r5,@-r4
+	!
+	mov	#7,r0
+	and	r0,r6		/* r6 = bytes left over after the 8-byte loop */
+	tst	r6,r6
+	bt	5f
+	! fill bytes
+4:
+	dt	r6
+	bf/s	4b
+	 mov.b	r5,@-r4
+5:
+	rts
+	 mov	r4,r0		/* delay slot: return r4, which has stepped back down to s */
--- a/arch/sh/lib/movmem.S
+++ b/arch/sh/lib/movmem.S
@ -0,0 +1,238 @@
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+   2004, 2005, 2006
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+!! libgcc routines for the Renesas / SuperH SH CPUs.
+!! Contributed by Steve Chamberlain.
+!! sac@cygnus.com
+
+!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines
+!! recoded in assembly by Toshiyasu Morita
+!! tm@netcom.com
+
+/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and
+   ELF local label prefixes by J"orn Rennecke
+   amylaar@cygnus.com  */
+
+	.text
+	.balign	4
+	.global	__movmem
+	.global __movstr
+	.set __movstr, __movmem	
+	/* This would be a lot simpler if r6 contained the byte count
+	   minus 64, and we wouldn't be called here for a byte count of 64.  */
+__movmem:			/* r4 = destination base, r5 = source base (every load/store below is relative to them); r6 = size code - its exact encoding is chosen by the caller and is not visible here */
+	sts.l	pr,@-r15	/* save pr: the bsr below overwrites it */
+	shll2	r6		/* r6 *= 4 */
+	bsr	__movmemSI52+2	/* enter the copy ladder at the store half of __movmemSI52; the ladder's rts comes back to movmem_loop */
+	mov.l	@(48,r5),r0	/* delay slot: the load half of __movmemSI52 that the +2 entry skips */
+	.balign	4
+movmem_loop: /* Reached with rts */
+	mov.l	@(60,r5),r0
+	add	#-64,r6
+	mov.l	r0,@(60,r4)
+	tst	r6,r6
+	mov.l	@(56,r5),r0
+	bt	movmem_done
+	mov.l	r0,@(56,r4)
+	cmp/pl	r6		/* T = (r6 > 0); still valid at the bt below, the moves and adds in between leave T alone */
+	mov.l	@(52,r5),r0
+	add	#64,r5
+	mov.l	r0,@(52,r4)
+	add	#64,r4
+	bt	__movmemSI52	/* pr still points at movmem_loop, so the ladder returns here again */
+! done all the large groups, do the remainder
+! jump to movmem+
+	mova	__movmemSI4+4,r0
+	add	r6,r0		/* r6 is negative here: step r0 back that many bytes into the ladder */
+	jmp	@r0
+movmem_done: ! share slot insn, works out aligned.
+	lds.l	@r15+,pr
+	mov.l	r0,@(56,r4)
+	mov.l	@(52,r5),r0
+	rts
+	mov.l	r0,@(52,r4)
+	.balign	4
+
+	.global	__movmemSI64
+	.global __movstrSI64
+	.set	__movstrSI64, __movmemSI64
+__movmemSI64:			/* fall-through ladder: entering at __movmemSI<N> copies the long words at offsets N-4 down to 0 from r5 to r4, using r0 as scratch */
+	mov.l	@(60,r5),r0
+	mov.l	r0,@(60,r4)
+	.global	__movmemSI60
+	.global __movstrSI60
+	.set	__movstrSI60, __movmemSI60
+__movmemSI60:
+	mov.l	@(56,r5),r0
+	mov.l	r0,@(56,r4)
+	.global	__movmemSI56
+	.global __movstrSI56
+	.set	__movstrSI56, __movmemSI56
+__movmemSI56:
+	mov.l	@(52,r5),r0
+	mov.l	r0,@(52,r4)
+	.global	__movmemSI52
+	.global __movstrSI52
+	.set	__movstrSI52, __movmemSI52
+__movmemSI52:			/* __movmem also enters at __movmemSI52+2, i.e. at the store below */
+	mov.l	@(48,r5),r0
+	mov.l	r0,@(48,r4)
+	.global	__movmemSI48
+	.global	__movstrSI48
+	.set	__movstrSI48, __movmemSI48
+__movmemSI48:
+	mov.l	@(44,r5),r0
+	mov.l	r0,@(44,r4)
+	.global	__movmemSI44
+	.global	__movstrSI44
+	.set	__movstrSI44, __movmemSI44
+__movmemSI44:
+	mov.l	@(40,r5),r0
+	mov.l	r0,@(40,r4)
+	.global	__movmemSI40
+	.global __movstrSI40
+	.set	__movstrSI40, __movmemSI40
+__movmemSI40:
+	mov.l	@(36,r5),r0
+	mov.l	r0,@(36,r4)
+	.global	__movmemSI36
+	.global	__movstrSI36
+	.set	__movstrSI36, __movmemSI36
+__movmemSI36:
+	mov.l	@(32,r5),r0
+	mov.l	r0,@(32,r4)
+	.global	__movmemSI32
+	.global	__movstrSI32
+	.set	__movstrSI32, __movmemSI32
+__movmemSI32:
+	mov.l	@(28,r5),r0
+	mov.l	r0,@(28,r4)
+	.global	__movmemSI28
+	.global	__movstrSI28
+	.set	__movstrSI28, __movmemSI28
+__movmemSI28:
+	mov.l	@(24,r5),r0
+	mov.l	r0,@(24,r4)
+	.global	__movmemSI24
+	.global	__movstrSI24
+	.set	__movstrSI24, __movmemSI24
+__movmemSI24:
+	mov.l	@(20,r5),r0
+	mov.l	r0,@(20,r4)
+	.global	__movmemSI20
+	.global	__movstrSI20
+	.set	__movstrSI20, __movmemSI20
+__movmemSI20:
+	mov.l	@(16,r5),r0
+	mov.l	r0,@(16,r4)
+	.global	__movmemSI16
+	.global	__movstrSI16
+	.set	__movstrSI16, __movmemSI16
+__movmemSI16:
+	mov.l	@(12,r5),r0
+	mov.l	r0,@(12,r4)
+	.global	__movmemSI12
+	.global	__movstrSI12
+	.set	__movstrSI12, __movmemSI12
+__movmemSI12:
+	mov.l	@(8,r5),r0
+	mov.l	r0,@(8,r4)
+	.global	__movmemSI8
+	.global	__movstrSI8
+	.set	__movstrSI8, __movmemSI8
+__movmemSI8:
+	mov.l	@(4,r5),r0
+	mov.l	r0,@(4,r4)
+	.global	__movmemSI4
+	.global	__movstrSI4
+	.set	__movstrSI4, __movmemSI4
+__movmemSI4:			/* __movmem computes jump targets relative to __movmemSI4+4, so the 4-bytes-of-code-per-long-word layout above must not change */
+	mov.l	@(0,r5),r0
+	rts
+	mov.l	r0,@(0,r4)
+
+	.global	__movmem_i4_even
+	.global	__movstr_i4_even
+	.set	__movstr_i4_even, __movmem_i4_even
+
+	.global	__movmem_i4_odd
+	.global	__movstr_i4_odd
+	.set	__movstr_i4_odd, __movmem_i4_odd
+
+	.global	__movmemSI12_i4
+	.global	__movstrSI12_i4
+	.set	__movstrSI12_i4, __movmemSI12_i4
+
+	.p2align	5
+L_movmem_2mod4_end:		/* exit taken from L_movmem_loop: store the two long words already loaded into r0 and r1 */
+	mov.l	r0,@(16,r4)
+	rts
+	mov.l	r1,@(20,r4)
+
+	.p2align	2
+
+__movmem_i4_even:		/* r4 = destination, r5 = source (post-incremented), r6 = loop count, decremented once per pair of long words */
+	mov.l	@r5+,r0
+	bra	L_movmem_start_even
+	mov.l	@r5+,r1
+
+__movmem_i4_odd:		/* same registers as __movmem_i4_even, but loads three long words first and stores two of them before joining the loop */
+	mov.l	@r5+,r1
+	add	#-4,r4		/* bias r4 so the @(4,r4) store below writes the original destination start */
+	mov.l	@r5+,r2
+	mov.l	@r5+,r3
+	mov.l	r1,@(4,r4)
+	mov.l	r2,@(8,r4)
+
+L_movmem_loop:
+	mov.l	r3,@(12,r4)
+	dt	r6
+	mov.l	@r5+,r0
+	bt/s	L_movmem_2mod4_end
+	mov.l	@r5+,r1
+	add	#16,r4
+L_movmem_start_even:
+	mov.l	@r5+,r2
+	mov.l	@r5+,r3
+	mov.l	r0,@r4
+	dt	r6
+	mov.l	r1,@(4,r4)
+	bf/s	L_movmem_loop
+	mov.l	r2,@(8,r4)
+	rts
+	mov.l	r3,@(12,r4)
+
+	.p2align	4
+__movmemSI12_i4:		/* copy three long words (12 bytes) from @r5 to @r4; uses r0-r2 as scratch, r4 and r5 are left unchanged */
+	mov.l	@r5,r0
+	mov.l	@(4,r5),r1
+	mov.l	@(8,r5),r2
+	mov.l	r0,@r4
+	mov.l	r1,@(4,r4)
+	rts
+	mov.l	r2,@(8,r4)
--- a/arch/sh/lib/strlen.S
+++ b/arch/sh/lib/strlen.S
@ -0,0 +1,70 @@
+/* $Id: strlen.S,v 1.2 2001/06/29 14:07:15 gniibe Exp $
+ *
+ * "strlen" implementation of SuperH
+ *
+ * Copyright (C) 1999  Kaz Kojima
+ *
+ */
+
+/* size_t strlen (const char *s)  */
+
+#include <linux/linkage.h>
+ENTRY(strlen)			/* in: r4 = s; out: r0 = length (accumulated in r2) */
+	mov	r4,r0
+	and	#3,r0		/* r0 = s & 3 */
+	tst	r0,r0
+	bt/s	1f
+	 mov	#0,r2		/* delay slot (always executed): length = 0 */
+
+	add	#-1,r0
+	shll2	r0
+	shll	r0		/* r0 = ((s & 3) - 1) * 8: each byte test below is four 2-byte instructions */
+	braf	r0		/* skip (s & 3) - 1 of the three byte tests, so the bytes up to the next long-word boundary are checked one at a time */
+	 nop
+
+	mov.b	@r4+,r1
+	tst	r1,r1
+	bt	8f
+	add	#1,r2
+
+	mov.b	@r4+,r1
+	tst	r1,r1
+	bt	8f
+	add	#1,r2
+
+	mov.b	@r4+,r1
+	tst	r1,r1
+	bt	8f
+	add	#1,r2
+
+1:
+	mov	#0,r3		/* r3 = 0: pattern for cmp/str */
+2:
+	mov.l	@r4+,r1
+	cmp/str	r3,r1		/* T = (some byte of r1 equals the matching byte of r3), i.e. r1 contains a zero byte */
+	bf/s	2b
+	 add	#4,r2		/* delay slot (always executed), so the final word is counted as well */
+
+	add	#-4,r2		/* take back the 4 counted for the word that holds the terminator */
+#ifndef __LITTLE_ENDIAN__
+	swap.b	r1,r1
+	swap.w	r1,r1
+	swap.b	r1,r1		/* r1 is now byte-reversed: the lowest-addressed byte sits in bits 7..0 */
+#endif
+	extu.b	r1,r0
+	tst	r0,r0
+	bt/s	8f
+	 shlr8	r1		/* delay slot (always executed): move the next byte into bits 7..0 */
+	add	#1,r2
+	extu.b	r1,r0
+	tst	r0,r0
+	bt/s	8f
+	 shlr8	r1
+	add	#1,r2
+	extu.b	r1,r0
+	tst	r0,r0
+	bt	8f
+	add	#1,r2		/* first three bytes are non-zero, so the terminator is the fourth byte */
+8:
+	rts
+	 mov	r2,r0
--- a/arch/sh/lib/udiv_qrnnd.S
+++ b/arch/sh/lib/udiv_qrnnd.S
@ -0,0 +1,81 @@
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+   2004, 2005, 2006
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+!! libgcc routines for the Renesas / SuperH SH CPUs.
+!! Contributed by Steve Chamberlain.
+!! sac@cygnus.com
+
+!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines
+!! recoded in assembly by Toshiyasu Morita
+!! tm@netcom.com
+
+/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and
+   ELF local label prefixes by J"orn Rennecke
+   amylaar@cygnus.com  */
+
+	/* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */
+	/* n1 < d, but n1 might be larger than d1.  */
+	.global __udiv_qrnnd_16
+	.balign 8
+__udiv_qrnnd_16:
+	div0u				/* set up unsigned division for the div1 steps */
+	cmp/hi r6,r0			/* T = (r0 > r6), unsigned */
+	bt .Lots
+	.rept 16			/* sixteen div1 steps of r0 by r6 */
+	div1 r6,r0 
+	.endr
+	extu.w r0,r1
+	bt 0f
+	add r6,r0
+0:	rotcl r1
+	mulu.w r1,r5			/* macl = (r1 & 0xffff) * (r5 & 0xffff) */
+	xtrct r4,r0			/* r0 = (r4 << 16) | (r0 >> 16) */
+	swap.w r0,r0
+	sts macl,r2
+	cmp/hs r2,r0			/* T = (r0 >= r2), unsigned; the sub below leaves T alone */
+	sub r2,r0
+	bt 0f
+	addc r5,r0
+	add #-1,r1
+	bt 0f
+1:	add #-1,r1
+	rts
+	add r5,r0
+	.balign 8
+.Lots:					/* taken when r0 > r6 on entry */
+	sub r5,r0
+	swap.w r4,r1
+	xtrct r0,r1
+	clrt
+	mov r1,r0
+	addc r5,r0
+	mov #-1,r1
+	bf/s 1b
+	 shlr16 r1			/* delay slot (always executed): r1 = 0x0000ffff */
+0:	rts
+	 nop
--- a/arch/sh/lib/udivsi3.S
+++ b/arch/sh/lib/udivsi3.S
@ -0,0 +1,87 @@
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+   2004, 2005
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+!! libgcc routines for the Renesas / SuperH SH CPUs.
+!! Contributed by Steve Chamberlain.
+!! sac@cygnus.com
+
+	.balign 4
+	.global	__udivsi3
+	.type	__udivsi3, @function
+div8:				/* helper: eight div1 steps (one here, then falls into div7) */
+	div1 r5,r4
+div7:				/* helper: seven div1 steps; the last one sits in the rts delay slot */
+	div1 r5,r4; div1 r5,r4; div1 r5,r4
+	div1 r5,r4; div1 r5,r4; div1 r5,r4; rts; div1 r5,r4
+
+divx4:				/* helper: four div1 steps, the first three each followed by rotcl r0 */
+	div1 r5,r4; rotcl r0
+	div1 r5,r4; rotcl r0
+	div1 r5,r4; rotcl r0
+	rts; div1 r5,r4
+
+__udivsi3:			/* operands in r4 and r5 (every step is div1 r5,r4); the result is assembled in r0 */
+	sts.l pr,@-r15		/* save pr: the bsr calls below overwrite it */
+	extu.w r5,r0
+	cmp/eq r5,r0		/* T = (r5 == r5 & 0xffff), i.e. r5 fits in 16 bits */
+	bf/s large_divisor
+	div0u			/* delay slot (always executed): set up unsigned division */
+	swap.w r4,r0
+	shlr16 r4
+	bsr div8
+	shll16 r5		/* delay slot: runs before the first div1 in div8 */
+	bsr div7
+	div1 r5,r4
+	xtrct r4,r0
+	xtrct r0,r4
+	bsr div8
+	swap.w r4,r4
+	bsr div7
+	div1 r5,r4
+	lds.l @r15+,pr		/* restore the caller's return address */
+	xtrct r4,r0
+	swap.w r0,r0
+	rotcl r0
+	rts
+	shlr16 r5		/* delay slot: undo the shll16 above (r5 fitted in 16 bits on this path) */
+
+large_divisor:			/* taken when r5 does not fit in 16 bits */
+	mov #0,r0
+	xtrct r4,r0
+	xtrct r0,r4
+	bsr divx4
+	rotcl r0
+	bsr divx4
+	rotcl r0
+	bsr divx4
+	rotcl r0
+	bsr divx4
+	rotcl r0
+	lds.l @r15+,pr		/* restore the caller's return address */
+	rts
+	rotcl r0
--- a/arch/sh/lib/udivsi3_i4i-Os.S
+++ b/arch/sh/lib/udivsi3_i4i-Os.S
@ -0,0 +1,149 @@
+/* Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+/* Moderately Space-optimized libgcc routines for the Renesas SH /
+   STMicroelectronics ST40 CPUs.
+   Contributed by J"orn Rennecke joern.rennecke@st.com.  */
+
+/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
+   sh4-200 run times:
+   udiv small divisor: 55 cycles
+   udiv large divisor: 52 cycles
+   sdiv small divisor, positive result: 59 cycles
+   sdiv large divisor, positive result: 56 cycles
+   sdiv small divisor, negative result: 65 cycles (*)
+   sdiv large divisor, negative result: 62 cycles (*)
+   (*): r2 is restored in the rts delay slot and has a lingering latency
+        of two more cycles.  */
+	.balign 4
+	.global	__udivsi3_i4i
+	.global	__udivsi3_i4
+	.set	__udivsi3_i4, __udivsi3_i4i
+	.type	__udivsi3_i4i, @function
+	.type	__sdivsi3_i4i, @function
+__udivsi3_i4i:			/* operands in r4 and r5 (every step is div1 r5,r4); result in r0; r4 and r5 are saved on the stack and restored */
+	sts pr,r1		/* keep the return address in r1: the bsr calls below overwrite pr */
+	mov.l r4,@-r15
+	extu.w r5,r0
+	cmp/eq r5,r0		/* T = (r5 == r5 & 0xffff), i.e. r5 fits in 16 bits; the swap.w and shlr16 below do not change T */
+	swap.w r4,r0
+	shlr16 r4
+	bf/s large_divisor
+	div0u			/* delay slot (always executed): set up unsigned division */
+	mov.l r5,@-r15
+	shll16 r5
+sdiv_small_divisor:		/* shared with __sdivsi3_i4i, which arrives with its own continuation address in r1 */
+	div1 r5,r4
+	bsr div6
+	div1 r5,r4
+	div1 r5,r4
+	bsr div6
+	div1 r5,r4
+	xtrct r4,r0
+	xtrct r0,r4
+	bsr div7
+	swap.w r4,r4
+	div1 r5,r4
+	bsr div7
+	div1 r5,r4
+	xtrct r4,r0
+	mov.l @r15+,r5		/* restore r5 */
+	swap.w r0,r0
+	mov.l @r15+,r4		/* restore r4 */
+	jmp @r1			/* leave through the address held in r1 */
+	rotcl r0
+div7:				/* helper: seven div1 steps (one here, then falls into div6) */
+	div1 r5,r4
+div6:				/* helper: six div1 steps; the last one sits in the rts delay slot */
+	            div1 r5,r4; div1 r5,r4; div1 r5,r4
+	div1 r5,r4; div1 r5,r4; rts;        div1 r5,r4
+
+divx3:				/* helper: rotcl r0 / div1 r5,r4 three times; the last div1 sits in the rts delay slot */
+	rotcl r0
+	div1 r5,r4
+	rotcl r0
+	div1 r5,r4
+	rotcl r0
+	rts
+	div1 r5,r4
+
+large_divisor:
+	mov.l r5,@-r15
+sdiv_large_divisor:		/* shared with __sdivsi3_i4i */
+	xor r4,r0
+	.rept 4
+	rotcl r0
+	bsr divx3
+	div1 r5,r4
+	.endr
+	mov.l @r15+,r5		/* restore r5 */
+	mov.l @r15+,r4		/* restore r4 */
+	jmp @r1			/* leave through the address held in r1 */
+	rotcl r0
+
+	.global	__sdivsi3_i4i
+	.global __sdivsi3_i4
+	.global __sdivsi3
+	.set	__sdivsi3_i4, __sdivsi3_i4i
+	.set	__sdivsi3, __sdivsi3_i4i
+__sdivsi3_i4i:			/* signed variant: makes both operands non-negative, reuses the unsigned code above, and negates r0 afterwards when the operand signs differ */
+	mov.l r4,@-r15
+	cmp/pz r5		/* T = (r5 >= 0) */
+	mov.l r5,@-r15
+	bt/s pos_divisor
+	cmp/pz r4		/* delay slot (always executed): T = (r4 >= 0) */
+	neg r5,r5
+	extu.w r5,r0
+	bt/s neg_result
+	cmp/eq r5,r0		/* delay slot (always executed): T = (r5 fits in 16 bits), consumed at sdiv_check_divisor */
+	neg r4,r4
+pos_result:
+	swap.w r4,r0
+	bra sdiv_check_divisor
+	sts pr,r1		/* delay slot: r1 = caller's return address, the shared code leaves through r1 */
+pos_divisor:
+	extu.w r5,r0
+	bt/s pos_result
+	cmp/eq r5,r0		/* delay slot (always executed): T = (r5 fits in 16 bits), consumed at sdiv_check_divisor */
+	neg r4,r4
+neg_result:
+	mova negate_result,r0
+	;
+	mov r0,r1		/* r1 = negate_result: the shared code will leave through it */
+	swap.w r4,r0
+	lds r2,macl		/* park the caller's r2 in macl */
+	sts pr,r2		/* r2 = caller's return address, used by negate_result */
+sdiv_check_divisor:
+	shlr16 r4
+	bf/s sdiv_large_divisor
+	div0u			/* delay slot (always executed): set up unsigned division */
+	bra sdiv_small_divisor
+	shll16 r5
+	.balign 4
+negate_result:
+	neg r0,r0
+	jmp @r2			/* return to the caller */
+	sts macl,r2		/* delay slot: restore the caller's r2 from macl */
--- a/arch/sh/lib/udivsi3_i4i.S
+++ b/arch/sh/lib/udivsi3_i4i.S
@ -0,0 +1,666 @@
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+   2004, 2005, 2006
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+!! libgcc routines for the Renesas / SuperH SH CPUs.
+!! Contributed by Steve Chamberlain.
+!! sac@cygnus.com
+
+!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines
+!! recoded in assembly by Toshiyasu Morita
+!! tm@netcom.com
+
+/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and
+   ELF local label prefixes by J"orn Rennecke
+   amylaar@cygnus.com  */
+
+/* This code uses shld, thus is not suitable for SH1 / SH2.  */
+
+/* Signed / unsigned division without use of FPU, optimized for SH4.
+   Uses a lookup table for divisors in the range -128 .. +128, and
+   div1 with case distinction for larger divisors in three more ranges.
+   The code is lumped together with the table to allow the use of mova.  */
+#ifdef CONFIG_CPU_LITTLE_ENDIAN	/* byte offsets of individual bytes inside a 32-bit word in memory */
+#define L_LSB 0	/* bits 0..7 */
+#define L_LSWMSB 1	/* bits 8..15: most significant byte of the low 16-bit word */
+#define L_MSWLSB 2	/* bits 16..23: least significant byte of the high 16-bit word */
+#else
+#define L_LSB 3	/* bits 0..7 */
+#define L_LSWMSB 2	/* bits 8..15 */
+#define L_MSWLSB 1	/* bits 16..23 */
+#endif
+
+	.balign 4
+	.global	__udivsi3_i4i
+	.global	__udivsi3_i4
+	.set	__udivsi3_i4, __udivsi3_i4i	/* __udivsi3_i4 is an alias for the same entry point */
+	.type	__udivsi3_i4i, @function
+__udivsi3_i4i:	/* unsigned divide: r0 = r4 / r5; r4 and r5 are saved on the stack and restored, r1 is scratch */
+	mov.w c128_w, r1	/* r1 = 128 */
+	div0u
+	mov r4,r0
+	shlr8 r0	/* r0 = dividend >> 8 */
+	cmp/hi r1,r5	/* T = divisor > 128 (unsigned) */
+	extu.w r5,r1
+	bf udiv_le128	/* divisor <= 128: multiply by a table-driven inverse */
+	cmp/eq r5,r1	/* T = divisor fits in 16 bits */
+	bf udiv_ge64k
+	shlr r0	/* r0 = dividend >> 9; the bit shifted out lands in T */
+	mov r5,r1	/* keep the unshifted divisor so it can be saved below */
+	shll16 r5
+	mov.l r4,@-r15	/* save dividend */
+	div1 r5,r0
+	mov.l r1,@-r15	/* save original divisor */
+	div1 r5,r0
+	div1 r5,r0
+	bra udiv_25	/* continue in the div1 sequence that lives inside __sdivsi3_i4i */
+	div1 r5,r0	/* (delay slot) fourth div1 step */
+
+div_le128:	/* signed entry, non-negative result; r4/r5 were already saved by __sdivsi3_i4i */
+	mova div_table_ix,r0
+	bra div_le128_2
+	mov.b @(r0,r5),r1	/* (delay slot) r1 = div_table_ix[divisor] */
+udiv_le128:	/* unsigned entry for divisor <= 128 */
+	mov.l r4,@-r15	/* save dividend */
+	mova div_table_ix,r0
+	mov.b @(r0,r5),r1	/* r1 = div_table_ix[divisor] */
+	mov.l r5,@-r15	/* save divisor */
+div_le128_2:
+	mova div_table_inv,r0
+	mov.l @(r0,r1),r1	/* r1 = inverse found at that byte offset from div_table_inv */
+	mov r5,r0
+	tst #0xfe,r0	/* T = divisor is 0 or 1 */
+	mova div_table_clz,r0
+	dmulu.l r1,r4	/* MACH:MACL = inverse * dividend (unsigned 64-bit product) */
+	mov.b @(r0,r5),r1	/* r1 = shift count for this divisor */
+	bt/s div_by_1
+	mov r4,r0	/* (delay slot) r0 = dividend, which is already the answer at div_by_1 */
+	mov.l @r15+,r5	/* restore divisor */
+	sts mach,r0	/* r0 = upper 32 bits of the product */
+	/* clrt */
+	addc r4,r0	/* add the dividend itself (the inverse's implicit leading 1); T is 0 here because bt/s fell through */
+	mov.l @r15+,r4	/* restore dividend */
+	rotcr r0	/* shift right one bit, bringing the carry of the addc back in */
+	rts
+	shld r1,r0	/* (delay slot) final shift by the table value */
+
+div_by_1_neg:	/* negative-result variant: r0 = -r4, then share the exit below */
+	neg r4,r0
+div_by_1:
+	mov.l @r15+,r5	/* restore divisor */
+	rts
+	mov.l @r15+,r4	/* (delay slot) restore dividend */
+
+div_ge64k:	/* signed entry for divisor >= 0x10000; T was set by cmp/hi r0,r5 in the caller's delay slot */
+	bt/s div_r8	/* divisor > dividend >> 8, so the quotient fits in 8 bits */
+	div0u	/* (delay slot) */
+	shll8 r5
+	bra div_ge64k_2
+	div1 r5,r0	/* (delay slot) first div1 step of this path */
+udiv_ge64k:	/* unsigned entry for divisor >= 0x10000 */
+	cmp/hi r0,r5	/* T = divisor > dividend >> 8 */
+	mov r5,r1	/* keep the unshifted divisor so it can be saved below */
+	bt udiv_r8	/* quotient fits in 8 bits */
+	shll8 r5
+	mov.l r4,@-r15	/* save dividend */
+	div1 r5,r0
+	mov.l r1,@-r15	/* save original divisor */
+div_ge64k_2:
+	div1 r5,r0
+	mov.l zero_l,r1	/* r1 = 0 */
+	.rept 4
+	div1 r5,r0
+	.endr
+	mov.l r1,@-r15	/* push a zeroed scratch word */
+	div1 r5,r0
+	mov.w m256_w,r1	/* r1 = 0xff00 sign-extended by mov.w, i.e. 0xffffff00 */
+	div1 r5,r0	/* eighth div1 step of this path */
+	mov.b r0,@(L_LSWMSB,r15)	/* low byte of r0 -> bits 8..15 of the scratch word */
+	xor r4,r0
+	and r1,r0
+	bra div_ge64k_end
+	xor r4,r0	/* (delay slot) xor/and/xor: low byte of r0 replaced by the low byte of r4 */
+	
+div_r8:	/* signed entry: quotient fits in 8 bits; r4/r5 already saved by the caller */
+	shll16 r4
+	bra div_r8_2
+	shll8 r4	/* (delay slot) r4 = dividend << 24 */
+udiv_r8:	/* unsigned entry: quotient fits in 8 bits */
+	mov.l r4,@-r15	/* save dividend */
+	shll16 r4
+	clrt
+	shll8 r4	/* r4 = dividend << 24 */
+	mov.l r5,@-r15	/* save divisor */
+div_r8_2:
+	rotcl r4
+	mov r0,r1	/* r1 = r0 as set up by the caller (dividend >> 8); div1 works on r1 from here */
+	div1 r5,r1
+	mov r4,r0
+	rotcl r0
+	mov r5,r4	/* copy the divisor to r4 so r5 can be restored before the last step */
+	div1 r5,r1
+	.rept 5
+	rotcl r0; div1 r5,r1
+	.endr
+	rotcl r0
+	mov.l @r15+,r5	/* restore divisor */
+	div1 r4,r1	/* eighth div1 step, using the divisor copy in r4 */
+	mov.l @r15+,r4	/* restore dividend */
+	rts
+	rotcl r0	/* (delay slot) rotate T into the result */
+
+	.global	__sdivsi3_i4i
+	.global __sdivsi3_i4
+	.global	__sdivsi3
+	.set	__sdivsi3_i4, __sdivsi3_i4i
+	.set	__sdivsi3, __sdivsi3_i4i
+	.type	__sdivsi3_i4i, @function
+	/* This is link-compatible with a __sdivsi3 call,
+	   but we effectively clobber only r1.  */
+__sdivsi3_i4i:	/* signed divide: r0 = r4 / r5, computed on magnitudes with the sign applied at the end */
+	mov.l r4,@-r15	/* save dividend */
+	cmp/pz r5	/* T = divisor >= 0 */
+	mov.w c128_w, r1	/* r1 = 128 */
+	bt/s pos_divisor
+	cmp/pz r4	/* (delay slot) T = dividend >= 0 */
+	mov.l r5,@-r15	/* save the (negative) divisor */
+	neg r5,r5	/* r5 = |divisor| */
+	bt/s neg_result	/* dividend >= 0, divisor < 0 */
+	cmp/hi r1,r5	/* (delay slot) T = |divisor| > 128 */
+	neg r4,r4	/* both negative: r4 = |dividend| */
+pos_result:
+	extu.w r5,r0
+	bf div_le128	/* divisor <= 128: table-driven path */
+	cmp/eq r5,r0	/* T = divisor fits in 16 bits */
+	mov r4,r0
+	shlr8 r0	/* r0 = dividend >> 8 */
+	bf/s div_ge64k
+	cmp/hi r0,r5	/* (delay slot) T = divisor > dividend >> 8, tested at div_ge64k */
+	div0u
+	shll16 r5
+	div1 r5,r0
+	div1 r5,r0
+	div1 r5,r0
+udiv_25:	/* __udivsi3_i4i joins here after its own four div1 steps */
+	mov.l zero_l,r1	/* r1 = 0 */
+	div1 r5,r0
+	div1 r5,r0
+	mov.l r1,@-r15	/* push a zeroed scratch word */
+	.rept 3
+	div1 r5,r0
+	.endr
+	mov.b r0,@(L_MSWLSB,r15)	/* low byte of r0 -> bits 16..23 of the scratch word */
+	xtrct r4,r0
+	swap.w r0,r0
+	.rept 8
+	div1 r5,r0
+	.endr
+	mov.b r0,@(L_LSWMSB,r15)	/* low byte of r0 -> bits 8..15 of the scratch word */
+div_ge64k_end:
+	.rept 8
+	div1 r5,r0
+	.endr
+	mov.l @r15+,r4 ! zero-extension and swap using LS unit.
+	extu.b r0,r0
+	mov.l @r15+,r5	/* restore divisor */
+	or r4,r0	/* merge in the bytes collected in the scratch word */
+	mov.l @r15+,r4	/* restore dividend */
+	rts
+	rotcl r0	/* (delay slot) rotate T into the result */
+
+div_le128_neg:	/* divisor magnitude <= 128, negative result; r0 = divisor magnitude on entry */
+	tst #0xfe,r0	/* T = divisor magnitude is 0 or 1 */
+	mova div_table_ix,r0
+	mov.b @(r0,r5),r1	/* r1 = div_table_ix[divisor] */
+	mova div_table_inv,r0
+	bt/s div_by_1_neg
+	mov.l @(r0,r1),r1	/* (delay slot) r1 = inverse for this divisor */
+	mova div_table_clz,r0
+	dmulu.l r1,r4	/* MACH:MACL = inverse * dividend magnitude */
+	mov.b @(r0,r5),r1	/* r1 = shift count for this divisor */
+	mov.l @r15+,r5	/* restore divisor */
+	sts mach,r0
+	/* clrt */
+	addc r4,r0	/* T is 0 here because bt/s fell through */
+	mov.l @r15+,r4	/* restore dividend */
+	rotcr r0
+	shld r1,r0
+	rts
+	neg r0,r0	/* (delay slot) apply the negative sign */
+
+pos_divisor:
+	mov.l r5,@-r15	/* save divisor */
+	bt/s pos_result	/* dividend >= 0 as well */
+	cmp/hi r1,r5	/* (delay slot) T = divisor > 128 */
+	neg r4,r4	/* dividend < 0: r4 = |dividend| */
+neg_result:	/* r4/r5 hold magnitudes here; the quotient is negated before returning */
+	extu.w r5,r0
+	bf div_le128_neg	/* divisor <= 128: table-driven path */
+	cmp/eq r5,r0	/* T = divisor fits in 16 bits */
+	mov r4,r0
+	shlr8 r0	/* r0 = dividend >> 8 */
+	bf/s div_ge64k_neg
+	cmp/hi r0,r5	/* (delay slot) T = divisor > dividend >> 8, tested at div_ge64k_neg */
+	div0u
+	mov.l zero_l,r1	/* r1 = 0 */
+	shll16 r5
+	div1 r5,r0
+	mov.l r1,@-r15	/* push a zeroed scratch word */
+	.rept 7
+	div1 r5,r0
+	.endr
+	mov.b r0,@(L_MSWLSB,r15)	/* low byte of r0 -> bits 16..23 of the scratch word */
+	xtrct r4,r0
+	swap.w r0,r0
+	.rept 8
+	div1 r5,r0
+	.endr
+	mov.b r0,@(L_LSWMSB,r15)	/* low byte of r0 -> bits 8..15 of the scratch word */
+div_ge64k_neg_end:
+	.rept 8
+	div1 r5,r0
+	.endr
+	mov.l @r15+,r4 ! zero-extension and swap using LS unit.
+	extu.b r0,r1
+	mov.l @r15+,r5	/* restore divisor */
+	or r4,r1	/* merge in the bytes collected in the scratch word */
+div_r8_neg_end:
+	mov.l @r15+,r4	/* restore dividend */
+	rotcl r1
+	rts
+	neg r1,r0	/* (delay slot) r0 = negated quotient magnitude */
+
+div_ge64k_neg:	/* divisor >= 0x10000, negative result */
+	bt/s div_r8_neg	/* divisor > dividend >> 8, so the quotient fits in 8 bits */
+	div0u	/* (delay slot) */
+	shll8 r5
+	mov.l zero_l,r1	/* r1 = 0 */
+	.rept 6
+	div1 r5,r0
+	.endr
+	mov.l r1,@-r15	/* push a zeroed scratch word */
+	div1 r5,r0
+	mov.w m256_w,r1	/* r1 = 0xff00 sign-extended by mov.w, i.e. 0xffffff00 */
+	div1 r5,r0	/* eighth div1 step of this path */
+	mov.b r0,@(L_LSWMSB,r15)	/* low byte of r0 -> bits 8..15 of the scratch word */
+	xor r4,r0
+	and r1,r0
+	bra div_ge64k_neg_end
+	xor r4,r0	/* (delay slot) xor/and/xor: low byte of r0 replaced by the low byte of r4 */
+
+c128_w:	/* 16-bit literal loaded with mov.w by both entry points */
+	.word 128
+
+div_r8_neg:	/* quotient fits in 8 bits, negative result */
+	clrt
+	shll16 r4
+	mov r4,r1
+	shll8 r1	/* r1 = dividend << 24 */
+	mov r5,r4	/* copy the divisor to r4 so r5 can be restored before the last step */
+	.rept 7
+	rotcl r1; div1 r5,r0
+	.endr
+	mov.l @r15+,r5	/* restore divisor */
+	rotcl r1
+	bra div_r8_neg_end
+	div1 r4,r0	/* (delay slot) eighth div1 step, using the divisor copy in r4 */
+
+m256_w:	/* loaded with mov.w, which sign-extends it to 0xffffff00; used as an and-mask */
+	.word 0xff00
+/* This table has been generated by divtab-sh4.c.  */
+	.balign 4
+div_table_clz:
+	.byte	0
+	.byte	1
+	.byte	0
+	.byte	-1
+	.byte	-1
+	.byte	-2
+	.byte	-2
+	.byte	-2
+	.byte	-2
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+/* Lookup table translating a positive divisor to a byte offset (relative
+   to div_table_inv) into the table of normalized inverses.  N.B. the '0'
+   entry is also the last entry of the previous table; its value (-6) is
+   not a multiple of 4, so it causes an unaligned access for division by
+   zero.  */
+div_table_ix:	/* byte table indexed by divisor 0..128; each value is a signed byte offset from div_table_inv */
+	.byte	-6	/* divisor 0; this byte is also read as div_table_clz[128] */
+	.byte	-128	/* divisor 1 */
+	.byte	-128	/* divisor 2 */
+	.byte	0
+	.byte	-128	/* divisor 4 */
+	.byte	-64
+	.byte	0
+	.byte	64
+	.byte	-128	/* divisor 8 */
+	.byte	-96
+	.byte	-64
+	.byte	-32
+	.byte	0
+	.byte	32
+	.byte	64
+	.byte	96
+	.byte	-128	/* divisor 16 */
+	.byte	-112
+	.byte	-96
+	.byte	-80
+	.byte	-64
+	.byte	-48
+	.byte	-32
+	.byte	-16
+	.byte	0
+	.byte	16
+	.byte	32
+	.byte	48
+	.byte	64
+	.byte	80
+	.byte	96
+	.byte	112
+	.byte	-128	/* divisor 32 */
+	.byte	-120
+	.byte	-112
+	.byte	-104
+	.byte	-96
+	.byte	-88
+	.byte	-80
+	.byte	-72
+	.byte	-64
+	.byte	-56
+	.byte	-48
+	.byte	-40
+	.byte	-32
+	.byte	-24
+	.byte	-16
+	.byte	-8
+	.byte	0
+	.byte	8
+	.byte	16
+	.byte	24
+	.byte	32
+	.byte	40
+	.byte	48
+	.byte	56
+	.byte	64
+	.byte	72
+	.byte	80
+	.byte	88
+	.byte	96
+	.byte	104
+	.byte	112
+	.byte	120
+	.byte	-128	/* divisor 64 */
+	.byte	-124
+	.byte	-120
+	.byte	-116
+	.byte	-112
+	.byte	-108
+	.byte	-104
+	.byte	-100
+	.byte	-96
+	.byte	-92
+	.byte	-88
+	.byte	-84
+	.byte	-80
+	.byte	-76
+	.byte	-72
+	.byte	-68
+	.byte	-64
+	.byte	-60
+	.byte	-56
+	.byte	-52
+	.byte	-48
+	.byte	-44
+	.byte	-40
+	.byte	-36
+	.byte	-32
+	.byte	-28
+	.byte	-24
+	.byte	-20
+	.byte	-16
+	.byte	-12
+	.byte	-8
+	.byte	-4
+	.byte	0
+	.byte	4
+	.byte	8
+	.byte	12
+	.byte	16
+	.byte	20
+	.byte	24
+	.byte	28
+	.byte	32
+	.byte	36
+	.byte	40
+	.byte	44
+	.byte	48
+	.byte	52
+	.byte	56
+	.byte	60
+	.byte	64
+	.byte	68
+	.byte	72
+	.byte	76
+	.byte	80
+	.byte	84
+	.byte	88
+	.byte	92
+	.byte	96
+	.byte	100
+	.byte	104
+	.byte	108
+	.byte	112
+	.byte	116
+	.byte	120
+	.byte	124
+	.byte	-128	/* divisor 128 */
+/* 1/64 .. 1/127, normalized.  There is an implicit leading 1 in bit 32.  */
+	.balign 4
+zero_l:	/* start of the inverse table (128 bytes before div_table_inv); also read with mov.l as a literal zero */
+	.long	0x0	/* entry for 64; reached through offset -128 in div_table_ix */
+	.long	0xF81F81F9
+	.long	0xF07C1F08
+	.long	0xE9131AC0
+	.long	0xE1E1E1E2
+	.long	0xDAE6076C
+	.long	0xD41D41D5
+	.long	0xCD856891
+	.long	0xC71C71C8
+	.long	0xC0E07039
+	.long	0xBACF914D
+	.long	0xB4E81B4F
+	.long	0xAF286BCB
+	.long	0xA98EF607
+	.long	0xA41A41A5
+	.long	0x9EC8E952
+	.long	0x9999999A
+	.long	0x948B0FCE
+	.long	0x8F9C18FA
+	.long	0x8ACB90F7
+	.long	0x86186187
+	.long	0x81818182
+	.long	0x7D05F418
+	.long	0x78A4C818
+	.long	0x745D1746
+	.long	0x702E05C1
+	.long	0x6C16C16D
+	.long	0x68168169
+	.long	0x642C8591
+	.long	0x60581606
+	.long	0x5C9882BA
+	.long	0x58ED2309
+div_table_inv:	/* base used with the signed offsets from div_table_ix; sits 32 entries into the table */
+	.long	0x55555556	/* entry at offset 0 (the 33rd of the 64 entries, i.e. 96) */
+	.long	0x51D07EAF
+	.long	0x4E5E0A73
+	.long	0x4AFD6A06
+	.long	0x47AE147B
+	.long	0x446F8657
+	.long	0x41414142
+	.long	0x3E22CBCF
+	.long	0x3B13B13C
+	.long	0x38138139
+	.long	0x3521CFB3
+	.long	0x323E34A3
+	.long	0x2F684BDB
+	.long	0x2C9FB4D9
+	.long	0x29E4129F
+	.long	0x27350B89
+	.long	0x24924925
+	.long	0x21FB7813
+	.long	0x1F7047DD
+	.long	0x1CF06ADB
+	.long	0x1A7B9612
+	.long	0x18118119
+	.long	0x15B1E5F8
+	.long	0x135C8114
+	.long	0x11111112
+	.long	0xECF56BF
+	.long	0xC9714FC
+	.long	0xA6810A7
+	.long	0x8421085
+	.long	0x624DD30
+	.long	0x4104105
+	.long	0x2040811
+	/* maximum error: 0.987342 scaled: 0.921875*/