Mirror of https://github.com/AetherDroid/android_kernel_samsung_on5xelte.git (synced 2025-09-08 09:08:05 -04:00)

Commit f6dfaef42e: Fixed MTP to work with TWRP
50820 changed files with 20846062 additions and 0 deletions
18	arch/tile/lib/Makefile	Normal file
@@ -0,0 +1,18 @@
#
# Makefile for TILE-specific library files..
#

lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
	memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
	strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o

lib-$(CONFIG_TILEGX) += memcpy_user_64.o
lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o
lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o

obj-$(CONFIG_MODULES) += exports.o

# The finv_buffer_remote() and copy_{to,from}_user() routines can't
# have -pg added, since they both rely on being leaf functions.
CFLAGS_REMOVE_cacheflush.o = -pg
CFLAGS_REMOVE_memcpy_user_64.o = -pg
183	arch/tile/lib/atomic_32.c	Normal file
@@ -0,0 +1,183 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/cache.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/atomic.h>
#include <arch/chip.h>

/* This page is remapped on startup to be hash-for-home. */
int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;

int *__atomic_hashed_lock(volatile void *v)
{
	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
	/*
	 * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index.
	 * Using mm works here because atomic_locks is page aligned.
	 */
	unsigned long ptr = __insn_mm((unsigned long)v >> 1,
				      (unsigned long)atomic_locks,
				      2, (ATOMIC_HASH_SHIFT + 2) - 1);
	return (int *)ptr;
}

#ifdef CONFIG_SMP
/* Return whether the passed pointer is a valid atomic lock pointer. */
static int is_atomic_lock(int *p)
{
	return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE];
}

void __atomic_fault_unlock(int *irqlock_word)
{
	BUG_ON(!is_atomic_lock(irqlock_word));
	BUG_ON(*irqlock_word != 1);
	*irqlock_word = 0;
}

#endif /* CONFIG_SMP */

static inline int *__atomic_setup(volatile void *v)
{
	/* Issue a load to the target to bring it into cache. */
	*(volatile int *)v;
	return __atomic_hashed_lock(v);
}

int _atomic_xchg(int *v, int n)
{
	return __atomic_xchg(v, __atomic_setup(v), n).val;
}
EXPORT_SYMBOL(_atomic_xchg);

int _atomic_xchg_add(int *v, int i)
{
	return __atomic_xchg_add(v, __atomic_setup(v), i).val;
}
EXPORT_SYMBOL(_atomic_xchg_add);

int _atomic_xchg_add_unless(int *v, int a, int u)
{
	/*
	 * Note: argument order is switched here since it is easier
	 * to use the first argument consistently as the "old value"
	 * in the assembly, as is done for _atomic_cmpxchg().
	 */
	return __atomic_xchg_add_unless(v, __atomic_setup(v), u, a).val;
}
EXPORT_SYMBOL(_atomic_xchg_add_unless);

int _atomic_cmpxchg(int *v, int o, int n)
{
	return __atomic_cmpxchg(v, __atomic_setup(v), o, n).val;
}
EXPORT_SYMBOL(_atomic_cmpxchg);

unsigned long _atomic_or(volatile unsigned long *p, unsigned long mask)
{
	return __atomic_or((int *)p, __atomic_setup(p), mask).val;
}
EXPORT_SYMBOL(_atomic_or);

unsigned long _atomic_andn(volatile unsigned long *p, unsigned long mask)
{
	return __atomic_andn((int *)p, __atomic_setup(p), mask).val;
}
EXPORT_SYMBOL(_atomic_andn);

unsigned long _atomic_xor(volatile unsigned long *p, unsigned long mask)
{
	return __atomic_xor((int *)p, __atomic_setup(p), mask).val;
}
EXPORT_SYMBOL(_atomic_xor);


long long _atomic64_xchg(long long *v, long long n)
{
	return __atomic64_xchg(v, __atomic_setup(v), n);
}
EXPORT_SYMBOL(_atomic64_xchg);

long long _atomic64_xchg_add(long long *v, long long i)
{
	return __atomic64_xchg_add(v, __atomic_setup(v), i);
}
EXPORT_SYMBOL(_atomic64_xchg_add);

long long _atomic64_xchg_add_unless(long long *v, long long a, long long u)
{
	/*
	 * Note: argument order is switched here since it is easier
	 * to use the first argument consistently as the "old value"
	 * in the assembly, as is done for _atomic_cmpxchg().
	 */
	return __atomic64_xchg_add_unless(v, __atomic_setup(v), u, a);
}
EXPORT_SYMBOL(_atomic64_xchg_add_unless);

long long _atomic64_cmpxchg(long long *v, long long o, long long n)
{
	return __atomic64_cmpxchg(v, __atomic_setup(v), o, n);
}
EXPORT_SYMBOL(_atomic64_cmpxchg);


/*
 * If any of the atomic or futex routines hit a bad address (not in
 * the page tables at kernel PL) this routine is called.  The futex
 * routines are never used on kernel space, and the normal atomics and
 * bitops are never used on user space.  So a fault on kernel space
 * must be fatal, but a fault on userspace is a futex fault and we
 * need to return -EFAULT.  Note that the context this routine is
 * invoked in is the context of the "_atomic_xxx()" routines called
 * by the functions in this file.
 */
struct __get_user __atomic_bad_address(int __user *addr)
{
	if (unlikely(!access_ok(VERIFY_WRITE, addr, sizeof(int))))
		panic("Bad address used for kernel atomic op: %p\n", addr);
	return (struct __get_user) { .err = -EFAULT };
}


void __init __init_atomic_per_cpu(void)
{
	/* Validate power-of-two and "bigger than cpus" assumption */
	BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);

	/*
	 * On TILEPro we prefer to use a single hash-for-home
	 * page, since this means atomic operations are less
	 * likely to encounter a TLB fault and thus should
	 * in general perform faster.  You may wish to disable
	 * this in situations where few hash-for-home tiles
	 * are configured.
	 */
	BUG_ON((unsigned long)atomic_locks % PAGE_SIZE != 0);

	/* The locks must all fit on one page. */
	BUILD_BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE);

	/*
	 * We use the page offset of the atomic value's address as
	 * an index into atomic_locks, excluding the low 3 bits.
	 * That should not produce more indices than ATOMIC_HASH_SIZE.
	 */
	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
}
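The file above implements the TILEPro 32-bit atomics by hashing the target address into a page-sized table of locks and doing the read-modify-write under that lock. As a rough userspace illustration of the hashed-lock emulation idea, here is a minimal sketch assuming POSIX spinlocks; the names are hypothetical and this is not the kernel's code path:

#include <pthread.h>
#include <stdint.h>

#define HASH_SIZE 256	/* must be a power of two, like ATOMIC_HASH_SIZE */

static pthread_spinlock_t locks[HASH_SIZE];

__attribute__((constructor)) static void init_locks(void)
{
	for (int i = 0; i < HASH_SIZE; i++)
		pthread_spin_init(&locks[i], PTHREAD_PROCESS_PRIVATE);
}

/* Hash an address to one lock slot, mirroring __atomic_hashed_lock(). */
static pthread_spinlock_t *hashed_lock(volatile void *v)
{
	uintptr_t idx = ((uintptr_t)v >> 2) & (HASH_SIZE - 1);
	return &locks[idx];
}

/* Emulated xchg: a plain read-modify-write done under the hashed lock. */
int emulated_xchg(volatile int *v, int n)
{
	pthread_spinlock_t *lock = hashed_lock(v);
	int old;

	pthread_spin_lock(lock);
	old = *v;
	*v = n;
	pthread_spin_unlock(lock);
	return old;
}

Two addresses that hash to the same slot simply share a lock; correctness only needs the mapping to be deterministic, which is why the table can be sized independently of how many atomic variables exist.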
197	arch/tile/lib/atomic_asm_32.S	Normal file
@@ -0,0 +1,197 @@
/*
|
||||
* Copyright 2010 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*
|
||||
* Support routines for atomic operations. Each function takes:
|
||||
*
|
||||
* r0: address to manipulate
|
||||
* r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)
|
||||
* r2: new value to write, or for cmpxchg/add_unless, value to compare against
|
||||
* r3: (cmpxchg/xchg_add_unless) new value to write or add;
|
||||
* (atomic64 ops) high word of value to write
|
||||
* r4/r5: (cmpxchg64/add_unless64) new value to write or add
|
||||
*
|
||||
* The 32-bit routines return a "struct __get_user" so that the futex code
|
||||
* has an opportunity to return -EFAULT to the user if needed.
|
||||
* The 64-bit routines just return a "long long" with the value,
|
||||
* since they are only used from kernel space and don't expect to fault.
|
||||
* Support for 16-bit ops is included in the framework but we don't provide
|
||||
* any (x86_64 has an atomic_inc_short(), so we might want to some day).
|
||||
*
|
||||
* Note that the caller is advised to issue a suitable L1 or L2
|
||||
* prefetch on the address being manipulated to avoid extra stalls.
|
||||
* In addition, the hot path is on two icache lines, and we start with
|
||||
* a jump to the second line to make sure they are both in cache so
|
||||
* that we never stall waiting on icache fill while holding the lock.
|
||||
* (This doesn't work out with most 64-bit ops, since they consume
|
||||
* too many bundles, so may take an extra i-cache stall.)
|
||||
*
|
||||
* These routines set the INTERRUPT_CRITICAL_SECTION bit, just
|
||||
* like sys_cmpxchg(), so that NMIs like PERF_COUNT will not interrupt
|
||||
* the code, just page faults.
|
||||
*
|
||||
* If the load or store faults in a way that can be directly fixed in
|
||||
* the do_page_fault_ics() handler (e.g. a vmalloc reference) we fix it
|
||||
* directly, return to the instruction that faulted, and retry it.
|
||||
*
|
||||
* If the load or store faults in a way that potentially requires us
|
||||
* to release the atomic lock, then retry (e.g. a migrating PTE), we
|
||||
* reset the PC in do_page_fault_ics() to the "tns" instruction so
|
||||
* that on return we will reacquire the lock and restart the op. We
|
||||
* are somewhat overloading the exception_table_entry notion by doing
|
||||
* this, since those entries are not normally used for migrating PTEs.
|
||||
*
|
||||
* If the main page fault handler discovers a bad address, it will see
|
||||
* the PC pointing to the "tns" instruction (due to the earlier
|
||||
* exception_table_entry processing in do_page_fault_ics), and
|
||||
* re-reset the PC to the fault handler, atomic_bad_address(), which
|
||||
* effectively takes over from the atomic op and can either return a
|
||||
* bad "struct __get_user" (for user addresses) or can just panic (for
|
||||
* bad kernel addresses).
|
||||
*
|
||||
* Note that if the value we would store is the same as what we
|
||||
* loaded, we bypass the store. Other platforms with true atomics can
|
||||
* make the guarantee that a non-atomic __clear_bit(), for example,
|
||||
* can safely race with an atomic test_and_set_bit(); this example is
|
||||
* from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do
|
||||
* that on Tile since the "atomic" op is really just a
|
||||
* read/modify/write, and can race with the non-atomic
|
||||
* read/modify/write. However, if we can short-circuit the write when
|
||||
* it is not needed, in the atomic case, we avoid the race.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/atomic_32.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/processor.h>
|
||||
|
||||
.section .text.atomic,"ax"
|
||||
ENTRY(__start_atomic_asm_code)
|
||||
|
||||
.macro atomic_op, name, bitwidth, body
|
||||
.align 64
|
||||
STD_ENTRY_SECTION(__atomic\name, .text.atomic)
|
||||
{
|
||||
movei r24, 1
|
||||
j 4f /* branch to second cache line */
|
||||
}
|
||||
1: {
|
||||
.ifc \bitwidth,16
|
||||
lh r22, r0
|
||||
.else
|
||||
lw r22, r0
|
||||
addi r28, r0, 4
|
||||
.endif
|
||||
}
|
||||
.ifc \bitwidth,64
|
||||
lw r23, r28
|
||||
.endif
|
||||
\body /* set r24, and r25 if 64-bit */
|
||||
{
|
||||
seq r26, r22, r24
|
||||
seq r27, r23, r25
|
||||
}
|
||||
.ifc \bitwidth,64
|
||||
bbnst r27, 2f
|
||||
.endif
|
||||
bbs r26, 3f /* skip write-back if it's the same value */
|
||||
2: {
|
||||
.ifc \bitwidth,16
|
||||
sh r0, r24
|
||||
.else
|
||||
sw r0, r24
|
||||
.endif
|
||||
}
|
||||
.ifc \bitwidth,64
|
||||
sw r28, r25
|
||||
.endif
|
||||
mf
|
||||
3: {
|
||||
move r0, r22
|
||||
.ifc \bitwidth,64
|
||||
move r1, r23
|
||||
.else
|
||||
move r1, zero
|
||||
.endif
|
||||
sw ATOMIC_LOCK_REG_NAME, zero
|
||||
}
|
||||
mtspr INTERRUPT_CRITICAL_SECTION, zero
|
||||
jrp lr
|
||||
4: {
|
||||
move ATOMIC_LOCK_REG_NAME, r1
|
||||
mtspr INTERRUPT_CRITICAL_SECTION, r24
|
||||
}
|
||||
#ifndef CONFIG_SMP
|
||||
j 1b /* no atomic locks */
|
||||
#else
|
||||
{
|
||||
tns r21, ATOMIC_LOCK_REG_NAME
|
||||
moveli r23, 2048 /* maximum backoff time in cycles */
|
||||
}
|
||||
{
|
||||
bzt r21, 1b /* branch if lock acquired */
|
||||
moveli r25, 32 /* starting backoff time in cycles */
|
||||
}
|
||||
5: mtspr INTERRUPT_CRITICAL_SECTION, zero
|
||||
mfspr r26, CYCLE_LOW /* get start point for this backoff */
|
||||
6: mfspr r22, CYCLE_LOW /* test to see if we've backed off enough */
|
||||
sub r22, r22, r26
|
||||
slt r22, r22, r25
|
||||
bbst r22, 6b
|
||||
{
|
||||
mtspr INTERRUPT_CRITICAL_SECTION, r24
|
||||
shli r25, r25, 1 /* double the backoff; retry the tns */
|
||||
}
|
||||
{
|
||||
tns r21, ATOMIC_LOCK_REG_NAME
|
||||
slt r26, r23, r25 /* is the proposed backoff too big? */
|
||||
}
|
||||
{
|
||||
bzt r21, 1b /* branch if lock acquired */
|
||||
mvnz r25, r26, r23
|
||||
}
|
||||
j 5b
|
||||
#endif
|
||||
STD_ENDPROC(__atomic\name)
|
||||
.ifc \bitwidth,32
|
||||
.pushsection __ex_table,"a"
|
||||
.align 4
|
||||
.word 1b, __atomic\name
|
||||
.word 2b, __atomic\name
|
||||
.word __atomic\name, __atomic_bad_address
|
||||
.popsection
|
||||
.endif
|
||||
.endm
|
||||
|
||||
atomic_op _cmpxchg, 32, "seq r26, r22, r2; { bbns r26, 3f; move r24, r3 }"
|
||||
atomic_op _xchg, 32, "move r24, r2"
|
||||
atomic_op _xchg_add, 32, "add r24, r22, r2"
|
||||
atomic_op _xchg_add_unless, 32, \
|
||||
"sne r26, r22, r2; { bbns r26, 3f; add r24, r22, r3 }"
|
||||
atomic_op _or, 32, "or r24, r22, r2"
|
||||
atomic_op _andn, 32, "nor r2, r2, zero; and r24, r22, r2"
|
||||
atomic_op _xor, 32, "xor r24, r22, r2"
|
||||
|
||||
atomic_op 64_cmpxchg, 64, "{ seq r26, r22, r2; seq r27, r23, r3 }; \
|
||||
{ bbns r26, 3f; move r24, r4 }; { bbns r27, 3f; move r25, r5 }"
|
||||
atomic_op 64_xchg, 64, "{ move r24, r2; move r25, r3 }"
|
||||
atomic_op 64_xchg_add, 64, "{ add r24, r22, r2; add r25, r23, r3 }; \
|
||||
slt_u r26, r24, r22; add r25, r25, r26"
|
||||
atomic_op 64_xchg_add_unless, 64, \
|
||||
"{ sne r26, r22, r2; sne r27, r23, r3 }; \
|
||||
{ bbns r26, 3f; add r24, r22, r4 }; \
|
||||
{ bbns r27, 3f; add r25, r23, r5 }; \
|
||||
slt_u r26, r24, r22; add r25, r25, r26"
|
||||
|
||||
jrp lr /* happy backtracer */
|
||||
|
||||
ENTRY(__end_atomic_asm_code)
|
173	arch/tile/lib/cacheflush.c	Normal file
@@ -0,0 +1,173 @@
/*
|
||||
* Copyright 2010 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/export.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <arch/icache.h>
|
||||
#include <arch/spr_def.h>
|
||||
|
||||
|
||||
void __flush_icache_range(unsigned long start, unsigned long end)
|
||||
{
|
||||
invalidate_icache((const void *)start, end - start, PAGE_SIZE);
|
||||
}
|
||||
|
||||
|
||||
/* Force a load instruction to issue. */
|
||||
static inline void force_load(char *p)
|
||||
{
|
||||
*(volatile char *)p;
|
||||
}
|
||||
|
||||
/*
|
||||
* Flush and invalidate a VA range that is homed remotely on a single
|
||||
* core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting
|
||||
* until the memory controller holds the flushed values.
|
||||
*/
|
||||
void __attribute__((optimize("omit-frame-pointer")))
|
||||
finv_buffer_remote(void *buffer, size_t size, int hfh)
|
||||
{
|
||||
char *p, *base;
|
||||
size_t step_size, load_count;
|
||||
|
||||
/*
|
||||
* On TILEPro the striping granularity is a fixed 8KB; on
|
||||
* TILE-Gx it is configurable, and we rely on the fact that
|
||||
* the hypervisor always configures maximum striping, so that
|
||||
* bits 9 and 10 of the PA are part of the stripe function, so
|
||||
* every 512 bytes we hit a striping boundary.
|
||||
*
|
||||
*/
|
||||
#ifdef __tilegx__
|
||||
const unsigned long STRIPE_WIDTH = 512;
|
||||
#else
|
||||
const unsigned long STRIPE_WIDTH = 8192;
|
||||
#endif
|
||||
|
||||
#ifdef __tilegx__
|
||||
/*
|
||||
* On TILE-Gx, we must disable the dstream prefetcher before doing
|
||||
* a cache flush; otherwise, we could end up with data in the cache
|
||||
* that we don't want there. Note that normally we'd do an mf
|
||||
* after the SPR write to disabling the prefetcher, but we do one
|
||||
* below, before any further loads, so there's no need to do it
|
||||
* here.
|
||||
*/
|
||||
uint_reg_t old_dstream_pf = __insn_mfspr(SPR_DSTREAM_PF);
|
||||
__insn_mtspr(SPR_DSTREAM_PF, 0);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Flush and invalidate the buffer out of the local L1/L2
|
||||
* and request the home cache to flush and invalidate as well.
|
||||
*/
|
||||
__finv_buffer(buffer, size);
|
||||
|
||||
/*
|
||||
* Wait for the home cache to acknowledge that it has processed
|
||||
* all the flush-and-invalidate requests. This does not mean
|
||||
* that the flushed data has reached the memory controller yet,
|
||||
* but it does mean the home cache is processing the flushes.
|
||||
*/
|
||||
__insn_mf();
|
||||
|
||||
/*
|
||||
* Issue a load to the last cache line, which can't complete
|
||||
* until all the previously-issued flushes to the same memory
|
||||
* controller have also completed. If we weren't striping
|
||||
* memory, that one load would be sufficient, but since we may
|
||||
* be, we also need to back up to the last load issued to
|
||||
* another memory controller, which would be the point where
|
||||
* we crossed a "striping" boundary (the granularity of striping
|
||||
* across memory controllers). Keep backing up and doing this
|
||||
* until we are before the beginning of the buffer, or have
|
||||
* hit all the controllers.
|
||||
*
|
||||
* If we are flushing a hash-for-home buffer, it's even worse.
|
||||
* Each line may be homed on a different tile, and each tile
|
||||
* may have up to four lines that are on different
|
||||
* controllers. So as we walk backwards, we have to touch
|
||||
* enough cache lines to satisfy these constraints. In
|
||||
* practice this ends up being close enough to "load from
|
||||
* every cache line on a full memory stripe on each
|
||||
* controller" that we simply do that, to simplify the logic.
|
||||
*
|
||||
* On TILE-Gx the hash-for-home function is much more complex,
|
||||
* with the upshot being we can't readily guarantee we have
|
||||
* hit both entries in the 128-entry AMT that were hit by any
|
||||
* load in the entire range, so we just re-load them all.
|
||||
* With larger buffers, we may want to consider using a hypervisor
|
||||
* trap to issue loads directly to each hash-for-home tile for
|
||||
* each controller (doing it from Linux would trash the TLB).
|
||||
*/
|
||||
if (hfh) {
|
||||
step_size = L2_CACHE_BYTES;
|
||||
#ifdef __tilegx__
|
||||
load_count = (size + L2_CACHE_BYTES - 1) / L2_CACHE_BYTES;
|
||||
#else
|
||||
load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) *
|
||||
(1 << CHIP_LOG_NUM_MSHIMS());
|
||||
#endif
|
||||
} else {
|
||||
step_size = STRIPE_WIDTH;
|
||||
load_count = (1 << CHIP_LOG_NUM_MSHIMS());
|
||||
}
|
||||
|
||||
/* Load the last byte of the buffer. */
|
||||
p = (char *)buffer + size - 1;
|
||||
force_load(p);
|
||||
|
||||
/* Bump down to the end of the previous stripe or cache line. */
|
||||
p -= step_size;
|
||||
p = (char *)((unsigned long)p | (step_size - 1));
|
||||
|
||||
/* Figure out how far back we need to go. */
|
||||
base = p - (step_size * (load_count - 2));
|
||||
if ((unsigned long)base < (unsigned long)buffer)
|
||||
base = buffer;
|
||||
|
||||
/*
|
||||
* Fire all the loads we need. The MAF only has eight entries
|
||||
* so we can have at most eight outstanding loads, so we
|
||||
* unroll by that amount.
|
||||
*/
|
||||
#pragma unroll 8
|
||||
for (; p >= base; p -= step_size)
|
||||
force_load(p);
|
||||
|
||||
/*
|
||||
* Repeat, but with finv's instead of loads, to get rid of the
|
||||
* data we just loaded into our own cache and the old home L3.
|
||||
* No need to unroll since finv's don't target a register.
|
||||
* The finv's are guaranteed not to actually flush the data in
|
||||
* the buffer back to their home, since we just read it, so the
|
||||
* lines are clean in cache; we will only invalidate those lines.
|
||||
*/
|
||||
p = (char *)buffer + size - 1;
|
||||
__insn_finv(p);
|
||||
p -= step_size;
|
||||
p = (char *)((unsigned long)p | (step_size - 1));
|
||||
for (; p >= base; p -= step_size)
|
||||
__insn_finv(p);
|
||||
|
||||
/* Wait for these finv's (and thus the first finvs) to be done. */
|
||||
__insn_mf();
|
||||
|
||||
#ifdef __tilegx__
|
||||
/* Reenable the prefetcher. */
|
||||
__insn_mtspr(SPR_DSTREAM_PF, old_dstream_pf);
|
||||
#endif
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(finv_buffer_remote);
|
89	arch/tile/lib/checksum.c	Normal file
@@ -0,0 +1,89 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 * Support code for the main lib/checksum.c.
 */

#include <net/checksum.h>
#include <linux/module.h>

__wsum do_csum(const unsigned char *buff, int len)
{
	int odd, count;
	unsigned long result = 0;

	if (len <= 0)
		goto out;
	odd = 1 & (unsigned long) buff;
	if (odd) {
		result = (*buff << 8);
		len--;
		buff++;
	}
	count = len >> 1;		/* nr of 16-bit words.. */
	if (count) {
		if (2 & (unsigned long) buff) {
			result += *(const unsigned short *)buff;
			count--;
			len -= 2;
			buff += 2;
		}
		count >>= 1;		/* nr of 32-bit words.. */
		if (count) {
#ifdef __tilegx__
			if (4 & (unsigned long) buff) {
				unsigned int w = *(const unsigned int *)buff;
				result = __insn_v2sadau(result, w, 0);
				count--;
				len -= 4;
				buff += 4;
			}
			count >>= 1;		/* nr of 64-bit words.. */
#endif

			/*
			 * This algorithm could wrap around for very
			 * large buffers, but those should be impossible.
			 */
			BUG_ON(count >= 65530);

			while (count) {
				unsigned long w = *(const unsigned long *)buff;
				count--;
				buff += sizeof(w);
#ifdef __tilegx__
				result = __insn_v2sadau(result, w, 0);
#else
				result = __insn_sadah_u(result, w, 0);
#endif
			}
#ifdef __tilegx__
			if (len & 4) {
				unsigned int w = *(const unsigned int *)buff;
				result = __insn_v2sadau(result, w, 0);
				buff += 4;
			}
#endif
		}
		if (len & 2) {
			result += *(const unsigned short *) buff;
			buff += 2;
		}
	}
	if (len & 1)
		result += *buff;
	result = csum_long(result);
	if (odd)
		result = swab16(result);
out:
	return result;
}
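do_csum() above accumulates a ones'-complement style partial sum with TILE SIMD add instructions. For readers unfamiliar with the technique, a generic, unoptimized sketch of 16-bit ones'-complement summation is shown below; it is an illustration only, and the byte-order and folding details of the tile code differ:

#include <stddef.h>
#include <stdint.h>

/* Textbook (RFC 1071 style) 16-bit ones'-complement sum of a buffer. */
static uint16_t ones_complement_sum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {			/* add 16-bit words */
		sum += (uint32_t)buf[0] << 8 | buf[1];
		buf += 2;
		len -= 2;
	}
	if (len)				/* trailing odd byte */
		sum += (uint32_t)buf[0] << 8;
	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}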
54	arch/tile/lib/cpumask.c	Normal file
@@ -0,0 +1,54 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/cpumask.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/smp.h>
#include <linux/export.h>

/*
 * Allow cropping out bits beyond the end of the array.
 * Move to "lib" directory if more clients want to use this routine.
 */
int bitmap_parselist_crop(const char *bp, unsigned long *maskp, int nmaskbits)
{
	unsigned a, b;

	bitmap_zero(maskp, nmaskbits);
	do {
		if (!isdigit(*bp))
			return -EINVAL;
		a = simple_strtoul(bp, (char **)&bp, 10);
		b = a;
		if (*bp == '-') {
			bp++;
			if (!isdigit(*bp))
				return -EINVAL;
			b = simple_strtoul(bp, (char **)&bp, 10);
		}
		if (!(a <= b))
			return -EINVAL;
		if (b >= nmaskbits)
			b = nmaskbits-1;
		while (a <= b) {
			set_bit(a, maskp);
			a++;
		}
		if (*bp == ',')
			bp++;
	} while (*bp != '\0' && *bp != '\n');
	return 0;
}
EXPORT_SYMBOL(bitmap_parselist_crop);
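bitmap_parselist_crop() above parses a comma-separated list of decimal ranges ("1-3,5,...") and, rather than rejecting them, silently crops ranges that run past nmaskbits. A small userspace sketch of the same parsing behaviour, limited to a single word for brevity (hypothetical names, not kernel code):

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

static int parselist_crop(const char *bp, unsigned long *mask, int nbits)
{
	unsigned long a, b;

	*mask = 0;
	do {
		if (!isdigit((unsigned char)*bp))
			return -1;
		a = strtoul(bp, (char **)&bp, 10);
		b = a;
		if (*bp == '-') {
			bp++;
			if (!isdigit((unsigned char)*bp))
				return -1;
			b = strtoul(bp, (char **)&bp, 10);
		}
		if (a > b)
			return -1;
		if (b >= (unsigned long)nbits)
			b = nbits - 1;		/* crop instead of failing */
		for (; a <= b; a++)
			*mask |= 1UL << a;	/* single-word set_bit() */
		if (*bp == ',')
			bp++;
	} while (*bp != '\0' && *bp != '\n');
	return 0;
}

int main(void)
{
	unsigned long mask;

	/* With only 16 bits, "10-40" is cropped to 10-15. */
	if (parselist_crop("1-3,5,10-40", &mask, 16) == 0)
		printf("mask = 0x%lx\n", mask);	/* prints 0xfc2e */
	return 0;
}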
45	arch/tile/lib/delay.c	Normal file
@@ -0,0 +1,45 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/module.h>
#include <linux/delay.h>
#include <linux/thread_info.h>
#include <asm/timex.h>

void __udelay(unsigned long usecs)
{
	if (usecs > ULONG_MAX / 1000) {
		WARN_ON_ONCE(usecs > ULONG_MAX / 1000);
		usecs = ULONG_MAX / 1000;
	}
	__ndelay(usecs * 1000);
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
	cycles_t target = get_cycles();
	target += ns2cycles(nsecs);
	while (get_cycles() < target)
		cpu_relax();
}
EXPORT_SYMBOL(__ndelay);

void __delay(unsigned long cycles)
{
	cycles_t target = get_cycles() + cycles;
	while (get_cycles() < target)
		cpu_relax();
}
EXPORT_SYMBOL(__delay);
94	arch/tile/lib/exports.c	Normal file
@@ -0,0 +1,94 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 *
 * Exports from assembler code and from libtile-cc.
 */

#include <linux/module.h>

/* arch/tile/lib/usercopy.S */
#include <linux/uaccess.h>
EXPORT_SYMBOL(strnlen_user_asm);
EXPORT_SYMBOL(strncpy_from_user_asm);
EXPORT_SYMBOL(clear_user_asm);
EXPORT_SYMBOL(flush_user_asm);
EXPORT_SYMBOL(finv_user_asm);

/* arch/tile/kernel/entry.S */
#include <linux/kernel.h>
#include <asm/processor.h>
EXPORT_SYMBOL(current_text_addr);
EXPORT_SYMBOL(dump_stack);

/* arch/tile/kernel/head.S */
EXPORT_SYMBOL(empty_zero_page);

#ifdef CONFIG_FUNCTION_TRACER
/* arch/tile/kernel/mcount_64.S */
#include <asm/ftrace.h>
EXPORT_SYMBOL(__mcount);
#endif /* CONFIG_FUNCTION_TRACER */

/* arch/tile/lib/, various memcpy files */
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__copy_to_user_inatomic);
EXPORT_SYMBOL(__copy_from_user_inatomic);
EXPORT_SYMBOL(__copy_from_user_zeroing);
#ifdef __tilegx__
EXPORT_SYMBOL(__copy_in_user_inatomic);
#endif

/* hypervisor glue */
#include <hv/hypervisor.h>
EXPORT_SYMBOL(hv_dev_open);
EXPORT_SYMBOL(hv_dev_pread);
EXPORT_SYMBOL(hv_dev_pwrite);
EXPORT_SYMBOL(hv_dev_preada);
EXPORT_SYMBOL(hv_dev_pwritea);
EXPORT_SYMBOL(hv_dev_poll);
EXPORT_SYMBOL(hv_dev_poll_cancel);
EXPORT_SYMBOL(hv_dev_close);
EXPORT_SYMBOL(hv_sysconf);
EXPORT_SYMBOL(hv_confstr);
EXPORT_SYMBOL(hv_get_rtc);
EXPORT_SYMBOL(hv_set_rtc);

/* libgcc.a */
uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
EXPORT_SYMBOL(__udivsi3);
int32_t __divsi3(int32_t dividend, int32_t divisor);
EXPORT_SYMBOL(__divsi3);
uint64_t __udivdi3(uint64_t dividend, uint64_t divisor);
EXPORT_SYMBOL(__udivdi3);
int64_t __divdi3(int64_t dividend, int64_t divisor);
EXPORT_SYMBOL(__divdi3);
uint32_t __umodsi3(uint32_t dividend, uint32_t divisor);
EXPORT_SYMBOL(__umodsi3);
int32_t __modsi3(int32_t dividend, int32_t divisor);
EXPORT_SYMBOL(__modsi3);
uint64_t __umoddi3(uint64_t dividend, uint64_t divisor);
EXPORT_SYMBOL(__umoddi3);
int64_t __moddi3(int64_t dividend, int64_t divisor);
EXPORT_SYMBOL(__moddi3);
#ifndef __tilegx__
int64_t __muldi3(int64_t, int64_t);
EXPORT_SYMBOL(__muldi3);
uint64_t __lshrdi3(uint64_t, unsigned int);
EXPORT_SYMBOL(__lshrdi3);
uint64_t __ashrdi3(uint64_t, unsigned int);
EXPORT_SYMBOL(__ashrdi3);
uint64_t __ashldi3(uint64_t, unsigned int);
EXPORT_SYMBOL(__ashldi3);
int __ffsdi2(uint64_t);
EXPORT_SYMBOL(__ffsdi2);
#endif
71	arch/tile/lib/memchr_32.c	Normal file
@@ -0,0 +1,71 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/module.h>

void *memchr(const void *s, int c, size_t n)
{
	const uint32_t *last_word_ptr;
	const uint32_t *p;
	const char *last_byte_ptr;
	uintptr_t s_int;
	uint32_t goal, before_mask, v, bits;
	char *ret;

	if (__builtin_expect(n == 0, 0)) {
		/* Don't dereference any memory if the array is empty. */
		return NULL;
	}

	/* Get an aligned pointer. */
	s_int = (uintptr_t) s;
	p = (const uint32_t *)(s_int & -4);

	/* Create four copies of the byte for which we are looking. */
	goal = 0x01010101 * (uint8_t) c;

	/* Read the first word, but munge it so that bytes before the array
	 * will not match goal.
	 *
	 * Note that this shift count expression works because we know
	 * shift counts are taken mod 32.
	 */
	before_mask = (1 << (s_int << 3)) - 1;
	v = (*p | before_mask) ^ (goal & before_mask);

	/* Compute the address of the last byte. */
	last_byte_ptr = (const char *)s + n - 1;

	/* Compute the address of the word containing the last byte. */
	last_word_ptr = (const uint32_t *)((uintptr_t) last_byte_ptr & -4);

	while ((bits = __insn_seqb(v, goal)) == 0) {
		if (__builtin_expect(p == last_word_ptr, 0)) {
			/* We already read the last word in the array,
			 * so give up.
			 */
			return NULL;
		}
		v = *++p;
	}

	/* We found a match, but it might be in a byte past the end
	 * of the array.
	 */
	ret = ((char *)p) + (__insn_ctz(bits) >> 3);
	return (ret <= last_byte_ptr) ? ret : NULL;
}
EXPORT_SYMBOL(memchr);
69	arch/tile/lib/memchr_64.c	Normal file
@@ -0,0 +1,69 @@
/*
 * Copyright 2011 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/module.h>
#include "string-endian.h"

void *memchr(const void *s, int c, size_t n)
{
	const uint64_t *last_word_ptr;
	const uint64_t *p;
	const char *last_byte_ptr;
	uintptr_t s_int;
	uint64_t goal, before_mask, v, bits;
	char *ret;

	if (__builtin_expect(n == 0, 0)) {
		/* Don't dereference any memory if the array is empty. */
		return NULL;
	}

	/* Get an aligned pointer. */
	s_int = (uintptr_t) s;
	p = (const uint64_t *)(s_int & -8);

	/* Create eight copies of the byte for which we are looking. */
	goal = copy_byte(c);

	/* Read the first word, but munge it so that bytes before the array
	 * will not match goal.
	 */
	before_mask = MASK(s_int);
	v = (*p | before_mask) ^ (goal & before_mask);

	/* Compute the address of the last byte. */
	last_byte_ptr = (const char *)s + n - 1;

	/* Compute the address of the word containing the last byte. */
	last_word_ptr = (const uint64_t *)((uintptr_t) last_byte_ptr & -8);

	while ((bits = __insn_v1cmpeq(v, goal)) == 0) {
		if (__builtin_expect(p == last_word_ptr, 0)) {
			/* We already read the last word in the array,
			 * so give up.
			 */
			return NULL;
		}
		v = *++p;
	}

	/* We found a match, but it might be in a byte past the end
	 * of the array.
	 */
	ret = ((char *)p) + (CFZ(bits) >> 3);
	return (ret <= last_byte_ptr) ? ret : NULL;
}
EXPORT_SYMBOL(memchr);
559	arch/tile/lib/memcpy_32.S	Normal file
@@ -0,0 +1,559 @@
/*
|
||||
* Copyright 2010 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <arch/chip.h>
|
||||
|
||||
|
||||
/*
|
||||
* This file shares the implementation of the userspace memcpy and
|
||||
* the kernel's memcpy, copy_to_user and copy_from_user.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#define IS_MEMCPY 0
|
||||
#define IS_COPY_FROM_USER 1
|
||||
#define IS_COPY_FROM_USER_ZEROING 2
|
||||
#define IS_COPY_TO_USER -1
|
||||
|
||||
.section .text.memcpy_common, "ax"
|
||||
.align 64
|
||||
|
||||
/* Use this to preface each bundle that can cause an exception so
|
||||
* the kernel can clean up properly. The special cleanup code should
|
||||
* not use these, since it knows what it is doing.
|
||||
*/
|
||||
#define EX \
|
||||
.pushsection __ex_table, "a"; \
|
||||
.align 4; \
|
||||
.word 9f, memcpy_common_fixup; \
|
||||
.popsection; \
|
||||
9
|
||||
|
||||
|
||||
/* __copy_from_user_inatomic takes the kernel target address in r0,
|
||||
* the user source in r1, and the bytes to copy in r2.
|
||||
* It returns the number of uncopiable bytes (hopefully zero) in r0.
|
||||
*/
|
||||
ENTRY(__copy_from_user_inatomic)
|
||||
.type __copy_from_user_inatomic, @function
|
||||
FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \
|
||||
.text.memcpy_common, \
|
||||
.Lend_memcpy_common - __copy_from_user_inatomic)
|
||||
{ movei r29, IS_COPY_FROM_USER; j memcpy_common }
|
||||
.size __copy_from_user_inatomic, . - __copy_from_user_inatomic
|
||||
|
||||
/* __copy_from_user_zeroing is like __copy_from_user_inatomic, but
|
||||
* any uncopiable bytes are zeroed in the target.
|
||||
*/
|
||||
ENTRY(__copy_from_user_zeroing)
|
||||
.type __copy_from_user_zeroing, @function
|
||||
FEEDBACK_REENTER(__copy_from_user_inatomic)
|
||||
{ movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common }
|
||||
.size __copy_from_user_zeroing, . - __copy_from_user_zeroing
|
||||
|
||||
/* __copy_to_user_inatomic takes the user target address in r0,
|
||||
* the kernel source in r1, and the bytes to copy in r2.
|
||||
* It returns the number of uncopiable bytes (hopefully zero) in r0.
|
||||
*/
|
||||
ENTRY(__copy_to_user_inatomic)
|
||||
.type __copy_to_user_inatomic, @function
|
||||
FEEDBACK_REENTER(__copy_from_user_inatomic)
|
||||
{ movei r29, IS_COPY_TO_USER; j memcpy_common }
|
||||
.size __copy_to_user_inatomic, . - __copy_to_user_inatomic
|
||||
|
||||
ENTRY(memcpy)
|
||||
.type memcpy, @function
|
||||
FEEDBACK_REENTER(__copy_from_user_inatomic)
|
||||
{ movei r29, IS_MEMCPY }
|
||||
.size memcpy, . - memcpy
|
||||
/* Fall through */
|
||||
|
||||
.type memcpy_common, @function
|
||||
memcpy_common:
|
||||
/* On entry, r29 holds one of the IS_* macro values from above. */
|
||||
|
||||
|
||||
/* r0 is the dest, r1 is the source, r2 is the size. */
|
||||
|
||||
/* Save aside original dest so we can return it at the end. */
|
||||
{ sw sp, lr; move r23, r0; or r4, r0, r1 }
|
||||
|
||||
/* Check for an empty size. */
|
||||
{ bz r2, .Ldone; andi r4, r4, 3 }
|
||||
|
||||
/* Save aside original values in case of a fault. */
|
||||
{ move r24, r1; move r25, r2 }
|
||||
move r27, lr
|
||||
|
||||
/* Check for an unaligned source or dest. */
|
||||
{ bnz r4, .Lcopy_unaligned_maybe_many; addli r4, r2, -256 }
|
||||
|
||||
.Lcheck_aligned_copy_size:
|
||||
/* If we are copying < 256 bytes, branch to simple case. */
|
||||
{ blzt r4, .Lcopy_8_check; slti_u r8, r2, 8 }
|
||||
|
||||
/* Copying >= 256 bytes, so jump to complex prefetching loop. */
|
||||
{ andi r6, r1, 63; j .Lcopy_many }
|
||||
|
||||
/*
|
||||
*
|
||||
* Aligned 4 byte at a time copy loop
|
||||
*
|
||||
*/
|
||||
|
||||
.Lcopy_8_loop:
|
||||
/* Copy two words at a time to hide load latency. */
|
||||
EX: { lw r3, r1; addi r1, r1, 4; slti_u r8, r2, 16 }
|
||||
EX: { lw r4, r1; addi r1, r1, 4 }
|
||||
EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
|
||||
EX: { sw r0, r4; addi r0, r0, 4; addi r2, r2, -4 }
|
||||
.Lcopy_8_check:
|
||||
{ bzt r8, .Lcopy_8_loop; slti_u r4, r2, 4 }
|
||||
|
||||
/* Copy odd leftover word, if any. */
|
||||
{ bnzt r4, .Lcheck_odd_stragglers }
|
||||
EX: { lw r3, r1; addi r1, r1, 4 }
|
||||
EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
|
||||
|
||||
.Lcheck_odd_stragglers:
|
||||
{ bnz r2, .Lcopy_unaligned_few }
|
||||
|
||||
.Ldone:
|
||||
/* For memcpy return original dest address, else zero. */
|
||||
{ mz r0, r29, r23; jrp lr }
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
* Prefetching multiple cache line copy handler (for large transfers).
|
||||
*
|
||||
*/
|
||||
|
||||
/* Copy words until r1 is cache-line-aligned. */
|
||||
.Lalign_loop:
|
||||
EX: { lw r3, r1; addi r1, r1, 4 }
|
||||
{ andi r6, r1, 63 }
|
||||
EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
|
||||
.Lcopy_many:
|
||||
{ bnzt r6, .Lalign_loop; addi r9, r0, 63 }
|
||||
|
||||
{ addi r3, r1, 60; andi r9, r9, -64 }
|
||||
|
||||
/* No need to prefetch dst, we'll just do the wh64
|
||||
* right before we copy a line.
|
||||
*/
|
||||
EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 }
|
||||
/* Intentionally stall for a few cycles to leave L2 cache alone. */
|
||||
{ bnzt zero, .; move r27, lr }
|
||||
EX: { lw r6, r3; addi r3, r3, 64 }
|
||||
/* Intentionally stall for a few cycles to leave L2 cache alone. */
|
||||
{ bnzt zero, . }
|
||||
EX: { lw r7, r3; addi r3, r3, 64 }
|
||||
/* Intentionally stall for a few cycles to leave L2 cache alone. */
|
||||
{ bz zero, .Lbig_loop2 }
|
||||
|
||||
/* On entry to this loop:
|
||||
* - r0 points to the start of dst line 0
|
||||
* - r1 points to start of src line 0
|
||||
* - r2 >= (256 - 60), only the first time the loop trips.
|
||||
* - r3 contains r1 + 128 + 60 [pointer to end of source line 2]
|
||||
* This is our prefetch address. When we get near the end
|
||||
* rather than prefetching off the end this is changed to point
|
||||
* to some "safe" recently loaded address.
|
||||
* - r5 contains *(r1 + 60) [i.e. last word of source line 0]
|
||||
* - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1]
|
||||
* - r9 contains ((r0 + 63) & -64)
|
||||
* [start of next dst cache line.]
|
||||
*/
|
||||
|
||||
.Lbig_loop:
|
||||
{ jal .Lcopy_line2; add r15, r1, r2 }
|
||||
|
||||
.Lbig_loop2:
|
||||
/* Copy line 0, first stalling until r5 is ready. */
|
||||
EX: { move r12, r5; lw r16, r1 }
|
||||
{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
|
||||
/* Prefetch several lines ahead. */
|
||||
EX: { lw r5, r3; addi r3, r3, 64 }
|
||||
{ jal .Lcopy_line }
|
||||
|
||||
/* Copy line 1, first stalling until r6 is ready. */
|
||||
EX: { move r12, r6; lw r16, r1 }
|
||||
{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
|
||||
/* Prefetch several lines ahead. */
|
||||
EX: { lw r6, r3; addi r3, r3, 64 }
|
||||
{ jal .Lcopy_line }
|
||||
|
||||
/* Copy line 2, first stalling until r7 is ready. */
|
||||
EX: { move r12, r7; lw r16, r1 }
|
||||
{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
|
||||
/* Prefetch several lines ahead. */
|
||||
EX: { lw r7, r3; addi r3, r3, 64 }
|
||||
/* Use up a caches-busy cycle by jumping back to the top of the
|
||||
* loop. Might as well get it out of the way now.
|
||||
*/
|
||||
{ j .Lbig_loop }
|
||||
|
||||
|
||||
/* On entry:
|
||||
* - r0 points to the destination line.
|
||||
* - r1 points to the source line.
|
||||
* - r3 is the next prefetch address.
|
||||
* - r9 holds the last address used for wh64.
|
||||
* - r12 = WORD_15
|
||||
* - r16 = WORD_0.
|
||||
* - r17 == r1 + 16.
|
||||
* - r27 holds saved lr to restore.
|
||||
*
|
||||
* On exit:
|
||||
* - r0 is incremented by 64.
|
||||
* - r1 is incremented by 64, unless that would point to a word
|
||||
* beyond the end of the source array, in which case it is redirected
|
||||
* to point to an arbitrary word already in the cache.
|
||||
* - r2 is decremented by 64.
|
||||
* - r3 is unchanged, unless it points to a word beyond the
|
||||
* end of the source array, in which case it is redirected
|
||||
* to point to an arbitrary word already in the cache.
|
||||
* Redirecting is OK since if we are that close to the end
|
||||
* of the array we will not come back to this subroutine
|
||||
* and use the contents of the prefetched address.
|
||||
* - r4 is nonzero iff r2 >= 64.
|
||||
* - r9 is incremented by 64, unless it points beyond the
|
||||
* end of the last full destination cache line, in which
|
||||
* case it is redirected to a "safe address" that can be
|
||||
* clobbered (sp - 64)
|
||||
* - lr contains the value in r27.
|
||||
*/
|
||||
|
||||
/* r26 unused */
|
||||
|
||||
.Lcopy_line:
|
||||
/* TODO: when r3 goes past the end, we would like to redirect it
|
||||
* to prefetch the last partial cache line (if any) just once, for the
|
||||
* benefit of the final cleanup loop. But we don't want to
|
||||
* prefetch that line more than once, or subsequent prefetches
|
||||
* will go into the RTF. But then .Lbig_loop should unconditionally
|
||||
* branch to top of loop to execute final prefetch, and its
|
||||
* nop should become a conditional branch.
|
||||
*/
|
||||
|
||||
/* We need two non-memory cycles here to cover the resources
|
||||
* used by the loads initiated by the caller.
|
||||
*/
|
||||
{ add r15, r1, r2 }
|
||||
.Lcopy_line2:
|
||||
{ slt_u r13, r3, r15; addi r17, r1, 16 }
|
||||
|
||||
/* NOTE: this will stall for one cycle as L1 is busy. */
|
||||
|
||||
/* Fill second L1D line. */
|
||||
EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
|
||||
|
||||
/* Prepare destination line for writing. */
|
||||
EX: { wh64 r9; addi r9, r9, 64 }
|
||||
/* Load seven words that are L1D hits to cover wh64 L2 usage. */
|
||||
|
||||
/* Load the three remaining words from the last L1D line, which
|
||||
* we know has already filled the L1D.
|
||||
*/
|
||||
EX: { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */
|
||||
EX: { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */
|
||||
EX: { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */
|
||||
|
||||
/* Load the three remaining words from the first L1D line, first
|
||||
* stalling until it has filled by "looking at" r16.
|
||||
*/
|
||||
EX: { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */
|
||||
EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */
|
||||
EX: { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */
|
||||
|
||||
/* Load second word from the second L1D line, first
|
||||
* stalling until it has filled by "looking at" r17.
|
||||
*/
|
||||
EX: { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */
|
||||
|
||||
/* Store last word to the destination line, potentially dirtying it
|
||||
* for the first time, which keeps the L2 busy for two cycles.
|
||||
*/
|
||||
EX: { sw r10, r12 } /* store(WORD_15) */
|
||||
|
||||
/* Use two L1D hits to cover the sw L2 access above. */
|
||||
EX: { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */
|
||||
EX: { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */
|
||||
|
||||
/* Fill third L1D line. */
|
||||
EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */
|
||||
|
||||
/* Store first L1D line. */
|
||||
EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
|
||||
EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
|
||||
EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
|
||||
EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
|
||||
/* Store second L1D line. */
|
||||
EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
|
||||
EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */
|
||||
EX: { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */
|
||||
EX: { sw r0, r12; addi r0, r0, 4 } /* store(WORD_7) */
|
||||
|
||||
EX: { lw r13, r1; addi r1, r1, 4; move zero, r18 } /* r13 = WORD_9 */
|
||||
EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */
|
||||
EX: { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */
|
||||
|
||||
/* Store third L1D line. */
|
||||
EX: { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */
|
||||
EX: { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */
|
||||
EX: { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */
|
||||
EX: { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */
|
||||
|
||||
/* Store rest of fourth L1D line. */
|
||||
EX: { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */
|
||||
{
|
||||
EX: sw r0, r8 /* store(WORD_13) */
|
||||
addi r0, r0, 4
|
||||
/* Will r2 be > 64 after we subtract 64 below? */
|
||||
shri r4, r2, 7
|
||||
}
|
||||
{
|
||||
EX: sw r0, r11 /* store(WORD_14) */
|
||||
addi r0, r0, 8
|
||||
/* Record 64 bytes successfully copied. */
|
||||
addi r2, r2, -64
|
||||
}
|
||||
|
||||
{ jrp lr; move lr, r27 }
|
||||
|
||||
/* Convey to the backtrace library that the stack frame is size
|
||||
* zero, and the real return address is on the stack rather than
|
||||
* in 'lr'.
|
||||
*/
|
||||
{ info 8 }
|
||||
|
||||
.align 64
|
||||
.Lcopy_unaligned_maybe_many:
|
||||
/* Skip the setup overhead if we aren't copying many bytes. */
|
||||
{ slti_u r8, r2, 20; sub r4, zero, r0 }
|
||||
{ bnzt r8, .Lcopy_unaligned_few; andi r4, r4, 3 }
|
||||
{ bz r4, .Ldest_is_word_aligned; add r18, r1, r2 }
|
||||
|
||||
/*
|
||||
*
|
||||
* unaligned 4 byte at a time copy handler.
|
||||
*
|
||||
*/
|
||||
|
||||
/* Copy single bytes until r0 == 0 mod 4, so we can store words. */
|
||||
.Lalign_dest_loop:
|
||||
EX: { lb_u r3, r1; addi r1, r1, 1; addi r4, r4, -1 }
|
||||
EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
|
||||
{ bnzt r4, .Lalign_dest_loop; andi r3, r1, 3 }
|
||||
|
||||
/* If source and dest are now *both* aligned, do an aligned copy. */
|
||||
{ bz r3, .Lcheck_aligned_copy_size; addli r4, r2, -256 }
|
||||
|
||||
.Ldest_is_word_aligned:
|
||||
|
||||
EX: { andi r8, r0, 63; lwadd_na r6, r1, 4}
|
||||
{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
|
||||
|
||||
/* This copies unaligned words until either there are fewer
|
||||
* than 4 bytes left to copy, or until the destination pointer
|
||||
* is cache-aligned, whichever comes first.
|
||||
*
|
||||
* On entry:
|
||||
* - r0 is the next store address.
|
||||
* - r1 points 4 bytes past the load address corresponding to r0.
|
||||
* - r2 >= 4
|
||||
* - r6 is the next aligned word loaded.
|
||||
*/
|
||||
.Lcopy_unaligned_src_words:
|
||||
EX: { lwadd_na r7, r1, 4; slti_u r8, r2, 4 + 4 }
|
||||
/* stall */
|
||||
{ dword_align r6, r7, r1; slti_u r9, r2, 64 + 4 }
|
||||
EX: { swadd r0, r6, 4; addi r2, r2, -4 }
|
||||
{ bnz r8, .Lcleanup_unaligned_words; andi r8, r0, 63 }
|
||||
{ bnzt r8, .Lcopy_unaligned_src_words; move r6, r7 }
|
||||
|
||||
/* On entry:
|
||||
* - r0 is the next store address.
|
||||
* - r1 points 4 bytes past the load address corresponding to r0.
|
||||
* - r2 >= 4 (# of bytes left to store).
|
||||
* - r6 is the next aligned src word value.
|
||||
* - r9 = (r2 < 64U).
|
||||
* - r18 points one byte past the end of source memory.
|
||||
*/
|
||||
.Ldest_is_L2_line_aligned:
|
||||
|
||||
{
|
||||
/* Not a full cache line remains. */
|
||||
bnz r9, .Lcleanup_unaligned_words
|
||||
move r7, r6
|
||||
}
|
||||
|
||||
/* r2 >= 64 */
|
||||
|
||||
/* Kick off two prefetches, but don't go past the end. */
|
||||
{ addi r3, r1, 63 - 4; addi r8, r1, 64 + 63 - 4 }
|
||||
{ prefetch r3; move r3, r8; slt_u r8, r8, r18 }
|
||||
{ mvz r3, r8, r1; addi r8, r3, 64 }
|
||||
{ prefetch r3; move r3, r8; slt_u r8, r8, r18 }
|
||||
{ mvz r3, r8, r1; movei r17, 0 }
|
||||
|
||||
.Lcopy_unaligned_line:
|
||||
/* Prefetch another line. */
|
||||
{ prefetch r3; addi r15, r1, 60; addi r3, r3, 64 }
|
||||
/* Fire off a load of the last word we are about to copy. */
|
||||
EX: { lw_na r15, r15; slt_u r8, r3, r18 }
|
||||
|
||||
EX: { mvz r3, r8, r1; wh64 r0 }
|
||||
|
||||
/* This loop runs twice.
|
||||
*
|
||||
* On entry:
|
||||
* - r17 is even before the first iteration, and odd before
|
||||
* the second. It is incremented inside the loop. Encountering
|
||||
* an even value at the end of the loop makes it stop.
|
||||
*/
|
||||
.Lcopy_half_an_unaligned_line:
|
||||
EX: {
|
||||
/* Stall until the last byte is ready. In the steady state this
|
||||
* guarantees all words to load below will be in the L2 cache, which
|
||||
* avoids shunting the loads to the RTF.
|
||||
*/
|
||||
move zero, r15
|
||||
lwadd_na r7, r1, 16
|
||||
}
|
||||
EX: { lwadd_na r11, r1, 12 }
|
||||
EX: { lwadd_na r14, r1, -24 }
|
||||
EX: { lwadd_na r8, r1, 4 }
|
||||
EX: { lwadd_na r9, r1, 4 }
|
||||
EX: {
|
||||
lwadd_na r10, r1, 8
|
||||
/* r16 = (r2 < 64), after we subtract 32 from r2 below. */
|
||||
slti_u r16, r2, 64 + 32
|
||||
}
|
||||
EX: { lwadd_na r12, r1, 4; addi r17, r17, 1 }
|
||||
EX: { lwadd_na r13, r1, 8; dword_align r6, r7, r1 }
|
||||
EX: { swadd r0, r6, 4; dword_align r7, r8, r1 }
|
||||
EX: { swadd r0, r7, 4; dword_align r8, r9, r1 }
|
||||
EX: { swadd r0, r8, 4; dword_align r9, r10, r1 }
|
||||
EX: { swadd r0, r9, 4; dword_align r10, r11, r1 }
|
||||
EX: { swadd r0, r10, 4; dword_align r11, r12, r1 }
|
||||
EX: { swadd r0, r11, 4; dword_align r12, r13, r1 }
|
||||
EX: { swadd r0, r12, 4; dword_align r13, r14, r1 }
|
||||
EX: { swadd r0, r13, 4; addi r2, r2, -32 }
|
||||
{ move r6, r14; bbst r17, .Lcopy_half_an_unaligned_line }
|
||||
|
||||
{ bzt r16, .Lcopy_unaligned_line; move r7, r6 }
|
||||
|
||||
/* On entry:
|
||||
* - r0 is the next store address.
|
||||
* - r1 points 4 bytes past the load address corresponding to r0.
|
||||
* - r2 >= 0 (# of bytes left to store).
|
||||
* - r7 is the next aligned src word value.
|
||||
*/
|
||||
.Lcleanup_unaligned_words:
|
||||
/* Handle any trailing bytes. */
|
||||
{ bz r2, .Lcopy_unaligned_done; slti_u r8, r2, 4 }
|
||||
{ bzt r8, .Lcopy_unaligned_src_words; move r6, r7 }
|
||||
|
||||
/* Move r1 back to the point where it corresponds to r0. */
|
||||
{ addi r1, r1, -4 }
|
||||
|
||||
/* Fall through */
|
||||
|
||||
/*
|
||||
*
|
||||
* 1 byte at a time copy handler.
|
||||
*
|
||||
*/
|
||||
|
||||
.Lcopy_unaligned_few:
|
||||
EX: { lb_u r3, r1; addi r1, r1, 1 }
|
||||
EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
|
||||
{ bnzt r2, .Lcopy_unaligned_few }
|
||||
|
||||
.Lcopy_unaligned_done:
|
||||
|
||||
/* For memcpy return original dest address, else zero. */
|
||||
{ mz r0, r29, r23; jrp lr }
|
||||
|
||||
.Lend_memcpy_common:
|
||||
.size memcpy_common, .Lend_memcpy_common - memcpy_common
|
||||
|
||||
.section .fixup,"ax"
|
||||
memcpy_common_fixup:
|
||||
.type memcpy_common_fixup, @function
|
||||
|
||||
/* Skip any bytes we already successfully copied.
|
||||
* r2 (num remaining) is correct, but r0 (dst) and r1 (src)
|
||||
* may not be quite right because of unrolling and prefetching.
|
||||
* So we need to recompute their values as the address just
|
||||
* after the last byte we are sure was successfully loaded and
|
||||
* then stored.
|
||||
*/
|
||||
|
||||
/* Determine how many bytes we successfully copied. */
|
||||
{ sub r3, r25, r2 }
|
||||
|
||||
/* Add this to the original r0 and r1 to get their new values. */
|
||||
{ add r0, r23, r3; add r1, r24, r3 }
|
||||
|
||||
{ bzt r29, memcpy_fixup_loop }
|
||||
{ blzt r29, copy_to_user_fixup_loop }
|
||||
|
||||
copy_from_user_fixup_loop:
|
||||
/* Try copying the rest one byte at a time, expecting a load fault. */
|
||||
.Lcfu: { lb_u r3, r1; addi r1, r1, 1 }
|
||||
{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
|
||||
{ bnzt r2, copy_from_user_fixup_loop }
|
||||
|
||||
.Lcopy_from_user_fixup_zero_remainder:
|
||||
{ bbs r29, 2f } /* low bit set means IS_COPY_FROM_USER */
|
||||
/* byte-at-a-time loop faulted, so zero the rest. */
|
||||
{ move r3, r2; bz r2, 2f /* should be impossible, but handle it. */ }
|
||||
1: { sb r0, zero; addi r0, r0, 1; addi r3, r3, -1 }
|
||||
{ bnzt r3, 1b }
|
||||
2: move lr, r27
|
||||
{ move r0, r2; jrp lr }
|
||||
|
||||
copy_to_user_fixup_loop:
|
||||
/* Try copying the rest one byte at a time, expecting a store fault. */
|
||||
{ lb_u r3, r1; addi r1, r1, 1 }
|
||||
.Lctu: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
|
||||
{ bnzt r2, copy_to_user_fixup_loop }
|
||||
.Lcopy_to_user_fixup_done:
|
||||
move lr, r27
|
||||
{ move r0, r2; jrp lr }
|
||||
|
||||
memcpy_fixup_loop:
|
||||
/* Try copying the rest one byte at a time. We expect a disastrous
|
||||
* fault to happen since we are in fixup code, but let it happen.
|
||||
*/
|
||||
{ lb_u r3, r1; addi r1, r1, 1 }
|
||||
{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
|
||||
{ bnzt r2, memcpy_fixup_loop }
|
||||
/* This should be unreachable, we should have faulted again.
|
||||
* But be paranoid and handle it in case some interrupt changed
|
||||
* the TLB or something.
|
||||
*/
|
||||
move lr, r27
|
||||
{ move r0, r23; jrp lr }
|
||||
|
||||
.size memcpy_common_fixup, . - memcpy_common_fixup
|
||||
|
||||
.section __ex_table,"a"
|
||||
.align 4
|
||||
.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
|
||||
.word .Lctu, .Lcopy_to_user_fixup_done
|
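The __ex_table entries just above pair the address of each faulting user-access instruction with the fixup code to branch to. As a rough, hypothetical C model of that lookup (struct ex_entry, find_fixup and the linear scan are illustrative only, not the kernel's actual implementation), the idea looks like this:

#include <stdint.h>
#include <stddef.h>

struct ex_entry {
	uintptr_t insn;   /* address of an instruction that may fault */
	uintptr_t fixup;  /* address to resume at if it does fault */
};

/* A linear scan stands in for the kernel's sorted-table search. */
static uintptr_t find_fixup(const struct ex_entry *table, size_t n,
			    uintptr_t fault_pc)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (table[i].insn == fault_pc)
			return table[i].fixup;
	return 0;  /* no fixup registered: the fault is fatal */
}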
367
arch/tile/lib/memcpy_64.c
Normal file
|
@ -0,0 +1,367 @@
|
|||
/*
|
||||
* Copyright 2011 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
|
||||
|
||||
/* Must be 8 bytes in size. */
|
||||
#define op_t uint64_t
|
||||
|
||||
/* Threshold value for when to enter the unrolled loops. */
|
||||
#define OP_T_THRES 16
|
||||
|
||||
#if CHIP_L2_LINE_SIZE() != 64
|
||||
#error "Assumes 64 byte line size"
|
||||
#endif
|
||||
|
||||
/* How many cache lines ahead should we prefetch? */
|
||||
#define PREFETCH_LINES_AHEAD 4
|
||||
|
||||
/*
|
||||
* Provide "base versions" of load and store for the normal code path.
|
||||
* The kernel provides other versions for userspace copies.
|
||||
*/
|
||||
#define ST(p, v) (*(p) = (v))
|
||||
#define LD(p) (*(p))
|
||||
|
||||
#ifndef USERCOPY_FUNC
|
||||
#define ST1 ST
|
||||
#define ST2 ST
|
||||
#define ST4 ST
|
||||
#define ST8 ST
|
||||
#define LD1 LD
|
||||
#define LD2 LD
|
||||
#define LD4 LD
|
||||
#define LD8 LD
|
||||
#define RETVAL dstv
|
||||
void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
|
||||
#else
|
||||
/*
|
||||
* Special kernel version will provide implementation of the LDn/STn
|
||||
* macros to return a count of uncopied bytes due to mm fault.
|
||||
*/
|
||||
#define RETVAL 0
|
||||
int __attribute__((optimize("omit-frame-pointer")))
|
||||
USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
|
||||
#endif
|
||||
{
|
||||
char *__restrict dst1 = (char *)dstv;
|
||||
const char *__restrict src1 = (const char *)srcv;
|
||||
const char *__restrict src1_end;
|
||||
const char *__restrict prefetch;
|
||||
op_t *__restrict dst8; /* 8-byte pointer to destination memory. */
|
||||
op_t final; /* Final bytes to write to trailing word, if any */
|
||||
long i;
|
||||
|
||||
if (n < 16) {
|
||||
for (; n; n--)
|
||||
ST1(dst1++, LD1(src1++));
|
||||
return RETVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Locate the end of source memory we will copy. Don't
|
||||
* prefetch past this.
|
||||
*/
|
||||
src1_end = src1 + n - 1;
|
||||
|
||||
/* Prefetch ahead a few cache lines, but not past the end. */
|
||||
prefetch = src1;
|
||||
for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
|
||||
__insn_prefetch(prefetch);
|
||||
prefetch += CHIP_L2_LINE_SIZE();
|
||||
prefetch = (prefetch < src1_end) ? prefetch : src1;
|
||||
}
|
||||
|
||||
/* Copy bytes until dst is word-aligned. */
|
||||
for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--)
|
||||
ST1(dst1++, LD1(src1++));
|
||||
|
||||
/* 8-byte pointer to destination memory. */
|
||||
dst8 = (op_t *)dst1;
|
||||
|
||||
if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) {
|
||||
/* Unaligned copy. */
|
||||
|
||||
op_t tmp0 = 0, tmp1 = 0, tmp2, tmp3;
|
||||
const op_t *src8 = (const op_t *) ((uintptr_t)src1 &
|
||||
-sizeof(op_t));
|
||||
const void *srci = (void *)src1;
|
||||
int m;
|
||||
|
||||
m = (CHIP_L2_LINE_SIZE() << 2) -
|
||||
(((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1));
|
||||
m = (n < m) ? n : m;
|
||||
m /= sizeof(op_t);
|
||||
|
||||
/* Copy until 'dst' is cache-line-aligned. */
|
||||
n -= (sizeof(op_t) * m);
|
||||
|
||||
switch (m % 4) {
|
||||
case 0:
|
||||
if (__builtin_expect(!m, 0))
|
||||
goto _M0;
|
||||
tmp1 = LD8(src8++);
|
||||
tmp2 = LD8(src8++);
|
||||
goto _8B3;
|
||||
case 2:
|
||||
m += 2;
|
||||
tmp3 = LD8(src8++);
|
||||
tmp0 = LD8(src8++);
|
||||
goto _8B1;
|
||||
case 3:
|
||||
m += 1;
|
||||
tmp2 = LD8(src8++);
|
||||
tmp3 = LD8(src8++);
|
||||
goto _8B2;
|
||||
case 1:
|
||||
m--;
|
||||
tmp0 = LD8(src8++);
|
||||
tmp1 = LD8(src8++);
|
||||
if (__builtin_expect(!m, 0))
|
||||
goto _8B0;
|
||||
}
|
||||
|
||||
do {
|
||||
tmp2 = LD8(src8++);
|
||||
tmp0 = __insn_dblalign(tmp0, tmp1, srci);
|
||||
ST8(dst8++, tmp0);
|
||||
_8B3:
|
||||
tmp3 = LD8(src8++);
|
||||
tmp1 = __insn_dblalign(tmp1, tmp2, srci);
|
||||
ST8(dst8++, tmp1);
|
||||
_8B2:
|
||||
tmp0 = LD8(src8++);
|
||||
tmp2 = __insn_dblalign(tmp2, tmp3, srci);
|
||||
ST8(dst8++, tmp2);
|
||||
_8B1:
|
||||
tmp1 = LD8(src8++);
|
||||
tmp3 = __insn_dblalign(tmp3, tmp0, srci);
|
||||
ST8(dst8++, tmp3);
|
||||
m -= 4;
|
||||
} while (m);
|
||||
|
||||
_8B0:
|
||||
tmp0 = __insn_dblalign(tmp0, tmp1, srci);
|
||||
ST8(dst8++, tmp0);
|
||||
src8--;
|
||||
|
||||
_M0:
|
||||
if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) {
|
||||
op_t tmp4, tmp5, tmp6, tmp7, tmp8;
|
||||
|
||||
prefetch = ((const char *)src8) +
|
||||
CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD;
|
||||
|
||||
for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE();
|
||||
n -= CHIP_L2_LINE_SIZE()) {
|
||||
/* Prefetch and advance to next line to
|
||||
prefetch, but don't go past the end. */
|
||||
__insn_prefetch(prefetch);
|
||||
|
||||
/* Make sure prefetch got scheduled
|
||||
earlier. */
|
||||
__asm__ ("" : : : "memory");
|
||||
|
||||
prefetch += CHIP_L2_LINE_SIZE();
|
||||
prefetch = (prefetch < src1_end) ? prefetch :
|
||||
(const char *) src8;
|
||||
|
||||
tmp1 = LD8(src8++);
|
||||
tmp2 = LD8(src8++);
|
||||
tmp3 = LD8(src8++);
|
||||
tmp4 = LD8(src8++);
|
||||
tmp5 = LD8(src8++);
|
||||
tmp6 = LD8(src8++);
|
||||
tmp7 = LD8(src8++);
|
||||
tmp8 = LD8(src8++);
|
||||
|
||||
tmp0 = __insn_dblalign(tmp0, tmp1, srci);
|
||||
tmp1 = __insn_dblalign(tmp1, tmp2, srci);
|
||||
tmp2 = __insn_dblalign(tmp2, tmp3, srci);
|
||||
tmp3 = __insn_dblalign(tmp3, tmp4, srci);
|
||||
tmp4 = __insn_dblalign(tmp4, tmp5, srci);
|
||||
tmp5 = __insn_dblalign(tmp5, tmp6, srci);
|
||||
tmp6 = __insn_dblalign(tmp6, tmp7, srci);
|
||||
tmp7 = __insn_dblalign(tmp7, tmp8, srci);
|
||||
|
||||
__insn_wh64(dst8);
|
||||
|
||||
ST8(dst8++, tmp0);
|
||||
ST8(dst8++, tmp1);
|
||||
ST8(dst8++, tmp2);
|
||||
ST8(dst8++, tmp3);
|
||||
ST8(dst8++, tmp4);
|
||||
ST8(dst8++, tmp5);
|
||||
ST8(dst8++, tmp6);
|
||||
ST8(dst8++, tmp7);
|
||||
|
||||
tmp0 = tmp8;
|
||||
}
|
||||
src8--;
|
||||
}
|
||||
|
||||
/* Copy the remaining 8-byte chunks. */
|
||||
if (n >= sizeof(op_t)) {
|
||||
tmp0 = LD8(src8++);
|
||||
for (; n >= sizeof(op_t); n -= sizeof(op_t)) {
|
||||
tmp1 = LD8(src8++);
|
||||
tmp0 = __insn_dblalign(tmp0, tmp1, srci);
|
||||
ST8(dst8++, tmp0);
|
||||
tmp0 = tmp1;
|
||||
}
|
||||
src8--;
|
||||
}
|
||||
|
||||
if (n == 0)
|
||||
return RETVAL;
|
||||
|
||||
tmp0 = LD8(src8++);
|
||||
tmp1 = ((const char *)src8 <= src1_end)
|
||||
? LD8((op_t *)src8) : 0;
|
||||
final = __insn_dblalign(tmp0, tmp1, srci);
|
||||
|
||||
} else {
|
||||
/* Aligned copy. */
|
||||
|
||||
const op_t *__restrict src8 = (const op_t *)src1;
|
||||
|
||||
/* src8 and dst8 are both word-aligned. */
|
||||
if (n >= CHIP_L2_LINE_SIZE()) {
|
||||
/* Copy until 'dst' is cache-line-aligned. */
|
||||
for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
|
||||
n -= sizeof(op_t))
|
||||
ST8(dst8++, LD8(src8++));
|
||||
|
||||
for (; n >= CHIP_L2_LINE_SIZE(); ) {
|
||||
op_t tmp0, tmp1, tmp2, tmp3;
|
||||
op_t tmp4, tmp5, tmp6, tmp7;
|
||||
|
||||
/*
|
||||
* Prefetch and advance to next line
|
||||
* to prefetch, but don't go past the
|
||||
* end.
|
||||
*/
|
||||
__insn_prefetch(prefetch);
|
||||
|
||||
/* Make sure prefetch got scheduled
|
||||
earlier. */
|
||||
__asm__ ("" : : : "memory");
|
||||
|
||||
prefetch += CHIP_L2_LINE_SIZE();
|
||||
prefetch = (prefetch < src1_end) ? prefetch :
|
||||
(const char *)src8;
|
||||
|
||||
/*
|
||||
* Do all the loads before wh64. This
|
||||
* is necessary if [src8, src8+7] and
|
||||
* [dst8, dst8+7] share the same cache
|
||||
* line and dst8 <= src8, as can be
|
||||
* the case when called from memmove,
|
||||
* or with code tested on x86 whose
|
||||
* memcpy always works with forward
|
||||
* copies.
|
||||
*/
|
||||
tmp0 = LD8(src8++);
|
||||
tmp1 = LD8(src8++);
|
||||
tmp2 = LD8(src8++);
|
||||
tmp3 = LD8(src8++);
|
||||
tmp4 = LD8(src8++);
|
||||
tmp5 = LD8(src8++);
|
||||
tmp6 = LD8(src8++);
|
||||
tmp7 = LD8(src8++);
|
||||
|
||||
/* wh64 and wait for tmp7 load completion. */
|
||||
__asm__ ("move %0, %0; wh64 %1\n"
|
||||
: : "r"(tmp7), "r"(dst8));
|
||||
|
||||
ST8(dst8++, tmp0);
|
||||
ST8(dst8++, tmp1);
|
||||
ST8(dst8++, tmp2);
|
||||
ST8(dst8++, tmp3);
|
||||
ST8(dst8++, tmp4);
|
||||
ST8(dst8++, tmp5);
|
||||
ST8(dst8++, tmp6);
|
||||
ST8(dst8++, tmp7);
|
||||
|
||||
n -= CHIP_L2_LINE_SIZE();
|
||||
}
|
||||
#if CHIP_L2_LINE_SIZE() != 64
|
||||
# error "Fix code that assumes particular L2 cache line size."
|
||||
#endif
|
||||
}
|
||||
|
||||
for (; n >= sizeof(op_t); n -= sizeof(op_t))
|
||||
ST8(dst8++, LD8(src8++));
|
||||
|
||||
if (__builtin_expect(n == 0, 1))
|
||||
return RETVAL;
|
||||
|
||||
final = LD8(src8);
|
||||
}
|
||||
|
||||
/* n != 0 if we get here. Write out any trailing bytes. */
|
||||
dst1 = (char *)dst8;
|
||||
#ifndef __BIG_ENDIAN__
|
||||
if (n & 4) {
|
||||
ST4((uint32_t *)dst1, final);
|
||||
dst1 += 4;
|
||||
final >>= 32;
|
||||
n &= 3;
|
||||
}
|
||||
if (n & 2) {
|
||||
ST2((uint16_t *)dst1, final);
|
||||
dst1 += 2;
|
||||
final >>= 16;
|
||||
n &= 1;
|
||||
}
|
||||
if (n)
|
||||
ST1((uint8_t *)dst1, final);
|
||||
#else
|
||||
if (n & 4) {
|
||||
ST4((uint32_t *)dst1, final >> 32);
|
||||
dst1 += 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
final >>= 32;
|
||||
}
|
||||
if (n & 2) {
|
||||
ST2((uint16_t *)dst1, final >> 16);
|
||||
dst1 += 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
final >>= 16;
|
||||
}
|
||||
if (n & 1)
|
||||
ST1((uint8_t *)dst1, final >> 8);
|
||||
#endif
|
||||
|
||||
return RETVAL;
|
||||
}
|
||||
|
||||
#ifdef USERCOPY_FUNC
|
||||
#undef ST1
|
||||
#undef ST2
|
||||
#undef ST4
|
||||
#undef ST8
|
||||
#undef LD1
|
||||
#undef LD2
|
||||
#undef LD4
|
||||
#undef LD8
|
||||
#undef USERCOPY_FUNC
|
||||
#endif
|
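The unaligned path above leans on __insn_dblalign to stitch an unaligned word together from two aligned loads. Here is a portable little-endian sketch of that step (dblalign_sketch is an illustrative name, not the Tile intrinsic, and the real instruction takes the raw source pointer rather than a precomputed shift):

#include <stdint.h>

/* Combine two consecutive aligned 64-bit words into the unaligned word
 * that starts (src & 7) bytes into the first one (little-endian only).
 */
static uint64_t dblalign_sketch(uint64_t lo, uint64_t hi, uintptr_t src)
{
	unsigned int shift = (unsigned int)(src & 7) * 8;

	if (shift == 0)
		return lo;
	return (lo >> shift) | (hi << (64 - shift));
}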
94
arch/tile/lib/memcpy_user_64.c
Normal file
|
@ -0,0 +1,94 @@
|
|||
/*
|
||||
* Copyright 2011 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*
|
||||
* Do memcpy(), but trap and return "n" when a load or store faults.
|
||||
*
|
||||
* Note: this idiom only works when memcpy() compiles to a leaf function.
|
||||
* Here "leaf function" not only means it has no calls, but also
|
||||
* requires no stack operations (sp, stack frame pointer) and no
|
||||
* use of callee-saved registers, else "jrp lr" will be incorrect since
|
||||
* unwinding the stack frame is bypassed. Since memcpy() is not complex,
|
||||
* these conditions are satisfied here, but we need to be careful when
|
||||
* modifying this file. This is not a clean solution but is the best
|
||||
* one so far.
|
||||
*
|
||||
* Also note that we are capturing "n" from the containing scope here.
|
||||
*/
|
||||
|
||||
#define _ST(p, inst, v) \
|
||||
({ \
|
||||
asm("1: " #inst " %0, %1;" \
|
||||
".pushsection .coldtext.memcpy,\"ax\";" \
|
||||
"2: { move r0, %2; jrp lr };" \
|
||||
".section __ex_table,\"a\";" \
|
||||
".align 8;" \
|
||||
".quad 1b, 2b;" \
|
||||
".popsection" \
|
||||
: "=m" (*(p)) : "r" (v), "r" (n)); \
|
||||
})
|
||||
|
||||
#define _LD(p, inst) \
|
||||
({ \
|
||||
unsigned long __v; \
|
||||
asm("1: " #inst " %0, %1;" \
|
||||
".pushsection .coldtext.memcpy,\"ax\";" \
|
||||
"2: { move r0, %2; jrp lr };" \
|
||||
".section __ex_table,\"a\";" \
|
||||
".align 8;" \
|
||||
".quad 1b, 2b;" \
|
||||
".popsection" \
|
||||
: "=r" (__v) : "m" (*(p)), "r" (n)); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define USERCOPY_FUNC __copy_to_user_inatomic
|
||||
#define ST1(p, v) _ST((p), st1, (v))
|
||||
#define ST2(p, v) _ST((p), st2, (v))
|
||||
#define ST4(p, v) _ST((p), st4, (v))
|
||||
#define ST8(p, v) _ST((p), st, (v))
|
||||
#define LD1 LD
|
||||
#define LD2 LD
|
||||
#define LD4 LD
|
||||
#define LD8 LD
|
||||
#include "memcpy_64.c"
|
||||
|
||||
#define USERCOPY_FUNC __copy_from_user_inatomic
|
||||
#define ST1 ST
|
||||
#define ST2 ST
|
||||
#define ST4 ST
|
||||
#define ST8 ST
|
||||
#define LD1(p) _LD((p), ld1u)
|
||||
#define LD2(p) _LD((p), ld2u)
|
||||
#define LD4(p) _LD((p), ld4u)
|
||||
#define LD8(p) _LD((p), ld)
|
||||
#include "memcpy_64.c"
|
||||
|
||||
#define USERCOPY_FUNC __copy_in_user_inatomic
|
||||
#define ST1(p, v) _ST((p), st1, (v))
|
||||
#define ST2(p, v) _ST((p), st2, (v))
|
||||
#define ST4(p, v) _ST((p), st4, (v))
|
||||
#define ST8(p, v) _ST((p), st, (v))
|
||||
#define LD1(p) _LD((p), ld1u)
|
||||
#define LD2(p) _LD((p), ld2u)
|
||||
#define LD4(p) _LD((p), ld4u)
|
||||
#define LD8(p) _LD((p), ld)
|
||||
#include "memcpy_64.c"
|
||||
|
||||
unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
|
||||
unsigned long n)
|
||||
{
|
||||
unsigned long rc = __copy_from_user_inatomic(to, from, n);
|
||||
if (unlikely(rc))
|
||||
memset(to + n - rc, 0, rc);
|
||||
return rc;
|
||||
}
|
63
arch/tile/lib/memmove.c
Normal file
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* Copyright 2010 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
void *memmove(void *dest, const void *src, size_t n)
|
||||
{
|
||||
if ((const char *)src >= (char *)dest + n
|
||||
|| (char *)dest >= (const char *)src + n) {
|
||||
/* We found no overlap, so let memcpy do all the heavy
|
||||
* lifting (prefetching, etc.)
|
||||
*/
|
||||
return memcpy(dest, src, n);
|
||||
}
|
||||
|
||||
if (n != 0) {
|
||||
const uint8_t *in;
|
||||
uint8_t x;
|
||||
uint8_t *out;
|
||||
int stride;
|
||||
|
||||
if (src < dest) {
|
||||
/* copy backwards */
|
||||
in = (const uint8_t *)src + n - 1;
|
||||
out = (uint8_t *)dest + n - 1;
|
||||
stride = -1;
|
||||
} else {
|
||||
/* copy forwards */
|
||||
in = (const uint8_t *)src;
|
||||
out = (uint8_t *)dest;
|
||||
stride = 1;
|
||||
}
|
||||
|
||||
/* Manually software-pipeline this loop. */
|
||||
x = *in;
|
||||
in += stride;
|
||||
|
||||
while (--n != 0) {
|
||||
*out = x;
|
||||
out += stride;
|
||||
x = *in;
|
||||
in += stride;
|
||||
}
|
||||
|
||||
*out = x;
|
||||
}
|
||||
|
||||
return dest;
|
||||
}
|
||||
EXPORT_SYMBOL(memmove);
|
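As a quick user-space illustration of why the overlap test above matters, the following standalone snippet shifts a buffer right by two bytes in place; with dest > src the copy must run backwards, which is exactly the case the stride = -1 branch handles (buffer contents here are arbitrary example data):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[16] = "abcdefghijklmno";

	/* Overlapping shift: dest > src, so memmove copies backwards. */
	memmove(buf + 2, buf, 10);
	printf("%s\n", buf);  /* prints "ababcdefghijmno" */
	return 0;
}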
143
arch/tile/lib/memset_32.c
Normal file
|
@ -0,0 +1,143 @@
|
|||
/*
|
||||
* Copyright 2010 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
#include <arch/chip.h>
|
||||
|
||||
void *memset(void *s, int c, size_t n)
|
||||
{
|
||||
uint32_t *out32;
|
||||
int n32;
|
||||
uint32_t v16, v32;
|
||||
uint8_t *out8 = s;
|
||||
int to_align32;
|
||||
|
||||
/* Experimentation shows that a trivial tight loop is a win up until
|
||||
* around a size of 20, where writing a word at a time starts to win.
|
||||
*/
|
||||
#define BYTE_CUTOFF 20
|
||||
|
||||
#if BYTE_CUTOFF < 3
|
||||
* This must be at least this big, or some code later
|
||||
* on doesn't work.
|
||||
*/
|
||||
#error "BYTE_CUTOFF is too small"
|
||||
#endif
|
||||
|
||||
if (n < BYTE_CUTOFF) {
|
||||
/* Strangely, this turns out to be the tightest way to
|
||||
* write this loop.
|
||||
*/
|
||||
if (n != 0) {
|
||||
do {
|
||||
/* Strangely, combining these into one line
|
||||
* performs worse.
|
||||
*/
|
||||
*out8 = c;
|
||||
out8++;
|
||||
} while (--n != 0);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
/* Align 'out8'. We know n >= 3 so this won't write past the end. */
|
||||
while (((uintptr_t) out8 & 3) != 0) {
|
||||
*out8++ = c;
|
||||
--n;
|
||||
}
|
||||
|
||||
/* Align 'n'. */
|
||||
while (n & 3)
|
||||
out8[--n] = c;
|
||||
|
||||
out32 = (uint32_t *) out8;
|
||||
n32 = n >> 2;
|
||||
|
||||
/* Tile input byte out to 32 bits. */
|
||||
v16 = __insn_intlb(c, c);
|
||||
v32 = __insn_intlh(v16, v16);
|
||||
|
||||
/* This must be at least 8 or the following loop doesn't work. */
|
||||
#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
|
||||
|
||||
/* Determine how many words we need to emit before the 'out32'
|
||||
* pointer becomes aligned modulo the cache line size.
|
||||
*/
|
||||
to_align32 =
|
||||
(-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1);
|
||||
|
||||
/* Only bother aligning and using wh64 if there is at least
|
||||
* one full cache line to process. This check also prevents
|
||||
* overrunning the end of the buffer with alignment words.
|
||||
*/
|
||||
if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) {
|
||||
int lines_left;
|
||||
|
||||
/* Align out32 mod the cache line size so we can use wh64. */
|
||||
n32 -= to_align32;
|
||||
for (; to_align32 != 0; to_align32--) {
|
||||
*out32 = v32;
|
||||
out32++;
|
||||
}
|
||||
|
||||
/* Use unsigned divide to turn this into a right shift. */
|
||||
lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS;
|
||||
|
||||
do {
|
||||
/* Only wh64 a few lines at a time, so we don't
|
||||
* exceed the maximum number of victim lines.
|
||||
*/
|
||||
int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
|
||||
? lines_left
|
||||
: CHIP_MAX_OUTSTANDING_VICTIMS());
|
||||
uint32_t *wh = out32;
|
||||
int i = x;
|
||||
int j;
|
||||
|
||||
lines_left -= x;
|
||||
|
||||
do {
|
||||
__insn_wh64(wh);
|
||||
wh += CACHE_LINE_SIZE_IN_WORDS;
|
||||
} while (--i);
|
||||
|
||||
for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4);
|
||||
j != 0; j--) {
|
||||
*out32++ = v32;
|
||||
*out32++ = v32;
|
||||
*out32++ = v32;
|
||||
*out32++ = v32;
|
||||
}
|
||||
} while (lines_left != 0);
|
||||
|
||||
/* We processed all full lines above, so only this many
|
||||
* words remain to be processed.
|
||||
*/
|
||||
n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
|
||||
}
|
||||
|
||||
/* Now handle any leftover values. */
|
||||
if (n32 != 0) {
|
||||
do {
|
||||
*out32 = v32;
|
||||
out32++;
|
||||
} while (--n32 != 0);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
EXPORT_SYMBOL(memset);
|
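The intlb/intlh pair above simply replicates the fill byte into all four lanes of a 32-bit word; on targets without those instructions the same value can be produced with a multiply, as in this small sketch (spread_byte32 is an illustrative name):

#include <stdint.h>

static uint32_t spread_byte32(uint8_t c)
{
	/* 0x01010101 * c puts a copy of c in every byte of the word. */
	return (uint32_t)c * 0x01010101u;
}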
142
arch/tile/lib/memset_64.c
Normal file
|
@ -0,0 +1,142 @@
|
|||
/*
|
||||
* Copyright 2011 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
#include <arch/chip.h>
|
||||
#include "string-endian.h"
|
||||
|
||||
void *memset(void *s, int c, size_t n)
|
||||
{
|
||||
uint64_t *out64;
|
||||
int n64, to_align64;
|
||||
uint64_t v64;
|
||||
uint8_t *out8 = s;
|
||||
|
||||
/* Experimentation shows that a trivial tight loop is a win up until
|
||||
* around a size of 20, where writing a word at a time starts to win.
|
||||
*/
|
||||
#define BYTE_CUTOFF 20
|
||||
|
||||
#if BYTE_CUTOFF < 7
|
||||
* This must be at least this big, or some code later
|
||||
* on doesn't work.
|
||||
*/
|
||||
#error "BYTE_CUTOFF is too small"
|
||||
#endif
|
||||
|
||||
if (n < BYTE_CUTOFF) {
|
||||
/* Strangely, this turns out to be the tightest way to
|
||||
* write this loop.
|
||||
*/
|
||||
if (n != 0) {
|
||||
do {
|
||||
/* Strangely, combining these into one line
|
||||
* performs worse.
|
||||
*/
|
||||
*out8 = c;
|
||||
out8++;
|
||||
} while (--n != 0);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
/* Align 'out8'. We know n >= 7 so this won't write past the end. */
|
||||
while (((uintptr_t) out8 & 7) != 0) {
|
||||
*out8++ = c;
|
||||
--n;
|
||||
}
|
||||
|
||||
/* Align 'n'. */
|
||||
while (n & 7)
|
||||
out8[--n] = c;
|
||||
|
||||
out64 = (uint64_t *) out8;
|
||||
n64 = n >> 3;
|
||||
|
||||
/* Tile input byte out to 64 bits. */
|
||||
v64 = copy_byte(c);
|
||||
|
||||
/* This must be at least 8 or the following loop doesn't work. */
|
||||
#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
|
||||
|
||||
/* Determine how many doublewords we need to emit before the 'out64'
|
||||
* pointer becomes aligned modulo the cache line size.
|
||||
*/
|
||||
to_align64 = (-((uintptr_t)out64 >> 3)) &
|
||||
(CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1);
|
||||
|
||||
/* Only bother aligning and using wh64 if there is at least
|
||||
* one full cache line to process. This check also prevents
|
||||
* overrunning the end of the buffer with alignment words.
|
||||
*/
|
||||
if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) {
|
||||
int lines_left;
|
||||
|
||||
/* Align out64 mod the cache line size so we can use wh64. */
|
||||
n64 -= to_align64;
|
||||
for (; to_align64 != 0; to_align64--) {
|
||||
*out64 = v64;
|
||||
out64++;
|
||||
}
|
||||
|
||||
/* Use unsigned divide to turn this into a right shift. */
|
||||
lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS;
|
||||
|
||||
do {
|
||||
/* Only wh64 a few lines at a time, so we don't
|
||||
* exceed the maximum number of victim lines.
|
||||
*/
|
||||
int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
|
||||
? lines_left
|
||||
: CHIP_MAX_OUTSTANDING_VICTIMS());
|
||||
uint64_t *wh = out64;
|
||||
int i = x;
|
||||
int j;
|
||||
|
||||
lines_left -= x;
|
||||
|
||||
do {
|
||||
__insn_wh64(wh);
|
||||
wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS;
|
||||
} while (--i);
|
||||
|
||||
for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4);
|
||||
j != 0; j--) {
|
||||
*out64++ = v64;
|
||||
*out64++ = v64;
|
||||
*out64++ = v64;
|
||||
*out64++ = v64;
|
||||
}
|
||||
} while (lines_left != 0);
|
||||
|
||||
/* We processed all full lines above, so only this many
|
||||
* words remain to be processed.
|
||||
*/
|
||||
n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1;
|
||||
}
|
||||
|
||||
/* Now handle any leftover values. */
|
||||
if (n64 != 0) {
|
||||
do {
|
||||
*out64 = v64;
|
||||
out64++;
|
||||
} while (--n64 != 0);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
EXPORT_SYMBOL(memset);
|
259
arch/tile/lib/spinlock_32.c
Normal file
|
@ -0,0 +1,259 @@
|
|||
/*
|
||||
* Copyright 2010 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/processor.h>
|
||||
#include <arch/spr_def.h>
|
||||
|
||||
#include "spinlock_common.h"
|
||||
|
||||
void arch_spin_lock(arch_spinlock_t *lock)
|
||||
{
|
||||
int my_ticket;
|
||||
int iterations = 0;
|
||||
int delta;
|
||||
|
||||
while ((my_ticket = __insn_tns((void *)&lock->next_ticket)) & 1)
|
||||
delay_backoff(iterations++);
|
||||
|
||||
/* Increment the next ticket number, implicitly releasing tns lock. */
|
||||
lock->next_ticket = my_ticket + TICKET_QUANTUM;
|
||||
|
||||
/* Wait until it's our turn. */
|
||||
while ((delta = my_ticket - lock->current_ticket) != 0)
|
||||
relax((128 / CYCLES_PER_RELAX_LOOP) * delta);
|
||||
}
|
||||
EXPORT_SYMBOL(arch_spin_lock);
|
||||
|
||||
int arch_spin_trylock(arch_spinlock_t *lock)
|
||||
{
|
||||
/*
|
||||
* Grab a ticket; no need to retry if it's busy, we'll just
|
||||
* treat that the same as "locked", since someone else
|
||||
* will lock it momentarily anyway.
|
||||
*/
|
||||
int my_ticket = __insn_tns((void *)&lock->next_ticket);
|
||||
|
||||
if (my_ticket == lock->current_ticket) {
|
||||
/* Not currently locked, so lock it by keeping this ticket. */
|
||||
lock->next_ticket = my_ticket + TICKET_QUANTUM;
|
||||
/* Success! */
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!(my_ticket & 1)) {
|
||||
/* Release next_ticket. */
|
||||
lock->next_ticket = my_ticket;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(arch_spin_trylock);
|
||||
|
||||
void arch_spin_unlock_wait(arch_spinlock_t *lock)
|
||||
{
|
||||
u32 iterations = 0;
|
||||
while (arch_spin_is_locked(lock))
|
||||
delay_backoff(iterations++);
|
||||
}
|
||||
EXPORT_SYMBOL(arch_spin_unlock_wait);
|
||||
|
||||
/*
|
||||
* The low byte is always reserved to be the marker for a "tns" operation
|
||||
* since the low bit is set to "1" by a tns. The next seven bits are
|
||||
* zeroes. The next byte holds the "next" writer value, i.e. the ticket
|
||||
* available for the next task that wants to write. The third byte holds
|
||||
* the current writer value, i.e. the writer who holds the current ticket.
|
||||
* If current == next == 0, there are no interested writers.
|
||||
*/
|
||||
#define WR_NEXT_SHIFT _WR_NEXT_SHIFT
|
||||
#define WR_CURR_SHIFT _WR_CURR_SHIFT
|
||||
#define WR_WIDTH _WR_WIDTH
|
||||
#define WR_MASK ((1 << WR_WIDTH) - 1)
|
||||
|
||||
/*
|
||||
* The last eight bits hold the active reader count. This has to be
|
||||
* zero before a writer can start to write.
|
||||
*/
|
||||
#define RD_COUNT_SHIFT _RD_COUNT_SHIFT
|
||||
#define RD_COUNT_WIDTH _RD_COUNT_WIDTH
|
||||
#define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1)
|
||||
|
||||
|
||||
/*
|
||||
* We can get the read lock if everything but the reader bits (which
|
||||
* are in the high part of the word) is zero, i.e. no active or
|
||||
* waiting writers, no tns.
|
||||
*
|
||||
* We guard the tns/store-back with an interrupt critical section to
|
||||
* preserve the semantic that the same read lock can be acquired in an
|
||||
* interrupt context.
|
||||
*/
|
||||
int arch_read_trylock(arch_rwlock_t *rwlock)
|
||||
{
|
||||
u32 val;
|
||||
__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
|
||||
val = __insn_tns((int *)&rwlock->lock);
|
||||
if (likely((val << _RD_COUNT_WIDTH) == 0)) {
|
||||
val += 1 << RD_COUNT_SHIFT;
|
||||
rwlock->lock = val;
|
||||
__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
|
||||
BUG_ON(val == 0); /* we don't expect wraparound */
|
||||
return 1;
|
||||
}
|
||||
if ((val & 1) == 0)
|
||||
rwlock->lock = val;
|
||||
__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(arch_read_trylock);
|
||||
|
||||
/*
|
||||
* Spin doing arch_read_trylock() until we acquire the lock.
|
||||
* ISSUE: This approach can permanently starve readers. A reader who sees
|
||||
* a writer could instead take a ticket lock (just like a writer would),
|
||||
* and atomically enter read mode (with 1 reader) when it gets the ticket.
|
||||
* This way both readers and writers would always make forward progress
|
||||
* in a finite time.
|
||||
*/
|
||||
void arch_read_lock(arch_rwlock_t *rwlock)
|
||||
{
|
||||
u32 iterations = 0;
|
||||
while (unlikely(!arch_read_trylock(rwlock)))
|
||||
delay_backoff(iterations++);
|
||||
}
|
||||
EXPORT_SYMBOL(arch_read_lock);
|
||||
|
||||
void arch_read_unlock(arch_rwlock_t *rwlock)
|
||||
{
|
||||
u32 val, iterations = 0;
|
||||
|
||||
mb(); /* guarantee anything modified under the lock is visible */
|
||||
for (;;) {
|
||||
__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
|
||||
val = __insn_tns((int *)&rwlock->lock);
|
||||
if (likely((val & 1) == 0)) {
|
||||
rwlock->lock = val - (1 << _RD_COUNT_SHIFT);
|
||||
__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
|
||||
break;
|
||||
}
|
||||
__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
|
||||
delay_backoff(iterations++);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(arch_read_unlock);
|
||||
|
||||
/*
|
||||
* We don't need an interrupt critical section here (unlike for
|
||||
* arch_read_lock) since we should never use a bare write lock where
|
||||
* it could be interrupted by code that could try to re-acquire it.
|
||||
*/
|
||||
void arch_write_lock(arch_rwlock_t *rwlock)
|
||||
{
|
||||
/*
|
||||
* The trailing underscore on this variable (and curr_ below)
|
||||
* reminds us that the high bits are garbage; we mask them out
|
||||
* when we compare them.
|
||||
*/
|
||||
u32 my_ticket_;
|
||||
u32 iterations = 0;
|
||||
u32 val = __insn_tns((int *)&rwlock->lock);
|
||||
|
||||
if (likely(val == 0)) {
|
||||
rwlock->lock = 1 << _WR_NEXT_SHIFT;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait until there are no readers, then bump up the next
|
||||
* field and capture the ticket value.
|
||||
*/
|
||||
for (;;) {
|
||||
if (!(val & 1)) {
|
||||
if ((val >> RD_COUNT_SHIFT) == 0)
|
||||
break;
|
||||
rwlock->lock = val;
|
||||
}
|
||||
delay_backoff(iterations++);
|
||||
val = __insn_tns((int *)&rwlock->lock);
|
||||
}
|
||||
|
||||
/* Take out the next ticket and extract my ticket value. */
|
||||
rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT);
|
||||
my_ticket_ = val >> WR_NEXT_SHIFT;
|
||||
|
||||
/* Wait until the "current" field matches our ticket. */
|
||||
for (;;) {
|
||||
u32 curr_ = val >> WR_CURR_SHIFT;
|
||||
u32 delta = ((my_ticket_ - curr_) & WR_MASK);
|
||||
if (likely(delta == 0))
|
||||
break;
|
||||
|
||||
/* Delay based on how many lock-holders are still out there. */
|
||||
relax((256 / CYCLES_PER_RELAX_LOOP) * delta);
|
||||
|
||||
/*
|
||||
* Get a non-tns value to check; we don't need to tns
|
||||
* it ourselves. Since we're not tns'ing, we retry
|
||||
* more rapidly to get a valid value.
|
||||
*/
|
||||
while ((val = rwlock->lock) & 1)
|
||||
relax(4);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(arch_write_lock);
|
||||
|
||||
int arch_write_trylock(arch_rwlock_t *rwlock)
|
||||
{
|
||||
u32 val = __insn_tns((int *)&rwlock->lock);
|
||||
|
||||
/*
|
||||
* If a tns is in progress, or there's a waiting or active locker,
|
||||
* or active readers, we can't take the lock, so give up.
|
||||
*/
|
||||
if (unlikely(val != 0)) {
|
||||
if (!(val & 1))
|
||||
rwlock->lock = val;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Set the "next" field to mark it locked. */
|
||||
rwlock->lock = 1 << _WR_NEXT_SHIFT;
|
||||
return 1;
|
||||
}
|
||||
EXPORT_SYMBOL(arch_write_trylock);
|
||||
|
||||
void arch_write_unlock(arch_rwlock_t *rwlock)
|
||||
{
|
||||
u32 val, eq, mask;
|
||||
|
||||
mb(); /* guarantee anything modified under the lock is visible */
|
||||
val = __insn_tns((int *)&rwlock->lock);
|
||||
if (likely(val == (1 << _WR_NEXT_SHIFT))) {
|
||||
rwlock->lock = 0;
|
||||
return;
|
||||
}
|
||||
while (unlikely(val & 1)) {
|
||||
/* Limited backoff since we are the highest-priority task. */
|
||||
relax(4);
|
||||
val = __insn_tns((int *)&rwlock->lock);
|
||||
}
|
||||
mask = 1 << WR_CURR_SHIFT;
|
||||
val = __insn_addb(val, mask);
|
||||
eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
|
||||
val = __insn_mz(eq & mask, val);
|
||||
rwlock->lock = val;
|
||||
}
|
||||
EXPORT_SYMBOL(arch_write_unlock);
|
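For readers who want the ticket-lock idea above without the Tile-specific tns encoding, here is a minimal C11 sketch: take the next ticket, then spin until the current ticket catches up. It uses standard atomics and plain integer tickets, so it illustrates the algorithm only, not this file's exact lock-word layout or backoff:

#include <stdatomic.h>

struct ticket_lock {
	atomic_uint next;     /* next ticket to hand out */
	atomic_uint current;  /* ticket currently being served */
};                            /* zero-initialize before first use */

static void ticket_lock(struct ticket_lock *l)
{
	unsigned int me = atomic_fetch_add(&l->next, 1);

	while (atomic_load(&l->current) != me)
		;  /* spin; a real lock backs off here, as above */
}

static void ticket_unlock(struct ticket_lock *l)
{
	atomic_fetch_add(&l->current, 1);
}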
104
arch/tile/lib/spinlock_64.c
Normal file
|
@ -0,0 +1,104 @@
|
|||
/*
|
||||
* Copyright 2011 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/processor.h>
|
||||
|
||||
#include "spinlock_common.h"
|
||||
|
||||
/*
|
||||
* Read the spinlock value without allocating in our cache and without
|
||||
* causing an invalidation to another cpu with a copy of the cacheline.
|
||||
* This is important when we are spinning waiting for the lock.
|
||||
*/
|
||||
static inline u32 arch_spin_read_noalloc(void *lock)
|
||||
{
|
||||
return atomic_cmpxchg((atomic_t *)lock, -1, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait until the high bits (current) match my ticket.
|
||||
* If we notice the overflow bit set on entry, we clear it.
|
||||
*/
|
||||
void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket)
|
||||
{
|
||||
if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) {
|
||||
__insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW);
|
||||
my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW;
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
u32 val = arch_spin_read_noalloc(lock);
|
||||
u32 delta = my_ticket - arch_spin_current(val);
|
||||
if (delta == 0)
|
||||
return;
|
||||
relax((128 / CYCLES_PER_RELAX_LOOP) * delta);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(arch_spin_lock_slow);
|
||||
|
||||
/*
|
||||
* Check the lock to see if it is plausible, and try to get it with cmpxchg().
|
||||
*/
|
||||
int arch_spin_trylock(arch_spinlock_t *lock)
|
||||
{
|
||||
u32 val = arch_spin_read_noalloc(lock);
|
||||
if (unlikely(arch_spin_current(val) != arch_spin_next(val)))
|
||||
return 0;
|
||||
return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW)
|
||||
== val;
|
||||
}
|
||||
EXPORT_SYMBOL(arch_spin_trylock);
|
||||
|
||||
void arch_spin_unlock_wait(arch_spinlock_t *lock)
|
||||
{
|
||||
u32 iterations = 0;
|
||||
while (arch_spin_is_locked(lock))
|
||||
delay_backoff(iterations++);
|
||||
}
|
||||
EXPORT_SYMBOL(arch_spin_unlock_wait);
|
||||
|
||||
/*
|
||||
* If the read lock fails due to a writer, we retry periodically
|
||||
* until the value is positive and we write our incremented reader count.
|
||||
*/
|
||||
void __read_lock_failed(arch_rwlock_t *rw)
|
||||
{
|
||||
u32 val;
|
||||
int iterations = 0;
|
||||
do {
|
||||
delay_backoff(iterations++);
|
||||
val = __insn_fetchaddgez4(&rw->lock, 1);
|
||||
} while (unlikely(arch_write_val_locked(val)));
|
||||
}
|
||||
EXPORT_SYMBOL(__read_lock_failed);
|
||||
|
||||
/*
|
||||
* If we failed because there were readers, clear the "writer" bit
|
||||
* so we don't block additional readers. Otherwise, there was another
|
||||
* writer anyway, so our "fetchor" made no difference. Then wait,
|
||||
* issuing periodic fetchor instructions, till we get the lock.
|
||||
*/
|
||||
void __write_lock_failed(arch_rwlock_t *rw, u32 val)
|
||||
{
|
||||
int iterations = 0;
|
||||
do {
|
||||
if (!arch_write_val_locked(val))
|
||||
val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT);
|
||||
delay_backoff(iterations++);
|
||||
val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT);
|
||||
} while (val != 0);
|
||||
}
|
||||
EXPORT_SYMBOL(__write_lock_failed);
|
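The read-lock fast path above relies on fetchaddgez4, an add that only commits when the result stays non-negative; there is no portable equivalent, so this rough C11 sketch retries with a compare-exchange instead. WRITE_BIT and read_trylock_sketch are illustrative names, and the word layout here is simplified to one writer bit plus a reader count:

#include <stdatomic.h>
#include <stdbool.h>

#define WRITE_BIT 0x80000000u

static bool read_trylock_sketch(atomic_uint *lock)
{
	unsigned int v = atomic_load(lock);

	/* Fail if a writer holds the lock; otherwise bump the reader count. */
	return !(v & WRITE_BIT) &&
	       atomic_compare_exchange_strong(lock, &v, v + 1);
}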
64
arch/tile/lib/spinlock_common.h
Normal file
|
@ -0,0 +1,64 @@
|
|||
/*
|
||||
* Copyright 2010 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
* This file is included into spinlock_32.c or _64.c.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The mfspr in relax() is 5 or 6 cycles plus 2 for loop
|
||||
* overhead.
|
||||
*/
|
||||
#ifdef __tilegx__
|
||||
#define CYCLES_PER_RELAX_LOOP 7
|
||||
#else
|
||||
#define CYCLES_PER_RELAX_LOOP 8
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Idle the core for CYCLES_PER_RELAX_LOOP * iterations cycles.
|
||||
*/
|
||||
static inline void
|
||||
relax(int iterations)
|
||||
{
|
||||
for (/*above*/; iterations > 0; iterations--)
|
||||
__insn_mfspr(SPR_PASS);
|
||||
barrier();
|
||||
}
|
||||
|
||||
/* Perform bounded exponential backoff.*/
|
||||
static void delay_backoff(int iterations)
|
||||
{
|
||||
u32 exponent, loops;
|
||||
|
||||
/*
|
||||
* 2^exponent is how many times we go around the loop,
|
||||
* which takes 8 cycles. We want to start with a 16- to 31-cycle
|
||||
* loop, so we need to go around minimum 2 = 2^1 times, so we
|
||||
* bias the original value up by 1.
|
||||
*/
|
||||
exponent = iterations + 1;
|
||||
|
||||
/*
|
||||
* Don't allow exponent to exceed 8, so we have at most 256 loops,
|
||||
* or roughly 2,048 (to 4,095) cycles, as our maximum.
|
||||
*/
|
||||
if (exponent > 8)
|
||||
exponent = 8;
|
||||
|
||||
loops = 1 << exponent;
|
||||
|
||||
/* Add a randomness factor so two cpus never get in lock step. */
|
||||
loops += __insn_crc32_32(stack_pointer, get_cycles_low()) &
|
||||
(loops - 1);
|
||||
|
||||
relax(loops);
|
||||
}
|
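A portable sketch of the bounded exponential backoff implemented above: cap the exponent, then add jitter below the base delay so two CPUs do not stay in lock step. rand() stands in for the crc32-of-stack-pointer jitter and busy_wait() for relax(); both names are illustrative:

#include <stdlib.h>

static void busy_wait(unsigned int loops)
{
	volatile unsigned int i;

	for (i = 0; i < loops; i++)
		;  /* burn roughly constant time per iteration */
}

static void delay_backoff_sketch(int iterations)
{
	unsigned int exponent = (unsigned int)iterations + 1;
	unsigned int loops;

	if (exponent > 8)
		exponent = 8;
	loops = 1u << exponent;
	loops += (unsigned int)rand() & (loops - 1);  /* jitter */
	busy_wait(loops);
}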
64
arch/tile/lib/strchr_32.c
Normal file
|
@ -0,0 +1,64 @@
|
|||
/*
|
||||
* Copyright 2010 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
char *strchr(const char *s, int c)
|
||||
{
|
||||
int z, g;
|
||||
|
||||
/* Get an aligned pointer. */
|
||||
const uintptr_t s_int = (uintptr_t) s;
|
||||
const uint32_t *p = (const uint32_t *)(s_int & -4);
|
||||
|
||||
/* Create four copies of the byte for which we are looking. */
|
||||
const uint32_t goal = 0x01010101 * (uint8_t) c;
|
||||
|
||||
/* Read the first aligned word, but force bytes before the string to
|
||||
* match neither zero nor goal (we make sure the high bit of each
|
||||
* byte is 1, and the low 7 bits are all the opposite of the goal
|
||||
* byte).
|
||||
*
|
||||
* Note that this shift count expression works because we know shift
|
||||
* counts are taken mod 32.
|
||||
*/
|
||||
const uint32_t before_mask = (1 << (s_int << 3)) - 1;
|
||||
uint32_t v = (*p | before_mask) ^ (goal & __insn_shrib(before_mask, 1));
|
||||
|
||||
uint32_t zero_matches, goal_matches;
|
||||
while (1) {
|
||||
/* Look for a terminating '\0'. */
|
||||
zero_matches = __insn_seqb(v, 0);
|
||||
|
||||
/* Look for the goal byte. */
|
||||
goal_matches = __insn_seqb(v, goal);
|
||||
|
||||
if (__builtin_expect(zero_matches | goal_matches, 0))
|
||||
break;
|
||||
|
||||
v = *++p;
|
||||
}
|
||||
|
||||
z = __insn_ctz(zero_matches);
|
||||
g = __insn_ctz(goal_matches);
|
||||
|
||||
/* If we found c before '\0' we got a match. Note that if c == '\0'
|
||||
* then g == z, and we correctly return the address of the '\0'
|
||||
* rather than NULL.
|
||||
*/
|
||||
return (g <= z) ? ((char *)p) + (g >> 3) : NULL;
|
||||
}
|
||||
EXPORT_SYMBOL(strchr);
|
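The __insn_seqb compares above do a true per-byte equality test; on machines without such an instruction the same zero-byte detection is usually done with the classic SWAR trick shown below, which sets the high bit of every zero byte. This is only an alternative illustration of the technique, not what the Tile code does:

#include <stdint.h>

static uint32_t has_zero_byte(uint32_t v)
{
	/* High bit of each byte is set iff that byte of v is zero. */
	return (v - 0x01010101u) & ~v & 0x80808080u;
}

static uint32_t has_byte(uint32_t v, uint8_t c)
{
	/* Matching c reduces to zero-detection on v XOR the replicated c. */
	return has_zero_byte(v ^ (0x01010101u * c));
}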
62
arch/tile/lib/strchr_64.c
Normal file
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Copyright 2011 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
#include "string-endian.h"
|
||||
|
||||
char *strchr(const char *s, int c)
|
||||
{
|
||||
int z, g;
|
||||
|
||||
/* Get an aligned pointer. */
|
||||
const uintptr_t s_int = (uintptr_t) s;
|
||||
const uint64_t *p = (const uint64_t *)(s_int & -8);
|
||||
|
||||
/* Create eight copies of the byte for which we are looking. */
|
||||
const uint64_t goal = copy_byte(c);
|
||||
|
||||
/* Read the first aligned word, but force bytes before the string to
|
||||
* match neither zero nor goal (we make sure the high bit of each
|
||||
* byte is 1, and the low 7 bits are all the opposite of the goal
|
||||
* byte).
|
||||
*/
|
||||
const uint64_t before_mask = MASK(s_int);
|
||||
uint64_t v = (*p | before_mask) ^ (goal & __insn_v1shrui(before_mask, 1));
|
||||
|
||||
uint64_t zero_matches, goal_matches;
|
||||
while (1) {
|
||||
/* Look for a terminating '\0'. */
|
||||
zero_matches = __insn_v1cmpeqi(v, 0);
|
||||
|
||||
/* Look for the goal byte. */
|
||||
goal_matches = __insn_v1cmpeq(v, goal);
|
||||
|
||||
if (__builtin_expect((zero_matches | goal_matches) != 0, 0))
|
||||
break;
|
||||
|
||||
v = *++p;
|
||||
}
|
||||
|
||||
z = CFZ(zero_matches);
|
||||
g = CFZ(goal_matches);
|
||||
|
||||
/* If we found c before '\0' we got a match. Note that if c == '\0'
|
||||
* then g == z, and we correctly return the address of the '\0'
|
||||
* rather than NULL.
|
||||
*/
|
||||
return (g <= z) ? ((char *)p) + (g >> 3) : NULL;
|
||||
}
|
||||
EXPORT_SYMBOL(strchr);
|
44
arch/tile/lib/string-endian.h
Normal file
|
@ -0,0 +1,44 @@
|
|||
/*
|
||||
* Copyright 2013 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*
|
||||
* Provide a mask based on the pointer alignment that
|
||||
* sets up non-zero bytes before the beginning of the string.
|
||||
* The MASK expression works because shift counts are taken mod 64.
|
||||
* Also, specify how to count "first" and "last" bits
|
||||
* when the bits have been read as a word.
|
||||
*/
|
||||
|
||||
#include <asm/byteorder.h>
|
||||
|
||||
#ifdef __LITTLE_ENDIAN
|
||||
#define MASK(x) (__insn_shl(1ULL, (x << 3)) - 1)
|
||||
#define NULMASK(x) ((2ULL << x) - 1)
|
||||
#define CFZ(x) __insn_ctz(x)
|
||||
#define REVCZ(x) __insn_clz(x)
|
||||
#else
|
||||
#define MASK(x) (__insn_shl(-2LL, ((-x << 3) - 1)))
|
||||
#define NULMASK(x) (-2LL << (63 - x))
|
||||
#define CFZ(x) __insn_clz(x)
|
||||
#define REVCZ(x) __insn_ctz(x)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Create eight copies of the byte in a uint64_t. Byte Shuffle uses
|
||||
* the bytes of srcB as the index into the dest vector to select a
|
||||
* byte. With all indices of zero, the first byte is copied into all
|
||||
* the other bytes.
|
||||
*/
|
||||
static inline uint64_t copy_byte(uint8_t byte)
|
||||
{
|
||||
return __insn_shufflebytes(byte, 0, 0);
|
||||
}
|
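On a little-endian Tile-Gx target, MASK(s_int) above simply turns the bytes that sit before the start of the string into all-ones so they can never match NUL or the goal byte. A portable little-endian sketch of that mask (mask_sketch is an illustrative name; it masks the offset explicitly rather than relying on the hardware's mod-64 shift behavior):

#include <stdint.h>

static inline uint64_t mask_sketch(uintptr_t s_int)
{
	/* Set the low (s_int & 7) bytes, i.e. the bytes before the string. */
	return ((uint64_t)1 << ((s_int & 7) * 8)) - 1;
}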
36
arch/tile/lib/strlen_32.c
Normal file
|
@ -0,0 +1,36 @@
|
|||
/*
|
||||
* Copyright 2010 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
size_t strlen(const char *s)
|
||||
{
|
||||
/* Get an aligned pointer. */
|
||||
const uintptr_t s_int = (uintptr_t) s;
|
||||
const uint32_t *p = (const uint32_t *)(s_int & -4);
|
||||
|
||||
/* Read the first word, but force bytes before the string to be nonzero.
|
||||
* This expression works because we know shift counts are taken mod 32.
|
||||
*/
|
||||
uint32_t v = *p | ((1 << (s_int << 3)) - 1);
|
||||
|
||||
uint32_t bits;
|
||||
while ((bits = __insn_seqb(v, 0)) == 0)
|
||||
v = *++p;
|
||||
|
||||
return ((const char *)p) + (__insn_ctz(bits) >> 3) - s;
|
||||
}
|
||||
EXPORT_SYMBOL(strlen);
|
35
arch/tile/lib/strlen_64.c
Normal file
|
@ -0,0 +1,35 @@
|
|||
/*
|
||||
* Copyright 2011 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
#include "string-endian.h"
|
||||
|
||||
size_t strlen(const char *s)
|
||||
{
|
||||
/* Get an aligned pointer. */
|
||||
const uintptr_t s_int = (uintptr_t) s;
|
||||
const uint64_t *p = (const uint64_t *)(s_int & -8);
|
||||
|
||||
/* Read and MASK the first word. */
|
||||
uint64_t v = *p | MASK(s_int);
|
||||
|
||||
uint64_t bits;
|
||||
while ((bits = __insn_v1cmpeqi(v, 0)) == 0)
|
||||
v = *++p;
|
||||
|
||||
return ((const char *)p) + (CFZ(bits) >> 3) - s;
|
||||
}
|
||||
EXPORT_SYMBOL(strlen);
|
47
arch/tile/lib/strnlen_32.c
Normal file
|
@ -0,0 +1,47 @@
|
|||
/*
|
||||
* Copyright 2013 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
size_t strnlen(const char *s, size_t count)
|
||||
{
|
||||
/* Get an aligned pointer. */
|
||||
const uintptr_t s_int = (uintptr_t) s;
|
||||
const uint32_t *p = (const uint32_t *)(s_int & -4);
|
||||
size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
|
||||
size_t len;
|
||||
uint32_t v, bits;
|
||||
|
||||
/* Avoid page fault risk by not reading any bytes when count is 0. */
|
||||
if (count == 0)
|
||||
return 0;
|
||||
|
||||
/* Read first word, but force bytes before the string to be nonzero. */
|
||||
v = *p | ((1 << ((s_int << 3) & 31)) - 1);
|
||||
|
||||
while ((bits = __insn_seqb(v, 0)) == 0) {
|
||||
if (bytes_read >= count) {
|
||||
/* Read COUNT bytes and didn't find the terminator. */
|
||||
return count;
|
||||
}
|
||||
v = *++p;
|
||||
bytes_read += sizeof(v);
|
||||
}
|
||||
|
||||
len = ((const char *) p) + (__insn_ctz(bits) >> 3) - s;
|
||||
return (len < count ? len : count);
|
||||
}
|
||||
EXPORT_SYMBOL(strnlen);
|
48
arch/tile/lib/strnlen_64.c
Normal file
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* Copyright 2013 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
#include "string-endian.h"
|
||||
|
||||
size_t strnlen(const char *s, size_t count)
|
||||
{
|
||||
/* Get an aligned pointer. */
|
||||
const uintptr_t s_int = (uintptr_t) s;
|
||||
const uint64_t *p = (const uint64_t *)(s_int & -8);
|
||||
size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
|
||||
size_t len;
|
||||
uint64_t v, bits;
|
||||
|
||||
/* Avoid page fault risk by not reading any bytes when count is 0. */
|
||||
if (count == 0)
|
||||
return 0;
|
||||
|
||||
/* Read and MASK the first word. */
|
||||
v = *p | MASK(s_int);
|
||||
|
||||
while ((bits = __insn_v1cmpeqi(v, 0)) == 0) {
|
||||
if (bytes_read >= count) {
|
||||
/* Read COUNT bytes and didn't find the terminator. */
|
||||
return count;
|
||||
}
|
||||
v = *++p;
|
||||
bytes_read += sizeof(v);
|
||||
}
|
||||
|
||||
len = ((const char *) p) + (CFZ(bits) >> 3) - s;
|
||||
return (len < count ? len : count);
|
||||
}
|
||||
EXPORT_SYMBOL(strnlen);
|
24
arch/tile/lib/uaccess.c
Normal file
|
@ -0,0 +1,24 @@
|
|||
/*
|
||||
* Copyright 2010 Tilera Corporation. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
int __range_ok(unsigned long addr, unsigned long size)
|
||||
{
|
||||
unsigned long limit = current_thread_info()->addr_limit.seg;
|
||||
return !((addr < limit && size <= limit - addr) ||
|
||||
is_arch_mappable_range(addr, size));
|
||||
}
|
||||
EXPORT_SYMBOL(__range_ok);
|
135
arch/tile/lib/usercopy_32.S
Normal file
|
@ -0,0 +1,135 @@
|
|||
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cache.h>
#include <arch/chip.h>

/* Access user memory, but use MMU to avoid propagating kernel exceptions. */

/*
 * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
 * It returns the length, including the terminating NUL, or zero on exception.
 * If length is greater than the bound, returns one plus the bound.
 */
STD_ENTRY(strnlen_user_asm)
        { bz r1, 2f; addi r3, r0, -1 }  /* bias down to include NUL */
1:      { lb_u r4, r0; addi r1, r1, -1 }
        bz r4, 2f
        { bnzt r1, 1b; addi r0, r0, 1 }
2:      { sub r0, r0, r3; jrp lr }
        STD_ENDPROC(strnlen_user_asm)
        .pushsection .fixup,"ax"
strnlen_user_fault:
        { move r0, zero; jrp lr }
        ENDPROC(strnlen_user_fault)
        .section __ex_table,"a"
        .align 4
        .word 1b, strnlen_user_fault
        .popsection

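The return convention documented above is easy to misread, so here is a throwaway C model of it; ordinary memory cannot fault, so the "zero on exception" case has no analogue. This is an illustration, not kernel code:

#include <stddef.h>
#include <stdio.h>

/* Models only the documented return values of strnlen_user_asm. */
static size_t strnlen_user_model(const char *s, size_t bound)
{
        size_t n;

        for (n = 0; n < bound; n++)
                if (s[n] == '\0')
                        return n + 1;           /* length including the NUL */
        return bound + 1;                       /* no NUL within the bound */
}

int main(void)
{
        printf("%zu\n", strnlen_user_model("tile", 16));    /* 5 */
        printf("%zu\n", strnlen_user_model("tile", 3));     /* 4: bound exceeded */
        return 0;
}
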
/*
 * strncpy_from_user_asm takes the kernel target pointer in r0,
 * the userspace source pointer in r1, and the length bound (including
 * the trailing NUL) in r2.  On success, it returns the string length
 * (not including the trailing NUL), or -EFAULT on failure.
 */
STD_ENTRY(strncpy_from_user_asm)
        { bz r2, 2f; move r3, r0 }
1:      { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
        { sb r0, r4; addi r0, r0, 1 }
        bz r4, 2f
        bnzt r2, 1b
        { sub r0, r0, r3; jrp lr }
2:      addi r0, r0, -1   /* don't count the trailing NUL */
        { sub r0, r0, r3; jrp lr }
        STD_ENDPROC(strncpy_from_user_asm)
        .pushsection .fixup,"ax"
strncpy_from_user_fault:
        { movei r0, -EFAULT; jrp lr }
        ENDPROC(strncpy_from_user_fault)
        .section __ex_table,"a"
        .align 4
        .word 1b, strncpy_from_user_fault
        .popsection

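In the same spirit, a minimal C model of the convention in the comment above. The -EFAULT path cannot be reproduced against ordinary memory, and the behavior when the bound is exhausted before a NUL (returning the bound) is inferred from the fall-through path in the assembly; treat this as an illustration, not kernel code:

#include <stddef.h>
#include <stdio.h>

static long strncpy_from_user_model(char *dst, const char *src, size_t bound)
{
        size_t i;

        for (i = 0; i < bound; i++) {
                dst[i] = src[i];                /* the NUL itself is copied */
                if (src[i] == '\0')
                        return (long)i;         /* length, NUL excluded */
        }
        return (long)bound;                     /* bound hit before any NUL */
}

int main(void)
{
        char buf[8];

        printf("%ld\n", strncpy_from_user_model(buf, "tile", sizeof(buf)));  /* 4 */
        printf("%ld\n", strncpy_from_user_model(buf, "tilegx", 3));          /* 3 */
        return 0;
}
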
/*
 * clear_user_asm takes the user target address in r0 and the
 * number of bytes to zero in r1.
 * It returns the number of uncopiable bytes (hopefully zero) in r0.
 * Note that we don't use a separate .fixup section here since we fall
 * through into the "fixup" code as the last straight-line bundle anyway.
 */
STD_ENTRY(clear_user_asm)
        { bz r1, 2f; or r2, r0, r1 }
        andi r2, r2, 3
        bzt r2, .Lclear_aligned_user_asm
1:      { sb r0, zero; addi r0, r0, 1; addi r1, r1, -1 }
        bnzt r1, 1b
2:      { move r0, r1; jrp lr }
        .pushsection __ex_table,"a"
        .align 4
        .word 1b, 2b
        .popsection

.Lclear_aligned_user_asm:
1:      { sw r0, zero; addi r0, r0, 4; addi r1, r1, -4 }
        bnzt r1, 1b
2:      { move r0, r1; jrp lr }
        STD_ENDPROC(clear_user_asm)
        .pushsection __ex_table,"a"
        .align 4
        .word 1b, 2b
        .popsection

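clear_user_asm above takes the word loop only when both the address and the length are 4-byte aligned; OR-ing the two values and masking the low bits answers that in a single test. A tiny standalone illustration of that check:

#include <stdio.h>

int main(void)
{
        unsigned long addr = 0x1000, len = 24;

        /* (addr | len) has a low bit set iff either value is misaligned. */
        if (((addr | len) & 3) == 0)
                printf("word loop: address and length are both multiples of 4\n");
        else
                printf("byte loop\n");
        return 0;
}
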
/*
 * flush_user_asm takes the user target address in r0 and the
 * number of bytes to flush in r1.
 * It returns the number of unflushable bytes (hopefully zero) in r0.
 */
STD_ENTRY(flush_user_asm)
        bz r1, 2f
        { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
        { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
        { and r0, r0, r2; and r1, r1, r2 }
        { sub r1, r1, r0 }
1:      { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() }
        { addi r0, r0, CHIP_FLUSH_STRIDE(); bnzt r1, 1b }
2:      { move r0, r1; jrp lr }
        STD_ENDPROC(flush_user_asm)
        .pushsection __ex_table,"a"
        .align 4
        .word 1b, 2b
        .popsection

/*
 * finv_user_asm takes the user target address in r0 and the
 * number of bytes to flush-invalidate in r1.
 * It returns the number of not finv'able bytes (hopefully zero) in r0.
 */
STD_ENTRY(finv_user_asm)
        bz r1, 2f
        { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
        { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
        { and r0, r0, r2; and r1, r1, r2 }
        { sub r1, r1, r0 }
1:      { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() }
        { addi r0, r0, CHIP_FINV_STRIDE(); bnzt r1, 1b }
2:      { move r0, r1; jrp lr }
        STD_ENDPROC(finv_user_asm)
        .pushsection __ex_table,"a"
        .align 4
        .word 1b, 2b
        .popsection
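
flush_user_asm and finv_user_asm both round the requested range out to whole L2 cache lines before walking it at the chip's flush/finv stride; negating the line size yields the alignment mask. A standalone sketch of that arithmetic (the 64-byte line size is a placeholder for the demo, not necessarily the real L2_CACHE_BYTES):

#include <stdio.h>

#define L2_CACHE_BYTES 64UL             /* placeholder line size for the demo */

int main(void)
{
        unsigned long addr = 0x1234UL, len = 100UL;
        unsigned long mask = -L2_CACHE_BYTES;                           /* e.g. ~0x3f */
        unsigned long start = addr & mask;                              /* round start down */
        unsigned long end = (addr + len + L2_CACHE_BYTES - 1) & mask;   /* round end up */

        /* The asm then issues one flush/finv per stride in [start, end). */
        printf("flush %lu bytes starting at %#lx\n", end - start, start);
        return 0;
}
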
135
arch/tile/lib/usercopy_64.S
Normal file
@@ -0,0 +1,135 @@
/*
 * Copyright 2011 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cache.h>
#include <arch/chip.h>

/* Access user memory, but use MMU to avoid propagating kernel exceptions. */

/*
 * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
 * It returns the length, including the terminating NUL, or zero on exception.
 * If length is greater than the bound, returns one plus the bound.
 */
STD_ENTRY(strnlen_user_asm)
        { beqz r1, 2f; addi r3, r0, -1 }  /* bias down to include NUL */
1:      { ld1u r4, r0; addi r1, r1, -1 }
        beqz r4, 2f
        { bnezt r1, 1b; addi r0, r0, 1 }
2:      { sub r0, r0, r3; jrp lr }
        STD_ENDPROC(strnlen_user_asm)
        .pushsection .fixup,"ax"
strnlen_user_fault:
        { move r0, zero; jrp lr }
        ENDPROC(strnlen_user_fault)
        .section __ex_table,"a"
        .align 8
        .quad 1b, strnlen_user_fault
        .popsection

/*
 * strncpy_from_user_asm takes the kernel target pointer in r0,
 * the userspace source pointer in r1, and the length bound (including
 * the trailing NUL) in r2.  On success, it returns the string length
 * (not including the trailing NUL), or -EFAULT on failure.
 */
STD_ENTRY(strncpy_from_user_asm)
        { beqz r2, 2f; move r3, r0 }
1:      { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
        { st1 r0, r4; addi r0, r0, 1 }
        beqz r4, 2f
        bnezt r2, 1b
        { sub r0, r0, r3; jrp lr }
2:      addi r0, r0, -1   /* don't count the trailing NUL */
        { sub r0, r0, r3; jrp lr }
        STD_ENDPROC(strncpy_from_user_asm)
        .pushsection .fixup,"ax"
strncpy_from_user_fault:
        { movei r0, -EFAULT; jrp lr }
        ENDPROC(strncpy_from_user_fault)
        .section __ex_table,"a"
        .align 8
        .quad 1b, strncpy_from_user_fault
        .popsection

/*
 * clear_user_asm takes the user target address in r0 and the
 * number of bytes to zero in r1.
 * It returns the number of uncopiable bytes (hopefully zero) in r0.
 * Note that we don't use a separate .fixup section here since we fall
 * through into the "fixup" code as the last straight-line bundle anyway.
 */
STD_ENTRY(clear_user_asm)
        { beqz r1, 2f; or r2, r0, r1 }
        andi r2, r2, 7
        beqzt r2, .Lclear_aligned_user_asm
1:      { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 }
        bnezt r1, 1b
2:      { move r0, r1; jrp lr }
        .pushsection __ex_table,"a"
        .align 8
        .quad 1b, 2b
        .popsection

.Lclear_aligned_user_asm:
1:      { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 }
        bnezt r1, 1b
2:      { move r0, r1; jrp lr }
        STD_ENDPROC(clear_user_asm)
        .pushsection __ex_table,"a"
        .align 8
        .quad 1b, 2b
        .popsection

/*
 * flush_user_asm takes the user target address in r0 and the
 * number of bytes to flush in r1.
 * It returns the number of unflushable bytes (hopefully zero) in r0.
 */
STD_ENTRY(flush_user_asm)
        beqz r1, 2f
        { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
        { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
        { and r0, r0, r2; and r1, r1, r2 }
        { sub r1, r1, r0 }
1:      { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() }
        { addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b }
2:      { move r0, r1; jrp lr }
        STD_ENDPROC(flush_user_asm)
        .pushsection __ex_table,"a"
        .align 8
        .quad 1b, 2b
        .popsection

/*
 * finv_user_asm takes the user target address in r0 and the
 * number of bytes to flush-invalidate in r1.
 * It returns the number of not finv'able bytes (hopefully zero) in r0.
 */
STD_ENTRY(finv_user_asm)
        beqz r1, 2f
        { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
        { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
        { and r0, r0, r2; and r1, r1, r2 }
        { sub r1, r1, r0 }
1:      { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() }
        { addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b }
2:      { move r0, r1; jrp lr }
        STD_ENDPROC(finv_user_asm)
        .pushsection __ex_table,"a"
        .align 8
        .quad 1b, 2b
        .popsection