Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

159
arch/powerpc/mm/40x_mmu.c Normal file
View file

@ -0,0 +1,159 @@
/*
* This file contains the routines for initializing the MMU
* on the 4xx series of chips.
* -- paulus
*
* Derived from arch/ppc/mm/init.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/highmem.h>
#include <linux/memblock.h>
#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/bootx.h>
#include <asm/machdep.h>
#include <asm/setup.h>
#include "mmu_decl.h"
extern int __map_without_ltlbs;
/*
* MMU_init_hw does the chip-specific initialization of the MMU hardware.
*/
void __init MMU_init_hw(void)
{
/*
* The Zone Protection Register (ZPR) defines how protection will
* be applied to every page which is a member of a given zone. At
* present, we utilize only two of the 4xx's zones.
* The zone index bits (of ZSEL) in the PTE are used for software
* indicators, except the LSB. For user access, zone 1 is used,
* for kernel access, zone 0 is used. We set all but zone 1
* to zero, allowing only kernel access as indicated in the PTE.
* For zone 1, we set a 01 binary (a value of 10 will not work)
* to allow user access as indicated in the PTE. This also allows
* kernel access as indicated in the PTE.
*/
mtspr(SPRN_ZPR, 0x10000000);
flush_instruction_cache();
/*
* Set up the real-mode cache parameters for the exception vector
* handlers (which are run in real-mode).
*/
mtspr(SPRN_DCWR, 0x00000000); /* All caching is write-back */
/*
* Cache instruction and data space where the exception
* vectors and the kernel live in real-mode.
*/
mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */
mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */
}
#define LARGE_PAGE_SIZE_16M (1<<24)
#define LARGE_PAGE_SIZE_4M (1<<22)
unsigned long __init mmu_mapin_ram(unsigned long top)
{
unsigned long v, s, mapped;
phys_addr_t p;
v = KERNELBASE;
p = 0;
s = total_lowmem;
if (__map_without_ltlbs)
return 0;
while (s >= LARGE_PAGE_SIZE_16M) {
pmd_t *pmdp;
unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
pmd_val(*pmdp++) = val;
pmd_val(*pmdp++) = val;
pmd_val(*pmdp++) = val;
pmd_val(*pmdp++) = val;
v += LARGE_PAGE_SIZE_16M;
p += LARGE_PAGE_SIZE_16M;
s -= LARGE_PAGE_SIZE_16M;
}
while (s >= LARGE_PAGE_SIZE_4M) {
pmd_t *pmdp;
unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
pmd_val(*pmdp) = val;
v += LARGE_PAGE_SIZE_4M;
p += LARGE_PAGE_SIZE_4M;
s -= LARGE_PAGE_SIZE_4M;
}
mapped = total_lowmem - s;
/* If the size of RAM is not an exact power of two, we may not
* have covered RAM in its entirety with 16 and 4 MiB
* pages. Consequently, restrict the top end of RAM currently
* allocable so that calls to the MEMBLOCK to allocate PTEs for "tail"
* coverage with normal-sized pages (or other reasons) do not
* attempt to allocate outside the allowed range.
*/
memblock_set_current_limit(mapped);
return mapped;
}
void setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
/* We don't currently support the first MEMBLOCK not mapping 0
* physical on those processors
*/
BUG_ON(first_memblock_base != 0);
/* 40x can only access 16MB at the moment (see head_40x.S) */
memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
}

254
arch/powerpc/mm/44x_mmu.c Normal file
View file

@ -0,0 +1,254 @@
/*
* Modifications by Matt Porter (mporter@mvista.com) to support
* PPC44x Book E processors.
*
* This file contains the routines for initializing the MMU
* on the 4xx series of chips.
* -- paulus
*
* Derived from arch/ppc/mm/init.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/init.h>
#include <linux/memblock.h>
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include "mmu_decl.h"
/* Used by the 44x TLB replacement exception handler.
* Just needed it declared someplace.
*/
unsigned int tlb_44x_index; /* = 0 */
unsigned int tlb_44x_hwater = PPC44x_TLB_SIZE - 1 - PPC44x_EARLY_TLBS;
int icache_44x_need_flush;
unsigned long tlb_47x_boltmap[1024/8];
static void ppc44x_update_tlb_hwater(void)
{
extern unsigned int tlb_44x_patch_hwater_D[];
extern unsigned int tlb_44x_patch_hwater_I[];
/* The TLB miss handlers hard codes the watermark in a cmpli
* instruction to improve performances rather than loading it
* from the global variable. Thus, we patch the instructions
* in the 2 TLB miss handlers when updating the value
*/
tlb_44x_patch_hwater_D[0] = (tlb_44x_patch_hwater_D[0] & 0xffff0000) |
tlb_44x_hwater;
flush_icache_range((unsigned long)&tlb_44x_patch_hwater_D[0],
(unsigned long)&tlb_44x_patch_hwater_D[1]);
tlb_44x_patch_hwater_I[0] = (tlb_44x_patch_hwater_I[0] & 0xffff0000) |
tlb_44x_hwater;
flush_icache_range((unsigned long)&tlb_44x_patch_hwater_I[0],
(unsigned long)&tlb_44x_patch_hwater_I[1]);
}
/*
* "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 44x type MMU
*/
static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys)
{
unsigned int entry = tlb_44x_hwater--;
ppc44x_update_tlb_hwater();
mtspr(SPRN_MMUCR, 0);
__asm__ __volatile__(
"tlbwe %2,%3,%4\n"
"tlbwe %1,%3,%5\n"
"tlbwe %0,%3,%6\n"
:
: "r" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G),
"r" (phys),
"r" (virt | PPC44x_TLB_VALID | PPC44x_TLB_256M),
"r" (entry),
"i" (PPC44x_TLB_PAGEID),
"i" (PPC44x_TLB_XLAT),
"i" (PPC44x_TLB_ATTRIB));
}
static int __init ppc47x_find_free_bolted(void)
{
unsigned int mmube0 = mfspr(SPRN_MMUBE0);
unsigned int mmube1 = mfspr(SPRN_MMUBE1);
if (!(mmube0 & MMUBE0_VBE0))
return 0;
if (!(mmube0 & MMUBE0_VBE1))
return 1;
if (!(mmube0 & MMUBE0_VBE2))
return 2;
if (!(mmube1 & MMUBE1_VBE3))
return 3;
if (!(mmube1 & MMUBE1_VBE4))
return 4;
if (!(mmube1 & MMUBE1_VBE5))
return 5;
return -1;
}
static void __init ppc47x_update_boltmap(void)
{
unsigned int mmube0 = mfspr(SPRN_MMUBE0);
unsigned int mmube1 = mfspr(SPRN_MMUBE1);
if (mmube0 & MMUBE0_VBE0)
__set_bit((mmube0 >> MMUBE0_IBE0_SHIFT) & 0xff,
tlb_47x_boltmap);
if (mmube0 & MMUBE0_VBE1)
__set_bit((mmube0 >> MMUBE0_IBE1_SHIFT) & 0xff,
tlb_47x_boltmap);
if (mmube0 & MMUBE0_VBE2)
__set_bit((mmube0 >> MMUBE0_IBE2_SHIFT) & 0xff,
tlb_47x_boltmap);
if (mmube1 & MMUBE1_VBE3)
__set_bit((mmube1 >> MMUBE1_IBE3_SHIFT) & 0xff,
tlb_47x_boltmap);
if (mmube1 & MMUBE1_VBE4)
__set_bit((mmube1 >> MMUBE1_IBE4_SHIFT) & 0xff,
tlb_47x_boltmap);
if (mmube1 & MMUBE1_VBE5)
__set_bit((mmube1 >> MMUBE1_IBE5_SHIFT) & 0xff,
tlb_47x_boltmap);
}
/*
* "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
*/
static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
{
unsigned int rA;
int bolted;
/* Base rA is HW way select, way 0, bolted bit set */
rA = 0x88000000;
/* Look for a bolted entry slot */
bolted = ppc47x_find_free_bolted();
BUG_ON(bolted < 0);
/* Insert bolted slot number */
rA |= bolted << 24;
pr_debug("256M TLB entry for 0x%08x->0x%08x in bolt slot %d\n",
virt, phys, bolted);
mtspr(SPRN_MMUCR, 0);
__asm__ __volatile__(
"tlbwe %2,%3,0\n"
"tlbwe %1,%3,1\n"
"tlbwe %0,%3,2\n"
:
: "r" (PPC47x_TLB2_SW | PPC47x_TLB2_SR |
PPC47x_TLB2_SX
#ifdef CONFIG_SMP
| PPC47x_TLB2_M
#endif
),
"r" (phys),
"r" (virt | PPC47x_TLB0_VALID | PPC47x_TLB0_256M),
"r" (rA));
}
void __init MMU_init_hw(void)
{
/* This is not useful on 47x but won't hurt either */
ppc44x_update_tlb_hwater();
flush_instruction_cache();
}
unsigned long __init mmu_mapin_ram(unsigned long top)
{
unsigned long addr;
unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
/* Pin in enough TLBs to cover any lowmem not covered by the
* initial 256M mapping established in head_44x.S */
for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr;
addr += PPC_PIN_SIZE) {
if (mmu_has_feature(MMU_FTR_TYPE_47x))
ppc47x_pin_tlb(addr + PAGE_OFFSET, addr);
else
ppc44x_pin_tlb(addr + PAGE_OFFSET, addr);
}
if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
ppc47x_update_boltmap();
#ifdef DEBUG
{
int i;
printk(KERN_DEBUG "bolted entries: ");
for (i = 0; i < 255; i++) {
if (test_bit(i, tlb_47x_boltmap))
printk("%d ", i);
}
printk("\n");
}
#endif /* DEBUG */
}
return total_lowmem;
}
void setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
u64 size;
#ifndef CONFIG_NONSTATIC_KERNEL
/* We don't currently support the first MEMBLOCK not mapping 0
* physical on those processors
*/
BUG_ON(first_memblock_base != 0);
#endif
/* 44x has a 256M TLB entry pinned at boot */
size = (min_t(u64, first_memblock_size, PPC_PIN_SIZE));
memblock_set_current_limit(first_memblock_base + size);
}
#ifdef CONFIG_SMP
void mmu_init_secondary(int cpu)
{
unsigned long addr;
unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
/* Pin in enough TLBs to cover any lowmem not covered by the
* initial 256M mapping established in head_44x.S
*
* WARNING: This is called with only the first 256M of the
* linear mapping in the TLB and we can't take faults yet
* so beware of what this code uses. It runs off a temporary
* stack. current (r2) isn't initialized, smp_processor_id()
* will not work, current thread info isn't accessible, ...
*/
for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr;
addr += PPC_PIN_SIZE) {
if (mmu_has_feature(MMU_FTR_TYPE_47x))
ppc47x_pin_tlb(addr + PAGE_OFFSET, addr);
else
ppc44x_pin_tlb(addr + PAGE_OFFSET, addr);
}
}
#endif /* CONFIG_SMP */

37
arch/powerpc/mm/Makefile Normal file
View file

@ -0,0 +1,37 @@
#
# Makefile for the linux ppc-specific parts of the memory manager.
#
subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
obj-y := fault.o mem.o pgtable.o gup.o mmap.o \
init_$(CONFIG_WORD_SIZE).o \
pgtable_$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
tlb_nohash_low.o
obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o
hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o slb_low.o slb.o $(hash64-y)
obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o
obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \
tlb_hash$(CONFIG_WORD_SIZE).o \
mmu_context_hash$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_PPC_ICSWX) += icswx.o
obj-$(CONFIG_PPC_ICSWX_PID) += icswx_pid.o
obj-$(CONFIG_40x) += 40x_mmu.o
obj-$(CONFIG_44x) += 44x_mmu.o
obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o
obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
obj-$(CONFIG_PPC_MM_SLICES) += slice.o
obj-y += hugetlbpage.o
ifeq ($(CONFIG_HUGETLB_PAGE),y)
obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
endif
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
obj-$(CONFIG_HIGHMEM) += highmem.o
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o

View file

@ -0,0 +1,148 @@
/*
* CoProcessor (SPU/AFU) mm fault handler
*
* (C) Copyright IBM Deutschland Entwicklung GmbH 2007
*
* Author: Arnd Bergmann <arndb@de.ibm.com>
* Author: Jeremy Kerr <jk@ozlabs.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <asm/reg.h>
#include <asm/copro.h>
#include <asm/spu.h>
#include <misc/cxl.h>
/*
* This ought to be kept in sync with the powerpc specific do_page_fault
* function. Currently, there are a few corner cases that we haven't had
* to handle fortunately.
*/
int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
unsigned long dsisr, unsigned *flt)
{
struct vm_area_struct *vma;
unsigned long is_write;
int ret;
if (mm == NULL)
return -EFAULT;
if (mm->pgd == NULL)
return -EFAULT;
down_read(&mm->mmap_sem);
ret = -EFAULT;
vma = find_vma(mm, ea);
if (!vma)
goto out_unlock;
if (ea < vma->vm_start) {
if (!(vma->vm_flags & VM_GROWSDOWN))
goto out_unlock;
if (expand_stack(vma, ea))
goto out_unlock;
}
is_write = dsisr & DSISR_ISSTORE;
if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
goto out_unlock;
} else {
if (dsisr & DSISR_PROTFAULT)
goto out_unlock;
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto out_unlock;
}
ret = 0;
*flt = handle_mm_fault(mm, vma, ea, is_write ? FAULT_FLAG_WRITE : 0);
if (unlikely(*flt & VM_FAULT_ERROR)) {
if (*flt & VM_FAULT_OOM) {
ret = -ENOMEM;
goto out_unlock;
} else if (*flt & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) {
ret = -EFAULT;
goto out_unlock;
}
BUG();
}
if (*flt & VM_FAULT_MAJOR)
current->maj_flt++;
else
current->min_flt++;
out_unlock:
up_read(&mm->mmap_sem);
return ret;
}
EXPORT_SYMBOL_GPL(copro_handle_mm_fault);
int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
{
u64 vsid;
int psize, ssize;
switch (REGION_ID(ea)) {
case USER_REGION_ID:
pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea);
psize = get_slice_psize(mm, ea);
ssize = user_segment_size(ea);
vsid = get_vsid(mm->context.id, ea, ssize);
break;
case VMALLOC_REGION_ID:
pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea);
if (ea < VMALLOC_END)
psize = mmu_vmalloc_psize;
else
psize = mmu_io_psize;
ssize = mmu_kernel_ssize;
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
break;
case KERNEL_REGION_ID:
pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea);
psize = mmu_linear_psize;
ssize = mmu_kernel_ssize;
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
break;
default:
pr_debug("%s: invalid region access at %016llx\n", __func__, ea);
return 1;
}
vsid = (vsid << slb_vsid_shift(ssize)) | SLB_VSID_USER;
vsid |= mmu_psize_defs[psize].sllp |
((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0);
slb->esid = (ea & (ssize == MMU_SEGSIZE_1T ? ESID_MASK_1T : ESID_MASK)) | SLB_ESID_V;
slb->vsid = vsid;
return 0;
}
EXPORT_SYMBOL_GPL(copro_calculate_slb);
void copro_flush_all_slbs(struct mm_struct *mm)
{
#ifdef CONFIG_SPU_BASE
spu_flush_all_slbs(mm);
#endif
cxl_slbia(mm);
}
EXPORT_SYMBOL_GPL(copro_flush_all_slbs);

View file

@ -0,0 +1,420 @@
/*
* PowerPC version derived from arch/arm/mm/consistent.c
* Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
*
* Copyright (C) 2000 Russell King
*
* Consistent memory allocators. Used for DMA devices that want to
* share uncached memory with the processor core. The function return
* is the virtual address and 'dma_handle' is the physical address.
* Mostly stolen from the ARM port, with some changes for PowerPC.
* -- Dan
*
* Reorganized to get rid of the arch-specific consistent_* functions
* and provide non-coherent implementations for the DMA API. -Matt
*
* Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent()
* implementation. This is pulled straight from ARM and barely
* modified. -Matt
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/dma-mapping.h>
#include <linux/export.h>
#include <asm/tlbflush.h>
#include <asm/dma.h>
#include "mmu_decl.h"
/*
* This address range defaults to a value that is safe for all
* platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
* can be further configured for specific applications under
* the "Advanced Setup" menu. -Matt
*/
#define CONSISTENT_BASE (IOREMAP_TOP)
#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE)
#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
/*
* This is the page table (2MB) covering uncached, DMA consistent allocations
*/
static DEFINE_SPINLOCK(consistent_lock);
/*
* VM region handling support.
*
* This should become something generic, handling VM region allocations for
* vmalloc and similar (ioremap, module space, etc).
*
* I envisage vmalloc()'s supporting vm_struct becoming:
*
* struct vm_struct {
* struct vm_region region;
* unsigned long flags;
* struct page **pages;
* unsigned int nr_pages;
* unsigned long phys_addr;
* };
*
* get_vm_area() would then call vm_region_alloc with an appropriate
* struct vm_region head (eg):
*
* struct vm_region vmalloc_head = {
* .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list),
* .vm_start = VMALLOC_START,
* .vm_end = VMALLOC_END,
* };
*
* However, vmalloc_head.vm_start is variable (typically, it is dependent on
* the amount of RAM found at boot time.) I would imagine that get_vm_area()
* would have to initialise this each time prior to calling vm_region_alloc().
*/
struct ppc_vm_region {
struct list_head vm_list;
unsigned long vm_start;
unsigned long vm_end;
};
static struct ppc_vm_region consistent_head = {
.vm_list = LIST_HEAD_INIT(consistent_head.vm_list),
.vm_start = CONSISTENT_BASE,
.vm_end = CONSISTENT_END,
};
static struct ppc_vm_region *
ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp)
{
unsigned long addr = head->vm_start, end = head->vm_end - size;
unsigned long flags;
struct ppc_vm_region *c, *new;
new = kmalloc(sizeof(struct ppc_vm_region), gfp);
if (!new)
goto out;
spin_lock_irqsave(&consistent_lock, flags);
list_for_each_entry(c, &head->vm_list, vm_list) {
if ((addr + size) < addr)
goto nospc;
if ((addr + size) <= c->vm_start)
goto found;
addr = c->vm_end;
if (addr > end)
goto nospc;
}
found:
/*
* Insert this entry _before_ the one we found.
*/
list_add_tail(&new->vm_list, &c->vm_list);
new->vm_start = addr;
new->vm_end = addr + size;
spin_unlock_irqrestore(&consistent_lock, flags);
return new;
nospc:
spin_unlock_irqrestore(&consistent_lock, flags);
kfree(new);
out:
return NULL;
}
static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr)
{
struct ppc_vm_region *c;
list_for_each_entry(c, &head->vm_list, vm_list) {
if (c->vm_start == addr)
goto out;
}
c = NULL;
out:
return c;
}
/*
* Allocate DMA-coherent memory space and return both the kernel remapped
* virtual and bus address for that space.
*/
void *
__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp)
{
struct page *page;
struct ppc_vm_region *c;
unsigned long order;
u64 mask = ISA_DMA_THRESHOLD, limit;
if (dev) {
mask = dev->coherent_dma_mask;
/*
* Sanity check the DMA mask - it must be non-zero, and
* must be able to be satisfied by a DMA allocation.
*/
if (mask == 0) {
dev_warn(dev, "coherent DMA mask is unset\n");
goto no_page;
}
if ((~mask) & ISA_DMA_THRESHOLD) {
dev_warn(dev, "coherent DMA mask %#llx is smaller "
"than system GFP_DMA mask %#llx\n",
mask, (unsigned long long)ISA_DMA_THRESHOLD);
goto no_page;
}
}
size = PAGE_ALIGN(size);
limit = (mask + 1) & ~mask;
if ((limit && size >= limit) ||
size >= (CONSISTENT_END - CONSISTENT_BASE)) {
printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
size, mask);
return NULL;
}
order = get_order(size);
/* Might be useful if we ever have a real legacy DMA zone... */
if (mask != 0xffffffff)
gfp |= GFP_DMA;
page = alloc_pages(gfp, order);
if (!page)
goto no_page;
/*
* Invalidate any data that might be lurking in the
* kernel direct-mapped region for device DMA.
*/
{
unsigned long kaddr = (unsigned long)page_address(page);
memset(page_address(page), 0, size);
flush_dcache_range(kaddr, kaddr + size);
}
/*
* Allocate a virtual address in the consistent mapping region.
*/
c = ppc_vm_region_alloc(&consistent_head, size,
gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
if (c) {
unsigned long vaddr = c->vm_start;
struct page *end = page + (1 << order);
split_page(page, order);
/*
* Set the "dma handle"
*/
*handle = page_to_phys(page);
do {
SetPageReserved(page);
map_page(vaddr, page_to_phys(page),
pgprot_noncached(PAGE_KERNEL));
page++;
vaddr += PAGE_SIZE;
} while (size -= PAGE_SIZE);
/*
* Free the otherwise unused pages.
*/
while (page < end) {
__free_page(page);
page++;
}
return (void *)c->vm_start;
}
if (page)
__free_pages(page, order);
no_page:
return NULL;
}
EXPORT_SYMBOL(__dma_alloc_coherent);
/*
* free a page as defined by the above mapping.
*/
void __dma_free_coherent(size_t size, void *vaddr)
{
struct ppc_vm_region *c;
unsigned long flags, addr;
size = PAGE_ALIGN(size);
spin_lock_irqsave(&consistent_lock, flags);
c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr);
if (!c)
goto no_area;
if ((c->vm_end - c->vm_start) != size) {
printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
__func__, c->vm_end - c->vm_start, size);
dump_stack();
size = c->vm_end - c->vm_start;
}
addr = c->vm_start;
do {
pte_t *ptep;
unsigned long pfn;
ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr),
addr),
addr),
addr);
if (!pte_none(*ptep) && pte_present(*ptep)) {
pfn = pte_pfn(*ptep);
pte_clear(&init_mm, addr, ptep);
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
__free_reserved_page(page);
}
}
addr += PAGE_SIZE;
} while (size -= PAGE_SIZE);
flush_tlb_kernel_range(c->vm_start, c->vm_end);
list_del(&c->vm_list);
spin_unlock_irqrestore(&consistent_lock, flags);
kfree(c);
return;
no_area:
spin_unlock_irqrestore(&consistent_lock, flags);
printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
__func__, vaddr);
dump_stack();
}
EXPORT_SYMBOL(__dma_free_coherent);
/*
* make an area consistent.
*/
void __dma_sync(void *vaddr, size_t size, int direction)
{
unsigned long start = (unsigned long)vaddr;
unsigned long end = start + size;
switch (direction) {
case DMA_NONE:
BUG();
case DMA_FROM_DEVICE:
/*
* invalidate only when cache-line aligned otherwise there is
* the potential for discarding uncommitted data from the cache
*/
if ((start & (L1_CACHE_BYTES - 1)) || (size & (L1_CACHE_BYTES - 1)))
flush_dcache_range(start, end);
else
invalidate_dcache_range(start, end);
break;
case DMA_TO_DEVICE: /* writeback only */
clean_dcache_range(start, end);
break;
case DMA_BIDIRECTIONAL: /* writeback and invalidate */
flush_dcache_range(start, end);
break;
}
}
EXPORT_SYMBOL(__dma_sync);
#ifdef CONFIG_HIGHMEM
/*
* __dma_sync_page() implementation for systems using highmem.
* In this case, each page of a buffer must be kmapped/kunmapped
* in order to have a virtual address for __dma_sync(). This must
* not sleep so kmap_atomic()/kunmap_atomic() are used.
*
* Note: yes, it is possible and correct to have a buffer extend
* beyond the first page.
*/
static inline void __dma_sync_page_highmem(struct page *page,
unsigned long offset, size_t size, int direction)
{
size_t seg_size = min((size_t)(PAGE_SIZE - offset), size);
size_t cur_size = seg_size;
unsigned long flags, start, seg_offset = offset;
int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE;
int seg_nr = 0;
local_irq_save(flags);
do {
start = (unsigned long)kmap_atomic(page + seg_nr) + seg_offset;
/* Sync this buffer segment */
__dma_sync((void *)start, seg_size, direction);
kunmap_atomic((void *)start);
seg_nr++;
/* Calculate next buffer segment size */
seg_size = min((size_t)PAGE_SIZE, size - cur_size);
/* Add the segment size to our running total */
cur_size += seg_size;
seg_offset = 0;
} while (seg_nr < nr_segs);
local_irq_restore(flags);
}
#endif /* CONFIG_HIGHMEM */
/*
* __dma_sync_page makes memory consistent. identical to __dma_sync, but
* takes a struct page instead of a virtual address
*/
void __dma_sync_page(struct page *page, unsigned long offset,
size_t size, int direction)
{
#ifdef CONFIG_HIGHMEM
__dma_sync_page_highmem(page, offset, size, direction);
#else
unsigned long start = (unsigned long)page_address(page) + offset;
__dma_sync((void *)start, size, direction);
#endif
}
EXPORT_SYMBOL(__dma_sync_page);
/*
* Return the PFN for a given cpu virtual address returned by
* __dma_alloc_coherent. This is used by dma_mmap_coherent()
*/
unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr)
{
/* This should always be populated, so we don't test every
* level. If that fails, we'll have a nice crash which
* will be as good as a BUG_ON()
*/
pgd_t *pgd = pgd_offset_k(cpu_addr);
pud_t *pud = pud_offset(pgd, cpu_addr);
pmd_t *pmd = pmd_offset(pud, cpu_addr);
pte_t *ptep = pte_offset_kernel(pmd, cpu_addr);
if (pte_none(*ptep) || !pte_present(*ptep))
return 0;
return pte_pfn(*ptep);
}

557
arch/powerpc/mm/fault.c Normal file
View file

@ -0,0 +1,557 @@
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Derived from "arch/i386/mm/fault.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Modified by Cort Dougan and Paul Mackerras.
*
* Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/perf_event.h>
#include <linux/ratelimit.h>
#include <linux/context_tracking.h>
#include <linux/hugetlb.h>
#include <asm/firmware.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/siginfo.h>
#include <asm/debug.h>
#include <mm/mmu_decl.h>
#include "icswx.h"
#ifdef CONFIG_KPROBES
static inline int notify_page_fault(struct pt_regs *regs)
{
int ret = 0;
/* kprobe_running() needs smp_processor_id() */
if (!user_mode(regs)) {
preempt_disable();
if (kprobe_running() && kprobe_fault_handler(regs, 11))
ret = 1;
preempt_enable();
}
return ret;
}
#else
static inline int notify_page_fault(struct pt_regs *regs)
{
return 0;
}
#endif
/*
* Check whether the instruction at regs->nip is a store using
* an update addressing form which will update r1.
*/
static int store_updates_sp(struct pt_regs *regs)
{
unsigned int inst;
if (get_user(inst, (unsigned int __user *)regs->nip))
return 0;
/* check for 1 in the rA field */
if (((inst >> 16) & 0x1f) != 1)
return 0;
/* check major opcode */
switch (inst >> 26) {
case 37: /* stwu */
case 39: /* stbu */
case 45: /* sthu */
case 53: /* stfsu */
case 55: /* stfdu */
return 1;
case 62: /* std or stdu */
return (inst & 3) == 1;
case 31:
/* check minor opcode */
switch ((inst >> 1) & 0x3ff) {
case 181: /* stdux */
case 183: /* stwux */
case 247: /* stbux */
case 439: /* sthux */
case 695: /* stfsux */
case 759: /* stfdux */
return 1;
}
}
return 0;
}
/*
* do_page_fault error handling helpers
*/
#define MM_FAULT_RETURN 0
#define MM_FAULT_CONTINUE -1
#define MM_FAULT_ERR(sig) (sig)
static int do_sigbus(struct pt_regs *regs, unsigned long address,
unsigned int fault)
{
siginfo_t info;
unsigned int lsb = 0;
up_read(&current->mm->mmap_sem);
if (!user_mode(regs))
return MM_FAULT_ERR(SIGBUS);
current->thread.trap_nr = BUS_ADRERR;
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = BUS_ADRERR;
info.si_addr = (void __user *)address;
#ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
current->comm, current->pid, address);
info.si_code = BUS_MCEERR_AR;
}
if (fault & VM_FAULT_HWPOISON_LARGE)
lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
if (fault & VM_FAULT_HWPOISON)
lsb = PAGE_SHIFT;
#endif
info.si_addr_lsb = lsb;
force_sig_info(SIGBUS, &info, current);
return MM_FAULT_RETURN;
}
static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
{
/*
* Pagefault was interrupted by SIGKILL. We have no reason to
* continue the pagefault.
*/
if (fatal_signal_pending(current)) {
/*
* If we have retry set, the mmap semaphore will have
* alrady been released in __lock_page_or_retry(). Else
* we release it now.
*/
if (!(fault & VM_FAULT_RETRY))
up_read(&current->mm->mmap_sem);
/* Coming from kernel, we need to deal with uaccess fixups */
if (user_mode(regs))
return MM_FAULT_RETURN;
return MM_FAULT_ERR(SIGKILL);
}
/* No fault: be happy */
if (!(fault & VM_FAULT_ERROR))
return MM_FAULT_CONTINUE;
/* Out of memory */
if (fault & VM_FAULT_OOM) {
up_read(&current->mm->mmap_sem);
/*
* We ran out of memory, or some other thing happened to us that
* made us unable to handle the page fault gracefully.
*/
if (!user_mode(regs))
return MM_FAULT_ERR(SIGKILL);
pagefault_out_of_memory();
return MM_FAULT_RETURN;
}
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE))
return do_sigbus(regs, addr, fault);
/* We don't understand the fault code, this is fatal */
BUG();
return MM_FAULT_CONTINUE;
}
/*
* For 600- and 800-family processors, the error_code parameter is DSISR
* for a data fault, SRR1 for an instruction fault. For 400-family processors
* the error_code parameter is ESR for a data fault, 0 for an instruction
* fault.
* For 64-bit processors, the error_code parameter is
* - DSISR for a non-SLB data access fault,
* - SRR1 & 0x08000000 for a non-SLB instruction access fault
* - 0 any SLB fault.
*
* The return value is 0 if the fault was handled, or the signal
* number if this is a kernel fault that can't be handled here.
*/
int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code)
{
enum ctx_state prev_state = exception_enter();
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
int code = SEGV_MAPERR;
int is_write = 0;
int trap = TRAP(regs);
int is_exec = trap == 0x400;
int fault;
int rc = 0, store_update_sp = 0;
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
/*
* Fortunately the bit assignments in SRR1 for an instruction
* fault and DSISR for a data fault are mostly the same for the
* bits we are interested in. But there are some bits which
* indicate errors in DSISR but can validly be set in SRR1.
*/
if (trap == 0x400)
error_code &= 0x48200000;
else
is_write = error_code & DSISR_ISSTORE;
#else
is_write = error_code & ESR_DST;
#endif /* CONFIG_4xx || CONFIG_BOOKE */
#ifdef CONFIG_PPC_ICSWX
/*
* we need to do this early because this "data storage
* interrupt" does not update the DAR/DEAR so we don't want to
* look at it
*/
if (error_code & ICSWX_DSI_UCT) {
rc = acop_handle_fault(regs, address, error_code);
if (rc)
goto bail;
}
#endif /* CONFIG_PPC_ICSWX */
if (notify_page_fault(regs))
goto bail;
if (unlikely(debugger_fault_handler(regs)))
goto bail;
/* On a kernel SLB miss we can only check for a valid exception entry */
if (!user_mode(regs) && (address >= TASK_SIZE)) {
rc = SIGSEGV;
goto bail;
}
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
defined(CONFIG_PPC_BOOK3S_64))
if (error_code & DSISR_DABRMATCH) {
/* breakpoint match */
do_break(regs, address, error_code);
goto bail;
}
#endif
/* We restore the interrupt state now */
if (!arch_irq_disabled_regs(regs))
local_irq_enable();
if (in_atomic() || mm == NULL) {
if (!user_mode(regs)) {
rc = SIGSEGV;
goto bail;
}
/* in_atomic() in user mode is really bad,
as is current->mm == NULL. */
printk(KERN_EMERG "Page fault in user mode with "
"in_atomic() = %d mm = %p\n", in_atomic(), mm);
printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
regs->nip, regs->msr);
die("Weird page fault", regs, SIGSEGV);
}
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
/*
* We want to do this outside mmap_sem, because reading code around nip
* can result in fault, which will cause a deadlock when called with
* mmap_sem held
*/
if (user_mode(regs))
store_update_sp = store_updates_sp(regs);
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
* erroneous fault occurring in a code path which already holds mmap_sem
* we will deadlock attempting to validate the fault against the
* address space. Luckily the kernel only validly references user
* space from well defined areas of code, which are listed in the
* exceptions table.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibility of a deadlock.
* Attempt to lock the address space, if we cannot we then validate the
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
if (!down_read_trylock(&mm->mmap_sem)) {
if (!user_mode(regs) && !search_exception_tables(regs->nip))
goto bad_area_nosemaphore;
retry:
down_read(&mm->mmap_sem);
} else {
/*
* The above down_read_trylock() might have succeeded in
* which case we'll have missed the might_sleep() from
* down_read():
*/
might_sleep();
}
vma = find_vma(mm, address);
if (!vma)
goto bad_area;
if (vma->vm_start <= address)
goto good_area;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto bad_area;
/*
* N.B. The POWER/Open ABI allows programs to access up to
* 288 bytes below the stack pointer.
* The kernel signal delivery code writes up to about 1.5kB
* below the stack pointer (r1) before decrementing it.
* The exec code can write slightly over 640kB to the stack
* before setting the user r1. Thus we allow the stack to
* expand to 1MB without further checks.
*/
if (address + 0x100000 < vma->vm_end) {
/* get user regs even if this fault is in kernel mode */
struct pt_regs *uregs = current->thread.regs;
if (uregs == NULL)
goto bad_area;
/*
* A user-mode access to an address a long way below
* the stack pointer is only valid if the instruction
* is one which would update the stack pointer to the
* address accessed if the instruction completed,
* i.e. either stwu rs,n(r1) or stwux rs,r1,rb
* (or the byte, halfword, float or double forms).
*
* If we don't check this then any write to the area
* between the last mapped region and the stack will
* expand the stack rather than segfaulting.
*/
if (address + 2048 < uregs->gpr[1] && !store_update_sp)
goto bad_area;
}
if (expand_stack(vma, address))
goto bad_area;
good_area:
code = SEGV_ACCERR;
#if defined(CONFIG_6xx)
if (error_code & 0x95700000)
/* an error such as lwarx to I/O controller space,
address matching DABR, eciwx, etc. */
goto bad_area;
#endif /* CONFIG_6xx */
#if defined(CONFIG_8xx)
/* 8xx sometimes need to load a invalid/non-present TLBs.
* These must be invalidated separately as linux mm don't.
*/
if (error_code & 0x40000000) /* no translation? */
_tlbil_va(address, 0, 0, 0);
/* The MPC8xx seems to always set 0x80000000, which is
* "undefined". Of those that can be set, this is the only
* one which seems bad.
*/
if (error_code & 0x10000000)
/* Guarded storage error. */
goto bad_area;
#endif /* CONFIG_8xx */
if (is_exec) {
#ifdef CONFIG_PPC_STD_MMU
/* Protection fault on exec go straight to failure on
* Hash based MMUs as they either don't support per-page
* execute permission, or if they do, it's handled already
* at the hash level. This test would probably have to
* be removed if we change the way this works to make hash
* processors use the same I/D cache coherency mechanism
* as embedded.
*/
if (error_code & DSISR_PROTFAULT)
goto bad_area;
#endif /* CONFIG_PPC_STD_MMU */
/*
* Allow execution from readable areas if the MMU does not
* provide separate controls over reading and executing.
*
* Note: That code used to not be enabled for 4xx/BookE.
* It is now as I/D cache coherency for these is done at
* set_pte_at() time and I see no reason why the test
* below wouldn't be valid on those processors. This -may-
* break programs compiled with a really old ABI though.
*/
if (!(vma->vm_flags & VM_EXEC) &&
(cpu_has_feature(CPU_FTR_NOEXECUTE) ||
!(vma->vm_flags & (VM_READ | VM_WRITE))))
goto bad_area;
/* a write */
} else if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
/* a read */
} else {
/* protection fault */
if (error_code & 0x08000000)
goto bad_area;
if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
}
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
fault = handle_mm_fault(mm, vma, address, flags);
if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
if (fault & VM_FAULT_SIGSEGV)
goto bad_area;
rc = mm_fault_error(regs, address, fault);
if (rc >= MM_FAULT_RETURN)
goto bail;
else
rc = 0;
}
/*
* Major/minor page fault accounting is only done on the
* initial attempt. If we go through a retry, it is extremely
* likely that the page will be found in page cache at that point.
*/
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
current->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
regs, address);
#ifdef CONFIG_PPC_SMLPAR
if (firmware_has_feature(FW_FEATURE_CMO)) {
u32 page_ins;
preempt_disable();
page_ins = be32_to_cpu(get_lppaca()->page_ins);
page_ins += 1 << PAGE_FACTOR;
get_lppaca()->page_ins = cpu_to_be32(page_ins);
preempt_enable();
}
#endif /* CONFIG_PPC_SMLPAR */
} else {
current->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
regs, address);
}
if (fault & VM_FAULT_RETRY) {
/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
* of starvation. */
flags &= ~FAULT_FLAG_ALLOW_RETRY;
flags |= FAULT_FLAG_TRIED;
goto retry;
}
}
up_read(&mm->mmap_sem);
goto bail;
bad_area:
up_read(&mm->mmap_sem);
bad_area_nosemaphore:
/* User mode accesses cause a SIGSEGV */
if (user_mode(regs)) {
_exception(SIGSEGV, regs, code, address);
goto bail;
}
if (is_exec && (error_code & DSISR_PROTFAULT))
printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
" page (%lx) - exploit attempt? (uid: %d)\n",
address, from_kuid(&init_user_ns, current_uid()));
rc = SIGSEGV;
bail:
exception_exit(prev_state);
return rc;
}
/*
* bad_page_fault is called when we have a bad access from the kernel.
* It is called from the DSI and ISI handlers in head.S and from some
* of the procedures in traps.c.
*/
void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
{
const struct exception_table_entry *entry;
/* Are we prepared to handle this fault? */
if ((entry = search_exception_tables(regs->nip)) != NULL) {
regs->nip = entry->fixup;
return;
}
/* kernel has accessed a bad area */
switch (regs->trap) {
case 0x300:
case 0x380:
printk(KERN_ALERT "Unable to handle kernel paging request for "
"data at address 0x%08lx\n", regs->dar);
break;
case 0x400:
case 0x480:
printk(KERN_ALERT "Unable to handle kernel paging request for "
"instruction fetch\n");
break;
default:
printk(KERN_ALERT "Unable to handle kernel paging request for "
"unknown fault\n");
break;
}
printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
regs->nip);
if (task_stack_end_corrupted(current))
printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
die("Kernel access of bad area", regs, sig);
}

View file

@ -0,0 +1,318 @@
/*
* Modifications by Kumar Gala (galak@kernel.crashing.org) to support
* E500 Book E processors.
*
* Copyright 2004,2010 Freescale Semiconductor, Inc.
*
* This file contains the routines for initializing the MMU
* on the 4xx series of chips.
* -- paulus
*
* Derived from arch/ppc/mm/init.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/highmem.h>
#include <linux/memblock.h>
#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/setup.h>
#include <asm/paca.h>
#include "mmu_decl.h"
unsigned int tlbcam_index;
#define NUM_TLBCAMS (64)
struct tlbcam TLBCAM[NUM_TLBCAMS];
struct tlbcamrange {
unsigned long start;
unsigned long limit;
phys_addr_t phys;
} tlbcam_addrs[NUM_TLBCAMS];
extern unsigned int tlbcam_index;
unsigned long tlbcam_sz(int idx)
{
return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1;
}
/*
* Return PA for this VA if it is mapped by a CAM, or 0
*/
phys_addr_t v_mapped_by_tlbcam(unsigned long va)
{
int b;
for (b = 0; b < tlbcam_index; ++b)
if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit)
return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start);
return 0;
}
/*
* Return VA for a given PA or 0 if not mapped
*/
unsigned long p_mapped_by_tlbcam(phys_addr_t pa)
{
int b;
for (b = 0; b < tlbcam_index; ++b)
if (pa >= tlbcam_addrs[b].phys
&& pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start)
+tlbcam_addrs[b].phys)
return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys);
return 0;
}
/*
* Set up a variable-size TLB entry (tlbcam). The parameters are not checked;
* in particular size must be a power of 4 between 4k and the max supported by
* an implementation; max may further be limited by what can be represented in
* an unsigned long (for example, 32-bit implementations cannot support a 4GB
* size).
*/
static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
unsigned long size, unsigned long flags, unsigned int pid)
{
unsigned int tsize;
tsize = __ilog2(size) - 10;
#ifdef CONFIG_SMP
if ((flags & _PAGE_NO_CACHE) == 0)
flags |= _PAGE_COHERENT;
#endif
TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1);
TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid);
TLBCAM[index].MAS2 = virt & PAGE_MASK;
TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0;
TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0;
TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0;
TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0;
TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0;
TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR;
TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0);
if (mmu_has_feature(MMU_FTR_BIG_PHYS))
TLBCAM[index].MAS7 = (u64)phys >> 32;
/* Below is unlikely -- only for large user pages or similar */
if (pte_user(flags)) {
TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
}
tlbcam_addrs[index].start = virt;
tlbcam_addrs[index].limit = virt + size - 1;
tlbcam_addrs[index].phys = phys;
loadcam_entry(index);
}
unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
phys_addr_t phys)
{
unsigned int camsize = __ilog2(ram);
unsigned int align = __ffs(virt | phys);
unsigned long max_cam;
if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
/* Convert (4^max) kB to (2^max) bytes */
max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10;
camsize &= ~1U;
align &= ~1U;
} else {
/* Convert (2^max) kB to (2^max) bytes */
max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10;
}
if (camsize > align)
camsize = align;
if (camsize > max_cam)
camsize = max_cam;
return 1UL << camsize;
}
static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
unsigned long ram, int max_cam_idx)
{
int i;
unsigned long amount_mapped = 0;
/* Calculate CAM values */
for (i = 0; ram && i < max_cam_idx; i++) {
unsigned long cam_sz;
cam_sz = calc_cam_sz(ram, virt, phys);
settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0);
ram -= cam_sz;
amount_mapped += cam_sz;
virt += cam_sz;
phys += cam_sz;
}
tlbcam_index = i;
#ifdef CONFIG_PPC64
get_paca()->tcd.esel_next = i;
get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
get_paca()->tcd.esel_first = i;
#endif
return amount_mapped;
}
unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
{
unsigned long virt = PAGE_OFFSET;
phys_addr_t phys = memstart_addr;
return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx);
}
#ifdef CONFIG_PPC32
#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS)
#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS"
#endif
unsigned long __init mmu_mapin_ram(unsigned long top)
{
return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1;
}
/*
* MMU_init_hw does the chip-specific initialization of the MMU hardware.
*/
void __init MMU_init_hw(void)
{
flush_instruction_cache();
}
void __init adjust_total_lowmem(void)
{
unsigned long ram;
int i;
/* adjust lowmem size to __max_low_memory */
ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
i = switch_to_as1();
__max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM);
restore_to_as0(i, 0, 0, 1);
pr_info("Memory CAM mapping: ");
for (i = 0; i < tlbcam_index - 1; i++)
pr_cont("%lu/", tlbcam_sz(i) >> 20);
pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20,
(unsigned int)((total_lowmem - __max_low_memory) >> 20));
memblock_set_current_limit(memstart_addr + __max_low_memory);
}
void setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
phys_addr_t limit = first_memblock_base + first_memblock_size;
/* 64M mapped initially according to head_fsl_booke.S */
memblock_set_current_limit(min_t(u64, limit, 0x04000000));
}
#ifdef CONFIG_RELOCATABLE
int __initdata is_second_reloc;
notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
{
unsigned long base = KERNELBASE;
kernstart_addr = start;
if (is_second_reloc) {
virt_phys_offset = PAGE_OFFSET - memstart_addr;
return;
}
/*
* Relocatable kernel support based on processing of dynamic
* relocation entries. Before we get the real memstart_addr,
* We will compute the virt_phys_offset like this:
* virt_phys_offset = stext.run - kernstart_addr
*
* stext.run = (KERNELBASE & ~0x3ffffff) +
* (kernstart_addr & 0x3ffffff)
* When we relocate, we have :
*
* (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff)
*
* hence:
* virt_phys_offset = (KERNELBASE & ~0x3ffffff) -
* (kernstart_addr & ~0x3ffffff)
*
*/
start &= ~0x3ffffff;
base &= ~0x3ffffff;
virt_phys_offset = base - start;
early_get_first_memblock_info(__va(dt_ptr), NULL);
/*
* We now get the memstart_addr, then we should check if this
* address is the same as what the PAGE_OFFSET map to now. If
* not we have to change the map of PAGE_OFFSET to memstart_addr
* and do a second relocation.
*/
if (start != memstart_addr) {
int n;
long offset = start - memstart_addr;
is_second_reloc = 1;
n = switch_to_as1();
/* map a 64M area for the second relocation */
if (memstart_addr > start)
map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM);
else
map_mem_in_cams_addr(start, PAGE_OFFSET + offset,
0x4000000, CONFIG_LOWMEM_CAM_NUM);
restore_to_as0(n, offset, __va(dt_ptr), 1);
/* We should never reach here */
panic("Relocation error");
}
}
#endif
#endif

235
arch/powerpc/mm/gup.c Normal file
View file

@ -0,0 +1,235 @@
/*
* Lockless get_user_pages_fast for powerpc
*
* Copyright (C) 2008 Nick Piggin
* Copyright (C) 2008 Novell Inc.
*/
#undef DEBUG
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/vmstat.h>
#include <linux/pagemap.h>
#include <linux/rwsem.h>
#include <asm/pgtable.h>
#ifdef __HAVE_ARCH_PTE_SPECIAL
/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
* register pressure.
*/
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask, result;
pte_t *ptep;
result = _PAGE_PRESENT|_PAGE_USER;
if (write)
result |= _PAGE_RW;
mask = result | _PAGE_SPECIAL;
ptep = pte_offset_kernel(&pmd, addr);
do {
pte_t pte = ACCESS_ONCE(*ptep);
struct page *page;
/*
* Similar to the PMD case, NUMA hinting must take slow path
*/
if (pte_numa(pte))
return 0;
if ((pte_val(pte) & mask) != result)
return 0;
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
if (!page_cache_get_speculative(page))
return 0;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
put_page(page);
return 0;
}
pages[*nr] = page;
(*nr)++;
} while (ptep++, addr += PAGE_SIZE, addr != end);
return 1;
}
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
pmdp = pmd_offset(&pud, addr);
do {
pmd_t pmd = ACCESS_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
/*
* If we find a splitting transparent hugepage we
* return zero. That will result in taking the slow
* path which will call wait_split_huge_page()
* if the pmd is still in splitting state
*/
if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
if (pmd_huge(pmd) || pmd_large(pmd)) {
/*
* NUMA hinting faults need to be handled in the GUP
* slowpath for accounting purposes and so that they
* can be serialised against THP migration.
*/
if (pmd_numa(pmd))
return 0;
if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
write, pages, nr))
return 0;
} else if (is_hugepd(pmdp)) {
if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
addr, next, write, pages, nr))
return 0;
} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
return 1;
}
static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
pudp = pud_offset(&pgd, addr);
do {
pud_t pud = ACCESS_ONCE(*pudp);
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
if (pud_huge(pud)) {
if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next,
write, pages, nr))
return 0;
} else if (is_hugepd(pudp)) {
if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
addr, next, write, pages, nr))
return 0;
} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
return 1;
}
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
unsigned long next;
unsigned long flags;
pgd_t *pgdp;
int nr = 0;
pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
start &= PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
start, len)))
return 0;
pr_devel(" aligned: %lx .. %lx\n", start, end);
/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
* important workloads (eg. DB2), and whether limiting the batch size
* will decrease performance.
*
* It seems like we're in the clear for the moment. Direct-IO is
* the main guy that batches up lots of get_user_pages, and even
* they are limited to 64-at-a-time which is not so many.
*/
/*
* This doesn't prevent pagetable teardown, but does prevent
* the pagetables from being freed on powerpc.
*
* So long as we atomically load page table pointers versus teardown,
* we can follow the address down to the the page and take a ref on it.
*/
local_irq_save(flags);
pgdp = pgd_offset(mm, addr);
do {
pgd_t pgd = ACCESS_ONCE(*pgdp);
pr_devel(" %016lx: normal pgd %p\n", addr,
(void *)pgd_val(pgd));
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
break;
if (pgd_huge(pgd)) {
if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next,
write, pages, &nr))
break;
} else if (is_hugepd(pgdp)) {
if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
addr, next, write, pages, &nr))
break;
} else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
break;
} while (pgdp++, addr = next, addr != end);
local_irq_restore(flags);
return nr;
}
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
int nr, ret;
start &= PAGE_MASK;
nr = __get_user_pages_fast(start, nr_pages, write, pages);
ret = nr;
if (nr < nr_pages) {
pr_devel(" slow path ! nr = %d\n", nr);
/* Try to get the remaining pages with get_user_pages */
start += nr << PAGE_SHIFT;
pages += nr;
down_read(&mm->mmap_sem);
ret = get_user_pages(current, mm, start,
nr_pages - nr, write, 0, pages, NULL);
up_read(&mm->mmap_sem);
/* Have to be a bit careful with return values */
if (nr > 0) {
if (ret < 0)
ret = nr;
else
ret += nr;
}
}
return ret;
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */

View file

@ -0,0 +1,712 @@
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
* Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
* Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
* Adapted for Power Macintosh by Paul Mackerras.
* Low-level exception handlers and MMU support
* rewritten by Paul Mackerras.
* Copyright (C) 1996 Paul Mackerras.
*
* This file contains low-level assembler routines for managing
* the PowerPC MMU hash table. (PPC 8xx processors don't use a
* hash table, so this file is not used on them.)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <asm/reg.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/cputable.h>
#include <asm/ppc_asm.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
#ifdef CONFIG_SMP
.section .bss
.align 2
.globl mmu_hash_lock
mmu_hash_lock:
.space 4
#endif /* CONFIG_SMP */
/*
* Load a PTE into the hash table, if possible.
* The address is in r4, and r3 contains an access flag:
* _PAGE_RW (0x400) if a write.
* r9 contains the SRR1 value, from which we use the MSR_PR bit.
* SPRG_THREAD contains the physical address of the current task's thread.
*
* Returns to the caller if the access is illegal or there is no
* mapping for the address. Otherwise it places an appropriate PTE
* in the hash table and returns from the exception.
* Uses r0, r3 - r8, r10, ctr, lr.
*/
.text
_GLOBAL(hash_page)
tophys(r7,0) /* gets -KERNELBASE into r7 */
#ifdef CONFIG_SMP
addis r8,r7,mmu_hash_lock@h
ori r8,r8,mmu_hash_lock@l
lis r0,0x0fff
b 10f
11: lwz r6,0(r8)
cmpwi 0,r6,0
bne 11b
10: lwarx r6,0,r8
cmpwi 0,r6,0
bne- 11b
stwcx. r0,0,r8
bne- 10b
isync
#endif
/* Get PTE (linux-style) and check access */
lis r0,KERNELBASE@h /* check if kernel address */
cmplw 0,r4,r0
mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */
ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
lwz r5,PGDIR(r8) /* virt page-table root */
blt+ 112f /* assume user more likely */
lis r5,swapper_pg_dir@ha /* if kernel address, use */
addi r5,r5,swapper_pg_dir@l /* kernel page table */
rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */
112: add r5,r5,r7 /* convert to phys addr */
#ifndef CONFIG_PTE_64BIT
rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */
lwz r8,0(r5) /* get pmd entry */
rlwinm. r8,r8,0,0,19 /* extract address of pte page */
#else
rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */
lwzx r8,r8,r5 /* Get L1 entry */
rlwinm. r8,r8,0,0,20 /* extract pt base address */
#endif
#ifdef CONFIG_SMP
beq- hash_page_out /* return if no mapping */
#else
/* XXX it seems like the 601 will give a machine fault on the
rfi if its alignment is wrong (bottom 4 bits of address are
8 or 0xc) and we have had a not-taken conditional branch
to the address following the rfi. */
beqlr-
#endif
#ifndef CONFIG_PTE_64BIT
rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */
#else
rlwimi r8,r4,23,20,28 /* compute pte address */
#endif
rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */
ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
/*
* Update the linux PTE atomically. We do the lwarx up-front
* because almost always, there won't be a permission violation
* and there won't already be an HPTE, and thus we will have
* to update the PTE to set _PAGE_HASHPTE. -- paulus.
*
* If PTE_64BIT is set, the low word is the flags word; use that
* word for locking since it contains all the interesting bits.
*/
#if (PTE_FLAGS_OFFSET != 0)
addi r8,r8,PTE_FLAGS_OFFSET
#endif
retry:
lwarx r6,0,r8 /* get linux-style pte, flag word */
andc. r5,r3,r6 /* check access & ~permission */
#ifdef CONFIG_SMP
bne- hash_page_out /* return if access not permitted */
#else
bnelr-
#endif
or r5,r0,r6 /* set accessed/dirty bits */
#ifdef CONFIG_PTE_64BIT
#ifdef CONFIG_SMP
subf r10,r6,r8 /* create false data dependency */
subi r10,r10,PTE_FLAGS_OFFSET
lwzx r10,r6,r10 /* Get upper PTE word */
#else
lwz r10,-PTE_FLAGS_OFFSET(r8)
#endif /* CONFIG_SMP */
#endif /* CONFIG_PTE_64BIT */
stwcx. r5,0,r8 /* attempt to update PTE */
bne- retry /* retry if someone got there first */
mfsrin r3,r4 /* get segment reg for segment */
mfctr r0
stw r0,_CTR(r11)
bl create_hpte /* add the hash table entry */
#ifdef CONFIG_SMP
eieio
addis r8,r7,mmu_hash_lock@ha
li r0,0
stw r0,mmu_hash_lock@l(r8)
#endif
/* Return from the exception */
lwz r5,_CTR(r11)
mtctr r5
lwz r0,GPR0(r11)
lwz r7,GPR7(r11)
lwz r8,GPR8(r11)
b fast_exception_return
#ifdef CONFIG_SMP
hash_page_out:
eieio
addis r8,r7,mmu_hash_lock@ha
li r0,0
stw r0,mmu_hash_lock@l(r8)
blr
#endif /* CONFIG_SMP */
/*
* Add an entry for a particular page to the hash table.
*
* add_hash_page(unsigned context, unsigned long va, unsigned long pmdval)
*
* We assume any necessary modifications to the pte (e.g. setting
* the accessed bit) have already been done and that there is actually
* a hash table in use (i.e. we're not on a 603).
*/
_GLOBAL(add_hash_page)
mflr r0
stw r0,4(r1)
/* Convert context and va to VSID */
mulli r3,r3,897*16 /* multiply context by context skew */
rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */
mulli r0,r0,0x111 /* multiply by ESID skew */
add r3,r3,r0 /* note create_hpte trims to 24 bits */
#ifdef CONFIG_SMP
CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */
lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */
oris r8,r8,12
#endif /* CONFIG_SMP */
/*
* We disable interrupts here, even on UP, because we don't
* want to race with hash_page, and because we want the
* _PAGE_HASHPTE bit to be a reliable indication of whether
* the HPTE exists (or at least whether one did once).
* We also turn off the MMU for data accesses so that we
* we can't take a hash table miss (assuming the code is
* covered by a BAT). -- paulus
*/
mfmsr r9
SYNC
rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear MSR_DR */
mtmsr r0
SYNC_601
isync
tophys(r7,0)
#ifdef CONFIG_SMP
addis r6,r7,mmu_hash_lock@ha
addi r6,r6,mmu_hash_lock@l
10: lwarx r0,0,r6 /* take the mmu_hash_lock */
cmpi 0,r0,0
bne- 11f
stwcx. r8,0,r6
beq+ 12f
11: lwz r0,0(r6)
cmpi 0,r0,0
beq 10b
b 11b
12: isync
#endif
/*
* Fetch the linux pte and test and set _PAGE_HASHPTE atomically.
* If _PAGE_HASHPTE was already set, we don't replace the existing
* HPTE, so we just unlock and return.
*/
mr r8,r5
#ifndef CONFIG_PTE_64BIT
rlwimi r8,r4,22,20,29
#else
rlwimi r8,r4,23,20,28
addi r8,r8,PTE_FLAGS_OFFSET
#endif
1: lwarx r6,0,r8
andi. r0,r6,_PAGE_HASHPTE
bne 9f /* if HASHPTE already set, done */
#ifdef CONFIG_PTE_64BIT
#ifdef CONFIG_SMP
subf r10,r6,r8 /* create false data dependency */
subi r10,r10,PTE_FLAGS_OFFSET
lwzx r10,r6,r10 /* Get upper PTE word */
#else
lwz r10,-PTE_FLAGS_OFFSET(r8)
#endif /* CONFIG_SMP */
#endif /* CONFIG_PTE_64BIT */
ori r5,r6,_PAGE_HASHPTE
stwcx. r5,0,r8
bne- 1b
bl create_hpte
9:
#ifdef CONFIG_SMP
addis r6,r7,mmu_hash_lock@ha
addi r6,r6,mmu_hash_lock@l
eieio
li r0,0
stw r0,0(r6) /* clear mmu_hash_lock */
#endif
/* reenable interrupts and DR */
mtmsr r9
SYNC_601
isync
lwz r0,4(r1)
mtlr r0
blr
/*
* This routine adds a hardware PTE to the hash table.
* It is designed to be called with the MMU either on or off.
* r3 contains the VSID, r4 contains the virtual address,
* r5 contains the linux PTE, r6 contains the old value of the
* linux PTE (before setting _PAGE_HASHPTE) and r7 contains the
* offset to be added to addresses (0 if the MMU is on,
* -KERNELBASE if it is off). r10 contains the upper half of
* the PTE if CONFIG_PTE_64BIT.
* On SMP, the caller should have the mmu_hash_lock held.
* We assume that the caller has (or will) set the _PAGE_HASHPTE
* bit in the linux PTE in memory. The value passed in r6 should
* be the old linux PTE value; if it doesn't have _PAGE_HASHPTE set
* this routine will skip the search for an existing HPTE.
* This procedure modifies r0, r3 - r6, r8, cr0.
* -- paulus.
*
* For speed, 4 of the instructions get patched once the size and
* physical address of the hash table are known. These definitions
* of Hash_base and Hash_bits below are just an example.
*/
Hash_base = 0xc0180000
Hash_bits = 12 /* e.g. 256kB hash table */
Hash_msk = (((1 << Hash_bits) - 1) * 64)
/* defines for the PTE format for 32-bit PPCs */
#define HPTE_SIZE 8
#define PTEG_SIZE 64
#define LG_PTEG_SIZE 6
#define LDPTEu lwzu
#define LDPTE lwz
#define STPTE stw
#define CMPPTE cmpw
#define PTE_H 0x40
#define PTE_V 0x80000000
#define TST_V(r) rlwinm. r,r,0,0,0
#define SET_V(r) oris r,r,PTE_V@h
#define CLR_V(r,t) rlwinm r,r,0,1,31
#define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1)
#define HASH_RIGHT 31-LG_PTEG_SIZE
_GLOBAL(create_hpte)
/* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */
rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */
rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */
and r8,r8,r0 /* writable if _RW & _DIRTY */
rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */
rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */
ori r8,r8,0xe04 /* clear out reserved bits */
andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */
BEGIN_FTR_SECTION
rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */
END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
#ifdef CONFIG_PTE_64BIT
/* Put the XPN bits into the PTE */
rlwimi r8,r10,8,20,22
rlwimi r8,r10,2,29,29
#endif
/* Construct the high word of the PPC-style PTE (r5) */
rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */
rlwimi r5,r4,10,26,31 /* put in API (abbrev page index) */
SET_V(r5) /* set V (valid) bit */
/* Get the address of the primary PTE group in the hash table (r3) */
_GLOBAL(hash_page_patch_A)
addis r0,r7,Hash_base@h /* base address of hash table */
rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
xor r3,r3,r0 /* make primary hash */
li r0,8 /* PTEs/group */
/*
* Test the _PAGE_HASHPTE bit in the old linux PTE, and skip the search
* if it is clear, meaning that the HPTE isn't there already...
*/
andi. r6,r6,_PAGE_HASHPTE
beq+ 10f /* no PTE: go look for an empty slot */
tlbie r4
addis r4,r7,htab_hash_searches@ha
lwz r6,htab_hash_searches@l(r4)
addi r6,r6,1 /* count how many searches we do */
stw r6,htab_hash_searches@l(r4)
/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
mtctr r0
addi r4,r3,-HPTE_SIZE
1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */
CMPPTE 0,r6,r5
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
beq+ found_slot
/* Search the secondary PTEG for a matching PTE */
ori r5,r5,PTE_H /* set H (secondary hash) bit */
_GLOBAL(hash_page_patch_B)
xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
xori r4,r4,(-PTEG_SIZE & 0xffff)
addi r4,r4,-HPTE_SIZE
mtctr r0
2: LDPTEu r6,HPTE_SIZE(r4)
CMPPTE 0,r6,r5
bdnzf 2,2b
beq+ found_slot
xori r5,r5,PTE_H /* clear H bit again */
/* Search the primary PTEG for an empty slot */
10: mtctr r0
addi r4,r3,-HPTE_SIZE /* search primary PTEG */
1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */
TST_V(r6) /* test valid bit */
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
beq+ found_empty
/* update counter of times that the primary PTEG is full */
addis r4,r7,primary_pteg_full@ha
lwz r6,primary_pteg_full@l(r4)
addi r6,r6,1
stw r6,primary_pteg_full@l(r4)
/* Search the secondary PTEG for an empty slot */
ori r5,r5,PTE_H /* set H (secondary hash) bit */
_GLOBAL(hash_page_patch_C)
xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
xori r4,r4,(-PTEG_SIZE & 0xffff)
addi r4,r4,-HPTE_SIZE
mtctr r0
2: LDPTEu r6,HPTE_SIZE(r4)
TST_V(r6)
bdnzf 2,2b
beq+ found_empty
xori r5,r5,PTE_H /* clear H bit again */
/*
* Choose an arbitrary slot in the primary PTEG to overwrite.
* Since both the primary and secondary PTEGs are full, and we
* have no information that the PTEs in the primary PTEG are
* more important or useful than those in the secondary PTEG,
* and we know there is a definite (although small) speed
* advantage to putting the PTE in the primary PTEG, we always
* put the PTE in the primary PTEG.
*
* In addition, we skip any slot that is mapping kernel text in
* order to avoid a deadlock when not using BAT mappings if
* trying to hash in the kernel hash code itself after it has
* already taken the hash table lock. This works in conjunction
* with pre-faulting of the kernel text.
*
* If the hash table bucket is full of kernel text entries, we'll
* lockup here but that shouldn't happen
*/
1: addis r4,r7,next_slot@ha /* get next evict slot */
lwz r6,next_slot@l(r4)
addi r6,r6,HPTE_SIZE /* search for candidate */
andi. r6,r6,7*HPTE_SIZE
stw r6,next_slot@l(r4)
add r4,r3,r6
LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */
clrrwi r0,r0,12
lis r6,etext@h
ori r6,r6,etext@l /* get etext */
tophys(r6,r6)
cmpl cr0,r0,r6 /* compare and try again */
blt 1b
#ifndef CONFIG_SMP
/* Store PTE in PTEG */
found_empty:
STPTE r5,0(r4)
found_slot:
STPTE r8,HPTE_SIZE/2(r4)
#else /* CONFIG_SMP */
/*
* Between the tlbie above and updating the hash table entry below,
* another CPU could read the hash table entry and put it in its TLB.
* There are 3 cases:
* 1. using an empty slot
* 2. updating an earlier entry to change permissions (i.e. enable write)
* 3. taking over the PTE for an unrelated address
*
* In each case it doesn't really matter if the other CPUs have the old
* PTE in their TLB. So we don't need to bother with another tlbie here,
* which is convenient as we've overwritten the register that had the
* address. :-) The tlbie above is mainly to make sure that this CPU comes
* and gets the new PTE from the hash table.
*
* We do however have to make sure that the PTE is never in an invalid
* state with the V bit set.
*/
found_empty:
found_slot:
CLR_V(r5,r0) /* clear V (valid) bit in PTE */
STPTE r5,0(r4)
sync
TLBSYNC
STPTE r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */
sync
SET_V(r5)
STPTE r5,0(r4) /* finally set V bit in PTE */
#endif /* CONFIG_SMP */
sync /* make sure pte updates get to memory */
blr
.section .bss
.align 2
next_slot:
.space 4
primary_pteg_full:
.space 4
htab_hash_searches:
.space 4
.previous
/*
* Flush the entry for a particular page from the hash table.
*
* flush_hash_pages(unsigned context, unsigned long va, unsigned long pmdval,
* int count)
*
* We assume that there is a hash table in use (Hash != 0).
*/
_GLOBAL(flush_hash_pages)
tophys(r7,0)
/*
* We disable interrupts here, even on UP, because we want
* the _PAGE_HASHPTE bit to be a reliable indication of
* whether the HPTE exists (or at least whether one did once).
* We also turn off the MMU for data accesses so that we
* we can't take a hash table miss (assuming the code is
* covered by a BAT). -- paulus
*/
mfmsr r10
SYNC
rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear MSR_DR */
mtmsr r0
SYNC_601
isync
/* First find a PTE in the range that has _PAGE_HASHPTE set */
#ifndef CONFIG_PTE_64BIT
rlwimi r5,r4,22,20,29
#else
rlwimi r5,r4,23,20,28
#endif
1: lwz r0,PTE_FLAGS_OFFSET(r5)
cmpwi cr1,r6,1
andi. r0,r0,_PAGE_HASHPTE
bne 2f
ble cr1,19f
addi r4,r4,0x1000
addi r5,r5,PTE_SIZE
addi r6,r6,-1
b 1b
/* Convert context and va to VSID */
2: mulli r3,r3,897*16 /* multiply context by context skew */
rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */
mulli r0,r0,0x111 /* multiply by ESID skew */
add r3,r3,r0 /* note code below trims to 24 bits */
/* Construct the high word of the PPC-style PTE (r11) */
rlwinm r11,r3,7,1,24 /* put VSID in 0x7fffff80 bits */
rlwimi r11,r4,10,26,31 /* put in API (abbrev page index) */
SET_V(r11) /* set V (valid) bit */
#ifdef CONFIG_SMP
addis r9,r7,mmu_hash_lock@ha
addi r9,r9,mmu_hash_lock@l
CURRENT_THREAD_INFO(r8, r1)
add r8,r8,r7
lwz r8,TI_CPU(r8)
oris r8,r8,9
10: lwarx r0,0,r9
cmpi 0,r0,0
bne- 11f
stwcx. r8,0,r9
beq+ 12f
11: lwz r0,0(r9)
cmpi 0,r0,0
beq 10b
b 11b
12: isync
#endif
/*
* Check the _PAGE_HASHPTE bit in the linux PTE. If it is
* already clear, we're done (for this pte). If not,
* clear it (atomically) and proceed. -- paulus.
*/
#if (PTE_FLAGS_OFFSET != 0)
addi r5,r5,PTE_FLAGS_OFFSET
#endif
33: lwarx r8,0,r5 /* fetch the pte flags word */
andi. r0,r8,_PAGE_HASHPTE
beq 8f /* done if HASHPTE is already clear */
rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */
stwcx. r8,0,r5 /* update the pte */
bne- 33b
/* Get the address of the primary PTE group in the hash table (r3) */
_GLOBAL(flush_hash_patch_A)
addis r8,r7,Hash_base@h /* base address of hash table */
rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
xor r8,r0,r8 /* make primary hash */
/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
li r0,8 /* PTEs/group */
mtctr r0
addi r12,r8,-HPTE_SIZE
1: LDPTEu r0,HPTE_SIZE(r12) /* get next PTE */
CMPPTE 0,r0,r11
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
beq+ 3f
/* Search the secondary PTEG for a matching PTE */
ori r11,r11,PTE_H /* set H (secondary hash) bit */
li r0,8 /* PTEs/group */
_GLOBAL(flush_hash_patch_B)
xoris r12,r8,Hash_msk>>16 /* compute secondary hash */
xori r12,r12,(-PTEG_SIZE & 0xffff)
addi r12,r12,-HPTE_SIZE
mtctr r0
2: LDPTEu r0,HPTE_SIZE(r12)
CMPPTE 0,r0,r11
bdnzf 2,2b
xori r11,r11,PTE_H /* clear H again */
bne- 4f /* should rarely fail to find it */
3: li r0,0
STPTE r0,0(r12) /* invalidate entry */
4: sync
tlbie r4 /* in hw tlb too */
sync
8: ble cr1,9f /* if all ptes checked */
81: addi r6,r6,-1
addi r5,r5,PTE_SIZE
addi r4,r4,0x1000
lwz r0,0(r5) /* check next pte */
cmpwi cr1,r6,1
andi. r0,r0,_PAGE_HASHPTE
bne 33b
bgt cr1,81b
9:
#ifdef CONFIG_SMP
TLBSYNC
li r0,0
stw r0,0(r9) /* clear mmu_hash_lock */
#endif
19: mtmsr r10
SYNC_601
isync
blr
/*
* Flush an entry from the TLB
*/
_GLOBAL(_tlbie)
#ifdef CONFIG_SMP
CURRENT_THREAD_INFO(r8, r1)
lwz r8,TI_CPU(r8)
oris r8,r8,11
mfmsr r10
SYNC
rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear DR */
mtmsr r0
SYNC_601
isync
lis r9,mmu_hash_lock@h
ori r9,r9,mmu_hash_lock@l
tophys(r9,r9)
10: lwarx r7,0,r9
cmpwi 0,r7,0
bne- 10b
stwcx. r8,0,r9
bne- 10b
eieio
tlbie r3
sync
TLBSYNC
li r0,0
stw r0,0(r9) /* clear mmu_hash_lock */
mtmsr r10
SYNC_601
isync
#else /* CONFIG_SMP */
tlbie r3
sync
#endif /* CONFIG_SMP */
blr
/*
* Flush the entire TLB. 603/603e only
*/
_GLOBAL(_tlbia)
#if defined(CONFIG_SMP)
CURRENT_THREAD_INFO(r8, r1)
lwz r8,TI_CPU(r8)
oris r8,r8,10
mfmsr r10
SYNC
rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear DR */
mtmsr r0
SYNC_601
isync
lis r9,mmu_hash_lock@h
ori r9,r9,mmu_hash_lock@l
tophys(r9,r9)
10: lwarx r7,0,r9
cmpwi 0,r7,0
bne- 10b
stwcx. r8,0,r9
bne- 10b
sync
tlbia
sync
TLBSYNC
li r0,0
stw r0,0(r9) /* clear mmu_hash_lock */
mtmsr r10
SYNC_601
isync
#else /* CONFIG_SMP */
sync
tlbia
sync
#endif /* CONFIG_SMP */
blr

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,710 @@
/*
* native hashtable management.
*
* SMP scalability work:
* Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#undef DEBUG_LOW
#include <linux/spinlock.h>
#include <linux/bitops.h>
#include <linux/of.h>
#include <linux/threads.h>
#include <linux/smp.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/cputable.h>
#include <asm/udbg.h>
#include <asm/kexec.h>
#include <asm/ppc-opcode.h>
#include <misc/cxl.h>
#ifdef DEBUG_LOW
#define DBG_LOW(fmt...) udbg_printf(fmt)
#else
#define DBG_LOW(fmt...)
#endif
#ifdef __BIG_ENDIAN__
#define HPTE_LOCK_BIT 3
#else
#define HPTE_LOCK_BIT (56+3)
#endif
DEFINE_RAW_SPINLOCK(native_tlbie_lock);
static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
{
unsigned long va;
unsigned int penc;
unsigned long sllp;
/*
* We need 14 to 65 bits of va for a tlibe of 4K page
* With vpn we ignore the lower VPN_SHIFT bits already.
* And top two bits are already ignored because we can
* only accomadate 76 bits in a 64 bit vpn with a VPN_SHIFT
* of 12.
*/
va = vpn << VPN_SHIFT;
/*
* clear top 16 bits of 64bit va, non SLS segment
* Older versions of the architecture (2.02 and earler) require the
* masking of the top 16 bits.
*/
va &= ~(0xffffULL << 48);
switch (psize) {
case MMU_PAGE_4K:
/* clear out bits after (52) [0....52.....63] */
va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
va |= sllp << 5;
asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
: "memory");
break;
default:
/* We need 14 to 14 + i bits of va */
penc = mmu_psize_defs[psize].penc[apsize];
va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
/*
* AVAL bits:
* We don't need all the bits, but rest of the bits
* must be ignored by the processor.
* vpn cover upto 65 bits of va. (0...65) and we need
* 58..64 bits of va.
*/
va |= (vpn & 0xfe); /* AVAL */
va |= 1; /* L */
asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
: "memory");
break;
}
}
static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
{
unsigned long va;
unsigned int penc;
unsigned long sllp;
/* VPN_SHIFT can be atmost 12 */
va = vpn << VPN_SHIFT;
/*
* clear top 16 bits of 64 bit va, non SLS segment
* Older versions of the architecture (2.02 and earler) require the
* masking of the top 16 bits.
*/
va &= ~(0xffffULL << 48);
switch (psize) {
case MMU_PAGE_4K:
/* clear out bits after(52) [0....52.....63] */
va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
va |= sllp << 5;
asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
: : "r"(va) : "memory");
break;
default:
/* We need 14 to 14 + i bits of va */
penc = mmu_psize_defs[psize].penc[apsize];
va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
/*
* AVAL bits:
* We don't need all the bits, but rest of the bits
* must be ignored by the processor.
* vpn cover upto 65 bits of va. (0...65) and we need
* 58..64 bits of va.
*/
va |= (vpn & 0xfe);
va |= 1; /* L */
asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"
: : "r"(va) : "memory");
break;
}
}
static inline void tlbie(unsigned long vpn, int psize, int apsize,
int ssize, int local)
{
unsigned int use_local;
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use();
if (use_local)
use_local = mmu_psize_defs[psize].tlbiel;
if (lock_tlbie && !use_local)
raw_spin_lock(&native_tlbie_lock);
asm volatile("ptesync": : :"memory");
if (use_local) {
__tlbiel(vpn, psize, apsize, ssize);
asm volatile("ptesync": : :"memory");
} else {
__tlbie(vpn, psize, apsize, ssize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
if (lock_tlbie && !use_local)
raw_spin_unlock(&native_tlbie_lock);
}
static inline void native_lock_hpte(struct hash_pte *hptep)
{
unsigned long *word = (unsigned long *)&hptep->v;
while (1) {
if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
break;
while(test_bit(HPTE_LOCK_BIT, word))
cpu_relax();
}
}
static inline void native_unlock_hpte(struct hash_pte *hptep)
{
unsigned long *word = (unsigned long *)&hptep->v;
clear_bit_unlock(HPTE_LOCK_BIT, word);
}
static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
unsigned long pa, unsigned long rflags,
unsigned long vflags, int psize, int apsize, int ssize)
{
struct hash_pte *hptep = htab_address + hpte_group;
unsigned long hpte_v, hpte_r;
int i;
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx,"
" rflags=%lx, vflags=%lx, psize=%d)\n",
hpte_group, vpn, pa, rflags, vflags, psize);
}
for (i = 0; i < HPTES_PER_GROUP; i++) {
if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
/* retry with lock held */
native_lock_hpte(hptep);
if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
break;
native_unlock_hpte(hptep);
}
hptep++;
}
if (i == HPTES_PER_GROUP)
return -1;
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
i, hpte_v, hpte_r);
}
hptep->r = cpu_to_be64(hpte_r);
/* Guarantee the second dword is visible before the valid bit */
eieio();
/*
* Now set the first dword including the valid bit
* NOTE: this also unlocks the hpte
*/
hptep->v = cpu_to_be64(hpte_v);
__asm__ __volatile__ ("ptesync" : : : "memory");
return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
}
static long native_hpte_remove(unsigned long hpte_group)
{
struct hash_pte *hptep;
int i;
int slot_offset;
unsigned long hpte_v;
DBG_LOW(" remove(group=%lx)\n", hpte_group);
/* pick a random entry to start at */
slot_offset = mftb() & 0x7;
for (i = 0; i < HPTES_PER_GROUP; i++) {
hptep = htab_address + hpte_group + slot_offset;
hpte_v = be64_to_cpu(hptep->v);
if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
/* retry with lock held */
native_lock_hpte(hptep);
hpte_v = be64_to_cpu(hptep->v);
if ((hpte_v & HPTE_V_VALID)
&& !(hpte_v & HPTE_V_BOLTED))
break;
native_unlock_hpte(hptep);
}
slot_offset++;
slot_offset &= 0x7;
}
if (i == HPTES_PER_GROUP)
return -1;
/* Invalidate the hpte. NOTE: this also unlocks it */
hptep->v = 0;
return i;
}
static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
unsigned long vpn, int bpsize,
int apsize, int ssize, int local)
{
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v, want_v;
int ret = 0;
want_v = hpte_encode_avpn(vpn, bpsize, ssize);
DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
vpn, want_v & HPTE_V_AVPN, slot, newpp);
native_lock_hpte(hptep);
hpte_v = be64_to_cpu(hptep->v);
/*
* We need to invalidate the TLB always because hpte_remove doesn't do
* a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
* random entry from it. When we do that we don't invalidate the TLB
* (hpte_remove) because we assume the old translation is still
* technically "valid".
*/
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
DBG_LOW(" -> miss\n");
ret = -1;
} else {
DBG_LOW(" -> hit\n");
/* Update the HPTE */
hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & ~(HPTE_R_PP | HPTE_R_N)) |
(newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)));
}
native_unlock_hpte(hptep);
/* Ensure it is out of the tlb too. */
tlbie(vpn, bpsize, apsize, ssize, local);
return ret;
}
static long native_hpte_find(unsigned long vpn, int psize, int ssize)
{
struct hash_pte *hptep;
unsigned long hash;
unsigned long i;
long slot;
unsigned long want_v, hpte_v;
hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
want_v = hpte_encode_avpn(vpn, psize, ssize);
/* Bolted mappings are only ever in the primary group */
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
for (i = 0; i < HPTES_PER_GROUP; i++) {
hptep = htab_address + slot;
hpte_v = be64_to_cpu(hptep->v);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
/* HPTE matches */
return slot;
++slot;
}
return -1;
}
/*
* Update the page protection bits. Intended to be used to create
* guard pages for kernel data structures on pages which are bolted
* in the HPT. Assumes pages being operated on will not be stolen.
*
* No need to lock here because we should be the only user.
*/
static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
int psize, int ssize)
{
unsigned long vpn;
unsigned long vsid;
long slot;
struct hash_pte *hptep;
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
slot = native_hpte_find(vpn, psize, ssize);
if (slot == -1)
panic("could not find page to bolt\n");
hptep = htab_address + slot;
/* Update the HPTE */
hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
~(HPTE_R_PP | HPTE_R_N)) |
(newpp & (HPTE_R_PP | HPTE_R_N)));
/*
* Ensure it is out of the tlb too. Bolted entries base and
* actual page size will be same.
*/
tlbie(vpn, psize, psize, ssize, 0);
}
static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
int bpsize, int apsize, int ssize, int local)
{
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v;
unsigned long want_v;
unsigned long flags;
local_irq_save(flags);
DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
want_v = hpte_encode_avpn(vpn, bpsize, ssize);
native_lock_hpte(hptep);
hpte_v = be64_to_cpu(hptep->v);
/*
* We need to invalidate the TLB always because hpte_remove doesn't do
* a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
* random entry from it. When we do that we don't invalidate the TLB
* (hpte_remove) because we assume the old translation is still
* technically "valid".
*/
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
else
/* Invalidate the hpte. NOTE: this also unlocks it */
hptep->v = 0;
/* Invalidate the TLB */
tlbie(vpn, bpsize, apsize, ssize, local);
local_irq_restore(flags);
}
static void native_hugepage_invalidate(unsigned long vsid,
unsigned long addr,
unsigned char *hpte_slot_array,
int psize, int ssize)
{
int i;
struct hash_pte *hptep;
int actual_psize = MMU_PAGE_16M;
unsigned int max_hpte_count, valid;
unsigned long flags, s_addr = addr;
unsigned long hpte_v, want_v, shift;
unsigned long hidx, vpn = 0, hash, slot;
shift = mmu_psize_defs[psize].shift;
max_hpte_count = 1U << (PMD_SHIFT - shift);
local_irq_save(flags);
for (i = 0; i < max_hpte_count; i++) {
valid = hpte_valid(hpte_slot_array, i);
if (!valid)
continue;
hidx = hpte_hash_index(hpte_slot_array, i);
/* get the vpn */
addr = s_addr + (i * (1ul << shift));
vpn = hpt_vpn(addr, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
hptep = htab_address + slot;
want_v = hpte_encode_avpn(vpn, psize, ssize);
native_lock_hpte(hptep);
hpte_v = be64_to_cpu(hptep->v);
/* Even if we miss, we need to invalidate the TLB */
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
else
/* Invalidate the hpte. NOTE: this also unlocks it */
hptep->v = 0;
/*
* We need to do tlb invalidate for all the address, tlbie
* instruction compares entry_VA in tlb with the VA specified
* here
*/
tlbie(vpn, psize, actual_psize, ssize, 0);
}
local_irq_restore(flags);
}
static inline int __hpte_actual_psize(unsigned int lp, int psize)
{
int i, shift;
unsigned int mask;
/* start from 1 ignoring MMU_PAGE_4K */
for (i = 1; i < MMU_PAGE_COUNT; i++) {
/* invalid penc */
if (mmu_psize_defs[psize].penc[i] == -1)
continue;
/*
* encoding bits per actual page size
* PTE LP actual page size
* rrrr rrrz >=8KB
* rrrr rrzz >=16KB
* rrrr rzzz >=32KB
* rrrr zzzz >=64KB
* .......
*/
shift = mmu_psize_defs[i].shift - LP_SHIFT;
if (shift > LP_BITS)
shift = LP_BITS;
mask = (1 << shift) - 1;
if ((lp & mask) == mmu_psize_defs[psize].penc[i])
return i;
}
return -1;
}
static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
int *psize, int *apsize, int *ssize, unsigned long *vpn)
{
unsigned long avpn, pteg, vpi;
unsigned long hpte_v = be64_to_cpu(hpte->v);
unsigned long hpte_r = be64_to_cpu(hpte->r);
unsigned long vsid, seg_off;
int size, a_size, shift;
/* Look at the 8 bit LP value */
unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
if (!(hpte_v & HPTE_V_LARGE)) {
size = MMU_PAGE_4K;
a_size = MMU_PAGE_4K;
} else {
for (size = 0; size < MMU_PAGE_COUNT; size++) {
/* valid entries have a shift value */
if (!mmu_psize_defs[size].shift)
continue;
a_size = __hpte_actual_psize(lp, size);
if (a_size != -1)
break;
}
}
/* This works for all page sizes, and for 256M and 1T segments */
*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
shift = mmu_psize_defs[size].shift;
avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm);
pteg = slot / HPTES_PER_GROUP;
if (hpte_v & HPTE_V_SECONDARY)
pteg = ~pteg;
switch (*ssize) {
case MMU_SEGSIZE_256M:
/* We only have 28 - 23 bits of seg_off in avpn */
seg_off = (avpn & 0x1f) << 23;
vsid = avpn >> 5;
/* We can find more bits from the pteg value */
if (shift < 23) {
vpi = (vsid ^ pteg) & htab_hash_mask;
seg_off |= vpi << shift;
}
*vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
break;
case MMU_SEGSIZE_1T:
/* We only have 40 - 23 bits of seg_off in avpn */
seg_off = (avpn & 0x1ffff) << 23;
vsid = avpn >> 17;
if (shift < 23) {
vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask;
seg_off |= vpi << shift;
}
*vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
break;
default:
*vpn = size = 0;
}
*psize = size;
*apsize = a_size;
}
/*
* clear all mappings on kexec. All cpus are in real mode (or they will
* be when they isi), and we are the only one left. We rely on our kernel
* mapping being 0xC0's and the hardware ignoring those two real bits.
*
* TODO: add batching support when enabled. remember, no dynamic memory here,
* athough there is the control page available...
*/
static void native_hpte_clear(void)
{
unsigned long vpn = 0;
unsigned long slot, slots, flags;
struct hash_pte *hptep = htab_address;
unsigned long hpte_v;
unsigned long pteg_count;
int psize, apsize, ssize;
pteg_count = htab_hash_mask + 1;
local_irq_save(flags);
/* we take the tlbie lock and hold it. Some hardware will
* deadlock if we try to tlbie from two processors at once.
*/
raw_spin_lock(&native_tlbie_lock);
slots = pteg_count * HPTES_PER_GROUP;
for (slot = 0; slot < slots; slot++, hptep++) {
/*
* we could lock the pte here, but we are the only cpu
* running, right? and for crash dump, we probably
* don't want to wait for a maybe bad cpu.
*/
hpte_v = be64_to_cpu(hptep->v);
/*
* Call __tlbie() here rather than tlbie() since we
* already hold the native_tlbie_lock.
*/
if (hpte_v & HPTE_V_VALID) {
hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
hptep->v = 0;
__tlbie(vpn, psize, apsize, ssize);
}
}
asm volatile("eieio; tlbsync; ptesync":::"memory");
raw_spin_unlock(&native_tlbie_lock);
local_irq_restore(flags);
}
/*
* Batched hash table flush, we batch the tlbie's to avoid taking/releasing
* the lock all the time
*/
static void native_flush_hash_range(unsigned long number, int local)
{
unsigned long vpn;
unsigned long hash, index, hidx, shift, slot;
struct hash_pte *hptep;
unsigned long hpte_v;
unsigned long want_v;
unsigned long flags;
real_pte_t pte;
struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
unsigned long psize = batch->psize;
int ssize = batch->ssize;
int i;
local_irq_save(flags);
for (i = 0; i < number; i++) {
vpn = batch->vpn[i];
pte = batch->pte[i];
pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
hash = hpt_hash(vpn, shift, ssize);
hidx = __rpte_to_hidx(pte, index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
hptep = htab_address + slot;
want_v = hpte_encode_avpn(vpn, psize, ssize);
native_lock_hpte(hptep);
hpte_v = be64_to_cpu(hptep->v);
if (!HPTE_V_COMPARE(hpte_v, want_v) ||
!(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
else
hptep->v = 0;
} pte_iterate_hashed_end();
}
if (mmu_has_feature(MMU_FTR_TLBIEL) &&
mmu_psize_defs[psize].tlbiel && local) {
asm volatile("ptesync":::"memory");
for (i = 0; i < number; i++) {
vpn = batch->vpn[i];
pte = batch->pte[i];
pte_iterate_hashed_subpages(pte, psize,
vpn, index, shift) {
__tlbiel(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
asm volatile("ptesync":::"memory");
} else {
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
if (lock_tlbie)
raw_spin_lock(&native_tlbie_lock);
asm volatile("ptesync":::"memory");
for (i = 0; i < number; i++) {
vpn = batch->vpn[i];
pte = batch->pte[i];
pte_iterate_hashed_subpages(pte, psize,
vpn, index, shift) {
__tlbie(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
asm volatile("eieio; tlbsync; ptesync":::"memory");
if (lock_tlbie)
raw_spin_unlock(&native_tlbie_lock);
}
local_irq_restore(flags);
}
void __init hpte_init_native(void)
{
ppc_md.hpte_invalidate = native_hpte_invalidate;
ppc_md.hpte_updatepp = native_hpte_updatepp;
ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp;
ppc_md.hpte_insert = native_hpte_insert;
ppc_md.hpte_remove = native_hpte_remove;
ppc_md.hpte_clear_all = native_hpte_clear;
ppc_md.flush_hash_range = native_flush_hash_range;
ppc_md.hugepage_invalidate = native_hugepage_invalidate;
}

File diff suppressed because it is too large Load diff

86
arch/powerpc/mm/highmem.c Normal file
View file

@ -0,0 +1,86 @@
/*
* highmem.c: virtual kernel memory mappings for high memory
*
* PowerPC version, stolen from the i386 version.
*
* Used in CONFIG_HIGHMEM systems for memory pages which
* are not addressable by direct kernel virtual addresses.
*
* Copyright (C) 1999 Gerhard Wichert, Siemens AG
* Gerhard.Wichert@pdb.siemens.de
*
*
* Redesigned the x86 32-bit VM architecture to deal with
* up to 16 Terrabyte physical memory. With current x86 CPUs
* we now support up to 64 Gigabytes physical RAM.
*
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
*
* Reworked for PowerPC by various contributors. Moved from
* highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp.
*/
#include <linux/highmem.h>
#include <linux/module.h>
/*
* The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
* gives a more generic (and caching) interface. But kmap_atomic can
* be used in IRQ contexts, so in some (very limited) cases we need
* it.
*/
void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
pagefault_disable();
if (!PageHighMem(page))
return page_address(page);
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
#ifdef CONFIG_DEBUG_HIGHMEM
BUG_ON(!pte_none(*(kmap_pte-idx)));
#endif
__set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1);
local_flush_tlb_page(NULL, vaddr);
return (void*) vaddr;
}
EXPORT_SYMBOL(kmap_atomic_prot);
void __kunmap_atomic(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
int type;
if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
pagefault_enable();
return;
}
type = kmap_atomic_idx();
#ifdef CONFIG_DEBUG_HIGHMEM
{
unsigned int idx;
idx = type + KM_TYPE_NR * smp_processor_id();
BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
/*
* force other mappings to Oops if they'll try to access
* this pte without first remap it
*/
pte_clear(&init_mm, vaddr, kmap_pte-idx);
local_flush_tlb_page(NULL, vaddr);
}
#endif
kmap_atomic_idx_pop();
pagefault_enable();
}
EXPORT_SYMBOL(__kunmap_atomic);

View file

@ -0,0 +1,245 @@
/*
* Copyright IBM Corporation, 2013
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
*/
/*
* PPC64 THP Support for hash based MMUs
*/
#include <linux/mm.h>
#include <asm/machdep.h>
static void invalidate_old_hpte(unsigned long vsid, unsigned long addr,
pmd_t *pmdp, unsigned int psize, int ssize)
{
int i, max_hpte_count, valid;
unsigned long s_addr;
unsigned char *hpte_slot_array;
unsigned long hidx, shift, vpn, hash, slot;
s_addr = addr & HPAGE_PMD_MASK;
hpte_slot_array = get_hpte_slot_array(pmdp);
/*
* IF we try to do a HUGE PTE update after a withdraw is done.
* we will find the below NULL. This happens when we do
* split_huge_page_pmd
*/
if (!hpte_slot_array)
return;
if (ppc_md.hugepage_invalidate)
return ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
psize, ssize);
/*
* No bluk hpte removal support, invalidate each entry
*/
shift = mmu_psize_defs[psize].shift;
max_hpte_count = HPAGE_PMD_SIZE >> shift;
for (i = 0; i < max_hpte_count; i++) {
/*
* 8 bits per each hpte entries
* 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
*/
valid = hpte_valid(hpte_slot_array, i);
if (!valid)
continue;
hidx = hpte_hash_index(hpte_slot_array, i);
/* get the vpn */
addr = s_addr + (i * (1ul << shift));
vpn = hpt_vpn(addr, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
ppc_md.hpte_invalidate(slot, vpn, psize,
MMU_PAGE_16M, ssize, 0);
}
}
int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
pmd_t *pmdp, unsigned long trap, int local, int ssize,
unsigned int psize)
{
unsigned int index, valid;
unsigned char *hpte_slot_array;
unsigned long rflags, pa, hidx;
unsigned long old_pmd, new_pmd;
int ret, lpsize = MMU_PAGE_16M;
unsigned long vpn, hash, shift, slot;
/*
* atomically mark the linux large page PMD busy and dirty
*/
do {
pmd_t pmd = ACCESS_ONCE(*pmdp);
old_pmd = pmd_val(pmd);
/* If PMD busy, retry the access */
if (unlikely(old_pmd & _PAGE_BUSY))
return 0;
/* If PMD is trans splitting retry the access */
if (unlikely(old_pmd & _PAGE_SPLITTING))
return 0;
/* If PMD permissions don't match, take page fault */
if (unlikely(access & ~old_pmd))
return 1;
/*
* Try to lock the PTE, add ACCESSED and DIRTY if it was
* a write access
*/
new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
if (access & _PAGE_RW)
new_pmd |= _PAGE_DIRTY;
} while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
old_pmd, new_pmd));
/*
* PP bits. _PAGE_USER is already PP bit 0x2, so we only
* need to add in 0x1 if it's a read-only user page
*/
rflags = new_pmd & _PAGE_USER;
if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
(new_pmd & _PAGE_DIRTY)))
rflags |= 0x1;
/*
* _PAGE_EXEC -> HW_NO_EXEC since it's inverted
*/
rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
#if 0
if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
/*
* No CPU has hugepages but lacks no execute, so we
* don't need to worry about that case
*/
rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
}
#endif
/*
* Find the slot index details for this ea, using base page size.
*/
shift = mmu_psize_defs[psize].shift;
index = (ea & ~HPAGE_PMD_MASK) >> shift;
BUG_ON(index >= 4096);
vpn = hpt_vpn(ea, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
hpte_slot_array = get_hpte_slot_array(pmdp);
if (psize == MMU_PAGE_4K) {
/*
* invalidate the old hpte entry if we have that mapped via 64K
* base page size. This is because demote_segment won't flush
* hash page table entries.
*/
if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO))
invalidate_old_hpte(vsid, ea, pmdp, MMU_PAGE_64K, ssize);
}
valid = hpte_valid(hpte_slot_array, index);
if (valid) {
/* update the hpte bits */
hidx = hpte_hash_index(hpte_slot_array, index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
psize, lpsize, ssize, local);
/*
* We failed to update, try to insert a new entry.
*/
if (ret == -1) {
/*
* large pte is marked busy, so we can be sure
* nobody is looking at hpte_slot_array. hence we can
* safely update this here.
*/
valid = 0;
hpte_slot_array[index] = 0;
}
}
if (!valid) {
unsigned long hpte_group;
/* insert new entry */
pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
new_pmd |= _PAGE_HASHPTE;
/* Add in WIMG bits */
rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
_PAGE_GUARDED));
/*
* enable the memory coherence always
*/
rflags |= HPTE_R_M;
repeat:
hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
/* Insert into the hash table, primary slot */
slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
psize, lpsize, ssize);
/*
* Primary is full, try the secondary
*/
if (unlikely(slot == -1)) {
hpte_group = ((~hash & htab_hash_mask) *
HPTES_PER_GROUP) & ~0x7UL;
slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
rflags, HPTE_V_SECONDARY,
psize, lpsize, ssize);
if (slot == -1) {
if (mftb() & 0x1)
hpte_group = ((hash & htab_hash_mask) *
HPTES_PER_GROUP) & ~0x7UL;
ppc_md.hpte_remove(hpte_group);
goto repeat;
}
}
/*
* Hypervisor failure. Restore old pmd and return -1
* similar to __hash_page_*
*/
if (unlikely(slot == -2)) {
*pmdp = __pmd(old_pmd);
hash_failure_debug(ea, access, vsid, trap, ssize,
psize, lpsize, old_pmd);
return -1;
}
/*
* large pte is marked busy, so we can be sure
* nobody is looking at hpte_slot_array. hence we can
* safely update this here.
*/
mark_hpte_slot_valid(hpte_slot_array, index, slot);
}
/*
* Mark the pte with _PAGE_COMBO, if we are trying to hash it with
* base page size 4k.
*/
if (psize == MMU_PAGE_4K)
new_pmd |= _PAGE_COMBO;
/*
* The hpte valid is stored in the pgtable whose address is in the
* second half of the PMD. Order this against clearing of the busy bit in
* huge pmd.
*/
smp_wmb();
*pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
return 0;
}

View file

@ -0,0 +1,153 @@
/*
* PPC Huge TLB Page Support for Book3E MMU
*
* Copyright (C) 2009 David Gibson, IBM Corporation.
* Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
*
*/
#include <linux/mm.h>
#include <linux/hugetlb.h>
#ifdef CONFIG_PPC_FSL_BOOK3E
#ifdef CONFIG_PPC64
static inline int tlb1_next(void)
{
struct paca_struct *paca = get_paca();
struct tlb_core_data *tcd;
int this, next;
tcd = paca->tcd_ptr;
this = tcd->esel_next;
next = this + 1;
if (next >= tcd->esel_max)
next = tcd->esel_first;
tcd->esel_next = next;
return this;
}
#else
static inline int tlb1_next(void)
{
int index, ncams;
ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
index = __get_cpu_var(next_tlbcam_idx);
/* Just round-robin the entries and wrap when we hit the end */
if (unlikely(index == ncams - 1))
__get_cpu_var(next_tlbcam_idx) = tlbcam_index;
else
__get_cpu_var(next_tlbcam_idx)++;
return index;
}
#endif /* !PPC64 */
#endif /* FSL */
static inline int mmu_get_tsize(int psize)
{
return mmu_psize_defs[psize].enc;
}
static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
{
int found = 0;
mtspr(SPRN_MAS6, pid << 16);
if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
asm volatile(
"li %0,0\n"
"tlbsx. 0,%1\n"
"bne 1f\n"
"li %0,1\n"
"1:\n"
: "=&r"(found) : "r"(ea));
} else {
asm volatile(
"tlbsx 0,%1\n"
"mfspr %0,0x271\n"
"srwi %0,%0,31\n"
: "=&r"(found) : "r"(ea));
}
return found;
}
void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
pte_t pte)
{
unsigned long mas1, mas2;
u64 mas7_3;
unsigned long psize, tsize, shift;
unsigned long flags;
struct mm_struct *mm;
#ifdef CONFIG_PPC_FSL_BOOK3E
int index;
#endif
if (unlikely(is_kernel_addr(ea)))
return;
mm = vma->vm_mm;
#ifdef CONFIG_PPC_MM_SLICES
psize = get_slice_psize(mm, ea);
tsize = mmu_get_tsize(psize);
shift = mmu_psize_defs[psize].shift;
#else
psize = vma_mmu_pagesize(vma);
shift = __ilog2(psize);
tsize = shift - 10;
#endif
/*
* We can't be interrupted while we're setting up the MAS
* regusters or after we've confirmed that no tlb exists.
*/
local_irq_save(flags);
if (unlikely(book3e_tlb_exists(ea, mm->context.id))) {
local_irq_restore(flags);
return;
}
#ifdef CONFIG_PPC_FSL_BOOK3E
/* We have to use the CAM(TLB1) on FSL parts for hugepages */
index = tlb1_next();
mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1));
#endif
mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);
mas2 = ea & ~((1UL << shift) - 1);
mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT;
mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK;
if (!pte_dirty(pte))
mas7_3 &= ~(MAS3_SW|MAS3_UW);
mtspr(SPRN_MAS1, mas1);
mtspr(SPRN_MAS2, mas2);
if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) {
mtspr(SPRN_MAS7_MAS3, mas7_3);
} else {
if (mmu_has_feature(MMU_FTR_BIG_PHYS))
mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
}
asm volatile ("tlbwe");
local_irq_restore(flags);
}
void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
struct hstate *hstate = hstate_file(vma->vm_file);
unsigned long tsize = huge_page_shift(hstate) - 10;
__flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0);
}

View file

@ -0,0 +1,129 @@
/*
* PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
*
* Copyright (C) 2003 David Gibson, IBM Corporation.
*
* Based on the IA-32 version:
* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
*/
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/machdep.h>
extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
unsigned long pa, unsigned long rlags,
unsigned long vflags, int psize, int ssize);
int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, int local, int ssize,
unsigned int shift, unsigned int mmu_psize)
{
unsigned long vpn;
unsigned long old_pte, new_pte;
unsigned long rflags, pa, sz;
long slot;
BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
/* Search the Linux page table for a match with va */
vpn = hpt_vpn(ea, vsid, ssize);
/* At this point, we have a pte (old_pte) which can be used to build
* or update an HPTE. There are 2 cases:
*
* 1. There is a valid (present) pte with no associated HPTE (this is
* the most common case)
* 2. There is a valid (present) pte with an associated HPTE. The
* current values of the pp bits in the HPTE prevent access
* because we are doing software DIRTY bit management and the
* page is currently not DIRTY.
*/
do {
old_pte = pte_val(*ptep);
/* If PTE busy, retry the access */
if (unlikely(old_pte & _PAGE_BUSY))
return 0;
/* If PTE permissions don't match, take page fault */
if (unlikely(access & ~old_pte))
return 1;
/* Try to lock the PTE, add ACCESSED and DIRTY if it was
* a write access */
new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
if (access & _PAGE_RW)
new_pte |= _PAGE_DIRTY;
} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
old_pte, new_pte));
rflags = 0x2 | (!(new_pte & _PAGE_RW));
/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
sz = ((1UL) << shift);
if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
/* No CPU has hugepages but lacks no execute, so we
* don't need to worry about that case */
rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
/* Check if pte already has an hpte (case 2) */
if (unlikely(old_pte & _PAGE_HASHPTE)) {
/* There MIGHT be an HPTE for this pte */
unsigned long hash, slot;
hash = hpt_hash(vpn, shift, ssize);
if (old_pte & _PAGE_F_SECOND)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += (old_pte & _PAGE_F_GIX) >> 12;
if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
mmu_psize, ssize, local) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
}
if (likely(!(old_pte & _PAGE_HASHPTE))) {
unsigned long hash = hpt_hash(vpn, shift, ssize);
pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
/* clear HPTE slot informations in new PTE */
#ifdef CONFIG_PPC_64K_PAGES
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
#else
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
#endif
/* Add in WIMG bits */
rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
_PAGE_COHERENT | _PAGE_GUARDED));
/*
* enable the memory coherence always
*/
rflags |= HPTE_R_M;
slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
mmu_psize, ssize);
/*
* Hypervisor failure. Restore old pte and return -1
* similar to __hash_page_*
*/
if (unlikely(slot == -2)) {
*ptep = __pte(old_pte);
hash_failure_debug(ea, access, vsid, trap, ssize,
mmu_psize, mmu_psize, old_pte);
return -1;
}
new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
}
/*
* No need to use ldarx/stdcx here
*/
*ptep = __pte(new_pte & ~_PAGE_BUSY);
return 0;
}

File diff suppressed because it is too large Load diff

292
arch/powerpc/mm/icswx.c Normal file
View file

@ -0,0 +1,292 @@
/*
* ICSWX and ACOP Management
*
* Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include "icswx.h"
/*
* The processor and its L2 cache cause the icswx instruction to
* generate a COP_REQ transaction on PowerBus. The transaction has no
* address, and the processor does not perform an MMU access to
* authenticate the transaction. The command portion of the PowerBus
* COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor
* Process ID (PID), which the coprocessor compares to the authorized
* LPID and PID held in the coprocessor, to determine if the process
* is authorized to generate the transaction. The data of the COP_REQ
* transaction is 128-byte or less in size and is placed in cacheable
* memory on a 128-byte cache line boundary.
*
* The task to use a coprocessor should use use_cop() to mark the use
* of the Coprocessor Type (CT) and context switching. On a server
* class processor, the PID register is used only for coprocessor
* management + * and so a coprocessor PID is allocated before
* executing icswx + * instruction. Drop_cop() is used to free the
* coprocessor PID.
*
* Example:
* Host Fabric Interface (HFI) is a PowerPC network coprocessor.
* Each HFI have multiple windows. Each HFI window serves as a
* network device sending to and receiving from HFI network.
* HFI immediate send function uses icswx instruction. The immediate
* send function allows small (single cache-line) packets be sent
* without using the regular HFI send FIFO and doorbell, which are
* much slower than immediate send.
*
* For each task intending to use HFI immediate send, the HFI driver
* calls use_cop() to obtain a coprocessor PID for the task.
* The HFI driver then allocate a free HFI window and save the
* coprocessor PID to the HFI window to allow the task to use the
* HFI window.
*
* The HFI driver repeatedly creates immediate send packets and
* issues icswx instruction to send data through the HFI window.
* The HFI compares the coprocessor PID in the CPU PID register
* to the PID held in the HFI window to determine if the transaction
* is allowed.
*
* When the task to release the HFI window, the HFI driver calls
* drop_cop() to release the coprocessor PID.
*/
void switch_cop(struct mm_struct *next)
{
#ifdef CONFIG_PPC_ICSWX_PID
mtspr(SPRN_PID, next->context.cop_pid);
#endif
mtspr(SPRN_ACOP, next->context.acop);
}
/**
* Start using a coprocessor.
* @acop: mask of coprocessor to be used.
* @mm: The mm the coprocessor to associate with. Most likely current mm.
*
* Return a positive PID if successful. Negative errno otherwise.
* The returned PID will be fed to the coprocessor to determine if an
* icswx transaction is authenticated.
*/
int use_cop(unsigned long acop, struct mm_struct *mm)
{
int ret;
if (!cpu_has_feature(CPU_FTR_ICSWX))
return -ENODEV;
if (!mm || !acop)
return -EINVAL;
/* The page_table_lock ensures mm_users won't change under us */
spin_lock(&mm->page_table_lock);
spin_lock(mm->context.cop_lockp);
ret = get_cop_pid(mm);
if (ret < 0)
goto out;
/* update acop */
mm->context.acop |= acop;
sync_cop(mm);
/*
* If this is a threaded process then there might be other threads
* running. We need to send an IPI to force them to pick up any
* change in PID and ACOP.
*/
if (atomic_read(&mm->mm_users) > 1)
smp_call_function(sync_cop, mm, 1);
out:
spin_unlock(mm->context.cop_lockp);
spin_unlock(&mm->page_table_lock);
return ret;
}
EXPORT_SYMBOL_GPL(use_cop);
/**
* Stop using a coprocessor.
* @acop: mask of coprocessor to be stopped.
* @mm: The mm the coprocessor associated with.
*/
void drop_cop(unsigned long acop, struct mm_struct *mm)
{
int free_pid;
if (!cpu_has_feature(CPU_FTR_ICSWX))
return;
if (WARN_ON_ONCE(!mm))
return;
/* The page_table_lock ensures mm_users won't change under us */
spin_lock(&mm->page_table_lock);
spin_lock(mm->context.cop_lockp);
mm->context.acop &= ~acop;
free_pid = disable_cop_pid(mm);
sync_cop(mm);
/*
* If this is a threaded process then there might be other threads
* running. We need to send an IPI to force them to pick up any
* change in PID and ACOP.
*/
if (atomic_read(&mm->mm_users) > 1)
smp_call_function(sync_cop, mm, 1);
if (free_pid != COP_PID_NONE)
free_cop_pid(free_pid);
spin_unlock(mm->context.cop_lockp);
spin_unlock(&mm->page_table_lock);
}
EXPORT_SYMBOL_GPL(drop_cop);
static int acop_use_cop(int ct)
{
/* There is no alternate policy, yet */
return -1;
}
/*
* Get the instruction word at the NIP
*/
static u32 acop_get_inst(struct pt_regs *regs)
{
u32 inst;
u32 __user *p;
p = (u32 __user *)regs->nip;
if (!access_ok(VERIFY_READ, p, sizeof(*p)))
return 0;
if (__get_user(inst, p))
return 0;
return inst;
}
/**
* @regs: regsiters at time of interrupt
* @address: storage address
* @error_code: Fault code, usually the DSISR or ESR depending on
* processor type
*
* Return 0 if we are able to resolve the data storage fault that
* results from a CT miss in the ACOP register.
*/
int acop_handle_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code)
{
int ct;
u32 inst = 0;
if (!cpu_has_feature(CPU_FTR_ICSWX)) {
pr_info("No coprocessors available");
_exception(SIGILL, regs, ILL_ILLOPN, address);
}
if (!user_mode(regs)) {
/* this could happen if the HV denies the
* kernel access, for now we just die */
die("ICSWX from kernel failed", regs, SIGSEGV);
}
/* Some implementations leave us a hint for the CT */
ct = ICSWX_GET_CT_HINT(error_code);
if (ct < 0) {
/* we have to peek at the instruction word to figure out CT */
u32 ccw;
u32 rs;
inst = acop_get_inst(regs);
if (inst == 0)
return -1;
rs = (inst >> (31 - 10)) & 0x1f;
ccw = regs->gpr[rs];
ct = (ccw >> 16) & 0x3f;
}
/*
* We could be here because another thread has enabled acop
* but the ACOP register has yet to be updated.
*
* This should have been taken care of by the IPI to sync all
* the threads (see smp_call_function(sync_cop, mm, 1)), but
* that could take forever if there are a significant amount
* of threads.
*
* Given the number of threads on some of these systems,
* perhaps this is the best way to sync ACOP rather than whack
* every thread with an IPI.
*/
if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) {
sync_cop(current->active_mm);
return 0;
}
/* check for alternate policy */
if (!acop_use_cop(ct))
return 0;
/* at this point the CT is unknown to the system */
pr_warn("%s[%d]: Coprocessor %d is unavailable\n",
current->comm, current->pid, ct);
/* get inst if we don't already have it */
if (inst == 0) {
inst = acop_get_inst(regs);
if (inst == 0)
return -1;
}
/* Check if the instruction is the "record form" */
if (inst & 1) {
/*
* the instruction is "record" form so we can reject
* using CR0
*/
regs->ccr &= ~(0xful << 28);
regs->ccr |= ICSWX_RC_NOT_FOUND << 28;
/* Move on to the next instruction */
regs->nip += 4;
} else {
/*
* There is no architected mechanism to report a bad
* CT so we could either SIGILL or report nothing.
* Since the non-record version should only bu used
* for "hints" or "don't care" we should probably do
* nothing. However, I could see how some people
* might want an SIGILL so it here if you want it.
*/
#ifdef CONFIG_PPC_ICSWX_USE_SIGILL
_exception(SIGILL, regs, ILL_ILLOPN, address);
#else
regs->nip += 4;
#endif
}
return 0;
}
EXPORT_SYMBOL_GPL(acop_handle_fault);

68
arch/powerpc/mm/icswx.h Normal file
View file

@ -0,0 +1,68 @@
#ifndef _ARCH_POWERPC_MM_ICSWX_H_
#define _ARCH_POWERPC_MM_ICSWX_H_
/*
* ICSWX and ACOP Management
*
* Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <asm/mmu_context.h>
/* also used to denote that PIDs are not used */
#define COP_PID_NONE 0
static inline void sync_cop(void *arg)
{
struct mm_struct *mm = arg;
if (mm == current->active_mm)
switch_cop(current->active_mm);
}
#ifdef CONFIG_PPC_ICSWX_PID
extern int get_cop_pid(struct mm_struct *mm);
extern int disable_cop_pid(struct mm_struct *mm);
extern void free_cop_pid(int free_pid);
#else
#define get_cop_pid(m) (COP_PID_NONE)
#define disable_cop_pid(m) (COP_PID_NONE)
#define free_cop_pid(p)
#endif
/*
* These are implementation bits for architected registers. If this
* ever becomes architecture the should be moved to reg.h et. al.
*/
/* UCT is the same bit for Server and Embedded */
#define ICSWX_DSI_UCT 0x00004000 /* Unavailable Coprocessor Type */
#ifdef CONFIG_PPC_BOOK3E
/* Embedded implementation gives us no hints as to what the CT is */
#define ICSWX_GET_CT_HINT(x) (-1)
#else
/* Server implementation contains the CT value in the DSISR */
#define ICSWX_DSISR_CTMASK 0x00003f00
#define ICSWX_GET_CT_HINT(x) (((x) & ICSWX_DSISR_CTMASK) >> 8)
#endif
#define ICSWX_RC_STARTED 0x8 /* The request has been started */
#define ICSWX_RC_NOT_IDLE 0x4 /* No coprocessor found idle */
#define ICSWX_RC_NOT_FOUND 0x2 /* No coprocessor found */
#define ICSWX_RC_UNDEFINED 0x1 /* Reserved */
extern int acop_handle_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code);
static inline u64 acop_copro_type_bit(unsigned int type)
{
return 1ULL << (63 - type);
}
#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */

View file

@ -0,0 +1,87 @@
/*
* ICSWX and ACOP/PID Management
*
* Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/idr.h>
#include <linux/module.h>
#include "icswx.h"
#define COP_PID_MIN (COP_PID_NONE + 1)
#define COP_PID_MAX (0xFFFF)
static DEFINE_SPINLOCK(mmu_context_acop_lock);
static DEFINE_IDA(cop_ida);
static int new_cop_pid(struct ida *ida, int min_id, int max_id,
spinlock_t *lock)
{
int index;
int err;
again:
if (!ida_pre_get(ida, GFP_KERNEL))
return -ENOMEM;
spin_lock(lock);
err = ida_get_new_above(ida, min_id, &index);
spin_unlock(lock);
if (err == -EAGAIN)
goto again;
else if (err)
return err;
if (index > max_id) {
spin_lock(lock);
ida_remove(ida, index);
spin_unlock(lock);
return -ENOMEM;
}
return index;
}
int get_cop_pid(struct mm_struct *mm)
{
int pid;
if (mm->context.cop_pid == COP_PID_NONE) {
pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX,
&mmu_context_acop_lock);
if (pid >= 0)
mm->context.cop_pid = pid;
}
return mm->context.cop_pid;
}
int disable_cop_pid(struct mm_struct *mm)
{
int free_pid = COP_PID_NONE;
if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) {
free_pid = mm->context.cop_pid;
mm->context.cop_pid = COP_PID_NONE;
}
return free_pid;
}
void free_cop_pid(int free_pid)
{
spin_lock(&mmu_context_acop_lock);
ida_remove(&cop_ida, free_pid);
spin_unlock(&mmu_context_acop_lock);
}

224
arch/powerpc/mm/init_32.c Normal file
View file

@ -0,0 +1,224 @@
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
* PPC44x/36-bit changes by Matt Porter (mporter@mvista.com)
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/stddef.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/memblock.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/btext.h>
#include <asm/tlb.h>
#include <asm/sections.h>
#include <asm/hugetlb.h>
#include "mmu_decl.h"
#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
/* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */
#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET))
#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_KERNEL_START"
#endif
#endif
#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE
phys_addr_t total_memory;
phys_addr_t total_lowmem;
phys_addr_t memstart_addr = (phys_addr_t)~0ull;
EXPORT_SYMBOL(memstart_addr);
phys_addr_t kernstart_addr;
EXPORT_SYMBOL(kernstart_addr);
#ifdef CONFIG_RELOCATABLE_PPC32
/* Used in __va()/__pa() */
long long virt_phys_offset;
EXPORT_SYMBOL(virt_phys_offset);
#endif
phys_addr_t lowmem_end_addr;
int boot_mapsize;
#ifdef CONFIG_PPC_PMAC
unsigned long agp_special_page;
EXPORT_SYMBOL(agp_special_page);
#endif
void MMU_init(void);
/* XXX should be in current.h -- paulus */
extern struct task_struct *current_set[NR_CPUS];
/*
* this tells the system to map all of ram with the segregs
* (i.e. page tables) instead of the bats.
* -- Cort
*/
int __map_without_bats;
int __map_without_ltlbs;
/*
* This tells the system to allow ioremapping memory marked as reserved.
*/
int __allow_ioremap_reserved;
/* max amount of low RAM to map in */
unsigned long __max_low_memory = MAX_LOW_MEM;
/*
* Check for command-line options that affect what MMU_init will do.
*/
void __init MMU_setup(void)
{
/* Check for nobats option (used in mapin_ram). */
if (strstr(boot_command_line, "nobats")) {
__map_without_bats = 1;
}
if (strstr(boot_command_line, "noltlbs")) {
__map_without_ltlbs = 1;
}
#ifdef CONFIG_DEBUG_PAGEALLOC
__map_without_bats = 1;
__map_without_ltlbs = 1;
#endif
}
/*
* MMU_init sets up the basic memory mappings for the kernel,
* including both RAM and possibly some I/O regions,
* and sets up the page tables and the MMU hardware ready to go.
*/
void __init MMU_init(void)
{
if (ppc_md.progress)
ppc_md.progress("MMU:enter", 0x111);
/* parse args from command line */
MMU_setup();
/*
* Reserve gigantic pages for hugetlb. This MUST occur before
* lowmem_end_addr is initialized below.
*/
reserve_hugetlb_gpages();
if (memblock.memory.cnt > 1) {
#ifndef CONFIG_WII
memblock_enforce_memory_limit(memblock.memory.regions[0].size);
printk(KERN_WARNING "Only using first contiguous memory region");
#else
wii_memory_fixups();
#endif
}
total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr;
lowmem_end_addr = memstart_addr + total_lowmem;
#ifdef CONFIG_FSL_BOOKE
/* Freescale Book-E parts expect lowmem to be mapped by fixed TLB
* entries, so we need to adjust lowmem to match the amount we can map
* in the fixed entries */
adjust_total_lowmem();
#endif /* CONFIG_FSL_BOOKE */
if (total_lowmem > __max_low_memory) {
total_lowmem = __max_low_memory;
lowmem_end_addr = memstart_addr + total_lowmem;
#ifndef CONFIG_HIGHMEM
total_memory = total_lowmem;
memblock_enforce_memory_limit(total_lowmem);
#endif /* CONFIG_HIGHMEM */
}
/* Initialize the MMU hardware */
if (ppc_md.progress)
ppc_md.progress("MMU:hw init", 0x300);
MMU_init_hw();
/* Map in all of RAM starting at KERNELBASE */
if (ppc_md.progress)
ppc_md.progress("MMU:mapin", 0x301);
mapin_ram();
/* Initialize early top-down ioremap allocator */
ioremap_bot = IOREMAP_TOP;
/* Map in I/O resources */
if (ppc_md.progress)
ppc_md.progress("MMU:setio", 0x302);
if (ppc_md.progress)
ppc_md.progress("MMU:exit", 0x211);
/* From now on, btext is no longer BAT mapped if it was at all */
#ifdef CONFIG_BOOTX_TEXT
btext_unmap();
#endif
/* Shortly after that, the entire linear mapping will be available */
memblock_set_current_limit(lowmem_end_addr);
}
/* This is only called until mem_init is done. */
void __init *early_get_page(void)
{
if (init_bootmem_done)
return alloc_bootmem_pages(PAGE_SIZE);
else
return __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE));
}
#ifdef CONFIG_8xx /* No 8xx specific .c file to put that in ... */
void setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
/* We don't currently support the first MEMBLOCK not mapping 0
* physical on those processors
*/
BUG_ON(first_memblock_base != 0);
#ifdef CONFIG_PIN_TLB
/* 8xx can only access 24MB at the moment */
memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000));
#else
/* 8xx can only access 8MB at the moment */
memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
#endif
}
#endif /* CONFIG_8xx */

462
arch/powerpc/mm/init_64.c Normal file
View file

@ -0,0 +1,462 @@
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#undef DEBUG
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/idr.h>
#include <linux/nodemask.h>
#include <linux/module.h>
#include <linux/poison.h>
#include <linux/memblock.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/eeh.h>
#include <asm/processor.h>
#include <asm/mmzone.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/iommu.h>
#include <asm/vdso.h>
#include "mmu_decl.h"
#ifdef CONFIG_PPC_STD_MMU_64
#if PGTABLE_RANGE > USER_VSID_RANGE
#warning Limited user VSID range means pagetable space is wasted
#endif
#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
#warning TASK_SIZE is smaller than it needs to be.
#endif
#endif /* CONFIG_PPC_STD_MMU_64 */
phys_addr_t memstart_addr = ~0;
EXPORT_SYMBOL_GPL(memstart_addr);
phys_addr_t kernstart_addr;
EXPORT_SYMBOL_GPL(kernstart_addr);
static void pgd_ctor(void *addr)
{
memset(addr, 0, PGD_TABLE_SIZE);
}
static void pmd_ctor(void *addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
memset(addr, 0, PMD_TABLE_SIZE * 2);
#else
memset(addr, 0, PMD_TABLE_SIZE);
#endif
}
struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
/*
* Create a kmem_cache() for pagetables. This is not used for PTE
* pages - they're linked to struct page, come from the normal free
* pages pool and have a different entry size (see real_pte_t) to
* everything else. Caches created by this function are used for all
* the higher level pagetables, and for hugepage pagetables.
*/
void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
{
char *name;
unsigned long table_size = sizeof(void *) << shift;
unsigned long align = table_size;
/* When batching pgtable pointers for RCU freeing, we store
* the index size in the low bits. Table alignment must be
* big enough to fit it.
*
* Likewise, hugeapge pagetable pointers contain a (different)
* shift value in the low bits. All tables must be aligned so
* as to leave enough 0 bits in the address to contain it. */
unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
HUGEPD_SHIFT_MASK + 1);
struct kmem_cache *new;
/* It would be nice if this was a BUILD_BUG_ON(), but at the
* moment, gcc doesn't seem to recognize is_power_of_2 as a
* constant expression, so so much for that. */
BUG_ON(!is_power_of_2(minalign));
BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
if (PGT_CACHE(shift))
return; /* Already have a cache of this size */
align = max_t(unsigned long, align, minalign);
name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
new = kmem_cache_create(name, table_size, align, 0, ctor);
pgtable_cache[shift - 1] = new;
pr_debug("Allocated pgtable cache for order %d\n", shift);
}
void pgtable_cache_init(void)
{
pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
panic("Couldn't allocate pgtable caches");
/* In all current configs, when the PUD index exists it's the
* same size as either the pgd or pmd index. Verify that the
* initialization above has also created a PUD cache. This
* will need re-examiniation if we add new possibilities for
* the pagetable layout. */
BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
* Given an address within the vmemmap, determine the pfn of the page that
* represents the start of the section it is within. Note that we have to
* do this by hand as the proffered address may not be correctly aligned.
* Subtraction of non-aligned pointers produces undefined results.
*/
static unsigned long __meminit vmemmap_section_start(unsigned long page)
{
unsigned long offset = page - ((unsigned long)(vmemmap));
/* Return the pfn of the start of the section. */
return (offset / sizeof(struct page)) & PAGE_SECTION_MASK;
}
/*
* Check if this vmemmap page is already initialised. If any section
* which overlaps this vmemmap page is initialised then this page is
* initialised already.
*/
static int __meminit vmemmap_populated(unsigned long start, int page_size)
{
unsigned long end = start + page_size;
start = (unsigned long)(pfn_to_page(vmemmap_section_start(start)));
for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page)))
if (pfn_valid(page_to_pfn((struct page *)start)))
return 1;
return 0;
}
/* On hash-based CPUs, the vmemmap is bolted in the hash table.
*
* On Book3E CPUs, the vmemmap is currently mapped in the top half of
* the vmalloc space using normal page tables, though the size of
* pages encoded in the PTEs can be different
*/
#ifdef CONFIG_PPC_BOOK3E
static void __meminit vmemmap_create_mapping(unsigned long start,
unsigned long page_size,
unsigned long phys)
{
/* Create a PTE encoding without page size */
unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
_PAGE_KERNEL_RW;
/* PTEs only contain page size encodings up to 32M */
BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
/* Encode the size in the PTE */
flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
/* For each PTE for that area, map things. Note that we don't
* increment phys because all PTEs are of the large size and
* thus must have the low bits clear
*/
for (i = 0; i < page_size; i += PAGE_SIZE)
BUG_ON(map_kernel_page(start + i, phys, flags));
}
#ifdef CONFIG_MEMORY_HOTPLUG
static void vmemmap_remove_mapping(unsigned long start,
unsigned long page_size)
{
}
#endif
#else /* CONFIG_PPC_BOOK3E */
static void __meminit vmemmap_create_mapping(unsigned long start,
unsigned long page_size,
unsigned long phys)
{
int mapped = htab_bolt_mapping(start, start + page_size, phys,
pgprot_val(PAGE_KERNEL),
mmu_vmemmap_psize,
mmu_kernel_ssize);
BUG_ON(mapped < 0);
}
#ifdef CONFIG_MEMORY_HOTPLUG
static void vmemmap_remove_mapping(unsigned long start,
unsigned long page_size)
{
int mapped = htab_remove_mapping(start, start + page_size,
mmu_vmemmap_psize,
mmu_kernel_ssize);
BUG_ON(mapped < 0);
}
#endif
#endif /* CONFIG_PPC_BOOK3E */
struct vmemmap_backing *vmemmap_list;
static struct vmemmap_backing *next;
static int num_left;
static int num_freed;
static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
{
struct vmemmap_backing *vmem_back;
/* get from freed entries first */
if (num_freed) {
num_freed--;
vmem_back = next;
next = next->list;
return vmem_back;
}
/* allocate a page when required and hand out chunks */
if (!num_left) {
next = vmemmap_alloc_block(PAGE_SIZE, node);
if (unlikely(!next)) {
WARN_ON(1);
return NULL;
}
num_left = PAGE_SIZE / sizeof(struct vmemmap_backing);
}
num_left--;
return next++;
}
static __meminit void vmemmap_list_populate(unsigned long phys,
unsigned long start,
int node)
{
struct vmemmap_backing *vmem_back;
vmem_back = vmemmap_list_alloc(node);
if (unlikely(!vmem_back)) {
WARN_ON(1);
return;
}
vmem_back->phys = phys;
vmem_back->virt_addr = start;
vmem_back->list = vmemmap_list;
vmemmap_list = vmem_back;
}
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
/* Align to the page size of the linear mapping. */
start = _ALIGN_DOWN(start, page_size);
pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
for (; start < end; start += page_size) {
void *p;
if (vmemmap_populated(start, page_size))
continue;
p = vmemmap_alloc_block(page_size, node);
if (!p)
return -ENOMEM;
vmemmap_list_populate(__pa(p), start, node);
pr_debug(" * %016lx..%016lx allocated at %p\n",
start, start + page_size, p);
vmemmap_create_mapping(start, page_size, __pa(p));
}
return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
static unsigned long vmemmap_list_free(unsigned long start)
{
struct vmemmap_backing *vmem_back, *vmem_back_prev;
vmem_back_prev = vmem_back = vmemmap_list;
/* look for it with prev pointer recorded */
for (; vmem_back; vmem_back = vmem_back->list) {
if (vmem_back->virt_addr == start)
break;
vmem_back_prev = vmem_back;
}
if (unlikely(!vmem_back)) {
WARN_ON(1);
return 0;
}
/* remove it from vmemmap_list */
if (vmem_back == vmemmap_list) /* remove head */
vmemmap_list = vmem_back->list;
else
vmem_back_prev->list = vmem_back->list;
/* next point to this freed entry */
vmem_back->list = next;
next = vmem_back;
num_freed++;
return vmem_back->phys;
}
void __ref vmemmap_free(unsigned long start, unsigned long end)
{
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
start = _ALIGN_DOWN(start, page_size);
pr_debug("vmemmap_free %lx...%lx\n", start, end);
for (; start < end; start += page_size) {
unsigned long addr;
/*
* the section has already be marked as invalid, so
* vmemmap_populated() true means some other sections still
* in this page, so skip it.
*/
if (vmemmap_populated(start, page_size))
continue;
addr = vmemmap_list_free(start);
if (addr) {
struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
if (PageReserved(page)) {
/* allocated from bootmem */
if (page_size < PAGE_SIZE) {
/*
* this shouldn't happen, but if it is
* the case, leave the memory there
*/
WARN_ON_ONCE(1);
} else {
unsigned int nr_pages =
1 << get_order(page_size);
while (nr_pages--)
free_reserved_page(page++);
}
} else
free_pages((unsigned long)(__va(addr)),
get_order(page_size));
vmemmap_remove_mapping(start, page_size);
}
}
}
#endif
void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long size)
{
}
/*
* We do not have access to the sparsemem vmemmap, so we fallback to
* walking the list of sparsemem blocks which we already maintain for
* the sake of crashdump. In the long run, we might want to maintain
* a tree if performance of that linear walk becomes a problem.
*
* realmode_pfn_to_page functions can fail due to:
* 1) As real sparsemem blocks do not lay in RAM continously (they
* are in virtual address space which is not available in the real mode),
* the requested page struct can be split between blocks so get_page/put_page
* may fail.
* 2) When huge pages are used, the get_page/put_page API will fail
* in real mode as the linked addresses in the page struct are virtual
* too.
*/
struct page *realmode_pfn_to_page(unsigned long pfn)
{
struct vmemmap_backing *vmem_back;
struct page *page;
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
if (pg_va < vmem_back->virt_addr)
continue;
/* After vmemmap_list entry free is possible, need check all */
if ((pg_va + sizeof(struct page)) <=
(vmem_back->virt_addr + page_size)) {
page = (struct page *) (vmem_back->phys + pg_va -
vmem_back->virt_addr);
return page;
}
}
/* Probably that page struct is split between real pages */
return NULL;
}
EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
#elif defined(CONFIG_FLATMEM)
struct page *realmode_pfn_to_page(unsigned long pfn)
{
struct page *page = pfn_to_page(pfn);
return page;
}
EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */

621
arch/powerpc/mm/mem.c Normal file
View file

@ -0,0 +1,621 @@
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
* PPC44x/36-bit changes by Matt Porter (mporter@mvista.com)
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/stddef.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/suspend.h>
#include <linux/memblock.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/btext.h>
#include <asm/tlb.h>
#include <asm/sections.h>
#include <asm/sparsemem.h>
#include <asm/vdso.h>
#include <asm/fixmap.h>
#include <asm/swiotlb.h>
#include <asm/rtas.h>
#include "mmu_decl.h"
#ifndef CPU_FTR_COHERENT_ICACHE
#define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */
#define CPU_FTR_NOEXECUTE 0
#endif
int init_bootmem_done;
int mem_init_done;
unsigned long long memory_limit;
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
EXPORT_SYMBOL(kmap_pte);
pgprot_t kmap_prot;
EXPORT_SYMBOL(kmap_prot);
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
vaddr), vaddr), vaddr);
}
#endif
int page_is_ram(unsigned long pfn)
{
#ifndef CONFIG_PPC64 /* XXX for now */
return pfn < max_pfn;
#else
unsigned long paddr = (pfn << PAGE_SHIFT);
struct memblock_region *reg;
for_each_memblock(memory, reg)
if (paddr >= reg->base && paddr < (reg->base + reg->size))
return 1;
return 0;
#endif
}
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
if (ppc_md.phys_mem_access_prot)
return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot);
if (!page_is_ram(pfn))
vma_prot = pgprot_noncached(vma_prot);
return vma_prot;
}
EXPORT_SYMBOL(phys_mem_access_prot);
#ifdef CONFIG_MEMORY_HOTPLUG
#ifdef CONFIG_NUMA
int memory_add_physaddr_to_nid(u64 start)
{
return hot_add_scn_to_nid(start);
}
#endif
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdata;
struct zone *zone;
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
pgdata = NODE_DATA(nid);
start = (unsigned long)__va(start);
if (create_section_mapping(start, start + size))
return -EINVAL;
/* this should work for most non-highmem platforms */
zone = pgdata->node_zones +
zone_for_memory(nid, start, size, 0);
return __add_pages(nid, zone, start_pfn, nr_pages);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
int arch_remove_memory(u64 start, u64 size)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct zone *zone;
int ret;
zone = page_zone(pfn_to_page(start_pfn));
ret = __remove_pages(zone, start_pfn, nr_pages);
if (!ret && (ppc_md.remove_memory))
ret = ppc_md.remove_memory(start, size);
return ret;
}
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
/*
* walk_memory_resource() needs to make sure there is no holes in a given
* memory range. PPC64 does not maintain the memory layout in /proc/iomem.
* Instead it maintains it in memblock.memory structures. Walk through the
* memory regions, find holes and callback for contiguous regions.
*/
int
walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
{
struct memblock_region *reg;
unsigned long end_pfn = start_pfn + nr_pages;
unsigned long tstart, tend;
int ret = -1;
for_each_memblock(memory, reg) {
tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
if (tstart >= tend)
continue;
ret = (*func)(tstart, tend - tstart, arg);
if (ret)
break;
}
return ret;
}
EXPORT_SYMBOL_GPL(walk_system_ram_range);
/*
* Initialize the bootmem system and give it all the memory we
* have available. If we are using highmem, we only put the
* lowmem into the bootmem system.
*/
#ifndef CONFIG_NEED_MULTIPLE_NODES
void __init do_init_bootmem(void)
{
unsigned long start, bootmap_pages;
unsigned long total_pages;
struct memblock_region *reg;
int boot_mapsize;
max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
total_pages = (memblock_end_of_DRAM() - memstart_addr) >> PAGE_SHIFT;
#ifdef CONFIG_HIGHMEM
total_pages = total_lowmem >> PAGE_SHIFT;
max_low_pfn = lowmem_end_addr >> PAGE_SHIFT;
#endif
/*
* Find an area to use for the bootmem bitmap. Calculate the size of
* bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE.
* Add 1 additional page in case the address isn't page-aligned.
*/
bootmap_pages = bootmem_bootmap_pages(total_pages);
start = memblock_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE);
min_low_pfn = MEMORY_START >> PAGE_SHIFT;
boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn);
/* Place all memblock_regions in the same node and merge contiguous
* memblock_regions
*/
memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
/* Add all physical memory to the bootmem map, mark each area
* present.
*/
#ifdef CONFIG_HIGHMEM
free_bootmem_with_active_regions(0, lowmem_end_addr >> PAGE_SHIFT);
/* reserve the sections we're already using */
for_each_memblock(reserved, reg) {
unsigned long top = reg->base + reg->size - 1;
if (top < lowmem_end_addr)
reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT);
else if (reg->base < lowmem_end_addr) {
unsigned long trunc_size = lowmem_end_addr - reg->base;
reserve_bootmem(reg->base, trunc_size, BOOTMEM_DEFAULT);
}
}
#else
free_bootmem_with_active_regions(0, max_pfn);
/* reserve the sections we're already using */
for_each_memblock(reserved, reg)
reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT);
#endif
/* XXX need to clip this if using highmem? */
sparse_memory_present_with_active_regions(0);
init_bootmem_done = 1;
}
/* mark pages that don't exist as nosave */
static int __init mark_nonram_nosave(void)
{
struct memblock_region *reg, *prev = NULL;
for_each_memblock(memory, reg) {
if (prev &&
memblock_region_memory_end_pfn(prev) < memblock_region_memory_base_pfn(reg))
register_nosave_region(memblock_region_memory_end_pfn(prev),
memblock_region_memory_base_pfn(reg));
prev = reg;
}
return 0;
}
#else /* CONFIG_NEED_MULTIPLE_NODES */
static int __init mark_nonram_nosave(void)
{
return 0;
}
#endif
static bool zone_limits_final;
static unsigned long max_zone_pfns[MAX_NR_ZONES] = {
[0 ... MAX_NR_ZONES - 1] = ~0UL
};
/*
* Restrict the specified zone and all more restrictive zones
* to be below the specified pfn. May not be called after
* paging_init().
*/
void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit)
{
int i;
if (WARN_ON(zone_limits_final))
return;
for (i = zone; i >= 0; i--) {
if (max_zone_pfns[i] > pfn_limit)
max_zone_pfns[i] = pfn_limit;
}
}
/*
* Find the least restrictive zone that is entirely below the
* specified pfn limit. Returns < 0 if no suitable zone is found.
*
* pfn_limit must be u64 because it can exceed 32 bits even on 32-bit
* systems -- the DMA limit can be higher than any possible real pfn.
*/
int dma_pfn_limit_to_zone(u64 pfn_limit)
{
enum zone_type top_zone = ZONE_NORMAL;
int i;
#ifdef CONFIG_HIGHMEM
top_zone = ZONE_HIGHMEM;
#endif
for (i = top_zone; i >= 0; i--) {
if (max_zone_pfns[i] <= pfn_limit)
return i;
}
return -EPERM;
}
/*
* paging_init() sets up the page tables - in fact we've already done this.
*/
void __init paging_init(void)
{
unsigned long long total_ram = memblock_phys_mem_size();
phys_addr_t top_of_ram = memblock_end_of_DRAM();
enum zone_type top_zone;
#ifdef CONFIG_PPC32
unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
unsigned long end = __fix_to_virt(FIX_HOLE);
for (; v < end; v += PAGE_SIZE)
map_page(v, 0, 0); /* XXX gross */
#endif
#ifdef CONFIG_HIGHMEM
map_page(PKMAP_BASE, 0, 0); /* XXX gross */
pkmap_page_table = virt_to_kpte(PKMAP_BASE);
kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
kmap_prot = PAGE_KERNEL;
#endif /* CONFIG_HIGHMEM */
printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n",
(unsigned long long)top_of_ram, total_ram);
printk(KERN_DEBUG "Memory hole size: %ldMB\n",
(long int)((top_of_ram - total_ram) >> 20));
#ifdef CONFIG_HIGHMEM
top_zone = ZONE_HIGHMEM;
limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT);
#else
top_zone = ZONE_NORMAL;
#endif
limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT);
zone_limits_final = true;
free_area_init_nodes(max_zone_pfns);
mark_nonram_nosave();
}
static void __init register_page_bootmem_info(void)
{
int i;
for_each_online_node(i)
register_page_bootmem_info_node(NODE_DATA(i));
}
void __init mem_init(void)
{
/*
* book3s is limited to 16 page sizes due to encoding this in
* a 4-bit field for slices.
*/
BUILD_BUG_ON(MMU_PAGE_COUNT > 16);
#ifdef CONFIG_SWIOTLB
swiotlb_init(0);
#endif
register_page_bootmem_info();
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
set_max_mapnr(max_pfn);
free_all_bootmem();
#ifdef CONFIG_HIGHMEM
{
unsigned long pfn, highmem_mapnr;
highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT;
for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
struct page *page = pfn_to_page(pfn);
if (!memblock_is_reserved(paddr))
free_highmem_page(page);
}
}
#endif /* CONFIG_HIGHMEM */
#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP)
/*
* If smp is enabled, next_tlbcam_idx is initialized in the cpu up
* functions.... do it here for the non-smp case.
*/
per_cpu(next_tlbcam_idx, smp_processor_id()) =
(mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
#endif
mem_init_print_info(NULL);
#ifdef CONFIG_PPC32
pr_info("Kernel virtual memory layout:\n");
pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP);
#ifdef CONFIG_HIGHMEM
pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n",
PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP));
#endif /* CONFIG_HIGHMEM */
#ifdef CONFIG_NOT_COHERENT_CACHE
pr_info(" * 0x%08lx..0x%08lx : consistent mem\n",
IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE);
#endif /* CONFIG_NOT_COHERENT_CACHE */
pr_info(" * 0x%08lx..0x%08lx : early ioremap\n",
ioremap_bot, IOREMAP_TOP);
pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n",
VMALLOC_START, VMALLOC_END);
#endif /* CONFIG_PPC32 */
mem_init_done = 1;
}
void free_initmem(void)
{
ppc_md.progress = ppc_printk_progress;
free_initmem_default(POISON_FREE_INITMEM);
}
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
free_reserved_area((void *)start, (void *)end, -1, "initrd");
}
#endif
/*
* This is called when a page has been modified by the kernel.
* It just marks the page as not i-cache clean. We do the i-cache
* flush later when the page is given to a user process, if necessary.
*/
void flush_dcache_page(struct page *page)
{
if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
return;
/* avoid an atomic op if possible */
if (test_bit(PG_arch_1, &page->flags))
clear_bit(PG_arch_1, &page->flags);
}
EXPORT_SYMBOL(flush_dcache_page);
void flush_dcache_icache_page(struct page *page)
{
#ifdef CONFIG_HUGETLB_PAGE
if (PageCompound(page)) {
flush_dcache_icache_hugepage(page);
return;
}
#endif
#ifdef CONFIG_BOOKE
{
void *start = kmap_atomic(page);
__flush_dcache_icache(start);
kunmap_atomic(start);
}
#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
/* On 8xx there is no need to kmap since highmem is not supported */
__flush_dcache_icache(page_address(page));
#else
__flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
#endif
}
EXPORT_SYMBOL(flush_dcache_icache_page);
void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
{
clear_page(page);
/*
* We shouldn't have to do this, but some versions of glibc
* require it (ld.so assumes zero filled pages are icache clean)
* - Anton
*/
flush_dcache_page(pg);
}
EXPORT_SYMBOL(clear_user_page);
void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
struct page *pg)
{
copy_page(vto, vfrom);
/*
* We should be able to use the following optimisation, however
* there are two problems.
* Firstly a bug in some versions of binutils meant PLT sections
* were not marked executable.
* Secondly the first word in the GOT section is blrl, used
* to establish the GOT address. Until recently the GOT was
* not marked executable.
* - Anton
*/
#if 0
if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
return;
#endif
flush_dcache_page(pg);
}
void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
unsigned long addr, int len)
{
unsigned long maddr;
maddr = (unsigned long) kmap(page) + (addr & ~PAGE_MASK);
flush_icache_range(maddr, maddr + len);
kunmap(page);
}
EXPORT_SYMBOL(flush_icache_user_range);
/*
* This is called at the end of handling a user page fault, when the
* fault has been handled by updating a PTE in the linux page tables.
* We use it to preload an HPTE into the hash table corresponding to
* the updated linux PTE.
*
* This must always be called with the pte lock held.
*/
void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
#ifdef CONFIG_PPC_STD_MMU
/*
* We don't need to worry about _PAGE_PRESENT here because we are
* called with either mm->page_table_lock held or ptl lock held
*/
unsigned long access = 0, trap;
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
return;
/* We try to figure out if we are coming from an instruction
* access fault and pass that down to __hash_page so we avoid
* double-faulting on execution of fresh text. We have to test
* for regs NULL since init will get here first thing at boot
*
* We also avoid filling the hash if not coming from a fault
*/
if (current->thread.regs == NULL)
return;
trap = TRAP(current->thread.regs);
if (trap == 0x400)
access |= _PAGE_EXEC;
else if (trap != 0x300)
return;
hash_preload(vma->vm_mm, address, access, trap);
#endif /* CONFIG_PPC_STD_MMU */
#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
&& defined(CONFIG_HUGETLB_PAGE)
if (is_vm_hugetlb_page(vma))
book3e_hugetlb_preload(vma, address, *ptep);
#endif
}
/*
* System memory should not be in /proc/iomem but various tools expect it
* (eg kdump).
*/
static int __init add_system_ram_resources(void)
{
struct memblock_region *reg;
for_each_memblock(memory, reg) {
struct resource *res;
unsigned long base = reg->base;
unsigned long size = reg->size;
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
WARN_ON(!res);
if (res) {
res->name = "System RAM";
res->start = base;
res->end = base + size - 1;
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
WARN_ON(request_resource(&iomem_resource, res) < 0);
}
}
return 0;
}
subsys_initcall(add_system_ram_resources);
#ifdef CONFIG_STRICT_DEVMEM
/*
* devmem_is_allowed(): check to see if /dev/mem access to a certain address
* is valid. The argument is a physical page number.
*
* Access has to be given to non-kernel-ram areas as well, these contain the
* PCI mmio resources as well as potential bios/acpi data regions.
*/
int devmem_is_allowed(unsigned long pfn)
{
if (iomem_is_exclusive(pfn << PAGE_SHIFT))
return 0;
if (!page_is_ram(pfn))
return 1;
if (page_is_rtas_user_buf(pfn))
return 1;
return 0;
}
#endif /* CONFIG_STRICT_DEVMEM */

99
arch/powerpc/mm/mmap.c Normal file
View file

@ -0,0 +1,99 @@
/*
* flexible mmap layout support
*
* Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
* Started by Ingo Molnar <mingo@elte.hu>
*/
#include <linux/personality.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched.h>
/*
* Top of mmap area (just below the process stack).
*
* Leave at least a ~128 MB hole on 32bit applications.
*
* On 64bit applications we randomise the stack by 1GB so we need to
* space our mmap start address by a further 1GB, otherwise there is a
* chance the mmap area will end up closer to the stack than our ulimit
* requires.
*/
#define MIN_GAP32 (128*1024*1024)
#define MIN_GAP64 ((128 + 1024)*1024*1024UL)
#define MIN_GAP ((is_32bit_task()) ? MIN_GAP32 : MIN_GAP64)
#define MAX_GAP (TASK_SIZE/6*5)
static inline int mmap_is_legacy(void)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
}
static unsigned long mmap_rnd(void)
{
unsigned long rnd = 0;
if (current->flags & PF_RANDOMIZE) {
/* 8MB for 32bit, 1GB for 64bit */
if (is_32bit_task())
rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
else
rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
}
return rnd << PAGE_SHIFT;
}
static inline unsigned long mmap_base(void)
{
unsigned long gap = rlimit(RLIMIT_STACK);
if (gap < MIN_GAP)
gap = MIN_GAP;
else if (gap > MAX_GAP)
gap = MAX_GAP;
return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
}
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
/*
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
*/
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}

View file

@ -0,0 +1,119 @@
/*
* This file contains the routines for handling the MMU on those
* PowerPC implementations where the MMU substantially follows the
* architecture specification. This includes the 6xx, 7xx, 7xxx,
* and 8260 implementations but excludes the 8xx and 4xx.
* -- paulus
*
* Derived from arch/ppc/mm/init.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/export.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
/*
* On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs
* (virtual segment identifiers) for each context. Although the
* hardware supports 24-bit VSIDs, and thus >1 million contexts,
* we only use 32,768 of them. That is ample, since there can be
* at most around 30,000 tasks in the system anyway, and it means
* that we can use a bitmap to indicate which contexts are in use.
* Using a bitmap means that we entirely avoid all of the problems
* that we used to have when the context number overflowed,
* particularly on SMP systems.
* -- paulus.
*/
#define NO_CONTEXT ((unsigned long) -1)
#define LAST_CONTEXT 32767
#define FIRST_CONTEXT 1
/*
* This function defines the mapping from contexts to VSIDs (virtual
* segment IDs). We use a skew on both the context and the high 4 bits
* of the 32-bit virtual address (the "effective segment ID") in order
* to spread out the entries in the MMU hash table. Note, if this
* function is changed then arch/ppc/mm/hashtable.S will have to be
* changed to correspond.
*
*
* CTX_TO_VSID(ctx, va) (((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \
* & 0xffffff)
*/
static unsigned long next_mmu_context;
static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
unsigned long __init_new_context(void)
{
unsigned long ctx = next_mmu_context;
while (test_and_set_bit(ctx, context_map)) {
ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
if (ctx > LAST_CONTEXT)
ctx = 0;
}
next_mmu_context = (ctx + 1) & LAST_CONTEXT;
return ctx;
}
EXPORT_SYMBOL_GPL(__init_new_context);
/*
* Set up the context for a new address space.
*/
int init_new_context(struct task_struct *t, struct mm_struct *mm)
{
mm->context.id = __init_new_context();
return 0;
}
/*
* Free a context ID. Make sure to call this with preempt disabled!
*/
void __destroy_context(unsigned long ctx)
{
clear_bit(ctx, context_map);
}
EXPORT_SYMBOL_GPL(__destroy_context);
/*
* We're finished using the context for an address space.
*/
void destroy_context(struct mm_struct *mm)
{
preempt_disable();
if (mm->context.id != NO_CONTEXT) {
__destroy_context(mm->context.id);
mm->context.id = NO_CONTEXT;
}
preempt_enable();
}
/*
* Initialize the context management stuff.
*/
void __init mmu_context_init(void)
{
/* Reserve context 0 for kernel use */
context_map[0] = (1 << FIRST_CONTEXT) - 1;
next_mmu_context = FIRST_CONTEXT;
}

View file

@ -0,0 +1,146 @@
/*
* MMU context allocation for 64-bit kernels.
*
* Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/idr.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include "icswx.h"
static DEFINE_SPINLOCK(mmu_context_lock);
static DEFINE_IDA(mmu_context_ida);
int __init_new_context(void)
{
int index;
int err;
again:
if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
return -ENOMEM;
spin_lock(&mmu_context_lock);
err = ida_get_new_above(&mmu_context_ida, 1, &index);
spin_unlock(&mmu_context_lock);
if (err == -EAGAIN)
goto again;
else if (err)
return err;
if (index > MAX_USER_CONTEXT) {
spin_lock(&mmu_context_lock);
ida_remove(&mmu_context_ida, index);
spin_unlock(&mmu_context_lock);
return -ENOMEM;
}
return index;
}
EXPORT_SYMBOL_GPL(__init_new_context);
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
int index;
index = __init_new_context();
if (index < 0)
return index;
/* The old code would re-promote on fork, we don't do that
* when using slices as it could cause problem promoting slices
* that have been forced down to 4K
*/
if (slice_mm_new_context(mm))
slice_set_user_psize(mm, mmu_virtual_psize);
subpage_prot_init_new_context(mm);
mm->context.id = index;
#ifdef CONFIG_PPC_ICSWX
mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
if (!mm->context.cop_lockp) {
__destroy_context(index);
subpage_prot_free(mm);
mm->context.id = MMU_NO_CONTEXT;
return -ENOMEM;
}
spin_lock_init(mm->context.cop_lockp);
#endif /* CONFIG_PPC_ICSWX */
#ifdef CONFIG_PPC_64K_PAGES
mm->context.pte_frag = NULL;
#endif
return 0;
}
void __destroy_context(int context_id)
{
spin_lock(&mmu_context_lock);
ida_remove(&mmu_context_ida, context_id);
spin_unlock(&mmu_context_lock);
}
EXPORT_SYMBOL_GPL(__destroy_context);
#ifdef CONFIG_PPC_64K_PAGES
static void destroy_pagetable_page(struct mm_struct *mm)
{
int count;
void *pte_frag;
struct page *page;
pte_frag = mm->context.pte_frag;
if (!pte_frag)
return;
page = virt_to_page(pte_frag);
/* drop all the pending references */
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count);
if (!count) {
pgtable_page_dtor(page);
free_hot_cold_page(page, 0);
}
}
#else
static inline void destroy_pagetable_page(struct mm_struct *mm)
{
return;
}
#endif
void destroy_context(struct mm_struct *mm)
{
#ifdef CONFIG_PPC_ICSWX
drop_cop(mm->context.acop, mm);
kfree(mm->context.cop_lockp);
mm->context.cop_lockp = NULL;
#endif /* CONFIG_PPC_ICSWX */
destroy_pagetable_page(mm);
__destroy_context(mm->context.id);
subpage_prot_free(mm);
mm->context.id = MMU_NO_CONTEXT;
}

View file

@ -0,0 +1,449 @@
/*
* This file contains the routines for handling the MMU on those
* PowerPC implementations where the MMU is not using the hash
* table, such as 8xx, 4xx, BookE's etc...
*
* Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org>
* IBM Corp.
*
* Derived from previous arch/powerpc/mm/mmu_context.c
* and arch/powerpc/include/asm/mmu_context.h
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* TODO:
*
* - The global context lock will not scale very well
* - The maps should be dynamically allocated to allow for processors
* that support more PID bits at runtime
* - Implement flush_tlb_mm() by making the context stale and picking
* a new one
* - More aggressively clear stale map bits and maybe find some way to
* also clear mm->cpu_vm_mask bits when processes are migrated
*/
//#define DEBUG_MAP_CONSISTENCY
//#define DEBUG_CLAMP_LAST_CONTEXT 31
//#define DEBUG_HARDER
/* We don't use DEBUG because it tends to be compiled in always nowadays
* and this would generate way too much output
*/
#ifdef DEBUG_HARDER
#define pr_hard(args...) printk(KERN_DEBUG args)
#define pr_hardcont(args...) printk(KERN_CONT args)
#else
#define pr_hard(args...) do { } while(0)
#define pr_hardcont(args...) do { } while(0)
#endif
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/bootmem.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
static unsigned int first_context, last_context;
static unsigned int next_context, nr_free_contexts;
static unsigned long *context_map;
static unsigned long *stale_map[NR_CPUS];
static struct mm_struct **context_mm;
static DEFINE_RAW_SPINLOCK(context_lock);
#define CTX_MAP_SIZE \
(sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
/* Steal a context from a task that has one at the moment.
*
* This is used when we are running out of available PID numbers
* on the processors.
*
* This isn't an LRU system, it just frees up each context in
* turn (sort-of pseudo-random replacement :). This would be the
* place to implement an LRU scheme if anyone was motivated to do it.
* -- paulus
*
* For context stealing, we use a slightly different approach for
* SMP and UP. Basically, the UP one is simpler and doesn't use
* the stale map as we can just flush the local CPU
* -- benh
*/
#ifdef CONFIG_SMP
static unsigned int steal_context_smp(unsigned int id)
{
struct mm_struct *mm;
unsigned int cpu, max, i;
max = last_context - first_context;
/* Attempt to free next_context first and then loop until we manage */
while (max--) {
/* Pick up the victim mm */
mm = context_mm[id];
/* We have a candidate victim, check if it's active, on SMP
* we cannot steal active contexts
*/
if (mm->context.active) {
id++;
if (id > last_context)
id = first_context;
continue;
}
pr_hardcont(" | steal %d from 0x%p", id, mm);
/* Mark this mm has having no context anymore */
mm->context.id = MMU_NO_CONTEXT;
/* Mark it stale on all CPUs that used this mm. For threaded
* implementations, we set it on all threads on each core
* represented in the mask. A future implementation will use
* a core map instead but this will do for now.
*/
for_each_cpu(cpu, mm_cpumask(mm)) {
for (i = cpu_first_thread_sibling(cpu);
i <= cpu_last_thread_sibling(cpu); i++) {
if (stale_map[i])
__set_bit(id, stale_map[i]);
}
cpu = i - 1;
}
return id;
}
/* This will happen if you have more CPUs than available contexts,
* all we can do here is wait a bit and try again
*/
raw_spin_unlock(&context_lock);
cpu_relax();
raw_spin_lock(&context_lock);
/* This will cause the caller to try again */
return MMU_NO_CONTEXT;
}
#endif /* CONFIG_SMP */
/* Note that this will also be called on SMP if all other CPUs are
* offlined, which means that it may be called for cpu != 0. For
* this to work, we somewhat assume that CPUs that are onlined
* come up with a fully clean TLB (or are cleaned when offlined)
*/
static unsigned int steal_context_up(unsigned int id)
{
struct mm_struct *mm;
int cpu = smp_processor_id();
/* Pick up the victim mm */
mm = context_mm[id];
pr_hardcont(" | steal %d from 0x%p", id, mm);
/* Flush the TLB for that context */
local_flush_tlb_mm(mm);
/* Mark this mm has having no context anymore */
mm->context.id = MMU_NO_CONTEXT;
/* XXX This clear should ultimately be part of local_flush_tlb_mm */
__clear_bit(id, stale_map[cpu]);
return id;
}
#ifdef DEBUG_MAP_CONSISTENCY
static void context_check_map(void)
{
unsigned int id, nrf, nact;
nrf = nact = 0;
for (id = first_context; id <= last_context; id++) {
int used = test_bit(id, context_map);
if (!used)
nrf++;
if (used != (context_mm[id] != NULL))
pr_err("MMU: Context %d is %s and MM is %p !\n",
id, used ? "used" : "free", context_mm[id]);
if (context_mm[id] != NULL)
nact += context_mm[id]->context.active;
}
if (nrf != nr_free_contexts) {
pr_err("MMU: Free context count out of sync ! (%d vs %d)\n",
nr_free_contexts, nrf);
nr_free_contexts = nrf;
}
if (nact > num_online_cpus())
pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
nact, num_online_cpus());
if (first_context > 0 && !test_bit(0, context_map))
pr_err("MMU: Context 0 has been freed !!!\n");
}
#else
static void context_check_map(void) { }
#endif
void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
{
unsigned int i, id, cpu = smp_processor_id();
unsigned long *map;
/* No lockless fast path .. yet */
raw_spin_lock(&context_lock);
pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
cpu, next, next->context.active, next->context.id);
#ifdef CONFIG_SMP
/* Mark us active and the previous one not anymore */
next->context.active++;
if (prev) {
pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active);
WARN_ON(prev->context.active < 1);
prev->context.active--;
}
again:
#endif /* CONFIG_SMP */
/* If we already have a valid assigned context, skip all that */
id = next->context.id;
if (likely(id != MMU_NO_CONTEXT)) {
#ifdef DEBUG_MAP_CONSISTENCY
if (context_mm[id] != next)
pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n",
next, id, id, context_mm[id]);
#endif
goto ctxt_ok;
}
/* We really don't have a context, let's try to acquire one */
id = next_context;
if (id > last_context)
id = first_context;
map = context_map;
/* No more free contexts, let's try to steal one */
if (nr_free_contexts == 0) {
#ifdef CONFIG_SMP
if (num_online_cpus() > 1) {
id = steal_context_smp(id);
if (id == MMU_NO_CONTEXT)
goto again;
goto stolen;
}
#endif /* CONFIG_SMP */
id = steal_context_up(id);
goto stolen;
}
nr_free_contexts--;
/* We know there's at least one free context, try to find it */
while (__test_and_set_bit(id, map)) {
id = find_next_zero_bit(map, last_context+1, id);
if (id > last_context)
id = first_context;
}
stolen:
next_context = id + 1;
context_mm[id] = next;
next->context.id = id;
pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts);
context_check_map();
ctxt_ok:
/* If that context got marked stale on this CPU, then flush the
* local TLB for it and unmark it before we use it
*/
if (test_bit(id, stale_map[cpu])) {
pr_hardcont(" | stale flush %d [%d..%d]",
id, cpu_first_thread_sibling(cpu),
cpu_last_thread_sibling(cpu));
local_flush_tlb_mm(next);
/* XXX This clear should ultimately be part of local_flush_tlb_mm */
for (i = cpu_first_thread_sibling(cpu);
i <= cpu_last_thread_sibling(cpu); i++) {
if (stale_map[i])
__clear_bit(id, stale_map[i]);
}
}
/* Flick the MMU and release lock */
pr_hardcont(" -> %d\n", id);
set_context(id, next->pgd);
raw_spin_unlock(&context_lock);
}
/*
* Set up the context for a new address space.
*/
int init_new_context(struct task_struct *t, struct mm_struct *mm)
{
pr_hard("initing context for mm @%p\n", mm);
mm->context.id = MMU_NO_CONTEXT;
mm->context.active = 0;
#ifdef CONFIG_PPC_MM_SLICES
if (slice_mm_new_context(mm))
slice_set_user_psize(mm, mmu_virtual_psize);
#endif
return 0;
}
/*
* We're finished using the context for an address space.
*/
void destroy_context(struct mm_struct *mm)
{
unsigned long flags;
unsigned int id;
if (mm->context.id == MMU_NO_CONTEXT)
return;
WARN_ON(mm->context.active != 0);
raw_spin_lock_irqsave(&context_lock, flags);
id = mm->context.id;
if (id != MMU_NO_CONTEXT) {
__clear_bit(id, context_map);
mm->context.id = MMU_NO_CONTEXT;
#ifdef DEBUG_MAP_CONSISTENCY
mm->context.active = 0;
#endif
context_mm[id] = NULL;
nr_free_contexts++;
}
raw_spin_unlock_irqrestore(&context_lock, flags);
}
#ifdef CONFIG_SMP
static int mmu_context_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned int)(long)hcpu;
/* We don't touch CPU 0 map, it's allocated at aboot and kept
* around forever
*/
if (cpu == boot_cpuid)
return NOTIFY_OK;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu);
stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
kfree(stale_map[cpu]);
stale_map[cpu] = NULL;
/* We also clear the cpu_vm_mask bits of CPUs going away */
clear_tasks_mm_cpumask(cpu);
break;
#endif /* CONFIG_HOTPLUG_CPU */
}
return NOTIFY_OK;
}
static struct notifier_block mmu_context_cpu_nb = {
.notifier_call = mmu_context_cpu_notify,
};
#endif /* CONFIG_SMP */
/*
* Initialize the context management stuff.
*/
void __init mmu_context_init(void)
{
/* Mark init_mm as being active on all possible CPUs since
* we'll get called with prev == init_mm the first time
* we schedule on a given CPU
*/
init_mm.context.active = NR_CPUS;
/*
* The MPC8xx has only 16 contexts. We rotate through them on each
* task switch. A better way would be to keep track of tasks that
* own contexts, and implement an LRU usage. That way very active
* tasks don't always have to pay the TLB reload overhead. The
* kernel pages are mapped shared, so the kernel can run on behalf
* of any task that makes a kernel entry. Shared does not mean they
* are not protected, just that the ASID comparison is not performed.
* -- Dan
*
* The IBM4xx has 256 contexts, so we can just rotate through these
* as a way of "switching" contexts. If the TID of the TLB is zero,
* the PID/TID comparison is disabled, so we can use a TID of zero
* to represent all kernel pages as shared among all contexts.
* -- Dan
*
* The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We
* should normally never have to steal though the facility is
* present if needed.
* -- BenH
*/
if (mmu_has_feature(MMU_FTR_TYPE_8xx)) {
first_context = 0;
last_context = 15;
} else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
first_context = 1;
last_context = 65535;
} else {
first_context = 1;
last_context = 255;
}
#ifdef DEBUG_CLAMP_LAST_CONTEXT
last_context = DEBUG_CLAMP_LAST_CONTEXT;
#endif
/*
* Allocate the maps used by context management
*/
context_map = alloc_bootmem(CTX_MAP_SIZE);
context_mm = alloc_bootmem(sizeof(void *) * (last_context + 1));
#ifndef CONFIG_SMP
stale_map[0] = alloc_bootmem(CTX_MAP_SIZE);
#else
stale_map[boot_cpuid] = alloc_bootmem(CTX_MAP_SIZE);
register_cpu_notifier(&mmu_context_cpu_nb);
#endif
printk(KERN_INFO
"MMU: Allocated %zu bytes of context maps for %d contexts\n",
2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)),
last_context - first_context + 1);
/*
* Some processors have too few contexts to reserve one for
* init_mm, and require using context 0 for a normal task.
* Other processors reserve the use of context zero for the kernel.
* This code assumes first_context < 32.
*/
context_map[0] = (1 << first_context) - 1;
next_context = first_context;
nr_free_contexts = last_context - first_context + 1;
}

167
arch/powerpc/mm/mmu_decl.h Normal file
View file

@ -0,0 +1,167 @@
/*
* Declarations of procedures and variables shared between files
* in arch/ppc/mm/.
*
* Derived from arch/ppc/mm/init.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/mm.h>
#include <asm/tlbflush.h>
#include <asm/mmu.h>
#ifdef CONFIG_PPC_MMU_NOHASH
/*
* On 40x and 8xx, we directly inline tlbia and tlbivax
*/
#if defined(CONFIG_40x) || defined(CONFIG_8xx)
static inline void _tlbil_all(void)
{
asm volatile ("sync; tlbia; isync" : : : "memory");
}
static inline void _tlbil_pid(unsigned int pid)
{
asm volatile ("sync; tlbia; isync" : : : "memory");
}
#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
#else /* CONFIG_40x || CONFIG_8xx */
extern void _tlbil_all(void);
extern void _tlbil_pid(unsigned int pid);
#ifdef CONFIG_PPC_BOOK3E
extern void _tlbil_pid_noind(unsigned int pid);
#else
#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
#endif
#endif /* !(CONFIG_40x || CONFIG_8xx) */
/*
* On 8xx, we directly inline tlbie, on others, it's extern
*/
#ifdef CONFIG_8xx
static inline void _tlbil_va(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind)
{
asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");
}
#elif defined(CONFIG_PPC_BOOK3E)
extern void _tlbil_va(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind);
#else
extern void __tlbil_va(unsigned long address, unsigned int pid);
static inline void _tlbil_va(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind)
{
__tlbil_va(address, pid);
}
#endif /* CONIFG_8xx */
#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x)
extern void _tlbivax_bcast(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind);
#else
static inline void _tlbivax_bcast(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind)
{
BUG();
}
#endif
#else /* CONFIG_PPC_MMU_NOHASH */
extern void hash_preload(struct mm_struct *mm, unsigned long ea,
unsigned long access, unsigned long trap);
extern void _tlbie(unsigned long address);
extern void _tlbia(void);
#endif /* CONFIG_PPC_MMU_NOHASH */
#ifdef CONFIG_PPC32
extern void mapin_ram(void);
extern int map_page(unsigned long va, phys_addr_t pa, int flags);
extern void setbat(int index, unsigned long virt, phys_addr_t phys,
unsigned int size, int flags);
extern int __map_without_bats;
extern int __allow_ioremap_reserved;
extern unsigned long ioremap_base;
extern unsigned int rtas_data, rtas_size;
struct hash_pte;
extern struct hash_pte *Hash, *Hash_end;
extern unsigned long Hash_size, Hash_mask;
#endif /* CONFIG_PPC32 */
#ifdef CONFIG_PPC64
extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags);
#endif /* CONFIG_PPC64 */
extern unsigned long ioremap_bot;
extern unsigned long __max_low_memory;
extern phys_addr_t __initial_memory_limit_addr;
extern phys_addr_t total_memory;
extern phys_addr_t total_lowmem;
extern phys_addr_t memstart_addr;
extern phys_addr_t lowmem_end_addr;
#ifdef CONFIG_WII
extern unsigned long wii_hole_start;
extern unsigned long wii_hole_size;
extern unsigned long wii_mmu_mapin_mem2(unsigned long top);
extern void wii_memory_fixups(void);
#endif
/* ...and now those things that may be slightly different between processor
* architectures. -- Dan
*/
#if defined(CONFIG_8xx)
#define MMU_init_hw() do { } while(0)
#define mmu_mapin_ram(top) (0UL)
#elif defined(CONFIG_4xx)
extern void MMU_init_hw(void);
extern unsigned long mmu_mapin_ram(unsigned long top);
#elif defined(CONFIG_PPC_FSL_BOOK3E)
extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx);
extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
phys_addr_t phys);
#ifdef CONFIG_PPC32
extern void MMU_init_hw(void);
extern unsigned long mmu_mapin_ram(unsigned long top);
extern void adjust_total_lowmem(void);
extern int switch_to_as1(void);
extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu);
#endif
extern void loadcam_entry(unsigned int index);
struct tlbcam {
u32 MAS0;
u32 MAS1;
unsigned long MAS2;
u32 MAS3;
u32 MAS7;
};
#elif defined(CONFIG_PPC32)
/* anything 32-bit except 4xx or 8xx */
extern void MMU_init_hw(void);
extern unsigned long mmu_mapin_ram(unsigned long top);
#endif

1849
arch/powerpc/mm/numa.c Normal file

File diff suppressed because it is too large Load diff

236
arch/powerpc/mm/pgtable.c Normal file
View file

@ -0,0 +1,236 @@
/*
* This file contains common routines for dealing with free of page tables
* Along with common page table handling code
*
* Derived from arch/powerpc/mm/tlb_64.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
static inline int is_exec_fault(void)
{
return current->thread.regs && TRAP(current->thread.regs) == 0x400;
}
/* We only try to do i/d cache coherency on stuff that looks like
* reasonably "normal" PTEs. We currently require a PTE to be present
* and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that
* on userspace PTEs
*/
static inline int pte_looks_normal(pte_t pte)
{
return (pte_val(pte) &
(_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
(_PAGE_PRESENT | _PAGE_USER);
}
static struct page *maybe_pte_to_page(pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
struct page *page;
if (unlikely(!pfn_valid(pfn)))
return NULL;
page = pfn_to_page(pfn);
if (PageReserved(page))
return NULL;
return page;
}
#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0
/* Server-style MMU handles coherency when hashing if HW exec permission
* is supposed per page (currently 64-bit only). If not, then, we always
* flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec
* support falls into the same category.
*/
static pte_t set_pte_filter(pte_t pte)
{
pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
cpu_has_feature(CPU_FTR_NOEXECUTE))) {
struct page *pg = maybe_pte_to_page(pte);
if (!pg)
return pte;
if (!test_bit(PG_arch_1, &pg->flags)) {
flush_dcache_icache_page(pg);
set_bit(PG_arch_1, &pg->flags);
}
}
return pte;
}
static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
int dirty)
{
return pte;
}
#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */
/* Embedded type MMU with HW exec support. This is a bit more complicated
* as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
* instead we "filter out" the exec permission for non clean pages.
*/
static pte_t set_pte_filter(pte_t pte)
{
struct page *pg;
/* No exec permission in the first place, move on */
if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte))
return pte;
/* If you set _PAGE_EXEC on weird pages you're on your own */
pg = maybe_pte_to_page(pte);
if (unlikely(!pg))
return pte;
/* If the page clean, we move on */
if (test_bit(PG_arch_1, &pg->flags))
return pte;
/* If it's an exec fault, we flush the cache and make it clean */
if (is_exec_fault()) {
flush_dcache_icache_page(pg);
set_bit(PG_arch_1, &pg->flags);
return pte;
}
/* Else, we filter out _PAGE_EXEC */
return __pte(pte_val(pte) & ~_PAGE_EXEC);
}
static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
int dirty)
{
struct page *pg;
/* So here, we only care about exec faults, as we use them
* to recover lost _PAGE_EXEC and perform I$/D$ coherency
* if necessary. Also if _PAGE_EXEC is already set, same deal,
* we just bail out
*/
if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault())
return pte;
#ifdef CONFIG_DEBUG_VM
/* So this is an exec fault, _PAGE_EXEC is not set. If it was
* an error we would have bailed out earlier in do_page_fault()
* but let's make sure of it
*/
if (WARN_ON(!(vma->vm_flags & VM_EXEC)))
return pte;
#endif /* CONFIG_DEBUG_VM */
/* If you set _PAGE_EXEC on weird pages you're on your own */
pg = maybe_pte_to_page(pte);
if (unlikely(!pg))
goto bail;
/* If the page is already clean, we move on */
if (test_bit(PG_arch_1, &pg->flags))
goto bail;
/* Clean the page and set PG_arch_1 */
flush_dcache_icache_page(pg);
set_bit(PG_arch_1, &pg->flags);
bail:
return __pte(pte_val(pte) | _PAGE_EXEC);
}
#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */
/*
* set_pte stores a linux PTE into the linux page table.
*/
void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte_t pte)
{
#ifdef CONFIG_DEBUG_VM
WARN_ON(pte_val(*ptep) & _PAGE_PRESENT);
#endif
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
* is called.
*/
pte = set_pte_filter(pte);
/* Perform the setting of the PTE */
__set_pte_at(mm, addr, ptep, pte, 0);
}
/*
* This is called when relaxing access to a PTE. It's also called in the page
* fault path when we don't hit any of the major fault cases, ie, a minor
* update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
* handled those two for us, we additionally deal with missing execute
* permission here on some processors
*/
int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep, pte_t entry, int dirty)
{
int changed;
entry = set_access_flags_filter(entry, vma, dirty);
changed = !pte_same(*(ptep), entry);
if (changed) {
if (!is_vm_hugetlb_page(vma))
assert_pte_locked(vma->vm_mm, address);
__ptep_set_access_flags(ptep, entry);
flush_tlb_page_nohash(vma, address);
}
return changed;
}
#ifdef CONFIG_DEBUG_VM
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
if (mm == &init_mm)
return;
pgd = mm->pgd + pgd_index(addr);
BUG_ON(pgd_none(*pgd));
pud = pud_offset(pgd, addr);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, addr);
/*
* khugepaged to collapse normal pages to hugepage, first set
* pmd to none to force page fault/gup to take mmap_sem. After
* pmd is set to none, we do a pte_clear which does this assertion
* so if we find pmd none, return.
*/
if (pmd_none(*pmd))
return;
BUG_ON(!pmd_present(*pmd));
assert_spin_locked(pte_lockptr(mm, pmd));
}
#endif /* CONFIG_DEBUG_VM */

View file

@ -0,0 +1,460 @@
/*
* This file contains the routines setting up the linux page tables.
* -- paulus
*
* Derived from arch/ppc/mm/init.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/memblock.h>
#include <linux/slab.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/io.h>
#include <asm/setup.h>
#include "mmu_decl.h"
unsigned long ioremap_base;
unsigned long ioremap_bot;
EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */
#ifdef CONFIG_6xx
#define HAVE_BATS 1
#endif
#if defined(CONFIG_FSL_BOOKE)
#define HAVE_TLBCAM 1
#endif
extern char etext[], _stext[];
#ifdef HAVE_BATS
extern phys_addr_t v_mapped_by_bats(unsigned long va);
extern unsigned long p_mapped_by_bats(phys_addr_t pa);
void setbat(int index, unsigned long virt, phys_addr_t phys,
unsigned int size, int flags);
#else /* !HAVE_BATS */
#define v_mapped_by_bats(x) (0UL)
#define p_mapped_by_bats(x) (0UL)
#endif /* HAVE_BATS */
#ifdef HAVE_TLBCAM
extern unsigned int tlbcam_index;
extern phys_addr_t v_mapped_by_tlbcam(unsigned long va);
extern unsigned long p_mapped_by_tlbcam(phys_addr_t pa);
#else /* !HAVE_TLBCAM */
#define v_mapped_by_tlbcam(x) (0UL)
#define p_mapped_by_tlbcam(x) (0UL)
#endif /* HAVE_TLBCAM */
#define PGDIR_ORDER (32 + PGD_T_LOG2 - PGDIR_SHIFT)
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *ret;
/* pgdir take page or two with 4K pages and a page fraction otherwise */
#ifndef CONFIG_PPC_4K_PAGES
ret = kzalloc(1 << PGDIR_ORDER, GFP_KERNEL);
#else
ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
PGDIR_ORDER - PAGE_SHIFT);
#endif
return ret;
}
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifndef CONFIG_PPC_4K_PAGES
kfree((void *)pgd);
#else
free_pages((unsigned long)pgd, PGDIR_ORDER - PAGE_SHIFT);
#endif
}
__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
pte_t *pte;
extern int mem_init_done;
extern void *early_get_page(void);
if (mem_init_done) {
pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
} else {
pte = (pte_t *)early_get_page();
if (pte)
clear_page(pte);
}
return pte;
}
pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
struct page *ptepage;
gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO;
ptepage = alloc_pages(flags, 0);
if (!ptepage)
return NULL;
if (!pgtable_page_ctor(ptepage)) {
__free_page(ptepage);
return NULL;
}
return ptepage;
}
void __iomem *
ioremap(phys_addr_t addr, unsigned long size)
{
return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED,
__builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap);
void __iomem *
ioremap_wc(phys_addr_t addr, unsigned long size)
{
return __ioremap_caller(addr, size, _PAGE_NO_CACHE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_wc);
void __iomem *
ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
{
/* writeable implies dirty for kernel addresses */
if (flags & _PAGE_RW)
flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
flags &= ~(_PAGE_USER | _PAGE_EXEC);
#ifdef _PAGE_BAP_SR
/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
* which means that we just cleared supervisor access... oops ;-) This
* restores it
*/
flags |= _PAGE_BAP_SR;
#endif
return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_prot);
void __iomem *
__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
{
return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}
void __iomem *
__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
void *caller)
{
unsigned long v, i;
phys_addr_t p;
int err;
/* Make sure we have the base flags */
if ((flags & _PAGE_PRESENT) == 0)
flags |= PAGE_KERNEL;
/* Non-cacheable page cannot be coherent */
if (flags & _PAGE_NO_CACHE)
flags &= ~_PAGE_COHERENT;
/*
* Choose an address to map it to.
* Once the vmalloc system is running, we use it.
* Before then, we use space going down from ioremap_base
* (ioremap_bot records where we're up to).
*/
p = addr & PAGE_MASK;
size = PAGE_ALIGN(addr + size) - p;
/*
* If the address lies within the first 16 MB, assume it's in ISA
* memory space
*/
if (p < 16*1024*1024)
p += _ISA_MEM_BASE;
#ifndef CONFIG_CRASH_DUMP
/*
* Don't allow anybody to remap normal RAM that we're using.
* mem_init() sets high_memory so only do the check after that.
*/
if (mem_init_done && (p < virt_to_phys(high_memory)) &&
!(__allow_ioremap_reserved && memblock_is_region_reserved(p, size))) {
printk("__ioremap(): phys addr 0x%llx is RAM lr %pf\n",
(unsigned long long)p, __builtin_return_address(0));
return NULL;
}
#endif
if (size == 0)
return NULL;
/*
* Is it already mapped? Perhaps overlapped by a previous
* BAT mapping. If the whole area is mapped then we're done,
* otherwise remap it since we want to keep the virt addrs for
* each request contiguous.
*
* We make the assumption here that if the bottom and top
* of the range we want are mapped then it's mapped to the
* same virt address (and this is contiguous).
* -- Cort
*/
if ((v = p_mapped_by_bats(p)) /*&& p_mapped_by_bats(p+size-1)*/ )
goto out;
if ((v = p_mapped_by_tlbcam(p)))
goto out;
if (mem_init_done) {
struct vm_struct *area;
area = get_vm_area_caller(size, VM_IOREMAP, caller);
if (area == 0)
return NULL;
area->phys_addr = p;
v = (unsigned long) area->addr;
} else {
v = (ioremap_bot -= size);
}
/*
* Should check if it is a candidate for a BAT mapping
*/
err = 0;
for (i = 0; i < size && err == 0; i += PAGE_SIZE)
err = map_page(v+i, p+i, flags);
if (err) {
if (mem_init_done)
vunmap((void *)v);
return NULL;
}
out:
return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK));
}
EXPORT_SYMBOL(__ioremap);
void iounmap(volatile void __iomem *addr)
{
/*
* If mapped by BATs then there is nothing to do.
* Calling vfree() generates a benign warning.
*/
if (v_mapped_by_bats((unsigned long)addr)) return;
if (addr > high_memory && (unsigned long) addr < ioremap_bot)
vunmap((void *) (PAGE_MASK & (unsigned long)addr));
}
EXPORT_SYMBOL(iounmap);
int map_page(unsigned long va, phys_addr_t pa, int flags)
{
pmd_t *pd;
pte_t *pg;
int err = -ENOMEM;
/* Use upper 10 bits of VA to index the first level map */
pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va);
/* Use middle 10 bits of VA to index the second-level map */
pg = pte_alloc_kernel(pd, va);
if (pg != 0) {
err = 0;
/* The PTE should never be already set nor present in the
* hash table
*/
BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) &&
flags);
set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT,
__pgprot(flags)));
}
smp_wmb();
return err;
}
/*
* Map in a chunk of physical memory starting at start.
*/
void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
{
unsigned long v, s, f;
phys_addr_t p;
int ktext;
s = offset;
v = PAGE_OFFSET + s;
p = memstart_addr + s;
for (; s < top; s += PAGE_SIZE) {
ktext = ((char *) v >= _stext && (char *) v < etext);
f = ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL;
map_page(v, p, f);
#ifdef CONFIG_PPC_STD_MMU_32
if (ktext)
hash_preload(&init_mm, v, 0, 0x300);
#endif
v += PAGE_SIZE;
p += PAGE_SIZE;
}
}
void __init mapin_ram(void)
{
unsigned long s, top;
#ifndef CONFIG_WII
top = total_lowmem;
s = mmu_mapin_ram(top);
__mapin_ram_chunk(s, top);
#else
if (!wii_hole_size) {
s = mmu_mapin_ram(total_lowmem);
__mapin_ram_chunk(s, total_lowmem);
} else {
top = wii_hole_start;
s = mmu_mapin_ram(top);
__mapin_ram_chunk(s, top);
top = memblock_end_of_DRAM();
s = wii_mmu_mapin_mem2(top);
__mapin_ram_chunk(s, top);
}
#endif
}
/* Scan the real Linux page tables and return a PTE pointer for
* a virtual address in a context.
* Returns true (1) if PTE was found, zero otherwise. The pointer to
* the PTE pointer is unmodified if PTE is not found.
*/
int
get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
int retval = 0;
pgd = pgd_offset(mm, addr & PAGE_MASK);
if (pgd) {
pud = pud_offset(pgd, addr & PAGE_MASK);
if (pud && pud_present(*pud)) {
pmd = pmd_offset(pud, addr & PAGE_MASK);
if (pmd_present(*pmd)) {
pte = pte_offset_map(pmd, addr & PAGE_MASK);
if (pte) {
retval = 1;
*ptep = pte;
if (pmdp)
*pmdp = pmd;
/* XXX caller needs to do pte_unmap, yuck */
}
}
}
}
return(retval);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
static int __change_page_attr(struct page *page, pgprot_t prot)
{
pte_t *kpte;
pmd_t *kpmd;
unsigned long address;
BUG_ON(PageHighMem(page));
address = (unsigned long)page_address(page);
if (v_mapped_by_bats(address) || v_mapped_by_tlbcam(address))
return 0;
if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
return -EINVAL;
__set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
wmb();
flush_tlb_page(NULL, address);
pte_unmap(kpte);
return 0;
}
/*
* Change the page attributes of an page in the linear mapping.
*
* THIS CONFLICTS WITH BAT MAPPINGS, DEBUG USE ONLY
*/
static int change_page_attr(struct page *page, int numpages, pgprot_t prot)
{
int i, err = 0;
unsigned long flags;
local_irq_save(flags);
for (i = 0; i < numpages; i++, page++) {
err = __change_page_attr(page, prot);
if (err)
break;
}
local_irq_restore(flags);
return err;
}
void kernel_map_pages(struct page *page, int numpages, int enable)
{
if (PageHighMem(page))
return;
change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
}
#endif /* CONFIG_DEBUG_PAGEALLOC */
static int fixmaps;
void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
{
unsigned long address = __fix_to_virt(idx);
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
}
map_page(address, phys, pgprot_val(flags));
fixmaps++;
}
void __this_fixmap_does_not_exist(void)
{
WARN_ON(1);
}

View file

@ -0,0 +1,904 @@
/*
* This file contains ioremap and related functions for 64-bit machines.
*
* Derived from arch/ppc64/mm/init.c
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/slab.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/firmware.h>
#include "mmu_decl.h"
#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
/* Some sanity checking */
#if TASK_SIZE_USER64 > PGTABLE_RANGE
#error TASK_SIZE_USER64 exceeds pagetable range
#endif
#ifdef CONFIG_PPC_STD_MMU_64
#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif
unsigned long ioremap_bot = IOREMAP_BASE;
#ifdef CONFIG_PPC_MMU_NOHASH
static __ref void *early_alloc_pgtable(unsigned long size)
{
void *pt;
if (init_bootmem_done)
pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
else
pt = __va(memblock_alloc_base(size, size,
__pa(MAX_DMA_ADDRESS)));
memset(pt, 0, size);
return pt;
}
#endif /* CONFIG_PPC_MMU_NOHASH */
/*
* map_kernel_page currently only called by __ioremap
* map_kernel_page adds an entry to the ioremap page table
* and adds an entry to the HPT, possibly bolting it
*/
int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
{
pgd_t *pgdp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
if (slab_is_available()) {
pgdp = pgd_offset_k(ea);
pudp = pud_alloc(&init_mm, pgdp, ea);
if (!pudp)
return -ENOMEM;
pmdp = pmd_alloc(&init_mm, pudp, ea);
if (!pmdp)
return -ENOMEM;
ptep = pte_alloc_kernel(pmdp, ea);
if (!ptep)
return -ENOMEM;
set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
__pgprot(flags)));
} else {
#ifdef CONFIG_PPC_MMU_NOHASH
/* Warning ! This will blow up if bootmem is not initialized
* which our ppc64 code is keen to do that, we'll need to
* fix it and/or be more careful
*/
pgdp = pgd_offset_k(ea);
#ifdef PUD_TABLE_SIZE
if (pgd_none(*pgdp)) {
pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
BUG_ON(pudp == NULL);
pgd_populate(&init_mm, pgdp, pudp);
}
#endif /* PUD_TABLE_SIZE */
pudp = pud_offset(pgdp, ea);
if (pud_none(*pudp)) {
pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
BUG_ON(pmdp == NULL);
pud_populate(&init_mm, pudp, pmdp);
}
pmdp = pmd_offset(pudp, ea);
if (!pmd_present(*pmdp)) {
ptep = early_alloc_pgtable(PAGE_SIZE);
BUG_ON(ptep == NULL);
pmd_populate_kernel(&init_mm, pmdp, ptep);
}
ptep = pte_offset_kernel(pmdp, ea);
set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
__pgprot(flags)));
#else /* CONFIG_PPC_MMU_NOHASH */
/*
* If the mm subsystem is not fully up, we cannot create a
* linux page table entry for this mapping. Simply bolt an
* entry in the hardware page table.
*
*/
if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
mmu_io_psize, mmu_kernel_ssize)) {
printk(KERN_ERR "Failed to do bolted mapping IO "
"memory at %016lx !\n", pa);
return -ENOMEM;
}
#endif /* !CONFIG_PPC_MMU_NOHASH */
}
#ifdef CONFIG_PPC_BOOK3E_64
/*
* With hardware tablewalk, a sync is needed to ensure that
* subsequent accesses see the PTE we just wrote. Unlike userspace
* mappings, we can't tolerate spurious faults, so make sure
* the new PTE will be seen the first time.
*/
mb();
#else
smp_wmb();
#endif
return 0;
}
/**
* __ioremap_at - Low level function to establish the page tables
* for an IO mapping
*/
void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
unsigned long flags)
{
unsigned long i;
/* Make sure we have the base flags */
if ((flags & _PAGE_PRESENT) == 0)
flags |= pgprot_val(PAGE_KERNEL);
/* Non-cacheable page cannot be coherent */
if (flags & _PAGE_NO_CACHE)
flags &= ~_PAGE_COHERENT;
/* We don't support the 4K PFN hack with ioremap */
if (flags & _PAGE_4K_PFN)
return NULL;
WARN_ON(pa & ~PAGE_MASK);
WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
WARN_ON(size & ~PAGE_MASK);
for (i = 0; i < size; i += PAGE_SIZE)
if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
return NULL;
return (void __iomem *)ea;
}
/**
* __iounmap_from - Low level function to tear down the page tables
* for an IO mapping. This is used for mappings that
* are manipulated manually, like partial unmapping of
* PCI IOs or ISA space.
*/
void __iounmap_at(void *ea, unsigned long size)
{
WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
WARN_ON(size & ~PAGE_MASK);
unmap_kernel_range((unsigned long)ea, size);
}
void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
unsigned long flags, void *caller)
{
phys_addr_t paligned;
void __iomem *ret;
/*
* Choose an address to map it to.
* Once the imalloc system is running, we use it.
* Before that, we map using addresses going
* up from ioremap_bot. imalloc will use
* the addresses from ioremap_bot through
* IMALLOC_END
*
*/
paligned = addr & PAGE_MASK;
size = PAGE_ALIGN(addr + size) - paligned;
if ((size == 0) || (paligned == 0))
return NULL;
if (mem_init_done) {
struct vm_struct *area;
area = __get_vm_area_caller(size, VM_IOREMAP,
ioremap_bot, IOREMAP_END,
caller);
if (area == NULL)
return NULL;
area->phys_addr = paligned;
ret = __ioremap_at(paligned, area->addr, size, flags);
if (!ret)
vunmap(area->addr);
} else {
ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
if (ret)
ioremap_bot += size;
}
if (ret)
ret += addr & ~PAGE_MASK;
return ret;
}
void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
unsigned long flags)
{
return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}
void __iomem * ioremap(phys_addr_t addr, unsigned long size)
{
unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
void *caller = __builtin_return_address(0);
if (ppc_md.ioremap)
return ppc_md.ioremap(addr, size, flags, caller);
return __ioremap_caller(addr, size, flags, caller);
}
void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
{
unsigned long flags = _PAGE_NO_CACHE;
void *caller = __builtin_return_address(0);
if (ppc_md.ioremap)
return ppc_md.ioremap(addr, size, flags, caller);
return __ioremap_caller(addr, size, flags, caller);
}
void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
unsigned long flags)
{
void *caller = __builtin_return_address(0);
/* writeable implies dirty for kernel addresses */
if (flags & _PAGE_RW)
flags |= _PAGE_DIRTY;
/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
flags &= ~(_PAGE_USER | _PAGE_EXEC);
#ifdef _PAGE_BAP_SR
/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
* which means that we just cleared supervisor access... oops ;-) This
* restores it
*/
flags |= _PAGE_BAP_SR;
#endif
if (ppc_md.ioremap)
return ppc_md.ioremap(addr, size, flags, caller);
return __ioremap_caller(addr, size, flags, caller);
}
/*
* Unmap an IO region and remove it from imalloc'd list.
* Access to IO memory should be serialized by driver.
*/
void __iounmap(volatile void __iomem *token)
{
void *addr;
if (!mem_init_done)
return;
addr = (void *) ((unsigned long __force)
PCI_FIX_ADDR(token) & PAGE_MASK);
if ((unsigned long)addr < ioremap_bot) {
printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
" at 0x%p\n", addr);
return;
}
vunmap(addr);
}
void iounmap(volatile void __iomem *token)
{
if (ppc_md.iounmap)
ppc_md.iounmap(token);
else
__iounmap(token);
}
EXPORT_SYMBOL(ioremap);
EXPORT_SYMBOL(ioremap_wc);
EXPORT_SYMBOL(ioremap_prot);
EXPORT_SYMBOL(__ioremap);
EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);
/*
* For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
* For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
*/
struct page *pmd_page(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(pmd))
return pfn_to_page(pmd_pfn(pmd));
#endif
return virt_to_page(pmd_page_vaddr(pmd));
}
#ifdef CONFIG_PPC_64K_PAGES
static pte_t *get_from_cache(struct mm_struct *mm)
{
void *pte_frag, *ret;
spin_lock(&mm->page_table_lock);
ret = mm->context.pte_frag;
if (ret) {
pte_frag = ret + PTE_FRAG_SIZE;
/*
* If we have taken up all the fragments mark PTE page NULL
*/
if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
pte_frag = NULL;
mm->context.pte_frag = pte_frag;
}
spin_unlock(&mm->page_table_lock);
return (pte_t *)ret;
}
static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
{
void *ret = NULL;
struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
__GFP_REPEAT | __GFP_ZERO);
if (!page)
return NULL;
if (!kernel && !pgtable_page_ctor(page)) {
__free_page(page);
return NULL;
}
ret = page_address(page);
spin_lock(&mm->page_table_lock);
/*
* If we find pgtable_page set, we return
* the allocated page with single fragement
* count.
*/
if (likely(!mm->context.pte_frag)) {
atomic_set(&page->_count, PTE_FRAG_NR);
mm->context.pte_frag = ret + PTE_FRAG_SIZE;
}
spin_unlock(&mm->page_table_lock);
return (pte_t *)ret;
}
pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
{
pte_t *pte;
pte = get_from_cache(mm);
if (pte)
return pte;
return __alloc_for_cache(mm, kernel);
}
void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
{
struct page *page = virt_to_page(table);
if (put_page_testzero(page)) {
if (!kernel)
pgtable_page_dtor(page);
free_hot_cold_page(page, 0);
}
}
#ifdef CONFIG_SMP
static void page_table_free_rcu(void *table)
{
struct page *page = virt_to_page(table);
if (put_page_testzero(page)) {
pgtable_page_dtor(page);
free_hot_cold_page(page, 0);
}
}
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
unsigned long pgf = (unsigned long)table;
BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
pgf |= shift;
tlb_remove_table(tlb, (void *)pgf);
}
void __tlb_remove_table(void *_table)
{
void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
if (!shift)
/* PTE page needs special handling */
page_table_free_rcu(table);
else {
BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
kmem_cache_free(PGT_CACHE(shift), table);
}
}
#else
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
if (!shift) {
/* PTE page needs special handling */
struct page *page = virt_to_page(table);
if (put_page_testzero(page)) {
pgtable_page_dtor(page);
free_hot_cold_page(page, 0);
}
} else {
BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
kmem_cache_free(PGT_CACHE(shift), table);
}
}
#endif
#endif /* CONFIG_PPC_64K_PAGES */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* This is called when relaxing access to a hugepage. It's also called in the page
* fault path when we don't hit any of the major fault cases, ie, a minor
* update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
* handled those two for us, we additionally deal with missing execute
* permission here on some processors
*/
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp, pmd_t entry, int dirty)
{
int changed;
#ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_trans_huge(*pmdp));
assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif
changed = !pmd_same(*(pmdp), entry);
if (changed) {
__ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
/*
* Since we are not supporting SW TLB systems, we don't
* have any thing similar to flush_tlb_page_nohash()
*/
}
return changed;
}
unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, unsigned long clr,
unsigned long set)
{
unsigned long old, tmp;
#ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_trans_huge(*pmdp));
assert_spin_locked(&mm->page_table_lock);
#endif
#ifdef PTE_ATOMIC_UPDATES
__asm__ __volatile__(
"1: ldarx %0,0,%3\n\
andi. %1,%0,%6\n\
bne- 1b \n\
andc %1,%0,%4 \n\
or %1,%1,%7\n\
stdcx. %1,0,%3 \n\
bne- 1b"
: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
: "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
: "cc" );
#else
old = pmd_val(*pmdp);
*pmdp = __pmd((old & ~clr) | set);
#endif
trace_hugepage_update(addr, old, clr, set);
if (old & _PAGE_HASHPTE)
hpte_do_hugepage_flush(mm, addr, pmdp, old);
return old;
}
pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
if (pmd_trans_huge(*pmdp)) {
pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
} else {
/*
* khugepaged calls this for normal pmd
*/
pmd = *pmdp;
pmd_clear(pmdp);
/*
* Wait for all pending hash_page to finish. This is needed
* in case of subpage collapse. When we collapse normal pages
* to hugepage, we first clear the pmd, then invalidate all
* the PTE entries. The assumption here is that any low level
* page fault will see a none pmd and take the slow path that
* will wait on mmap_sem. But we could very well be in a
* hash_page with local ptep pointer value. Such a hash page
* can result in adding new HPTE entries for normal subpages.
* That means we could be modifying the page content as we
* copy them to a huge page. So wait for parallel hash_page
* to finish before invalidating HPTE entries. We can do this
* by sending an IPI to all the cpus and executing a dummy
* function there.
*/
kick_all_cpus_sync();
/*
* Now invalidate the hpte entries in the range
* covered by pmd. This make sure we take a
* fault and will find the pmd as none, which will
* result in a major fault which takes mmap_sem and
* hence wait for collapse to complete. Without this
* the __collapse_huge_page_copy can result in copying
* the old content.
*/
flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
}
return pmd;
}
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}
/*
* We currently remove entries from the hashtable regardless of whether
* the entry was young or dirty. The generic routines only flush if the
* entry was young or dirty which is not good enough.
*
* We should be more intelligent about this but for the moment we override
* these functions and force a tlb flush unconditionally
*/
int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}
/*
* We mark the pmd splitting and invalidate all the hpte
* entries for this hugepage.
*/
void pmdp_splitting_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
unsigned long old, tmp;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
#ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_trans_huge(*pmdp));
assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif
#ifdef PTE_ATOMIC_UPDATES
__asm__ __volatile__(
"1: ldarx %0,0,%3\n\
andi. %1,%0,%6\n\
bne- 1b \n\
ori %1,%0,%4 \n\
stdcx. %1,0,%3 \n\
bne- 1b"
: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
: "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
: "cc" );
#else
old = pmd_val(*pmdp);
*pmdp = __pmd(old | _PAGE_SPLITTING);
#endif
/*
* If we didn't had the splitting flag set, go and flush the
* HPTE entries.
*/
trace_hugepage_splitting(address, old);
if (!(old & _PAGE_SPLITTING)) {
/* We need to flush the hpte */
if (old & _PAGE_HASHPTE)
hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
}
/*
* This ensures that generic code that rely on IRQ disabling
* to prevent a parallel THP split work as expected.
*/
kick_all_cpus_sync();
}
/*
* We want to put the pgtable in pmd and use pgtable for tracking
* the base page size hptes
*/
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
pgtable_t *pgtable_slot;
assert_spin_locked(&mm->page_table_lock);
/*
* we store the pgtable in the second half of PMD
*/
pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
*pgtable_slot = pgtable;
/*
* expose the deposited pgtable to other cpus.
* before we set the hugepage PTE at pmd level
* hash fault code looks at the deposted pgtable
* to store hash index values.
*/
smp_wmb();
}
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
pgtable_t pgtable;
pgtable_t *pgtable_slot;
assert_spin_locked(&mm->page_table_lock);
pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
pgtable = *pgtable_slot;
/*
* Once we withdraw, mark the entry NULL.
*/
*pgtable_slot = NULL;
/*
* We store HPTE information in the deposited PTE fragment.
* zero out the content on withdraw.
*/
memset(pgtable, 0, PTE_FRAG_SIZE);
return pgtable;
}
/*
* set a new huge pmd. We should not be called for updating
* an existing pmd entry. That should go via pmd_hugepage_update.
*/
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);
assert_spin_locked(&mm->page_table_lock);
WARN_ON(!pmd_trans_huge(pmd));
#endif
trace_hugepage_set_pmd(addr, pmd);
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
}
/*
* A linux hugepage PMD was changed and the corresponding hash table entries
* neesd to be flushed.
*/
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, unsigned long old_pmd)
{
int ssize, i;
unsigned long s_addr;
int max_hpte_count;
unsigned int psize, valid;
unsigned char *hpte_slot_array;
unsigned long hidx, vpn, vsid, hash, shift, slot;
/*
* Flush all the hptes mapping this hugepage
*/
s_addr = addr & HPAGE_PMD_MASK;
hpte_slot_array = get_hpte_slot_array(pmdp);
/*
* IF we try to do a HUGE PTE update after a withdraw is done.
* we will find the below NULL. This happens when we do
* split_huge_page_pmd
*/
if (!hpte_slot_array)
return;
/* get the base page size,vsid and segment size */
#ifdef CONFIG_DEBUG_VM
psize = get_slice_psize(mm, s_addr);
BUG_ON(psize == MMU_PAGE_16M);
#endif
if (old_pmd & _PAGE_COMBO)
psize = MMU_PAGE_4K;
else
psize = MMU_PAGE_64K;
if (!is_kernel_addr(s_addr)) {
ssize = user_segment_size(s_addr);
vsid = get_vsid(mm->context.id, s_addr, ssize);
WARN_ON(vsid == 0);
} else {
vsid = get_kernel_vsid(s_addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
}
if (ppc_md.hugepage_invalidate)
return ppc_md.hugepage_invalidate(vsid, s_addr,
hpte_slot_array,
psize, ssize);
/*
* No bluk hpte removal support, invalidate each entry
*/
shift = mmu_psize_defs[psize].shift;
max_hpte_count = HPAGE_PMD_SIZE >> shift;
for (i = 0; i < max_hpte_count; i++) {
/*
* 8 bits per each hpte entries
* 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
*/
valid = hpte_valid(hpte_slot_array, i);
if (!valid)
continue;
hidx = hpte_hash_index(hpte_slot_array, i);
/* get the vpn */
addr = s_addr + (i * (1ul << shift));
vpn = hpt_vpn(addr, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
ppc_md.hpte_invalidate(slot, vpn, psize,
MMU_PAGE_16M, ssize, 0);
}
}
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
pmd_val(pmd) |= pgprot_val(pgprot);
return pmd;
}
pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
pmd_t pmd;
/*
* For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
* set. We use this to check THP page at pmd level.
* leaf pte for huge page, bottom two bits != 00
*/
pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
pmd_val(pmd) |= _PAGE_THP_HUGE;
pmd = pmd_set_protbits(pmd, pgprot);
return pmd;
}
pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
return pfn_pmd(page_to_pfn(page), pgprot);
}
pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
pmd_val(pmd) &= _HPAGE_CHG_MASK;
pmd = pmd_set_protbits(pmd, newprot);
return pmd;
}
/*
* This is called at the end of handling a user page fault, when the
* fault has been handled by updating a HUGE PMD entry in the linux page tables.
* We use it to preload an HPTE into the hash table corresponding to
* the updated linux HUGE PMD entry.
*/
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd)
{
return;
}
pmd_t pmdp_get_and_clear(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
pmd_t old_pmd;
pgtable_t pgtable;
unsigned long old;
pgtable_t *pgtable_slot;
old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
old_pmd = __pmd(old);
/*
* We have pmd == none and we are holding page_table_lock.
* So we can safely go and clear the pgtable hash
* index info.
*/
pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
pgtable = *pgtable_slot;
/*
* Let's zero out old valid and hash index details
* hash fault look at them.
*/
memset(pgtable, 0, PTE_FRAG_SIZE);
return old_pmd;
}
int has_transparent_hugepage(void)
{
if (!mmu_has_feature(MMU_FTR_16M_PAGE))
return 0;
/*
* We support THP only if PMD_SIZE is 16MB.
*/
if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
return 0;
/*
* We need to make sure that we support 16MB hugepage in a segement
* with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
* of 64K.
*/
/*
* If we have 64K HPTE, we will be using that by default
*/
if (mmu_psize_defs[MMU_PAGE_64K].shift &&
(mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
return 0;
/*
* Ok we only have 4K HPTE
*/
if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
return 0;
return 1;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

View file

@ -0,0 +1,288 @@
/*
* This file contains the routines for handling the MMU on those
* PowerPC implementations where the MMU substantially follows the
* architecture specification. This includes the 6xx, 7xx, 7xxx,
* and 8260 implementations but excludes the 8xx and 4xx.
* -- paulus
*
* Derived from arch/ppc/mm/init.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/memblock.h>
#include <asm/prom.h>
#include <asm/mmu.h>
#include <asm/machdep.h>
#include "mmu_decl.h"
struct hash_pte *Hash, *Hash_end;
unsigned long Hash_size, Hash_mask;
unsigned long _SDR1;
struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */
struct batrange { /* stores address ranges mapped by BATs */
unsigned long start;
unsigned long limit;
phys_addr_t phys;
} bat_addrs[8];
/*
* Return PA for this VA if it is mapped by a BAT, or 0
*/
phys_addr_t v_mapped_by_bats(unsigned long va)
{
int b;
for (b = 0; b < 4; ++b)
if (va >= bat_addrs[b].start && va < bat_addrs[b].limit)
return bat_addrs[b].phys + (va - bat_addrs[b].start);
return 0;
}
/*
* Return VA for a given PA or 0 if not mapped
*/
unsigned long p_mapped_by_bats(phys_addr_t pa)
{
int b;
for (b = 0; b < 4; ++b)
if (pa >= bat_addrs[b].phys
&& pa < (bat_addrs[b].limit-bat_addrs[b].start)
+bat_addrs[b].phys)
return bat_addrs[b].start+(pa-bat_addrs[b].phys);
return 0;
}
unsigned long __init mmu_mapin_ram(unsigned long top)
{
unsigned long tot, bl, done;
unsigned long max_size = (256<<20);
if (__map_without_bats) {
printk(KERN_DEBUG "RAM mapped without BATs\n");
return 0;
}
/* Set up BAT2 and if necessary BAT3 to cover RAM. */
/* Make sure we don't map a block larger than the
smallest alignment of the physical address. */
tot = top;
for (bl = 128<<10; bl < max_size; bl <<= 1) {
if (bl * 2 > tot)
break;
}
setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X);
done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1;
if ((done < tot) && !bat_addrs[3].limit) {
/* use BAT3 to cover a bit more */
tot -= done;
for (bl = 128<<10; bl < max_size; bl <<= 1)
if (bl * 2 > tot)
break;
setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X);
done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1;
}
return done;
}
/*
* Set up one of the I/D BAT (block address translation) register pairs.
* The parameters are not checked; in particular size must be a power
* of 2 between 128k and 256M.
*/
void __init setbat(int index, unsigned long virt, phys_addr_t phys,
unsigned int size, int flags)
{
unsigned int bl;
int wimgxpp;
struct ppc_bat *bat = BATS[index];
if ((flags & _PAGE_NO_CACHE) ||
(cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0))
flags &= ~_PAGE_COHERENT;
bl = (size >> 17) - 1;
if (PVR_VER(mfspr(SPRN_PVR)) != 1) {
/* 603, 604, etc. */
/* Do DBAT first */
wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
| _PAGE_COHERENT | _PAGE_GUARDED);
wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX;
bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
if (flags & _PAGE_USER)
bat[1].batu |= 1; /* Vp = 1 */
if (flags & _PAGE_GUARDED) {
/* G bit must be zero in IBATs */
bat[0].batu = bat[0].batl = 0;
} else {
/* make IBAT same as DBAT */
bat[0] = bat[1];
}
} else {
/* 601 cpu */
if (bl > BL_8M)
bl = BL_8M;
wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
| _PAGE_COHERENT);
wimgxpp |= (flags & _PAGE_RW)?
((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX;
bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */
bat->batl = phys | bl | 0x40; /* V=1 */
}
bat_addrs[index].start = virt;
bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1;
bat_addrs[index].phys = phys;
}
/*
* Preload a translation in the hash table
*/
void hash_preload(struct mm_struct *mm, unsigned long ea,
unsigned long access, unsigned long trap)
{
pmd_t *pmd;
if (Hash == 0)
return;
pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
if (!pmd_none(*pmd))
add_hash_page(mm->context.id, ea, pmd_val(*pmd));
}
/*
* Initialize the hash table and patch the instructions in hashtable.S.
*/
void __init MMU_init_hw(void)
{
unsigned int hmask, mb, mb2;
unsigned int n_hpteg, lg_n_hpteg;
extern unsigned int hash_page_patch_A[];
extern unsigned int hash_page_patch_B[], hash_page_patch_C[];
extern unsigned int hash_page[];
extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[];
if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
/*
* Put a blr (procedure return) instruction at the
* start of hash_page, since we can still get DSI
* exceptions on a 603.
*/
hash_page[0] = 0x4e800020;
flush_icache_range((unsigned long) &hash_page[0],
(unsigned long) &hash_page[1]);
return;
}
if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105);
#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */
#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10)
#define MIN_N_HPTEG 1024 /* min 64kB hash table */
/*
* Allow 1 HPTE (1/8 HPTEG) for each page of memory.
* This is less than the recommended amount, but then
* Linux ain't AIX.
*/
n_hpteg = total_memory / (PAGE_SIZE * 8);
if (n_hpteg < MIN_N_HPTEG)
n_hpteg = MIN_N_HPTEG;
lg_n_hpteg = __ilog2(n_hpteg);
if (n_hpteg & (n_hpteg - 1)) {
++lg_n_hpteg; /* round up if not power of 2 */
n_hpteg = 1 << lg_n_hpteg;
}
Hash_size = n_hpteg << LG_HPTEG_SIZE;
/*
* Find some memory for the hash table.
*/
if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
Hash = __va(memblock_alloc(Hash_size, Hash_size));
cacheable_memzero(Hash, Hash_size);
_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
(unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
/*
* Patch up the instructions in hashtable.S:create_hpte
*/
if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345);
Hash_mask = n_hpteg - 1;
hmask = Hash_mask >> (16 - LG_HPTEG_SIZE);
mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg;
if (lg_n_hpteg > 16)
mb2 = 16 - LG_HPTEG_SIZE;
hash_page_patch_A[0] = (hash_page_patch_A[0] & ~0xffff)
| ((unsigned int)(Hash) >> 16);
hash_page_patch_A[1] = (hash_page_patch_A[1] & ~0x7c0) | (mb << 6);
hash_page_patch_A[2] = (hash_page_patch_A[2] & ~0x7c0) | (mb2 << 6);
hash_page_patch_B[0] = (hash_page_patch_B[0] & ~0xffff) | hmask;
hash_page_patch_C[0] = (hash_page_patch_C[0] & ~0xffff) | hmask;
/*
* Ensure that the locations we've patched have been written
* out from the data cache and invalidated in the instruction
* cache, on those machines with split caches.
*/
flush_icache_range((unsigned long) &hash_page_patch_A[0],
(unsigned long) &hash_page_patch_C[1]);
/*
* Patch up the instructions in hashtable.S:flush_hash_page
*/
flush_hash_patch_A[0] = (flush_hash_patch_A[0] & ~0xffff)
| ((unsigned int)(Hash) >> 16);
flush_hash_patch_A[1] = (flush_hash_patch_A[1] & ~0x7c0) | (mb << 6);
flush_hash_patch_A[2] = (flush_hash_patch_A[2] & ~0x7c0) | (mb2 << 6);
flush_hash_patch_B[0] = (flush_hash_patch_B[0] & ~0xffff) | hmask;
flush_icache_range((unsigned long) &flush_hash_patch_A[0],
(unsigned long) &flush_hash_patch_B[1]);
if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205);
}
void setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
/* We don't currently support the first MEMBLOCK not mapping 0
* physical on those processors
*/
BUG_ON(first_memblock_base != 0);
/* 601 can only access 16MB at the moment */
if (PVR_VER(mfspr(SPRN_PVR)) == 1)
memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000));
else /* Anything else has 256M mapped */
memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000));
}

332
arch/powerpc/mm/slb.c Normal file
View file

@ -0,0 +1,332 @@
/*
* PowerPC64 SLB support.
*
* Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
* Based on earlier code written by:
* Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
* Copyright (c) 2001 Dave Engebretsen
* Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/paca.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <asm/smp.h>
#include <linux/compiler.h>
#include <asm/udbg.h>
#include <asm/code-patching.h>
extern void slb_allocate_realmode(unsigned long ea);
extern void slb_allocate_user(unsigned long ea);
static void slb_allocate(unsigned long ea)
{
/* Currently, we do real mode for all SLBs including user, but
* that will change if we bring back dynamic VSIDs
*/
slb_allocate_realmode(ea);
}
#define slb_esid_mask(ssize) \
(((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
unsigned long slot)
{
return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot;
}
static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
unsigned long flags)
{
return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags |
((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
}
static inline void slb_shadow_update(unsigned long ea, int ssize,
unsigned long flags,
unsigned long entry)
{
/*
* Clear the ESID first so the entry is not valid while we are
* updating it. No write barriers are needed here, provided
* we only update the current CPU's SLB shadow buffer.
*/
get_slb_shadow()->save_area[entry].esid = 0;
get_slb_shadow()->save_area[entry].vsid =
cpu_to_be64(mk_vsid_data(ea, ssize, flags));
get_slb_shadow()->save_area[entry].esid =
cpu_to_be64(mk_esid_data(ea, ssize, entry));
}
static inline void slb_shadow_clear(unsigned long entry)
{
get_slb_shadow()->save_area[entry].esid = 0;
}
static inline void create_shadowed_slbe(unsigned long ea, int ssize,
unsigned long flags,
unsigned long entry)
{
/*
* Updating the shadow buffer before writing the SLB ensures
* we don't get a stale entry here if we get preempted by PHYP
* between these two statements.
*/
slb_shadow_update(ea, ssize, flags, entry);
asm volatile("slbmte %0,%1" :
: "r" (mk_vsid_data(ea, ssize, flags)),
"r" (mk_esid_data(ea, ssize, entry))
: "memory" );
}
static void __slb_flush_and_rebolt(void)
{
/* If you change this make sure you change SLB_NUM_BOLTED
* and PR KVM appropriately too. */
unsigned long linear_llp, vmalloc_llp, lflags, vflags;
unsigned long ksp_esid_data, ksp_vsid_data;
linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
lflags = SLB_VSID_KERNEL | linear_llp;
vflags = SLB_VSID_KERNEL | vmalloc_llp;
ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, 2);
if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) {
ksp_esid_data &= ~SLB_ESID_V;
ksp_vsid_data = 0;
slb_shadow_clear(2);
} else {
/* Update stack entry; others don't change */
slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2);
ksp_vsid_data =
be64_to_cpu(get_slb_shadow()->save_area[2].vsid);
}
/* We need to do this all in asm, so we're sure we don't touch
* the stack between the slbia and rebolting it. */
asm volatile("isync\n"
"slbia\n"
/* Slot 1 - first VMALLOC segment */
"slbmte %0,%1\n"
/* Slot 2 - kernel stack */
"slbmte %2,%3\n"
"isync"
:: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)),
"r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)),
"r"(ksp_vsid_data),
"r"(ksp_esid_data)
: "memory");
}
void slb_flush_and_rebolt(void)
{
WARN_ON(!irqs_disabled());
/*
* We can't take a PMU exception in the following code, so hard
* disable interrupts.
*/
hard_irq_disable();
__slb_flush_and_rebolt();
get_paca()->slb_cache_ptr = 0;
}
void slb_vmalloc_update(void)
{
unsigned long vflags;
vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp;
slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, 1);
slb_flush_and_rebolt();
}
/* Helper function to compare esids. There are four cases to handle.
* 1. The system is not 1T segment size capable. Use the GET_ESID compare.
* 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare.
* 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match.
* 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare.
*/
static inline int esids_match(unsigned long addr1, unsigned long addr2)
{
int esid_1t_count;
/* System is not 1T segment size capable. */
if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
return (GET_ESID(addr1) == GET_ESID(addr2));
esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) +
((addr2 >> SID_SHIFT_1T) != 0));
/* both addresses are < 1T */
if (esid_1t_count == 0)
return (GET_ESID(addr1) == GET_ESID(addr2));
/* One address < 1T, the other > 1T. Not a match */
if (esid_1t_count == 1)
return 0;
/* Both addresses are > 1T. */
return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2));
}
/* Flush all user entries from the segment table of the current processor. */
void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
{
unsigned long offset;
unsigned long slbie_data = 0;
unsigned long pc = KSTK_EIP(tsk);
unsigned long stack = KSTK_ESP(tsk);
unsigned long exec_base;
/*
* We need interrupts hard-disabled here, not just soft-disabled,
* so that a PMU interrupt can't occur, which might try to access
* user memory (to get a stack trace) and possible cause an SLB miss
* which would update the slb_cache/slb_cache_ptr fields in the PACA.
*/
hard_irq_disable();
offset = get_paca()->slb_cache_ptr;
if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
offset <= SLB_CACHE_ENTRIES) {
int i;
asm volatile("isync" : : : "memory");
for (i = 0; i < offset; i++) {
slbie_data = (unsigned long)get_paca()->slb_cache[i]
<< SID_SHIFT; /* EA */
slbie_data |= user_segment_size(slbie_data)
<< SLBIE_SSIZE_SHIFT;
slbie_data |= SLBIE_C; /* C set for user addresses */
asm volatile("slbie %0" : : "r" (slbie_data));
}
asm volatile("isync" : : : "memory");
} else {
__slb_flush_and_rebolt();
}
/* Workaround POWER5 < DD2.1 issue */
if (offset == 1 || offset > SLB_CACHE_ENTRIES)
asm volatile("slbie %0" : : "r" (slbie_data));
get_paca()->slb_cache_ptr = 0;
get_paca()->context = mm->context;
/*
* preload some userspace segments into the SLB.
* Almost all 32 and 64bit PowerPC executables are linked at
* 0x10000000 so it makes sense to preload this segment.
*/
exec_base = 0x10000000;
if (is_kernel_addr(pc) || is_kernel_addr(stack) ||
is_kernel_addr(exec_base))
return;
slb_allocate(pc);
if (!esids_match(pc, stack))
slb_allocate(stack);
if (!esids_match(pc, exec_base) &&
!esids_match(stack, exec_base))
slb_allocate(exec_base);
}
static inline void patch_slb_encoding(unsigned int *insn_addr,
unsigned int immed)
{
int insn = (*insn_addr & 0xffff0000) | immed;
patch_instruction(insn_addr, insn);
}
extern u32 slb_compare_rr_to_size[];
extern u32 slb_miss_kernel_load_linear[];
extern u32 slb_miss_kernel_load_io[];
extern u32 slb_compare_rr_to_size[];
extern u32 slb_miss_kernel_load_vmemmap[];
void slb_set_size(u16 size)
{
if (mmu_slb_size == size)
return;
mmu_slb_size = size;
patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size);
}
void slb_initialize(void)
{
unsigned long linear_llp, vmalloc_llp, io_llp;
unsigned long lflags, vflags;
static int slb_encoding_inited;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
unsigned long vmemmap_llp;
#endif
/* Prepare our SLB miss handler based on our page size */
linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
io_llp = mmu_psize_defs[mmu_io_psize].sllp;
vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
#endif
if (!slb_encoding_inited) {
slb_encoding_inited = 1;
patch_slb_encoding(slb_miss_kernel_load_linear,
SLB_VSID_KERNEL | linear_llp);
patch_slb_encoding(slb_miss_kernel_load_io,
SLB_VSID_KERNEL | io_llp);
patch_slb_encoding(slb_compare_rr_to_size,
mmu_slb_size);
pr_devel("SLB: linear LLP = %04lx\n", linear_llp);
pr_devel("SLB: io LLP = %04lx\n", io_llp);
#ifdef CONFIG_SPARSEMEM_VMEMMAP
patch_slb_encoding(slb_miss_kernel_load_vmemmap,
SLB_VSID_KERNEL | vmemmap_llp);
pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
#endif
}
get_paca()->stab_rr = SLB_NUM_BOLTED;
lflags = SLB_VSID_KERNEL | linear_llp;
vflags = SLB_VSID_KERNEL | vmalloc_llp;
/* Invalidate the entire SLB (even slot 0) & all the ERATS */
asm volatile("isync":::"memory");
asm volatile("slbmte %0,%0"::"r" (0) : "memory");
asm volatile("isync; slbia; isync":::"memory");
create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, 0);
create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, 1);
/* For the boot cpu, we're running on the stack in init_thread_union,
* which is in the first segment of the linear mapping, and also
* get_paca()->kstack hasn't been initialized yet.
* For secondary cpus, we need to bolt the kernel stack entry now.
*/
slb_shadow_clear(2);
if (raw_smp_processor_id() != boot_cpuid &&
(get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
create_shadowed_slbe(get_paca()->kstack,
mmu_kernel_ssize, lflags, 2);
asm volatile("isync":::"memory");
}

321
arch/powerpc/mm/slb_low.S Normal file
View file

@ -0,0 +1,321 @@
/*
* Low-level SLB routines
*
* Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
*
* Based on earlier C version:
* Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
* Copyright (c) 2001 Dave Engebretsen
* Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/cputable.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgtable.h>
#include <asm/firmware.h>
/* void slb_allocate_realmode(unsigned long ea);
*
* Create an SLB entry for the given EA (user or kernel).
* r3 = faulting address, r13 = PACA
* r9, r10, r11 are clobbered by this function
* No other registers are examined or changed.
*/
_GLOBAL(slb_allocate_realmode)
/*
* check for bad kernel/user address
* (ea & ~REGION_MASK) >= PGTABLE_RANGE
*/
rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4)
bne- 8f
srdi r9,r3,60 /* get region */
srdi r10,r3,SID_SHIFT /* get esid */
cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */
/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
blt cr7,0f /* user or kernel? */
/* kernel address: proto-VSID = ESID */
/* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
* this code will generate the protoVSID 0xfffffffff for the
* top segment. That's ok, the scramble below will translate
* it to VSID 0, which is reserved as a bad VSID - one which
* will never have any pages in it. */
/* Check if hitting the linear mapping or some other kernel space
*/
bne cr7,1f
/* Linear mapping encoding bits, the "li" instruction below will
* be patched by the kernel at boot
*/
.globl slb_miss_kernel_load_linear
slb_miss_kernel_load_linear:
li r11,0
/*
* context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
* r9 = region id.
*/
addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
BEGIN_FTR_SECTION
b slb_finish_load
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
b slb_finish_load_1T
1:
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/* Check virtual memmap region. To be patches at kernel boot */
cmpldi cr0,r9,0xf
bne 1f
.globl slb_miss_kernel_load_vmemmap
slb_miss_kernel_load_vmemmap:
li r11,0
b 6f
1:
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
/* vmalloc mapping gets the encoding from the PACA as the mapping
* can be demoted from 64K -> 4K dynamically on some machines
*/
clrldi r11,r10,48
cmpldi r11,(VMALLOC_SIZE >> 28) - 1
bgt 5f
lhz r11,PACAVMALLOCSLLP(r13)
b 6f
5:
/* IO mapping */
.globl slb_miss_kernel_load_io
slb_miss_kernel_load_io:
li r11,0
6:
/*
* context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
* r9 = region id.
*/
addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
BEGIN_FTR_SECTION
b slb_finish_load
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
b slb_finish_load_1T
0:
/* when using slices, we extract the psize off the slice bitmaps
* and then we need to get the sllp encoding off the mmu_psize_defs
* array.
*
* XXX This is a bit inefficient especially for the normal case,
* so we should try to implement a fast path for the standard page
* size using the old sllp value so we avoid the array. We cannot
* really do dynamic patching unfortunately as processes might flip
* between 4k and 64k standard page size
*/
#ifdef CONFIG_PPC_MM_SLICES
/* r10 have esid */
cmpldi r10,16
/* below SLICE_LOW_TOP */
blt 5f
/*
* Handle hpsizes,
* r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index
*/
srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */
addi r9,r11,PACAHIGHSLICEPSIZE
lbzx r9,r13,r9 /* r9 is hpsizes[r11] */
/* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */
rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63
b 6f
5:
/*
* Handle lpsizes
* r9 is get_paca()->context.low_slices_psize, r11 is index
*/
ld r9,PACALOWSLICESPSIZE(r13)
mr r11,r10
6:
sldi r11,r11,2 /* index * 4 */
/* Extract the psize and multiply to get an array offset */
srd r9,r9,r11
andi. r9,r9,0xf
mulli r9,r9,MMUPSIZEDEFSIZE
/* Now get to the array and obtain the sllp
*/
ld r11,PACATOC(r13)
ld r11,mmu_psize_defs@got(r11)
add r11,r11,r9
ld r11,MMUPSIZESLLP(r11)
ori r11,r11,SLB_VSID_USER
#else
/* paca context sllp already contains the SLB_VSID_USER bits */
lhz r11,PACACONTEXTSLLP(r13)
#endif /* CONFIG_PPC_MM_SLICES */
ld r9,PACACONTEXTID(r13)
BEGIN_FTR_SECTION
cmpldi r10,0x1000
bge slb_finish_load_1T
END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
b slb_finish_load
8: /* invalid EA */
li r10,0 /* BAD_VSID */
li r9,0 /* BAD_VSID */
li r11,SLB_VSID_USER /* flags don't much matter */
b slb_finish_load
#ifdef __DISABLED__
/* void slb_allocate_user(unsigned long ea);
*
* Create an SLB entry for the given EA (user or kernel).
* r3 = faulting address, r13 = PACA
* r9, r10, r11 are clobbered by this function
* No other registers are examined or changed.
*
* It is called with translation enabled in order to be able to walk the
* page tables. This is not currently used.
*/
_GLOBAL(slb_allocate_user)
/* r3 = faulting address */
srdi r10,r3,28 /* get esid */
crset 4*cr7+lt /* set "user" flag for later */
/* check if we fit in the range covered by the pagetables*/
srdi. r9,r3,PGTABLE_EADDR_SIZE
crnot 4*cr0+eq,4*cr0+eq
beqlr
/* now we need to get to the page tables in order to get the page
* size encoding from the PMD. In the future, we'll be able to deal
* with 1T segments too by getting the encoding from the PGD instead
*/
ld r9,PACAPGDIR(r13)
cmpldi cr0,r9,0
beqlr
rlwinm r11,r10,8,25,28
ldx r9,r9,r11 /* get pgd_t */
cmpldi cr0,r9,0
beqlr
rlwinm r11,r10,3,17,28
ldx r9,r9,r11 /* get pmd_t */
cmpldi cr0,r9,0
beqlr
/* build vsid flags */
andi. r11,r9,SLB_VSID_LLP
ori r11,r11,SLB_VSID_USER
/* get context to calculate proto-VSID */
ld r9,PACACONTEXTID(r13)
/* fall through slb_finish_load */
#endif /* __DISABLED__ */
/*
* Finish loading of an SLB entry and return
*
* r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
*/
slb_finish_load:
rldimi r10,r9,ESID_BITS,0
ASM_VSID_SCRAMBLE(r10,r9,256M)
/*
* bits above VSID_BITS_256M need to be ignored from r10
* also combine VSID and flags
*/
rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M))
/* r3 = EA, r11 = VSID data */
/*
* Find a slot, round robin. Previously we tried to find a
* free slot first but that took too long. Unfortunately we
* dont have any LRU information to help us choose a slot.
*/
7: ld r10,PACASTABRR(r13)
addi r10,r10,1
/* This gets soft patched on boot. */
.globl slb_compare_rr_to_size
slb_compare_rr_to_size:
cmpldi r10,0
blt+ 4f
li r10,SLB_NUM_BOLTED
4:
std r10,PACASTABRR(r13)
3:
rldimi r3,r10,0,36 /* r3= EA[0:35] | entry */
oris r10,r3,SLB_ESID_V@h /* r3 |= SLB_ESID_V */
/* r3 = ESID data, r11 = VSID data */
/*
* No need for an isync before or after this slbmte. The exception
* we enter with and the rfid we exit with are context synchronizing.
*/
slbmte r11,r10
/* we're done for kernel addresses */
crclr 4*cr0+eq /* set result to "success" */
bgelr cr7
/* Update the slb cache */
lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */
cmpldi r3,SLB_CACHE_ENTRIES
bge 1f
/* still room in the slb cache */
sldi r11,r3,2 /* r11 = offset * sizeof(u32) */
srdi r10,r10,28 /* get the 36 bits of the ESID */
add r11,r11,r13 /* r11 = (u32 *)paca + offset */
stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */
addi r3,r3,1 /* offset++ */
b 2f
1: /* offset >= SLB_CACHE_ENTRIES */
li r3,SLB_CACHE_ENTRIES+1
2:
sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */
crclr 4*cr0+eq /* set result to "success" */
blr
/*
* Finish loading of a 1T SLB entry (for the kernel linear mapping) and return.
*
* r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9
*/
slb_finish_load_1T:
srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */
rldimi r10,r9,ESID_BITS_1T,0
ASM_VSID_SCRAMBLE(r10,r9,1T)
/*
* bits above VSID_BITS_1T need to be ignored from r10
* also combine VSID and flags
*/
rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T))
li r10,MMU_SEGSIZE_1T
rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */
/* r3 = EA, r11 = VSID data */
clrrdi r3,r3,SID_SHIFT_1T /* clear out non-ESID bits */
b 7b

730
arch/powerpc/mm/slice.c Normal file
View file

@ -0,0 +1,730 @@
/*
* address space "slices" (meta-segments) support
*
* Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation.
*
* Based on hugetlb implementation
*
* Copyright (C) 2003 David Gibson, IBM Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#undef DEBUG
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/spinlock.h>
#include <linux/export.h>
#include <linux/hugetlb.h>
#include <asm/mman.h>
#include <asm/mmu.h>
#include <asm/copro.h>
#include <asm/hugetlb.h>
/* some sanity checks */
#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
#error PGTABLE_RANGE exceeds slice_mask high_slices size
#endif
static DEFINE_SPINLOCK(slice_convert_lock);
#ifdef DEBUG
int _slice_debug = 1;
static void slice_print_mask(const char *label, struct slice_mask mask)
{
char *p, buf[16 + 3 + 64 + 1];
int i;
if (!_slice_debug)
return;
p = buf;
for (i = 0; i < SLICE_NUM_LOW; i++)
*(p++) = (mask.low_slices & (1 << i)) ? '1' : '0';
*(p++) = ' ';
*(p++) = '-';
*(p++) = ' ';
for (i = 0; i < SLICE_NUM_HIGH; i++)
*(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
*(p++) = 0;
printk(KERN_DEBUG "%s:%s\n", label, buf);
}
#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0)
#else
static void slice_print_mask(const char *label, struct slice_mask mask) {}
#define slice_dbg(fmt...)
#endif
static struct slice_mask slice_range_to_mask(unsigned long start,
unsigned long len)
{
unsigned long end = start + len - 1;
struct slice_mask ret = { 0, 0 };
if (start < SLICE_LOW_TOP) {
unsigned long mend = min(end, SLICE_LOW_TOP);
unsigned long mstart = min(start, SLICE_LOW_TOP);
ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
- (1u << GET_LOW_SLICE_INDEX(mstart));
}
if ((start + len) > SLICE_LOW_TOP)
ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1))
- (1ul << GET_HIGH_SLICE_INDEX(start));
return ret;
}
static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
struct vm_area_struct *vma;
if ((mm->task_size - len) < addr)
return 0;
vma = find_vma(mm, addr);
return (!vma || (addr + len) <= vma->vm_start);
}
static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice)
{
return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT,
1ul << SLICE_LOW_SHIFT);
}
static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
{
unsigned long start = slice << SLICE_HIGH_SHIFT;
unsigned long end = start + (1ul << SLICE_HIGH_SHIFT);
/* Hack, so that each addresses is controlled by exactly one
* of the high or low area bitmaps, the first high area starts
* at 4GB, not 0 */
if (start == 0)
start = SLICE_LOW_TOP;
return !slice_area_is_free(mm, start, end - start);
}
static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
{
struct slice_mask ret = { 0, 0 };
unsigned long i;
for (i = 0; i < SLICE_NUM_LOW; i++)
if (!slice_low_has_vma(mm, i))
ret.low_slices |= 1u << i;
if (mm->task_size <= SLICE_LOW_TOP)
return ret;
for (i = 0; i < SLICE_NUM_HIGH; i++)
if (!slice_high_has_vma(mm, i))
ret.high_slices |= 1ul << i;
return ret;
}
static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
{
unsigned char *hpsizes;
int index, mask_index;
struct slice_mask ret = { 0, 0 };
unsigned long i;
u64 lpsizes;
lpsizes = mm->context.low_slices_psize;
for (i = 0; i < SLICE_NUM_LOW; i++)
if (((lpsizes >> (i * 4)) & 0xf) == psize)
ret.low_slices |= 1u << i;
hpsizes = mm->context.high_slices_psize;
for (i = 0; i < SLICE_NUM_HIGH; i++) {
mask_index = i & 0x1;
index = i >> 1;
if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
ret.high_slices |= 1ul << i;
}
return ret;
}
static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
{
return (mask.low_slices & available.low_slices) == mask.low_slices &&
(mask.high_slices & available.high_slices) == mask.high_slices;
}
static void slice_flush_segments(void *parm)
{
struct mm_struct *mm = parm;
unsigned long flags;
if (mm != current->active_mm)
return;
/* update the paca copy of the context struct */
get_paca()->context = current->active_mm->context;
local_irq_save(flags);
slb_flush_and_rebolt();
local_irq_restore(flags);
}
static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
{
int index, mask_index;
/* Write the new slice psize bits */
unsigned char *hpsizes;
u64 lpsizes;
unsigned long i, flags;
slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
slice_print_mask(" mask", mask);
/* We need to use a spinlock here to protect against
* concurrent 64k -> 4k demotion ...
*/
spin_lock_irqsave(&slice_convert_lock, flags);
lpsizes = mm->context.low_slices_psize;
for (i = 0; i < SLICE_NUM_LOW; i++)
if (mask.low_slices & (1u << i))
lpsizes = (lpsizes & ~(0xful << (i * 4))) |
(((unsigned long)psize) << (i * 4));
/* Assign the value back */
mm->context.low_slices_psize = lpsizes;
hpsizes = mm->context.high_slices_psize;
for (i = 0; i < SLICE_NUM_HIGH; i++) {
mask_index = i & 0x1;
index = i >> 1;
if (mask.high_slices & (1ul << i))
hpsizes[index] = (hpsizes[index] &
~(0xf << (mask_index * 4))) |
(((unsigned long)psize) << (mask_index * 4));
}
slice_dbg(" lsps=%lx, hsps=%lx\n",
mm->context.low_slices_psize,
mm->context.high_slices_psize);
spin_unlock_irqrestore(&slice_convert_lock, flags);
copro_flush_all_slbs(mm);
}
/*
* Compute which slice addr is part of;
* set *boundary_addr to the start or end boundary of that slice
* (depending on 'end' parameter);
* return boolean indicating if the slice is marked as available in the
* 'available' slice_mark.
*/
static bool slice_scan_available(unsigned long addr,
struct slice_mask available,
int end,
unsigned long *boundary_addr)
{
unsigned long slice;
if (addr < SLICE_LOW_TOP) {
slice = GET_LOW_SLICE_INDEX(addr);
*boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
return !!(available.low_slices & (1u << slice));
} else {
slice = GET_HIGH_SLICE_INDEX(addr);
*boundary_addr = (slice + end) ?
((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
return !!(available.high_slices & (1ul << slice));
}
}
static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
int psize)
{
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
unsigned long addr, found, next_end;
struct vm_unmapped_area_info info;
info.flags = 0;
info.length = len;
info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
info.align_offset = 0;
addr = TASK_UNMAPPED_BASE;
while (addr < TASK_SIZE) {
info.low_limit = addr;
if (!slice_scan_available(addr, available, 1, &addr))
continue;
next_slice:
/*
* At this point [info.low_limit; addr) covers
* available slices only and ends at a slice boundary.
* Check if we need to reduce the range, or if we can
* extend it to cover the next available slice.
*/
if (addr >= TASK_SIZE)
addr = TASK_SIZE;
else if (slice_scan_available(addr, available, 1, &next_end)) {
addr = next_end;
goto next_slice;
}
info.high_limit = addr;
found = vm_unmapped_area(&info);
if (!(found & ~PAGE_MASK))
return found;
}
return -ENOMEM;
}
static unsigned long slice_find_area_topdown(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
int psize)
{
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
unsigned long addr, found, prev;
struct vm_unmapped_area_info info;
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
info.align_offset = 0;
addr = mm->mmap_base;
while (addr > PAGE_SIZE) {
info.high_limit = addr;
if (!slice_scan_available(addr - 1, available, 0, &addr))
continue;
prev_slice:
/*
* At this point [addr; info.high_limit) covers
* available slices only and starts at a slice boundary.
* Check if we need to reduce the range, or if we can
* extend it to cover the previous available slice.
*/
if (addr < PAGE_SIZE)
addr = PAGE_SIZE;
else if (slice_scan_available(addr - 1, available, 0, &prev)) {
addr = prev;
goto prev_slice;
}
info.low_limit = addr;
found = vm_unmapped_area(&info);
if (!(found & ~PAGE_MASK))
return found;
}
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
return slice_find_area_bottomup(mm, len, available, psize);
}
static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
struct slice_mask mask, int psize,
int topdown)
{
if (topdown)
return slice_find_area_topdown(mm, len, mask, psize);
else
return slice_find_area_bottomup(mm, len, mask, psize);
}
#define or_mask(dst, src) do { \
(dst).low_slices |= (src).low_slices; \
(dst).high_slices |= (src).high_slices; \
} while (0)
#define andnot_mask(dst, src) do { \
(dst).low_slices &= ~(src).low_slices; \
(dst).high_slices &= ~(src).high_slices; \
} while (0)
#ifdef CONFIG_PPC_64K_PAGES
#define MMU_PAGE_BASE MMU_PAGE_64K
#else
#define MMU_PAGE_BASE MMU_PAGE_4K
#endif
unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
unsigned long flags, unsigned int psize,
int topdown)
{
struct slice_mask mask = {0, 0};
struct slice_mask good_mask;
struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
struct slice_mask compat_mask = {0, 0};
int fixed = (flags & MAP_FIXED);
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
struct mm_struct *mm = current->mm;
unsigned long newaddr;
/* Sanity checks */
BUG_ON(mm->task_size == 0);
slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
addr, len, flags, topdown);
if (len > mm->task_size)
return -ENOMEM;
if (len & ((1ul << pshift) - 1))
return -EINVAL;
if (fixed && (addr & ((1ul << pshift) - 1)))
return -EINVAL;
if (fixed && addr > (mm->task_size - len))
return -ENOMEM;
/* If hint, make sure it matches our alignment restrictions */
if (!fixed && addr) {
addr = _ALIGN_UP(addr, 1ul << pshift);
slice_dbg(" aligned addr=%lx\n", addr);
/* Ignore hint if it's too large or overlaps a VMA */
if (addr > mm->task_size - len ||
!slice_area_is_free(mm, addr, len))
addr = 0;
}
/* First make up a "good" mask of slices that have the right size
* already
*/
good_mask = slice_mask_for_size(mm, psize);
slice_print_mask(" good_mask", good_mask);
/*
* Here "good" means slices that are already the right page size,
* "compat" means slices that have a compatible page size (i.e.
* 4k in a 64k pagesize kernel), and "free" means slices without
* any VMAs.
*
* If MAP_FIXED:
* check if fits in good | compat => OK
* check if fits in good | compat | free => convert free
* else bad
* If have hint:
* check if hint fits in good => OK
* check if hint fits in good | free => convert free
* Otherwise:
* search in good, found => OK
* search in good | free, found => convert free
* search in good | compat | free, found => convert free.
*/
#ifdef CONFIG_PPC_64K_PAGES
/* If we support combo pages, we can allow 64k pages in 4k slices */
if (psize == MMU_PAGE_64K) {
compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
if (fixed)
or_mask(good_mask, compat_mask);
}
#endif
/* First check hint if it's valid or if we have MAP_FIXED */
if (addr != 0 || fixed) {
/* Build a mask for the requested range */
mask = slice_range_to_mask(addr, len);
slice_print_mask(" mask", mask);
/* Check if we fit in the good mask. If we do, we just return,
* nothing else to do
*/
if (slice_check_fit(mask, good_mask)) {
slice_dbg(" fits good !\n");
return addr;
}
} else {
/* Now let's see if we can find something in the existing
* slices for that size
*/
newaddr = slice_find_area(mm, len, good_mask, psize, topdown);
if (newaddr != -ENOMEM) {
/* Found within the good mask, we don't have to setup,
* we thus return directly
*/
slice_dbg(" found area at 0x%lx\n", newaddr);
return newaddr;
}
}
/* We don't fit in the good mask, check what other slices are
* empty and thus can be converted
*/
potential_mask = slice_mask_for_free(mm);
or_mask(potential_mask, good_mask);
slice_print_mask(" potential", potential_mask);
if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) {
slice_dbg(" fits potential !\n");
goto convert;
}
/* If we have MAP_FIXED and failed the above steps, then error out */
if (fixed)
return -EBUSY;
slice_dbg(" search...\n");
/* If we had a hint that didn't work out, see if we can fit
* anywhere in the good area.
*/
if (addr) {
addr = slice_find_area(mm, len, good_mask, psize, topdown);
if (addr != -ENOMEM) {
slice_dbg(" found area at 0x%lx\n", addr);
return addr;
}
}
/* Now let's see if we can find something in the existing slices
* for that size plus free slices
*/
addr = slice_find_area(mm, len, potential_mask, psize, topdown);
#ifdef CONFIG_PPC_64K_PAGES
if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
/* retry the search with 4k-page slices included */
or_mask(potential_mask, compat_mask);
addr = slice_find_area(mm, len, potential_mask, psize,
topdown);
}
#endif
if (addr == -ENOMEM)
return -ENOMEM;
mask = slice_range_to_mask(addr, len);
slice_dbg(" found potential area at 0x%lx\n", addr);
slice_print_mask(" mask", mask);
convert:
andnot_mask(mask, good_mask);
andnot_mask(mask, compat_mask);
if (mask.low_slices || mask.high_slices) {
slice_convert(mm, mask, psize);
if (psize > MMU_PAGE_BASE)
on_each_cpu(slice_flush_segments, mm, 1);
}
return addr;
}
EXPORT_SYMBOL_GPL(slice_get_unmapped_area);
unsigned long arch_get_unmapped_area(struct file *filp,
unsigned long addr,
unsigned long len,
unsigned long pgoff,
unsigned long flags)
{
return slice_get_unmapped_area(addr, len, flags,
current->mm->context.user_psize, 0);
}
unsigned long arch_get_unmapped_area_topdown(struct file *filp,
const unsigned long addr0,
const unsigned long len,
const unsigned long pgoff,
const unsigned long flags)
{
return slice_get_unmapped_area(addr0, len, flags,
current->mm->context.user_psize, 1);
}
unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
{
unsigned char *hpsizes;
int index, mask_index;
if (addr < SLICE_LOW_TOP) {
u64 lpsizes;
lpsizes = mm->context.low_slices_psize;
index = GET_LOW_SLICE_INDEX(addr);
return (lpsizes >> (index * 4)) & 0xf;
}
hpsizes = mm->context.high_slices_psize;
index = GET_HIGH_SLICE_INDEX(addr);
mask_index = index & 0x1;
return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xf;
}
EXPORT_SYMBOL_GPL(get_slice_psize);
/*
* This is called by hash_page when it needs to do a lazy conversion of
* an address space from real 64K pages to combo 4K pages (typically
* when hitting a non cacheable mapping on a processor or hypervisor
* that won't allow them for 64K pages).
*
* This is also called in init_new_context() to change back the user
* psize from whatever the parent context had it set to
* N.B. This may be called before mm->context.id has been set.
*
* This function will only change the content of the {low,high)_slice_psize
* masks, it will not flush SLBs as this shall be handled lazily by the
* caller.
*/
void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
{
int index, mask_index;
unsigned char *hpsizes;
unsigned long flags, lpsizes;
unsigned int old_psize;
int i;
slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize);
spin_lock_irqsave(&slice_convert_lock, flags);
old_psize = mm->context.user_psize;
slice_dbg(" old_psize=%d\n", old_psize);
if (old_psize == psize)
goto bail;
mm->context.user_psize = psize;
wmb();
lpsizes = mm->context.low_slices_psize;
for (i = 0; i < SLICE_NUM_LOW; i++)
if (((lpsizes >> (i * 4)) & 0xf) == old_psize)
lpsizes = (lpsizes & ~(0xful << (i * 4))) |
(((unsigned long)psize) << (i * 4));
/* Assign the value back */
mm->context.low_slices_psize = lpsizes;
hpsizes = mm->context.high_slices_psize;
for (i = 0; i < SLICE_NUM_HIGH; i++) {
mask_index = i & 0x1;
index = i >> 1;
if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize)
hpsizes[index] = (hpsizes[index] &
~(0xf << (mask_index * 4))) |
(((unsigned long)psize) << (mask_index * 4));
}
slice_dbg(" lsps=%lx, hsps=%lx\n",
mm->context.low_slices_psize,
mm->context.high_slices_psize);
bail:
spin_unlock_irqrestore(&slice_convert_lock, flags);
}
void slice_set_psize(struct mm_struct *mm, unsigned long address,
unsigned int psize)
{
unsigned char *hpsizes;
unsigned long i, flags;
u64 *lpsizes;
spin_lock_irqsave(&slice_convert_lock, flags);
if (address < SLICE_LOW_TOP) {
i = GET_LOW_SLICE_INDEX(address);
lpsizes = &mm->context.low_slices_psize;
*lpsizes = (*lpsizes & ~(0xful << (i * 4))) |
((unsigned long) psize << (i * 4));
} else {
int index, mask_index;
i = GET_HIGH_SLICE_INDEX(address);
hpsizes = mm->context.high_slices_psize;
mask_index = i & 0x1;
index = i >> 1;
hpsizes[index] = (hpsizes[index] &
~(0xf << (mask_index * 4))) |
(((unsigned long)psize) << (mask_index * 4));
}
spin_unlock_irqrestore(&slice_convert_lock, flags);
copro_flush_all_slbs(mm);
}
void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long len, unsigned int psize)
{
struct slice_mask mask = slice_range_to_mask(start, len);
slice_convert(mm, mask, psize);
}
#ifdef CONFIG_HUGETLB_PAGE
/*
* is_hugepage_only_range() is used by generic code to verify whether
* a normal mmap mapping (non hugetlbfs) is valid on a given area.
*
* until the generic code provides a more generic hook and/or starts
* calling arch get_unmapped_area for MAP_FIXED (which our implementation
* here knows how to deal with), we hijack it to keep standard mappings
* away from us.
*
* because of that generic code limitation, MAP_FIXED mapping cannot
* "convert" back a slice with no VMAs to the standard page size, only
* get_unmapped_area() can. It would be possible to fix it here but I
* prefer working on fixing the generic code instead.
*
* WARNING: This will not work if hugetlbfs isn't enabled since the
* generic code will redefine that function as 0 in that. This is ok
* for now as we only use slices with hugetlbfs enabled. This should
* be fixed as the generic code gets fixed.
*/
int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
struct slice_mask mask, available;
unsigned int psize = mm->context.user_psize;
mask = slice_range_to_mask(addr, len);
available = slice_mask_for_size(mm, psize);
#ifdef CONFIG_PPC_64K_PAGES
/* We need to account for 4k slices too */
if (psize == MMU_PAGE_64K) {
struct slice_mask compat_mask;
compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
or_mask(available, compat_mask);
}
#endif
#if 0 /* too verbose */
slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n",
mm, addr, len);
slice_print_mask(" mask", mask);
slice_print_mask(" available", available);
#endif
return !slice_check_fit(mask, available);
}
#endif

View file

@ -0,0 +1,269 @@
/*
* Copyright 2007-2008 Paul Mackerras, IBM Corp.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
/*
* Free all pages allocated for subpage protection maps and pointers.
* Also makes sure that the subpage_prot_table structure is
* reinitialized for the next user.
*/
void subpage_prot_free(struct mm_struct *mm)
{
struct subpage_prot_table *spt = &mm->context.spt;
unsigned long i, j, addr;
u32 **p;
for (i = 0; i < 4; ++i) {
if (spt->low_prot[i]) {
free_page((unsigned long)spt->low_prot[i]);
spt->low_prot[i] = NULL;
}
}
addr = 0;
for (i = 0; i < 2; ++i) {
p = spt->protptrs[i];
if (!p)
continue;
spt->protptrs[i] = NULL;
for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr;
++j, addr += PAGE_SIZE)
if (p[j])
free_page((unsigned long)p[j]);
free_page((unsigned long)p);
}
spt->maxaddr = 0;
}
void subpage_prot_init_new_context(struct mm_struct *mm)
{
struct subpage_prot_table *spt = &mm->context.spt;
memset(spt, 0, sizeof(*spt));
}
static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
int npages)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
spinlock_t *ptl;
pgd = pgd_offset(mm, addr);
if (pgd_none(*pgd))
return;
pud = pud_offset(pgd, addr);
if (pud_none(*pud))
return;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
for (; npages > 0; --npages) {
pte_update(mm, addr, pte, 0, 0, 0);
addr += PAGE_SIZE;
++pte;
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
}
/*
* Clear the subpage protection map for an address range, allowing
* all accesses that are allowed by the pte permissions.
*/
static void subpage_prot_clear(unsigned long addr, unsigned long len)
{
struct mm_struct *mm = current->mm;
struct subpage_prot_table *spt = &mm->context.spt;
u32 **spm, *spp;
unsigned long i;
size_t nw;
unsigned long next, limit;
down_write(&mm->mmap_sem);
limit = addr + len;
if (limit > spt->maxaddr)
limit = spt->maxaddr;
for (; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
if (addr < 0x100000000UL) {
spm = spt->low_prot;
} else {
spm = spt->protptrs[addr >> SBP_L3_SHIFT];
if (!spm)
continue;
}
spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
if (!spp)
continue;
spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
nw = PTRS_PER_PTE - i;
if (addr + (nw << PAGE_SHIFT) > next)
nw = (next - addr) >> PAGE_SHIFT;
memset(spp, 0, nw * sizeof(u32));
/* now flush any existing HPTEs for the range */
hpte_flush_range(mm, addr, nw);
}
up_write(&mm->mmap_sem);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->private;
split_huge_page_pmd(vma, addr, pmd);
return 0;
}
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
struct vm_area_struct *vma;
struct mm_walk subpage_proto_walk = {
.mm = mm,
.pmd_entry = subpage_walk_pmd_entry,
};
/*
* We don't try too hard, we just mark all the vma in that range
* VM_NOHUGEPAGE and split them.
*/
vma = find_vma(mm, addr);
/*
* If the range is in unmapped range, just return
*/
if (vma && ((addr + len) <= vma->vm_start))
return;
while (vma) {
if (vma->vm_start >= (addr + len))
break;
vma->vm_flags |= VM_NOHUGEPAGE;
subpage_proto_walk.private = vma;
walk_page_range(vma->vm_start, vma->vm_end,
&subpage_proto_walk);
vma = vma->vm_next;
}
}
#else
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
return;
}
#endif
/*
* Copy in a subpage protection map for an address range.
* The map has 2 bits per 4k subpage, so 32 bits per 64k page.
* Each 2-bit field is 0 to allow any access, 1 to prevent writes,
* 2 or 3 to prevent all accesses.
* Note that the normal page protections also apply; the subpage
* protection mechanism is an additional constraint, so putting 0
* in a 2-bit field won't allow writes to a page that is otherwise
* write-protected.
*/
long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
{
struct mm_struct *mm = current->mm;
struct subpage_prot_table *spt = &mm->context.spt;
u32 **spm, *spp;
unsigned long i;
size_t nw;
unsigned long next, limit;
int err;
/* Check parameters */
if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE)
return -EINVAL;
if (is_hugepage_only_range(mm, addr, len))
return -EINVAL;
if (!map) {
/* Clear out the protection map for the address range */
subpage_prot_clear(addr, len);
return 0;
}
if (!access_ok(VERIFY_READ, map, (len >> PAGE_SHIFT) * sizeof(u32)))
return -EFAULT;
down_write(&mm->mmap_sem);
subpage_mark_vma_nohuge(mm, addr, len);
for (limit = addr + len; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
err = -ENOMEM;
if (addr < 0x100000000UL) {
spm = spt->low_prot;
} else {
spm = spt->protptrs[addr >> SBP_L3_SHIFT];
if (!spm) {
spm = (u32 **)get_zeroed_page(GFP_KERNEL);
if (!spm)
goto out;
spt->protptrs[addr >> SBP_L3_SHIFT] = spm;
}
}
spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1);
spp = *spm;
if (!spp) {
spp = (u32 *)get_zeroed_page(GFP_KERNEL);
if (!spp)
goto out;
*spm = spp;
}
spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
local_irq_disable();
demote_segment_4k(mm, addr);
local_irq_enable();
i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
nw = PTRS_PER_PTE - i;
if (addr + (nw << PAGE_SHIFT) > next)
nw = (next - addr) >> PAGE_SHIFT;
up_write(&mm->mmap_sem);
err = -EFAULT;
if (__copy_from_user(spp, map, nw * sizeof(u32)))
goto out2;
map += nw;
down_write(&mm->mmap_sem);
/* now flush any existing HPTEs for the range */
hpte_flush_range(mm, addr, nw);
}
if (limit > spt->maxaddr)
spt->maxaddr = limit;
err = 0;
out:
up_write(&mm->mmap_sem);
out2:
return err;
}

View file

@ -0,0 +1,184 @@
/*
* This file contains the routines for TLB flushing.
* On machines where the MMU uses a hash table to store virtual to
* physical translations, these routines flush entries from the
* hash table also.
* -- paulus
*
* Derived from arch/ppc/mm/init.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/export.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "mmu_decl.h"
/*
* Called when unmapping pages to flush entries from the TLB/hash table.
*/
void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
{
unsigned long ptephys;
if (Hash != 0) {
ptephys = __pa(ptep) & PAGE_MASK;
flush_hash_pages(mm->context.id, addr, ptephys, 1);
}
}
EXPORT_SYMBOL(flush_hash_entry);
/*
* Called by ptep_set_access_flags, must flush on CPUs for which the
* DSI handler can't just "fixup" the TLB on a write fault
*/
void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr)
{
if (Hash != 0)
return;
_tlbie(addr);
}
/*
* Called at the end of a mmu_gather operation to make sure the
* TLB flush is completely done.
*/
void tlb_flush(struct mmu_gather *tlb)
{
if (Hash == 0) {
/*
* 603 needs to flush the whole TLB here since
* it doesn't use a hash table.
*/
_tlbia();
}
}
/*
* TLB flushing:
*
* - flush_tlb_mm(mm) flushes the specified mm context TLB's
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes kernel pages
*
* since the hardware hash table functions as an extension of the
* tlb as far as the linux tables are concerned, flush it too.
* -- Cort
*/
static void flush_range(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
pmd_t *pmd;
unsigned long pmd_end;
int count;
unsigned int ctx = mm->context.id;
if (Hash == 0) {
_tlbia();
return;
}
start &= PAGE_MASK;
if (start >= end)
return;
end = (end - 1) | ~PAGE_MASK;
pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
for (;;) {
pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
if (pmd_end > end)
pmd_end = end;
if (!pmd_none(*pmd)) {
count = ((pmd_end - start) >> PAGE_SHIFT) + 1;
flush_hash_pages(ctx, start, pmd_val(*pmd), count);
}
if (pmd_end == end)
break;
start = pmd_end + 1;
++pmd;
}
}
/*
* Flush kernel TLB entries in the given range
*/
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
flush_range(&init_mm, start, end);
}
EXPORT_SYMBOL(flush_tlb_kernel_range);
/*
* Flush all the (user) entries for the address space described by mm.
*/
void flush_tlb_mm(struct mm_struct *mm)
{
struct vm_area_struct *mp;
if (Hash == 0) {
_tlbia();
return;
}
/*
* It is safe to go down the mm's list of vmas when called
* from dup_mmap, holding mmap_sem. It would also be safe from
* unmap_region or exit_mmap, but not from vmtruncate on SMP -
* but it seems dup_mmap is the only SMP case which gets here.
*/
for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
}
EXPORT_SYMBOL(flush_tlb_mm);
void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
struct mm_struct *mm;
pmd_t *pmd;
if (Hash == 0) {
_tlbie(vmaddr);
return;
}
mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
if (!pmd_none(*pmd))
flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
}
EXPORT_SYMBOL(flush_tlb_page);
/*
* For each address in the range, find the pte for the address
* and check _PAGE_HASHPTE bit; if it is set, find and destroy
* the corresponding HPTE.
*/
void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
flush_range(vma->vm_mm, start, end);
}
EXPORT_SYMBOL(flush_tlb_range);
void __init early_init_mmu(void)
{
}

View file

@ -0,0 +1,256 @@
/*
* This file contains the routines for flushing entries from the
* TLB and MMU hash table.
*
* Derived from arch/ppc64/mm/init.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/bug.h>
#include <trace/events/thp.h>
DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
/*
* A linux PTE was changed and the corresponding hash table entry
* neesd to be flushed. This function will either perform the flush
* immediately or will batch it up if the current CPU has an active
* batch on it.
*/
void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long pte, int huge)
{
unsigned long vpn;
struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
unsigned long vsid;
unsigned int psize;
int ssize;
real_pte_t rpte;
int i;
i = batch->index;
/* Get page size (maybe move back to caller).
*
* NOTE: when using special 64K mappings in 4K environment like
* for SPEs, we obtain the page size from the slice, which thus
* must still exist (and thus the VMA not reused) at the time
* of this call
*/
if (huge) {
#ifdef CONFIG_HUGETLB_PAGE
psize = get_slice_psize(mm, addr);
/* Mask the address for the correct page size */
addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
#else
BUG();
psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
#endif
} else {
psize = pte_pagesize_index(mm, addr, pte);
/* Mask the address for the standard page size. If we
* have a 64k page kernel, but the hardware does not
* support 64k pages, this might be different from the
* hardware page size encoded in the slice table. */
addr &= PAGE_MASK;
}
/* Build full vaddr */
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
vsid = get_vsid(mm->context.id, addr, ssize);
} else {
vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
}
WARN_ON(vsid == 0);
vpn = hpt_vpn(addr, vsid, ssize);
rpte = __real_pte(__pte(pte), ptep);
/*
* Check if we have an active batch on this CPU. If not, just
* flush now and return. For now, we don global invalidates
* in that case, might be worth testing the mm cpu mask though
* and decide to use local invalidates instead...
*/
if (!batch->active) {
flush_hash_page(vpn, rpte, psize, ssize, 0);
put_cpu_var(ppc64_tlb_batch);
return;
}
/*
* This can happen when we are in the middle of a TLB batch and
* we encounter memory pressure (eg copy_page_range when it tries
* to allocate a new pte). If we have to reclaim memory and end
* up scanning and resetting referenced bits then our batch context
* will change mid stream.
*
* We also need to ensure only one page size is present in a given
* batch
*/
if (i != 0 && (mm != batch->mm || batch->psize != psize ||
batch->ssize != ssize)) {
__flush_tlb_pending(batch);
i = 0;
}
if (i == 0) {
batch->mm = mm;
batch->psize = psize;
batch->ssize = ssize;
}
batch->pte[i] = rpte;
batch->vpn[i] = vpn;
batch->index = ++i;
if (i >= PPC64_TLB_BATCH_NR)
__flush_tlb_pending(batch);
put_cpu_var(ppc64_tlb_batch);
}
/*
* This function is called when terminating an mmu batch or when a batch
* is full. It will perform the flush of all the entries currently stored
* in a batch.
*
* Must be called from within some kind of spinlock/non-preempt region...
*/
void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
{
const struct cpumask *tmp;
int i, local = 0;
i = batch->index;
tmp = cpumask_of(smp_processor_id());
if (cpumask_equal(mm_cpumask(batch->mm), tmp))
local = 1;
if (i == 1)
flush_hash_page(batch->vpn[0], batch->pte[0],
batch->psize, batch->ssize, local);
else
flush_hash_range(i, local);
batch->index = 0;
}
void tlb_flush(struct mmu_gather *tlb)
{
struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
/* If there's a TLB batch pending, then we must flush it because the
* pages are going to be freed and we really don't want to have a CPU
* access a freed page because it has a stale TLB
*/
if (tlbbatch->index)
__flush_tlb_pending(tlbbatch);
put_cpu_var(ppc64_tlb_batch);
}
/**
* __flush_hash_table_range - Flush all HPTEs for a given address range
* from the hash table (and the TLB). But keeps
* the linux PTEs intact.
*
* @mm : mm_struct of the target address space (generally init_mm)
* @start : starting address
* @end : ending address (not included in the flush)
*
* This function is mostly to be used by some IO hotplug code in order
* to remove all hash entries from a given address range used to map IO
* space on a removed PCI-PCI bidge without tearing down the full mapping
* since 64K pages may overlap with other bridges when using 64K pages
* with 4K HW pages on IO space.
*
* Because of that usage pattern, it is implemented for small size rather
* than speed.
*/
void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
int hugepage_shift;
unsigned long flags;
start = _ALIGN_DOWN(start, PAGE_SIZE);
end = _ALIGN_UP(end, PAGE_SIZE);
BUG_ON(!mm->pgd);
/* Note: Normally, we should only ever use a batch within a
* PTE locked section. This violates the rule, but will work
* since we don't actually modify the PTEs, we just flush the
* hash while leaving the PTEs intact (including their reference
* to being hashed). This is not the most performance oriented
* way to do things but is fine for our needs here.
*/
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
&hugepage_shift);
unsigned long pte;
if (ptep == NULL)
continue;
pte = pte_val(*ptep);
if (hugepage_shift)
trace_hugepage_invalidate(start, pte_val(pte));
if (!(pte & _PAGE_HASHPTE))
continue;
if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
else
hpte_need_flush(mm, start, ptep, pte, 0);
}
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
}
void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
pte_t *start_pte;
unsigned long flags;
addr = _ALIGN_DOWN(addr, PMD_SIZE);
/* Note: Normally, we should only ever use a batch within a
* PTE locked section. This violates the rule, but will work
* since we don't actually modify the PTEs, we just flush the
* hash while leaving the PTEs intact (including their reference
* to being hashed). This is not the most performance oriented
* way to do things but is fine for our needs here.
*/
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
start_pte = pte_offset_map(pmd, addr);
for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
unsigned long pteval = pte_val(*pte);
if (pteval & _PAGE_HASHPTE)
hpte_need_flush(mm, addr, pte, pteval, 0);
addr += PAGE_SIZE;
}
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,753 @@
/*
* This file contains the routines for TLB flushing.
* On machines where the MMU does not use a hash table to store virtual to
* physical translations (ie, SW loaded TLBs or Book3E compilant processors,
* this does -not- include 603 however which shares the implementation with
* hash based processors)
*
* -- BenH
*
* Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
* IBM Corp.
*
* Derived from arch/ppc/mm/init.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/code-patching.h>
#include <asm/hugetlb.h>
#include <asm/paca.h>
#include "mmu_decl.h"
/*
* This struct lists the sw-supported page sizes. The hardawre MMU may support
* other sizes not listed here. The .ind field is only used on MMUs that have
* indirect page table entries.
*/
#ifdef CONFIG_PPC_BOOK3E_MMU
#ifdef CONFIG_PPC_FSL_BOOK3E
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_4K] = {
.shift = 12,
.enc = BOOK3E_PAGESZ_4K,
},
[MMU_PAGE_2M] = {
.shift = 21,
.enc = BOOK3E_PAGESZ_2M,
},
[MMU_PAGE_4M] = {
.shift = 22,
.enc = BOOK3E_PAGESZ_4M,
},
[MMU_PAGE_16M] = {
.shift = 24,
.enc = BOOK3E_PAGESZ_16M,
},
[MMU_PAGE_64M] = {
.shift = 26,
.enc = BOOK3E_PAGESZ_64M,
},
[MMU_PAGE_256M] = {
.shift = 28,
.enc = BOOK3E_PAGESZ_256M,
},
[MMU_PAGE_1G] = {
.shift = 30,
.enc = BOOK3E_PAGESZ_1GB,
},
};
#else
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_4K] = {
.shift = 12,
.ind = 20,
.enc = BOOK3E_PAGESZ_4K,
},
[MMU_PAGE_16K] = {
.shift = 14,
.enc = BOOK3E_PAGESZ_16K,
},
[MMU_PAGE_64K] = {
.shift = 16,
.ind = 28,
.enc = BOOK3E_PAGESZ_64K,
},
[MMU_PAGE_1M] = {
.shift = 20,
.enc = BOOK3E_PAGESZ_1M,
},
[MMU_PAGE_16M] = {
.shift = 24,
.ind = 36,
.enc = BOOK3E_PAGESZ_16M,
},
[MMU_PAGE_256M] = {
.shift = 28,
.enc = BOOK3E_PAGESZ_256M,
},
[MMU_PAGE_1G] = {
.shift = 30,
.enc = BOOK3E_PAGESZ_1GB,
},
};
#endif /* CONFIG_FSL_BOOKE */
static inline int mmu_get_tsize(int psize)
{
return mmu_psize_defs[psize].enc;
}
#else
static inline int mmu_get_tsize(int psize)
{
/* This isn't used on !Book3E for now */
return 0;
}
#endif /* CONFIG_PPC_BOOK3E_MMU */
/* The variables below are currently only used on 64-bit Book3E
* though this will probably be made common with other nohash
* implementations at some point
*/
#ifdef CONFIG_PPC64
int mmu_linear_psize; /* Page size used for the linear mapping */
int mmu_pte_psize; /* Page size used for PTE pages */
int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */
unsigned long linear_map_top; /* Top of linear mapping */
/*
* Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
* exceptions. This is used for bolted and e6500 TLB miss handlers which
* do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
* this is set to zero.
*/
int extlb_level_exc;
#endif /* CONFIG_PPC64 */
#ifdef CONFIG_PPC_FSL_BOOK3E
/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
DEFINE_PER_CPU(int, next_tlbcam_idx);
EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
#endif
/*
* Base TLB flushing operations:
*
* - flush_tlb_mm(mm) flushes the specified mm context TLB's
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes kernel pages
*
* - local_* variants of page and mm only apply to the current
* processor
*/
/*
* These are the base non-SMP variants of page and mm flushing
*/
void local_flush_tlb_mm(struct mm_struct *mm)
{
unsigned int pid;
preempt_disable();
pid = mm->context.id;
if (pid != MMU_NO_CONTEXT)
_tlbil_pid(pid);
preempt_enable();
}
EXPORT_SYMBOL(local_flush_tlb_mm);
void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
int tsize, int ind)
{
unsigned int pid;
preempt_disable();
pid = mm ? mm->context.id : 0;
if (pid != MMU_NO_CONTEXT)
_tlbil_va(vmaddr, pid, tsize, ind);
preempt_enable();
}
void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
__local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
mmu_get_tsize(mmu_virtual_psize), 0);
}
EXPORT_SYMBOL(local_flush_tlb_page);
/*
* And here are the SMP non-local implementations
*/
#ifdef CONFIG_SMP
static DEFINE_RAW_SPINLOCK(tlbivax_lock);
static int mm_is_core_local(struct mm_struct *mm)
{
return cpumask_subset(mm_cpumask(mm),
topology_thread_cpumask(smp_processor_id()));
}
struct tlb_flush_param {
unsigned long addr;
unsigned int pid;
unsigned int tsize;
unsigned int ind;
};
static void do_flush_tlb_mm_ipi(void *param)
{
struct tlb_flush_param *p = param;
_tlbil_pid(p ? p->pid : 0);
}
static void do_flush_tlb_page_ipi(void *param)
{
struct tlb_flush_param *p = param;
_tlbil_va(p->addr, p->pid, p->tsize, p->ind);
}
/* Note on invalidations and PID:
*
* We snapshot the PID with preempt disabled. At this point, it can still
* change either because:
* - our context is being stolen (PID -> NO_CONTEXT) on another CPU
* - we are invaliating some target that isn't currently running here
* and is concurrently acquiring a new PID on another CPU
* - some other CPU is re-acquiring a lost PID for this mm
* etc...
*
* However, this shouldn't be a problem as we only guarantee
* invalidation of TLB entries present prior to this call, so we
* don't care about the PID changing, and invalidating a stale PID
* is generally harmless.
*/
void flush_tlb_mm(struct mm_struct *mm)
{
unsigned int pid;
preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
goto no_context;
if (!mm_is_core_local(mm)) {
struct tlb_flush_param p = { .pid = pid };
/* Ignores smp_processor_id() even if set. */
smp_call_function_many(mm_cpumask(mm),
do_flush_tlb_mm_ipi, &p, 1);
}
_tlbil_pid(pid);
no_context:
preempt_enable();
}
EXPORT_SYMBOL(flush_tlb_mm);
void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
int tsize, int ind)
{
struct cpumask *cpu_mask;
unsigned int pid;
preempt_disable();
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
goto bail;
cpu_mask = mm_cpumask(mm);
if (!mm_is_core_local(mm)) {
/* If broadcast tlbivax is supported, use it */
if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
if (lock)
raw_spin_lock(&tlbivax_lock);
_tlbivax_bcast(vmaddr, pid, tsize, ind);
if (lock)
raw_spin_unlock(&tlbivax_lock);
goto bail;
} else {
struct tlb_flush_param p = {
.pid = pid,
.addr = vmaddr,
.tsize = tsize,
.ind = ind,
};
/* Ignores smp_processor_id() even if set in cpu_mask */
smp_call_function_many(cpu_mask,
do_flush_tlb_page_ipi, &p, 1);
}
}
_tlbil_va(vmaddr, pid, tsize, ind);
bail:
preempt_enable();
}
void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
#ifdef CONFIG_HUGETLB_PAGE
if (vma && is_vm_hugetlb_page(vma))
flush_hugetlb_page(vma, vmaddr);
#endif
__flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
mmu_get_tsize(mmu_virtual_psize), 0);
}
EXPORT_SYMBOL(flush_tlb_page);
#endif /* CONFIG_SMP */
#ifdef CONFIG_PPC_47x
void __init early_init_mmu_47x(void)
{
#ifdef CONFIG_SMP
unsigned long root = of_get_flat_dt_root();
if (of_get_flat_dt_prop(root, "cooperative-partition", NULL))
mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST);
#endif /* CONFIG_SMP */
}
#endif /* CONFIG_PPC_47x */
/*
* Flush kernel TLB entries in the given range
*/
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
#ifdef CONFIG_SMP
preempt_disable();
smp_call_function(do_flush_tlb_mm_ipi, NULL, 1);
_tlbil_pid(0);
preempt_enable();
#else
_tlbil_pid(0);
#endif
}
EXPORT_SYMBOL(flush_tlb_kernel_range);
/*
* Currently, for range flushing, we just do a full mm flush. This should
* be optimized based on a threshold on the size of the range, since
* some implementation can stack multiple tlbivax before a tlbsync but
* for now, we keep it that way
*/
void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
flush_tlb_mm(vma->vm_mm);
}
EXPORT_SYMBOL(flush_tlb_range);
void tlb_flush(struct mmu_gather *tlb)
{
flush_tlb_mm(tlb->mm);
}
/*
* Below are functions specific to the 64-bit variant of Book3E though that
* may change in the future
*/
#ifdef CONFIG_PPC64
/*
* Handling of virtual linear page tables or indirect TLB entries
* flushing when PTE pages are freed
*/
void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
{
int tsize = mmu_psize_defs[mmu_pte_psize].enc;
if (book3e_htw_mode != PPC_HTW_NONE) {
unsigned long start = address & PMD_MASK;
unsigned long end = address + PMD_SIZE;
unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
/* This isn't the most optimal, ideally we would factor out the
* while preempt & CPU mask mucking around, or even the IPI but
* it will do for now
*/
while (start < end) {
__flush_tlb_page(tlb->mm, start, tsize, 1);
start += size;
}
} else {
unsigned long rmask = 0xf000000000000000ul;
unsigned long rid = (address & rmask) | 0x1000000000000000ul;
unsigned long vpte = address & ~rmask;
#ifdef CONFIG_PPC_64K_PAGES
vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful;
#else
vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
#endif
vpte |= rid;
__flush_tlb_page(tlb->mm, vpte, tsize, 0);
}
}
static void setup_page_sizes(void)
{
unsigned int tlb0cfg;
unsigned int tlb0ps;
unsigned int eptcfg;
int i, psize;
#ifdef CONFIG_PPC_FSL_BOOK3E
unsigned int mmucfg = mfspr(SPRN_MMUCFG);
int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E);
if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
unsigned int min_pg, max_pg;
min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT;
max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT;
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
struct mmu_psize_def *def;
unsigned int shift;
def = &mmu_psize_defs[psize];
shift = def->shift;
if (shift == 0 || shift & 1)
continue;
/* adjust to be in terms of 4^shift Kb */
shift = (shift - 10) >> 1;
if ((shift >= min_pg) && (shift <= max_pg))
def->flags |= MMU_PAGE_SIZE_DIRECT;
}
goto out;
}
if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
u32 tlb1cfg, tlb1ps;
tlb0cfg = mfspr(SPRN_TLB0CFG);
tlb1cfg = mfspr(SPRN_TLB1CFG);
tlb1ps = mfspr(SPRN_TLB1PS);
eptcfg = mfspr(SPRN_EPTCFG);
if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
book3e_htw_mode = PPC_HTW_E6500;
/*
* We expect 4K subpage size and unrestricted indirect size.
* The lack of a restriction on indirect size is a Freescale
* extension, indicated by PSn = 0 but SPSn != 0.
*/
if (eptcfg != 2)
book3e_htw_mode = PPC_HTW_NONE;
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
struct mmu_psize_def *def = &mmu_psize_defs[psize];
if (tlb1ps & (1U << (def->shift - 10))) {
def->flags |= MMU_PAGE_SIZE_DIRECT;
if (book3e_htw_mode && psize == MMU_PAGE_2M)
def->flags |= MMU_PAGE_SIZE_INDIRECT;
}
}
goto out;
}
#endif
tlb0cfg = mfspr(SPRN_TLB0CFG);
tlb0ps = mfspr(SPRN_TLB0PS);
eptcfg = mfspr(SPRN_EPTCFG);
/* Look for supported direct sizes */
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
struct mmu_psize_def *def = &mmu_psize_defs[psize];
if (tlb0ps & (1U << (def->shift - 10)))
def->flags |= MMU_PAGE_SIZE_DIRECT;
}
/* Indirect page sizes supported ? */
if ((tlb0cfg & TLBnCFG_IND) == 0 ||
(tlb0cfg & TLBnCFG_PT) == 0)
goto out;
book3e_htw_mode = PPC_HTW_IBM;
/* Now, we only deal with one IND page size for each
* direct size. Hopefully all implementations today are
* unambiguous, but we might want to be careful in the
* future.
*/
for (i = 0; i < 3; i++) {
unsigned int ps, sps;
sps = eptcfg & 0x1f;
eptcfg >>= 5;
ps = eptcfg & 0x1f;
eptcfg >>= 5;
if (!ps || !sps)
continue;
for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
struct mmu_psize_def *def = &mmu_psize_defs[psize];
if (ps == (def->shift - 10))
def->flags |= MMU_PAGE_SIZE_INDIRECT;
if (sps == (def->shift - 10))
def->ind = ps + 10;
}
}
out:
/* Cleanup array and print summary */
pr_info("MMU: Supported page sizes\n");
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
struct mmu_psize_def *def = &mmu_psize_defs[psize];
const char *__page_type_names[] = {
"unsupported",
"direct",
"indirect",
"direct & indirect"
};
if (def->flags == 0) {
def->shift = 0;
continue;
}
pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10),
__page_type_names[def->flags & 0x3]);
}
}
static void setup_mmu_htw(void)
{
/*
* If we want to use HW tablewalk, enable it by patching the TLB miss
* handlers to branch to the one dedicated to it.
*/
switch (book3e_htw_mode) {
case PPC_HTW_IBM:
patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e);
patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e);
break;
#ifdef CONFIG_PPC_FSL_BOOK3E
case PPC_HTW_E6500:
extlb_level_exc = EX_TLB_SIZE;
patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
break;
#endif
}
pr_info("MMU: Book3E HW tablewalk %s\n",
book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
}
/*
* Early initialization of the MMU TLB code
*/
static void early_init_this_mmu(void)
{
unsigned int mas4;
/* Set MAS4 based on page table setting */
mas4 = 0x4 << MAS4_WIMGED_SHIFT;
switch (book3e_htw_mode) {
case PPC_HTW_E6500:
mas4 |= MAS4_INDD;
mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
mas4 |= MAS4_TLBSELD(1);
mmu_pte_psize = MMU_PAGE_2M;
break;
case PPC_HTW_IBM:
mas4 |= MAS4_INDD;
#ifdef CONFIG_PPC_64K_PAGES
mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
mmu_pte_psize = MMU_PAGE_256M;
#else
mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
mmu_pte_psize = MMU_PAGE_1M;
#endif
break;
case PPC_HTW_NONE:
#ifdef CONFIG_PPC_64K_PAGES
mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
#else
mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
#endif
mmu_pte_psize = mmu_virtual_psize;
break;
}
mtspr(SPRN_MAS4, mas4);
#ifdef CONFIG_PPC_FSL_BOOK3E
if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
unsigned int num_cams;
/* use a quarter of the TLBCAM for bolted linear map */
num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
linear_map_top = map_mem_in_cams(linear_map_top, num_cams);
}
#endif
/* A sync won't hurt us after mucking around with
* the MMU configuration
*/
mb();
}
static void __init early_init_mmu_global(void)
{
/* XXX This will have to be decided at runtime, but right
* now our boot and TLB miss code hard wires it. Ideally
* we should find out a suitable page size and patch the
* TLB miss code (either that or use the PACA to store
* the value we want)
*/
mmu_linear_psize = MMU_PAGE_1G;
/* XXX This should be decided at runtime based on supported
* page sizes in the TLB, but for now let's assume 16M is
* always there and a good fit (which it probably is)
*
* Freescale booke only supports 4K pages in TLB0, so use that.
*/
if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
mmu_vmemmap_psize = MMU_PAGE_4K;
else
mmu_vmemmap_psize = MMU_PAGE_16M;
/* XXX This code only checks for TLB 0 capabilities and doesn't
* check what page size combos are supported by the HW. It
* also doesn't handle the case where a separate array holds
* the IND entries from the array loaded by the PT.
*/
/* Look for supported page sizes */
setup_page_sizes();
/* Look for HW tablewalk support */
setup_mmu_htw();
#ifdef CONFIG_PPC_FSL_BOOK3E
if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
if (book3e_htw_mode == PPC_HTW_NONE) {
extlb_level_exc = EX_TLB_SIZE;
patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
patch_exception(0x1e0,
exc_instruction_tlb_miss_bolted_book3e);
}
}
#endif
/* Set the global containing the top of the linear mapping
* for use by the TLB miss code
*/
linear_map_top = memblock_end_of_DRAM();
}
static void __init early_mmu_set_memory_limit(void)
{
#ifdef CONFIG_PPC_FSL_BOOK3E
if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
/*
* Limit memory so we dont have linear faults.
* Unlike memblock_set_current_limit, which limits
* memory available during early boot, this permanently
* reduces the memory available to Linux. We need to
* do this because highmem is not supported on 64-bit.
*/
memblock_enforce_memory_limit(linear_map_top);
}
#endif
memblock_set_current_limit(linear_map_top);
}
/* boot cpu only */
void __init early_init_mmu(void)
{
early_init_mmu_global();
early_init_this_mmu();
early_mmu_set_memory_limit();
}
void early_init_mmu_secondary(void)
{
early_init_this_mmu();
}
void setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
/* On non-FSL Embedded 64-bit, we adjust the RMA size to match
* the bolted TLB entry. We know for now that only 1G
* entries are supported though that may eventually
* change.
*
* on FSL Embedded 64-bit, we adjust the RMA size to match the
* first bolted TLB entry size. We still limit max to 1G even if
* the TLB could cover more. This is due to what the early init
* code is setup to do.
*
* We crop it to the size of the first MEMBLOCK to
* avoid going over total available memory just in case...
*/
#ifdef CONFIG_PPC_FSL_BOOK3E
if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
unsigned long linear_sz;
linear_sz = calc_cam_sz(first_memblock_size, PAGE_OFFSET,
first_memblock_base);
ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
} else
#endif
ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
/* Finally limit subsequent allocations */
memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
}
#else /* ! CONFIG_PPC64 */
void __init early_init_mmu(void)
{
#ifdef CONFIG_PPC_47x
early_init_mmu_47x();
#endif
}
#endif /* CONFIG_PPC64 */

View file

@ -0,0 +1,426 @@
/*
* This file contains low-level functions for performing various
* types of TLB invalidations on various processors with no hash
* table.
*
* This file implements the following functions for all no-hash
* processors. Some aren't implemented for some variants. Some
* are inline in tlbflush.h
*
* - tlbil_va
* - tlbil_pid
* - tlbil_all
* - tlbivax_bcast
*
* Code mostly moved over from misc_32.S
*
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Partially rewritten by Cort Dougan (cort@cs.nmt.edu)
* Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <asm/reg.h>
#include <asm/page.h>
#include <asm/cputable.h>
#include <asm/mmu.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/processor.h>
#include <asm/bug.h>
#if defined(CONFIG_40x)
/*
* 40x implementation needs only tlbil_va
*/
_GLOBAL(__tlbil_va)
/* We run the search with interrupts disabled because we have to change
* the PID and I don't want to preempt when that happens.
*/
mfmsr r5
mfspr r6,SPRN_PID
wrteei 0
mtspr SPRN_PID,r4
tlbsx. r3, 0, r3
mtspr SPRN_PID,r6
wrtee r5
bne 1f
sync
/* There are only 64 TLB entries, so r3 < 64, which means bit 25 is
* clear. Since 25 is the V bit in the TLB_TAG, loading this value
* will invalidate the TLB entry. */
tlbwe r3, r3, TLB_TAG
isync
1: blr
#elif defined(CONFIG_8xx)
/*
* Nothing to do for 8xx, everything is inline
*/
#elif defined(CONFIG_44x) /* Includes 47x */
/*
* 440 implementation uses tlbsx/we for tlbil_va and a full sweep
* of the TLB for everything else.
*/
_GLOBAL(__tlbil_va)
mfspr r5,SPRN_MMUCR
mfmsr r10
/*
* We write 16 bits of STID since 47x supports that much, we
* will never be passed out of bounds values on 440 (hopefully)
*/
rlwimi r5,r4,0,16,31
/* We have to run the search with interrupts disabled, otherwise
* an interrupt which causes a TLB miss can clobber the MMUCR
* between the mtspr and the tlbsx.
*
* Critical and Machine Check interrupts take care of saving
* and restoring MMUCR, so only normal interrupts have to be
* taken care of.
*/
wrteei 0
mtspr SPRN_MMUCR,r5
tlbsx. r6,0,r3
bne 10f
sync
BEGIN_MMU_FTR_SECTION
b 2f
END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
/* On 440 There are only 64 TLB entries, so r3 < 64, which means bit
* 22, is clear. Since 22 is the V bit in the TLB_PAGEID, loading this
* value will invalidate the TLB entry.
*/
tlbwe r6,r6,PPC44x_TLB_PAGEID
isync
10: wrtee r10
blr
2:
#ifdef CONFIG_PPC_47x
oris r7,r6,0x8000 /* specify way explicitely */
clrrwi r4,r3,12 /* get an EPN for the hashing with V = 0 */
ori r4,r4,PPC47x_TLBE_SIZE
tlbwe r4,r7,0 /* write it */
isync
wrtee r10
blr
#else /* CONFIG_PPC_47x */
1: trap
EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
#endif /* !CONFIG_PPC_47x */
_GLOBAL(_tlbil_all)
_GLOBAL(_tlbil_pid)
BEGIN_MMU_FTR_SECTION
b 2f
END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
li r3,0
sync
/* Load high watermark */
lis r4,tlb_44x_hwater@ha
lwz r5,tlb_44x_hwater@l(r4)
1: tlbwe r3,r3,PPC44x_TLB_PAGEID
addi r3,r3,1
cmpw 0,r3,r5
ble 1b
isync
blr
2:
#ifdef CONFIG_PPC_47x
/* 476 variant. There's not simple way to do this, hopefully we'll
* try to limit the amount of such full invalidates
*/
mfmsr r11 /* Interrupts off */
wrteei 0
li r3,-1 /* Current set */
lis r10,tlb_47x_boltmap@h
ori r10,r10,tlb_47x_boltmap@l
lis r7,0x8000 /* Specify way explicitely */
b 9f /* For each set */
1: li r9,4 /* Number of ways */
li r4,0 /* Current way */
li r6,0 /* Default entry value 0 */
andi. r0,r8,1 /* Check if way 0 is bolted */
mtctr r9 /* Load way counter */
bne- 3f /* Bolted, skip loading it */
2: /* For each way */
or r5,r3,r4 /* Make way|index for tlbre */
rlwimi r5,r5,16,8,15 /* Copy index into position */
tlbre r6,r5,0 /* Read entry */
3: addis r4,r4,0x2000 /* Next way */
andi. r0,r6,PPC47x_TLB0_VALID /* Valid entry ? */
beq 4f /* Nope, skip it */
rlwimi r7,r5,0,1,2 /* Insert way number */
rlwinm r6,r6,0,21,19 /* Clear V */
tlbwe r6,r7,0 /* Write it */
4: bdnz 2b /* Loop for each way */
srwi r8,r8,1 /* Next boltmap bit */
9: cmpwi cr1,r3,255 /* Last set done ? */
addi r3,r3,1 /* Next set */
beq cr1,1f /* End of loop */
andi. r0,r3,0x1f /* Need to load a new boltmap word ? */
bne 1b /* No, loop */
lwz r8,0(r10) /* Load boltmap entry */
addi r10,r10,4 /* Next word */
b 1b /* Then loop */
1: isync /* Sync shadows */
wrtee r11
#else /* CONFIG_PPC_47x */
1: trap
EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
#endif /* !CONFIG_PPC_47x */
blr
#ifdef CONFIG_PPC_47x
/*
* _tlbivax_bcast is only on 47x. We don't bother doing a runtime
* check though, it will blow up soon enough if we mistakenly try
* to use it on a 440.
*/
_GLOBAL(_tlbivax_bcast)
mfspr r5,SPRN_MMUCR
mfmsr r10
rlwimi r5,r4,0,16,31
wrteei 0
mtspr SPRN_MMUCR,r5
isync
PPC_TLBIVAX(0, R3)
isync
eieio
tlbsync
BEGIN_FTR_SECTION
b 1f
END_FTR_SECTION_IFSET(CPU_FTR_476_DD2)
sync
wrtee r10
blr
/*
* DD2 HW could hang if in instruction fetch happens before msync completes.
* Touch enough instruction cache lines to ensure cache hits
*/
1: mflr r9
bl 2f
2: mflr r6
li r7,32
PPC_ICBT(0,R6,R7) /* touch next cache line */
add r6,r6,r7
PPC_ICBT(0,R6,R7) /* touch next cache line */
add r6,r6,r7
PPC_ICBT(0,R6,R7) /* touch next cache line */
sync
nop
nop
nop
nop
nop
nop
nop
nop
mtlr r9
wrtee r10
blr
#endif /* CONFIG_PPC_47x */
#elif defined(CONFIG_FSL_BOOKE)
/*
* FSL BookE implementations.
*
* Since feature sections are using _SECTION_ELSE we need
* to have the larger code path before the _SECTION_ELSE
*/
/*
* Flush MMU TLB on the local processor
*/
_GLOBAL(_tlbil_all)
BEGIN_MMU_FTR_SECTION
li r3,(MMUCSR0_TLBFI)@l
mtspr SPRN_MMUCSR0, r3
1:
mfspr r3,SPRN_MMUCSR0
andi. r3,r3,MMUCSR0_TLBFI@l
bne 1b
MMU_FTR_SECTION_ELSE
PPC_TLBILX_ALL(0,R0)
ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
msync
isync
blr
_GLOBAL(_tlbil_pid)
BEGIN_MMU_FTR_SECTION
slwi r3,r3,16
mfmsr r10
wrteei 0
mfspr r4,SPRN_MAS6 /* save MAS6 */
mtspr SPRN_MAS6,r3
PPC_TLBILX_PID(0,R0)
mtspr SPRN_MAS6,r4 /* restore MAS6 */
wrtee r10
MMU_FTR_SECTION_ELSE
li r3,(MMUCSR0_TLBFI)@l
mtspr SPRN_MMUCSR0, r3
1:
mfspr r3,SPRN_MMUCSR0
andi. r3,r3,MMUCSR0_TLBFI@l
bne 1b
ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX)
msync
isync
blr
/*
* Flush MMU TLB for a particular address, but only on the local processor
* (no broadcast)
*/
_GLOBAL(__tlbil_va)
mfmsr r10
wrteei 0
slwi r4,r4,16
ori r4,r4,(MAS6_ISIZE(BOOK3E_PAGESZ_4K))@l
mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
BEGIN_MMU_FTR_SECTION
tlbsx 0,r3
mfspr r4,SPRN_MAS1 /* check valid */
andis. r3,r4,MAS1_VALID@h
beq 1f
rlwinm r4,r4,0,1,31
mtspr SPRN_MAS1,r4
tlbwe
MMU_FTR_SECTION_ELSE
PPC_TLBILX_VA(0,R3)
ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
msync
isync
1: wrtee r10
blr
#elif defined(CONFIG_PPC_BOOK3E)
/*
* New Book3E (>= 2.06) implementation
*
* Note: We may be able to get away without the interrupt masking stuff
* if we save/restore MAS6 on exceptions that might modify it
*/
_GLOBAL(_tlbil_pid)
slwi r4,r3,MAS6_SPID_SHIFT
mfmsr r10
wrteei 0
mtspr SPRN_MAS6,r4
PPC_TLBILX_PID(0,R0)
wrtee r10
msync
isync
blr
_GLOBAL(_tlbil_pid_noind)
slwi r4,r3,MAS6_SPID_SHIFT
mfmsr r10
ori r4,r4,MAS6_SIND
wrteei 0
mtspr SPRN_MAS6,r4
PPC_TLBILX_PID(0,R0)
wrtee r10
msync
isync
blr
_GLOBAL(_tlbil_all)
PPC_TLBILX_ALL(0,R0)
msync
isync
blr
_GLOBAL(_tlbil_va)
mfmsr r10
wrteei 0
cmpwi cr0,r6,0
slwi r4,r4,MAS6_SPID_SHIFT
rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
beq 1f
rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
PPC_TLBILX_VA(0,R3)
msync
isync
wrtee r10
blr
_GLOBAL(_tlbivax_bcast)
mfmsr r10
wrteei 0
cmpwi cr0,r6,0
slwi r4,r4,MAS6_SPID_SHIFT
rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
beq 1f
rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
PPC_TLBIVAX(0,R3)
eieio
tlbsync
sync
wrtee r10
blr
_GLOBAL(set_context)
#ifdef CONFIG_BDI_SWITCH
/* Context switch the PTE pointer for the Abatron BDI2000.
* The PGDIR is the second parameter.
*/
lis r5, abatron_pteptrs@h
ori r5, r5, abatron_pteptrs@l
stw r4, 0x4(r5)
#endif
mtspr SPRN_PID,r3
isync /* Force context change */
blr
#else
#error Unsupported processor type !
#endif
#if defined(CONFIG_PPC_FSL_BOOK3E)
/*
* extern void loadcam_entry(unsigned int index)
*
* Load TLBCAM[index] entry in to the L2 CAM MMU
*/
_GLOBAL(loadcam_entry)
mflr r5
LOAD_REG_ADDR_PIC(r4, TLBCAM)
mtlr r5
mulli r5,r3,TLBCAM_SIZE
add r3,r5,r4
lwz r4,TLBCAM_MAS0(r3)
mtspr SPRN_MAS0,r4
lwz r4,TLBCAM_MAS1(r3)
mtspr SPRN_MAS1,r4
PPC_LL r4,TLBCAM_MAS2(r3)
mtspr SPRN_MAS2,r4
lwz r4,TLBCAM_MAS3(r3)
mtspr SPRN_MAS3,r4
BEGIN_MMU_FTR_SECTION
lwz r4,TLBCAM_MAS7(r3)
mtspr SPRN_MAS7,r4
END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
isync
tlbwe
isync
blr
#endif