Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

120
arch/x86/kernel/Makefile Normal file
View file

@ -0,0 +1,120 @@
#
# Makefile for the linux kernel.
#
extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_tsc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
CFLAGS_REMOVE_pvclock.o = -pg
CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
endif
CFLAGS_irq.o := -I$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += mcount_64.o
obj-y += syscall_$(BITS).o vsyscall_gtod.o
obj-$(CONFIG_X86_64) += vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
obj-$(CONFIG_INTEL_TXT) += tboot.o
obj-$(CONFIG_ISA_DMA_API) += i8237.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
obj-y += acpi/
obj-y += reboot.o
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_PCI) += early-quirks.o
apm-y := apm_32.o
obj-$(CONFIG_APM) += apm.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_SMP) += smpboot.o
obj-$(CONFIG_SMP) += tsc_sync.o
obj-$(CONFIG_SMP) += setup_percpu.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_APB_TIMER) += apb_timer.o
obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
obj-$(CONFIG_OF) += devicetree.o
obj-$(CONFIG_UPROBES) += uprobes.o
obj-y += sysfb.o
obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o
obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
obj-$(CONFIG_PMC_ATOM) += pmc_atom.o
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_AUDIT) += audit_64.o
obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
endif

View file

@ -0,0 +1,8 @@
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
endif

View file

@ -0,0 +1,62 @@
/*
* Arch-specific APEI-related functions.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <acpi/apei.h>
#include <asm/mce.h>
#include <asm/tlbflush.h>
int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
{
#ifdef CONFIG_X86_MCE
int i;
struct acpi_hest_ia_corrected *cmc;
struct acpi_hest_ia_error_bank *mc_bank;
if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
return 0;
cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
if (!cmc->enabled)
return 0;
/*
* We expect HEST to provide a list of MC banks that report errors
* in firmware first mode. Otherwise, return non-zero value to
* indicate that we are done parsing HEST.
*/
if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) ||
!cmc->num_hardware_banks)
return 1;
pr_info("HEST: Enabling Firmware First mode for corrected errors.\n");
mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
mce_disable_bank(mc_bank->bank_number);
#endif
return 1;
}
void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
#ifdef CONFIG_X86_MCE
apei_mce_report_mem_error(sev, mem_err);
#endif
}
void arch_apei_flush_tlb_one(unsigned long addr)
{
__flush_tlb_one(addr);
}

1642
arch/x86/kernel/acpi/boot.c Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,183 @@
/*
* Copyright (C) 2005 Intel Corporation
* Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
* - Added _PDC for SMP C-states on Intel CPUs
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <acpi/processor.h>
#include <asm/acpi.h>
#include <asm/mwait.h>
#include <asm/special_insns.h>
/*
* Initialize bm_flags based on the CPU cache properties
* On SMP it depends on cache configuration
* - When cache is not shared among all CPUs, we flush cache
* before entering C3.
* - When cache is shared among all CPUs, we use bm_check
* mechanism as in UP case
*
* This routine is called only after all the CPUs are online
*/
void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
unsigned int cpu)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
flags->bm_check = 0;
if (num_online_cpus() == 1)
flags->bm_check = 1;
else if (c->x86_vendor == X86_VENDOR_INTEL) {
/*
* Today all MP CPUs that support C3 share cache.
* And caches should not be flushed by software while
* entering C3 type state.
*/
flags->bm_check = 1;
}
/*
* On all recent Intel platforms, ARB_DISABLE is a nop.
* So, set bm_control to zero to indicate that ARB_DISABLE
* is not required while entering C3 type state on
* P4, Core and beyond CPUs
*/
if (c->x86_vendor == X86_VENDOR_INTEL &&
(c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
flags->bm_control = 0;
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
/* The code below handles cstate entry with monitor-mwait pair on Intel*/
struct cstate_entry {
struct {
unsigned int eax;
unsigned int ecx;
} states[ACPI_PROCESSOR_MAX_POWER];
};
static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
#define NATIVE_CSTATE_BEYOND_HALT (2)
static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
{
struct acpi_processor_cx *cx = _cx;
long retval;
unsigned int eax, ebx, ecx, edx;
unsigned int edx_part;
unsigned int cstate_type; /* C-state type and not ACPI C-state type */
unsigned int num_cstate_subtype;
cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
/* Check whether this particular cx_type (in CST) is supported or not */
cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) &
MWAIT_CSTATE_MASK) + 1;
edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
retval = 0;
/* If the HW does not support any sub-states in this C-state */
if (num_cstate_subtype == 0) {
pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part);
retval = -1;
goto out;
}
/* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
!(ecx & CPUID5_ECX_INTERRUPT_BREAK)) {
retval = -1;
goto out;
}
if (!mwait_supported[cstate_type]) {
mwait_supported[cstate_type] = 1;
printk(KERN_DEBUG
"Monitor-Mwait will be used to enter C-%d "
"state\n", cx->type);
}
snprintf(cx->desc,
ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x",
cx->address);
out:
return retval;
}
int acpi_processor_ffh_cstate_probe(unsigned int cpu,
struct acpi_processor_cx *cx, struct acpi_power_register *reg)
{
struct cstate_entry *percpu_entry;
struct cpuinfo_x86 *c = &cpu_data(cpu);
long retval;
if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
return -1;
if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
return -1;
percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
percpu_entry->states[cx->index].eax = 0;
percpu_entry->states[cx->index].ecx = 0;
/* Make sure we are running on right CPU */
retval = work_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx);
if (retval == 0) {
/* Use the hint in CST */
percpu_entry->states[cx->index].eax = cx->address;
percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
}
/*
* For _CST FFH on Intel, if GAS.access_size bit 1 is cleared,
* then we should skip checking BM_STS for this C-state.
* ref: "Intel Processor Vendor-Specific ACPI Interface Specification"
*/
if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2))
cx->bm_sts_skip = 1;
return retval;
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
struct cstate_entry *percpu_entry;
percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
mwait_idle_with_hints(percpu_entry->states[cx->index].eax,
percpu_entry->states[cx->index].ecx);
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter);
static int __init ffh_cstate_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor != X86_VENDOR_INTEL)
return -1;
cpu_cstate_entry = alloc_percpu(struct cstate_entry);
return 0;
}
static void __exit ffh_cstate_exit(void)
{
free_percpu(cpu_cstate_entry);
cpu_cstate_entry = NULL;
}
arch_initcall(ffh_cstate_init);
__exitcall(ffh_cstate_exit);

View file

@ -0,0 +1,140 @@
/*
* sleep.c - x86-specific ACPI sleep support.
*
* Copyright (C) 2001-2003 Patrick Mochel
* Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
*/
#include <linux/acpi.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/dmi.h>
#include <linux/cpumask.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/realmode.h>
#include "../../realmode/rm/wakeup.h"
#include "sleep.h"
unsigned long acpi_realmode_flags;
#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
static char temp_stack[4096];
#endif
/**
* x86_acpi_enter_sleep_state - enter sleep state
* @state: Sleep state to enter.
*
* Wrapper around acpi_enter_sleep_state() to be called by assmebly.
*/
acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state)
{
return acpi_enter_sleep_state(state);
}
/**
* x86_acpi_suspend_lowlevel - save kernel state
*
* Create an identity mapped page table and copy the wakeup routine to
* low memory.
*/
int x86_acpi_suspend_lowlevel(void)
{
struct wakeup_header *header =
(struct wakeup_header *) __va(real_mode_header->wakeup_header);
if (header->signature != WAKEUP_HEADER_SIGNATURE) {
printk(KERN_ERR "wakeup header does not match\n");
return -EINVAL;
}
header->video_mode = saved_video_mode;
header->pmode_behavior = 0;
#ifndef CONFIG_64BIT
native_store_gdt((struct desc_ptr *)&header->pmode_gdt);
/*
* We have to check that we can write back the value, and not
* just read it. At least on 90 nm Pentium M (Family 6, Model
* 13), reading an invalid MSR is not guaranteed to trap, see
* Erratum X4 in "Intel Pentium M Processor on 90 nm Process
* with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90
* nm process with 512-KB L2 Cache Specification Update".
*/
if (!rdmsr_safe(MSR_EFER,
&header->pmode_efer_low,
&header->pmode_efer_high) &&
!wrmsr_safe(MSR_EFER,
header->pmode_efer_low,
header->pmode_efer_high))
header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER);
#endif /* !CONFIG_64BIT */
header->pmode_cr0 = read_cr0();
if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
header->pmode_cr4 = read_cr4();
header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
}
if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
&header->pmode_misc_en_low,
&header->pmode_misc_en_high) &&
!wrmsr_safe(MSR_IA32_MISC_ENABLE,
header->pmode_misc_en_low,
header->pmode_misc_en_high))
header->pmode_behavior |=
(1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
header->realmode_flags = acpi_realmode_flags;
header->real_magic = 0x12345678;
#ifndef CONFIG_64BIT
header->pmode_entry = (u32)&wakeup_pmode_return;
header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
#ifdef CONFIG_SMP
stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_table(smp_processor_id());
initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */
do_suspend_lowlevel();
return 0;
}
static int __init acpi_sleep_setup(char *str)
{
while ((str != NULL) && (*str != '\0')) {
if (strncmp(str, "s3_bios", 7) == 0)
acpi_realmode_flags |= 1;
if (strncmp(str, "s3_mode", 7) == 0)
acpi_realmode_flags |= 2;
if (strncmp(str, "s3_beep", 7) == 0)
acpi_realmode_flags |= 4;
#ifdef CONFIG_HIBERNATION
if (strncmp(str, "s4_nohwsig", 10) == 0)
acpi_no_s4_hw_signature();
#endif
if (strncmp(str, "nonvs", 5) == 0)
acpi_nvs_nosave();
if (strncmp(str, "nonvs_s3", 8) == 0)
acpi_nvs_nosave_s3();
if (strncmp(str, "old_ordering", 12) == 0)
acpi_old_suspend_ordering();
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
}
return 1;
}
__setup("acpi_sleep=", acpi_sleep_setup);

View file

@ -0,0 +1,21 @@
/*
* Variables and functions used by the code in sleep.c
*/
#include <asm/realmode.h>
extern unsigned long saved_video_mode;
extern long saved_magic;
extern int wakeup_pmode_return;
extern u8 wake_sleep_flags;
extern unsigned long acpi_copy_wakeup_routine(unsigned long);
extern void wakeup_long64(void);
extern void do_suspend_lowlevel(void);
extern int x86_acpi_suspend_lowlevel(void);
acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state);

View file

@ -0,0 +1,97 @@
.text
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page_types.h>
# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
.code32
ALIGN
ENTRY(wakeup_pmode_return)
wakeup_pmode_return:
movw $__KERNEL_DS, %ax
movw %ax, %ss
movw %ax, %ds
movw %ax, %es
movw %ax, %fs
movw %ax, %gs
# reload the gdt, as we need the full 32 bit address
lidt saved_idt
lldt saved_ldt
ljmp $(__KERNEL_CS), $1f
1:
movl %cr3, %eax
movl %eax, %cr3
wbinvd
# and restore the stack ... but you need gdt for this to work
movl saved_context_esp, %esp
movl %cs:saved_magic, %eax
cmpl $0x12345678, %eax
jne bogus_magic
# jump to place where we left off
movl saved_eip, %eax
jmp *%eax
bogus_magic:
jmp bogus_magic
save_registers:
sidt saved_idt
sldt saved_ldt
str saved_tss
leal 4(%esp), %eax
movl %eax, saved_context_esp
movl %ebx, saved_context_ebx
movl %ebp, saved_context_ebp
movl %esi, saved_context_esi
movl %edi, saved_context_edi
pushfl
popl saved_context_eflags
movl $ret_point, saved_eip
ret
restore_registers:
movl saved_context_ebp, %ebp
movl saved_context_ebx, %ebx
movl saved_context_esi, %esi
movl saved_context_edi, %edi
pushl saved_context_eflags
popfl
ret
ENTRY(do_suspend_lowlevel)
call save_processor_state
call save_registers
pushl $3
call x86_acpi_enter_sleep_state
addl $4, %esp
# In case of S3 failure, we'll emerge here. Jump
# to ret_point to recover
jmp ret_point
.p2align 4,,7
ret_point:
call restore_registers
call restore_processor_state
ret
.data
ALIGN
ENTRY(saved_magic) .long 0
ENTRY(saved_eip) .long 0
# saved registers
saved_idt: .long 0,0
saved_ldt: .long 0
saved_tss: .long 0

View file

@ -0,0 +1,124 @@
.text
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/pgtable_types.h>
#include <asm/page_types.h>
#include <asm/msr.h>
#include <asm/asm-offsets.h>
# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
.code64
/*
* Hooray, we are in Long 64-bit mode (but still running in low memory)
*/
ENTRY(wakeup_long64)
movq saved_magic, %rax
movq $0x123456789abcdef0, %rdx
cmpq %rdx, %rax
jne bogus_64_magic
movw $__KERNEL_DS, %ax
movw %ax, %ss
movw %ax, %ds
movw %ax, %es
movw %ax, %fs
movw %ax, %gs
movq saved_rsp, %rsp
movq saved_rbx, %rbx
movq saved_rdi, %rdi
movq saved_rsi, %rsi
movq saved_rbp, %rbp
movq saved_rip, %rax
jmp *%rax
ENDPROC(wakeup_long64)
bogus_64_magic:
jmp bogus_64_magic
ENTRY(do_suspend_lowlevel)
subq $8, %rsp
xorl %eax, %eax
call save_processor_state
movq $saved_context, %rax
movq %rsp, pt_regs_sp(%rax)
movq %rbp, pt_regs_bp(%rax)
movq %rsi, pt_regs_si(%rax)
movq %rdi, pt_regs_di(%rax)
movq %rbx, pt_regs_bx(%rax)
movq %rcx, pt_regs_cx(%rax)
movq %rdx, pt_regs_dx(%rax)
movq %r8, pt_regs_r8(%rax)
movq %r9, pt_regs_r9(%rax)
movq %r10, pt_regs_r10(%rax)
movq %r11, pt_regs_r11(%rax)
movq %r12, pt_regs_r12(%rax)
movq %r13, pt_regs_r13(%rax)
movq %r14, pt_regs_r14(%rax)
movq %r15, pt_regs_r15(%rax)
pushfq
popq pt_regs_flags(%rax)
movq $resume_point, saved_rip(%rip)
movq %rsp, saved_rsp
movq %rbp, saved_rbp
movq %rbx, saved_rbx
movq %rdi, saved_rdi
movq %rsi, saved_rsi
addq $8, %rsp
movl $3, %edi
xorl %eax, %eax
call x86_acpi_enter_sleep_state
/* in case something went wrong, restore the machine status and go on */
jmp resume_point
.align 4
resume_point:
/* We don't restore %rax, it must be 0 anyway */
movq $saved_context, %rax
movq saved_context_cr4(%rax), %rbx
movq %rbx, %cr4
movq saved_context_cr3(%rax), %rbx
movq %rbx, %cr3
movq saved_context_cr2(%rax), %rbx
movq %rbx, %cr2
movq saved_context_cr0(%rax), %rbx
movq %rbx, %cr0
pushq pt_regs_flags(%rax)
popfq
movq pt_regs_sp(%rax), %rsp
movq pt_regs_bp(%rax), %rbp
movq pt_regs_si(%rax), %rsi
movq pt_regs_di(%rax), %rdi
movq pt_regs_bx(%rax), %rbx
movq pt_regs_cx(%rax), %rcx
movq pt_regs_dx(%rax), %rdx
movq pt_regs_r8(%rax), %r8
movq pt_regs_r9(%rax), %r9
movq pt_regs_r10(%rax), %r10
movq pt_regs_r11(%rax), %r11
movq pt_regs_r12(%rax), %r12
movq pt_regs_r13(%rax), %r13
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
xorl %eax, %eax
addq $8, %rsp
jmp restore_processor_state
ENDPROC(do_suspend_lowlevel)
.data
ENTRY(saved_rbp) .quad 0
ENTRY(saved_rsi) .quad 0
ENTRY(saved_rdi) .quad 0
ENTRY(saved_rbx) .quad 0
ENTRY(saved_rip) .quad 0
ENTRY(saved_rsp) .quad 0
ENTRY(saved_magic) .quad 0

View file

@ -0,0 +1,677 @@
#define pr_fmt(fmt) "SMP alternatives: " fmt
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#define MAX_PATCH_LEN (255-1)
static int __initdata_or_module debug_alternative;
static int __init debug_alt(char *str)
{
debug_alternative = 1;
return 1;
}
__setup("debug-alternative", debug_alt);
static int noreplace_smp;
static int __init setup_noreplace_smp(char *str)
{
noreplace_smp = 1;
return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);
#ifdef CONFIG_PARAVIRT
static int __initdata_or_module noreplace_paravirt = 0;
static int __init setup_noreplace_paravirt(char *str)
{
noreplace_paravirt = 1;
return 1;
}
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif
#define DPRINTK(fmt, ...) \
do { \
if (debug_alternative) \
printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
} while (0)
/*
* Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
* that correspond to that nop. Getting from one nop to the next, we
* add to the array the offset that is equal to the sum of all sizes of
* nops preceding the one we are after.
*
* Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
* nice symmetry of sizes of the previous nops.
*/
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char intelnops[] =
{
GENERIC_NOP1,
GENERIC_NOP2,
GENERIC_NOP3,
GENERIC_NOP4,
GENERIC_NOP5,
GENERIC_NOP6,
GENERIC_NOP7,
GENERIC_NOP8,
GENERIC_NOP5_ATOMIC
};
static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
{
NULL,
intelnops,
intelnops + 1,
intelnops + 1 + 2,
intelnops + 1 + 2 + 3,
intelnops + 1 + 2 + 3 + 4,
intelnops + 1 + 2 + 3 + 4 + 5,
intelnops + 1 + 2 + 3 + 4 + 5 + 6,
intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
#ifdef K8_NOP1
static const unsigned char k8nops[] =
{
K8_NOP1,
K8_NOP2,
K8_NOP3,
K8_NOP4,
K8_NOP5,
K8_NOP6,
K8_NOP7,
K8_NOP8,
K8_NOP5_ATOMIC
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
{
NULL,
k8nops,
k8nops + 1,
k8nops + 1 + 2,
k8nops + 1 + 2 + 3,
k8nops + 1 + 2 + 3 + 4,
k8nops + 1 + 2 + 3 + 4 + 5,
k8nops + 1 + 2 + 3 + 4 + 5 + 6,
k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char k7nops[] =
{
K7_NOP1,
K7_NOP2,
K7_NOP3,
K7_NOP4,
K7_NOP5,
K7_NOP6,
K7_NOP7,
K7_NOP8,
K7_NOP5_ATOMIC
};
static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
{
NULL,
k7nops,
k7nops + 1,
k7nops + 1 + 2,
k7nops + 1 + 2 + 3,
k7nops + 1 + 2 + 3 + 4,
k7nops + 1 + 2 + 3 + 4 + 5,
k7nops + 1 + 2 + 3 + 4 + 5 + 6,
k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
#ifdef P6_NOP1
static const unsigned char p6nops[] =
{
P6_NOP1,
P6_NOP2,
P6_NOP3,
P6_NOP4,
P6_NOP5,
P6_NOP6,
P6_NOP7,
P6_NOP8,
P6_NOP5_ATOMIC
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
{
NULL,
p6nops,
p6nops + 1,
p6nops + 1 + 2,
p6nops + 1 + 2 + 3,
p6nops + 1 + 2 + 3 + 4,
p6nops + 1 + 2 + 3 + 4 + 5,
p6nops + 1 + 2 + 3 + 4 + 5 + 6,
p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
/* Initialize these to a safe default */
#ifdef CONFIG_X86_64
const unsigned char * const *ideal_nops = p6_nops;
#else
const unsigned char * const *ideal_nops = intel_nops;
#endif
void __init arch_init_ideal_nops(void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
/*
* Due to a decoder implementation quirk, some
* specific Intel CPUs actually perform better with
* the "k8_nops" than with the SDM-recommended NOPs.
*/
if (boot_cpu_data.x86 == 6 &&
boot_cpu_data.x86_model >= 0x0f &&
boot_cpu_data.x86_model != 0x1c &&
boot_cpu_data.x86_model != 0x26 &&
boot_cpu_data.x86_model != 0x27 &&
boot_cpu_data.x86_model < 0x30) {
ideal_nops = k8_nops;
} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
ideal_nops = p6_nops;
} else {
#ifdef CONFIG_X86_64
ideal_nops = k8_nops;
#else
ideal_nops = intel_nops;
#endif
}
break;
default:
#ifdef CONFIG_X86_64
ideal_nops = k8_nops;
#else
if (boot_cpu_has(X86_FEATURE_K8))
ideal_nops = k8_nops;
else if (boot_cpu_has(X86_FEATURE_K7))
ideal_nops = k7_nops;
else
ideal_nops = intel_nops;
#endif
}
}
/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
while (len > 0) {
unsigned int noplen = len;
if (noplen > ASM_NOP_MAX)
noplen = ASM_NOP_MAX;
memcpy(insns, ideal_nops[noplen], noplen);
insns += noplen;
len -= noplen;
}
}
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void *text_poke_early(void *addr, const void *opcode, size_t len);
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
self modifying code. This implies that asymmetric systems where
APs have less capabilities than the boot processor are not handled.
Tough. Make sure you disable such features by hand. */
void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
struct alt_instr *a;
u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];
DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite a previous scanned alternative code.
* Some kernel functions (e.g. memcpy, memset, etc) use this order to
* patch code.
*
* So be careful if you want to change the scan order to any other
* order.
*/
for (a = start; a < end; a++) {
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
if (!boot_cpu_has(a->cpuid))
continue;
memcpy(insnbuf, replacement, a->replacementlen);
/* 0xe8 is a relative jump; fix the offset. */
if (*insnbuf == 0xe8 && a->replacementlen == 5)
*(s32 *)(insnbuf + 1) += replacement - instr;
add_nops(insnbuf + a->replacementlen,
a->instrlen - a->replacementlen);
text_poke_early(instr, insnbuf, a->instrlen);
}
}
#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
{
const s32 *poff;
mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
if (!*poff || ptr < text || ptr >= text_end)
continue;
/* turn DS segment override prefix into lock prefix */
if (*ptr == 0x3e)
text_poke(ptr, ((unsigned char []){0xf0}), 1);
}
mutex_unlock(&text_mutex);
}
static void alternatives_smp_unlock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
{
const s32 *poff;
mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
if (!*poff || ptr < text || ptr >= text_end)
continue;
/* turn lock prefix into DS segment override prefix */
if (*ptr == 0xf0)
text_poke(ptr, ((unsigned char []){0x3E}), 1);
}
mutex_unlock(&text_mutex);
}
struct smp_alt_module {
/* what is this ??? */
struct module *mod;
char *name;
/* ptrs to lock prefixes */
const s32 *locks;
const s32 *locks_end;
/* .text segment, needed to avoid patching init code ;) */
u8 *text;
u8 *text_end;
struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static DEFINE_MUTEX(smp_alt);
static bool uniproc_patched = false; /* protected by smp_alt */
void __init_or_module alternatives_smp_module_add(struct module *mod,
char *name,
void *locks, void *locks_end,
void *text, void *text_end)
{
struct smp_alt_module *smp;
mutex_lock(&smp_alt);
if (!uniproc_patched)
goto unlock;
if (num_possible_cpus() == 1)
/* Don't bother remembering, we'll never have to undo it. */
goto smp_unlock;
smp = kzalloc(sizeof(*smp), GFP_KERNEL);
if (NULL == smp)
/* we'll run the (safe but slow) SMP code then ... */
goto unlock;
smp->mod = mod;
smp->name = name;
smp->locks = locks;
smp->locks_end = locks_end;
smp->text = text;
smp->text_end = text_end;
DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
__func__, smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
list_add_tail(&smp->next, &smp_alt_modules);
smp_unlock:
alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
mutex_unlock(&smp_alt);
}
void __init_or_module alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
mutex_lock(&smp_alt);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
list_del(&item->next);
kfree(item);
break;
}
mutex_unlock(&smp_alt);
}
void alternatives_enable_smp(void)
{
struct smp_alt_module *mod;
/* Why bother if there are no other CPUs? */
BUG_ON(num_possible_cpus() == 1);
mutex_lock(&smp_alt);
if (uniproc_patched) {
pr_info("switching to SMP code\n");
BUG_ON(num_online_cpus() != 1);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_lock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
uniproc_patched = false;
}
mutex_unlock(&smp_alt);
}
/* Return 1 if the address range is reserved for smp-alternatives */
int alternatives_text_reserved(void *start, void *end)
{
struct smp_alt_module *mod;
const s32 *poff;
u8 *text_start = start;
u8 *text_end = end;
list_for_each_entry(mod, &smp_alt_modules, next) {
if (mod->text > text_end || mod->text_end < text_start)
continue;
for (poff = mod->locks; poff < mod->locks_end; poff++) {
const u8 *ptr = (const u8 *)poff + *poff;
if (text_start <= ptr && text_end > ptr)
return 1;
}
}
return 0;
}
#endif
#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
struct paravirt_patch_site *end)
{
struct paravirt_patch_site *p;
char insnbuf[MAX_PATCH_LEN];
if (noreplace_paravirt)
return;
for (p = start; p < end; p++) {
unsigned int used;
BUG_ON(p->len > MAX_PATCH_LEN);
/* prep the buffer with the original instructions */
memcpy(insnbuf, p->instr, p->len);
used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
(unsigned long)p->instr, p->len);
BUG_ON(used > p->len);
/* Pad the rest with nops */
add_nops(insnbuf + used, p->len - used);
text_poke_early(p->instr, insnbuf, p->len);
}
}
extern struct paravirt_patch_site __start_parainstructions[],
__stop_parainstructions[];
#endif /* CONFIG_PARAVIRT */
void __init alternative_instructions(void)
{
/* The patching is not fully atomic, so try to avoid local interruptions
that might execute the to be patched code.
Other CPUs are not running. */
stop_nmi();
/*
* Don't stop machine check exceptions while patching.
* MCEs only happen when something got corrupted and in this
* case we must do something about the corruption.
* Ignoring it is worse than a unlikely patching race.
* Also machine checks tend to be broadcast and if one CPU
* goes into machine check the others follow quickly, so we don't
* expect a machine check to cause undue problems during to code
* patching.
*/
apply_alternatives(__alt_instructions, __alt_instructions_end);
#ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
uniproc_patched = true;
alternatives_smp_module_add(NULL, "core kernel",
__smp_locks, __smp_locks_end,
_text, _etext);
}
if (!uniproc_patched || num_possible_cpus() == 1)
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
#endif
apply_paravirt(__parainstructions, __parainstructions_end);
restart_nmi();
}
/**
* text_poke_early - Update instructions on a live kernel at boot time
* @addr: address to modify
* @opcode: source of the copy
* @len: length to copy
*
* When you use this code to patch more than one byte of an instruction
* you need to make sure that other CPUs cannot execute this code in parallel.
* Also no thread must be currently preempted in the middle of these
* instructions. And on the local CPU you need to be protected again NMI or MCE
* handlers seeing an inconsistent instruction while you patch.
*/
void *__init_or_module text_poke_early(void *addr, const void *opcode,
size_t len)
{
unsigned long flags;
local_irq_save(flags);
memcpy(addr, opcode, len);
sync_core();
local_irq_restore(flags);
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
return addr;
}
/**
* text_poke - Update instructions on a live kernel
* @addr: address to modify
* @opcode: source of the copy
* @len: length to copy
*
* Only atomic text poke/set should be allowed when not doing early patching.
* It means the size must be writable atomically and the address must be aligned
* in a way that permits an atomic write. It also makes sure we fit on a single
* page.
*
* Note: Must be called under text_mutex.
*/
void *text_poke(void *addr, const void *opcode, size_t len)
{
unsigned long flags;
char *vaddr;
struct page *pages[2];
int i;
if (!core_kernel_text((unsigned long)addr)) {
pages[0] = vmalloc_to_page(addr);
pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
} else {
pages[0] = virt_to_page(addr);
WARN_ON(!PageReserved(pages[0]));
pages[1] = virt_to_page(addr + PAGE_SIZE);
}
BUG_ON(!pages[0]);
local_irq_save(flags);
set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
if (pages[1])
set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
clear_fixmap(FIX_TEXT_POKE0);
if (pages[1])
clear_fixmap(FIX_TEXT_POKE1);
local_flush_tlb();
sync_core();
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
for (i = 0; i < len; i++)
BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
local_irq_restore(flags);
return addr;
}
static void do_sync_core(void *info)
{
sync_core();
}
static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;
int poke_int3_handler(struct pt_regs *regs)
{
/* bp_patching_in_progress */
smp_rmb();
if (likely(!bp_patching_in_progress))
return 0;
if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
return 0;
/* set up the specified breakpoint handler */
regs->ip = (unsigned long) bp_int3_handler;
return 1;
}
/**
* text_poke_bp() -- update instructions on live kernel on SMP
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
* @handler: address to jump to when the temporary breakpoint is hit
*
* Modify multi-byte instruction by using int3 breakpoint on SMP.
* We completely avoid stop_machine() here, and achieve the
* synchronization using int3 breakpoint.
*
* The way it is done:
* - add a int3 trap to the address that will be patched
* - sync cores
* - update all but the first byte of the patched range
* - sync cores
* - replace the first byte (int3) by the first byte of
* replacing opcode
* - sync cores
*
* Note: must be called under text_mutex.
*/
void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
unsigned char int3 = 0xcc;
bp_int3_handler = handler;
bp_int3_addr = (u8 *)addr + sizeof(int3);
bp_patching_in_progress = true;
/*
* Corresponding read barrier in int3 notifier for
* making sure the in_progress flags is correctly ordered wrt.
* patching
*/
smp_wmb();
text_poke(addr, &int3, sizeof(int3));
on_each_cpu(do_sync_core, NULL, 1);
if (len - sizeof(int3) > 0) {
/* patch all but the first byte */
text_poke((char *)addr + sizeof(int3),
(const char *) opcode + sizeof(int3),
len - sizeof(int3));
/*
* According to Intel, this core syncing is very likely
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
on_each_cpu(do_sync_core, NULL, 1);
}
/* patch the first byte */
text_poke(addr, opcode, sizeof(int3));
on_each_cpu(do_sync_core, NULL, 1);
bp_patching_in_progress = false;
smp_wmb();
return addr;
}

View file

@ -0,0 +1,898 @@
/*
* Dynamic DMA mapping support for AMD Hammer.
*
* Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
* This allows to use PCI devices that only support 32bit addresses on systems
* with more than 4GB.
*
* See Documentation/DMA-API-HOWTO.txt for the interface specification.
*
* Copyright 2002 Andi Kleen, SuSE Labs.
* Subject to the GNU General Public License v2 only.
*/
#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/agp_backend.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/interrupt.h>
#include <linux/bitmap.h>
#include <linux/kdebug.h>
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
#include <linux/syscore_ops.h>
#include <linux/io.h>
#include <linux/gfp.h>
#include <linux/atomic.h>
#include <asm/mtrr.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/cacheflush.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>
#include <asm/x86_init.h>
#include <asm/iommu_table.h>
static unsigned long iommu_bus_base; /* GART remapping area (physical) */
static unsigned long iommu_size; /* size of remapping area bytes */
static unsigned long iommu_pages; /* .. and in pages */
static u32 *iommu_gatt_base; /* Remapping table */
static dma_addr_t bad_dma_addr;
/*
* If this is disabled the IOMMU will use an optimized flushing strategy
* of only flushing when an mapping is reused. With it true the GART is
* flushed for every mapping. Problem is that doing the lazy flush seems
* to trigger bugs with some popular PCI cards, in particular 3ware (but
* has been also also seen with Qlogic at least).
*/
static int iommu_fullflush = 1;
/* Allocation bitmap for the remapping area: */
static DEFINE_SPINLOCK(iommu_bitmap_lock);
/* Guarded by iommu_bitmap_lock: */
static unsigned long *iommu_gart_bitmap;
static u32 gart_unmapped_entry;
#define GPTE_VALID 1
#define GPTE_COHERENT 2
#define GPTE_ENCODE(x) \
(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
#define EMERGENCY_PAGES 32 /* = 128KB */
#ifdef CONFIG_AGP
#define AGPEXTERN extern
#else
#define AGPEXTERN
#endif
/* GART can only remap to physical addresses < 1TB */
#define GART_MAX_PHYS_ADDR (1ULL << 40)
/* backdoor interface to AGP driver */
AGPEXTERN int agp_memory_reserved;
AGPEXTERN __u32 *agp_gatt_table;
static unsigned long next_bit; /* protected by iommu_bitmap_lock */
static bool need_flush; /* global flush state. set for each gart wrap */
static unsigned long alloc_iommu(struct device *dev, int size,
unsigned long align_mask)
{
unsigned long offset, flags;
unsigned long boundary_size;
unsigned long base_index;
base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
PAGE_SIZE) >> PAGE_SHIFT;
boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
PAGE_SIZE) >> PAGE_SHIFT;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
size, base_index, boundary_size, align_mask);
if (offset == -1) {
need_flush = true;
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
size, base_index, boundary_size,
align_mask);
}
if (offset != -1) {
next_bit = offset+size;
if (next_bit >= iommu_pages) {
next_bit = 0;
need_flush = true;
}
}
if (iommu_fullflush)
need_flush = true;
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
return offset;
}
static void free_iommu(unsigned long offset, int size)
{
unsigned long flags;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
bitmap_clear(iommu_gart_bitmap, offset, size);
if (offset >= next_bit)
next_bit = offset + size;
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
}
/*
* Use global flush state to avoid races with multiple flushers.
*/
static void flush_gart(void)
{
unsigned long flags;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
if (need_flush) {
amd_flush_garts();
need_flush = false;
}
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
}
#ifdef CONFIG_IOMMU_LEAK
/* Debugging aid for drivers that don't free their IOMMU tables */
static int leak_trace;
static int iommu_leak_pages = 20;
static void dump_leak(void)
{
static int dump;
if (dump)
return;
dump = 1;
show_stack(NULL, NULL);
debug_dma_dump_mappings(NULL);
}
#endif
static void iommu_full(struct device *dev, size_t size, int dir)
{
/*
* Ran out of IOMMU space for this operation. This is very bad.
* Unfortunately the drivers cannot handle this operation properly.
* Return some non mapped prereserved space in the aperture and
* let the Northbridge deal with it. This will result in garbage
* in the IO operation. When the size exceeds the prereserved space
* memory corruption will occur or random memory will be DMAed
* out. Hopefully no network devices use single mappings that big.
*/
dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
if (size > PAGE_SIZE*EMERGENCY_PAGES) {
if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
panic("PCI-DMA: Memory would be corrupted\n");
if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
panic(KERN_ERR
"PCI-DMA: Random memory would be DMAed\n");
}
#ifdef CONFIG_IOMMU_LEAK
dump_leak();
#endif
}
static inline int
need_iommu(struct device *dev, unsigned long addr, size_t size)
{
return force_iommu || !dma_capable(dev, addr, size);
}
static inline int
nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
{
return !dma_capable(dev, addr, size);
}
/* Map a single continuous physical area into the IOMMU.
* Caller needs to check if the iommu is needed and flush.
*/
static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
size_t size, int dir, unsigned long align_mask)
{
unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
unsigned long iommu_page;
int i;
if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
return bad_dma_addr;
iommu_page = alloc_iommu(dev, npages, align_mask);
if (iommu_page == -1) {
if (!nonforced_iommu(dev, phys_mem, size))
return phys_mem;
if (panic_on_overflow)
panic("dma_map_area overflow %lu bytes\n", size);
iommu_full(dev, size, dir);
return bad_dma_addr;
}
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
phys_mem += PAGE_SIZE;
}
return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
}
/* Map a single area into the IOMMU */
static dma_addr_t gart_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size,
enum dma_data_direction dir,
struct dma_attrs *attrs)
{
unsigned long bus;
phys_addr_t paddr = page_to_phys(page) + offset;
if (!dev)
dev = &x86_dma_fallback_dev;
if (!need_iommu(dev, paddr, size))
return paddr;
bus = dma_map_area(dev, paddr, size, dir, 0);
flush_gart();
return bus;
}
/*
* Free a DMA mapping.
*/
static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction dir,
struct dma_attrs *attrs)
{
unsigned long iommu_page;
int npages;
int i;
if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
dma_addr >= iommu_bus_base + iommu_size)
return;
iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
}
free_iommu(iommu_page, npages);
}
/*
* Wrapper for pci_unmap_single working with scatterlists.
*/
static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, struct dma_attrs *attrs)
{
struct scatterlist *s;
int i;
for_each_sg(sg, s, nents, i) {
if (!s->dma_length || !s->length)
break;
gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL);
}
}
/* Fallback for dma_map_sg in case of overflow */
static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
int nents, int dir)
{
struct scatterlist *s;
int i;
#ifdef CONFIG_IOMMU_DEBUG
pr_debug("dma_map_sg overflow\n");
#endif
for_each_sg(sg, s, nents, i) {
unsigned long addr = sg_phys(s);
if (nonforced_iommu(dev, addr, s->length)) {
addr = dma_map_area(dev, addr, s->length, dir, 0);
if (addr == bad_dma_addr) {
if (i > 0)
gart_unmap_sg(dev, sg, i, dir, NULL);
nents = 0;
sg[0].dma_length = 0;
break;
}
}
s->dma_address = addr;
s->dma_length = s->length;
}
flush_gart();
return nents;
}
/* Map multiple scatterlist entries continuous into the first. */
static int __dma_map_cont(struct device *dev, struct scatterlist *start,
int nelems, struct scatterlist *sout,
unsigned long pages)
{
unsigned long iommu_start = alloc_iommu(dev, pages, 0);
unsigned long iommu_page = iommu_start;
struct scatterlist *s;
int i;
if (iommu_start == -1)
return -1;
for_each_sg(start, s, nelems, i) {
unsigned long pages, addr;
unsigned long phys_addr = s->dma_address;
BUG_ON(s != start && s->offset);
if (s == start) {
sout->dma_address = iommu_bus_base;
sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
sout->dma_length = s->length;
} else {
sout->dma_length += s->length;
}
addr = phys_addr;
pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
while (pages--) {
iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
addr += PAGE_SIZE;
iommu_page++;
}
}
BUG_ON(iommu_page - iommu_start != pages);
return 0;
}
static inline int
dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
struct scatterlist *sout, unsigned long pages, int need)
{
if (!need) {
BUG_ON(nelems != 1);
sout->dma_address = start->dma_address;
sout->dma_length = start->length;
return 0;
}
return __dma_map_cont(dev, start, nelems, sout, pages);
}
/*
* DMA map all entries in a scatterlist.
* Merge chunks that have page aligned sizes into a continuous mapping.
*/
static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, struct dma_attrs *attrs)
{
struct scatterlist *s, *ps, *start_sg, *sgmap;
int need = 0, nextneed, i, out, start;
unsigned long pages = 0;
unsigned int seg_size;
unsigned int max_seg_size;
if (nents == 0)
return 0;
if (!dev)
dev = &x86_dma_fallback_dev;
out = 0;
start = 0;
start_sg = sg;
sgmap = sg;
seg_size = 0;
max_seg_size = dma_get_max_seg_size(dev);
ps = NULL; /* shut up gcc */
for_each_sg(sg, s, nents, i) {
dma_addr_t addr = sg_phys(s);
s->dma_address = addr;
BUG_ON(s->length == 0);
nextneed = need_iommu(dev, addr, s->length);
/* Handle the previous not yet processed entries */
if (i > start) {
/*
* Can only merge when the last chunk ends on a
* page boundary and the new one doesn't have an
* offset.
*/
if (!iommu_merge || !nextneed || !need || s->offset ||
(s->length + seg_size > max_seg_size) ||
(ps->offset + ps->length) % PAGE_SIZE) {
if (dma_map_cont(dev, start_sg, i - start,
sgmap, pages, need) < 0)
goto error;
out++;
seg_size = 0;
sgmap = sg_next(sgmap);
pages = 0;
start = i;
start_sg = s;
}
}
seg_size += s->length;
need = nextneed;
pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
ps = s;
}
if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
goto error;
out++;
flush_gart();
if (out < nents) {
sgmap = sg_next(sgmap);
sgmap->dma_length = 0;
}
return out;
error:
flush_gart();
gart_unmap_sg(dev, sg, out, dir, NULL);
/* When it was forced or merged try again in a dumb way */
if (force_iommu || iommu_merge) {
out = dma_map_sg_nonforce(dev, sg, nents, dir);
if (out > 0)
return out;
}
if (panic_on_overflow)
panic("dma_map_sg: overflow on %lu pages\n", pages);
iommu_full(dev, pages << PAGE_SHIFT, dir);
for_each_sg(sg, s, nents, i)
s->dma_address = bad_dma_addr;
return 0;
}
/* allocate and map a coherent mapping */
static void *
gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
gfp_t flag, struct dma_attrs *attrs)
{
dma_addr_t paddr;
unsigned long align_mask;
struct page *page;
if (force_iommu && !(flag & GFP_DMA)) {
flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
page = alloc_pages(flag | __GFP_ZERO, get_order(size));
if (!page)
return NULL;
align_mask = (1UL << get_order(size)) - 1;
paddr = dma_map_area(dev, page_to_phys(page), size,
DMA_BIDIRECTIONAL, align_mask);
flush_gart();
if (paddr != bad_dma_addr) {
*dma_addr = paddr;
return page_address(page);
}
__free_pages(page, get_order(size));
} else
return dma_generic_alloc_coherent(dev, size, dma_addr, flag,
attrs);
return NULL;
}
/* free a coherent mapping */
static void
gart_free_coherent(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_addr, struct dma_attrs *attrs)
{
gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
return (dma_addr == bad_dma_addr);
}
static int no_agp;
static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
{
unsigned long a;
if (!iommu_size) {
iommu_size = aper_size;
if (!no_agp)
iommu_size /= 2;
}
a = aper + iommu_size;
iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
if (iommu_size < 64*1024*1024) {
pr_warning(
"PCI-DMA: Warning: Small IOMMU %luMB."
" Consider increasing the AGP aperture in BIOS\n",
iommu_size >> 20);
}
return iommu_size;
}
static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
{
unsigned aper_size = 0, aper_base_32, aper_order;
u64 aper_base;
pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32);
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order);
aper_order = (aper_order >> 1) & 7;
aper_base = aper_base_32 & 0x7fff;
aper_base <<= 25;
aper_size = (32 * 1024 * 1024) << aper_order;
if (aper_base + aper_size > 0x100000000UL || !aper_size)
aper_base = 0;
*size = aper_size;
return aper_base;
}
static void enable_gart_translations(void)
{
int i;
if (!amd_nb_has_feature(AMD_NB_GART))
return;
for (i = 0; i < amd_nb_num(); i++) {
struct pci_dev *dev = node_to_amd_nb(i)->misc;
enable_gart_translation(dev, __pa(agp_gatt_table));
}
/* Flush the GART-TLB to remove stale entries */
amd_flush_garts();
}
/*
* If fix_up_north_bridges is set, the north bridges have to be fixed up on
* resume in the same way as they are handled in gart_iommu_hole_init().
*/
static bool fix_up_north_bridges;
static u32 aperture_order;
static u32 aperture_alloc;
void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
{
fix_up_north_bridges = true;
aperture_order = aper_order;
aperture_alloc = aper_alloc;
}
static void gart_fixup_northbridges(void)
{
int i;
if (!fix_up_north_bridges)
return;
if (!amd_nb_has_feature(AMD_NB_GART))
return;
pr_info("PCI-DMA: Restoring GART aperture settings\n");
for (i = 0; i < amd_nb_num(); i++) {
struct pci_dev *dev = node_to_amd_nb(i)->misc;
/*
* Don't enable translations just yet. That is the next
* step. Restore the pre-suspend aperture settings.
*/
gart_set_size_and_enable(dev, aperture_order);
pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
}
}
static void gart_resume(void)
{
pr_info("PCI-DMA: Resuming GART IOMMU\n");
gart_fixup_northbridges();
enable_gart_translations();
}
static struct syscore_ops gart_syscore_ops = {
.resume = gart_resume,
};
/*
* Private Northbridge GATT initialization in case we cannot use the
* AGP driver for some reason.
*/
static __init int init_amd_gatt(struct agp_kern_info *info)
{
unsigned aper_size, gatt_size, new_aper_size;
unsigned aper_base, new_aper_base;
struct pci_dev *dev;
void *gatt;
int i;
pr_info("PCI-DMA: Disabling AGP.\n");
aper_size = aper_base = info->aper_size = 0;
dev = NULL;
for (i = 0; i < amd_nb_num(); i++) {
dev = node_to_amd_nb(i)->misc;
new_aper_base = read_aperture(dev, &new_aper_size);
if (!new_aper_base)
goto nommu;
if (!aper_base) {
aper_size = new_aper_size;
aper_base = new_aper_base;
}
if (aper_size != new_aper_size || aper_base != new_aper_base)
goto nommu;
}
if (!aper_base)
goto nommu;
info->aper_base = aper_base;
info->aper_size = aper_size >> 20;
gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(gatt_size));
if (!gatt)
panic("Cannot allocate GATT table");
if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
panic("Could not set GART PTEs to uncacheable pages");
agp_gatt_table = gatt;
register_syscore_ops(&gart_syscore_ops);
flush_gart();
pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
aper_base, aper_size>>10);
return 0;
nommu:
/* Should not happen anymore */
pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
"falling back to iommu=soft.\n");
return -1;
}
static struct dma_map_ops gart_dma_ops = {
.map_sg = gart_map_sg,
.unmap_sg = gart_unmap_sg,
.map_page = gart_map_page,
.unmap_page = gart_unmap_page,
.alloc = gart_alloc_coherent,
.free = gart_free_coherent,
.mapping_error = gart_mapping_error,
};
static void gart_iommu_shutdown(void)
{
struct pci_dev *dev;
int i;
/* don't shutdown it if there is AGP installed */
if (!no_agp)
return;
if (!amd_nb_has_feature(AMD_NB_GART))
return;
for (i = 0; i < amd_nb_num(); i++) {
u32 ctl;
dev = node_to_amd_nb(i)->misc;
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
ctl &= ~GARTEN;
pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
}
}
int __init gart_iommu_init(void)
{
struct agp_kern_info info;
unsigned long iommu_start;
unsigned long aper_base, aper_size;
unsigned long start_pfn, end_pfn;
unsigned long scratch;
long i;
if (!amd_nb_has_feature(AMD_NB_GART))
return 0;
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
#else
/* Makefile puts PCI initialization via subsys_initcall first. */
/* Add other AMD AGP bridge drivers here */
no_agp = no_agp ||
(agp_amd64_init() < 0) ||
(agp_copy_info(agp_bridge, &info) < 0);
#endif
if (no_iommu ||
(!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
!gart_iommu_aperture ||
(no_agp && init_amd_gatt(&info) < 0)) {
if (max_pfn > MAX_DMA32_PFN) {
pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
pr_warning("falling back to iommu=soft.\n");
}
return 0;
}
/* need to map that range */
aper_size = info.aper_size << 20;
aper_base = info.aper_base;
end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
start_pfn = PFN_DOWN(aper_base);
if (!pfn_range_is_mapped(start_pfn, end_pfn))
init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
pr_info("PCI-DMA: using GART IOMMU.\n");
iommu_size = check_iommu_size(info.aper_base, aper_size);
iommu_pages = iommu_size >> PAGE_SHIFT;
iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(iommu_pages/8));
if (!iommu_gart_bitmap)
panic("Cannot allocate iommu bitmap\n");
#ifdef CONFIG_IOMMU_LEAK
if (leak_trace) {
int ret;
ret = dma_debug_resize_entries(iommu_pages);
if (ret)
pr_debug("PCI-DMA: Cannot trace all the entries\n");
}
#endif
/*
* Out of IOMMU space handling.
* Reserve some invalid pages at the beginning of the GART.
*/
bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
iommu_size >> 20);
agp_memory_reserved = iommu_size;
iommu_start = aper_size - iommu_size;
iommu_bus_base = info.aper_base + iommu_start;
bad_dma_addr = iommu_bus_base;
iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
/*
* Unmap the IOMMU part of the GART. The alias of the page is
* always mapped with cache enabled and there is no full cache
* coherency across the GART remapping. The unmapping avoids
* automatic prefetches from the CPU allocating cache lines in
* there. All CPU accesses are done via the direct mapping to
* the backing memory. The GART address is only used by PCI
* devices.
*/
set_memory_np((unsigned long)__va(iommu_bus_base),
iommu_size >> PAGE_SHIFT);
/*
* Tricky. The GART table remaps the physical memory range,
* so the CPU wont notice potential aliases and if the memory
* is remapped to UC later on, we might surprise the PCI devices
* with a stray writeout of a cacheline. So play it sure and
* do an explicit, full-scale wbinvd() _after_ having marked all
* the pages as Not-Present:
*/
wbinvd();
/*
* Now all caches are flushed and we can safely enable
* GART hardware. Doing it early leaves the possibility
* of stale cache entries that can lead to GART PTE
* errors.
*/
enable_gart_translations();
/*
* Try to workaround a bug (thanks to BenH):
* Set unmapped entries to a scratch page instead of 0.
* Any prefetches that hit unmapped entries won't get an bus abort
* then. (P2P bridge may be prefetching on DMA reads).
*/
scratch = get_zeroed_page(GFP_KERNEL);
if (!scratch)
panic("Cannot allocate iommu scratch page");
gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
iommu_gatt_base[i] = gart_unmapped_entry;
flush_gart();
dma_ops = &gart_dma_ops;
x86_platform.iommu_shutdown = gart_iommu_shutdown;
swiotlb = 0;
return 0;
}
void __init gart_parse_options(char *p)
{
int arg;
#ifdef CONFIG_IOMMU_LEAK
if (!strncmp(p, "leak", 4)) {
leak_trace = 1;
p += 4;
if (*p == '=')
++p;
if (isdigit(*p) && get_option(&p, &arg))
iommu_leak_pages = arg;
}
#endif
if (isdigit(*p) && get_option(&p, &arg))
iommu_size = arg;
if (!strncmp(p, "fullflush", 9))
iommu_fullflush = 1;
if (!strncmp(p, "nofullflush", 11))
iommu_fullflush = 0;
if (!strncmp(p, "noagp", 5))
no_agp = 1;
if (!strncmp(p, "noaperture", 10))
fix_aperture = 0;
/* duplicated from pci-dma.c */
if (!strncmp(p, "force", 5))
gart_iommu_aperture_allowed = 1;
if (!strncmp(p, "allowed", 7))
gart_iommu_aperture_allowed = 1;
if (!strncmp(p, "memaper", 7)) {
fallback_aper_force = 1;
p += 7;
if (*p == '=') {
++p;
if (get_option(&p, &arg))
fallback_aper_order = arg;
}
}
}
IOMMU_INIT_POST(gart_iommu_hole_init);

297
arch/x86/kernel/amd_nb.c Normal file
View file

@ -0,0 +1,297 @@
/*
* Shared support code for AMD K8 northbridges and derivates.
* Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <asm/amd_nb.h>
static u32 *flush_words;
const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
{}
};
EXPORT_SYMBOL(amd_nb_misc_ids);
static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
{}
};
const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
{ 0x00, 0x18, 0x20 },
{ 0xff, 0x00, 0x20 },
{ 0xfe, 0x00, 0x20 },
{ }
};
struct amd_northbridge_info amd_northbridges;
EXPORT_SYMBOL(amd_northbridges);
static struct pci_dev *next_northbridge(struct pci_dev *dev,
const struct pci_device_id *ids)
{
do {
dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
if (!dev)
break;
} while (!pci_match_id(ids, dev));
return dev;
}
int amd_cache_northbridges(void)
{
u16 i = 0;
struct amd_northbridge *nb;
struct pci_dev *misc, *link;
if (amd_nb_num())
return 0;
misc = NULL;
while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
i++;
if (i == 0)
return 0;
nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
if (!nb)
return -ENOMEM;
amd_northbridges.nb = nb;
amd_northbridges.num = i;
link = misc = NULL;
for (i = 0; i != amd_nb_num(); i++) {
node_to_amd_nb(i)->misc = misc =
next_northbridge(misc, amd_nb_misc_ids);
node_to_amd_nb(i)->link = link =
next_northbridge(link, amd_nb_link_ids);
}
/* GART present only on Fam15h upto model 0fh */
if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
(boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
amd_northbridges.flags |= AMD_NB_GART;
/*
* Check for L3 cache presence.
*/
if (!cpuid_edx(0x80000006))
return 0;
/*
* Some CPU families support L3 Cache Index Disable. There are some
* limitations because of E382 and E388 on family 0x10.
*/
if (boot_cpu_data.x86 == 0x10 &&
boot_cpu_data.x86_model >= 0x8 &&
(boot_cpu_data.x86_model > 0x9 ||
boot_cpu_data.x86_mask >= 0x1))
amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
if (boot_cpu_data.x86 == 0x15)
amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
/* L3 cache partitioning is supported on family 0x15 */
if (boot_cpu_data.x86 == 0x15)
amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
return 0;
}
EXPORT_SYMBOL_GPL(amd_cache_northbridges);
/*
* Ignores subdevice/subvendor but as far as I can figure out
* they're useless anyways
*/
bool __init early_is_amd_nb(u32 device)
{
const struct pci_device_id *id;
u32 vendor = device & 0xffff;
device >>= 16;
for (id = amd_nb_misc_ids; id->vendor; id++)
if (vendor == id->vendor && device == id->device)
return true;
return false;
}
struct resource *amd_get_mmconfig_range(struct resource *res)
{
u32 address;
u64 base, msr;
unsigned segn_busn_bits;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return NULL;
/* assume all cpus from fam10h have mmconfig */
if (boot_cpu_data.x86 < 0x10)
return NULL;
address = MSR_FAM10H_MMIO_CONF_BASE;
rdmsrl(address, msr);
/* mmconfig is not enabled */
if (!(msr & FAM10H_MMIO_CONF_ENABLE))
return NULL;
base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
FAM10H_MMIO_CONF_BUSRANGE_MASK;
res->flags = IORESOURCE_MEM;
res->start = base;
res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
return res;
}
int amd_get_subcaches(int cpu)
{
struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
unsigned int mask;
int cuid;
if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
return 0;
pci_read_config_dword(link, 0x1d4, &mask);
cuid = cpu_data(cpu).compute_unit_id;
return (mask >> (4 * cuid)) & 0xf;
}
int amd_set_subcaches(int cpu, unsigned long mask)
{
static unsigned int reset, ban;
struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
unsigned int reg;
int cuid;
if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
return -EINVAL;
/* if necessary, collect reset state of L3 partitioning and BAN mode */
if (reset == 0) {
pci_read_config_dword(nb->link, 0x1d4, &reset);
pci_read_config_dword(nb->misc, 0x1b8, &ban);
ban &= 0x180000;
}
/* deactivate BAN mode if any subcaches are to be disabled */
if (mask != 0xf) {
pci_read_config_dword(nb->misc, 0x1b8, &reg);
pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
}
cuid = cpu_data(cpu).compute_unit_id;
mask <<= 4 * cuid;
mask |= (0xf ^ (1 << cuid)) << 26;
pci_write_config_dword(nb->link, 0x1d4, mask);
/* reset BAN mode if L3 partitioning returned to reset state */
pci_read_config_dword(nb->link, 0x1d4, &reg);
if (reg == reset) {
pci_read_config_dword(nb->misc, 0x1b8, &reg);
reg &= ~0x180000;
pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
}
return 0;
}
static int amd_cache_gart(void)
{
u16 i;
if (!amd_nb_has_feature(AMD_NB_GART))
return 0;
flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
if (!flush_words) {
amd_northbridges.flags &= ~AMD_NB_GART;
return -ENOMEM;
}
for (i = 0; i != amd_nb_num(); i++)
pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
&flush_words[i]);
return 0;
}
void amd_flush_garts(void)
{
int flushed, i;
unsigned long flags;
static DEFINE_SPINLOCK(gart_lock);
if (!amd_nb_has_feature(AMD_NB_GART))
return;
/* Avoid races between AGP and IOMMU. In theory it's not needed
but I'm not sure if the hardware won't lose flush requests
when another is pending. This whole thing is so expensive anyways
that it doesn't matter to serialize more. -AK */
spin_lock_irqsave(&gart_lock, flags);
flushed = 0;
for (i = 0; i < amd_nb_num(); i++) {
pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
flush_words[i] | 1);
flushed++;
}
for (i = 0; i < amd_nb_num(); i++) {
u32 w;
/* Make sure the hardware actually executed the flush*/
for (;;) {
pci_read_config_dword(node_to_amd_nb(i)->misc,
0x9c, &w);
if (!(w & 1))
break;
cpu_relax();
}
}
spin_unlock_irqrestore(&gart_lock, flags);
if (!flushed)
pr_notice("nothing to flush?\n");
}
EXPORT_SYMBOL_GPL(amd_flush_garts);
static __init int init_amd_nbs(void)
{
int err = 0;
err = amd_cache_northbridges();
if (err < 0)
pr_notice("Cannot enumerate AMD northbridges\n");
if (amd_cache_gart() < 0)
pr_notice("Cannot initialize GART flush words, GART support disabled\n");
return err;
}
/* This has to go after the PCI subsystem */
fs_initcall(init_amd_nbs);

425
arch/x86/kernel/apb_timer.c Normal file
View file

@ -0,0 +1,425 @@
/*
* apb_timer.c: Driver for Langwell APB timers
*
* (C) Copyright 2009 Intel Corporation
* Author: Jacob Pan (jacob.jun.pan@intel.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*
* Note:
* Langwell is the south complex of Intel Moorestown MID platform. There are
* eight external timers in total that can be used by the operating system.
* The timer information, such as frequency and addresses, is provided to the
* OS via SFI tables.
* Timer interrupts are routed via FW/HW emulated IOAPIC independently via
* individual redirection table entries (RTE).
* Unlike HPET, there is no master counter, therefore one of the timers are
* used as clocksource. The overall allocation looks like:
* - timer 0 - NR_CPUs for per cpu timer
* - one timer for clocksource
* - one timer for watchdog driver.
* It is also worth notice that APB timer does not support true one-shot mode,
* free-running mode will be used here to emulate one-shot mode.
* APB timer can also be used as broadcast timer along with per cpu local APIC
* timer, but by default APB timer has higher rating than local APIC timers.
*/
#include <linux/delay.h>
#include <linux/dw_apb_timer.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/pm.h>
#include <linux/sfi.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/irq.h>
#include <asm/fixmap.h>
#include <asm/apb_timer.h>
#include <asm/intel-mid.h>
#include <asm/time.h>
#define APBT_CLOCKEVENT_RATING 110
#define APBT_CLOCKSOURCE_RATING 250
#define APBT_CLOCKEVENT0_NUM (0)
#define APBT_CLOCKSOURCE_NUM (2)
static phys_addr_t apbt_address;
static int apb_timer_block_enabled;
static void __iomem *apbt_virt_address;
/*
* Common DW APB timer info
*/
static unsigned long apbt_freq;
struct apbt_dev {
struct dw_apb_clock_event_device *timer;
unsigned int num;
int cpu;
unsigned int irq;
char name[10];
};
static struct dw_apb_clocksource *clocksource_apbt;
static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
{
return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
}
static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
#ifdef CONFIG_SMP
static unsigned int apbt_num_timers_used;
#endif
static inline void apbt_set_mapping(void)
{
struct sfi_timer_table_entry *mtmr;
int phy_cs_timer_id = 0;
if (apbt_virt_address) {
pr_debug("APBT base already mapped\n");
return;
}
mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
if (mtmr == NULL) {
printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
APBT_CLOCKEVENT0_NUM);
return;
}
apbt_address = (phys_addr_t)mtmr->phys_addr;
if (!apbt_address) {
printk(KERN_WARNING "No timer base from SFI, use default\n");
apbt_address = APBT_DEFAULT_BASE;
}
apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
if (!apbt_virt_address) {
pr_debug("Failed mapping APBT phy address at %lu\n",\
(unsigned long)apbt_address);
goto panic_noapbt;
}
apbt_freq = mtmr->freq_hz;
sfi_free_mtmr(mtmr);
/* Now figure out the physical timer id for clocksource device */
mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
if (mtmr == NULL)
goto panic_noapbt;
/* Now figure out the physical timer id */
pr_debug("Use timer %d for clocksource\n",
(int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
APBTMRS_REG_SIZE;
clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
"apbt0", apbt_virt_address + phy_cs_timer_id *
APBTMRS_REG_SIZE, apbt_freq);
return;
panic_noapbt:
panic("Failed to setup APB system timer\n");
}
static inline void apbt_clear_mapping(void)
{
iounmap(apbt_virt_address);
apbt_virt_address = NULL;
}
/*
* APBT timer interrupt enable / disable
*/
static inline int is_apbt_capable(void)
{
return apbt_virt_address ? 1 : 0;
}
static int __init apbt_clockevent_register(void)
{
struct sfi_timer_table_entry *mtmr;
struct apbt_dev *adev = this_cpu_ptr(&cpu_apbt_dev);
mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
if (mtmr == NULL) {
printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
APBT_CLOCKEVENT0_NUM);
return -ENODEV;
}
adev->num = smp_processor_id();
adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ?
APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
adev_virt_addr(adev), 0, apbt_freq);
/* Firmware does EOI handling for us. */
adev->timer->eoi = NULL;
if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
global_clock_event = &adev->timer->ced;
printk(KERN_DEBUG "%s clockevent registered as global\n",
global_clock_event->name);
}
dw_apb_clockevent_register(adev->timer);
sfi_free_mtmr(mtmr);
return 0;
}
#ifdef CONFIG_SMP
static void apbt_setup_irq(struct apbt_dev *adev)
{
/* timer0 irq has been setup early */
if (adev->irq == 0)
return;
irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
}
/* Should be called with per cpu */
void apbt_setup_secondary_clock(void)
{
struct apbt_dev *adev;
int cpu;
/* Don't register boot CPU clockevent */
cpu = smp_processor_id();
if (!cpu)
return;
adev = this_cpu_ptr(&cpu_apbt_dev);
if (!adev->timer) {
adev->timer = dw_apb_clockevent_init(cpu, adev->name,
APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
adev->irq, apbt_freq);
adev->timer->eoi = NULL;
} else {
dw_apb_clockevent_resume(adev->timer);
}
printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
cpu, adev->name, adev->cpu);
apbt_setup_irq(adev);
dw_apb_clockevent_register(adev->timer);
return;
}
/*
* this notify handler process CPU hotplug events. in case of S0i3, nonboot
* cpus are disabled/enabled frequently, for performance reasons, we keep the
* per cpu timer irq registered so that we do need to do free_irq/request_irq.
*
* TODO: it might be more reliable to directly disable percpu clockevent device
* without the notifier chain. currently, cpu 0 may get interrupts from other
* cpu timers during the offline process due to the ordering of notification.
* the extra interrupt is harmless.
*/
static int apbt_cpuhp_notify(struct notifier_block *n,
unsigned long action, void *hcpu)
{
unsigned long cpu = (unsigned long)hcpu;
struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
switch (action & 0xf) {
case CPU_DEAD:
dw_apb_clockevent_pause(adev->timer);
if (system_state == SYSTEM_RUNNING) {
pr_debug("skipping APBT CPU %lu offline\n", cpu);
} else {
pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
dw_apb_clockevent_stop(adev->timer);
}
break;
default:
pr_debug("APBT notified %lu, no action\n", action);
}
return NOTIFY_OK;
}
static __init int apbt_late_init(void)
{
if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
!apb_timer_block_enabled)
return 0;
/* This notifier should be called after workqueue is ready */
hotcpu_notifier(apbt_cpuhp_notify, -20);
return 0;
}
fs_initcall(apbt_late_init);
#else
void apbt_setup_secondary_clock(void) {}
#endif /* CONFIG_SMP */
static int apbt_clocksource_register(void)
{
u64 start, now;
cycle_t t1;
/* Start the counter, use timer 2 as source, timer 0/1 for event */
dw_apb_clocksource_start(clocksource_apbt);
/* Verify whether apbt counter works */
t1 = dw_apb_clocksource_read(clocksource_apbt);
rdtscll(start);
/*
* We don't know the TSC frequency yet, but waiting for
* 200000 TSC cycles is safe:
* 4 GHz == 50us
* 1 GHz == 200us
*/
do {
rep_nop();
rdtscll(now);
} while ((now - start) < 200000UL);
/* APBT is the only always on clocksource, it has to work! */
if (t1 == dw_apb_clocksource_read(clocksource_apbt))
panic("APBT counter not counting. APBT disabled\n");
dw_apb_clocksource_register(clocksource_apbt);
return 0;
}
/*
* Early setup the APBT timer, only use timer 0 for booting then switch to
* per CPU timer if possible.
* returns 1 if per cpu apbt is setup
* returns 0 if no per cpu apbt is chosen
* panic if set up failed, this is the only platform timer on Moorestown.
*/
void __init apbt_time_init(void)
{
#ifdef CONFIG_SMP
int i;
struct sfi_timer_table_entry *p_mtmr;
struct apbt_dev *adev;
#endif
if (apb_timer_block_enabled)
return;
apbt_set_mapping();
if (!apbt_virt_address)
goto out_noapbt;
/*
* Read the frequency and check for a sane value, for ESL model
* we extend the possible clock range to allow time scaling.
*/
if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
goto out_noapbt;
}
if (apbt_clocksource_register()) {
pr_debug("APBT has failed to register clocksource\n");
goto out_noapbt;
}
if (!apbt_clockevent_register())
apb_timer_block_enabled = 1;
else {
pr_debug("APBT has failed to register clockevent\n");
goto out_noapbt;
}
#ifdef CONFIG_SMP
/* kernel cmdline disable apb timer, so we will use lapic timers */
if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
printk(KERN_INFO "apbt: disabled per cpu timer\n");
return;
}
pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
if (num_possible_cpus() <= sfi_mtimer_num)
apbt_num_timers_used = num_possible_cpus();
else
apbt_num_timers_used = 1;
pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
/* here we set up per CPU timer data structure */
for (i = 0; i < apbt_num_timers_used; i++) {
adev = &per_cpu(cpu_apbt_dev, i);
adev->num = i;
adev->cpu = i;
p_mtmr = sfi_get_mtmr(i);
if (p_mtmr)
adev->irq = p_mtmr->irq;
else
printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
}
#endif
return;
out_noapbt:
apbt_clear_mapping();
apb_timer_block_enabled = 0;
panic("failed to enable APB timer\n");
}
/* called before apb_timer_enable, use early map */
unsigned long apbt_quick_calibrate(void)
{
int i, scale;
u64 old, new;
cycle_t t1, t2;
unsigned long khz = 0;
u32 loop, shift;
apbt_set_mapping();
dw_apb_clocksource_start(clocksource_apbt);
/* check if the timer can count down, otherwise return */
old = dw_apb_clocksource_read(clocksource_apbt);
i = 10000;
while (--i) {
if (old != dw_apb_clocksource_read(clocksource_apbt))
break;
}
if (!i)
goto failed;
/* count 16 ms */
loop = (apbt_freq / 1000) << 4;
/* restart the timer to ensure it won't get to 0 in the calibration */
dw_apb_clocksource_start(clocksource_apbt);
old = dw_apb_clocksource_read(clocksource_apbt);
old += loop;
t1 = __native_read_tsc();
do {
new = dw_apb_clocksource_read(clocksource_apbt);
} while (new < old);
t2 = __native_read_tsc();
shift = 5;
if (unlikely(loop >> shift == 0)) {
printk(KERN_INFO
"APBT TSC calibration failed, not enough resolution\n");
return 0;
}
scale = (int)div_u64((t2 - t1), loop >> shift);
khz = (scale * (apbt_freq / 1000)) >> shift;
printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
return khz;
failed:
return 0;
}

View file

@ -0,0 +1,502 @@
/*
* Firmware replacement code.
*
* Work around broken BIOSes that don't set an aperture, only set the
* aperture in the AGP bridge, or set too small aperture.
*
* If all fails map the aperture over some low memory. This is cheaper than
* doing bounce buffering. The memory is lost. This is done at early boot
* because only the bootmem allocator can allocate 32+MB.
*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
#define pr_fmt(fmt) "AGP: " fmt
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/pci_ids.h>
#include <linux/pci.h>
#include <linux/bitops.h>
#include <linux/suspend.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>
#include <asm/x86_init.h>
/*
* Using 512M as goal, in case kexec will load kernel_big
* that will do the on-position decompress, and could overlap with
* with the gart aperture that is used.
* Sequence:
* kernel_small
* ==> kexec (with kdump trigger path or gart still enabled)
* ==> kernel_small (gart area become e820_reserved)
* ==> kexec (with kdump trigger path or gart still enabled)
* ==> kerne_big (uncompressed size will be big than 64M or 128M)
* So don't use 512M below as gart iommu, leave the space for kernel
* code for safe.
*/
#define GART_MIN_ADDR (512ULL << 20)
#define GART_MAX_ADDR (1ULL << 32)
int gart_iommu_aperture;
int gart_iommu_aperture_disabled __initdata;
int gart_iommu_aperture_allowed __initdata;
int fallback_aper_order __initdata = 1; /* 64MB */
int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
/* This code runs before the PCI subsystem is initialized, so just
access the northbridge directly. */
static u32 __init allocate_aperture(void)
{
u32 aper_size;
unsigned long addr;
/* aper_size should <= 1G */
if (fallback_aper_order > 5)
fallback_aper_order = 5;
aper_size = (32 * 1024 * 1024) << fallback_aper_order;
/*
* Aperture has to be naturally aligned. This means a 2GB aperture
* won't have much chance of finding a place in the lower 4GB of
* memory. Unfortunately we cannot move it up because that would
* make the IOMMU useless.
*/
addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
aper_size, aper_size);
if (!addr) {
pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
addr, addr + aper_size - 1, aper_size >> 10);
return 0;
}
memblock_reserve(addr, aper_size);
pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
addr, addr + aper_size - 1, aper_size >> 10);
register_nosave_region(addr >> PAGE_SHIFT,
(addr+aper_size) >> PAGE_SHIFT);
return (u32)addr;
}
/* Find a PCI capability */
static u32 __init find_cap(int bus, int slot, int func, int cap)
{
int bytes;
u8 pos;
if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) &
PCI_STATUS_CAP_LIST))
return 0;
pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST);
for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
u8 id;
pos &= ~3;
id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID);
if (id == 0xff)
break;
if (id == cap)
return pos;
pos = read_pci_config_byte(bus, slot, func,
pos+PCI_CAP_LIST_NEXT);
}
return 0;
}
/* Read a standard AGPv3 bridge header */
static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
{
u32 apsize;
u32 apsizereg;
int nbits;
u32 aper_low, aper_hi;
u64 aper;
u32 old_order;
pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func);
apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
if (apsizereg == 0xffffffff) {
pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n",
bus, slot, func);
return 0;
}
/* old_order could be the value from NB gart setting */
old_order = *order;
apsize = apsizereg & 0xfff;
/* Some BIOS use weird encodings not in the AGPv3 table. */
if (apsize & 0xff)
apsize |= 0xf00;
nbits = hweight16(apsize);
*order = 7 - nbits;
if ((int)*order < 0) /* < 32MB */
*order = 0;
aper_low = read_pci_config(bus, slot, func, 0x10);
aper_hi = read_pci_config(bus, slot, func, 0x14);
aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
/*
* On some sick chips, APSIZE is 0. It means it wants 4G
* so let double check that order, and lets trust AMD NB settings:
*/
pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n",
bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1,
32 << old_order);
if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n",
bus, slot, func, 32 << *order, apsizereg);
*order = old_order;
}
pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n",
bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1,
32 << *order, apsizereg);
if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
return 0;
return (u32)aper;
}
/*
* Look for an AGP bridge. Windows only expects the aperture in the
* AGP bridge and some BIOS forget to initialize the Northbridge too.
* Work around this here.
*
* Do an PCI bus scan by hand because we're running before the PCI
* subsystem.
*
* All AMD AGP bridges are AGPv3 compliant, so we can do this scan
* generically. It's probably overkill to always scan all slots because
* the AGP bridges should be always an own bus on the HT hierarchy,
* but do it here for future safety.
*/
static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
{
int bus, slot, func;
/* Poor man's PCI discovery */
for (bus = 0; bus < 256; bus++) {
for (slot = 0; slot < 32; slot++) {
for (func = 0; func < 8; func++) {
u32 class, cap;
u8 type;
class = read_pci_config(bus, slot, func,
PCI_CLASS_REVISION);
if (class == 0xffffffff)
break;
switch (class >> 16) {
case PCI_CLASS_BRIDGE_HOST:
case PCI_CLASS_BRIDGE_OTHER: /* needed? */
/* AGP bridge? */
cap = find_cap(bus, slot, func,
PCI_CAP_ID_AGP);
if (!cap)
break;
*valid_agp = 1;
return read_agp(bus, slot, func, cap,
order);
}
/* No multi-function device? */
type = read_pci_config_byte(bus, slot, func,
PCI_HEADER_TYPE);
if (!(type & 0x80))
break;
}
}
}
pr_info("No AGP bridge found\n");
return 0;
}
static int gart_fix_e820 __initdata = 1;
static int __init parse_gart_mem(char *p)
{
if (!p)
return -EINVAL;
if (!strncmp(p, "off", 3))
gart_fix_e820 = 0;
else if (!strncmp(p, "on", 2))
gart_fix_e820 = 1;
return 0;
}
early_param("gart_fix_e820", parse_gart_mem);
void __init early_gart_iommu_check(void)
{
/*
* in case it is enabled before, esp for kexec/kdump,
* previous kernel already enable that. memset called
* by allocate_aperture/__alloc_bootmem_nopanic cause restart.
* or second kernel have different position for GART hole. and new
* kernel could use hole as RAM that is still used by GART set by
* first kernel
* or BIOS forget to put that in reserved.
* try to update e820 to make that region as reserved.
*/
u32 agp_aper_order = 0;
int i, fix, slot, valid_agp = 0;
u32 ctl;
u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base = 0, last_aper_base = 0;
int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
if (!early_pci_allowed())
return;
/* This is mostly duplicate of iommu_hole_init */
search_agp_bridge(&agp_aper_order, &valid_agp);
fix = 0;
for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
bus = amd_nb_bus_dev_ranges[i].bus;
dev_base = amd_nb_bus_dev_ranges[i].dev_base;
dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
aper_enabled = ctl & GARTEN;
aper_order = (ctl >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
aper_base <<= 25;
if (last_valid) {
if ((aper_order != last_aper_order) ||
(aper_base != last_aper_base) ||
(aper_enabled != last_aper_enabled)) {
fix = 1;
break;
}
}
last_aper_order = aper_order;
last_aper_base = aper_base;
last_aper_enabled = aper_enabled;
last_valid = 1;
}
}
if (!fix && !aper_enabled)
return;
if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL)
fix = 1;
if (gart_fix_e820 && !fix && aper_enabled) {
if (e820_any_mapped(aper_base, aper_base + aper_size,
E820_RAM)) {
/* reserve it, so we can reuse it in second kernel */
pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
aper_base, aper_base + aper_size - 1);
e820_add_region(aper_base, aper_size, E820_RESERVED);
update_e820();
}
}
if (valid_agp)
return;
/* disable them all at first */
for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
bus = amd_nb_bus_dev_ranges[i].bus;
dev_base = amd_nb_bus_dev_ranges[i].dev_base;
dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
ctl &= ~GARTEN;
write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
}
}
}
static int __initdata printed_gart_size_msg;
int __init gart_iommu_hole_init(void)
{
u32 agp_aper_base = 0, agp_aper_order = 0;
u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base, last_aper_base = 0;
int fix, slot, valid_agp = 0;
int i, node;
if (gart_iommu_aperture_disabled || !fix_aperture ||
!early_pci_allowed())
return -ENODEV;
pr_info("Checking aperture...\n");
if (!fallback_aper_force)
agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
fix = 0;
node = 0;
for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
u32 ctl;
bus = amd_nb_bus_dev_ranges[i].bus;
dev_base = amd_nb_bus_dev_ranges[i].dev_base;
dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
iommu_detected = 1;
gart_iommu_aperture = 1;
x86_init.iommu.iommu_init = gart_iommu_init;
ctl = read_pci_config(bus, slot, 3,
AMD64_GARTAPERTURECTL);
/*
* Before we do anything else disable the GART. It may
* still be enabled if we boot into a crash-kernel here.
* Reconfiguring the GART while it is enabled could have
* unknown side-effects.
*/
ctl &= ~GARTEN;
write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
aper_order = (ctl >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
aper_base <<= 25;
pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n",
node, aper_base, aper_base + aper_size - 1,
aper_size >> 20);
node++;
if (!aperture_valid(aper_base, aper_size, 64<<20)) {
if (valid_agp && agp_aper_base &&
agp_aper_base == aper_base &&
agp_aper_order == aper_order) {
/* the same between two setting from NB and agp */
if (!no_iommu &&
max_pfn > MAX_DMA32_PFN &&
!printed_gart_size_msg) {
pr_err("you are using iommu with agp, but GART size is less than 64MB\n");
pr_err("please increase GART size in your BIOS setup\n");
pr_err("if BIOS doesn't have that option, contact your HW vendor!\n");
printed_gart_size_msg = 1;
}
} else {
fix = 1;
goto out;
}
}
if ((last_aper_order && aper_order != last_aper_order) ||
(last_aper_base && aper_base != last_aper_base)) {
fix = 1;
goto out;
}
last_aper_order = aper_order;
last_aper_base = aper_base;
}
}
out:
if (!fix && !fallback_aper_force) {
if (last_aper_base)
return 1;
return 0;
}
if (!fallback_aper_force) {
aper_alloc = agp_aper_base;
aper_order = agp_aper_order;
}
if (aper_alloc) {
/* Got the aperture from the AGP bridge */
} else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
force_iommu ||
valid_agp ||
fallback_aper_force) {
pr_info("Your BIOS doesn't leave a aperture memory hole\n");
pr_info("Please enable the IOMMU option in the BIOS setup\n");
pr_info("This costs you %dMB of RAM\n",
32 << fallback_aper_order);
aper_order = fallback_aper_order;
aper_alloc = allocate_aperture();
if (!aper_alloc) {
/*
* Could disable AGP and IOMMU here, but it's
* probably not worth it. But the later users
* cannot deal with bad apertures and turning
* on the aperture over memory causes very
* strange problems, so it's better to panic
* early.
*/
panic("Not enough memory for aperture");
}
} else {
return 0;
}
/* Fix up the north bridges */
for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus, dev_base, dev_limit;
/*
* Don't enable translation yet but enable GART IO and CPU
* accesses and set DISTLBWALKPRB since GART table memory is UC.
*/
u32 ctl = aper_order << 1;
bus = amd_nb_bus_dev_ranges[i].bus;
dev_base = amd_nb_bus_dev_ranges[i].dev_base;
dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
}
}
set_up_gart_resume(aper_order, aper_alloc);
return 1;
}

View file

@ -0,0 +1,24 @@
#
# Makefile for local APIC drivers and for the IO-APIC code
#
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o
obj-y += hw_nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
ifeq ($(CONFIG_X86_64),y)
# APIC probe will depend on the listing order here
obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
obj-y += apic_flat_64.o
endif
# APIC probe will depend on the listing order here
obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
# For 32bit, probe_32 need to be listed last
obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o

2578
arch/x86/kernel/apic/apic.c Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,317 @@
/*
* Copyright 2004 James Cleverdon, IBM.
* Subject to the GNU Public License, v.2
*
* Flat APIC subarch code.
*
* Hacked for x86-64 by James Cleverdon from i386 architecture code by
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
#include <linux/errno.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <asm/smp.h>
#include <asm/apic.h>
#include <asm/ipi.h>
#include <linux/acpi.h>
static struct apic apic_physflat;
static struct apic apic_flat;
struct apic __read_mostly *apic = &apic_flat;
EXPORT_SYMBOL_GPL(apic);
static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
return 1;
}
/*
* Set up the logical destination ID.
*
* Intel recommends to set DFR, LDR and TPR before enabling
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
* document number 292116). So here it goes...
*/
void flat_init_apic_ldr(void)
{
unsigned long val;
unsigned long num, id;
num = smp_processor_id();
id = 1UL << num;
apic_write(APIC_DFR, APIC_DFR_FLAT);
val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
val |= SET_APIC_LOGICAL_ID(id);
apic_write(APIC_LDR, val);
}
static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
{
unsigned long flags;
local_irq_save(flags);
__default_send_IPI_dest_field(mask, vector, apic->dest_logical);
local_irq_restore(flags);
}
static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
{
unsigned long mask = cpumask_bits(cpumask)[0];
_flat_send_IPI_mask(mask, vector);
}
static void
flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
{
unsigned long mask = cpumask_bits(cpumask)[0];
int cpu = smp_processor_id();
if (cpu < BITS_PER_LONG)
clear_bit(cpu, &mask);
_flat_send_IPI_mask(mask, vector);
}
static void flat_send_IPI_allbutself(int vector)
{
int cpu = smp_processor_id();
#ifdef CONFIG_HOTPLUG_CPU
int hotplug = 1;
#else
int hotplug = 0;
#endif
if (hotplug || vector == NMI_VECTOR) {
if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
unsigned long mask = cpumask_bits(cpu_online_mask)[0];
if (cpu < BITS_PER_LONG)
clear_bit(cpu, &mask);
_flat_send_IPI_mask(mask, vector);
}
} else if (num_online_cpus() > 1) {
__default_send_IPI_shortcut(APIC_DEST_ALLBUT,
vector, apic->dest_logical);
}
}
static void flat_send_IPI_all(int vector)
{
if (vector == NMI_VECTOR) {
flat_send_IPI_mask(cpu_online_mask, vector);
} else {
__default_send_IPI_shortcut(APIC_DEST_ALLINC,
vector, apic->dest_logical);
}
}
static unsigned int flat_get_apic_id(unsigned long x)
{
unsigned int id;
id = (((x)>>24) & 0xFFu);
return id;
}
static unsigned long set_apic_id(unsigned int id)
{
unsigned long x;
x = ((id & 0xFFu)<<24);
return x;
}
static unsigned int read_xapic_id(void)
{
unsigned int id;
id = flat_get_apic_id(apic_read(APIC_ID));
return id;
}
static int flat_apic_id_registered(void)
{
return physid_isset(read_xapic_id(), phys_cpu_present_map);
}
static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
{
return initial_apic_id >> index_msb;
}
static int flat_probe(void)
{
return 1;
}
static struct apic apic_flat = {
.name = "flat",
.probe = flat_probe,
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
.irq_dest_mode = 1, /* logical */
.target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
.vector_allocation_domain = flat_vector_allocation_domain,
.init_apic_ldr = flat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = flat_phys_pkg_id,
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
.apic_id_mask = 0xFFu << 24,
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
.send_IPI_mask = flat_send_IPI_mask,
.send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
.send_IPI_allbutself = flat_send_IPI_allbutself,
.send_IPI_all = flat_send_IPI_all,
.send_IPI_self = apic_send_IPI_self,
.wait_for_init_deassert = false,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
.eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
};
/*
* Physflat mode is used when there are more than 8 CPUs on a system.
* We cannot use logical delivery in this case because the mask
* overflows, so use physical mode.
*/
static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
#ifdef CONFIG_ACPI
/*
* Quirk: some x86_64 machines can only use physical APIC mode
* regardless of how many processors are present (x86_64 ES7000
* is an example).
*/
if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
printk(KERN_DEBUG "system APIC only can use physical flat");
return 1;
}
if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
return 1;
}
#endif
return 0;
}
static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
{
default_send_IPI_mask_sequence_phys(cpumask, vector);
}
static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
int vector)
{
default_send_IPI_mask_allbutself_phys(cpumask, vector);
}
static void physflat_send_IPI_allbutself(int vector)
{
default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
}
static void physflat_send_IPI_all(int vector)
{
physflat_send_IPI_mask(cpu_online_mask, vector);
}
static int physflat_probe(void)
{
if (apic == &apic_physflat || num_possible_cpus() > 8)
return 1;
return 0;
}
static struct apic apic_physflat = {
.name = "physical flat",
.probe = physflat_probe,
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* physical */
.target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
.vector_allocation_domain = default_vector_allocation_domain,
/* not needed, but shouldn't hurt: */
.init_apic_ldr = flat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = flat_phys_pkg_id,
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
.apic_id_mask = 0xFFu << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
.send_IPI_mask = physflat_send_IPI_mask,
.send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
.send_IPI_allbutself = physflat_send_IPI_allbutself,
.send_IPI_all = physflat_send_IPI_all,
.send_IPI_self = apic_send_IPI_self,
.wait_for_init_deassert = false,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
.eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
};
/*
* We need to check for physflat first, so this order is important.
*/
apic_drivers(apic_physflat, apic_flat);

View file

@ -0,0 +1,169 @@
/*
* NOOP APIC driver.
*
* Does almost nothing and should be substituted by a real apic driver via
* probe routine.
*
* Though in case if apic is disabled (for some reason) we try
* to not uglify the caller's code and allow to call (some) apic routines
* like self-ipi, etc...
*/
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <asm/fixmap.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/apic.h>
#include <asm/setup.h>
#include <linux/smp.h>
#include <asm/ipi.h>
#include <linux/interrupt.h>
#include <asm/acpi.h>
#include <asm/e820.h>
static void noop_init_apic_ldr(void) { }
static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_allbutself(int vector) { }
static void noop_send_IPI_all(int vector) { }
static void noop_send_IPI_self(int vector) { }
static void noop_apic_wait_icr_idle(void) { }
static void noop_apic_icr_write(u32 low, u32 id) { }
static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
{
return -1;
}
static u32 noop_safe_apic_wait_icr_idle(void)
{
return 0;
}
static u64 noop_apic_icr_read(void)
{
return 0;
}
static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
{
return 0;
}
static unsigned int noop_get_apic_id(unsigned long x)
{
return 0;
}
static int noop_probe(void)
{
/*
* NOOP apic should not ever be
* enabled via probe routine
*/
return 0;
}
static int noop_apic_id_registered(void)
{
/*
* if we would be really "pedantic"
* we should pass read_apic_id() here
* but since NOOP suppose APIC ID = 0
* lets save a few cycles
*/
return physid_isset(0, phys_cpu_present_map);
}
static const struct cpumask *noop_target_cpus(void)
{
/* only BSP here */
return cpumask_of(0);
}
static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
const struct cpumask *mask)
{
if (cpu != 0)
pr_warning("APIC: Vector allocated for non-BSP cpu\n");
cpumask_copy(retmask, cpumask_of(cpu));
}
static u32 noop_apic_read(u32 reg)
{
WARN_ON_ONCE((cpu_has_apic && !disable_apic));
return 0;
}
static void noop_apic_write(u32 reg, u32 v)
{
WARN_ON_ONCE(cpu_has_apic && !disable_apic);
}
struct apic apic_noop = {
.name = "noop",
.probe = noop_probe,
.acpi_madt_oem_check = NULL,
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = noop_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
/* logical delivery broadcast to all CPUs: */
.irq_dest_mode = 1,
.target_cpus = noop_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = default_check_apicid_used,
.vector_allocation_domain = noop_vector_allocation_domain,
.init_apic_ldr = noop_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = physid_set_mask_of_physid,
.check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = noop_phys_pkg_id,
.get_apic_id = noop_get_apic_id,
.set_apic_id = NULL,
.apic_id_mask = 0x0F << 24,
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
.send_IPI_mask = noop_send_IPI_mask,
.send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
.send_IPI_allbutself = noop_send_IPI_allbutself,
.send_IPI_all = noop_send_IPI_all,
.send_IPI_self = noop_send_IPI_self,
.wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
.wait_for_init_deassert = false,
.inquire_remote_apic = NULL,
.read = noop_apic_read,
.write = noop_apic_write,
.eoi_write = noop_apic_write,
.icr_read = noop_apic_icr_read,
.icr_write = noop_apic_icr_write,
.wait_icr_idle = noop_apic_wait_icr_idle,
.safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
#ifdef CONFIG_X86_32
.x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
#endif
};

View file

@ -0,0 +1,256 @@
/*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
* Numascale NumaConnect-Specific APIC Code
*
* Copyright (C) 2011 Numascale AS. All rights reserved.
*
* Send feedback to <support@numascale.com>
*
*/
#include <linux/errno.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <asm/numachip/numachip.h>
#include <asm/numachip/numachip_csr.h>
#include <asm/smp.h>
#include <asm/apic.h>
#include <asm/ipi.h>
#include <asm/apic_flat_64.h>
#include <asm/pgtable.h>
static int numachip_system __read_mostly;
static const struct apic apic_numachip;
static unsigned int get_apic_id(unsigned long x)
{
unsigned long value;
unsigned int id;
rdmsrl(MSR_FAM10H_NODE_ID, value);
id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U);
return id;
}
static unsigned long set_apic_id(unsigned int id)
{
unsigned long x;
x = ((id & 0xffU) << 24);
return x;
}
static unsigned int read_xapic_id(void)
{
return get_apic_id(apic_read(APIC_ID));
}
static int numachip_apic_id_valid(int apicid)
{
/* Trust what bootloader passes in MADT */
return 1;
}
static int numachip_apic_id_registered(void)
{
return physid_isset(read_xapic_id(), phys_cpu_present_map);
}
static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
{
return initial_apic_id >> index_msb;
}
static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
{
union numachip_csr_g3_ext_irq_gen int_gen;
int_gen.s._destination_apic_id = phys_apicid;
int_gen.s._vector = 0;
int_gen.s._msgtype = APIC_DM_INIT >> 8;
int_gen.s._index = 0;
write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
int_gen.s._msgtype = APIC_DM_STARTUP >> 8;
int_gen.s._vector = start_rip >> 12;
write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
atomic_set(&init_deasserted, 1);
return 0;
}
static void numachip_send_IPI_one(int cpu, int vector)
{
union numachip_csr_g3_ext_irq_gen int_gen;
int apicid = per_cpu(x86_cpu_to_apicid, cpu);
int_gen.s._destination_apic_id = apicid;
int_gen.s._vector = vector;
int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8;
int_gen.s._index = 0;
write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
}
static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
{
unsigned int cpu;
for_each_cpu(cpu, mask)
numachip_send_IPI_one(cpu, vector);
}
static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
int vector)
{
unsigned int this_cpu = smp_processor_id();
unsigned int cpu;
for_each_cpu(cpu, mask) {
if (cpu != this_cpu)
numachip_send_IPI_one(cpu, vector);
}
}
static void numachip_send_IPI_allbutself(int vector)
{
unsigned int this_cpu = smp_processor_id();
unsigned int cpu;
for_each_online_cpu(cpu) {
if (cpu != this_cpu)
numachip_send_IPI_one(cpu, vector);
}
}
static void numachip_send_IPI_all(int vector)
{
numachip_send_IPI_mask(cpu_online_mask, vector);
}
static void numachip_send_IPI_self(int vector)
{
__default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
}
static int __init numachip_probe(void)
{
return apic == &apic_numachip;
}
static void __init map_csrs(void)
{
printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
}
static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
{
if (c->phys_proc_id != node) {
c->phys_proc_id = node;
per_cpu(cpu_llc_id, smp_processor_id()) = node;
}
}
static int __init numachip_system_init(void)
{
unsigned int val;
if (!numachip_system)
return 0;
x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
x86_init.pci.arch_init = pci_numachip_init;
map_csrs();
val = read_lcsr(CSR_G0_NODE_IDS);
printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
return 0;
}
early_initcall(numachip_system_init);
static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
if (!strncmp(oem_id, "NUMASC", 6)) {
numachip_system = 1;
return 1;
}
return 0;
}
static const struct apic apic_numachip __refconst = {
.name = "NumaConnect system",
.probe = numachip_probe,
.acpi_madt_oem_check = numachip_acpi_madt_oem_check,
.apic_id_valid = numachip_apic_id_valid,
.apic_id_registered = numachip_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* physical */
.target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
.vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = flat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = numachip_phys_pkg_id,
.get_apic_id = get_apic_id,
.set_apic_id = set_apic_id,
.apic_id_mask = 0xffU << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
.send_IPI_mask = numachip_send_IPI_mask,
.send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
.send_IPI_allbutself = numachip_send_IPI_allbutself,
.send_IPI_all = numachip_send_IPI_all,
.send_IPI_self = numachip_send_IPI_self,
.wakeup_secondary_cpu = numachip_wakeup_secondary,
.wait_for_init_deassert = false,
.inquire_remote_apic = NULL, /* REMRD not supported */
.read = native_apic_mem_read,
.write = native_apic_mem_write,
.eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
};
apic_driver(apic_numachip);

View file

@ -0,0 +1,223 @@
/*
* APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
*
* Drives the local APIC in "clustered mode".
*/
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/dmi.h>
#include <linux/smp.h>
#include <asm/apicdef.h>
#include <asm/fixmap.h>
#include <asm/mpspec.h>
#include <asm/apic.h>
#include <asm/ipi.h>
static unsigned bigsmp_get_apic_id(unsigned long x)
{
return (x >> 24) & 0xFF;
}
static int bigsmp_apic_id_registered(void)
{
return 1;
}
static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
{
return 0;
}
static int bigsmp_early_logical_apicid(int cpu)
{
/* on bigsmp, logical apicid is the same as physical */
return early_per_cpu(x86_cpu_to_apicid, cpu);
}
static inline unsigned long calculate_ldr(int cpu)
{
unsigned long val, id;
val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
id = per_cpu(x86_bios_cpu_apicid, cpu);
val |= SET_APIC_LOGICAL_ID(id);
return val;
}
/*
* Set up the logical destination ID.
*
* Intel recommends to set DFR, LDR and TPR before enabling
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
* document number 292116). So here it goes...
*/
static void bigsmp_init_apic_ldr(void)
{
unsigned long val;
int cpu = smp_processor_id();
apic_write(APIC_DFR, APIC_DFR_FLAT);
val = calculate_ldr(cpu);
apic_write(APIC_LDR, val);
}
static void bigsmp_setup_apic_routing(void)
{
printk(KERN_INFO
"Enabling APIC mode: Physflat. Using %d I/O APICs\n",
nr_ioapics);
}
static int bigsmp_cpu_present_to_apicid(int mps_cpu)
{
if (mps_cpu < nr_cpu_ids)
return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
return BAD_APICID;
}
static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
/* For clustered we don't have a good way to do this yet - hack */
physids_promote(0xFFL, retmap);
}
static int bigsmp_check_phys_apicid_present(int phys_apicid)
{
return 1;
}
static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
{
return cpuid_apic >> index_msb;
}
static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
{
default_send_IPI_mask_sequence_phys(mask, vector);
}
static void bigsmp_send_IPI_allbutself(int vector)
{
default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
}
static void bigsmp_send_IPI_all(int vector)
{
bigsmp_send_IPI_mask(cpu_online_mask, vector);
}
static int dmi_bigsmp; /* can be set by dmi scanners */
static int hp_ht_bigsmp(const struct dmi_system_id *d)
{
printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
dmi_bigsmp = 1;
return 0;
}
static const struct dmi_system_id bigsmp_dmi_table[] = {
{ hp_ht_bigsmp, "HP ProLiant DL760 G2",
{ DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
}
},
{ hp_ht_bigsmp, "HP ProLiant DL740",
{ DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
}
},
{ } /* NULL entry stops DMI scanning */
};
static int probe_bigsmp(void)
{
if (def_to_bigsmp)
dmi_bigsmp = 1;
else
dmi_check_system(bigsmp_dmi_table);
return dmi_bigsmp;
}
static struct apic apic_bigsmp = {
.name = "bigsmp",
.probe = probe_bigsmp,
.acpi_madt_oem_check = NULL,
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = bigsmp_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
/* phys delivery to target CPU: */
.irq_dest_mode = 0,
.target_cpus = default_target_cpus,
.disable_esr = 1,
.dest_logical = 0,
.check_apicid_used = bigsmp_check_apicid_used,
.vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = bigsmp_init_apic_ldr,
.ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
.setup_apic_routing = bigsmp_setup_apic_routing,
.cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
.apicid_to_cpu_present = physid_set_mask_of_physid,
.check_phys_apicid_present = bigsmp_check_phys_apicid_present,
.phys_pkg_id = bigsmp_phys_pkg_id,
.get_apic_id = bigsmp_get_apic_id,
.set_apic_id = NULL,
.apic_id_mask = 0xFF << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
.send_IPI_mask = bigsmp_send_IPI_mask,
.send_IPI_mask_allbutself = NULL,
.send_IPI_allbutself = bigsmp_send_IPI_allbutself,
.send_IPI_all = bigsmp_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
.wait_for_init_deassert = true,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
.eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
};
void __init generic_bigsmp_probe(void)
{
unsigned int cpu;
if (!probe_bigsmp())
return;
apic = &apic_bigsmp;
for_each_possible_cpu(cpu) {
if (early_per_cpu(x86_cpu_to_logical_apicid,
cpu) == BAD_APICID)
continue;
early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
bigsmp_early_logical_apicid(cpu);
}
pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name);
}
apic_driver(apic_bigsmp);

View file

@ -0,0 +1,102 @@
/*
* HW NMI watchdog support
*
* started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
*
* Arch specific calls to support NMI watchdog
*
* Bits copied from original nmi.c file
*
*/
#include <asm/apic.h>
#include <asm/nmi.h>
#include <linux/cpumask.h>
#include <linux/kdebug.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/delay.h>
#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
return (u64)(cpu_khz) * 1000 * watchdog_thresh;
}
#endif
#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
void arch_trigger_all_cpu_backtrace(bool include_self)
{
int i;
int cpu = get_cpu();
if (test_and_set_bit(0, &backtrace_flag)) {
/*
* If there is already a trigger_all_cpu_backtrace() in progress
* (backtrace_flag == 1), don't output double cpu dump infos.
*/
put_cpu();
return;
}
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
if (!include_self)
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
pr_info("sending NMI to %s CPUs:\n",
(include_self ? "all" : "other"));
apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
}
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
if (cpumask_empty(to_cpumask(backtrace_mask)))
break;
mdelay(1);
touch_softlockup_watchdog();
}
clear_bit(0, &backtrace_flag);
smp_mb__after_atomic();
put_cpu();
}
static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
int cpu;
cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
arch_spin_lock(&lock);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
show_regs(regs);
arch_spin_unlock(&lock);
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return NMI_HANDLED;
}
return NMI_DONE;
}
NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);
static int __init register_trigger_all_cpu_backtrace(void)
{
register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler,
0, "arch_bt");
return 0;
}
early_initcall(register_trigger_all_cpu_backtrace);
#endif

File diff suppressed because it is too large Load diff

166
arch/x86/kernel/apic/ipi.c Normal file
View file

@ -0,0 +1,166 @@
#include <linux/cpumask.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/spinlock.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <asm/smp.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/apic.h>
#include <asm/proto.h>
#include <asm/ipi.h>
void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
{
unsigned long query_cpu;
unsigned long flags;
/*
* Hack. The clustered APIC addressing mode doesn't allow us to send
* to an arbitrary mask, so I do a unicast to each CPU instead.
* - mbligh
*/
local_irq_save(flags);
for_each_cpu(query_cpu, mask) {
__default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
query_cpu), vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
int vector)
{
unsigned int this_cpu = smp_processor_id();
unsigned int query_cpu;
unsigned long flags;
/* See Hack comment above */
local_irq_save(flags);
for_each_cpu(query_cpu, mask) {
if (query_cpu == this_cpu)
continue;
__default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
query_cpu), vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
#ifdef CONFIG_X86_32
void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
int vector)
{
unsigned long flags;
unsigned int query_cpu;
/*
* Hack. The clustered APIC addressing mode doesn't allow us to send
* to an arbitrary mask, so I do a unicasts to each CPU instead. This
* should be modified to do 1 message per cluster ID - mbligh
*/
local_irq_save(flags);
for_each_cpu(query_cpu, mask)
__default_send_IPI_dest_field(
early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
vector, apic->dest_logical);
local_irq_restore(flags);
}
void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
int vector)
{
unsigned long flags;
unsigned int query_cpu;
unsigned int this_cpu = smp_processor_id();
/* See Hack comment above */
local_irq_save(flags);
for_each_cpu(query_cpu, mask) {
if (query_cpu == this_cpu)
continue;
__default_send_IPI_dest_field(
early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
vector, apic->dest_logical);
}
local_irq_restore(flags);
}
/*
* This is only used on smaller machines.
*/
void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
{
unsigned long mask = cpumask_bits(cpumask)[0];
unsigned long flags;
if (!mask)
return;
local_irq_save(flags);
WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
__default_send_IPI_dest_field(mask, vector, apic->dest_logical);
local_irq_restore(flags);
}
void default_send_IPI_allbutself(int vector)
{
/*
* if there are no other CPUs in the system then we get an APIC send
* error if we try to broadcast, thus avoid sending IPIs in this case.
*/
if (!(num_online_cpus() > 1))
return;
__default_local_send_IPI_allbutself(vector);
}
void default_send_IPI_all(int vector)
{
__default_local_send_IPI_all(vector);
}
void default_send_IPI_self(int vector)
{
__default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
}
/* must come after the send_IPI functions above for inlining */
static int convert_apicid_to_cpu(int apic_id)
{
int i;
for_each_possible_cpu(i) {
if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
return i;
}
return -1;
}
int safe_smp_processor_id(void)
{
int apicid, cpuid;
if (!cpu_has_apic)
return 0;
apicid = hard_smp_processor_id();
if (apicid == BAD_APICID)
return 0;
cpuid = convert_apicid_to_cpu(apicid);
return cpuid >= 0 ? cpuid : 0;
}
#endif

View file

@ -0,0 +1,227 @@
/*
* Default generic APIC driver. This handles up to 8 CPUs.
*
* Copyright 2003 Andi Kleen, SuSE Labs.
* Subject to the GNU Public License, v.2
*
* Generic x86 APIC driver probe layer.
*/
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <asm/fixmap.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/apic.h>
#include <asm/setup.h>
#include <linux/smp.h>
#include <asm/ipi.h>
#include <linux/interrupt.h>
#include <asm/acpi.h>
#include <asm/e820.h>
#ifdef CONFIG_HOTPLUG_CPU
#define DEFAULT_SEND_IPI (1)
#else
#define DEFAULT_SEND_IPI (0)
#endif
int no_broadcast = DEFAULT_SEND_IPI;
static __init int no_ipi_broadcast(char *str)
{
get_option(&str, &no_broadcast);
pr_info("Using %s mode\n",
no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
return 1;
}
__setup("no_ipi_broadcast=", no_ipi_broadcast);
static int __init print_ipi_mode(void)
{
pr_info("Using IPI %s mode\n",
no_broadcast ? "No-Shortcut" : "Shortcut");
return 0;
}
late_initcall(print_ipi_mode);
static int default_x86_32_early_logical_apicid(int cpu)
{
return 1 << cpu;
}
static void setup_apic_flat_routing(void)
{
#ifdef CONFIG_X86_IO_APIC
printk(KERN_INFO
"Enabling APIC mode: Flat. Using %d I/O APICs\n",
nr_ioapics);
#endif
}
/* should be called last. */
static int probe_default(void)
{
return 1;
}
static struct apic apic_default = {
.name = "default",
.probe = probe_default,
.acpi_madt_oem_check = NULL,
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = default_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
/* logical delivery broadcast to all CPUs: */
.irq_dest_mode = 1,
.target_cpus = default_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = default_check_apicid_used,
.vector_allocation_domain = flat_vector_allocation_domain,
.init_apic_ldr = default_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
.setup_apic_routing = setup_apic_flat_routing,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = physid_set_mask_of_physid,
.check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = default_phys_pkg_id,
.get_apic_id = default_get_apic_id,
.set_apic_id = NULL,
.apic_id_mask = 0x0F << 24,
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
.send_IPI_mask = default_send_IPI_mask_logical,
.send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
.send_IPI_allbutself = default_send_IPI_allbutself,
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
.wait_for_init_deassert = true,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
.eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
};
apic_driver(apic_default);
struct apic *apic = &apic_default;
EXPORT_SYMBOL_GPL(apic);
static int cmdline_apic __initdata;
static int __init parse_apic(char *arg)
{
struct apic **drv;
if (!arg)
return -EINVAL;
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if (!strcmp((*drv)->name, arg)) {
apic = *drv;
cmdline_apic = 1;
return 0;
}
}
/* Parsed again by __setup for debug/verbose */
return 0;
}
early_param("apic", parse_apic);
void __init default_setup_apic_routing(void)
{
int version = apic_version[boot_cpu_physical_apicid];
if (num_possible_cpus() > 8) {
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
if (!APIC_XAPIC(version)) {
def_to_bigsmp = 0;
break;
}
/* If P4 and above fall through */
case X86_VENDOR_AMD:
def_to_bigsmp = 1;
}
}
#ifdef CONFIG_X86_BIGSMP
/*
* This is used to switch to bigsmp mode when
* - There is no apic= option specified by the user
* - generic_apic_probe() has chosen apic_default as the sub_arch
* - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
*/
if (!cmdline_apic && apic == &apic_default)
generic_bigsmp_probe();
#endif
if (apic->setup_apic_routing)
apic->setup_apic_routing();
if (x86_platform.apic_post_init)
x86_platform.apic_post_init();
}
void __init generic_apic_probe(void)
{
if (!cmdline_apic) {
struct apic **drv;
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if ((*drv)->probe()) {
apic = *drv;
break;
}
}
/* Not visible without early console */
if (drv == __apicdrivers_end)
panic("Didn't find an APIC driver");
}
printk(KERN_INFO "Using APIC driver %s\n", apic->name);
}
/* This function can switch the APIC even after the initial ->probe() */
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
struct apic **drv;
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if (!(*drv)->acpi_madt_oem_check)
continue;
if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
continue;
if (!cmdline_apic) {
apic = *drv;
printk(KERN_INFO "Switched to APIC driver `%s'.\n",
apic->name);
}
return 1;
}
return 0;
}

View file

@ -0,0 +1,72 @@
/*
* Copyright 2004 James Cleverdon, IBM.
* Subject to the GNU Public License, v.2
*
* Generic APIC sub-arch probe layer.
*
* Hacked for x86-64 by James Cleverdon from i386 architecture code by
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/hardirq.h>
#include <linux/dmar.h>
#include <asm/smp.h>
#include <asm/apic.h>
#include <asm/ipi.h>
#include <asm/setup.h>
/*
* Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
*/
void __init default_setup_apic_routing(void)
{
struct apic **drv;
enable_IR_x2apic();
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if ((*drv)->probe && (*drv)->probe()) {
if (apic != *drv) {
apic = *drv;
pr_info("Switched APIC routing to %s.\n",
apic->name);
}
break;
}
}
if (x86_platform.apic_post_init)
x86_platform.apic_post_init();
}
/* Same for both flat and physical. */
void apic_send_IPI_self(int vector)
{
__default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
}
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
struct apic **drv;
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
if (apic != *drv) {
apic = *drv;
pr_info("Setting APIC routing to %s.\n",
apic->name);
}
return 1;
}
}
return 0;
}

View file

@ -0,0 +1,287 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/dmar.h>
#include <linux/cpu.h>
#include <asm/smp.h>
#include <asm/x2apic.h>
static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
return x2apic_enabled();
}
static inline u32 x2apic_cluster(int cpu)
{
return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
}
static void
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
struct cpumask *cpus_in_cluster_ptr;
struct cpumask *ipi_mask_ptr;
unsigned int cpu, this_cpu;
unsigned long flags;
u32 dest;
x2apic_wrmsr_fence();
local_irq_save(flags);
this_cpu = smp_processor_id();
/*
* We are to modify mask, so we need an own copy
* and be sure it's manipulated with irq off.
*/
ipi_mask_ptr = this_cpu_cpumask_var_ptr(ipi_mask);
cpumask_copy(ipi_mask_ptr, mask);
/*
* The idea is to send one IPI per cluster.
*/
for_each_cpu(cpu, ipi_mask_ptr) {
unsigned long i;
cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
dest = 0;
/* Collect cpus in cluster. */
for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
dest |= per_cpu(x86_cpu_to_logical_apicid, i);
}
if (!dest)
continue;
__x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
/*
* Cluster sibling cpus should be discared now so
* we would not send IPI them second time.
*/
cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
}
local_irq_restore(flags);
}
static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
}
static void
x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_allbutself(int vector)
{
__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_all(int vector)
{
__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}
static int
x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
const struct cpumask *andmask,
unsigned int *apicid)
{
u32 dest = 0;
u16 cluster;
int i;
for_each_cpu_and(i, cpumask, andmask) {
if (!cpumask_test_cpu(i, cpu_online_mask))
continue;
dest = per_cpu(x86_cpu_to_logical_apicid, i);
cluster = x2apic_cluster(i);
break;
}
if (!dest)
return -EINVAL;
for_each_cpu_and(i, cpumask, andmask) {
if (!cpumask_test_cpu(i, cpu_online_mask))
continue;
if (cluster != x2apic_cluster(i))
continue;
dest |= per_cpu(x86_cpu_to_logical_apicid, i);
}
*apicid = dest;
return 0;
}
static void init_x2apic_ldr(void)
{
unsigned int this_cpu = smp_processor_id();
unsigned int cpu;
per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
__cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
for_each_online_cpu(cpu) {
if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
continue;
__cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
__cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
}
}
/*
* At CPU state changes, update the x2apic cluster sibling info.
*/
static int
update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int this_cpu = (unsigned long)hcpu;
unsigned int cpu;
int err = 0;
switch (action) {
case CPU_UP_PREPARE:
if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
GFP_KERNEL)) {
err = -ENOMEM;
} else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
GFP_KERNEL)) {
free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
err = -ENOMEM;
}
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
for_each_online_cpu(cpu) {
if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
continue;
__cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
__cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
}
free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
free_cpumask_var(per_cpu(ipi_mask, this_cpu));
break;
}
return notifier_from_errno(err);
}
static struct notifier_block __refdata x2apic_cpu_notifier = {
.notifier_call = update_clusterinfo,
};
static int x2apic_init_cpu_notifier(void)
{
int cpu = smp_processor_id();
zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
__cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
register_hotcpu_notifier(&x2apic_cpu_notifier);
return 1;
}
static int x2apic_cluster_probe(void)
{
if (x2apic_mode)
return x2apic_init_cpu_notifier();
else
return 0;
}
static const struct cpumask *x2apic_cluster_target_cpus(void)
{
return cpu_all_mask;
}
/*
* Each x2apic cluster is an allocation domain.
*/
static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
const struct cpumask *mask)
{
/*
* To minimize vector pressure, default case of boot, device bringup
* etc will use a single cpu for the interrupt destination.
*
* On explicit migration requests coming from irqbalance etc,
* interrupts will be routed to the x2apic cluster (cluster-id
* derived from the first cpu in the mask) members specified
* in the mask.
*/
if (mask == x2apic_cluster_target_cpus())
cpumask_copy(retmask, cpumask_of(cpu));
else
cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
}
static struct apic apic_x2apic_cluster = {
.name = "cluster x2apic",
.probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
.apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
.irq_dest_mode = 1, /* logical */
.target_cpus = x2apic_cluster_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
.vector_allocation_domain = cluster_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = x2apic_phys_pkg_id,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
.apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
.wait_for_init_deassert = false,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
.eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
apic_driver(apic_x2apic_cluster);

View file

@ -0,0 +1,141 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/dmar.h>
#include <asm/smp.h>
#include <asm/x2apic.h>
int x2apic_phys;
static struct apic apic_x2apic_phys;
static int set_x2apic_phys_mode(char *arg)
{
x2apic_phys = 1;
return 0;
}
early_param("x2apic_phys", set_x2apic_phys_mode);
static bool x2apic_fadt_phys(void)
{
if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
printk(KERN_DEBUG "System requires x2apic physical mode\n");
return true;
}
return false;
}
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
}
static void
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
unsigned long query_cpu;
unsigned long this_cpu;
unsigned long flags;
x2apic_wrmsr_fence();
local_irq_save(flags);
this_cpu = smp_processor_id();
for_each_cpu(query_cpu, mask) {
if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
continue;
__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
}
static void
x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_allbutself(int vector)
{
__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_all(int vector)
{
__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}
static void init_x2apic_ldr(void)
{
}
static int x2apic_phys_probe(void)
{
if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys()))
return 1;
return apic == &apic_x2apic_phys;
}
static struct apic apic_x2apic_phys = {
.name = "physical x2apic",
.probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
.apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* physical */
.target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
.vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = x2apic_phys_pkg_id,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
.apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
.wait_for_init_deassert = false,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
.eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
apic_driver(apic_x2apic_phys);

File diff suppressed because it is too large Load diff

2449
arch/x86/kernel/apm_32.c Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,74 @@
/*
* Generate definitions needed by assembly language modules.
* This code generates raw asm output which is post-processed to extract
* and format the required data.
*/
#define COMPILE_OFFSETS
#include <linux/crypto.h>
#include <linux/sched.h>
#include <linux/stddef.h>
#include <linux/hardirq.h>
#include <linux/suspend.h>
#include <linux/kbuild.h>
#include <asm/processor.h>
#include <asm/thread_info.h>
#include <asm/sigframe.h>
#include <asm/bootparam.h>
#include <asm/suspend.h>
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
#endif
#ifdef CONFIG_X86_32
# include "asm-offsets_32.c"
#else
# include "asm-offsets_64.c"
#endif
void common(void) {
BLANK();
OFFSET(TI_flags, thread_info, flags);
OFFSET(TI_status, thread_info, status);
OFFSET(TI_addr_limit, thread_info, addr_limit);
BLANK();
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
BLANK();
OFFSET(pbe_address, pbe, address);
OFFSET(pbe_orig_address, pbe, orig_address);
OFFSET(pbe_next, pbe, next);
#ifdef CONFIG_PARAVIRT
BLANK();
OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
#endif
#ifdef CONFIG_XEN
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
#endif
BLANK();
OFFSET(BP_scratch, boot_params, scratch);
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
OFFSET(BP_pref_address, boot_params, hdr.pref_address);
OFFSET(BP_code32_start, boot_params, hdr.code32_start);
BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
}

View file

@ -0,0 +1,89 @@
#include <asm/ucontext.h>
#include <linux/lguest.h>
#include "../../../drivers/lguest/lg.h"
#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_32.h>
};
/* workaround for a warning with -Wmissing-prototypes */
void foo(void);
void foo(void)
{
OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx);
OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx);
OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx);
OFFSET(IA32_SIGCONTEXT_si, sigcontext, si);
OFFSET(IA32_SIGCONTEXT_di, sigcontext, di);
OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp);
OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp);
OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip);
BLANK();
OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
BLANK();
OFFSET(TI_sysenter_return, thread_info, sysenter_return);
OFFSET(TI_cpu, thread_info, cpu);
BLANK();
OFFSET(PT_EBX, pt_regs, bx);
OFFSET(PT_ECX, pt_regs, cx);
OFFSET(PT_EDX, pt_regs, dx);
OFFSET(PT_ESI, pt_regs, si);
OFFSET(PT_EDI, pt_regs, di);
OFFSET(PT_EBP, pt_regs, bp);
OFFSET(PT_EAX, pt_regs, ax);
OFFSET(PT_DS, pt_regs, ds);
OFFSET(PT_ES, pt_regs, es);
OFFSET(PT_FS, pt_regs, fs);
OFFSET(PT_GS, pt_regs, gs);
OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
OFFSET(PT_EIP, pt_regs, ip);
OFFSET(PT_CS, pt_regs, cs);
OFFSET(PT_EFLAGS, pt_regs, flags);
OFFSET(PT_OLDESP, pt_regs, sp);
OFFSET(PT_OLDSS, pt_regs, ss);
BLANK();
OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
BLANK();
OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
BLANK();
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
sizeof(struct tss_struct));
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
BLANK();
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
#endif
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
DEFINE(NR_syscalls, sizeof(syscalls));
}

View file

@ -0,0 +1,90 @@
#include <asm/ia32.h>
#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
#ifdef CONFIG_X86_X32_ABI
# define __SYSCALL_X32(nr, sym, compat) [nr] = 1,
#else
# define __SYSCALL_X32(nr, sym, compat) /* nothing */
#endif
static char syscalls_64[] = {
#include <asm/syscalls_64.h>
};
#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
static char syscalls_ia32[] = {
#include <asm/syscalls_32.h>
};
int main(void)
{
#ifdef CONFIG_PARAVIRT
OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
BLANK();
#endif
#ifdef CONFIG_IA32_EMULATION
OFFSET(TI_sysenter_return, thread_info, sysenter_return);
BLANK();
#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
ENTRY(ax);
ENTRY(bx);
ENTRY(cx);
ENTRY(dx);
ENTRY(si);
ENTRY(di);
ENTRY(bp);
ENTRY(sp);
ENTRY(ip);
BLANK();
#undef ENTRY
OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
BLANK();
#endif
#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
ENTRY(bx);
ENTRY(bx);
ENTRY(cx);
ENTRY(dx);
ENTRY(sp);
ENTRY(bp);
ENTRY(si);
ENTRY(di);
ENTRY(r8);
ENTRY(r9);
ENTRY(r10);
ENTRY(r11);
ENTRY(r12);
ENTRY(r13);
ENTRY(r14);
ENTRY(r15);
ENTRY(flags);
BLANK();
#undef ENTRY
#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
ENTRY(cr0);
ENTRY(cr2);
ENTRY(cr3);
ENTRY(cr4);
ENTRY(cr8);
ENTRY(gdt_desc);
BLANK();
#undef ENTRY
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
DEFINE(NR_syscalls, sizeof(syscalls_64));
DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
return 0;
}

View file

@ -0,0 +1,81 @@
#include <linux/init.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <asm/unistd.h>
static unsigned dir_class[] = {
#include <asm-generic/audit_dir_write.h>
~0U
};
static unsigned read_class[] = {
#include <asm-generic/audit_read.h>
~0U
};
static unsigned write_class[] = {
#include <asm-generic/audit_write.h>
~0U
};
static unsigned chattr_class[] = {
#include <asm-generic/audit_change_attr.h>
~0U
};
static unsigned signal_class[] = {
#include <asm-generic/audit_signal.h>
~0U
};
int audit_classify_arch(int arch)
{
#ifdef CONFIG_IA32_EMULATION
if (arch == AUDIT_ARCH_I386)
return 1;
#endif
return 0;
}
int audit_classify_syscall(int abi, unsigned syscall)
{
#ifdef CONFIG_IA32_EMULATION
extern int ia32_classify_syscall(unsigned);
if (abi == AUDIT_ARCH_I386)
return ia32_classify_syscall(syscall);
#endif
switch(syscall) {
case __NR_open:
return 2;
case __NR_openat:
return 3;
case __NR_execve:
return 5;
default:
return 0;
}
}
static int __init audit_classes_init(void)
{
#ifdef CONFIG_IA32_EMULATION
extern __u32 ia32_dir_class[];
extern __u32 ia32_write_class[];
extern __u32 ia32_read_class[];
extern __u32 ia32_chattr_class[];
extern __u32 ia32_signal_class[];
audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class);
#endif
audit_register_class(AUDIT_CLASS_WRITE, write_class);
audit_register_class(AUDIT_CLASS_READ, read_class);
audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
return 0;
}
__initcall(audit_classes_init);

101
arch/x86/kernel/bootflag.c Normal file
View file

@ -0,0 +1,101 @@
/*
* Implement 'Simple Boot Flag Specification 2.0'
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/acpi.h>
#include <asm/io.h>
#include <linux/mc146818rtc.h>
#define SBF_RESERVED (0x78)
#define SBF_PNPOS (1<<0)
#define SBF_BOOTING (1<<1)
#define SBF_DIAG (1<<2)
#define SBF_PARITY (1<<7)
int sbf_port __initdata = -1; /* set via acpi_boot_init() */
static int __init parity(u8 v)
{
int x = 0;
int i;
for (i = 0; i < 8; i++) {
x ^= (v & 1);
v >>= 1;
}
return x;
}
static void __init sbf_write(u8 v)
{
unsigned long flags;
if (sbf_port != -1) {
v &= ~SBF_PARITY;
if (!parity(v))
v |= SBF_PARITY;
printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
sbf_port, v);
spin_lock_irqsave(&rtc_lock, flags);
CMOS_WRITE(v, sbf_port);
spin_unlock_irqrestore(&rtc_lock, flags);
}
}
static u8 __init sbf_read(void)
{
unsigned long flags;
u8 v;
if (sbf_port == -1)
return 0;
spin_lock_irqsave(&rtc_lock, flags);
v = CMOS_READ(sbf_port);
spin_unlock_irqrestore(&rtc_lock, flags);
return v;
}
static int __init sbf_value_valid(u8 v)
{
if (v & SBF_RESERVED) /* Reserved bits */
return 0;
if (!parity(v))
return 0;
return 1;
}
static int __init sbf_init(void)
{
u8 v;
if (sbf_port == -1)
return 0;
v = sbf_read();
if (!sbf_value_valid(v)) {
printk(KERN_WARNING "Simple Boot Flag value 0x%x read from "
"CMOS RAM was invalid\n", v);
}
v &= ~SBF_RESERVED;
v &= ~SBF_BOOTING;
v &= ~SBF_DIAG;
#if defined(CONFIG_ISAPNP)
v |= SBF_PNPOS;
#endif
sbf_write(v);
return 0;
}
module_init(sbf_init);

167
arch/x86/kernel/check.c Normal file
View file

@ -0,0 +1,167 @@
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/workqueue.h>
#include <linux/memblock.h>
#include <asm/proto.h>
/*
* Some BIOSes seem to corrupt the low 64k of memory during events
* like suspend/resume and unplugging an HDMI cable. Reserve all
* remaining free memory in that area and fill it with a distinct
* pattern.
*/
#define MAX_SCAN_AREAS 8
static int __read_mostly memory_corruption_check = -1;
static unsigned __read_mostly corruption_check_size = 64*1024;
static unsigned __read_mostly corruption_check_period = 60; /* seconds */
static struct scan_area {
u64 addr;
u64 size;
} scan_areas[MAX_SCAN_AREAS];
static int num_scan_areas;
static __init int set_corruption_check(char *arg)
{
ssize_t ret;
unsigned long val;
ret = kstrtoul(arg, 10, &val);
if (ret)
return ret;
memory_corruption_check = val;
return 0;
}
early_param("memory_corruption_check", set_corruption_check);
static __init int set_corruption_check_period(char *arg)
{
ssize_t ret;
unsigned long val;
ret = kstrtoul(arg, 10, &val);
if (ret)
return ret;
corruption_check_period = val;
return 0;
}
early_param("memory_corruption_check_period", set_corruption_check_period);
static __init int set_corruption_check_size(char *arg)
{
char *end;
unsigned size;
size = memparse(arg, &end);
if (*end == '\0')
corruption_check_size = size;
return (size == corruption_check_size) ? 0 : -EINVAL;
}
early_param("memory_corruption_check_size", set_corruption_check_size);
void __init setup_bios_corruption_check(void)
{
phys_addr_t start, end;
u64 i;
if (memory_corruption_check == -1) {
memory_corruption_check =
#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
1
#else
0
#endif
;
}
if (corruption_check_size == 0)
memory_corruption_check = 0;
if (!memory_corruption_check)
return;
corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
PAGE_SIZE, corruption_check_size);
end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
PAGE_SIZE, corruption_check_size);
if (start >= end)
continue;
memblock_reserve(start, end - start);
scan_areas[num_scan_areas].addr = start;
scan_areas[num_scan_areas].size = end - start;
/* Assume we've already mapped this early memory */
memset(__va(start), 0, end - start);
if (++num_scan_areas >= MAX_SCAN_AREAS)
break;
}
if (num_scan_areas)
printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas);
}
void check_for_bios_corruption(void)
{
int i;
int corruption = 0;
if (!memory_corruption_check)
return;
for (i = 0; i < num_scan_areas; i++) {
unsigned long *addr = __va(scan_areas[i].addr);
unsigned long size = scan_areas[i].size;
for (; size; addr++, size -= sizeof(unsigned long)) {
if (!*addr)
continue;
printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
addr, __pa(addr), *addr);
corruption = 1;
*addr = 0;
}
}
WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n");
}
static void check_corruption(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(bios_check_work, check_corruption);
static void check_corruption(struct work_struct *dummy)
{
check_for_bios_corruption();
schedule_delayed_work(&bios_check_work,
round_jiffies_relative(corruption_check_period*HZ));
}
static int start_periodic_check_for_corruption(void)
{
if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0)
return 0;
printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
corruption_check_period);
/* First time we run the checks right away */
schedule_delayed_work(&bios_check_work, 0);
return 0;
}
module_init(start_periodic_check_for_corruption);

View file

@ -0,0 +1,69 @@
#
# Makefile for x86-compatible CPU details, features and quirks
#
# Don't trace early stages of a secondary CPU boot
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_common.o = -pg
CFLAGS_REMOVE_perf_event.o = -pg
endif
# Make sure load_percpu_segment has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
obj-$(CONFIG_X86_32) += bugs.o
obj-$(CONFIG_X86_64) += bugs_64.o
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
ifdef CONFIG_PERF_EVENTS
obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o
ifdef CONFIG_AMD_IOMMU
obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
endif
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o
obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_snb.o \
perf_event_intel_uncore_snbep.o \
perf_event_intel_uncore_nhmex.o
endif
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_MICROCODE) += microcode/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
ifdef CONFIG_X86_FEATURE_NAMES
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
cpufeature = $(src)/../../include/asm/cpufeature.h
targets += capflags.c
$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
endif
clean-files += capflags.c

872
arch/x86/kernel/cpu/amd.c Normal file
View file

@ -0,0 +1,872 @@
#include <linux/export.h>
#include <linux/bitops.h>
#include <linux/elf.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/sched.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/smp.h>
#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
# include <asm/cacheflush.h>
#endif
#include "cpu.h"
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
int err;
WARN_ONCE((boot_cpu_data.x86 != 0xf),
"%s should only be used on K8!\n", __func__);
gprs[1] = msr;
gprs[7] = 0x9c5a203a;
err = rdmsr_safe_regs(gprs);
*p = gprs[0] | ((u64)gprs[2] << 32);
return err;
}
static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
{
u32 gprs[8] = { 0 };
WARN_ONCE((boot_cpu_data.x86 != 0xf),
"%s should only be used on K8!\n", __func__);
gprs[0] = (u32)val;
gprs[1] = msr;
gprs[2] = val >> 32;
gprs[7] = 0x9c5a203a;
return wrmsr_safe_regs(gprs);
}
/*
* B step AMD K6 before B 9730xxxx have hardware bugs that can cause
* misexecution of code under Linux. Owners of such processors should
* contact AMD for precise details and a CPU swap.
*
* See http://www.multimania.com/poulot/k6bug.html
* and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6"
* (Publication # 21266 Issue Date: August 1998)
*
* The following test is erm.. interesting. AMD neglected to up
* the chip setting when fixing the bug but they also tweaked some
* performance at the same time..
*/
extern __visible void vide(void);
__asm__(".globl vide\n\t.align 4\nvide: ret");
static void init_amd_k5(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
/*
* General Systems BIOSen alias the cpu frequency registers
* of the Elan at 0x000df000. Unfortuantly, one of the Linux
* drivers subsequently pokes it, and changes the CPU speed.
* Workaround : Remove the unneeded alias.
*/
#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
#define CBAR_ENB (0x80000000)
#define CBAR_KEY (0X000000CB)
if (c->x86_model == 9 || c->x86_model == 10) {
if (inl(CBAR) & CBAR_ENB)
outl(0 | CBAR_KEY, CBAR);
}
#endif
}
static void init_amd_k6(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
u32 l, h;
int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
if (c->x86_model < 6) {
/* Based on AMD doc 20734R - June 2000 */
if (c->x86_model == 0) {
clear_cpu_cap(c, X86_FEATURE_APIC);
set_cpu_cap(c, X86_FEATURE_PGE);
}
return;
}
if (c->x86_model == 6 && c->x86_mask == 1) {
const int K6_BUG_LOOP = 1000000;
int n;
void (*f_vide)(void);
unsigned long d, d2;
printk(KERN_INFO "AMD K6 stepping B detected - ");
/*
* It looks like AMD fixed the 2.6.2 bug and improved indirect
* calls at the same time.
*/
n = K6_BUG_LOOP;
f_vide = vide;
rdtscl(d);
while (n--)
f_vide();
rdtscl(d2);
d = d2-d;
if (d > 20*K6_BUG_LOOP)
printk(KERN_CONT
"system stability may be impaired when more than 32 MB are used.\n");
else
printk(KERN_CONT "probably OK (after B9730xxxx).\n");
}
/* K6 with old style WHCR */
if (c->x86_model < 8 ||
(c->x86_model == 8 && c->x86_mask < 8)) {
/* We can only write allocate on the low 508Mb */
if (mbytes > 508)
mbytes = 508;
rdmsr(MSR_K6_WHCR, l, h);
if ((l&0x0000FFFF) == 0) {
unsigned long flags;
l = (1<<0)|((mbytes/4)<<1);
local_irq_save(flags);
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
mbytes);
}
return;
}
if ((c->x86_model == 8 && c->x86_mask > 7) ||
c->x86_model == 9 || c->x86_model == 13) {
/* The more serious chips .. */
if (mbytes > 4092)
mbytes = 4092;
rdmsr(MSR_K6_WHCR, l, h);
if ((l&0xFFFF0000) == 0) {
unsigned long flags;
l = ((mbytes>>2)<<22)|(1<<16);
local_irq_save(flags);
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
mbytes);
}
return;
}
if (c->x86_model == 10) {
/* AMD Geode LX is model 10 */
/* placeholder for any needed mods */
return;
}
#endif
}
static void init_amd_k7(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
u32 l, h;
/*
* Bit 15 of Athlon specific MSR 15, needs to be 0
* to enable SSE on Palomino/Morgan/Barton CPU's.
* If the BIOS didn't enable it already, enable it here.
*/
if (c->x86_model >= 6 && c->x86_model <= 10) {
if (!cpu_has(c, X86_FEATURE_XMM)) {
printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
msr_clear_bit(MSR_K7_HWCR, 15);
set_cpu_cap(c, X86_FEATURE_XMM);
}
}
/*
* It's been determined by AMD that Athlons since model 8 stepping 1
* are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
* As per AMD technical note 27212 0.2
*/
if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
rdmsr(MSR_K7_CLK_CTL, l, h);
if ((l & 0xfff00000) != 0x20000000) {
printk(KERN_INFO
"CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
l, ((l & 0x000fffff)|0x20000000));
wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
}
}
set_cpu_cap(c, X86_FEATURE_K7);
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
/*
* Certain Athlons might work (for various values of 'work') in SMP
* but they are not certified as MP capable.
*/
/* Athlon 660/661 is valid. */
if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
(c->x86_mask == 1)))
return;
/* Duron 670 is valid */
if ((c->x86_model == 7) && (c->x86_mask == 0))
return;
/*
* Athlon 662, Duron 671, and Athlon >model 7 have capability
* bit. It's worth noting that the A5 stepping (662) of some
* Athlon XP's have the MP bit set.
* See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
* more.
*/
if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
((c->x86_model == 7) && (c->x86_mask >= 1)) ||
(c->x86_model > 7))
if (cpu_has(c, X86_FEATURE_MP))
return;
/* If we get here, not a certified SMP capable AMD system. */
/*
* Don't taint if we are running SMP kernel on a single non-MP
* approved Athlon
*/
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
#endif
}
#ifdef CONFIG_NUMA
/*
* To workaround broken NUMA config. Read the comment in
* srat_detect_node().
*/
static int nearby_node(int apicid)
{
int i, node;
for (i = apicid - 1; i >= 0; i--) {
node = __apicid_to_node[i];
if (node != NUMA_NO_NODE && node_online(node))
return node;
}
for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
node = __apicid_to_node[i];
if (node != NUMA_NO_NODE && node_online(node))
return node;
}
return first_node(node_online_map); /* Shouldn't happen */
}
#endif
/*
* Fixup core topology information for
* (1) AMD multi-node processors
* Assumption: Number of cores in each internal node is the same.
* (2) AMD processors supporting compute units
*/
#ifdef CONFIG_X86_HT
static void amd_get_topology(struct cpuinfo_x86 *c)
{
u32 nodes, cores_per_cu = 1;
u8 node_id;
int cpu = smp_processor_id();
/* get information required for multi-node processors */
if (cpu_has_topoext) {
u32 eax, ebx, ecx, edx;
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
nodes = ((ecx >> 8) & 7) + 1;
node_id = ecx & 7;
/* get compute unit information */
smp_num_siblings = ((ebx >> 8) & 3) + 1;
c->compute_unit_id = ebx & 0xff;
cores_per_cu += ((ebx >> 8) & 3);
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
nodes = ((value >> 3) & 7) + 1;
node_id = value & 7;
} else
return;
/* fixup multi-node processor information */
if (nodes > 1) {
u32 cores_per_node;
u32 cus_per_node;
set_cpu_cap(c, X86_FEATURE_AMD_DCM);
cores_per_node = c->x86_max_cores / nodes;
cus_per_node = cores_per_node / cores_per_cu;
/* store NodeID, use llc_shared_map to store sibling info */
per_cpu(cpu_llc_id, cpu) = node_id;
/* core id has to be in the [0 .. cores_per_node - 1] range */
c->cpu_core_id %= cores_per_node;
c->compute_unit_id %= cus_per_node;
}
}
#endif
/*
* On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
* Assumes number of cores is a power of two.
*/
static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_HT
unsigned bits;
int cpu = smp_processor_id();
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
/* Convert the initial APIC ID into the socket ID */
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
amd_get_topology(c);
#endif
}
u16 amd_get_nb_id(int cpu)
{
u16 id = 0;
#ifdef CONFIG_SMP
id = per_cpu(cpu_llc_id, cpu);
#endif
return id;
}
EXPORT_SYMBOL_GPL(amd_get_nb_id);
static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
int node;
unsigned apicid = c->apicid;
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
node = per_cpu(cpu_llc_id, cpu);
/*
* On multi-fabric platform (e.g. Numascale NumaChip) a
* platform-specific handler needs to be called to fixup some
* IDs of the CPU.
*/
if (x86_cpuinit.fixup_cpu_id)
x86_cpuinit.fixup_cpu_id(c, node);
if (!node_online(node)) {
/*
* Two possibilities here:
*
* - The CPU is missing memory and no node was created. In
* that case try picking one from a nearby CPU.
*
* - The APIC IDs differ from the HyperTransport node IDs
* which the K8 northbridge parsing fills in. Assume
* they are all increased by a constant offset, but in
* the same order as the HT nodeids. If that doesn't
* result in a usable node fall back to the path for the
* previous case.
*
* This workaround operates directly on the mapping between
* APIC ID and NUMA node, assuming certain relationship
* between APIC ID, HT node ID and NUMA topology. As going
* through CPU mapping may alter the outcome, directly
* access __apicid_to_node[].
*/
int ht_nodeid = c->initial_apicid;
if (ht_nodeid >= 0 &&
__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
/* Pick a nearby node */
if (!node_online(node))
node = nearby_node(apicid);
}
numa_set_node(cpu, node);
#endif
}
static void early_init_amd_mc(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_HT
unsigned bits, ecx;
/* Multi core CPU? */
if (c->extended_cpuid_level < 0x80000008)
return;
ecx = cpuid_ecx(0x80000008);
c->x86_max_cores = (ecx & 0xff) + 1;
/* CPU telling us the core id bits shift? */
bits = (ecx >> 12) & 0xF;
/* Otherwise recompute */
if (bits == 0) {
while ((1 << bits) < c->x86_max_cores)
bits++;
}
c->x86_coreid_bits = bits;
#endif
}
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
if (c->x86 >= 0xf) {
unsigned long long tseg;
/*
* Split up direct mapping around the TSEG SMM area.
* Don't do it for gbpages because there seems very little
* benefit in doing so.
*/
if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
unsigned long pfn = tseg >> PAGE_SHIFT;
printk(KERN_DEBUG "tseg: %010llx\n", tseg);
if (pfn_range_is_mapped(pfn, pfn + 1))
set_memory_4k((unsigned long)__va(tseg), 1);
}
}
#endif
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
if (c->x86 > 0x10 ||
(c->x86 == 0x10 && c->x86_model >= 0x2)) {
u64 val;
rdmsrl(MSR_K7_HWCR, val);
if (!(val & BIT(24)))
printk(KERN_WARNING FW_BUG "TSC doesn't count "
"with P0 frequency!\n");
}
}
if (c->x86 == 0x15) {
unsigned long upperbit;
u32 cpuid, assoc;
cpuid = cpuid_edx(0x80000005);
assoc = cpuid >> 16 & 0xff;
upperbit = ((cpuid >> 24) << 10) / assoc;
va_align.mask = (upperbit - 1) & PAGE_MASK;
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
}
}
static void early_init_amd(struct cpuinfo_x86 *c)
{
early_init_amd_mc(c);
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
* with P/T states and does not stop in deep C-states
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
set_sched_clock_stable();
}
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#else
/* Set MTRR capability flag if appropriate */
if (c->x86 == 5)
if (c->x86_model == 13 || c->x86_model == 9 ||
(c->x86_model == 8 && c->x86_mask >= 8))
set_cpu_cap(c, X86_FEATURE_K6_MTRR);
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
/* check CPU config space for extended APIC ID */
if (cpu_has_apic && c->x86 >= 0xf) {
unsigned int val;
val = read_pci_config(0, 24, 0, 0x68);
if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
}
#endif
/*
* This is only needed to tell the kernel whether to use VMCALL
* and VMMCALL. VMMCALL is never executed except under virt, so
* we can set it unconditionally.
*/
set_cpu_cap(c, X86_FEATURE_VMMCALL);
/* F16h erratum 793, CVE-2013-6885 */
if (c->x86 == 0x16 && c->x86_model <= 0xf)
msr_set_bit(MSR_AMD64_LS_CFG, 15);
}
static const int amd_erratum_383[];
static const int amd_erratum_400[];
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
static void init_amd_k8(struct cpuinfo_x86 *c)
{
u32 level;
u64 value;
/* On C+ stepping K8 rep microcode works well for copy/memset */
level = cpuid_eax(1);
if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/*
* Some BIOSes incorrectly force this feature, but only K8 revision D
* (model = 0x14) and later actually support it.
* (AMD Erratum #110, docId: 25759).
*/
if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
if (!rdmsrl_amd_safe(0xc001100d, &value)) {
value &= ~BIT_64(32);
wrmsrl_amd_safe(0xc001100d, value);
}
}
if (!c->x86_model_id[0])
strcpy(c->x86_model_id, "Hammer");
}
static void init_amd_gh(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
/* do this for boot cpu */
if (c == &boot_cpu_data)
check_enable_amd_mmconf_dmi();
fam10h_check_enable_mmcfg();
#endif
/*
* Disable GART TLB Walk Errors on Fam10h. We do this here because this
* is always needed when GART is enabled, even in a kernel which has no
* MCE support built in. BIOS should disable GartTlbWlk Errors already.
* If it doesn't, we do it here as suggested by the BKDG.
*
* Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
/*
* On family 10h BIOS may not have properly enabled WC+ support, causing
* it to be converted to CD memtype. This may result in performance
* degradation for certain nested-paging guests. Prevent this conversion
* by clearing bit 24 in MSR_AMD64_BU_CFG2.
*
* NOTE: we want to use the _safe accessors so as not to #GP kvm
* guests on older kvm hosts.
*/
msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
if (cpu_has_amd_erratum(c, amd_erratum_383))
set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
}
static void init_amd_bd(struct cpuinfo_x86 *c)
{
u64 value;
/* re-enable TopologyExtensions if switched off by BIOS */
if ((c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
if (msr_set_bit(0xc0011005, 54) > 0) {
rdmsrl(0xc0011005, value);
if (value & BIT_64(54)) {
set_cpu_cap(c, X86_FEATURE_TOPOEXT);
pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
}
}
}
/*
* The way access filter has a performance penalty on some workloads.
* Disable it on the affected CPUs.
*/
if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
value |= 0x1E;
wrmsrl_safe(0xc0011021, value);
}
}
}
static void init_amd(struct cpuinfo_x86 *c)
{
u32 dummy;
#ifdef CONFIG_SMP
/*
* Disable TLB flush filter by setting HWCR.FFDIS on K8
* bit 6 of msr C001_0015
*
* Errata 63 for SH-B3 steppings
* Errata 122 for all steppings (F+ have it disabled by default)
*/
if (c->x86 == 0xf)
msr_set_bit(MSR_K7_HWCR, 6);
#endif
early_init_amd(c);
/*
* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
* 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
*/
clear_cpu_cap(c, 0*32+31);
if (c->x86 >= 0x10)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/* get apicid instead of initial apic id from cpuid */
c->apicid = hard_smp_processor_id();
/* K6s reports MCEs but don't actually have all the MSRs */
if (c->x86 < 6)
clear_cpu_cap(c, X86_FEATURE_MCE);
switch (c->x86) {
case 4: init_amd_k5(c); break;
case 5: init_amd_k6(c); break;
case 6: init_amd_k7(c); break;
case 0xf: init_amd_k8(c); break;
case 0x10: init_amd_gh(c); break;
case 0x15: init_amd_bd(c); break;
}
/* Enable workaround for FXSAVE leak */
if (c->x86 >= 6)
set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
cpu_detect_cache_sizes(c);
/* Multi core CPU? */
if (c->extended_cpuid_level >= 0x80000008) {
amd_detect_cmp(c);
srat_detect_node(c);
}
#ifdef CONFIG_X86_32
detect_ht(c);
#endif
init_amd_cacheinfo(c);
if (c->x86 >= 0xf)
set_cpu_cap(c, X86_FEATURE_K8);
if (cpu_has_xmm2) {
/* MFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
}
/*
* Family 0x12 and above processors have APIC timer
* running in deep C states.
*/
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
}
#ifdef CONFIG_X86_32
static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* AMD errata T13 (order #21922) */
if ((c->x86 == 6)) {
/* Duron Rev A0 */
if (c->x86_model == 3 && c->x86_mask == 0)
size = 64;
/* Tbird rev A1/A2 */
if (c->x86_model == 4 &&
(c->x86_mask == 0 || c->x86_mask == 1))
size = 256;
}
return size;
}
#endif
static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
{
u32 ebx, eax, ecx, edx;
u16 mask = 0xfff;
if (c->x86 < 0xf)
return;
if (c->extended_cpuid_level < 0x80000006)
return;
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
tlb_lli_4k[ENTRIES] = ebx & mask;
/*
* K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
* characteristics from the CPUID function 0x80000005 instead.
*/
if (c->x86 == 0xf) {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
mask = 0xff;
}
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!((eax >> 16) & mask))
tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
else
tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
/* a 4M entry uses two 2M entries */
tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!(eax & mask)) {
/* Erratum 658 */
if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
tlb_lli_2m[ENTRIES] = 1024;
} else {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
tlb_lli_2m[ENTRIES] = eax & 0xff;
}
} else
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
}
static const struct cpu_dev amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
#ifdef CONFIG_X86_32
.legacy_models = {
{ .family = 4, .model_names =
{
[3] = "486 DX/2",
[7] = "486 DX/2-WB",
[8] = "486 DX/4",
[9] = "486 DX/4-WB",
[14] = "Am5x86-WT",
[15] = "Am5x86-WB"
}
},
},
.legacy_cache_size = amd_size_cache,
#endif
.c_early_init = early_init_amd,
.c_detect_tlb = cpu_detect_tlb_amd,
.c_bsp_init = bsp_init_amd,
.c_init = init_amd,
.c_x86_vendor = X86_VENDOR_AMD,
};
cpu_dev_register(amd_cpu_dev);
/*
* AMD errata checking
*
* Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
* AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
* have an OSVW id assigned, which it takes as first argument. Both take a
* variable number of family-specific model-stepping ranges created by
* AMD_MODEL_RANGE().
*
* Example:
*
* const int amd_erratum_319[] =
* AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
* AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
* AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
*/
#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
static const int amd_erratum_400[] =
AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
static const int amd_erratum_383[] =
AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
{
int osvw_id = *erratum++;
u32 range;
u32 ms;
if (osvw_id >= 0 && osvw_id < 65536 &&
cpu_has(cpu, X86_FEATURE_OSVW)) {
u64 osvw_len;
rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
if (osvw_id < osvw_len) {
u64 osvw_bits;
rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
osvw_bits);
return osvw_bits & (1ULL << (osvw_id & 0x3f));
}
}
/* OSVW unavailable or ID unknown, match family-model-stepping range */
ms = (cpu->x86_model << 4) | cpu->x86_mask;
while ((range = *erratum++))
if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
(ms >= AMD_MODEL_RANGE_START(range)) &&
(ms <= AMD_MODEL_RANGE_END(range)))
return true;
return false;
}

View file

@ -0,0 +1,94 @@
/*
* Copyright (C) 1994 Linus Torvalds
*
* Cyrix stuff, June 1998 by:
* - Rafael R. Reilova (moved everything from head.S),
* <rreilova@ececs.uc.edu>
* - Channing Corn (tests & fixes),
* - Andrew D. Balsa (code cleanup).
*/
#include <linux/init.h>
#include <linux/utsname.h>
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/processor-flags.h>
#include <asm/i387.h>
#include <asm/msr.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
static double __initdata x = 4195835.0;
static double __initdata y = 3145727.0;
/*
* This used to check for exceptions..
* However, it turns out that to support that,
* the XMM trap handlers basically had to
* be buggy. So let's have a correct XMM trap
* handler, and forget about printing out
* some status at boot.
*
* We should really only care about bugs here
* anyway. Not features.
*/
static void __init check_fpu(void)
{
s32 fdiv_bug;
kernel_fpu_begin();
/*
* trap_init() enabled FXSR and company _before_ testing for FP
* problems here.
*
* Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
*/
__asm__("fninit\n\t"
"fldl %1\n\t"
"fdivl %2\n\t"
"fmull %2\n\t"
"fldl %1\n\t"
"fsubp %%st,%%st(1)\n\t"
"fistpl %0\n\t"
"fwait\n\t"
"fninit"
: "=m" (*&fdiv_bug)
: "m" (*&x), "m" (*&y));
kernel_fpu_end();
if (fdiv_bug) {
set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
pr_warn("Hmm, FPU with FDIV bug\n");
}
}
void __init check_bugs(void)
{
identify_boot_cpu();
#ifndef CONFIG_SMP
pr_info("CPU: ");
print_cpu_info(&boot_cpu_data);
#endif
/*
* Check whether we are able to run this kernel safely on SMP.
*
* - i386 is no longer supported.
* - In order to run on anything without a TSC, we need to be
* compiled for a i486.
*/
if (boot_cpu_data.x86 < 4)
panic("Kernel requires i486+ for 'invlpg' and other features");
init_utsname()->machine[1] =
'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
alternative_instructions();
/*
* kernel_fpu_begin/end() in check_fpu() relies on the patched
* alternative instructions.
*/
if (cpu_has_fpu)
check_fpu();
}

View file

@ -0,0 +1,33 @@
/*
* Copyright (C) 1994 Linus Torvalds
* Copyright (C) 2000 SuSE
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <asm/alternative.h>
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/mtrr.h>
#include <asm/cacheflush.h>
void __init check_bugs(void)
{
identify_boot_cpu();
#if !defined(CONFIG_SMP)
printk(KERN_INFO "CPU: ");
print_cpu_info(&boot_cpu_data);
#endif
alternative_instructions();
/*
* Make sure the first 2MB area is not mapped by huge pages
* There are typically fixed size MTRRs in there and overlapping
* MTRRs into large pages causes slow downs.
*
* Right now we don't do that with gbpages because there seems
* very little benefit for that case.
*/
if (!direct_gbpages)
set_memory_4k((unsigned long)__va(0), 1);
}

View file

@ -0,0 +1,229 @@
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <asm/processor.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "cpu.h"
#define ACE_PRESENT (1 << 6)
#define ACE_ENABLED (1 << 7)
#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
#define RNG_PRESENT (1 << 2)
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
static void init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
/* Test for Centaur Extended Feature Flags presence */
if (cpuid_eax(0xC0000000) >= 0xC0000001) {
u32 tmp = cpuid_edx(0xC0000001);
/* enable ACE unit, if present and disabled */
if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= ACE_FCR; /* enable ACE unit */
wrmsr(MSR_VIA_FCR, lo, hi);
printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
}
/* enable RNG unit, if present and disabled */
if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
rdmsr(MSR_VIA_RNG, lo, hi);
lo |= RNG_ENABLE; /* enable RNG unit */
wrmsr(MSR_VIA_RNG, lo, hi);
printk(KERN_INFO "CPU: Enabled h/w RNG\n");
}
/* store Centaur Extended Feature Flags as
* word 5 of the CPU capability bit array
*/
c->x86_capability[5] = cpuid_edx(0xC0000001);
}
#ifdef CONFIG_X86_32
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
if (c->x86_model >= 6 && c->x86_model <= 13) {
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= (1<<1 | 1<<7);
wrmsr(MSR_VIA_FCR, lo, hi);
set_cpu_cap(c, X86_FEATURE_CX8);
}
/* Before Nehemiah, the C3's had 3dNOW! */
if (c->x86_model >= 6 && c->x86_model < 9)
set_cpu_cap(c, X86_FEATURE_3DNOW);
#endif
if (c->x86 == 0x6 && c->x86_model >= 0xf) {
c->x86_cache_alignment = c->x86_clflush_size * 2;
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
cpu_detect_cache_sizes(c);
}
enum {
ECX8 = 1<<1,
EIERRINT = 1<<2,
DPM = 1<<3,
DMCE = 1<<4,
DSTPCLK = 1<<5,
ELINEAR = 1<<6,
DSMC = 1<<7,
DTLOCK = 1<<8,
EDCTLB = 1<<8,
EMMX = 1<<9,
DPDC = 1<<11,
EBRPRED = 1<<12,
DIC = 1<<13,
DDC = 1<<14,
DNA = 1<<15,
ERETSTK = 1<<16,
E2MMX = 1<<19,
EAMD3D = 1<<20,
};
static void early_init_centaur(struct cpuinfo_x86 *c)
{
switch (c->x86) {
#ifdef CONFIG_X86_32
case 5:
/* Emulate MTRRs using Centaur's MCR. */
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
break;
#endif
case 6:
if (c->x86_model >= 0xf)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
break;
}
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
}
static void init_centaur(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
char *name;
u32 fcr_set = 0;
u32 fcr_clr = 0;
u32 lo, hi, newlo;
u32 aa, bb, cc, dd;
/*
* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
* 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
*/
clear_cpu_cap(c, 0*32+31);
#endif
early_init_centaur(c);
switch (c->x86) {
#ifdef CONFIG_X86_32
case 5:
switch (c->x86_model) {
case 4:
name = "C6";
fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
fcr_clr = DPDC;
printk(KERN_NOTICE "Disabling bugged TSC.\n");
clear_cpu_cap(c, X86_FEATURE_TSC);
break;
case 8:
switch (c->x86_mask) {
default:
name = "2";
break;
case 7 ... 9:
name = "2A";
break;
case 10 ... 15:
name = "2B";
break;
}
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
break;
case 9:
name = "3";
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
break;
default:
name = "??";
}
rdmsr(MSR_IDT_FCR1, lo, hi);
newlo = (lo|fcr_set) & (~fcr_clr);
if (newlo != lo) {
printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n",
lo, newlo);
wrmsr(MSR_IDT_FCR1, newlo, hi);
} else {
printk(KERN_INFO "Centaur FCR is 0x%X\n", lo);
}
/* Emulate MTRRs using Centaur's MCR. */
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
/* Report CX8 */
set_cpu_cap(c, X86_FEATURE_CX8);
/* Set 3DNow! on Winchip 2 and above. */
if (c->x86_model >= 8)
set_cpu_cap(c, X86_FEATURE_3DNOW);
/* See if we can find out some more. */
if (cpuid_eax(0x80000000) >= 0x80000005) {
/* Yes, we can. */
cpuid(0x80000005, &aa, &bb, &cc, &dd);
/* Add L1 data and code cache sizes. */
c->x86_cache_size = (cc>>24)+(dd>>24);
}
sprintf(c->x86_model_id, "WinChip %s", name);
break;
#endif
case 6:
init_c3(c);
break;
}
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
}
#ifdef CONFIG_X86_32
static unsigned int
centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
size >>= 8;
/*
* There's also an erratum in Nehemiah stepping 1, which
* returns '65KB' instead of '64KB'
* - Note, it seems this may only be in engineering samples.
*/
if ((c->x86 == 6) && (c->x86_model == 9) &&
(c->x86_mask == 1) && (size == 65))
size -= 1;
return size;
}
#endif
static const struct cpu_dev centaur_cpu_dev = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
.c_init = init_centaur,
#ifdef CONFIG_X86_32
.legacy_cache_size = centaur_size_cache,
#endif
.c_x86_vendor = X86_VENDOR_CENTAUR,
};
cpu_dev_register(centaur_cpu_dev);

1450
arch/x86/kernel/cpu/common.c Normal file

File diff suppressed because it is too large Load diff

48
arch/x86/kernel/cpu/cpu.h Normal file
View file

@ -0,0 +1,48 @@
#ifndef ARCH_X86_CPU_H
#define ARCH_X86_CPU_H
/* attempt to consolidate cpu attributes */
struct cpu_dev {
const char *c_vendor;
/* some have two possibilities for cpuid string */
const char *c_ident[2];
void (*c_early_init)(struct cpuinfo_x86 *);
void (*c_bsp_init)(struct cpuinfo_x86 *);
void (*c_init)(struct cpuinfo_x86 *);
void (*c_identify)(struct cpuinfo_x86 *);
void (*c_detect_tlb)(struct cpuinfo_x86 *);
int c_x86_vendor;
#ifdef CONFIG_X86_32
/* Optional vendor specific routine to obtain the cache size. */
unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *,
unsigned int);
/* Family/stepping-based lookup table for model names. */
struct legacy_cpu_model_info {
int family;
const char *model_names[16];
} legacy_models[5];
#endif
};
struct _tlb_table {
unsigned char descriptor;
char tlb_type;
unsigned int entries;
/* unsigned int ways; */
char info[128];
};
#define cpu_dev_register(cpu_devX) \
static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
__attribute__((__section__(".x86_cpu_dev.init"))) = \
&cpu_devX;
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
#endif /* ARCH_X86_CPU_H */

461
arch/x86/kernel/cpu/cyrix.c Normal file
View file

@ -0,0 +1,461 @@
#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/pci.h>
#include <asm/dma.h>
#include <linux/io.h>
#include <asm/processor-cyrix.h>
#include <asm/processor-flags.h>
#include <linux/timer.h>
#include <asm/pci-direct.h>
#include <asm/tsc.h>
#include "cpu.h"
/*
* Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
*/
static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned char ccr2, ccr3;
/* we test for DEVID by checking whether CCR3 is writable */
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, ccr3 ^ 0x80);
getCx86(0xc0); /* dummy to change bus */
if (getCx86(CX86_CCR3) == ccr3) { /* no DEVID regs. */
ccr2 = getCx86(CX86_CCR2);
setCx86(CX86_CCR2, ccr2 ^ 0x04);
getCx86(0xc0); /* dummy */
if (getCx86(CX86_CCR2) == ccr2) /* old Cx486SLC/DLC */
*dir0 = 0xfd;
else { /* Cx486S A step */
setCx86(CX86_CCR2, ccr2);
*dir0 = 0xfe;
}
} else {
setCx86(CX86_CCR3, ccr3); /* restore CCR3 */
/* read DIR0 and DIR1 CPU registers */
*dir0 = getCx86(CX86_DIR0);
*dir1 = getCx86(CX86_DIR1);
}
}
static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned long flags;
local_irq_save(flags);
__do_cyrix_devid(dir0, dir1);
local_irq_restore(flags);
}
/*
* Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
* order to identify the Cyrix CPU model after we're out of setup.c
*
* Actually since bugs.h doesn't even reference this perhaps someone should
* fix the documentation ???
*/
static unsigned char Cx86_dir0_msb = 0;
static const char Cx86_model[][9] = {
"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
"M II ", "Unknown"
};
static const char Cx486_name[][5] = {
"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
"SRx2", "DRx2"
};
static const char Cx486S_name[][4] = {
"S", "S2", "Se", "S2e"
};
static const char Cx486D_name[][4] = {
"DX", "DX2", "?", "?", "?", "DX4"
};
static char Cx86_cb[] = "?.5x Core/Bus Clock";
static const char cyrix_model_mult1[] = "12??43";
static const char cyrix_model_mult2[] = "12233445";
/*
* Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
* BIOSes for compatibility with DOS games. This makes the udelay loop
* work correctly, and improves performance.
*
* FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
*/
static void check_cx686_slop(struct cpuinfo_x86 *c)
{
unsigned long flags;
if (Cx86_dir0_msb == 3) {
unsigned char ccr3, ccr5;
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
ccr5 = getCx86(CX86_CCR5);
if (ccr5 & 2)
setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
local_irq_restore(flags);
if (ccr5 & 2) { /* possible wrong calibration done */
printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
calibrate_delay();
c->loops_per_jiffy = loops_per_jiffy;
}
}
}
static void set_cx86_reorder(void)
{
u8 ccr3;
printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* Load/Store Serialize to mem access disable (=reorder it) */
setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
/* set load/store serialize from 1GB to 4GB */
ccr3 |= 0xe0;
setCx86(CX86_CCR3, ccr3);
}
static void set_cx86_memwb(void)
{
printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
/* CCR2 bit 2: unlock NW bit */
setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
/* set 'Not Write-through' */
write_cr0(read_cr0() | X86_CR0_NW);
/* CCR2 bit 2: lock NW bit and set WT1 */
setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
}
/*
* Configure later MediaGX and/or Geode processor.
*/
static void geode_configure(void)
{
unsigned long flags;
u8 ccr3;
local_irq_save(flags);
/* Suspend on halt power saving and enable #SUSP pin */
setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* FPU fast, DTE cache, Mem bypass */
setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
set_cx86_memwb();
set_cx86_reorder();
local_irq_restore(flags);
}
static void early_init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir1 = 0;
__do_cyrix_devid(&dir0, &dir1);
dir0_msn = dir0 >> 4; /* identifies CPU "family" */
switch (dir0_msn) {
case 3: /* 6x86/6x86L */
/* Emulate MTRRs using Cyrix's ARRs. */
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
break;
case 5: /* 6x86MX/M II */
/* Emulate MTRRs using Cyrix's ARRs. */
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
break;
}
}
static void init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
char *buf = c->x86_model_id;
const char *p = NULL;
/*
* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
* 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
*/
clear_cpu_cap(c, 0*32+31);
/* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */
if (test_cpu_cap(c, 1*32+24)) {
clear_cpu_cap(c, 1*32+24);
set_cpu_cap(c, X86_FEATURE_CXMMX);
}
do_cyrix_devid(&dir0, &dir1);
check_cx686_slop(c);
Cx86_dir0_msb = dir0_msn = dir0 >> 4; /* identifies CPU "family" */
dir0_lsn = dir0 & 0xf; /* model or clock multiplier */
/* common case step number/rev -- exceptions handled below */
c->x86_model = (dir1 >> 4) + 1;
c->x86_mask = dir1 & 0xf;
/* Now cook; the original recipe is by Channing Corn, from Cyrix.
* We do the same thing for each generation: we work out
* the model, multiplier and stepping. Black magic included,
* to make the silicon step/rev numbers match the printed ones.
*/
switch (dir0_msn) {
unsigned char tmp;
case 0: /* Cx486SLC/DLC/SRx/DRx */
p = Cx486_name[dir0_lsn & 7];
break;
case 1: /* Cx486S/DX/DX2/DX4 */
p = (dir0_lsn & 8) ? Cx486D_name[dir0_lsn & 5]
: Cx486S_name[dir0_lsn & 3];
break;
case 2: /* 5x86 */
Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
p = Cx86_cb+2;
break;
case 3: /* 6x86/6x86L */
Cx86_cb[1] = ' ';
Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
if (dir1 > 0x21) { /* 686L */
Cx86_cb[0] = 'L';
p = Cx86_cb;
(c->x86_model)++;
} else /* 686 */
p = Cx86_cb+1;
/* Emulate MTRRs using Cyrix's ARRs. */
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
/* 6x86's contain this bug */
set_cpu_bug(c, X86_BUG_COMA);
break;
case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
#ifdef CONFIG_PCI
{
u32 vendor, device;
/*
* It isn't really a PCI quirk directly, but the cure is the
* same. The MediaGX has deep magic SMM stuff that handles the
* SB emulation. It throws away the fifo on disable_dma() which
* is wrong and ruins the audio.
*
* Bug2: VSA1 has a wrap bug so that using maximum sized DMA
* causes bad things. According to NatSemi VSA2 has another
* bug to do with 'hlt'. I've not seen any boards using VSA2
* and X doesn't seem to support it either so who cares 8).
* VSA1 we work around however.
*/
printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
isa_dma_bridge_buggy = 2;
/* We do this before the PCI layer is running. However we
are safe here as we know the bridge must be a Cyrix
companion and must be present */
vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);
/*
* The 5510/5520 companion chips have a funky PIT.
*/
if (vendor == PCI_VENDOR_ID_CYRIX &&
(device == PCI_DEVICE_ID_CYRIX_5510 ||
device == PCI_DEVICE_ID_CYRIX_5520))
mark_tsc_unstable("cyrix 5510/5520 detected");
}
#endif
c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
/* Enable cxMMX extensions (GX1 Datasheet 54) */
setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
/*
* GXm : 0x30 ... 0x5f GXm datasheet 51
* GXlv: 0x6x GXlv datasheet 54
* ? : 0x7x
* GX1 : 0x8x GX1 datasheet 56
*/
if ((0x30 <= dir1 && dir1 <= 0x6f) ||
(0x80 <= dir1 && dir1 <= 0x8f))
geode_configure();
return;
} else { /* MediaGX */
Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
p = Cx86_cb+2;
c->x86_model = (dir1 & 0x20) ? 1 : 2;
}
break;
case 5: /* 6x86MX/M II */
if (dir1 > 7) {
dir0_msn++; /* M II */
/* Enable MMX extensions (App note 108) */
setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
} else {
/* A 6x86MX - it has the bug. */
set_cpu_bug(c, X86_BUG_COMA);
}
tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
p = Cx86_cb+tmp;
if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20))
(c->x86_model)++;
/* Emulate MTRRs using Cyrix's ARRs. */
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
break;
case 0xf: /* Cyrix 486 without DEVID registers */
switch (dir0_lsn) {
case 0xd: /* either a 486SLC or DLC w/o DEVID */
dir0_msn = 0;
p = Cx486_name[(cpu_has_fpu ? 1 : 0)];
break;
case 0xe: /* a 486S A step */
dir0_msn = 0;
p = Cx486S_name[0];
break;
}
break;
default: /* unknown (shouldn't happen, we know everyone ;-) */
dir0_msn = 7;
break;
}
strcpy(buf, Cx86_model[dir0_msn & 7]);
if (p)
strcat(buf, p);
return;
}
/*
* Handle National Semiconductor branded processors
*/
static void init_nsc(struct cpuinfo_x86 *c)
{
/*
* There may be GX1 processors in the wild that are branded
* NSC and not Cyrix.
*
* This function only handles the GX processor, and kicks every
* thing else to the Cyrix init function above - that should
* cover any processors that might have been branded differently
* after NSC acquired Cyrix.
*
* If this breaks your GX1 horribly, please e-mail
* info-linux@ldcmail.amd.com to tell us.
*/
/* Handle the GX (Formally known as the GX2) */
if (c->x86 == 5 && c->x86_model == 5)
cpu_detect_cache_sizes(c);
else
init_cyrix(c);
}
/*
* Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected
* by the fact that they preserve the flags across the division of 5/2.
* PII and PPro exhibit this behavior too, but they have cpuid available.
*/
/*
* Perform the Cyrix 5/2 test. A Cyrix won't change
* the flags, while other 486 chips will.
*/
static inline int test_cyrix_52div(void)
{
unsigned int test;
__asm__ __volatile__(
"sahf\n\t" /* clear flags (%eax = 0x0005) */
"div %b2\n\t" /* divide 5 by 2 */
"lahf" /* store flags into %ah */
: "=a" (test)
: "0" (5), "q" (2)
: "cc");
/* AH is 0x02 on Cyrix after the divide.. */
return (unsigned char) (test >> 8) == 0x02;
}
static void cyrix_identify(struct cpuinfo_x86 *c)
{
/* Detect Cyrix with disabled CPUID */
if (c->x86 == 4 && test_cyrix_52div()) {
unsigned char dir0, dir1;
strcpy(c->x86_vendor_id, "CyrixInstead");
c->x86_vendor = X86_VENDOR_CYRIX;
/* Actually enable cpuid on the older cyrix */
/* Retrieve CPU revisions */
do_cyrix_devid(&dir0, &dir1);
dir0 >>= 4;
/* Check it is an affected model */
if (dir0 == 5 || dir0 == 3) {
unsigned char ccr3;
unsigned long flags;
printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
/* enable MAPEN */
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
/* enable cpuid */
setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80);
/* disable MAPEN */
setCx86(CX86_CCR3, ccr3);
local_irq_restore(flags);
}
}
}
static const struct cpu_dev cyrix_cpu_dev = {
.c_vendor = "Cyrix",
.c_ident = { "CyrixInstead" },
.c_early_init = early_init_cyrix,
.c_init = init_cyrix,
.c_identify = cyrix_identify,
.c_x86_vendor = X86_VENDOR_CYRIX,
};
cpu_dev_register(cyrix_cpu_dev);
static const struct cpu_dev nsc_cpu_dev = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
.c_x86_vendor = X86_VENDOR_NSC,
};
cpu_dev_register(nsc_cpu_dev);

View file

@ -0,0 +1,87 @@
/*
* Common hypervisor code
*
* Copyright (C) 2008, VMware, Inc.
* Author : Alok N Kataria <akataria@vmware.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
#include <linux/module.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
#ifdef CONFIG_XEN_PVHVM
&x86_hyper_xen_hvm,
#endif
&x86_hyper_vmware,
&x86_hyper_ms_hyperv,
#ifdef CONFIG_KVM_GUEST
&x86_hyper_kvm,
#endif
};
const struct hypervisor_x86 *x86_hyper;
EXPORT_SYMBOL(x86_hyper);
static inline void __init
detect_hypervisor_vendor(void)
{
const struct hypervisor_x86 *h, * const *p;
uint32_t pri, max_pri = 0;
for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
h = *p;
pri = h->detect();
if (pri != 0 && pri > max_pri) {
max_pri = pri;
x86_hyper = h;
}
}
if (max_pri)
printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name);
}
void init_hypervisor(struct cpuinfo_x86 *c)
{
if (x86_hyper && x86_hyper->set_cpu_features)
x86_hyper->set_cpu_features(c);
}
void __init init_hypervisor_platform(void)
{
detect_hypervisor_vendor();
if (!x86_hyper)
return;
init_hypervisor(&boot_cpu_data);
if (x86_hyper->init_platform)
x86_hyper->init_platform();
}
bool __init hypervisor_x2apic_available(void)
{
return x86_hyper &&
x86_hyper->x2apic_available &&
x86_hyper->x2apic_available();
}

756
arch/x86/kernel/cpu/intel.c Normal file
View file

@ -0,0 +1,756 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/bitops.h>
#include <linux/smp.h>
#include <linux/sched.h>
#include <linux/thread_info.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <asm/pgtable.h>
#include <asm/msr.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
#endif
#include "cpu.h"
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/mpspec.h>
#include <asm/apic.h>
#endif
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
/* Unmask CPUID levels if masked: */
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
c->cpuid_level = cpuid_eax(0);
get_cpu_cap(c);
}
}
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
unsigned lower_word;
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* Required by the SDM */
sync_core();
rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
}
/*
* Atom erratum AAE44/AAF40/AAG38/AAH41:
*
* A race condition between speculative fetches and invalidating
* a large page. This is worked around in microcode, but we
* need the microcode to have already been loaded... so if it is
* not, recommend a BIOS update and disable large pages.
*/
if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 &&
c->microcode < 0x20e) {
printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
clear_cpu_cap(c, X86_FEATURE_PSE);
}
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#else
/* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
if (c->x86 == 15 && c->x86_cache_alignment == 64)
c->x86_cache_alignment = 128;
#endif
/* CPUID workaround for 0F33/0F34 CPU */
if (c->x86 == 0xF && c->x86_model == 0x3
&& (c->x86_mask == 0x3 || c->x86_mask == 0x4))
c->x86_phys_bits = 36;
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
* with P/T states and does not stop in deep C-states.
*
* It is also reliable across cores and sockets. (but not across
* cabinets - we turn it off in that case explicitly.)
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
set_sched_clock_stable();
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
if (c->x86 == 6) {
switch (c->x86_model) {
case 0x27: /* Penwell */
case 0x35: /* Cloverview */
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
break;
default:
break;
}
}
/*
* There is a known erratum on Pentium III and Core Solo
* and Core Duo CPUs.
* " Page with PAT set to WC while associated MTRR is UC
* may consolidate to UC "
* Because of this erratum, it is better to stick with
* setting WC in MTRR rather than using PAT on these CPUs.
*
* Enable PAT WC only on P4, Core 2 or later CPUs.
*/
if (c->x86 == 6 && c->x86_model < 15)
clear_cpu_cap(c, X86_FEATURE_PAT);
#ifdef CONFIG_KMEMCHECK
/*
* P4s have a "fast strings" feature which causes single-
* stepping REP instructions to only generate a #DB on
* cache-line boundaries.
*
* Ingo Molnar reported a Pentium D (model 6) and a Xeon
* (model 2) with the same problem.
*/
if (c->x86 == 15)
if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
pr_info("kmemcheck: Disabling fast string operations\n");
#endif
/*
* If fast string is not enabled in IA32_MISC_ENABLE for any reason,
* clear the fast string and enhanced fast string CPU capabilities.
*/
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
printk(KERN_INFO "Disabled fast string operations\n");
setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
setup_clear_cpu_cap(X86_FEATURE_ERMS);
}
}
/*
* Intel Quark Core DevMan_001.pdf section 6.4.11
* "The operating system also is required to invalidate (i.e., flush)
* the TLB when any changes are made to any of the page table entries.
* The operating system must reload CR3 to cause the TLB to be flushed"
*
* As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should
* be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE
* to be modified
*/
if (c->x86 == 5 && c->x86_model == 9) {
pr_info("Disabling PGE capability bit\n");
setup_clear_cpu_cap(X86_FEATURE_PGE);
}
}
#ifdef CONFIG_X86_32
/*
* Early probe support logic for ppro memory erratum #50
*
* This is called before we do cpu ident work
*/
int ppro_with_ram_bug(void)
{
/* Uses data from early_cpu_detect now */
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
boot_cpu_data.x86 == 6 &&
boot_cpu_data.x86_model == 1 &&
boot_cpu_data.x86_mask < 8) {
printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
return 1;
}
return 0;
}
static void intel_smp_check(struct cpuinfo_x86 *c)
{
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
/*
* Mask B, Pentium, but not Pentium MMX
*/
if (c->x86 == 5 &&
c->x86_mask >= 1 && c->x86_mask <= 4 &&
c->x86_model <= 3) {
/*
* Remember we have B step Pentia with bugs
*/
WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
"with B stepping processors.\n");
}
}
static int forcepae;
static int __init forcepae_setup(char *__unused)
{
forcepae = 1;
return 1;
}
__setup("forcepae", forcepae_setup);
static void intel_workarounds(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_F00F_BUG
/*
* All models of Pentium and Pentium with MMX technology CPUs
* have the F0 0F bug, which lets nonprivileged users lock up the
* system. Announce that the fault handler will be checking for it.
* The Quark is also family 5, but does not have the same bug.
*/
clear_cpu_bug(c, X86_BUG_F00F);
if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) {
static int f00f_workaround_enabled;
set_cpu_bug(c, X86_BUG_F00F);
if (!f00f_workaround_enabled) {
printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
f00f_workaround_enabled = 1;
}
}
#endif
/*
* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
* model 3 mask 3
*/
if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
* PAE CPUID issue: many Pentium M report no PAE but may have a
* functionally usable PAE implementation.
* Forcefully enable PAE if kernel parameter "forcepae" is present.
*/
if (forcepae) {
printk(KERN_WARNING "PAE forced!\n");
set_cpu_cap(c, X86_FEATURE_PAE);
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
}
/*
* P4 Xeon errata 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
if (msr_set_bit(MSR_IA32_MISC_ENABLE,
MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
> 0) {
pr_info("CPU: C0 stepping P4 Xeon detected.\n");
pr_info("CPU: Disabling hardware prefetching (Errata 037)\n");
}
}
/*
* See if we have a good local APIC by checking for buggy Pentia,
* i.e. all B steppings and the C2 stepping of P54C when using their
* integrated APIC (see 11AP erratum in "Pentium Processor
* Specification Update").
*/
if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
(c->x86_mask < 0x6 || c->x86_mask == 0xb))
set_cpu_bug(c, X86_BUG_11AP);
#ifdef CONFIG_X86_INTEL_USERCOPY
/*
* Set up the preferred alignment for movsl bulk memory moves
*/
switch (c->x86) {
case 4: /* 486: untested */
break;
case 5: /* Old Pentia: untested */
break;
case 6: /* PII/PIII only like movsl with 8-byte alignment */
movsl_mask.mask = 7;
break;
case 15: /* P4 is OK down to 8-byte alignment */
movsl_mask.mask = 7;
break;
}
#endif
intel_smp_check(c);
}
#else
static void intel_workarounds(struct cpuinfo_x86 *c)
{
}
#endif
static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
unsigned node;
int cpu = smp_processor_id();
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE || !node_online(node)) {
/* reuse the value from init_cpu_to_node() */
node = cpu_to_node(cpu);
}
numa_set_node(cpu, node);
#endif
}
/*
* find out the number of processor cores on the die
*/
static int intel_num_cpu_cores(struct cpuinfo_x86 *c)
{
unsigned int eax, ebx, ecx, edx;
if (c->cpuid_level < 4)
return 1;
/* Intel has a non-standard dependency on %ecx for this CPUID level. */
cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
if (eax & 0x1f)
return (eax >> 26) + 1;
else
return 1;
}
static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
{
/* Intel VMX MSR indicated features */
#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
clear_cpu_cap(c, X86_FEATURE_VNMI);
clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
clear_cpu_cap(c, X86_FEATURE_EPT);
clear_cpu_cap(c, X86_FEATURE_VPID);
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
msr_ctl = vmx_msr_high | vmx_msr_low;
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
set_cpu_cap(c, X86_FEATURE_VNMI);
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
vmx_msr_low, vmx_msr_high);
msr_ctl2 = vmx_msr_high | vmx_msr_low;
if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
(msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
set_cpu_cap(c, X86_FEATURE_EPT);
if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
set_cpu_cap(c, X86_FEATURE_VPID);
}
}
static void init_intel(struct cpuinfo_x86 *c)
{
unsigned int l2 = 0;
early_init_intel(c);
intel_workarounds(c);
/*
* Detect the extended topology information if available. This
* will reinitialise the initial_apicid which will be used
* in init_intel_cacheinfo()
*/
detect_extended_topology(c);
if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
/*
* let's use the legacy cpuid vector 0x1 and 0x4 for topology
* detection.
*/
c->x86_max_cores = intel_num_cpu_cores(c);
#ifdef CONFIG_X86_32
detect_ht(c);
#endif
}
l2 = init_intel_cacheinfo(c);
/* Detect legacy cache sizes if init_intel_cacheinfo did not */
if (l2 == 0) {
cpu_detect_cache_sizes(c);
l2 = c->x86_cache_size;
}
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
/* Check for version and the number of counters */
if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
if (cpu_has_xmm2)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
if (cpu_has_ds) {
unsigned int l1;
rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
if (!(l1 & (1<<11)))
set_cpu_cap(c, X86_FEATURE_BTS);
if (!(l1 & (1<<12)))
set_cpu_cap(c, X86_FEATURE_PEBS);
}
if (c->x86 == 6 && cpu_has_clflush &&
(c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
#ifdef CONFIG_X86_64
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
#else
/*
* Names for the Pentium II/Celeron processors
* detectable only by also checking the cache size.
* Dixon is NOT a Celeron.
*/
if (c->x86 == 6) {
char *p = NULL;
switch (c->x86_model) {
case 5:
if (l2 == 0)
p = "Celeron (Covington)";
else if (l2 == 256)
p = "Mobile Pentium II (Dixon)";
break;
case 6:
if (l2 == 128)
p = "Celeron (Mendocino)";
else if (c->x86_mask == 0 || c->x86_mask == 5)
p = "Celeron-A";
break;
case 8:
if (l2 == 128)
p = "Celeron (Coppermine)";
break;
}
if (p)
strcpy(c->x86_model_id, p);
}
if (c->x86 == 15)
set_cpu_cap(c, X86_FEATURE_P4);
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_P3);
#endif
/* Work around errata */
srat_detect_node(c);
if (cpu_has(c, X86_FEATURE_VMX))
detect_vmx_virtcap(c);
/*
* Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
* x86_energy_perf_policy(8) is available to change it at run-time
*/
if (cpu_has(c, X86_FEATURE_EPB)) {
u64 epb;
rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
" Set to 'normal', was 'performance'\n"
"ENERGY_PERF_BIAS: View and update with"
" x86_energy_perf_policy(8)\n");
epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
}
}
}
#ifdef CONFIG_X86_32
static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/*
* Intel PIII Tualatin. This comes in two flavours.
* One has 256kb of cache, the other 512. We have no way
* to determine which, so we use a boottime override
* for the 512kb model, and assume 256 otherwise.
*/
if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
size = 256;
/*
* Intel Quark SoC X1000 contains a 4-way set associative
* 16K cache with a 16 byte cache line and 256 lines per tag
*/
if ((c->x86 == 5) && (c->x86_model == 9))
size = 16;
return size;
}
#endif
#define TLB_INST_4K 0x01
#define TLB_INST_4M 0x02
#define TLB_INST_2M_4M 0x03
#define TLB_INST_ALL 0x05
#define TLB_INST_1G 0x06
#define TLB_DATA_4K 0x11
#define TLB_DATA_4M 0x12
#define TLB_DATA_2M_4M 0x13
#define TLB_DATA_4K_4M 0x14
#define TLB_DATA_1G 0x16
#define TLB_DATA0_4K 0x21
#define TLB_DATA0_4M 0x22
#define TLB_DATA0_2M_4M 0x23
#define STLB_4K 0x41
#define STLB_4K_2M 0x42
static const struct _tlb_table intel_tlb_table[] = {
{ 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
{ 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
{ 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
{ 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
{ 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
{ 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
{ 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
{ 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
{ 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
{ 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
{ 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
{ 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
{ 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
{ 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
{ 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
{ 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
{ 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
{ 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
{ 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" },
{ 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" },
{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
{ 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
{ 0x00, 0, 0 }
};
static void intel_tlb_lookup(const unsigned char desc)
{
unsigned char k;
if (desc == 0)
return;
/* look up this descriptor in the table */
for (k = 0; intel_tlb_table[k].descriptor != desc && \
intel_tlb_table[k].descriptor != 0; k++)
;
if (intel_tlb_table[k].tlb_type == 0)
return;
switch (intel_tlb_table[k].tlb_type) {
case STLB_4K:
if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
break;
case STLB_4K_2M:
if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_INST_ALL:
if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_INST_4K:
if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_INST_4M:
if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_INST_2M_4M:
if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_DATA_4K:
case TLB_DATA0_4K:
if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_DATA_4M:
case TLB_DATA0_4M:
if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_DATA_2M_4M:
case TLB_DATA0_2M_4M:
if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_DATA_4K_4M:
if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_DATA_1G:
if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
break;
}
}
static void intel_detect_tlb(struct cpuinfo_x86 *c)
{
int i, j, n;
unsigned int regs[4];
unsigned char *desc = (unsigned char *)regs;
if (c->cpuid_level < 2)
return;
/* Number of times to iterate */
n = cpuid_eax(2) & 0xFF;
for (i = 0 ; i < n ; i++) {
cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
/* If bit 31 is set, this is an unknown format */
for (j = 0 ; j < 3 ; j++)
if (regs[j] & (1 << 31))
regs[j] = 0;
/* Byte 0 is level count, not a descriptor */
for (j = 1 ; j < 16 ; j++)
intel_tlb_lookup(desc[j]);
}
}
static const struct cpu_dev intel_cpu_dev = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
#ifdef CONFIG_X86_32
.legacy_models = {
{ .family = 4, .model_names =
{
[0] = "486 DX-25/33",
[1] = "486 DX-50",
[2] = "486 SX",
[3] = "486 DX/2",
[4] = "486 SL",
[5] = "486 SX/2",
[7] = "486 DX/2-WB",
[8] = "486 DX/4",
[9] = "486 DX/4-WB"
}
},
{ .family = 5, .model_names =
{
[0] = "Pentium 60/66 A-step",
[1] = "Pentium 60/66",
[2] = "Pentium 75 - 200",
[3] = "OverDrive PODP5V83",
[4] = "Pentium MMX",
[7] = "Mobile Pentium 75 - 200",
[8] = "Mobile Pentium MMX",
[9] = "Quark SoC X1000",
}
},
{ .family = 6, .model_names =
{
[0] = "Pentium Pro A-step",
[1] = "Pentium Pro",
[3] = "Pentium II (Klamath)",
[4] = "Pentium II (Deschutes)",
[5] = "Pentium II (Deschutes)",
[6] = "Mobile Pentium II",
[7] = "Pentium III (Katmai)",
[8] = "Pentium III (Coppermine)",
[10] = "Pentium III (Cascades)",
[11] = "Pentium III (Tualatin)",
}
},
{ .family = 15, .model_names =
{
[0] = "Pentium 4 (Unknown)",
[1] = "Pentium 4 (Willamette)",
[2] = "Pentium 4 (Northwood)",
[4] = "Pentium 4 (Foster)",
[5] = "Pentium 4 (Foster)",
}
},
},
.legacy_cache_size = intel_size_cache,
#endif
.c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
.c_init = init_intel,
.c_x86_vendor = X86_VENDOR_INTEL,
};
cpu_dev_register(intel_cpu_dev);

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,49 @@
#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/slab.h>
/**
* x86_match_cpu - match current CPU again an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
*
* Return the entry if the current CPU matches the entries in the
* passed x86_cpu_id match table. Otherwise NULL. The match table
* contains vendor (X86_VENDOR_*), family, model and feature bits or
* respective wildcard entries.
*
* A typical table entry would be to match a specific CPU
* { X86_VENDOR_INTEL, 6, 0x12 }
* or to match a specific CPU feature
* { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
* %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
*
* This always matches against the boot cpu, assuming models and features are
* consistent over all CPUs.
*/
const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
{
const struct x86_cpu_id *m;
struct cpuinfo_x86 *c = &boot_cpu_data;
for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
continue;
if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
continue;
if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
continue;
if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
continue;
return m;
}
return NULL;
}
EXPORT_SYMBOL(x86_match_cpu);

View file

@ -0,0 +1,11 @@
obj-y = mce.o mce-severity.o
obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
obj-$(CONFIG_ACPI_APEI) += mce-apei.o

View file

@ -0,0 +1,155 @@
/*
* Bridge between MCE and APEI
*
* On some machine, corrected memory errors are reported via APEI
* generic hardware error source (GHES) instead of corrected Machine
* Check. These corrected memory errors can be reported to user space
* through /dev/mcelog via faking a corrected Machine Check, so that
* the error memory page can be offlined by /sbin/mcelog if the error
* count for one page is beyond the threshold.
*
* For fatal MCE, save MCE record into persistent storage via ERST, so
* that the MCE record can be logged after reboot via ERST.
*
* Copyright 2010 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/cper.h>
#include <acpi/apei.h>
#include <acpi/ghes.h>
#include <asm/mce.h>
#include "mce-internal.h"
void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
{
struct mce m;
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return;
mce_setup(&m);
m.bank = 1;
/* Fake a memory read error with unknown channel */
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
if (severity >= GHES_SEV_RECOVERABLE)
m.status |= MCI_STATUS_UC;
if (severity >= GHES_SEV_PANIC)
m.status |= MCI_STATUS_PCC;
m.addr = mem_err->physical_addr;
mce_log(&m);
mce_notify_irq();
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
#define CPER_CREATOR_MCE \
UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
0x64, 0x90, 0xb8, 0x9d)
#define CPER_SECTION_TYPE_MCE \
UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
0x04, 0x4a, 0x38, 0xfc)
/*
* CPER specification (in UEFI specification 2.3 appendix N) requires
* byte-packed.
*/
struct cper_mce_record {
struct cper_record_header hdr;
struct cper_section_descriptor sec_hdr;
struct mce mce;
} __packed;
int apei_write_mce(struct mce *m)
{
struct cper_mce_record rcd;
memset(&rcd, 0, sizeof(rcd));
memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
rcd.hdr.revision = CPER_RECORD_REV;
rcd.hdr.signature_end = CPER_SIG_END;
rcd.hdr.section_count = 1;
rcd.hdr.error_severity = CPER_SEV_FATAL;
/* timestamp, platform_id, partition_id are all invalid */
rcd.hdr.validation_bits = 0;
rcd.hdr.record_length = sizeof(rcd);
rcd.hdr.creator_id = CPER_CREATOR_MCE;
rcd.hdr.notification_type = CPER_NOTIFY_MCE;
rcd.hdr.record_id = cper_next_record_id();
rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
rcd.sec_hdr.section_length = sizeof(rcd.mce);
rcd.sec_hdr.revision = CPER_SEC_REV;
/* fru_id and fru_text is invalid */
rcd.sec_hdr.validation_bits = 0;
rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
memcpy(&rcd.mce, m, sizeof(*m));
return erst_write(&rcd.hdr);
}
ssize_t apei_read_mce(struct mce *m, u64 *record_id)
{
struct cper_mce_record rcd;
int rc, pos;
rc = erst_get_record_id_begin(&pos);
if (rc)
return rc;
retry:
rc = erst_get_record_id_next(&pos, record_id);
if (rc)
goto out;
/* no more record */
if (*record_id == APEI_ERST_INVALID_RECORD_ID)
goto out;
rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
/* someone else has cleared the record, try next one */
if (rc == -ENOENT)
goto retry;
else if (rc < 0)
goto out;
/* try to skip other type records in storage */
else if (rc != sizeof(rcd) ||
uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
goto retry;
memcpy(m, &rcd.mce, sizeof(*m));
rc = sizeof(*m);
out:
erst_get_record_id_end();
return rc;
}
/* Check whether there is record in ERST */
int apei_check_mce(void)
{
return erst_get_record_count();
}
int apei_clear_mce(u64 record_id)
{
return erst_clear(record_id);
}

View file

@ -0,0 +1,256 @@
/*
* Machine check injection support.
* Copyright 2008 Intel Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*
* Authors:
* Andi Kleen
* Ying Huang
*/
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/notifier.h>
#include <linux/kdebug.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <asm/mce.h>
#include <asm/apic.h>
#include <asm/nmi.h>
/* Update fake mce registers on current CPU. */
static void inject_mce(struct mce *m)
{
struct mce *i = &per_cpu(injectm, m->extcpu);
/* Make sure no one reads partially written injectm */
i->finished = 0;
mb();
m->finished = 0;
/* First set the fields after finished */
i->extcpu = m->extcpu;
mb();
/* Now write record in order, finished last (except above) */
memcpy(i, m, sizeof(struct mce));
/* Finally activate it */
mb();
i->finished = 1;
}
static void raise_poll(struct mce *m)
{
unsigned long flags;
mce_banks_t b;
memset(&b, 0xff, sizeof(mce_banks_t));
local_irq_save(flags);
machine_check_poll(0, &b);
local_irq_restore(flags);
m->finished = 0;
}
static void raise_exception(struct mce *m, struct pt_regs *pregs)
{
struct pt_regs regs;
unsigned long flags;
if (!pregs) {
memset(&regs, 0, sizeof(struct pt_regs));
regs.ip = m->ip;
regs.cs = m->cs;
pregs = &regs;
}
/* in mcheck exeception handler, irq will be disabled */
local_irq_save(flags);
do_machine_check(pregs, 0);
local_irq_restore(flags);
m->finished = 0;
}
static cpumask_var_t mce_inject_cpumask;
static DEFINE_MUTEX(mce_inject_mutex);
static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
{
int cpu = smp_processor_id();
struct mce *m = this_cpu_ptr(&injectm);
if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
return NMI_DONE;
cpumask_clear_cpu(cpu, mce_inject_cpumask);
if (m->inject_flags & MCJ_EXCEPTION)
raise_exception(m, regs);
else if (m->status)
raise_poll(m);
return NMI_HANDLED;
}
static void mce_irq_ipi(void *info)
{
int cpu = smp_processor_id();
struct mce *m = this_cpu_ptr(&injectm);
if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
m->inject_flags & MCJ_EXCEPTION) {
cpumask_clear_cpu(cpu, mce_inject_cpumask);
raise_exception(m, NULL);
}
}
/* Inject mce on current CPU */
static int raise_local(void)
{
struct mce *m = this_cpu_ptr(&injectm);
int context = MCJ_CTX(m->inject_flags);
int ret = 0;
int cpu = m->extcpu;
if (m->inject_flags & MCJ_EXCEPTION) {
printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
switch (context) {
case MCJ_CTX_IRQ:
/*
* Could do more to fake interrupts like
* calling irq_enter, but the necessary
* machinery isn't exported currently.
*/
/*FALL THROUGH*/
case MCJ_CTX_PROCESS:
raise_exception(m, NULL);
break;
default:
printk(KERN_INFO "Invalid MCE context\n");
ret = -EINVAL;
}
printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
} else if (m->status) {
printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
raise_poll(m);
mce_notify_irq();
printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
} else
m->finished = 0;
return ret;
}
static void raise_mce(struct mce *m)
{
int context = MCJ_CTX(m->inject_flags);
inject_mce(m);
if (context == MCJ_CTX_RANDOM)
return;
#ifdef CONFIG_X86_LOCAL_APIC
if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
unsigned long start;
int cpu;
get_online_cpus();
cpumask_copy(mce_inject_cpumask, cpu_online_mask);
cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
for_each_online_cpu(cpu) {
struct mce *mcpu = &per_cpu(injectm, cpu);
if (!mcpu->finished ||
MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
cpumask_clear_cpu(cpu, mce_inject_cpumask);
}
if (!cpumask_empty(mce_inject_cpumask)) {
if (m->inject_flags & MCJ_IRQ_BROADCAST) {
/*
* don't wait because mce_irq_ipi is necessary
* to be sync with following raise_local
*/
preempt_disable();
smp_call_function_many(mce_inject_cpumask,
mce_irq_ipi, NULL, 0);
preempt_enable();
} else if (m->inject_flags & MCJ_NMI_BROADCAST)
apic->send_IPI_mask(mce_inject_cpumask,
NMI_VECTOR);
}
start = jiffies;
while (!cpumask_empty(mce_inject_cpumask)) {
if (!time_before(jiffies, start + 2*HZ)) {
printk(KERN_ERR
"Timeout waiting for mce inject %lx\n",
*cpumask_bits(mce_inject_cpumask));
break;
}
cpu_relax();
}
raise_local();
put_cpu();
put_online_cpus();
} else
#endif
{
preempt_disable();
raise_local();
preempt_enable();
}
}
/* Error injection interface */
static ssize_t mce_write(struct file *filp, const char __user *ubuf,
size_t usize, loff_t *off)
{
struct mce m;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/*
* There are some cases where real MSR reads could slip
* through.
*/
if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
return -EIO;
if ((unsigned long)usize > sizeof(struct mce))
usize = sizeof(struct mce);
if (copy_from_user(&m, ubuf, usize))
return -EFAULT;
if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
return -EINVAL;
/*
* Need to give user space some time to set everything up,
* so do it a jiffie or two later everywhere.
*/
schedule_timeout(2);
mutex_lock(&mce_inject_mutex);
raise_mce(&m);
mutex_unlock(&mce_inject_mutex);
return usize;
}
static int inject_init(void)
{
if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
return -ENOMEM;
printk(KERN_INFO "Machine check injector initialized\n");
register_mce_write_callback(mce_write);
register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
"mce_notify");
return 0;
}
module_init(inject_init);
/*
* Cannot tolerate unloading currently because we cannot
* guarantee all openers of mce_chrdev will get a reference to us.
*/
MODULE_LICENSE("GPL");

View file

@ -0,0 +1,66 @@
#include <linux/device.h>
#include <asm/mce.h>
enum severity_level {
MCE_NO_SEVERITY,
MCE_KEEP_SEVERITY,
MCE_SOME_SEVERITY,
MCE_AO_SEVERITY,
MCE_UC_SEVERITY,
MCE_AR_SEVERITY,
MCE_PANIC_SEVERITY,
};
#define ATTR_LEN 16
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank {
u64 ctl; /* subevents to enable */
unsigned char init; /* initialise bank? */
struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
};
int mce_severity(struct mce *a, int tolerant, char **msg);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
unsigned long mce_intel_adjust_timer(unsigned long interval);
void mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
#else
# define mce_intel_adjust_timer mce_adjust_timer_default
static inline void mce_intel_cmci_poll(void) { }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
#endif
void mce_timer_kick(unsigned long interval);
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
ssize_t apei_read_mce(struct mce *m, u64 *record_id);
int apei_check_mce(void);
int apei_clear_mce(u64 record_id);
#else
static inline int apei_write_mce(struct mce *m)
{
return -EINVAL;
}
static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
{
return 0;
}
static inline int apei_check_mce(void)
{
return 0;
}
static inline int apei_clear_mce(u64 record_id)
{
return -EINVAL;
}
#endif

View file

@ -0,0 +1,283 @@
/*
* MCE grading rules.
* Copyright 2008, 2009 Intel Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*
* Author: Andi Kleen
*/
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/debugfs.h>
#include <asm/mce.h>
#include "mce-internal.h"
/*
* Grade an mce by severity. In general the most severe ones are processed
* first. Since there are quite a lot of combinations test the bits in a
* table-driven way. The rules are simply processed in order, first
* match wins.
*
* Note this is only used for machine check exceptions, the corrected
* errors use much simpler rules. The exceptions still check for the corrected
* errors, but only to leave them alone for the CMCI handler (except for
* panic situations)
*/
enum context { IN_KERNEL = 1, IN_USER = 2 };
enum ser { SER_REQUIRED = 1, NO_SER = 2 };
static struct severity {
u64 mask;
u64 result;
unsigned char sev;
unsigned char mcgmask;
unsigned char mcgres;
unsigned char ser;
unsigned char context;
unsigned char covered;
char *msg;
} severities[] = {
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define KERNEL .context = IN_KERNEL
#define USER .context = IN_USER
#define SER .ser = SER_REQUIRED
#define NOSER .ser = NO_SER
#define BITCLR(x) .mask = x, .result = 0
#define BITSET(x) .mask = x, .result = x
#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
#define MASK(x, y) .mask = x, .result = y
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
MCESEV(
NO, "Invalid",
BITCLR(MCI_STATUS_VAL)
),
MCESEV(
NO, "Not enabled",
BITCLR(MCI_STATUS_EN)
),
MCESEV(
PANIC, "Processor context corrupt",
BITSET(MCI_STATUS_PCC)
),
/* When MCIP is not set something is very confused */
MCESEV(
PANIC, "MCIP not set in MCA handler",
MCGMASK(MCG_STATUS_MCIP, 0)
),
/* Neither return not error IP -- no chance to recover -> PANIC */
MCESEV(
PANIC, "Neither restart nor error IP",
MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
),
MCESEV(
PANIC, "In kernel and no restart IP",
KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
),
MCESEV(
KEEP, "Corrected error",
NOSER, BITCLR(MCI_STATUS_UC)
),
/* ignore OVER for UCNA */
MCESEV(
KEEP, "Uncorrected no action required",
SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
),
MCESEV(
PANIC, "Illegal combination (UCNA with AR=1)",
SER,
MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
),
MCESEV(
KEEP, "Non signalled machine check",
SER, BITCLR(MCI_STATUS_S)
),
MCESEV(
PANIC, "Action required with lost events",
SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
),
/* known AR MCACODs: */
#ifdef CONFIG_MEMORY_FAILURE
MCESEV(
KEEP, "Action required but unaffected thread is continuable",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
),
MCESEV(
AR, "Action required: data load error in a user process",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
USER
),
MCESEV(
AR, "Action required: instruction fetch error in a user process",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
USER
),
#endif
MCESEV(
PANIC, "Action required: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
),
/* known AO MCACODs: */
MCESEV(
AO, "Action optional: memory scrubbing error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
),
MCESEV(
AO, "Action optional: last level cache writeback error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
),
MCESEV(
SOME, "Action optional: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
),
MCESEV(
SOME, "Action optional with lost events",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
),
MCESEV(
PANIC, "Overflowed uncorrected",
BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
),
MCESEV(
UC, "Uncorrected",
BITSET(MCI_STATUS_UC)
),
MCESEV(
SOME, "No match",
BITSET(0)
) /* always matches. keep at end */
};
/*
* If mcgstatus indicated that ip/cs on the stack were
* no good, then "m->cs" will be zero and we will have
* to assume the worst case (IN_KERNEL) as we actually
* have no idea what we were executing when the machine
* check hit.
* If we do have a good "m->cs" (or a faked one in the
* case we were executing in VM86 mode) we can use it to
* distinguish an exception taken in user from from one
* taken in the kernel.
*/
static int error_context(struct mce *m)
{
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
int mce_severity(struct mce *m, int tolerant, char **msg)
{
enum context ctx = error_context(m);
struct severity *s;
for (s = severities;; s++) {
if ((m->status & s->mask) != s->result)
continue;
if ((m->mcgstatus & s->mcgmask) != s->mcgres)
continue;
if (s->ser == SER_REQUIRED && !mca_cfg.ser)
continue;
if (s->ser == NO_SER && mca_cfg.ser)
continue;
if (s->context && ctx != s->context)
continue;
if (msg)
*msg = s->msg;
s->covered = 1;
if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
if (panic_on_oops || tolerant < 1)
return MCE_PANIC_SEVERITY;
}
return s->sev;
}
}
#ifdef CONFIG_DEBUG_FS
static void *s_start(struct seq_file *f, loff_t *pos)
{
if (*pos >= ARRAY_SIZE(severities))
return NULL;
return &severities[*pos];
}
static void *s_next(struct seq_file *f, void *data, loff_t *pos)
{
if (++(*pos) >= ARRAY_SIZE(severities))
return NULL;
return &severities[*pos];
}
static void s_stop(struct seq_file *f, void *data)
{
}
static int s_show(struct seq_file *f, void *data)
{
struct severity *ser = data;
seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
return 0;
}
static const struct seq_operations severities_seq_ops = {
.start = s_start,
.next = s_next,
.stop = s_stop,
.show = s_show,
};
static int severities_coverage_open(struct inode *inode, struct file *file)
{
return seq_open(file, &severities_seq_ops);
}
static ssize_t severities_coverage_write(struct file *file,
const char __user *ubuf,
size_t count, loff_t *ppos)
{
int i;
for (i = 0; i < ARRAY_SIZE(severities); i++)
severities[i].covered = 0;
return count;
}
static const struct file_operations severities_coverage_fops = {
.open = severities_coverage_open,
.release = seq_release,
.read = seq_read,
.write = severities_coverage_write,
.llseek = seq_lseek,
};
static int __init severities_debugfs_init(void)
{
struct dentry *dmce, *fsev;
dmce = mce_get_debugfs_dir();
if (!dmce)
goto err_out;
fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
&severities_coverage_fops);
if (!fsev)
goto err_out;
return 0;
err_out:
return -ENOMEM;
}
late_initcall(severities_debugfs_init);
#endif /* CONFIG_DEBUG_FS */

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,789 @@
/*
* (c) 2005-2012 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
*
* Written by Jacob Shin - AMD, Inc.
*
* Maintained by: Borislav Petkov <bp@alien8.de>
*
* April 2006
* - added support for AMD Family 0x10 processors
* May 2012
* - major scrubbing
*
* All MC4_MISCi registers are shared between multi-cores
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/kobject.h>
#include <linux/percpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <asm/amd_nb.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#define NR_BLOCKS 9
#define THRESHOLD_MAX 0xFFF
#define INT_TYPE_APIC 0x00020000
#define MASK_VALID_HI 0x80000000
#define MASK_CNTP_HI 0x40000000
#define MASK_LOCKED_HI 0x20000000
#define MASK_LVTOFF_HI 0x00F00000
#define MASK_COUNT_EN_HI 0x00080000
#define MASK_INT_TYPE_HI 0x00060000
#define MASK_OVERFLOW_HI 0x00010000
#define MASK_ERR_COUNT_HI 0x00000FFF
#define MASK_BLKPTR_LO 0xFF000000
#define MCG_XBLK_ADDR 0xC0000400
static const char * const th_names[] = {
"load_store",
"insn_fetch",
"combined_unit",
"",
"northbridge",
"execution_unit",
};
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
static void amd_threshold_interrupt(void);
/*
* CPU Initialization
*/
struct thresh_restart {
struct threshold_block *b;
int reset;
int set_lvt_off;
int lvt_off;
u16 old_limit;
};
static inline bool is_shared_bank(int bank)
{
/* Bank 4 is for northbridge reporting and is thus shared */
return (bank == 4);
}
static const char * const bank4_names(struct threshold_block *b)
{
switch (b->address) {
/* MSR4_MISC0 */
case 0x00000413:
return "dram";
case 0xc0000408:
return "ht_links";
case 0xc0000409:
return "l3_cache";
default:
WARN(1, "Funny MSR: 0x%08x\n", b->address);
return "";
}
};
static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
{
/*
* bank 4 supports APIC LVT interrupts implicitly since forever.
*/
if (bank == 4)
return true;
/*
* IntP: interrupt present; if this bit is set, the thresholding
* bank can generate APIC LVT interrupts
*/
return msr_high_bits & BIT(28);
}
static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
if (apic < 0) {
pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
b->bank, b->block, b->address, hi, lo);
return 0;
}
if (apic != msr) {
pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n",
b->cpu, apic, b->bank, b->block, b->address, hi, lo);
return 0;
}
return 1;
};
/*
* Called via smp_call_function_single(), must be called with correct
* cpu affinity.
*/
static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;
u32 hi, lo;
rdmsr(tr->b->address, lo, hi);
if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
tr->reset = 1; /* limit cannot be lower than err count */
if (tr->reset) { /* reset err count and overflow bit */
hi =
(hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
(THRESHOLD_MAX - tr->b->threshold_limit);
} else if (tr->old_limit) { /* change limit w/o reset */
int new_count = (hi & THRESHOLD_MAX) +
(tr->old_limit - tr->b->threshold_limit);
hi = (hi & ~MASK_ERR_COUNT_HI) |
(new_count & THRESHOLD_MAX);
}
/* clear IntType */
hi &= ~MASK_INT_TYPE_HI;
if (!tr->b->interrupt_capable)
goto done;
if (tr->set_lvt_off) {
if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
/* set new lvt offset */
hi &= ~MASK_LVTOFF_HI;
hi |= tr->lvt_off << 20;
}
}
if (tr->b->interrupt_enable)
hi |= INT_TYPE_APIC;
done:
hi |= MASK_COUNT_EN_HI;
wrmsr(tr->b->address, lo, hi);
}
static void mce_threshold_block_init(struct threshold_block *b, int offset)
{
struct thresh_restart tr = {
.b = b,
.set_lvt_off = 1,
.lvt_off = offset,
};
b->threshold_limit = THRESHOLD_MAX;
threshold_restart_bank(&tr);
};
static int setup_APIC_mce(int reserved, int new)
{
if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
APIC_EILVT_MSG_FIX, 0))
return new;
return reserved;
}
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
struct threshold_block b;
unsigned int cpu = smp_processor_id();
u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
int offset = -1;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
address = MSR_IA32_MC0_MISC + bank * 4;
else if (block == 1) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
break;
address += MCG_XBLK_ADDR;
} else
++address;
if (rdmsr_safe(address, &low, &high))
break;
if (!(high & MASK_VALID_HI))
continue;
if (!(high & MASK_CNTP_HI) ||
(high & MASK_LOCKED_HI))
continue;
if (!block)
per_cpu(bank_map, cpu) |= (1 << bank);
memset(&b, 0, sizeof(b));
b.cpu = cpu;
b.bank = bank;
b.block = block;
b.address = address;
b.interrupt_capable = lvt_interrupt_supported(bank, high);
if (b.interrupt_capable) {
int new = (high & MASK_LVTOFF_HI) >> 20;
offset = setup_APIC_mce(offset, new);
}
mce_threshold_block_init(&b, offset);
mce_threshold_vector = amd_threshold_interrupt;
}
}
}
/*
* APIC Interrupt Handler
*/
/*
* threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
* the interrupt goes off when error_count reaches threshold_limit.
* the handler will simply log mcelog w/ software defined bank number.
*/
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
struct mce m;
mce_setup(&m);
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0) {
address = MSR_IA32_MC0_MISC + bank * 4;
} else if (block == 1) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
break;
address += MCG_XBLK_ADDR;
} else {
++address;
}
if (rdmsr_safe(address, &low, &high))
break;
if (!(high & MASK_VALID_HI)) {
if (block)
continue;
else
break;
}
if (!(high & MASK_CNTP_HI) ||
(high & MASK_LOCKED_HI))
continue;
/*
* Log the machine check that caused the threshold
* event.
*/
machine_check_poll(MCP_TIMESTAMP,
this_cpu_ptr(&mce_poll_banks));
if (high & MASK_OVERFLOW_HI) {
rdmsrl(address, m.misc);
rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
m.status);
m.bank = K8_MCE_THRESHOLD_BASE
+ bank * NR_BLOCKS
+ block;
mce_log(&m);
return;
}
}
}
}
/*
* Sysfs Interface
*/
struct threshold_attr {
struct attribute attr;
ssize_t (*show) (struct threshold_block *, char *);
ssize_t (*store) (struct threshold_block *, const char *, size_t count);
};
#define SHOW_FIELDS(name) \
static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
{ \
return sprintf(buf, "%lu\n", (unsigned long) b->name); \
}
SHOW_FIELDS(interrupt_enable)
SHOW_FIELDS(threshold_limit)
static ssize_t
store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
{
struct thresh_restart tr;
unsigned long new;
if (!b->interrupt_capable)
return -EINVAL;
if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
b->interrupt_enable = !!new;
memset(&tr, 0, sizeof(tr));
tr.b = b;
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
return size;
}
static ssize_t
store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
{
struct thresh_restart tr;
unsigned long new;
if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
if (new > THRESHOLD_MAX)
new = THRESHOLD_MAX;
if (new < 1)
new = 1;
memset(&tr, 0, sizeof(tr));
tr.old_limit = b->threshold_limit;
b->threshold_limit = new;
tr.b = b;
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
return size;
}
static ssize_t show_error_count(struct threshold_block *b, char *buf)
{
u32 lo, hi;
rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
(THRESHOLD_MAX - b->threshold_limit)));
}
static struct threshold_attr error_count = {
.attr = {.name = __stringify(error_count), .mode = 0444 },
.show = show_error_count,
};
#define RW_ATTR(val) \
static struct threshold_attr val = { \
.attr = {.name = __stringify(val), .mode = 0644 }, \
.show = show_## val, \
.store = store_## val, \
};
RW_ATTR(interrupt_enable);
RW_ATTR(threshold_limit);
static struct attribute *default_attrs[] = {
&threshold_limit.attr,
&error_count.attr,
NULL, /* possibly interrupt_enable if supported, see below */
NULL,
};
#define to_block(k) container_of(k, struct threshold_block, kobj)
#define to_attr(a) container_of(a, struct threshold_attr, attr)
static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
{
struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
ret = a->show ? a->show(b, buf) : -EIO;
return ret;
}
static ssize_t store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
ret = a->store ? a->store(b, buf, count) : -EIO;
return ret;
}
static const struct sysfs_ops threshold_ops = {
.show = show,
.store = store,
};
static struct kobj_type threshold_ktype = {
.sysfs_ops = &threshold_ops,
.default_attrs = default_attrs,
};
static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
unsigned int block, u32 address)
{
struct threshold_block *b = NULL;
u32 low, high;
int err;
if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
return 0;
if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
return 0;
if (!(high & MASK_VALID_HI)) {
if (block)
goto recurse;
else
return 0;
}
if (!(high & MASK_CNTP_HI) ||
(high & MASK_LOCKED_HI))
goto recurse;
b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
if (!b)
return -ENOMEM;
b->block = block;
b->bank = bank;
b->cpu = cpu;
b->address = address;
b->interrupt_enable = 0;
b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
if (b->interrupt_capable)
threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
else
threshold_ktype.default_attrs[2] = NULL;
INIT_LIST_HEAD(&b->miscj);
if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
list_add(&b->miscj,
&per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
} else {
per_cpu(threshold_banks, cpu)[bank]->blocks = b;
}
err = kobject_init_and_add(&b->kobj, &threshold_ktype,
per_cpu(threshold_banks, cpu)[bank]->kobj,
(bank == 4 ? bank4_names(b) : th_names[bank]));
if (err)
goto out_free;
recurse:
if (!block) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
return 0;
address += MCG_XBLK_ADDR;
} else {
++address;
}
err = allocate_threshold_blocks(cpu, bank, ++block, address);
if (err)
goto out_free;
if (b)
kobject_uevent(&b->kobj, KOBJ_ADD);
return err;
out_free:
if (b) {
kobject_put(&b->kobj);
list_del(&b->miscj);
kfree(b);
}
return err;
}
static int __threshold_add_blocks(struct threshold_bank *b)
{
struct list_head *head = &b->blocks->miscj;
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
int err = 0;
err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
if (err)
return err;
list_for_each_entry_safe(pos, tmp, head, miscj) {
err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
if (err) {
list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
kobject_del(&pos->kobj);
return err;
}
}
return err;
}
static int threshold_create_bank(unsigned int cpu, unsigned int bank)
{
struct device *dev = per_cpu(mce_device, cpu);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
const char *name = th_names[bank];
int err = 0;
if (is_shared_bank(bank)) {
nb = node_to_amd_nb(amd_get_nb_id(cpu));
/* threshold descriptor already initialized on this node? */
if (nb && nb->bank4) {
/* yes, use it */
b = nb->bank4;
err = kobject_add(b->kobj, &dev->kobj, name);
if (err)
goto out;
per_cpu(threshold_banks, cpu)[bank] = b;
atomic_inc(&b->cpus);
err = __threshold_add_blocks(b);
goto out;
}
}
b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
if (!b) {
err = -ENOMEM;
goto out;
}
b->kobj = kobject_create_and_add(name, &dev->kobj);
if (!b->kobj) {
err = -EINVAL;
goto out_free;
}
per_cpu(threshold_banks, cpu)[bank] = b;
if (is_shared_bank(bank)) {
atomic_set(&b->cpus, 1);
/* nb is already initialized, see above */
if (nb) {
WARN_ON(nb->bank4);
nb->bank4 = b;
}
}
err = allocate_threshold_blocks(cpu, bank, 0,
MSR_IA32_MC0_MISC + bank * 4);
if (!err)
goto out;
out_free:
kfree(b);
out:
return err;
}
/* create dir/files for all valid threshold banks */
static int threshold_create_device(unsigned int cpu)
{
unsigned int bank;
struct threshold_bank **bp;
int err = 0;
bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
GFP_KERNEL);
if (!bp)
return -ENOMEM;
per_cpu(threshold_banks, cpu) = bp;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
err = threshold_create_bank(cpu, bank);
if (err)
return err;
}
return err;
}
static void deallocate_threshold_block(unsigned int cpu,
unsigned int bank)
{
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
if (!head)
return;
list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
kobject_put(&pos->kobj);
list_del(&pos->miscj);
kfree(pos);
}
kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
}
static void __threshold_remove_blocks(struct threshold_bank *b)
{
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
kobject_del(b->kobj);
list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
kobject_del(&pos->kobj);
}
static void threshold_remove_bank(unsigned int cpu, int bank)
{
struct amd_northbridge *nb;
struct threshold_bank *b;
b = per_cpu(threshold_banks, cpu)[bank];
if (!b)
return;
if (!b->blocks)
goto free_out;
if (is_shared_bank(bank)) {
if (!atomic_dec_and_test(&b->cpus)) {
__threshold_remove_blocks(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
} else {
/*
* the last CPU on this node using the shared bank is
* going away, remove that bank now.
*/
nb = node_to_amd_nb(amd_get_nb_id(cpu));
nb->bank4 = NULL;
}
}
deallocate_threshold_block(cpu, bank);
free_out:
kobject_del(b->kobj);
kobject_put(b->kobj);
kfree(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
}
static void threshold_remove_device(unsigned int cpu)
{
unsigned int bank;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
threshold_remove_bank(cpu, bank);
}
kfree(per_cpu(threshold_banks, cpu));
}
/* get notified when a cpu comes on/off */
static void
amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
{
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
threshold_create_device(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
threshold_remove_device(cpu);
break;
default:
break;
}
}
static __init int threshold_init_device(void)
{
unsigned lcpu = 0;
/* to hit CPUs online before the notifier is up */
for_each_online_cpu(lcpu) {
int err = threshold_create_device(lcpu);
if (err)
return err;
}
threshold_cpu_callback = amd_64_threshold_cpu_callback;
return 0;
}
/*
* there are 3 funcs which need to be _initcalled in a logic sequence:
* 1. xen_late_init_mcelog
* 2. mcheck_init_device
* 3. threshold_init_device
*
* xen_late_init_mcelog must register xen_mce_chrdev_device before
* native mce_chrdev_device registration if running under xen platform;
*
* mcheck_init_device should be inited before threshold_init_device to
* initialize mce_device, otherwise a NULL ptr dereference will cause panic.
*
* so we use following _initcalls
* 1. device_initcall(xen_late_init_mcelog);
* 2. device_initcall_sync(mcheck_init_device);
* 3. late_initcall(threshold_init_device);
*
* when running under xen, the initcall order is 1,2,3;
* on baremetal, we skip 1 and we do only 2 and 3.
*/
late_initcall(threshold_init_device);

View file

@ -0,0 +1,391 @@
/*
* Intel specific MCE features.
* Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
* Copyright (C) 2008, 2009 Intel Corporation
* Author: Andi Kleen
*/
#include <linux/gfp.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include "mce-internal.h"
/*
* Support for Intel Correct Machine Check Interrupts. This allows
* the CPU to raise an interrupt when a corrected machine check happened.
* Normally we pick those up using a regular polling timer.
* Also supports reliable discovery of shared banks.
*/
/*
* CMCI can be delivered to multiple cpus that share a machine check bank
* so we need to designate a single cpu to process errors logged in each bank
* in the interrupt handler (otherwise we would have many races and potential
* double reporting of the same error).
* Note that this can change when a cpu is offlined or brought online since
* some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
* disables CMCI on all banks owned by the cpu and clears this bitfield. At
* this point, cmci_rediscover() kicks in and a different cpu may end up
* taking ownership of some of the shared MCA banks that were previously
* owned by the offlined cpu.
*/
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
#define CMCI_STORM_INTERVAL (1 * HZ)
#define CMCI_STORM_THRESHOLD 15
static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
enum {
CMCI_STORM_NONE,
CMCI_STORM_ACTIVE,
CMCI_STORM_SUBSIDED,
};
static atomic_t cmci_storm_on_cpus;
static int cmci_supported(int *banks)
{
u64 cap;
if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
return 0;
/*
* Vendor check is not strictly needed, but the initial
* initialization is vendor keyed and this
* makes sure none of the backdoors are entered otherwise.
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
if (!cpu_has_apic || lapic_get_maxlvt() < 6)
return 0;
rdmsrl(MSR_IA32_MCG_CAP, cap);
*banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
return !!(cap & MCG_CMCI_P);
}
void mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
return;
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
}
void mce_intel_hcpu_update(unsigned long cpu)
{
if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
atomic_dec(&cmci_storm_on_cpus);
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}
unsigned long mce_intel_adjust_timer(unsigned long interval)
{
int r;
if (interval < CMCI_POLL_INTERVAL)
return interval;
switch (__this_cpu_read(cmci_storm_state)) {
case CMCI_STORM_ACTIVE:
/*
* We switch back to interrupt mode once the poll timer has
* silenced itself. That means no events recorded and the
* timer interval is back to our poll interval.
*/
__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
r = atomic_sub_return(1, &cmci_storm_on_cpus);
if (r == 0)
pr_notice("CMCI storm subsided: switching to interrupt mode\n");
/* FALLTHROUGH */
case CMCI_STORM_SUBSIDED:
/*
* We wait for all cpus to go back to SUBSIDED
* state. When that happens we switch back to
* interrupt mode.
*/
if (!atomic_read(&cmci_storm_on_cpus)) {
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
cmci_reenable();
cmci_recheck();
}
return CMCI_POLL_INTERVAL;
default:
/*
* We have shiny weather. Let the poll do whatever it
* thinks.
*/
return interval;
}
}
static void cmci_storm_disable_banks(void)
{
unsigned long flags, *owned;
int bank;
u64 val;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
owned = this_cpu_ptr(mce_banks_owned);
for_each_set_bit(bank, owned, MAX_NR_BANKS) {
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static bool cmci_storm_detect(void)
{
unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
unsigned long ts = __this_cpu_read(cmci_time_stamp);
unsigned long now = jiffies;
int r;
if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
return true;
if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
cnt++;
} else {
cnt = 1;
__this_cpu_write(cmci_time_stamp, now);
}
__this_cpu_write(cmci_storm_cnt, cnt);
if (cnt <= CMCI_STORM_THRESHOLD)
return false;
cmci_storm_disable_banks();
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
mce_timer_kick(CMCI_POLL_INTERVAL);
if (r == 1)
pr_notice("CMCI storm detected: switching to poll mode\n");
return true;
}
/*
* The interrupt handler. This is called on every event.
* Just call the poller directly to log any events.
* This could in theory increase the threshold under high load,
* but doesn't for now.
*/
static void intel_threshold_interrupt(void)
{
if (cmci_storm_detect())
return;
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
mce_notify_irq();
}
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
* banks.
*/
static void cmci_discover(int banks)
{
unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
unsigned long flags;
int i;
int bios_wrong_thresh = 0;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
if (test_bit(i, owned))
continue;
/* Skip banks in firmware first mode */
if (test_bit(i, mce_banks_ce_disabled))
continue;
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Already owned by someone else? */
if (val & MCI_CTL2_CMCI_EN) {
clear_bit(i, owned);
__clear_bit(i, this_cpu_ptr(mce_poll_banks));
continue;
}
if (!mca_cfg.bios_cmci_threshold) {
val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
val |= CMCI_THRESHOLD;
} else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
/*
* If bios_cmci_threshold boot option was specified
* but the threshold is zero, we'll try to initialize
* it to 1.
*/
bios_zero_thresh = 1;
val |= CMCI_THRESHOLD;
}
val |= MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & MCI_CTL2_CMCI_EN) {
set_bit(i, owned);
__clear_bit(i, this_cpu_ptr(mce_poll_banks));
/*
* We are able to set thresholds for some banks that
* had a threshold of 0. This means the BIOS has not
* set the thresholds properly or does not work with
* this boot option. Note down now and report later.
*/
if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
(val & MCI_CTL2_CMCI_THRESHOLD_MASK))
bios_wrong_thresh = 1;
} else {
WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
}
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
pr_info_once(
"bios_cmci_threshold: Some banks do not have valid thresholds set\n");
pr_info_once(
"bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
}
}
/*
* Just in case we missed an event during initialization check
* all the CMCI owned banks.
*/
void cmci_recheck(void)
{
unsigned long flags;
int banks;
if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
return;
local_irq_save(flags);
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
local_irq_restore(flags);
}
/* Caller must hold the lock on cmci_discover_lock */
static void __cmci_disable_bank(int bank)
{
u64 val;
if (!test_bit(bank, this_cpu_ptr(mce_banks_owned)))
return;
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
__clear_bit(bank, this_cpu_ptr(mce_banks_owned));
}
/*
* Disable CMCI on this CPU for all banks it owns when it goes down.
* This allows other CPUs to claim the banks on rediscovery.
*/
void cmci_clear(void)
{
unsigned long flags;
int i;
int banks;
if (!cmci_supported(&banks))
return;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++)
__cmci_disable_bank(i);
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void cmci_rediscover_work_func(void *arg)
{
int banks;
/* Recheck banks in case CPUs don't all have the same */
if (cmci_supported(&banks))
cmci_discover(banks);
}
/* After a CPU went down cycle through all the others and rediscover */
void cmci_rediscover(void)
{
int banks;
if (!cmci_supported(&banks))
return;
on_each_cpu(cmci_rediscover_work_func, NULL, 1);
}
/*
* Reenable CMCI on this CPU in case a CPU down failed.
*/
void cmci_reenable(void)
{
int banks;
if (cmci_supported(&banks))
cmci_discover(banks);
}
void cmci_disable_bank(int bank)
{
int banks;
unsigned long flags;
if (!cmci_supported(&banks))
return;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
__cmci_disable_bank(bank);
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void intel_init_cmci(void)
{
int banks;
if (!cmci_supported(&banks))
return;
mce_threshold_vector = intel_threshold_interrupt;
cmci_discover(banks);
/*
* For CPU #0 this runs with still disabled APIC, but that's
* ok because only the vector is set up. We still do another
* check for the banks later for CPU #0 just to make sure
* to not miss any events.
*/
apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
cmci_recheck();
}
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
}

View file

@ -0,0 +1,66 @@
/*
* P5 specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>
/* By default disabled */
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code)
{
u32 loaddr, hi, lotype;
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
printk(KERN_EMERG
"CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
smp_processor_id(), loaddr, lotype);
if (lotype & (1<<5)) {
printk(KERN_EMERG
"CPU#%d: Possible thermal failure (CPU on fire ?).\n",
smp_processor_id());
}
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
}
/* Set up machine check reporting for processors with Intel style MCE: */
void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
/* Default P5 to off as its often misconnected: */
if (!mce_p5_enabled)
return;
/* Check for MCE support: */
if (!cpu_has(c, X86_FEATURE_MCE))
return;
machine_check_vector = pentium_machine_check;
/* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
/* Read registers before enabling: */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
printk(KERN_INFO
"Intel old style machine check architecture supported.\n");
/* Enable MCE: */
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO
"Intel old style machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
}

View file

@ -0,0 +1,573 @@
/*
* Thermal throttle event support code (such as syslog messaging and rate
* limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
*
* This allows consistent reporting of CPU thermal throttle events.
*
* Maintains a counter in /sys that keeps track of the number of thermal
* events, such that the user knows how bad the thermal problem might be
* (since the logging to syslog and mcelog is rate limited).
*
* Author: Dmitriy Zavin (dmitriyz@google.com)
*
* Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
* Inspired by Ross Biro's and Al Borchers' counter code.
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/trace/irq_vectors.h>
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
#define THERMAL_THROTTLING_EVENT 0
#define POWER_LIMIT_EVENT 1
/*
* Current thermal event state:
*/
struct _thermal_state {
bool new_event;
int event;
u64 next_check;
unsigned long count;
unsigned long last_count;
};
struct thermal_state {
struct _thermal_state core_throttle;
struct _thermal_state core_power_limit;
struct _thermal_state package_throttle;
struct _thermal_state package_power_limit;
struct _thermal_state core_thresh0;
struct _thermal_state core_thresh1;
struct _thermal_state pkg_thresh0;
struct _thermal_state pkg_thresh1;
};
/* Callback to handle core threshold interrupts */
int (*platform_thermal_notify)(__u64 msr_val);
EXPORT_SYMBOL(platform_thermal_notify);
/* Callback to handle core package threshold_interrupts */
int (*platform_thermal_package_notify)(__u64 msr_val);
EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
/* Callback support of rate control, return true, if
* callback has rate control */
bool (*platform_thermal_package_rate_control)(void);
EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
static DEFINE_PER_CPU(struct thermal_state, thermal_state);
static atomic_t therm_throt_en = ATOMIC_INIT(0);
static u32 lvtthmr_init __read_mostly;
#ifdef CONFIG_SYSFS
#define define_therm_throt_device_one_ro(_name) \
static DEVICE_ATTR(_name, 0444, \
therm_throt_device_show_##_name, \
NULL) \
#define define_therm_throt_device_show_func(event, name) \
\
static ssize_t therm_throt_device_show_##event##_##name( \
struct device *dev, \
struct device_attribute *attr, \
char *buf) \
{ \
unsigned int cpu = dev->id; \
ssize_t ret; \
\
preempt_disable(); /* CPU hotplug */ \
if (cpu_online(cpu)) { \
ret = sprintf(buf, "%lu\n", \
per_cpu(thermal_state, cpu).event.name); \
} else \
ret = 0; \
preempt_enable(); \
\
return ret; \
}
define_therm_throt_device_show_func(core_throttle, count);
define_therm_throt_device_one_ro(core_throttle_count);
define_therm_throt_device_show_func(core_power_limit, count);
define_therm_throt_device_one_ro(core_power_limit_count);
define_therm_throt_device_show_func(package_throttle, count);
define_therm_throt_device_one_ro(package_throttle_count);
define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
static struct attribute *thermal_throttle_attrs[] = {
&dev_attr_core_throttle_count.attr,
NULL
};
static struct attribute_group thermal_attr_group = {
.attrs = thermal_throttle_attrs,
.name = "thermal_throttle"
};
#endif /* CONFIG_SYSFS */
#define CORE_LEVEL 0
#define PACKAGE_LEVEL 1
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
* thermal interrupt normally gets called both when the thermal
* event begins and once the event has ended.
*
* This function is called by the thermal interrupt after the
* IRQ has been acknowledged.
*
* It will take care of rate limiting and printing messages to the syslog.
*
* Returns: 0 : Event should NOT be further logged, i.e. still in
* "timeout" from previous log message.
* 1 : Event should be logged further, and a message has been
* printed to the syslog.
*/
static int therm_throt_process(bool new_event, int event, int level)
{
struct _thermal_state *state;
unsigned int this_cpu = smp_processor_id();
bool old_event;
u64 now;
struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
now = get_jiffies_64();
if (level == CORE_LEVEL) {
if (event == THERMAL_THROTTLING_EVENT)
state = &pstate->core_throttle;
else if (event == POWER_LIMIT_EVENT)
state = &pstate->core_power_limit;
else
return 0;
} else if (level == PACKAGE_LEVEL) {
if (event == THERMAL_THROTTLING_EVENT)
state = &pstate->package_throttle;
else if (event == POWER_LIMIT_EVENT)
state = &pstate->package_power_limit;
else
return 0;
} else
return 0;
old_event = state->new_event;
state->new_event = new_event;
if (new_event)
state->count++;
if (time_before64(now, state->next_check) &&
state->count != state->last_count)
return 0;
state->next_check = now + CHECK_INTERVAL;
state->last_count = state->count;
/* if we just entered the thermal event */
if (new_event) {
if (event == THERMAL_THROTTLING_EVENT)
printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
this_cpu,
level == CORE_LEVEL ? "Core" : "Package",
state->count);
return 1;
}
if (old_event) {
if (event == THERMAL_THROTTLING_EVENT)
printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
this_cpu,
level == CORE_LEVEL ? "Core" : "Package");
return 1;
}
return 0;
}
static int thresh_event_valid(int level, int event)
{
struct _thermal_state *state;
unsigned int this_cpu = smp_processor_id();
struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
u64 now = get_jiffies_64();
if (level == PACKAGE_LEVEL)
state = (event == 0) ? &pstate->pkg_thresh0 :
&pstate->pkg_thresh1;
else
state = (event == 0) ? &pstate->core_thresh0 :
&pstate->core_thresh1;
if (time_before64(now, state->next_check))
return 0;
state->next_check = now + CHECK_INTERVAL;
return 1;
}
static bool int_pln_enable;
static int __init int_pln_enable_setup(char *s)
{
int_pln_enable = true;
return 1;
}
__setup("int_pln_enable", int_pln_enable_setup);
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */
static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
{
int err;
struct cpuinfo_x86 *c = &cpu_data(cpu);
err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
if (err)
return err;
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PTS)) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
}
return err;
}
static void thermal_throttle_remove_dev(struct device *dev)
{
sysfs_remove_group(&dev->kobj, &thermal_attr_group);
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
thermal_throttle_cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct device *dev;
int err = 0;
dev = get_cpu_device(cpu);
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
err = thermal_throttle_add_dev(dev, cpu);
WARN_ON(err);
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
thermal_throttle_remove_dev(dev);
break;
}
return notifier_from_errno(err);
}
static struct notifier_block thermal_throttle_cpu_notifier =
{
.notifier_call = thermal_throttle_cpu_callback,
};
static __init int thermal_throttle_init_device(void)
{
unsigned int cpu = 0;
int err;
if (!atomic_read(&therm_throt_en))
return 0;
cpu_notifier_register_begin();
/* connect live CPUs to sysfs */
for_each_online_cpu(cpu) {
err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
WARN_ON(err);
}
__register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
cpu_notifier_register_done();
return 0;
}
device_initcall(thermal_throttle_init_device);
#endif /* CONFIG_SYSFS */
static void notify_package_thresholds(__u64 msr_val)
{
bool notify_thres_0 = false;
bool notify_thres_1 = false;
if (!platform_thermal_package_notify)
return;
/* lower threshold check */
if (msr_val & THERM_LOG_THRESHOLD0)
notify_thres_0 = true;
/* higher threshold check */
if (msr_val & THERM_LOG_THRESHOLD1)
notify_thres_1 = true;
if (!notify_thres_0 && !notify_thres_1)
return;
if (platform_thermal_package_rate_control &&
platform_thermal_package_rate_control()) {
/* Rate control is implemented in callback */
platform_thermal_package_notify(msr_val);
return;
}
/* lower threshold reached */
if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
platform_thermal_package_notify(msr_val);
/* higher threshold reached */
if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
platform_thermal_package_notify(msr_val);
}
static void notify_thresholds(__u64 msr_val)
{
/* check whether the interrupt handler is defined;
* otherwise simply return
*/
if (!platform_thermal_notify)
return;
/* lower threshold reached */
if ((msr_val & THERM_LOG_THRESHOLD0) &&
thresh_event_valid(CORE_LEVEL, 0))
platform_thermal_notify(msr_val);
/* higher threshold reached */
if ((msr_val & THERM_LOG_THRESHOLD1) &&
thresh_event_valid(CORE_LEVEL, 1))
platform_thermal_notify(msr_val);
}
/* Thermal transition interrupt handler */
static void intel_thermal_interrupt(void)
{
__u64 msr_val;
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
/* Check for violation of core thermal thresholds*/
notify_thresholds(msr_val);
if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
CORE_LEVEL) != 0)
mce_log_therm_throt_event(msr_val);
if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
CORE_LEVEL);
if (this_cpu_has(X86_FEATURE_PTS)) {
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
/* check violations of package thermal thresholds */
notify_package_thresholds(msr_val);
therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
PACKAGE_LEVEL);
if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
therm_throt_process(msr_val &
PACKAGE_THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
PACKAGE_LEVEL);
}
}
static void unexpected_thermal_interrupt(void)
{
printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
smp_processor_id());
}
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
static inline void __smp_thermal_interrupt(void)
{
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
}
asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
{
entering_irq();
__smp_thermal_interrupt();
exiting_ack_irq();
}
asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
{
entering_irq();
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
__smp_thermal_interrupt();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
exiting_ack_irq();
}
/* Thermal monitoring depends on APIC, ACPI and clock modulation */
static int intel_thermal_supported(struct cpuinfo_x86 *c)
{
if (!cpu_has_apic)
return 0;
if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
return 0;
return 1;
}
void __init mcheck_intel_therm_init(void)
{
/*
* This function is only called on boot CPU. Save the init thermal
* LVT value on BSP and use that value to restore APs' thermal LVT
* entry BIOS programmed later
*/
if (intel_thermal_supported(&boot_cpu_data))
lvtthmr_init = apic_read(APIC_LVTTHMR);
}
void intel_init_thermal(struct cpuinfo_x86 *c)
{
unsigned int cpu = smp_processor_id();
int tm2 = 0;
u32 l, h;
if (!intel_thermal_supported(c))
return;
/*
* First check if its enabled already, in which case there might
* be some SMM goo which handles it, so we can't even put a handler
* since it might be delivered via SMI already:
*/
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
h = lvtthmr_init;
/*
* The initial value of thermal LVT entries on all APs always reads
* 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
* sequence to them and LVT registers are reset to 0s except for
* the mask bits which are set to 1s when APs receive INIT IPI.
* If BIOS takes over the thermal interrupt and sets its interrupt
* delivery mode to SMI (not fixed), it restores the value that the
* BIOS has programmed on AP based on BSP's info we saved since BIOS
* is always setting the same value for all threads/cores.
*/
if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
apic_write(APIC_LVTTHMR, lvtthmr_init);
if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
if (system_state == SYSTEM_BOOTING)
printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu);
return;
}
/* Check whether a vector already exists */
if (h & APIC_VECTOR_MASK) {
printk(KERN_DEBUG
"CPU%d: Thermal LVT vector (%#x) already installed\n",
cpu, (h & APIC_VECTOR_MASK));
return;
}
/* early Pentium M models use different method for enabling TM2 */
if (cpu_has(c, X86_FEATURE_TM2)) {
if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
rdmsr(MSR_THERM2_CTL, l, h);
if (l & MSR_THERM2_CTL_TM_SELECT)
tm2 = 1;
} else if (l & MSR_IA32_MISC_ENABLE_TM2)
tm2 = 1;
}
/* We'll mask the thermal vector in the lapic till we're ready: */
h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
apic_write(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
wrmsr(MSR_IA32_THERM_INTERRUPT,
(l | (THERM_INT_LOW_ENABLE
| THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
wrmsr(MSR_IA32_THERM_INTERRUPT,
l | (THERM_INT_LOW_ENABLE
| THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
else
wrmsr(MSR_IA32_THERM_INTERRUPT,
l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
if (cpu_has(c, X86_FEATURE_PTS)) {
rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
(l | (PACKAGE_THERM_INT_LOW_ENABLE
| PACKAGE_THERM_INT_HIGH_ENABLE))
& ~PACKAGE_THERM_INT_PLN_ENABLE, h);
else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
l | (PACKAGE_THERM_INT_LOW_ENABLE
| PACKAGE_THERM_INT_HIGH_ENABLE
| PACKAGE_THERM_INT_PLN_ENABLE), h);
else
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
l | (PACKAGE_THERM_INT_LOW_ENABLE
| PACKAGE_THERM_INT_HIGH_ENABLE), h);
}
smp_thermal_vector = intel_thermal_interrupt;
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
/* Unmask the thermal vector: */
l = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
tm2 ? "TM2" : "TM1");
/* enable thermal throttle processing */
atomic_set(&therm_throt_en, 1);
}

View file

@ -0,0 +1,41 @@
/*
* Common corrected MCE threshold handler code:
*/
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <asm/irq_vectors.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
static void default_threshold_interrupt(void)
{
printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
THRESHOLD_APIC_VECTOR);
}
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
static inline void __smp_threshold_interrupt(void)
{
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
}
asmlinkage __visible void smp_threshold_interrupt(void)
{
entering_irq();
__smp_threshold_interrupt();
exiting_ack_irq();
}
asmlinkage __visible void smp_trace_threshold_interrupt(void)
{
entering_irq();
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
__smp_threshold_interrupt();
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
exiting_ack_irq();
}

View file

@ -0,0 +1,38 @@
/*
* IDT Winchip specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>
/* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
}
/* Set up machine check reporting on the Winchip C6 series */
void winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
machine_check_vector = winchip_machine_check;
/* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
rdmsr(MSR_IDT_FCR1, lo, hi);
lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO
"Winchip machine check reporting enabled on CPU#0.\n");
}

View file

@ -0,0 +1,7 @@
microcode-y := core.o
obj-$(CONFIG_MICROCODE) += microcode.o
microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
microcode-$(CONFIG_MICROCODE_AMD) += amd.o
obj-$(CONFIG_MICROCODE_EARLY) += core_early.o
obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o
obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o

View file

@ -0,0 +1,492 @@
/*
* AMD CPU Microcode Update Driver for Linux
* Copyright (C) 2008-2011 Advanced Micro Devices Inc.
*
* Author: Peter Oruba <peter.oruba@amd.com>
*
* Based on work by:
* Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
*
* Maintainers:
* Andreas Herrmann <herrmann.der.user@googlemail.com>
* Borislav Petkov <bp@alien8.de>
*
* This driver allows to upgrade microcode on F10h AMD
* CPUs and later.
*
* Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/firmware.h>
#include <linux/pci_ids.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/microcode_amd.h>
MODULE_DESCRIPTION("AMD Microcode Update Driver");
MODULE_AUTHOR("Peter Oruba");
MODULE_LICENSE("GPL v2");
static struct equiv_cpu_entry *equiv_cpu_table;
struct ucode_patch {
struct list_head plist;
void *data;
u32 patch_id;
u16 equiv_cpu;
};
static LIST_HEAD(pcache);
static u16 __find_equiv_id(unsigned int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig);
}
static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu)
{
int i = 0;
BUG_ON(!equiv_cpu_table);
while (equiv_cpu_table[i].equiv_cpu != 0) {
if (equiv_cpu == equiv_cpu_table[i].equiv_cpu)
return equiv_cpu_table[i].installed_cpu;
i++;
}
return 0;
}
/*
* a small, trivial cache of per-family ucode patches
*/
static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
{
struct ucode_patch *p;
list_for_each_entry(p, &pcache, plist)
if (p->equiv_cpu == equiv_cpu)
return p;
return NULL;
}
static void update_cache(struct ucode_patch *new_patch)
{
struct ucode_patch *p;
list_for_each_entry(p, &pcache, plist) {
if (p->equiv_cpu == new_patch->equiv_cpu) {
if (p->patch_id >= new_patch->patch_id)
/* we already have the latest patch */
return;
list_replace(&p->plist, &new_patch->plist);
kfree(p->data);
kfree(p);
return;
}
}
/* no patch found, add it */
list_add_tail(&new_patch->plist, &pcache);
}
static void free_cache(void)
{
struct ucode_patch *p, *tmp;
list_for_each_entry_safe(p, tmp, &pcache, plist) {
__list_del(p->plist.prev, p->plist.next);
kfree(p->data);
kfree(p);
}
}
static struct ucode_patch *find_patch(unsigned int cpu)
{
u16 equiv_id;
equiv_id = __find_equiv_id(cpu);
if (!equiv_id)
return NULL;
return cache_find_patch(equiv_id);
}
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct ucode_patch *p;
csig->sig = cpuid_eax(0x00000001);
csig->rev = c->microcode;
/*
* a patch could have been loaded early, set uci->mc so that
* mc_bp_resume() can call apply_microcode()
*/
p = find_patch(cpu);
if (p && (p->patch_id == csig->rev))
uci->mc = p->data;
pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
return 0;
}
static unsigned int verify_patch_size(u8 family, u32 patch_size,
unsigned int size)
{
u32 max_size;
#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
#define F15H_MPB_MAX_SIZE 4096
#define F16H_MPB_MAX_SIZE 3458
switch (family) {
case 0x14:
max_size = F14H_MPB_MAX_SIZE;
break;
case 0x15:
max_size = F15H_MPB_MAX_SIZE;
break;
case 0x16:
max_size = F16H_MPB_MAX_SIZE;
break;
default:
max_size = F1XH_MPB_MAX_SIZE;
break;
}
if (patch_size > min_t(u32, size, max_size)) {
pr_err("patch size mismatch\n");
return 0;
}
return patch_size;
}
int __apply_microcode_amd(struct microcode_amd *mc_amd)
{
u32 rev, dummy;
native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
/* verify patch application was successful */
native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
if (rev != mc_amd->hdr.patch_id)
return -1;
return 0;
}
int apply_microcode_amd(int cpu)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct microcode_amd *mc_amd;
struct ucode_cpu_info *uci;
struct ucode_patch *p;
u32 rev, dummy;
BUG_ON(raw_smp_processor_id() != cpu);
uci = ucode_cpu_info + cpu;
p = find_patch(cpu);
if (!p)
return 0;
mc_amd = p->data;
uci->mc = p->data;
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
/* need to apply patch? */
if (rev >= mc_amd->hdr.patch_id) {
c->microcode = rev;
uci->cpu_sig.rev = rev;
return 0;
}
if (__apply_microcode_amd(mc_amd)) {
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
cpu, mc_amd->hdr.patch_id);
return -1;
}
pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
mc_amd->hdr.patch_id);
uci->cpu_sig.rev = mc_amd->hdr.patch_id;
c->microcode = mc_amd->hdr.patch_id;
return 0;
}
static int install_equiv_cpu_table(const u8 *buf)
{
unsigned int *ibuf = (unsigned int *)buf;
unsigned int type = ibuf[1];
unsigned int size = ibuf[2];
if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
pr_err("empty section/"
"invalid type field in container file section header\n");
return -EINVAL;
}
equiv_cpu_table = vmalloc(size);
if (!equiv_cpu_table) {
pr_err("failed to allocate equivalent CPU table\n");
return -ENOMEM;
}
memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
/* add header length */
return size + CONTAINER_HDR_SZ;
}
static void free_equiv_cpu_table(void)
{
vfree(equiv_cpu_table);
equiv_cpu_table = NULL;
}
static void cleanup(void)
{
free_equiv_cpu_table();
free_cache();
}
/*
* We return the current size even if some of the checks failed so that
* we can skip over the next patch. If we return a negative value, we
* signal a grave error like a memory allocation has failed and the
* driver cannot continue functioning normally. In such cases, we tear
* down everything we've used up so far and exit.
*/
static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
{
struct microcode_header_amd *mc_hdr;
struct ucode_patch *patch;
unsigned int patch_size, crnt_size, ret;
u32 proc_fam;
u16 proc_id;
patch_size = *(u32 *)(fw + 4);
crnt_size = patch_size + SECTION_HDR_SIZE;
mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
proc_id = mc_hdr->processor_rev_id;
proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
if (!proc_fam) {
pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
return crnt_size;
}
/* check if patch is for the current family */
proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
if (proc_fam != family)
return crnt_size;
if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
mc_hdr->patch_id);
return crnt_size;
}
ret = verify_patch_size(family, patch_size, leftover);
if (!ret) {
pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
return crnt_size;
}
patch = kzalloc(sizeof(*patch), GFP_KERNEL);
if (!patch) {
pr_err("Patch allocation failure.\n");
return -EINVAL;
}
patch->data = kzalloc(patch_size, GFP_KERNEL);
if (!patch->data) {
pr_err("Patch data allocation failure.\n");
kfree(patch);
return -EINVAL;
}
/* All looks ok, copy patch... */
memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
INIT_LIST_HEAD(&patch->plist);
patch->patch_id = mc_hdr->patch_id;
patch->equiv_cpu = proc_id;
pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
__func__, patch->patch_id, proc_id);
/* ... and add to cache. */
update_cache(patch);
return crnt_size;
}
static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
size_t size)
{
enum ucode_state ret = UCODE_ERROR;
unsigned int leftover;
u8 *fw = (u8 *)data;
int crnt_size = 0;
int offset;
offset = install_equiv_cpu_table(data);
if (offset < 0) {
pr_err("failed to create equivalent cpu table\n");
return ret;
}
fw += offset;
leftover = size - offset;
if (*(u32 *)fw != UCODE_UCODE_TYPE) {
pr_err("invalid type field in container file section header\n");
free_equiv_cpu_table();
return ret;
}
while (leftover) {
crnt_size = verify_and_add_patch(family, fw, leftover);
if (crnt_size < 0)
return ret;
fw += crnt_size;
leftover -= crnt_size;
}
return UCODE_OK;
}
enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
/* free old equiv table */
free_equiv_cpu_table();
ret = __load_microcode_amd(family, data, size);
if (ret != UCODE_OK)
cleanup();
#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32)
/* save BSP's matching patch for early load */
if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
struct ucode_patch *p = find_patch(cpu);
if (p) {
memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
PATCH_MAX_SIZE));
}
}
#endif
return ret;
}
/*
* AMD microcode firmware naming convention, up to family 15h they are in
* the legacy file:
*
* amd-ucode/microcode_amd.bin
*
* This legacy file is always smaller than 2K in size.
*
* Beginning with family 15h, they are in family-specific firmware files:
*
* amd-ucode/microcode_amd_fam15h.bin
* amd-ucode/microcode_amd_fam16h.bin
* ...
*
* These might be larger than 2K.
*/
static enum ucode_state request_microcode_amd(int cpu, struct device *device,
bool refresh_fw)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
/* reload ucode container only on the boot cpu */
if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
return UCODE_OK;
if (c->x86 >= 0x15)
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
pr_debug("failed to load file %s\n", fw_name);
goto out;
}
ret = UCODE_ERROR;
if (*(u32 *)fw->data != UCODE_MAGIC) {
pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
goto fw_release;
}
ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
out:
return ret;
}
static enum ucode_state
request_microcode_user(int cpu, const void __user *buf, size_t size)
{
return UCODE_ERROR;
}
static void microcode_fini_cpu_amd(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
uci->mc = NULL;
}
static struct microcode_ops microcode_amd_ops = {
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_amd,
.collect_cpu_info = collect_cpu_info_amd,
.apply_microcode = apply_microcode_amd,
.microcode_fini_cpu = microcode_fini_cpu_amd,
};
struct microcode_ops * __init init_amd_microcode(void)
{
struct cpuinfo_x86 *c = &cpu_data(0);
if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
return NULL;
}
return &microcode_amd_ops;
}
void __exit exit_amd_microcode(void)
{
cleanup();
}

View file

@ -0,0 +1,422 @@
/*
* Copyright (C) 2013 Advanced Micro Devices, Inc.
*
* Author: Jacob Shin <jacob.shin@amd.com>
* Fixes: Borislav Petkov <bp@suse.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/earlycpio.h>
#include <linux/initrd.h>
#include <asm/cpu.h>
#include <asm/setup.h>
#include <asm/microcode_amd.h>
/*
* This points to the current valid container of microcode patches which we will
* save from the initrd before jettisoning its contents.
*/
static u8 *container;
static size_t container_size;
static u32 ucode_new_rev;
u8 amd_ucode_patch[PATCH_MAX_SIZE];
static u16 this_equiv_id;
static struct cpio_data ucode_cpio;
/*
* Microcode patch container file is prepended to the initrd in cpio format.
* See Documentation/x86/early-microcode.txt
*/
static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
static struct cpio_data __init find_ucode_in_initrd(void)
{
long offset = 0;
char *path;
void *start;
size_t size;
#ifdef CONFIG_X86_32
struct boot_params *p;
/*
* On 32-bit, early load occurs before paging is turned on so we need
* to use physical addresses.
*/
p = (struct boot_params *)__pa_nodebug(&boot_params);
path = (char *)__pa_nodebug(ucode_path);
start = (void *)p->hdr.ramdisk_image;
size = p->hdr.ramdisk_size;
#else
path = ucode_path;
start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
size = boot_params.hdr.ramdisk_size;
#endif
return find_cpio_data(path, start, size, &offset);
}
static size_t compute_container_size(u8 *data, u32 total_size)
{
size_t size = 0;
u32 *header = (u32 *)data;
if (header[0] != UCODE_MAGIC ||
header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
header[2] == 0) /* size */
return size;
size = header[2] + CONTAINER_HDR_SZ;
total_size -= size;
data += size;
while (total_size) {
u16 patch_size;
header = (u32 *)data;
if (header[0] != UCODE_UCODE_TYPE)
break;
/*
* Sanity-check patch size.
*/
patch_size = header[1];
if (patch_size > PATCH_MAX_SIZE)
break;
size += patch_size + SECTION_HDR_SIZE;
data += patch_size + SECTION_HDR_SIZE;
total_size -= patch_size + SECTION_HDR_SIZE;
}
return size;
}
/*
* Early load occurs before we can vmalloc(). So we look for the microcode
* patch container file in initrd, traverse equivalent cpu table, look for a
* matching microcode patch, and update, all in initrd memory in place.
* When vmalloc() is available for use later -- on 64-bit during first AP load,
* and on 32-bit during save_microcode_in_initrd_amd() -- we can call
* load_microcode_amd() to save equivalent cpu table and microcode patches in
* kernel heap memory.
*/
static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
{
struct equiv_cpu_entry *eq;
size_t *cont_sz;
u32 *header;
u8 *data, **cont;
u8 (*patch)[PATCH_MAX_SIZE];
u16 eq_id = 0;
int offset, left;
u32 rev, eax, ebx, ecx, edx;
u32 *new_rev;
#ifdef CONFIG_X86_32
new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
cont_sz = (size_t *)__pa_nodebug(&container_size);
cont = (u8 **)__pa_nodebug(&container);
patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
#else
new_rev = &ucode_new_rev;
cont_sz = &container_size;
cont = &container;
patch = &amd_ucode_patch;
#endif
data = ucode;
left = size;
header = (u32 *)data;
/* find equiv cpu table */
if (header[0] != UCODE_MAGIC ||
header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
header[2] == 0) /* size */
return;
eax = 0x00000001;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
while (left > 0) {
eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
*cont = data;
/* Advance past the container header */
offset = header[2] + CONTAINER_HDR_SZ;
data += offset;
left -= offset;
eq_id = find_equiv_id(eq, eax);
if (eq_id) {
this_equiv_id = eq_id;
*cont_sz = compute_container_size(*cont, left + offset);
/*
* truncate how much we need to iterate over in the
* ucode update loop below
*/
left = *cont_sz - offset;
break;
}
/*
* support multiple container files appended together. if this
* one does not have a matching equivalent cpu entry, we fast
* forward to the next container file.
*/
while (left > 0) {
header = (u32 *)data;
if (header[0] == UCODE_MAGIC &&
header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
break;
offset = header[1] + SECTION_HDR_SIZE;
data += offset;
left -= offset;
}
/* mark where the next microcode container file starts */
offset = data - (u8 *)ucode;
ucode = data;
}
if (!eq_id) {
*cont = NULL;
*cont_sz = 0;
return;
}
/* find ucode and update if needed */
native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
while (left > 0) {
struct microcode_amd *mc;
header = (u32 *)data;
if (header[0] != UCODE_UCODE_TYPE || /* type */
header[1] == 0) /* size */
break;
mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
if (!__apply_microcode_amd(mc)) {
rev = mc->hdr.patch_id;
*new_rev = rev;
if (save_patch)
memcpy(patch, mc,
min_t(u32, header[1], PATCH_MAX_SIZE));
}
}
offset = header[1] + SECTION_HDR_SIZE;
data += offset;
left -= offset;
}
}
void __init load_ucode_amd_bsp(void)
{
struct cpio_data cp;
void **data;
size_t *size;
#ifdef CONFIG_X86_32
data = (void **)__pa_nodebug(&ucode_cpio.data);
size = (size_t *)__pa_nodebug(&ucode_cpio.size);
#else
data = &ucode_cpio.data;
size = &ucode_cpio.size;
#endif
cp = find_ucode_in_initrd();
if (!cp.data)
return;
*data = cp.data;
*size = cp.size;
apply_ucode_in_initrd(cp.data, cp.size, true);
}
#ifdef CONFIG_X86_32
/*
* On 32-bit, since AP's early load occurs before paging is turned on, we
* cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
* cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
* save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
* which is used upon resume from suspend.
*/
void load_ucode_amd_ap(void)
{
struct microcode_amd *mc;
size_t *usize;
void **ucode;
mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
__apply_microcode_amd(mc);
return;
}
ucode = (void *)__pa_nodebug(&container);
usize = (size_t *)__pa_nodebug(&container_size);
if (!*ucode || !*usize)
return;
apply_ucode_in_initrd(*ucode, *usize, false);
}
static void __init collect_cpu_sig_on_bsp(void *arg)
{
unsigned int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
uci->cpu_sig.sig = cpuid_eax(0x00000001);
}
static void __init get_bsp_sig(void)
{
unsigned int bsp = boot_cpu_data.cpu_index;
struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
if (!uci->cpu_sig.sig)
smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
}
#else
void load_ucode_amd_ap(void)
{
unsigned int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct equiv_cpu_entry *eq;
struct microcode_amd *mc;
u32 rev, eax;
u16 eq_id;
/* Exit if called on the BSP. */
if (!cpu)
return;
if (!container)
return;
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
uci->cpu_sig.rev = rev;
uci->cpu_sig.sig = eax;
eax = cpuid_eax(0x00000001);
eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
eq_id = find_equiv_id(eq, eax);
if (!eq_id)
return;
if (eq_id == this_equiv_id) {
mc = (struct microcode_amd *)amd_ucode_patch;
if (mc && rev < mc->hdr.patch_id) {
if (!__apply_microcode_amd(mc))
ucode_new_rev = mc->hdr.patch_id;
}
} else {
if (!ucode_cpio.data)
return;
/*
* AP has a different equivalence ID than BSP, looks like
* mixed-steppings silicon so go through the ucode blob anew.
*/
apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
}
}
#endif
int __init save_microcode_in_initrd_amd(void)
{
unsigned long cont;
int retval = 0;
enum ucode_state ret;
u8 *cont_va;
u32 eax;
if (!container)
return -EINVAL;
#ifdef CONFIG_X86_32
get_bsp_sig();
cont = (unsigned long)container;
cont_va = __va(container);
#else
/*
* We need the physical address of the container for both bitness since
* boot_params.hdr.ramdisk_image is a physical address.
*/
cont = __pa(container);
cont_va = container;
#endif
/*
* Take into account the fact that the ramdisk might get relocated and
* therefore we need to recompute the container's position in virtual
* memory space.
*/
if (relocated_ramdisk)
container = (u8 *)(__va(relocated_ramdisk) +
(cont - boot_params.hdr.ramdisk_image));
else
container = cont_va;
if (ucode_new_rev)
pr_info("microcode: updated early to new patch_level=0x%08x\n",
ucode_new_rev);
eax = cpuid_eax(0x00000001);
eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
if (ret != UCODE_OK)
retval = -EINVAL;
/*
* This will be freed any msec now, stash patches for the current
* family and switch to patch cache for cpu hotplug, etc later.
*/
container = NULL;
container_size = 0;
return retval;
}
void reload_ucode_amd(void)
{
struct microcode_amd *mc;
u32 rev, eax;
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
mc = (struct microcode_amd *)amd_ucode_patch;
if (mc && rev < mc->hdr.patch_id) {
if (!__apply_microcode_amd(mc)) {
ucode_new_rev = mc->hdr.patch_id;
pr_info("microcode: reload patch_level=0x%08x\n",
ucode_new_rev);
}
}
}

View file

@ -0,0 +1,653 @@
/*
* Intel CPU Microcode Update Driver for Linux
*
* Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* This driver allows to upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
* Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
* Software Developer's Manual
* Order Number 253668 or free download from:
*
* http://developer.intel.com/Assets/PDF/manual/253668.pdf
*
* For more information, go to http://www.urbanmyth.org/microcode
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Initial release.
* 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Added read() support + cleanups.
* 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Added 'device trimming' support. open(O_WRONLY) zeroes
* and frees the saved copy of applied microcode.
* 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Made to use devfs (/dev/cpu/microcode) + cleanups.
* 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
* Added misc device support (now uses both devfs and misc).
* Added MICROCODE_IOCFREE ioctl to clear memory.
* 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
* Messages for error cases (non Intel & no suitable microcode).
* 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
* Removed ->release(). Removed exclusive open and status bitmap.
* Added microcode_rwsem to serialize read()/write()/ioctl().
* Removed global kernel lock usage.
* 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
* Write 0 to 0x8B msr and then cpuid before reading revision,
* so that it works even if there were no update done by the
* BIOS. Otherwise, reading from 0x8B gives junk (which happened
* to be 0 on my machine which is why it worked even when I
* disabled update by the BIOS)
* Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
* 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
* Tigran Aivazian <tigran@veritas.com>
* Intel Pentium 4 processor support and bugfixes.
* 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
* Bugfix for HT (Hyper-Threading) enabled processors
* whereby processor resources are shared by all logical processors
* in a single CPU package.
* 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
* Tigran Aivazian <tigran@veritas.com>,
* Serialize updates as required on HT processors due to
* speculative nature of implementation.
* 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
* Fix the panic when writing zero-length microcode chunk.
* 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
* Jun Nakajima <jun.nakajima@intel.com>
* Support for the microcode updates in the new format.
* 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
* Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
* because we no longer hold a copy of applied microcode
* in kernel memory.
* 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
* Fix sigmatch() macro to handle old CPUs with pf == 0.
* Thanks to Stuart Swales for pointing out this bug.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/platform_device.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/cpu.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/syscore_ops.h>
#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
MODULE_LICENSE("GPL");
#define MICROCODE_VERSION "2.00"
static struct microcode_ops *microcode_ops;
bool dis_ucode_ldr;
module_param(dis_ucode_ldr, bool, 0);
/*
* Synchronization.
*
* All non cpu-hotplug-callback call sites use:
*
* - microcode_mutex to synchronize with each other;
* - get/put_online_cpus() to synchronize with
* the cpu-hotplug-callback call sites.
*
* We guarantee that only a single cpu is being
* updated at any particular moment of time.
*/
static DEFINE_MUTEX(microcode_mutex);
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
EXPORT_SYMBOL_GPL(ucode_cpu_info);
/*
* Operations that are run on a target cpu:
*/
struct cpu_info_ctx {
struct cpu_signature *cpu_sig;
int err;
};
static void collect_cpu_info_local(void *arg)
{
struct cpu_info_ctx *ctx = arg;
ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
ctx->cpu_sig);
}
static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
{
struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
int ret;
ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
if (!ret)
ret = ctx.err;
return ret;
}
static int collect_cpu_info(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
int ret;
memset(uci, 0, sizeof(*uci));
ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
if (!ret)
uci->valid = 1;
return ret;
}
struct apply_microcode_ctx {
int err;
};
static void apply_microcode_local(void *arg)
{
struct apply_microcode_ctx *ctx = arg;
ctx->err = microcode_ops->apply_microcode(smp_processor_id());
}
static int apply_microcode_on_target(int cpu)
{
struct apply_microcode_ctx ctx = { .err = 0 };
int ret;
ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
if (!ret)
ret = ctx.err;
return ret;
}
#ifdef CONFIG_MICROCODE_OLD_INTERFACE
static int do_microcode_update(const void __user *buf, size_t size)
{
int error = 0;
int cpu;
for_each_online_cpu(cpu) {
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
enum ucode_state ustate;
if (!uci->valid)
continue;
ustate = microcode_ops->request_microcode_user(cpu, buf, size);
if (ustate == UCODE_ERROR) {
error = -1;
break;
} else if (ustate == UCODE_OK)
apply_microcode_on_target(cpu);
}
return error;
}
static int microcode_open(struct inode *inode, struct file *file)
{
return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
}
static ssize_t microcode_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
ssize_t ret = -EINVAL;
if ((len >> PAGE_SHIFT) > totalram_pages) {
pr_err("too much data (max %ld pages)\n", totalram_pages);
return ret;
}
get_online_cpus();
mutex_lock(&microcode_mutex);
if (do_microcode_update(buf, len) == 0)
ret = (ssize_t)len;
if (ret > 0)
perf_check_microcode();
mutex_unlock(&microcode_mutex);
put_online_cpus();
return ret;
}
static const struct file_operations microcode_fops = {
.owner = THIS_MODULE,
.write = microcode_write,
.open = microcode_open,
.llseek = no_llseek,
};
static struct miscdevice microcode_dev = {
.minor = MICROCODE_MINOR,
.name = "microcode",
.nodename = "cpu/microcode",
.fops = &microcode_fops,
};
static int __init microcode_dev_init(void)
{
int error;
error = misc_register(&microcode_dev);
if (error) {
pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
return error;
}
return 0;
}
static void __exit microcode_dev_exit(void)
{
misc_deregister(&microcode_dev);
}
MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
MODULE_ALIAS("devname:cpu/microcode");
#else
#define microcode_dev_init() 0
#define microcode_dev_exit() do { } while (0)
#endif
/* fake device for request_firmware */
static struct platform_device *microcode_pdev;
static int reload_for_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
enum ucode_state ustate;
int err = 0;
if (!uci->valid)
return err;
ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true);
if (ustate == UCODE_OK)
apply_microcode_on_target(cpu);
else
if (ustate == UCODE_ERROR)
err = -EINVAL;
return err;
}
static ssize_t reload_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t size)
{
unsigned long val;
int cpu;
ssize_t ret = 0, tmp_ret;
ret = kstrtoul(buf, 0, &val);
if (ret)
return ret;
if (val != 1)
return size;
get_online_cpus();
mutex_lock(&microcode_mutex);
for_each_online_cpu(cpu) {
tmp_ret = reload_for_cpu(cpu);
if (tmp_ret != 0)
pr_warn("Error reloading microcode on CPU %d\n", cpu);
/* save retval of the first encountered reload error */
if (!ret)
ret = tmp_ret;
}
if (!ret)
perf_check_microcode();
mutex_unlock(&microcode_mutex);
put_online_cpus();
if (!ret)
ret = size;
return ret;
}
static ssize_t version_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
}
static ssize_t pf_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
}
static DEVICE_ATTR(reload, 0200, NULL, reload_store);
static DEVICE_ATTR(version, 0400, version_show, NULL);
static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
static struct attribute *mc_default_attrs[] = {
&dev_attr_version.attr,
&dev_attr_processor_flags.attr,
NULL
};
static struct attribute_group mc_attr_group = {
.attrs = mc_default_attrs,
.name = "microcode",
};
static void microcode_fini_cpu(int cpu)
{
microcode_ops->microcode_fini_cpu(cpu);
}
static enum ucode_state microcode_resume_cpu(int cpu)
{
pr_debug("CPU%d updated upon resume\n", cpu);
if (apply_microcode_on_target(cpu))
return UCODE_ERROR;
return UCODE_OK;
}
static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
{
enum ucode_state ustate;
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (uci && uci->valid)
return UCODE_OK;
if (collect_cpu_info(cpu))
return UCODE_ERROR;
/* --dimm. Trigger a delayed update? */
if (system_state != SYSTEM_RUNNING)
return UCODE_NFOUND;
ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev,
refresh_fw);
if (ustate == UCODE_OK) {
pr_debug("CPU%d updated upon init\n", cpu);
apply_microcode_on_target(cpu);
}
return ustate;
}
static enum ucode_state microcode_update_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (uci->valid)
return microcode_resume_cpu(cpu);
return microcode_init_cpu(cpu, false);
}
static int mc_device_add(struct device *dev, struct subsys_interface *sif)
{
int err, cpu = dev->id;
if (!cpu_online(cpu))
return 0;
pr_debug("CPU%d added\n", cpu);
err = sysfs_create_group(&dev->kobj, &mc_attr_group);
if (err)
return err;
if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
return -EINVAL;
return err;
}
static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
{
int cpu = dev->id;
if (!cpu_online(cpu))
return 0;
pr_debug("CPU%d removed\n", cpu);
microcode_fini_cpu(cpu);
sysfs_remove_group(&dev->kobj, &mc_attr_group);
return 0;
}
static struct subsys_interface mc_cpu_interface = {
.name = "microcode",
.subsys = &cpu_subsys,
.add_dev = mc_device_add,
.remove_dev = mc_device_remove,
};
/**
* mc_bp_resume - Update boot CPU microcode during resume.
*/
static void mc_bp_resume(void)
{
int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (uci->valid && uci->mc)
microcode_ops->apply_microcode(cpu);
else if (!uci->mc)
reload_early_microcode();
}
static struct syscore_ops mc_syscore_ops = {
.resume = mc_bp_resume,
};
static int
mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct device *dev;
dev = get_cpu_device(cpu);
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
microcode_update_cpu(cpu);
pr_debug("CPU%d added\n", cpu);
/*
* "break" is missing on purpose here because we want to fall
* through in order to create the sysfs group.
*/
case CPU_DOWN_FAILED:
if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
break;
case CPU_DOWN_PREPARE:
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
pr_debug("CPU%d removed\n", cpu);
break;
/*
* case CPU_DEAD:
*
* When a CPU goes offline, don't free up or invalidate the copy of
* the microcode in kernel memory, so that we can reuse it when the
* CPU comes back online without unnecessarily requesting the userspace
* for it again.
*/
}
/* The CPU refused to come up during a system resume */
if (action == CPU_UP_CANCELED_FROZEN)
microcode_fini_cpu(cpu);
return NOTIFY_OK;
}
static struct notifier_block __refdata mc_cpu_notifier = {
.notifier_call = mc_cpu_callback,
};
#ifdef MODULE
/* Autoload on Intel and AMD systems */
static const struct x86_cpu_id __initconst microcode_id[] = {
#ifdef CONFIG_MICROCODE_INTEL
{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
#endif
#ifdef CONFIG_MICROCODE_AMD
{ X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
#endif
{}
};
MODULE_DEVICE_TABLE(x86cpu, microcode_id);
#endif
static struct attribute *cpu_root_microcode_attrs[] = {
&dev_attr_reload.attr,
NULL
};
static struct attribute_group cpu_root_microcode_group = {
.name = "microcode",
.attrs = cpu_root_microcode_attrs,
};
static int __init microcode_init(void)
{
struct cpuinfo_x86 *c = &cpu_data(0);
int error;
if (paravirt_enabled() || dis_ucode_ldr)
return -EINVAL;
if (c->x86_vendor == X86_VENDOR_INTEL)
microcode_ops = init_intel_microcode();
else if (c->x86_vendor == X86_VENDOR_AMD)
microcode_ops = init_amd_microcode();
else
pr_err("no support for this CPU vendor\n");
if (!microcode_ops)
return -ENODEV;
microcode_pdev = platform_device_register_simple("microcode", -1,
NULL, 0);
if (IS_ERR(microcode_pdev))
return PTR_ERR(microcode_pdev);
get_online_cpus();
mutex_lock(&microcode_mutex);
error = subsys_interface_register(&mc_cpu_interface);
if (!error)
perf_check_microcode();
mutex_unlock(&microcode_mutex);
put_online_cpus();
if (error)
goto out_pdev;
error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
&cpu_root_microcode_group);
if (error) {
pr_err("Error creating microcode group!\n");
goto out_driver;
}
error = microcode_dev_init();
if (error)
goto out_ucode_group;
register_syscore_ops(&mc_syscore_ops);
register_hotcpu_notifier(&mc_cpu_notifier);
pr_info("Microcode Update Driver: v" MICROCODE_VERSION
" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
return 0;
out_ucode_group:
sysfs_remove_group(&cpu_subsys.dev_root->kobj,
&cpu_root_microcode_group);
out_driver:
get_online_cpus();
mutex_lock(&microcode_mutex);
subsys_interface_unregister(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
out_pdev:
platform_device_unregister(microcode_pdev);
return error;
}
module_init(microcode_init);
static void __exit microcode_exit(void)
{
struct cpuinfo_x86 *c = &cpu_data(0);
microcode_dev_exit();
unregister_hotcpu_notifier(&mc_cpu_notifier);
unregister_syscore_ops(&mc_syscore_ops);
sysfs_remove_group(&cpu_subsys.dev_root->kobj,
&cpu_root_microcode_group);
get_online_cpus();
mutex_lock(&microcode_mutex);
subsys_interface_unregister(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
platform_device_unregister(microcode_pdev);
microcode_ops = NULL;
if (c->x86_vendor == X86_VENDOR_AMD)
exit_amd_microcode();
pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
}
module_exit(microcode_exit);

View file

@ -0,0 +1,199 @@
/*
* X86 CPU microcode early update for Linux
*
* Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
* H Peter Anvin" <hpa@zytor.com>
*
* This driver allows to early upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
* Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
* Software Developer's Manual.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <asm/microcode.h>
#include <asm/microcode_intel.h>
#include <asm/microcode_amd.h>
#include <asm/processor.h>
#include <asm/cmdline.h>
#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
#define CPUID_IS(a, b, c, ebx, ecx, edx) \
(!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
/*
* In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
* x86_vendor() gets vendor id for BSP.
*
* In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
* coding, we still use x86_vendor() to get vendor id for AP.
*
* x86_vendor() gets vendor information directly through cpuid.
*/
static int x86_vendor(void)
{
u32 eax = 0x00000000;
u32 ebx, ecx = 0, edx;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
return X86_VENDOR_INTEL;
if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
return X86_VENDOR_AMD;
return X86_VENDOR_UNKNOWN;
}
static int x86_family(void)
{
u32 eax = 0x00000001;
u32 ebx, ecx = 0, edx;
int x86;
native_cpuid(&eax, &ebx, &ecx, &edx);
x86 = (eax >> 8) & 0xf;
if (x86 == 15)
x86 += (eax >> 20) & 0xff;
return x86;
}
static bool __init check_loader_disabled_bsp(void)
{
#ifdef CONFIG_X86_32
const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
const char *opt = "dis_ucode_ldr";
const char *option = (const char *)__pa_nodebug(opt);
bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
#else /* CONFIG_X86_64 */
const char *cmdline = boot_command_line;
const char *option = "dis_ucode_ldr";
bool *res = &dis_ucode_ldr;
#endif
if (cmdline_find_option_bool(cmdline, option))
*res = true;
return *res;
}
void __init load_ucode_bsp(void)
{
int vendor, x86;
if (check_loader_disabled_bsp())
return;
if (!have_cpuid_p())
return;
vendor = x86_vendor();
x86 = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
if (x86 >= 6)
load_ucode_intel_bsp();
break;
case X86_VENDOR_AMD:
if (x86 >= 0x10)
load_ucode_amd_bsp();
break;
default:
break;
}
}
static bool check_loader_disabled_ap(void)
{
#ifdef CONFIG_X86_32
return *((bool *)__pa_nodebug(&dis_ucode_ldr));
#else
return dis_ucode_ldr;
#endif
}
void load_ucode_ap(void)
{
int vendor, x86;
if (check_loader_disabled_ap())
return;
if (!have_cpuid_p())
return;
vendor = x86_vendor();
x86 = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
if (x86 >= 6)
load_ucode_intel_ap();
break;
case X86_VENDOR_AMD:
if (x86 >= 0x10)
load_ucode_amd_ap();
break;
default:
break;
}
}
int __init save_microcode_in_initrd(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
if (c->x86 >= 6)
save_microcode_in_initrd_intel();
break;
case X86_VENDOR_AMD:
if (c->x86 >= 0x10)
save_microcode_in_initrd_amd();
break;
default:
break;
}
return 0;
}
void reload_early_microcode(void)
{
int vendor, x86;
vendor = x86_vendor();
x86 = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
if (x86 >= 6)
reload_ucode_intel();
break;
case X86_VENDOR_AMD:
if (x86 >= 0x10)
reload_ucode_amd();
break;
default:
break;
}
}

View file

@ -0,0 +1,333 @@
/*
* Intel CPU Microcode Update Driver for Linux
*
* Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* This driver allows to upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
* Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
* Software Developer's Manual
* Order Number 253668 or free download from:
*
* http://developer.intel.com/Assets/PDF/manual/253668.pdf
*
* For more information, go to http://www.urbanmyth.org/microcode
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Initial release.
* 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Added read() support + cleanups.
* 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Added 'device trimming' support. open(O_WRONLY) zeroes
* and frees the saved copy of applied microcode.
* 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Made to use devfs (/dev/cpu/microcode) + cleanups.
* 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
* Added misc device support (now uses both devfs and misc).
* Added MICROCODE_IOCFREE ioctl to clear memory.
* 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
* Messages for error cases (non Intel & no suitable microcode).
* 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
* Removed ->release(). Removed exclusive open and status bitmap.
* Added microcode_rwsem to serialize read()/write()/ioctl().
* Removed global kernel lock usage.
* 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
* Write 0 to 0x8B msr and then cpuid before reading revision,
* so that it works even if there were no update done by the
* BIOS. Otherwise, reading from 0x8B gives junk (which happened
* to be 0 on my machine which is why it worked even when I
* disabled update by the BIOS)
* Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
* 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
* Tigran Aivazian <tigran@veritas.com>
* Intel Pentium 4 processor support and bugfixes.
* 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
* Bugfix for HT (Hyper-Threading) enabled processors
* whereby processor resources are shared by all logical processors
* in a single CPU package.
* 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
* Tigran Aivazian <tigran@veritas.com>,
* Serialize updates as required on HT processors due to
* speculative nature of implementation.
* 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
* Fix the panic when writing zero-length microcode chunk.
* 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
* Jun Nakajima <jun.nakajima@intel.com>
* Support for the microcode updates in the new format.
* 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
* Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
* because we no longer hold a copy of applied microcode
* in kernel memory.
* 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
* Fix sigmatch() macro to handle old CPUs with pf == 0.
* Thanks to Stuart Swales for pointing out this bug.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/firmware.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <asm/microcode_intel.h>
#include <asm/processor.h>
#include <asm/msr.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
MODULE_LICENSE("GPL");
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
unsigned int val[2];
memset(csig, 0, sizeof(*csig));
csig->sig = cpuid_eax(0x00000001);
if ((c->x86_model >= 5) || (c->x86 > 6)) {
/* get processor flags from MSR 0x17 */
rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig->pf = 1 << ((val[1] >> 18) & 7);
}
csig->rev = c->microcode;
pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
cpu_num, csig->sig, csig->pf, csig->rev);
return 0;
}
/*
* return 0 - no update found
* return 1 - found update
*/
static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
{
struct cpu_signature cpu_sig;
unsigned int csig, cpf, crev;
collect_cpu_info(cpu, &cpu_sig);
csig = cpu_sig.sig;
cpf = cpu_sig.pf;
crev = cpu_sig.rev;
return get_matching_microcode(csig, cpf, mc_intel, crev);
}
static int apply_microcode_intel(int cpu)
{
struct microcode_intel *mc_intel;
struct ucode_cpu_info *uci;
unsigned int val[2];
int cpu_num = raw_smp_processor_id();
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
uci = ucode_cpu_info + cpu;
mc_intel = uci->mc;
/* We should bind the task to the CPU */
BUG_ON(cpu_num != cpu);
if (mc_intel == NULL)
return 0;
/*
* Microcode on this CPU could be updated earlier. Only apply the
* microcode patch in mc_intel when it is newer than the one on this
* CPU.
*/
if (get_matching_mc(mc_intel, cpu) == 0)
return 0;
/* write microcode via MSR 0x79 */
wrmsr(MSR_IA32_UCODE_WRITE,
(unsigned long) mc_intel->bits,
(unsigned long) mc_intel->bits >> 16 >> 16);
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
if (val[1] != mc_intel->hdr.rev) {
pr_err("CPU%d update to revision 0x%x failed\n",
cpu_num, mc_intel->hdr.rev);
return -1;
}
pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
cpu_num, val[1],
mc_intel->hdr.date & 0xffff,
mc_intel->hdr.date >> 24,
(mc_intel->hdr.date >> 16) & 0xff);
uci->cpu_sig.rev = val[1];
c->microcode = val[1];
return 0;
}
static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
int (*get_ucode_data)(void *, const void *, size_t))
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
int new_rev = uci->cpu_sig.rev;
unsigned int leftover = size;
enum ucode_state state = UCODE_OK;
unsigned int curr_mc_size = 0;
unsigned int csig, cpf;
while (leftover) {
struct microcode_header_intel mc_header;
unsigned int mc_size;
if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
break;
mc_size = get_totalsize(&mc_header);
if (!mc_size || mc_size > leftover) {
pr_err("error! Bad data in microcode data file\n");
break;
}
/* For performance reasons, reuse mc area when possible */
if (!mc || mc_size > curr_mc_size) {
vfree(mc);
mc = vmalloc(mc_size);
if (!mc)
break;
curr_mc_size = mc_size;
}
if (get_ucode_data(mc, ucode_ptr, mc_size) ||
microcode_sanity_check(mc, 1) < 0) {
break;
}
csig = uci->cpu_sig.sig;
cpf = uci->cpu_sig.pf;
if (get_matching_microcode(csig, cpf, mc, new_rev)) {
vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
mc = NULL; /* trigger new vmalloc */
}
ucode_ptr += mc_size;
leftover -= mc_size;
}
vfree(mc);
if (leftover) {
vfree(new_mc);
state = UCODE_ERROR;
goto out;
}
if (!new_mc) {
state = UCODE_NFOUND;
goto out;
}
vfree(uci->mc);
uci->mc = (struct microcode_intel *)new_mc;
/*
* If early loading microcode is supported, save this mc into
* permanent memory. So it will be loaded early when a CPU is hot added
* or resumes.
*/
save_mc_for_early(new_mc);
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
out:
return state;
}
static int get_ucode_fw(void *to, const void *from, size_t n)
{
memcpy(to, from, n);
return 0;
}
static enum ucode_state request_microcode_fw(int cpu, struct device *device,
bool refresh_fw)
{
char name[30];
struct cpuinfo_x86 *c = &cpu_data(cpu);
const struct firmware *firmware;
enum ucode_state ret;
sprintf(name, "intel-ucode/%02x-%02x-%02x",
c->x86, c->x86_model, c->x86_mask);
if (request_firmware_direct(&firmware, name, device)) {
pr_debug("data file %s load failed\n", name);
return UCODE_NFOUND;
}
ret = generic_load_microcode(cpu, (void *)firmware->data,
firmware->size, &get_ucode_fw);
release_firmware(firmware);
return ret;
}
static int get_ucode_user(void *to, const void *from, size_t n)
{
return copy_from_user(to, from, n);
}
static enum ucode_state
request_microcode_user(int cpu, const void __user *buf, size_t size)
{
return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
}
static void microcode_fini_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
vfree(uci->mc);
uci->mc = NULL;
}
static struct microcode_ops microcode_intel_ops = {
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_fw,
.collect_cpu_info = collect_cpu_info,
.apply_microcode = apply_microcode_intel,
.microcode_fini_cpu = microcode_fini_cpu,
};
struct microcode_ops * __init init_intel_microcode(void)
{
struct cpuinfo_x86 *c = &cpu_data(0);
if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
cpu_has(c, X86_FEATURE_IA64)) {
pr_err("Intel CPU family 0x%x not supported\n", c->x86);
return NULL;
}
return &microcode_intel_ops;
}

View file

@ -0,0 +1,813 @@
/*
* Intel CPU microcode early update for Linux
*
* Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
* H Peter Anvin" <hpa@zytor.com>
*
* This allows to early upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
* Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
* Software Developer's Manual.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/earlycpio.h>
#include <linux/initrd.h>
#include <linux/cpu.h>
#include <asm/msr.h>
#include <asm/microcode_intel.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
static struct mc_saved_data {
unsigned int mc_saved_count;
struct microcode_intel **mc_saved;
} mc_saved_data;
static enum ucode_state
generic_load_microcode_early(struct microcode_intel **mc_saved_p,
unsigned int mc_saved_count,
struct ucode_cpu_info *uci)
{
struct microcode_intel *ucode_ptr, *new_mc = NULL;
int new_rev = uci->cpu_sig.rev;
enum ucode_state state = UCODE_OK;
unsigned int mc_size;
struct microcode_header_intel *mc_header;
unsigned int csig = uci->cpu_sig.sig;
unsigned int cpf = uci->cpu_sig.pf;
int i;
for (i = 0; i < mc_saved_count; i++) {
ucode_ptr = mc_saved_p[i];
mc_header = (struct microcode_header_intel *)ucode_ptr;
mc_size = get_totalsize(mc_header);
if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
new_rev = mc_header->rev;
new_mc = ucode_ptr;
}
}
if (!new_mc) {
state = UCODE_NFOUND;
goto out;
}
uci->mc = (struct microcode_intel *)new_mc;
out:
return state;
}
static void
microcode_pointer(struct microcode_intel **mc_saved,
unsigned long *mc_saved_in_initrd,
unsigned long initrd_start, int mc_saved_count)
{
int i;
for (i = 0; i < mc_saved_count; i++)
mc_saved[i] = (struct microcode_intel *)
(mc_saved_in_initrd[i] + initrd_start);
}
#ifdef CONFIG_X86_32
static void
microcode_phys(struct microcode_intel **mc_saved_tmp,
struct mc_saved_data *mc_saved_data)
{
int i;
struct microcode_intel ***mc_saved;
mc_saved = (struct microcode_intel ***)
__pa_nodebug(&mc_saved_data->mc_saved);
for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
struct microcode_intel *p;
p = *(struct microcode_intel **)
__pa_nodebug(mc_saved_data->mc_saved + i);
mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
}
}
#endif
static enum ucode_state
load_microcode(struct mc_saved_data *mc_saved_data,
unsigned long *mc_saved_in_initrd,
unsigned long initrd_start,
struct ucode_cpu_info *uci)
{
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int count = mc_saved_data->mc_saved_count;
if (!mc_saved_data->mc_saved) {
microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
initrd_start, count);
return generic_load_microcode_early(mc_saved_tmp, count, uci);
} else {
#ifdef CONFIG_X86_32
microcode_phys(mc_saved_tmp, mc_saved_data);
return generic_load_microcode_early(mc_saved_tmp, count, uci);
#else
return generic_load_microcode_early(mc_saved_data->mc_saved,
count, uci);
#endif
}
}
static u8 get_x86_family(unsigned long sig)
{
u8 x86;
x86 = (sig >> 8) & 0xf;
if (x86 == 0xf)
x86 += (sig >> 20) & 0xff;
return x86;
}
static u8 get_x86_model(unsigned long sig)
{
u8 x86, x86_model;
x86 = get_x86_family(sig);
x86_model = (sig >> 4) & 0xf;
if (x86 == 0x6 || x86 == 0xf)
x86_model += ((sig >> 16) & 0xf) << 4;
return x86_model;
}
/*
* Given CPU signature and a microcode patch, this function finds if the
* microcode patch has matching family and model with the CPU.
*/
static enum ucode_state
matching_model_microcode(struct microcode_header_intel *mc_header,
unsigned long sig)
{
u8 x86, x86_model;
u8 x86_ucode, x86_model_ucode;
struct extended_sigtable *ext_header;
unsigned long total_size = get_totalsize(mc_header);
unsigned long data_size = get_datasize(mc_header);
int ext_sigcount, i;
struct extended_signature *ext_sig;
x86 = get_x86_family(sig);
x86_model = get_x86_model(sig);
x86_ucode = get_x86_family(mc_header->sig);
x86_model_ucode = get_x86_model(mc_header->sig);
if (x86 == x86_ucode && x86_model == x86_model_ucode)
return UCODE_OK;
/* Look for ext. headers: */
if (total_size <= data_size + MC_HEADER_SIZE)
return UCODE_NFOUND;
ext_header = (struct extended_sigtable *)
mc_header + data_size + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
for (i = 0; i < ext_sigcount; i++) {
x86_ucode = get_x86_family(ext_sig->sig);
x86_model_ucode = get_x86_model(ext_sig->sig);
if (x86 == x86_ucode && x86_model == x86_model_ucode)
return UCODE_OK;
ext_sig++;
}
return UCODE_NFOUND;
}
static int
save_microcode(struct mc_saved_data *mc_saved_data,
struct microcode_intel **mc_saved_src,
unsigned int mc_saved_count)
{
int i, j;
struct microcode_intel **mc_saved_p;
int ret;
if (!mc_saved_count)
return -EINVAL;
/*
* Copy new microcode data.
*/
mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
GFP_KERNEL);
if (!mc_saved_p)
return -ENOMEM;
for (i = 0; i < mc_saved_count; i++) {
struct microcode_intel *mc = mc_saved_src[i];
struct microcode_header_intel *mc_header = &mc->hdr;
unsigned long mc_size = get_totalsize(mc_header);
mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
if (!mc_saved_p[i]) {
ret = -ENOMEM;
goto err;
}
if (!mc_saved_src[i]) {
ret = -EINVAL;
goto err;
}
memcpy(mc_saved_p[i], mc, mc_size);
}
/*
* Point to newly saved microcode.
*/
mc_saved_data->mc_saved = mc_saved_p;
mc_saved_data->mc_saved_count = mc_saved_count;
return 0;
err:
for (j = 0; j <= i; j++)
kfree(mc_saved_p[j]);
kfree(mc_saved_p);
return ret;
}
/*
* A microcode patch in ucode_ptr is saved into mc_saved
* - if it has matching signature and newer revision compared to an existing
* patch mc_saved.
* - or if it is a newly discovered microcode patch.
*
* The microcode patch should have matching model with CPU.
*/
static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
unsigned int *mc_saved_count_p)
{
int i;
int found = 0;
unsigned int mc_saved_count = *mc_saved_count_p;
struct microcode_header_intel *mc_header;
mc_header = (struct microcode_header_intel *)ucode_ptr;
for (i = 0; i < mc_saved_count; i++) {
unsigned int sig, pf;
unsigned int new_rev;
struct microcode_header_intel *mc_saved_header =
(struct microcode_header_intel *)mc_saved[i];
sig = mc_saved_header->sig;
pf = mc_saved_header->pf;
new_rev = mc_header->rev;
if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
found = 1;
if (update_match_revision(mc_header, new_rev)) {
/*
* Found an older ucode saved before.
* Replace the older one with this newer
* one.
*/
mc_saved[i] =
(struct microcode_intel *)ucode_ptr;
break;
}
}
}
if (i >= mc_saved_count && !found)
/*
* This ucode is first time discovered in ucode file.
* Save it to memory.
*/
mc_saved[mc_saved_count++] =
(struct microcode_intel *)ucode_ptr;
*mc_saved_count_p = mc_saved_count;
}
/*
* Get microcode matching with BSP's model. Only CPUs with the same model as
* BSP can stay in the platform.
*/
static enum ucode_state __init
get_matching_model_microcode(int cpu, unsigned long start,
void *data, size_t size,
struct mc_saved_data *mc_saved_data,
unsigned long *mc_saved_in_initrd,
struct ucode_cpu_info *uci)
{
u8 *ucode_ptr = data;
unsigned int leftover = size;
enum ucode_state state = UCODE_OK;
unsigned int mc_size;
struct microcode_header_intel *mc_header;
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
int i;
while (leftover) {
mc_header = (struct microcode_header_intel *)ucode_ptr;
mc_size = get_totalsize(mc_header);
if (!mc_size || mc_size > leftover ||
microcode_sanity_check(ucode_ptr, 0) < 0)
break;
leftover -= mc_size;
/*
* Since APs with same family and model as the BSP may boot in
* the platform, we need to find and save microcode patches
* with the same family and model as the BSP.
*/
if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
UCODE_OK) {
ucode_ptr += mc_size;
continue;
}
_save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
ucode_ptr += mc_size;
}
if (leftover) {
state = UCODE_ERROR;
goto out;
}
if (mc_saved_count == 0) {
state = UCODE_NFOUND;
goto out;
}
for (i = 0; i < mc_saved_count; i++)
mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
mc_saved_data->mc_saved_count = mc_saved_count;
out:
return state;
}
static int collect_cpu_info_early(struct ucode_cpu_info *uci)
{
unsigned int val[2];
u8 x86, x86_model;
struct cpu_signature csig;
unsigned int eax, ebx, ecx, edx;
csig.sig = 0;
csig.pf = 0;
csig.rev = 0;
memset(uci, 0, sizeof(*uci));
eax = 0x00000001;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
csig.sig = eax;
x86 = get_x86_family(csig.sig);
x86_model = get_x86_model(csig.sig);
if ((x86_model >= 5) || (x86 > 6)) {
/* get processor flags from MSR 0x17 */
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig.pf = 1 << ((val[1] >> 18) & 7);
}
native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
csig.rev = val[1];
uci->cpu_sig = csig;
uci->valid = 1;
return 0;
}
#ifdef DEBUG
static void __ref show_saved_mc(void)
{
int i, j;
unsigned int sig, pf, rev, total_size, data_size, date;
struct ucode_cpu_info uci;
if (mc_saved_data.mc_saved_count == 0) {
pr_debug("no microcode data saved.\n");
return;
}
pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
collect_cpu_info_early(&uci);
sig = uci.cpu_sig.sig;
pf = uci.cpu_sig.pf;
rev = uci.cpu_sig.rev;
pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
smp_processor_id(), sig, pf, rev);
for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
struct microcode_header_intel *mc_saved_header;
struct extended_sigtable *ext_header;
int ext_sigcount;
struct extended_signature *ext_sig;
mc_saved_header = (struct microcode_header_intel *)
mc_saved_data.mc_saved[i];
sig = mc_saved_header->sig;
pf = mc_saved_header->pf;
rev = mc_saved_header->rev;
total_size = get_totalsize(mc_saved_header);
data_size = get_datasize(mc_saved_header);
date = mc_saved_header->date;
pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n",
i, sig, pf, rev, total_size,
date & 0xffff,
date >> 24,
(date >> 16) & 0xff);
/* Look for ext. headers: */
if (total_size <= data_size + MC_HEADER_SIZE)
continue;
ext_header = (struct extended_sigtable *)
mc_saved_header + data_size + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
for (j = 0; j < ext_sigcount; j++) {
sig = ext_sig->sig;
pf = ext_sig->pf;
pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
j, sig, pf);
ext_sig++;
}
}
}
#else
static inline void show_saved_mc(void)
{
}
#endif
#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
static DEFINE_MUTEX(x86_cpu_microcode_mutex);
/*
* Save this mc into mc_saved_data. So it will be loaded early when a CPU is
* hot added or resumes.
*
* Please make sure this mc should be a valid microcode patch before calling
* this function.
*/
int save_mc_for_early(u8 *mc)
{
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int mc_saved_count_init;
unsigned int mc_saved_count;
struct microcode_intel **mc_saved;
int ret = 0;
int i;
/*
* Hold hotplug lock so mc_saved_data is not accessed by a CPU in
* hotplug.
*/
mutex_lock(&x86_cpu_microcode_mutex);
mc_saved_count_init = mc_saved_data.mc_saved_count;
mc_saved_count = mc_saved_data.mc_saved_count;
mc_saved = mc_saved_data.mc_saved;
if (mc_saved && mc_saved_count)
memcpy(mc_saved_tmp, mc_saved,
mc_saved_count * sizeof(struct microcode_intel *));
/*
* Save the microcode patch mc in mc_save_tmp structure if it's a newer
* version.
*/
_save_mc(mc_saved_tmp, mc, &mc_saved_count);
/*
* Save the mc_save_tmp in global mc_saved_data.
*/
ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
if (ret) {
pr_err("Cannot save microcode patch.\n");
goto out;
}
show_saved_mc();
/*
* Free old saved microcode data.
*/
if (mc_saved) {
for (i = 0; i < mc_saved_count_init; i++)
kfree(mc_saved[i]);
kfree(mc_saved);
}
out:
mutex_unlock(&x86_cpu_microcode_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(save_mc_for_early);
#endif
static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
static __init enum ucode_state
scan_microcode(unsigned long start, unsigned long end,
struct mc_saved_data *mc_saved_data,
unsigned long *mc_saved_in_initrd,
struct ucode_cpu_info *uci)
{
unsigned int size = end - start + 1;
struct cpio_data cd;
long offset = 0;
#ifdef CONFIG_X86_32
char *p = (char *)__pa_nodebug(ucode_name);
#else
char *p = ucode_name;
#endif
cd.data = NULL;
cd.size = 0;
cd = find_cpio_data(p, (void *)start, size, &offset);
if (!cd.data)
return UCODE_ERROR;
return get_matching_model_microcode(0, start, cd.data, cd.size,
mc_saved_data, mc_saved_in_initrd,
uci);
}
/*
* Print ucode update info.
*/
static void
print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
{
int cpu = smp_processor_id();
pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
cpu,
uci->cpu_sig.rev,
date & 0xffff,
date >> 24,
(date >> 16) & 0xff);
}
#ifdef CONFIG_X86_32
static int delay_ucode_info;
static int current_mc_date;
/*
* Print early updated ucode info after printk works. This is delayed info dump.
*/
void show_ucode_info_early(void)
{
struct ucode_cpu_info uci;
if (delay_ucode_info) {
collect_cpu_info_early(&uci);
print_ucode_info(&uci, current_mc_date);
delay_ucode_info = 0;
}
}
/*
* At this point, we can not call printk() yet. Keep microcode patch number in
* mc_saved_data.mc_saved and delay printing microcode info in
* show_ucode_info_early() until printk() works.
*/
static void print_ucode(struct ucode_cpu_info *uci)
{
struct microcode_intel *mc_intel;
int *delay_ucode_info_p;
int *current_mc_date_p;
mc_intel = uci->mc;
if (mc_intel == NULL)
return;
delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
*delay_ucode_info_p = 1;
*current_mc_date_p = mc_intel->hdr.date;
}
#else
/*
* Flush global tlb. We only do this in x86_64 where paging has been enabled
* already and PGE should be enabled as well.
*/
static inline void flush_tlb_early(void)
{
__native_flush_tlb_global_irq_disabled();
}
static inline void print_ucode(struct ucode_cpu_info *uci)
{
struct microcode_intel *mc_intel;
mc_intel = uci->mc;
if (mc_intel == NULL)
return;
print_ucode_info(uci, mc_intel->hdr.date);
}
#endif
static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
{
struct microcode_intel *mc_intel;
unsigned int val[2];
mc_intel = uci->mc;
if (mc_intel == NULL)
return 0;
/* write microcode via MSR 0x79 */
native_wrmsr(MSR_IA32_UCODE_WRITE,
(unsigned long) mc_intel->bits,
(unsigned long) mc_intel->bits >> 16 >> 16);
native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
if (val[1] != mc_intel->hdr.rev)
return -1;
#ifdef CONFIG_X86_64
/* Flush global tlb. This is precaution. */
flush_tlb_early();
#endif
uci->cpu_sig.rev = val[1];
if (early)
print_ucode(uci);
else
print_ucode_info(uci, mc_intel->hdr.date);
return 0;
}
/*
* This function converts microcode patch offsets previously stored in
* mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
*/
int __init save_microcode_in_initrd_intel(void)
{
unsigned int count = mc_saved_data.mc_saved_count;
struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
int ret = 0;
if (count == 0)
return ret;
microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
ret = save_microcode(&mc_saved_data, mc_saved, count);
if (ret)
pr_err("Cannot save microcode patches from initrd.\n");
show_saved_mc();
return ret;
}
static void __init
_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
unsigned long *mc_saved_in_initrd,
unsigned long initrd_start_early,
unsigned long initrd_end_early,
struct ucode_cpu_info *uci)
{
enum ucode_state ret;
collect_cpu_info_early(uci);
scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
mc_saved_in_initrd, uci);
ret = load_microcode(mc_saved_data, mc_saved_in_initrd,
initrd_start_early, uci);
if (ret == UCODE_OK)
apply_microcode_early(uci, true);
}
void __init
load_ucode_intel_bsp(void)
{
u64 ramdisk_image, ramdisk_size;
unsigned long initrd_start_early, initrd_end_early;
struct ucode_cpu_info uci;
#ifdef CONFIG_X86_32
struct boot_params *boot_params_p;
boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
ramdisk_image = boot_params_p->hdr.ramdisk_image;
ramdisk_size = boot_params_p->hdr.ramdisk_size;
initrd_start_early = ramdisk_image;
initrd_end_early = initrd_start_early + ramdisk_size;
_load_ucode_intel_bsp(
(struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
(unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
initrd_start_early, initrd_end_early, &uci);
#else
ramdisk_image = boot_params.hdr.ramdisk_image;
ramdisk_size = boot_params.hdr.ramdisk_size;
initrd_start_early = ramdisk_image + PAGE_OFFSET;
initrd_end_early = initrd_start_early + ramdisk_size;
_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
initrd_start_early, initrd_end_early,
&uci);
#endif
}
void load_ucode_intel_ap(void)
{
struct mc_saved_data *mc_saved_data_p;
struct ucode_cpu_info uci;
unsigned long *mc_saved_in_initrd_p;
unsigned long initrd_start_addr;
#ifdef CONFIG_X86_32
unsigned long *initrd_start_p;
mc_saved_in_initrd_p =
(unsigned long *)__pa_nodebug(mc_saved_in_initrd);
mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
#else
mc_saved_data_p = &mc_saved_data;
mc_saved_in_initrd_p = mc_saved_in_initrd;
initrd_start_addr = initrd_start;
#endif
/*
* If there is no valid ucode previously saved in memory, no need to
* update ucode on this AP.
*/
if (mc_saved_data_p->mc_saved_count == 0)
return;
collect_cpu_info_early(&uci);
load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
initrd_start_addr, &uci);
apply_microcode_early(&uci, true);
}
void reload_ucode_intel(void)
{
struct ucode_cpu_info uci;
enum ucode_state ret;
if (!mc_saved_data.mc_saved_count)
return;
collect_cpu_info_early(&uci);
ret = generic_load_microcode_early(mc_saved_data.mc_saved,
mc_saved_data.mc_saved_count, &uci);
if (ret != UCODE_OK)
return;
apply_microcode_early(&uci, false);
}

View file

@ -0,0 +1,174 @@
/*
* Intel CPU Microcode Update Driver for Linux
*
* Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
* H Peter Anvin" <hpa@zytor.com>
*
* This driver allows to upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
* Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
* Software Developer's Manual
* Order Number 253668 or free download from:
*
* http://developer.intel.com/Assets/PDF/manual/253668.pdf
*
* For more information, go to http://www.urbanmyth.org/microcode
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/firmware.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/microcode_intel.h>
#include <asm/processor.h>
#include <asm/msr.h>
static inline int
update_match_cpu(unsigned int csig, unsigned int cpf,
unsigned int sig, unsigned int pf)
{
return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
}
int
update_match_revision(struct microcode_header_intel *mc_header, int rev)
{
return (mc_header->rev <= rev) ? 0 : 1;
}
int microcode_sanity_check(void *mc, int print_err)
{
unsigned long total_size, data_size, ext_table_size;
struct microcode_header_intel *mc_header = mc;
struct extended_sigtable *ext_header = NULL;
int sum, orig_sum, ext_sigcount = 0, i;
struct extended_signature *ext_sig;
total_size = get_totalsize(mc_header);
data_size = get_datasize(mc_header);
if (data_size + MC_HEADER_SIZE > total_size) {
if (print_err)
pr_err("error! Bad data size in microcode data file\n");
return -EINVAL;
}
if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
if (print_err)
pr_err("error! Unknown microcode update format\n");
return -EINVAL;
}
ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
if (ext_table_size) {
if ((ext_table_size < EXT_HEADER_SIZE)
|| ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
if (print_err)
pr_err("error! Small exttable size in microcode data file\n");
return -EINVAL;
}
ext_header = mc + MC_HEADER_SIZE + data_size;
if (ext_table_size != exttable_size(ext_header)) {
if (print_err)
pr_err("error! Bad exttable size in microcode data file\n");
return -EFAULT;
}
ext_sigcount = ext_header->count;
}
/* check extended table checksum */
if (ext_table_size) {
int ext_table_sum = 0;
int *ext_tablep = (int *)ext_header;
i = ext_table_size / DWSIZE;
while (i--)
ext_table_sum += ext_tablep[i];
if (ext_table_sum) {
if (print_err)
pr_warn("aborting, bad extended signature table checksum\n");
return -EINVAL;
}
}
/* calculate the checksum */
orig_sum = 0;
i = (MC_HEADER_SIZE + data_size) / DWSIZE;
while (i--)
orig_sum += ((int *)mc)[i];
if (orig_sum) {
if (print_err)
pr_err("aborting, bad checksum\n");
return -EINVAL;
}
if (!ext_table_size)
return 0;
/* check extended signature checksum */
for (i = 0; i < ext_sigcount; i++) {
ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
EXT_SIGNATURE_SIZE * i;
sum = orig_sum
- (mc_header->sig + mc_header->pf + mc_header->cksum)
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
if (sum) {
if (print_err)
pr_err("aborting, bad checksum\n");
return -EINVAL;
}
}
return 0;
}
EXPORT_SYMBOL_GPL(microcode_sanity_check);
/*
* return 0 - no update found
* return 1 - found update
*/
int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
{
struct microcode_header_intel *mc_header = mc;
struct extended_sigtable *ext_header;
unsigned long total_size = get_totalsize(mc_header);
int ext_sigcount, i;
struct extended_signature *ext_sig;
if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
return 1;
/* Look for ext. headers: */
if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
return 0;
ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
for (i = 0; i < ext_sigcount; i++) {
if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
return 1;
ext_sig++;
}
return 0;
}
/*
* return 0 - no update found
* return 1 - found update
*/
int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
{
struct microcode_header_intel *mc_header = mc;
if (!update_match_revision(mc_header, rev))
return 0;
return get_matching_sig(csig, cpf, mc, rev);
}
EXPORT_SYMBOL_GPL(get_matching_microcode);

View file

@ -0,0 +1,64 @@
#!/bin/sh
#
# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h
#
IN=$1
OUT=$2
function dump_array()
{
ARRAY=$1
SIZE=$2
PFX=$3
POSTFIX=$4
PFX_SZ=$(echo $PFX | wc -c)
TABS="$(printf '\t\t\t\t\t')"
echo "const char * const $ARRAY[$SIZE] = {"
# Iterate through any input lines starting with #define $PFX
sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
while read i
do
# Name is everything up to the first whitespace
NAME="$(echo "$i" | sed 's/ .*//')"
# If the /* comment */ starts with a quote string, grab that.
VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
[ -z "$VALUE" ] && VALUE="\"$NAME\""
[ "$VALUE" == '""' ] && continue
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
if [ -n "$POSTFIX" ]; then
T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
TABS="$(printf '\t\t\t\t\t\t')"
TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
else
TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
fi
done
echo "};"
}
trap 'rm "$OUT"' EXIT
(
echo "#ifndef _ASM_X86_CPUFEATURE_H"
echo "#include <asm/cpufeature.h>"
echo "#endif"
echo ""
dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
echo ""
dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
) > $OUT
trap - EXIT

View file

@ -0,0 +1,153 @@
/*
* HyperV Detection code.
*
* Copyright (C) 2010, Novell, Inc.
* Author : K. Y. Srinivasan <ksrinivasan@novell.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; version 2 of the License.
*
*/
#include <linux/types.h>
#include <linux/time.h>
#include <linux/clocksource.h>
#include <linux/module.h>
#include <linux/hardirq.h>
#include <linux/efi.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
#include <asm/idle.h>
#include <asm/irq_regs.h>
#include <asm/i8259.h>
#include <asm/apic.h>
#include <asm/timer.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
#if IS_ENABLED(CONFIG_HYPERV)
static void (*vmbus_handler)(void);
void hyperv_vector_handler(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
irq_enter();
exit_idle();
inc_irq_stat(irq_hv_callback_count);
if (vmbus_handler)
vmbus_handler();
irq_exit();
set_irq_regs(old_regs);
}
void hv_setup_vmbus_irq(void (*handler)(void))
{
vmbus_handler = handler;
/*
* Setup the IDT for hypervisor callback. Prevent reallocation
* at module reload.
*/
if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
hyperv_callback_vector);
}
void hv_remove_vmbus_irq(void)
{
/* We have no way to deallocate the interrupt gate */
vmbus_handler = NULL;
}
EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
#endif
static uint32_t __init ms_hyperv_platform(void)
{
u32 eax;
u32 hyp_signature[3];
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
return 0;
cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
&eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
if (eax >= HYPERV_CPUID_MIN &&
eax <= HYPERV_CPUID_MAX &&
!memcmp("Microsoft Hv", hyp_signature, 12))
return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
return 0;
}
static cycle_t read_hv_clock(struct clocksource *arg)
{
cycle_t current_tick;
/*
* Read the partition counter to get the current tick count. This count
* is set to 0 when the partition is created and is incremented in
* 100 nanosecond units.
*/
rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
return current_tick;
}
static struct clocksource hyperv_cs = {
.name = "hyperv_clocksource",
.rating = 400, /* use this when running on Hyperv*/
.read = read_hv_clock,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static void __init ms_hyperv_init_platform(void)
{
/*
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
/*
* Get the APIC frequency.
*/
u64 hv_lapic_frequency;
rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
lapic_timer_frequency = hv_lapic_frequency;
printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n",
lapic_timer_frequency);
}
#endif
if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
#ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
#endif
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft HyperV",
.detect = ms_hyperv_platform,
.init_platform = ms_hyperv_init_platform,
};
EXPORT_SYMBOL(x86_hyper_ms_hyperv);

View file

@ -0,0 +1,3 @@
obj-y := main.o if.o generic.o cleanup.o
obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o

View file

@ -0,0 +1,124 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "mtrr.h"
static void
amd_get_mtrr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type)
{
unsigned long low, high;
rdmsr(MSR_K6_UWCCR, low, high);
/* Upper dword is region 1, lower is region 0 */
if (reg == 1)
low = high;
/* The base masks off on the right alignment */
*base = (low & 0xFFFE0000) >> PAGE_SHIFT;
*type = 0;
if (low & 1)
*type = MTRR_TYPE_UNCACHABLE;
if (low & 2)
*type = MTRR_TYPE_WRCOMB;
if (!(low & 3)) {
*size = 0;
return;
}
/*
* This needs a little explaining. The size is stored as an
* inverted mask of bits of 128K granularity 15 bits long offset
* 2 bits.
*
* So to get a size we do invert the mask and add 1 to the lowest
* mask bit (4 as its 2 bits in). This gives us a size we then shift
* to turn into 128K blocks.
*
* eg 111 1111 1111 1100 is 512K
*
* invert 000 0000 0000 0011
* +1 000 0000 0000 0100
* *128K ...
*/
low = (~low) & 0x1FFFC;
*size = (low + 4) << (15 - PAGE_SHIFT);
}
/**
* amd_set_mtrr - Set variable MTRR register on the local CPU.
*
* @reg The register to set.
* @base The base address of the region.
* @size The size of the region. If this is 0 the region is disabled.
* @type The type of the region.
*
* Returns nothing.
*/
static void
amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
{
u32 regs[2];
/*
* Low is MTRR0, High MTRR 1
*/
rdmsr(MSR_K6_UWCCR, regs[0], regs[1]);
/*
* Blank to disable
*/
if (size == 0) {
regs[reg] = 0;
} else {
/*
* Set the register to the base, the type (off by one) and an
* inverted bitmask of the size The size is the only odd
* bit. We are fed say 512K We invert this and we get 111 1111
* 1111 1011 but if you subtract one and invert you get the
* desired 111 1111 1111 1100 mask
*
* But ~(x - 1) == ~x + 1 == -x. Two's complement rocks!
*/
regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC)
| (base << PAGE_SHIFT) | (type + 1);
}
/*
* The writeback rule is quite specific. See the manual. Its
* disable local interrupts, write back the cache, set the mtrr
*/
wbinvd();
wrmsr(MSR_K6_UWCCR, regs[0], regs[1]);
}
static int
amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
{
/*
* Apply the K6 block alignment and size rules
* In order
* o Uncached or gathering only
* o 128K or bigger block
* o Power of 2 block
* o base suitably aligned to the power
*/
if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT))
|| (size & ~(size - 1)) - size || (base & (size - 1)))
return -EINVAL;
return 0;
}
static const struct mtrr_ops amd_mtrr_ops = {
.vendor = X86_VENDOR_AMD,
.set = amd_set_mtrr,
.get = amd_get_mtrr,
.get_free_region = generic_get_free_region,
.validate_add_page = amd_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
int __init amd_init_mtrr(void)
{
set_mtrr_ops(&amd_mtrr_ops);
return 0;
}

View file

@ -0,0 +1,126 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "mtrr.h"
static struct {
unsigned long high;
unsigned long low;
} centaur_mcr[8];
static u8 centaur_mcr_reserved;
static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
/**
* centaur_get_free_region - Get a free MTRR.
*
* @base: The starting (base) address of the region.
* @size: The size (in bytes) of the region.
*
* Returns: the index of the region on success, else -1 on error.
*/
static int
centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
{
unsigned long lbase, lsize;
mtrr_type ltype;
int i, max;
max = num_var_ranges;
if (replace_reg >= 0 && replace_reg < max)
return replace_reg;
for (i = 0; i < max; ++i) {
if (centaur_mcr_reserved & (1 << i))
continue;
mtrr_if->get(i, &lbase, &lsize, &ltype);
if (lsize == 0)
return i;
}
return -ENOSPC;
}
/*
* Report boot time MCR setups
*/
void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
{
centaur_mcr[mcr].low = lo;
centaur_mcr[mcr].high = hi;
}
static void
centaur_get_mcr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type * type)
{
*base = centaur_mcr[reg].high >> PAGE_SHIFT;
*size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
*type = MTRR_TYPE_WRCOMB; /* write-combining */
if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2))
*type = MTRR_TYPE_UNCACHABLE;
if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25)
*type = MTRR_TYPE_WRBACK;
if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31)
*type = MTRR_TYPE_WRBACK;
}
static void
centaur_set_mcr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
unsigned long low, high;
if (size == 0) {
/* Disable */
high = low = 0;
} else {
high = base << PAGE_SHIFT;
if (centaur_mcr_type == 0) {
/* Only support write-combining... */
low = -size << PAGE_SHIFT | 0x1f;
} else {
if (type == MTRR_TYPE_UNCACHABLE)
low = -size << PAGE_SHIFT | 0x02; /* NC */
else
low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */
}
}
centaur_mcr[reg].high = high;
centaur_mcr[reg].low = low;
wrmsr(MSR_IDT_MCR0 + reg, low, high);
}
static int
centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
{
/*
* FIXME: Winchip2 supports uncached
*/
if (type != MTRR_TYPE_WRCOMB &&
(centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
pr_warning("mtrr: only write-combining%s supported\n",
centaur_mcr_type ? " and uncacheable are" : " is");
return -EINVAL;
}
return 0;
}
static const struct mtrr_ops centaur_mtrr_ops = {
.vendor = X86_VENDOR_CENTAUR,
.set = centaur_set_mcr,
.get = centaur_get_mcr,
.get_free_region = centaur_get_free_region,
.validate_add_page = centaur_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
int __init centaur_init_mtrr(void)
{
set_mtrr_ops(&centaur_mtrr_ops);
return 0;
}

View file

@ -0,0 +1,980 @@
/*
* MTRR (Memory Type Range Register) cleanup
*
* Copyright (C) 2009 Yinghai Lu
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/uaccess.h>
#include <linux/kvm_para.h>
#include <linux/range.h>
#include <asm/processor.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "mtrr.h"
struct var_mtrr_range_state {
unsigned long base_pfn;
unsigned long size_pfn;
mtrr_type type;
};
struct var_mtrr_state {
unsigned long range_startk;
unsigned long range_sizek;
unsigned long chunk_sizek;
unsigned long gran_sizek;
unsigned int reg;
};
/* Should be related to MTRR_VAR_RANGES nums */
#define RANGE_NUM 256
static struct range __initdata range[RANGE_NUM];
static int __initdata nr_range;
static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
static int __initdata debug_print;
#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
#define BIOS_BUG_MSG KERN_WARNING \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
static int __init
x86_get_mtrr_mem_range(struct range *range, int nr_range,
unsigned long extra_remove_base,
unsigned long extra_remove_size)
{
unsigned long base, size;
mtrr_type type;
int i;
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
if (type != MTRR_TYPE_WRBACK)
continue;
base = range_state[i].base_pfn;
size = range_state[i].size_pfn;
nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
base, base + size);
}
if (debug_print) {
printk(KERN_DEBUG "After WB checking\n");
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end);
}
/* Take out UC ranges: */
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
if (type != MTRR_TYPE_UNCACHABLE &&
type != MTRR_TYPE_WRPROT)
continue;
size = range_state[i].size_pfn;
if (!size)
continue;
base = range_state[i].base_pfn;
if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
(mtrr_state.enabled & 1)) {
/* Var MTRR contains UC entry below 1M? Skip it: */
printk(BIOS_BUG_MSG, i);
if (base + size <= (1<<(20-PAGE_SHIFT)))
continue;
size -= (1<<(20-PAGE_SHIFT)) - base;
base = 1<<(20-PAGE_SHIFT);
}
subtract_range(range, RANGE_NUM, base, base + size);
}
if (extra_remove_size)
subtract_range(range, RANGE_NUM, extra_remove_base,
extra_remove_base + extra_remove_size);
if (debug_print) {
printk(KERN_DEBUG "After UC checking\n");
for (i = 0; i < RANGE_NUM; i++) {
if (!range[i].end)
continue;
printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end);
}
}
/* sort the ranges */
nr_range = clean_sort_range(range, RANGE_NUM);
if (debug_print) {
printk(KERN_DEBUG "After sorting\n");
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end);
}
return nr_range;
}
#ifdef CONFIG_MTRR_SANITIZER
static unsigned long __init sum_ranges(struct range *range, int nr_range)
{
unsigned long sum = 0;
int i;
for (i = 0; i < nr_range; i++)
sum += range[i].end - range[i].start;
return sum;
}
static int enable_mtrr_cleanup __initdata =
CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
static int __init disable_mtrr_cleanup_setup(char *str)
{
enable_mtrr_cleanup = 0;
return 0;
}
early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
static int __init enable_mtrr_cleanup_setup(char *str)
{
enable_mtrr_cleanup = 1;
return 0;
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
static int __init mtrr_cleanup_debug_setup(char *str)
{
debug_print = 1;
return 0;
}
early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
static void __init
set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
unsigned char type, unsigned int address_bits)
{
u32 base_lo, base_hi, mask_lo, mask_hi;
u64 base, mask;
if (!sizek) {
fill_mtrr_var_range(reg, 0, 0, 0, 0);
return;
}
mask = (1ULL << address_bits) - 1;
mask &= ~((((u64)sizek) << 10) - 1);
base = ((u64)basek) << 10;
base |= type;
mask |= 0x800;
base_lo = base & ((1ULL<<32) - 1);
base_hi = base >> 32;
mask_lo = mask & ((1ULL<<32) - 1);
mask_hi = mask >> 32;
fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
}
static void __init
save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
unsigned char type)
{
range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
range_state[reg].type = type;
}
static void __init set_var_mtrr_all(unsigned int address_bits)
{
unsigned long basek, sizek;
unsigned char type;
unsigned int reg;
for (reg = 0; reg < num_var_ranges; reg++) {
basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
type = range_state[reg].type;
set_var_mtrr(reg, basek, sizek, type, address_bits);
}
}
static unsigned long to_size_factor(unsigned long sizek, char *factorp)
{
unsigned long base = sizek;
char factor;
if (base & ((1<<10) - 1)) {
/* Not MB-aligned: */
factor = 'K';
} else if (base & ((1<<20) - 1)) {
factor = 'M';
base >>= 10;
} else {
factor = 'G';
base >>= 20;
}
*factorp = factor;
return base;
}
static unsigned int __init
range_to_mtrr(unsigned int reg, unsigned long range_startk,
unsigned long range_sizek, unsigned char type)
{
if (!range_sizek || (reg >= num_var_ranges))
return reg;
while (range_sizek) {
unsigned long max_align, align;
unsigned long sizek;
/* Compute the maximum size with which we can make a range: */
if (range_startk)
max_align = __ffs(range_startk);
else
max_align = BITS_PER_LONG - 1;
align = __fls(range_sizek);
if (align > max_align)
align = max_align;
sizek = 1UL << align;
if (debug_print) {
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
start_base = to_size_factor(range_startk, &start_factor);
size_base = to_size_factor(sizek, &size_factor);
Dprintk("Setting variable MTRR %d, "
"base: %ld%cB, range: %ld%cB, type %s\n",
reg, start_base, start_factor,
size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
);
}
save_var_mtrr(reg++, range_startk, sizek, type);
range_startk += sizek;
range_sizek -= sizek;
if (reg >= num_var_ranges)
break;
}
return reg;
}
static unsigned __init
range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
unsigned long sizek)
{
unsigned long hole_basek, hole_sizek;
unsigned long second_basek, second_sizek;
unsigned long range0_basek, range0_sizek;
unsigned long range_basek, range_sizek;
unsigned long chunk_sizek;
unsigned long gran_sizek;
hole_basek = 0;
hole_sizek = 0;
second_basek = 0;
second_sizek = 0;
chunk_sizek = state->chunk_sizek;
gran_sizek = state->gran_sizek;
/* Align with gran size, prevent small block used up MTRRs: */
range_basek = ALIGN(state->range_startk, gran_sizek);
if ((range_basek > basek) && basek)
return second_sizek;
state->range_sizek -= (range_basek - state->range_startk);
range_sizek = ALIGN(state->range_sizek, gran_sizek);
while (range_sizek > state->range_sizek) {
range_sizek -= gran_sizek;
if (!range_sizek)
return 0;
}
state->range_sizek = range_sizek;
/* Try to append some small hole: */
range0_basek = state->range_startk;
range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
/* No increase: */
if (range0_sizek == state->range_sizek) {
Dprintk("rangeX: %016lx - %016lx\n",
range0_basek<<10,
(range0_basek + state->range_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range0_basek,
state->range_sizek, MTRR_TYPE_WRBACK);
return 0;
}
/* Only cut back when it is not the last: */
if (sizek) {
while (range0_basek + range0_sizek > (basek + sizek)) {
if (range0_sizek >= chunk_sizek)
range0_sizek -= chunk_sizek;
else
range0_sizek = 0;
if (!range0_sizek)
break;
}
}
second_try:
range_basek = range0_basek + range0_sizek;
/* One hole in the middle: */
if (range_basek > basek && range_basek <= (basek + sizek))
second_sizek = range_basek - basek;
if (range0_sizek > state->range_sizek) {
/* One hole in middle or at the end: */
hole_sizek = range0_sizek - state->range_sizek - second_sizek;
/* Hole size should be less than half of range0 size: */
if (hole_sizek >= (range0_sizek >> 1) &&
range0_sizek >= chunk_sizek) {
range0_sizek -= chunk_sizek;
second_sizek = 0;
hole_sizek = 0;
goto second_try;
}
}
if (range0_sizek) {
Dprintk("range0: %016lx - %016lx\n",
range0_basek<<10,
(range0_basek + range0_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range0_basek,
range0_sizek, MTRR_TYPE_WRBACK);
}
if (range0_sizek < state->range_sizek) {
/* Need to handle left over range: */
range_sizek = state->range_sizek - range0_sizek;
Dprintk("range: %016lx - %016lx\n",
range_basek<<10,
(range_basek + range_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range_basek,
range_sizek, MTRR_TYPE_WRBACK);
}
if (hole_sizek) {
hole_basek = range_basek - hole_sizek - second_sizek;
Dprintk("hole: %016lx - %016lx\n",
hole_basek<<10,
(hole_basek + hole_sizek)<<10);
state->reg = range_to_mtrr(state->reg, hole_basek,
hole_sizek, MTRR_TYPE_UNCACHABLE);
}
return second_sizek;
}
static void __init
set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
unsigned long size_pfn)
{
unsigned long basek, sizek;
unsigned long second_sizek = 0;
if (state->reg >= num_var_ranges)
return;
basek = base_pfn << (PAGE_SHIFT - 10);
sizek = size_pfn << (PAGE_SHIFT - 10);
/* See if I can merge with the last range: */
if ((basek <= 1024) ||
(state->range_startk + state->range_sizek == basek)) {
unsigned long endk = basek + sizek;
state->range_sizek = endk - state->range_startk;
return;
}
/* Write the range mtrrs: */
if (state->range_sizek != 0)
second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
/* Allocate an msr: */
state->range_startk = basek + second_sizek;
state->range_sizek = sizek - second_sizek;
}
/* Mininum size of mtrr block that can take hole: */
static u64 mtrr_chunk_size __initdata = (256ULL<<20);
static int __init parse_mtrr_chunk_size_opt(char *p)
{
if (!p)
return -EINVAL;
mtrr_chunk_size = memparse(p, &p);
return 0;
}
early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
/* Granularity of mtrr of block: */
static u64 mtrr_gran_size __initdata;
static int __init parse_mtrr_gran_size_opt(char *p)
{
if (!p)
return -EINVAL;
mtrr_gran_size = memparse(p, &p);
return 0;
}
early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
static unsigned long nr_mtrr_spare_reg __initdata =
CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
static int __init parse_mtrr_spare_reg(char *arg)
{
if (arg)
nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
return 0;
}
early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
static int __init
x86_setup_var_mtrrs(struct range *range, int nr_range,
u64 chunk_size, u64 gran_size)
{
struct var_mtrr_state var_state;
int num_reg;
int i;
var_state.range_startk = 0;
var_state.range_sizek = 0;
var_state.reg = 0;
var_state.chunk_sizek = chunk_size >> 10;
var_state.gran_sizek = gran_size >> 10;
memset(range_state, 0, sizeof(range_state));
/* Write the range: */
for (i = 0; i < nr_range; i++) {
set_var_mtrr_range(&var_state, range[i].start,
range[i].end - range[i].start);
}
/* Write the last range: */
if (var_state.range_sizek != 0)
range_to_mtrr_with_hole(&var_state, 0, 0);
num_reg = var_state.reg;
/* Clear out the extra MTRR's: */
while (var_state.reg < num_var_ranges) {
save_var_mtrr(var_state.reg, 0, 0, 0);
var_state.reg++;
}
return num_reg;
}
struct mtrr_cleanup_result {
unsigned long gran_sizek;
unsigned long chunk_sizek;
unsigned long lose_cover_sizek;
unsigned int num_reg;
int bad;
};
/*
* gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
* chunk size: gran_size, ..., 2G
* so we need (1+16)*8
*/
#define NUM_RESULT 136
#define PSHIFT (PAGE_SHIFT - 10)
static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
static unsigned long __initdata min_loss_pfn[RANGE_NUM];
static void __init print_out_mtrr_range_state(void)
{
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
mtrr_type type;
int i;
for (i = 0; i < num_var_ranges; i++) {
size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
if (!size_base)
continue;
size_base = to_size_factor(size_base, &size_factor),
start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
start_base = to_size_factor(start_base, &start_factor),
type = range_state[i].type;
printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
i, start_base, start_factor,
size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
((type == MTRR_TYPE_WRPROT) ? "WP" :
((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
);
}
}
static int __init mtrr_need_cleanup(void)
{
int i;
mtrr_type type;
unsigned long size;
/* Extra one for all 0: */
int num[MTRR_NUM_TYPES + 1];
/* Check entries number: */
memset(num, 0, sizeof(num));
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
size = range_state[i].size_pfn;
if (type >= MTRR_NUM_TYPES)
continue;
if (!size)
type = MTRR_NUM_TYPES;
num[type]++;
}
/* Check if we got UC entries: */
if (!num[MTRR_TYPE_UNCACHABLE])
return 0;
/* Check if we only had WB and UC */
if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
return 1;
}
static unsigned long __initdata range_sums;
static void __init
mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
unsigned long x_remove_base,
unsigned long x_remove_size, int i)
{
static struct range range_new[RANGE_NUM];
unsigned long range_sums_new;
static int nr_range_new;
int num_reg;
/* Convert ranges to var ranges state: */
num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
/* We got new setting in range_state, check it: */
memset(range_new, 0, sizeof(range_new));
nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
x_remove_base, x_remove_size);
range_sums_new = sum_ranges(range_new, nr_range_new);
result[i].chunk_sizek = chunk_size >> 10;
result[i].gran_sizek = gran_size >> 10;
result[i].num_reg = num_reg;
if (range_sums < range_sums_new) {
result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT;
result[i].bad = 1;
} else {
result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT;
}
/* Double check it: */
if (!result[i].bad && !result[i].lose_cover_sizek) {
if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range)))
result[i].bad = 1;
}
if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg]))
min_loss_pfn[num_reg] = range_sums - range_sums_new;
}
static void __init mtrr_print_out_one_result(int i)
{
unsigned long gran_base, chunk_base, lose_base;
char gran_factor, chunk_factor, lose_factor;
gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
result[i].bad ? "*BAD*" : " ",
gran_base, gran_factor, chunk_base, chunk_factor);
pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n",
result[i].num_reg, result[i].bad ? "-" : "",
lose_base, lose_factor);
}
static int __init mtrr_search_optimal_index(void)
{
int num_reg_good;
int index_good;
int i;
if (nr_mtrr_spare_reg >= num_var_ranges)
nr_mtrr_spare_reg = num_var_ranges - 1;
num_reg_good = -1;
for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
if (!min_loss_pfn[i])
num_reg_good = i;
}
index_good = -1;
if (num_reg_good != -1) {
for (i = 0; i < NUM_RESULT; i++) {
if (!result[i].bad &&
result[i].num_reg == num_reg_good &&
!result[i].lose_cover_sizek) {
index_good = i;
break;
}
}
}
return index_good;
}
int __init mtrr_cleanup(unsigned address_bits)
{
unsigned long x_remove_base, x_remove_size;
unsigned long base, size, def, dummy;
u64 chunk_size, gran_size;
mtrr_type type;
int index_good;
int i;
if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
return 0;
rdmsr(MSR_MTRRdefType, def, dummy);
def &= 0xff;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
/* Get it and store it aside: */
memset(range_state, 0, sizeof(range_state));
for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i, &base, &size, &type);
range_state[i].base_pfn = base;
range_state[i].size_pfn = size;
range_state[i].type = type;
}
/* Check if we need handle it and can handle it: */
if (!mtrr_need_cleanup())
return 0;
/* Print original var MTRRs at first, for debugging: */
printk(KERN_DEBUG "original variable MTRRs\n");
print_out_mtrr_range_state();
memset(range, 0, sizeof(range));
x_remove_size = 0;
x_remove_base = 1 << (32 - PAGE_SHIFT);
if (mtrr_tom2)
x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base;
/*
* [0, 1M) should always be covered by var mtrr with WB
* and fixed mtrrs should take effect before var mtrr for it:
*/
nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0,
1ULL<<(20 - PAGE_SHIFT));
/* add from var mtrr at last */
nr_range = x86_get_mtrr_mem_range(range, nr_range,
x_remove_base, x_remove_size);
range_sums = sum_ranges(range, nr_range);
printk(KERN_INFO "total RAM covered: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
i = 0;
mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
x_remove_base, x_remove_size, i);
mtrr_print_out_one_result(i);
if (!result[i].bad) {
set_var_mtrr_all(address_bits);
printk(KERN_DEBUG "New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
}
printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
"will find optimal one\n");
}
i = 0;
memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
memset(result, 0, sizeof(result));
for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
for (chunk_size = gran_size; chunk_size < (1ULL<<32);
chunk_size <<= 1) {
if (i >= NUM_RESULT)
continue;
mtrr_calc_range_state(chunk_size, gran_size,
x_remove_base, x_remove_size, i);
if (debug_print) {
mtrr_print_out_one_result(i);
printk(KERN_INFO "\n");
}
i++;
}
}
/* Try to find the optimal index: */
index_good = mtrr_search_optimal_index();
if (index_good != -1) {
printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
i = index_good;
mtrr_print_out_one_result(i);
/* Convert ranges to var ranges state: */
chunk_size = result[i].chunk_sizek;
chunk_size <<= 10;
gran_size = result[i].gran_sizek;
gran_size <<= 10;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
set_var_mtrr_all(address_bits);
printk(KERN_DEBUG "New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
} else {
/* print out all */
for (i = 0; i < NUM_RESULT; i++)
mtrr_print_out_one_result(i);
}
printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
return 0;
}
#else
int __init mtrr_cleanup(unsigned address_bits)
{
return 0;
}
#endif
static int disable_mtrr_trim;
static int __init disable_mtrr_trim_setup(char *str)
{
disable_mtrr_trim = 1;
return 0;
}
early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
/*
* Newer AMD K8s and later CPUs have a special magic MSR way to force WB
* for memory >4GB. Check for that here.
* Note this won't check if the MTRRs < 4GB where the magic bit doesn't
* apply to are wrong, but so far we don't know of any such case in the wild.
*/
#define Tom2Enabled (1U << 21)
#define Tom2ForceMemTypeWB (1U << 22)
int __init amd_special_default_mtrr(void)
{
u32 l, h;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return 0;
if (boot_cpu_data.x86 < 0xf)
return 0;
/* In case some hypervisor doesn't pass SYSCFG through: */
if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
return 0;
/*
* Memory between 4GB and top of mem is forced WB by this magic bit.
* Reserved before K8RevF, but should be zero there.
*/
if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
(Tom2Enabled | Tom2ForceMemTypeWB))
return 1;
return 0;
}
static u64 __init
real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
{
u64 trim_start, trim_size;
trim_start = start_pfn;
trim_start <<= PAGE_SHIFT;
trim_size = limit_pfn;
trim_size <<= PAGE_SHIFT;
trim_size -= trim_start;
return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED);
}
/**
* mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
* @end_pfn: ending page frame number
*
* Some buggy BIOSes don't setup the MTRRs properly for systems with certain
* memory configurations. This routine checks that the highest MTRR matches
* the end of memory, to make sure the MTRRs having a write back type cover
* all of the memory the kernel is intending to use. If not, it'll trim any
* memory off the end by adjusting end_pfn, removing it from the kernel's
* allocation pools, warning the user with an obnoxious message.
*/
int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
{
unsigned long i, base, size, highest_pfn = 0, def, dummy;
mtrr_type type;
u64 total_trim_size;
/* extra one for all 0 */
int num[MTRR_NUM_TYPES + 1];
/*
* Make sure we only trim uncachable memory on machines that
* support the Intel MTRR architecture:
*/
if (!is_cpu(INTEL) || disable_mtrr_trim)
return 0;
rdmsr(MSR_MTRRdefType, def, dummy);
def &= 0xff;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
/* Get it and store it aside: */
memset(range_state, 0, sizeof(range_state));
for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i, &base, &size, &type);
range_state[i].base_pfn = base;
range_state[i].size_pfn = size;
range_state[i].type = type;
}
/* Find highest cached pfn: */
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
if (type != MTRR_TYPE_WRBACK)
continue;
base = range_state[i].base_pfn;
size = range_state[i].size_pfn;
if (highest_pfn < base + size)
highest_pfn = base + size;
}
/* kvm/qemu doesn't have mtrr set right, don't trim them all: */
if (!highest_pfn) {
printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
return 0;
}
/* Check entries number: */
memset(num, 0, sizeof(num));
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
if (type >= MTRR_NUM_TYPES)
continue;
size = range_state[i].size_pfn;
if (!size)
type = MTRR_NUM_TYPES;
num[type]++;
}
/* No entry for WB? */
if (!num[MTRR_TYPE_WRBACK])
return 0;
/* Check if we only had WB and UC: */
if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
memset(range, 0, sizeof(range));
nr_range = 0;
if (mtrr_tom2) {
range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
if (highest_pfn < range[nr_range].end)
highest_pfn = range[nr_range].end;
nr_range++;
}
nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
/* Check the head: */
total_trim_size = 0;
if (range[0].start)
total_trim_size += real_trim_memory(0, range[0].start);
/* Check the holes: */
for (i = 0; i < nr_range - 1; i++) {
if (range[i].end < range[i+1].start)
total_trim_size += real_trim_memory(range[i].end,
range[i+1].start);
}
/* Check the top: */
i = nr_range - 1;
if (range[i].end < end_pfn)
total_trim_size += real_trim_memory(range[i].end,
end_pfn);
if (total_trim_size) {
pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20);
if (!changed_by_mtrr_cleanup)
WARN_ON(1);
pr_info("update e820 for mtrr\n");
update_e820();
return 1;
}
return 0;
}

View file

@ -0,0 +1,282 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <asm/processor-cyrix.h>
#include <asm/processor-flags.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "mtrr.h"
static void
cyrix_get_arr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type * type)
{
unsigned char arr, ccr3, rcr, shift;
unsigned long flags;
arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
((unsigned char *)base)[3] = getCx86(arr);
((unsigned char *)base)[2] = getCx86(arr + 1);
((unsigned char *)base)[1] = getCx86(arr + 2);
rcr = getCx86(CX86_RCR_BASE + reg);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
local_irq_restore(flags);
shift = ((unsigned char *) base)[1] & 0x0f;
*base >>= PAGE_SHIFT;
/*
* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7
* Note: shift==0xf means 4G, this is unsupported.
*/
if (shift)
*size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1);
else
*size = 0;
/* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */
if (reg < 7) {
switch (rcr) {
case 1:
*type = MTRR_TYPE_UNCACHABLE;
break;
case 8:
*type = MTRR_TYPE_WRBACK;
break;
case 9:
*type = MTRR_TYPE_WRCOMB;
break;
case 24:
default:
*type = MTRR_TYPE_WRTHROUGH;
break;
}
} else {
switch (rcr) {
case 0:
*type = MTRR_TYPE_UNCACHABLE;
break;
case 8:
*type = MTRR_TYPE_WRCOMB;
break;
case 9:
*type = MTRR_TYPE_WRBACK;
break;
case 25:
default:
*type = MTRR_TYPE_WRTHROUGH;
break;
}
}
}
/*
* cyrix_get_free_region - get a free ARR.
*
* @base: the starting (base) address of the region.
* @size: the size (in bytes) of the region.
*
* Returns: the index of the region on success, else -1 on error.
*/
static int
cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
{
unsigned long lbase, lsize;
mtrr_type ltype;
int i;
switch (replace_reg) {
case 7:
if (size < 0x40)
break;
case 6:
case 5:
case 4:
return replace_reg;
case 3:
case 2:
case 1:
case 0:
return replace_reg;
}
/* If we are to set up a region >32M then look at ARR7 immediately */
if (size > 0x2000) {
cyrix_get_arr(7, &lbase, &lsize, &ltype);
if (lsize == 0)
return 7;
/* Else try ARR0-ARR6 first */
} else {
for (i = 0; i < 7; i++) {
cyrix_get_arr(i, &lbase, &lsize, &ltype);
if (lsize == 0)
return i;
}
/*
* ARR0-ARR6 isn't free
* try ARR7 but its size must be at least 256K
*/
cyrix_get_arr(i, &lbase, &lsize, &ltype);
if ((lsize == 0) && (size >= 0x40))
return i;
}
return -ENOSPC;
}
static u32 cr4, ccr3;
static void prepare_set(void)
{
u32 cr0;
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
cr4 = read_cr4();
write_cr4(cr4 & ~X86_CR4_PGE);
}
/*
* Disable and flush caches.
* Note that wbinvd flushes the TLBs as a side-effect
*/
cr0 = read_cr0() | X86_CR0_CD;
wbinvd();
write_cr0(cr0);
wbinvd();
/* Cyrix ARRs - everything else was excluded at the top */
ccr3 = getCx86(CX86_CCR3);
/* Cyrix ARRs - everything else was excluded at the top */
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
}
static void post_set(void)
{
/* Flush caches and TLBs */
wbinvd();
/* Cyrix ARRs - everything else was excluded at the top */
setCx86(CX86_CCR3, ccr3);
/* Enable caches */
write_cr0(read_cr0() & ~X86_CR0_CD);
/* Restore value of CR4 */
if (cpu_has_pge)
write_cr4(cr4);
}
static void cyrix_set_arr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
unsigned char arr, arr_type, arr_size;
arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
/* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */
if (reg >= 7)
size >>= 6;
size &= 0x7fff; /* make sure arr_size <= 14 */
for (arr_size = 0; size; arr_size++, size >>= 1)
;
if (reg < 7) {
switch (type) {
case MTRR_TYPE_UNCACHABLE:
arr_type = 1;
break;
case MTRR_TYPE_WRCOMB:
arr_type = 9;
break;
case MTRR_TYPE_WRTHROUGH:
arr_type = 24;
break;
default:
arr_type = 8;
break;
}
} else {
switch (type) {
case MTRR_TYPE_UNCACHABLE:
arr_type = 0;
break;
case MTRR_TYPE_WRCOMB:
arr_type = 8;
break;
case MTRR_TYPE_WRTHROUGH:
arr_type = 25;
break;
default:
arr_type = 9;
break;
}
}
prepare_set();
base <<= PAGE_SHIFT;
setCx86(arr + 0, ((unsigned char *)&base)[3]);
setCx86(arr + 1, ((unsigned char *)&base)[2]);
setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size);
setCx86(CX86_RCR_BASE + reg, arr_type);
post_set();
}
typedef struct {
unsigned long base;
unsigned long size;
mtrr_type type;
} arr_state_t;
static arr_state_t arr_state[8] = {
{0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
{0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
};
static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
static void cyrix_set_all(void)
{
int i;
prepare_set();
/* the CCRs are not contiguous */
for (i = 0; i < 4; i++)
setCx86(CX86_CCR0 + i, ccr_state[i]);
for (; i < 7; i++)
setCx86(CX86_CCR4 + i, ccr_state[i]);
for (i = 0; i < 8; i++) {
cyrix_set_arr(i, arr_state[i].base,
arr_state[i].size, arr_state[i].type);
}
post_set();
}
static const struct mtrr_ops cyrix_mtrr_ops = {
.vendor = X86_VENDOR_CYRIX,
.set_all = cyrix_set_all,
.set = cyrix_set_arr,
.get = cyrix_get_arr,
.get_free_region = cyrix_get_free_region,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
int __init cyrix_init_mtrr(void)
{
set_mtrr_ops(&cyrix_mtrr_ops);
return 0;
}

View file

@ -0,0 +1,845 @@
/*
* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
* because MTRRs can span up to 40 bits (36bits on most modern x86)
*/
#define DEBUG
#include <linux/module.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <asm/processor-flags.h>
#include <asm/cpufeature.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include "mtrr.h"
struct fixed_range_block {
int base_msr; /* start address of an MTRR block */
int ranges; /* number of MTRRs in this block */
};
static struct fixed_range_block fixed_range_blocks[] = {
{ MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
{ MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
{ MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
{}
};
static unsigned long smp_changes_mask;
static int mtrr_state_set;
u64 mtrr_tom2;
struct mtrr_state_type mtrr_state;
EXPORT_SYMBOL_GPL(mtrr_state);
/*
* BIOS is expected to clear MtrrFixDramModEn bit, see for example
* "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
* Opteron Processors" (26094 Rev. 3.30 February 2006), section
* "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
* to 1 during BIOS initalization of the fixed MTRRs, then cleared to
* 0 for operation."
*/
static inline void k8_check_syscfg_dram_mod_en(void)
{
u32 lo, hi;
if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
(boot_cpu_data.x86 >= 0x0f)))
return;
rdmsr(MSR_K8_SYSCFG, lo, hi);
if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
" not cleared by BIOS, clearing this bit\n",
smp_processor_id());
lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
}
}
/* Get the size of contiguous MTRR range */
static u64 get_mtrr_size(u64 mask)
{
u64 size;
mask >>= PAGE_SHIFT;
mask |= size_or_mask;
size = -mask;
size <<= PAGE_SHIFT;
return size;
}
/*
* Check and return the effective type for MTRR-MTRR type overlap.
* Returns 1 if the effective type is UNCACHEABLE, else returns 0
*/
static int check_type_overlap(u8 *prev, u8 *curr)
{
if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
*prev = MTRR_TYPE_UNCACHABLE;
*curr = MTRR_TYPE_UNCACHABLE;
return 1;
}
if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
(*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
*prev = MTRR_TYPE_WRTHROUGH;
*curr = MTRR_TYPE_WRTHROUGH;
}
if (*prev != *curr) {
*prev = MTRR_TYPE_UNCACHABLE;
*curr = MTRR_TYPE_UNCACHABLE;
return 1;
}
return 0;
}
/*
* Error/Semi-error returns:
* 0xFF - when MTRR is not enabled
* *repeat == 1 implies [start:end] spanned across MTRR range and type returned
* corresponds only to [start:*partial_end].
* Caller has to lookup again for [*partial_end:end].
*/
static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
{
int i;
u64 base, mask;
u8 prev_match, curr_match;
*repeat = 0;
if (!mtrr_state_set)
return 0xFF;
if (!mtrr_state.enabled)
return 0xFF;
/* Make end inclusive end, instead of exclusive */
end--;
/* Look in fixed ranges. Just return the type as per start */
if (mtrr_state.have_fixed && (start < 0x100000)) {
int idx;
if (start < 0x80000) {
idx = 0;
idx += (start >> 16);
return mtrr_state.fixed_ranges[idx];
} else if (start < 0xC0000) {
idx = 1 * 8;
idx += ((start - 0x80000) >> 14);
return mtrr_state.fixed_ranges[idx];
} else if (start < 0x1000000) {
idx = 3 * 8;
idx += ((start - 0xC0000) >> 12);
return mtrr_state.fixed_ranges[idx];
}
}
/*
* Look in variable ranges
* Look of multiple ranges matching this address and pick type
* as per MTRR precedence
*/
if (!(mtrr_state.enabled & 2))
return mtrr_state.def_type;
prev_match = 0xFF;
for (i = 0; i < num_var_ranges; ++i) {
unsigned short start_state, end_state;
if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
continue;
base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) +
(mtrr_state.var_ranges[i].base_lo & PAGE_MASK);
mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) +
(mtrr_state.var_ranges[i].mask_lo & PAGE_MASK);
start_state = ((start & mask) == (base & mask));
end_state = ((end & mask) == (base & mask));
if (start_state != end_state) {
/*
* We have start:end spanning across an MTRR.
* We split the region into
* either
* (start:mtrr_end) (mtrr_end:end)
* or
* (start:mtrr_start) (mtrr_start:end)
* depending on kind of overlap.
* Return the type for first region and a pointer to
* the start of second region so that caller will
* lookup again on the second region.
* Note: This way we handle multiple overlaps as well.
*/
if (start_state)
*partial_end = base + get_mtrr_size(mask);
else
*partial_end = base;
if (unlikely(*partial_end <= start)) {
WARN_ON(1);
*partial_end = start + PAGE_SIZE;
}
end = *partial_end - 1; /* end is inclusive */
*repeat = 1;
}
if ((start & mask) != (base & mask))
continue;
curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
if (prev_match == 0xFF) {
prev_match = curr_match;
continue;
}
if (check_type_overlap(&prev_match, &curr_match))
return curr_match;
}
if (mtrr_tom2) {
if (start >= (1ULL<<32) && (end < mtrr_tom2))
return MTRR_TYPE_WRBACK;
}
if (prev_match != 0xFF)
return prev_match;
return mtrr_state.def_type;
}
/*
* Returns the effective MTRR type for the region
* Error return:
* 0xFF - when MTRR is not enabled
*/
u8 mtrr_type_lookup(u64 start, u64 end)
{
u8 type, prev_type;
int repeat;
u64 partial_end;
type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
/*
* Common path is with repeat = 0.
* However, we can have cases where [start:end] spans across some
* MTRR range. Do repeated lookups for that case here.
*/
while (repeat) {
prev_type = type;
start = partial_end;
type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
if (check_type_overlap(&prev_type, &type))
return type;
}
return type;
}
/* Get the MSR pair relating to a var range */
static void
get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
{
rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
}
/* Fill the MSR pair relating to a var range */
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
{
struct mtrr_var_range *vr;
vr = mtrr_state.var_ranges;
vr[index].base_lo = base_lo;
vr[index].base_hi = base_hi;
vr[index].mask_lo = mask_lo;
vr[index].mask_hi = mask_hi;
}
static void get_fixed_ranges(mtrr_type *frs)
{
unsigned int *p = (unsigned int *)frs;
int i;
k8_check_syscfg_dram_mod_en();
rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
for (i = 0; i < 2; i++)
rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
for (i = 0; i < 8; i++)
rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
}
void mtrr_save_fixed_ranges(void *info)
{
if (cpu_has_mtrr)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
static unsigned __initdata last_fixed_start;
static unsigned __initdata last_fixed_end;
static mtrr_type __initdata last_fixed_type;
static void __init print_fixed_last(void)
{
if (!last_fixed_end)
return;
pr_debug(" %05X-%05X %s\n", last_fixed_start,
last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
last_fixed_end = 0;
}
static void __init update_fixed_last(unsigned base, unsigned end,
mtrr_type type)
{
last_fixed_start = base;
last_fixed_end = end;
last_fixed_type = type;
}
static void __init
print_fixed(unsigned base, unsigned step, const mtrr_type *types)
{
unsigned i;
for (i = 0; i < 8; ++i, ++types, base += step) {
if (last_fixed_end == 0) {
update_fixed_last(base, base + step, *types);
continue;
}
if (last_fixed_end == base && last_fixed_type == *types) {
last_fixed_end = base + step;
continue;
}
/* new segments: gap or different type */
print_fixed_last();
update_fixed_last(base, base + step, *types);
}
}
static void prepare_set(void);
static void post_set(void);
static void __init print_mtrr_state(void)
{
unsigned int i;
int high_width;
pr_debug("MTRR default type: %s\n",
mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
pr_debug("MTRR fixed ranges %sabled:\n",
mtrr_state.enabled & 1 ? "en" : "dis");
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
print_fixed(0x80000 + i * 0x20000, 0x04000,
mtrr_state.fixed_ranges + (i + 1) * 8);
for (i = 0; i < 8; ++i)
print_fixed(0xC0000 + i * 0x08000, 0x01000,
mtrr_state.fixed_ranges + (i + 3) * 8);
/* tail */
print_fixed_last();
}
pr_debug("MTRR variable ranges %sabled:\n",
mtrr_state.enabled & 2 ? "en" : "dis");
high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
i,
high_width,
mtrr_state.var_ranges[i].base_hi,
mtrr_state.var_ranges[i].base_lo >> 12,
high_width,
mtrr_state.var_ranges[i].mask_hi,
mtrr_state.var_ranges[i].mask_lo >> 12,
mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
else
pr_debug(" %u disabled\n", i);
}
if (mtrr_tom2)
pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
}
/* Grab all of the MTRR state for this CPU into *state */
void __init get_mtrr_state(void)
{
struct mtrr_var_range *vrs;
unsigned long flags;
unsigned lo, dummy;
unsigned int i;
vrs = mtrr_state.var_ranges;
rdmsr(MSR_MTRRcap, lo, dummy);
mtrr_state.have_fixed = (lo >> 8) & 1;
for (i = 0; i < num_var_ranges; i++)
get_mtrr_var_range(i, &vrs[i]);
if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
rdmsr(MSR_MTRRdefType, lo, dummy);
mtrr_state.def_type = (lo & 0xff);
mtrr_state.enabled = (lo & 0xc00) >> 10;
if (amd_special_default_mtrr()) {
unsigned low, high;
/* TOP_MEM2 */
rdmsr(MSR_K8_TOP_MEM2, low, high);
mtrr_tom2 = high;
mtrr_tom2 <<= 32;
mtrr_tom2 |= low;
mtrr_tom2 &= 0xffffff800000ULL;
}
print_mtrr_state();
mtrr_state_set = 1;
/* PAT setup for BP. We need to go through sync steps here */
local_irq_save(flags);
prepare_set();
pat_init();
post_set();
local_irq_restore(flags);
}
/* Some BIOS's are messed up and don't set all MTRRs the same! */
void __init mtrr_state_warn(void)
{
unsigned long mask = smp_changes_mask;
if (!mask)
return;
if (mask & MTRR_CHANGE_MASK_FIXED)
pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_VARIABLE)
pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_DEFTYPE)
pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
printk(KERN_INFO "mtrr: corrected configuration.\n");
}
/*
* Doesn't attempt to pass an error out to MTRR users
* because it's quite complicated in some cases and probably not
* worth it because the best error handling is to ignore it.
*/
void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
{
if (wrmsr_safe(msr, a, b) < 0) {
printk(KERN_ERR
"MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
smp_processor_id(), msr, a, b);
}
}
/**
* set_fixed_range - checks & updates a fixed-range MTRR if it
* differs from the value it should have
* @msr: MSR address of the MTTR which should be checked and updated
* @changed: pointer which indicates whether the MTRR needed to be changed
* @msrwords: pointer to the MSR values which the MSR should have
*/
static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
{
unsigned lo, hi;
rdmsr(msr, lo, hi);
if (lo != msrwords[0] || hi != msrwords[1]) {
mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
*changed = true;
}
}
/**
* generic_get_free_region - Get a free MTRR.
* @base: The starting (base) address of the region.
* @size: The size (in bytes) of the region.
* @replace_reg: mtrr index to be replaced; set to invalid value if none.
*
* Returns: The index of the region on success, else negative on error.
*/
int
generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
{
unsigned long lbase, lsize;
mtrr_type ltype;
int i, max;
max = num_var_ranges;
if (replace_reg >= 0 && replace_reg < max)
return replace_reg;
for (i = 0; i < max; ++i) {
mtrr_if->get(i, &lbase, &lsize, &ltype);
if (lsize == 0)
return i;
}
return -ENOSPC;
}
static void generic_get_mtrr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type)
{
u32 mask_lo, mask_hi, base_lo, base_hi;
unsigned int hi;
u64 tmp, mask;
/*
* get_mtrr doesn't need to update mtrr_state, also it could be called
* from any cpu, so try to print it out directly.
*/
get_cpu();
rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
if ((mask_lo & 0x800) == 0) {
/* Invalid (i.e. free) range */
*base = 0;
*size = 0;
*type = 0;
goto out_put_cpu;
}
rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
/* Work out the shifted address mask: */
tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
mask = size_or_mask | tmp;
/* Expand tmp with high bits to all 1s: */
hi = fls64(tmp);
if (hi > 0) {
tmp |= ~((1ULL<<(hi - 1)) - 1);
if (tmp != mask) {
printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
mask = tmp;
}
}
/*
* This works correctly if size is a power of two, i.e. a
* contiguous range:
*/
*size = -mask;
*base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
*type = base_lo & 0xff;
out_put_cpu:
put_cpu();
}
/**
* set_fixed_ranges - checks & updates the fixed-range MTRRs if they
* differ from the saved set
* @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges()
*/
static int set_fixed_ranges(mtrr_type *frs)
{
unsigned long long *saved = (unsigned long long *)frs;
bool changed = false;
int block = -1, range;
k8_check_syscfg_dram_mod_en();
while (fixed_range_blocks[++block].ranges) {
for (range = 0; range < fixed_range_blocks[block].ranges; range++)
set_fixed_range(fixed_range_blocks[block].base_msr + range,
&changed, (unsigned int *)saved++);
}
return changed;
}
/*
* Set the MSR pair relating to a var range.
* Returns true if changes are made.
*/
static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
{
unsigned int lo, hi;
bool changed = false;
rdmsr(MTRRphysBase_MSR(index), lo, hi);
if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
|| (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
(hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
changed = true;
}
rdmsr(MTRRphysMask_MSR(index), lo, hi);
if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL)
|| (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
(hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
changed = true;
}
return changed;
}
static u32 deftype_lo, deftype_hi;
/**
* set_mtrr_state - Set the MTRR state for this CPU.
*
* NOTE: The CPU must already be in a safe state for MTRR changes.
* RETURNS: 0 if no changes made, else a mask indicating what was changed.
*/
static unsigned long set_mtrr_state(void)
{
unsigned long change_mask = 0;
unsigned int i;
for (i = 0; i < num_var_ranges; i++) {
if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
change_mask |= MTRR_CHANGE_MASK_VARIABLE;
}
if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
change_mask |= MTRR_CHANGE_MASK_FIXED;
/*
* Set_mtrr_restore restores the old value of MTRRdefType,
* so to set it we fiddle with the saved value:
*/
if ((deftype_lo & 0xff) != mtrr_state.def_type
|| ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
(mtrr_state.enabled << 10);
change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
}
return change_mask;
}
static unsigned long cr4;
static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
/*
* Since we are disabling the cache don't allow any interrupts,
* they would run extremely slow and would only increase the pain.
*
* The caller must ensure that local interrupts are disabled and
* are reenabled after post_set() has been called.
*/
static void prepare_set(void) __acquires(set_atomicity_lock)
{
unsigned long cr0;
/*
* Note that this is not ideal
* since the cache is only flushed/disabled for this CPU while the
* MTRRs are changed, but changing this requires more invasive
* changes to the way the kernel boots
*/
raw_spin_lock(&set_atomicity_lock);
/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
cr0 = read_cr0() | X86_CR0_CD;
write_cr0(cr0);
wbinvd();
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
cr4 = read_cr4();
write_cr4(cr4 & ~X86_CR4_PGE);
}
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Save MTRR state */
rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Disable MTRRs, and set the default type to uncached */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
wbinvd();
}
static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Enable caches */
write_cr0(read_cr0() & ~X86_CR0_CD);
/* Restore value of CR4 */
if (cpu_has_pge)
write_cr4(cr4);
raw_spin_unlock(&set_atomicity_lock);
}
static void generic_set_all(void)
{
unsigned long mask, count;
unsigned long flags;
local_irq_save(flags);
prepare_set();
/* Actually set the state */
mask = set_mtrr_state();
/* also set PAT */
pat_init();
post_set();
local_irq_restore(flags);
/* Use the atomic bitops to update the global mask */
for (count = 0; count < sizeof mask * 8; ++count) {
if (mask & 0x01)
set_bit(count, &smp_changes_mask);
mask >>= 1;
}
}
/**
* generic_set_mtrr - set variable MTRR register on the local CPU.
*
* @reg: The register to set.
* @base: The base address of the region.
* @size: The size of the region. If this is 0 the region is disabled.
* @type: The type of the region.
*
* Returns nothing.
*/
static void generic_set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
unsigned long flags;
struct mtrr_var_range *vr;
vr = &mtrr_state.var_ranges[reg];
local_irq_save(flags);
prepare_set();
if (size == 0) {
/*
* The invalid bit is kept in the mask, so we simply
* clear the relevant mask register to disable a range.
*/
mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0);
memset(vr, 0, sizeof(struct mtrr_var_range));
} else {
vr->base_lo = base << PAGE_SHIFT | type;
vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT);
vr->mask_lo = -size << PAGE_SHIFT | 0x800;
vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT);
mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi);
mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
}
post_set();
local_irq_restore(flags);
}
int generic_validate_add_page(unsigned long base, unsigned long size,
unsigned int type)
{
unsigned long lbase, last;
/*
* For Intel PPro stepping <= 7
* must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
*/
if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
boot_cpu_data.x86_model == 1 &&
boot_cpu_data.x86_mask <= 7) {
if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
return -EINVAL;
}
if (!(base + size < 0x70000 || base > 0x7003F) &&
(type == MTRR_TYPE_WRCOMB
|| type == MTRR_TYPE_WRBACK)) {
pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
return -EINVAL;
}
}
/*
* Check upper bits of base and last are equal and lower bits are 0
* for base and 1 for last
*/
last = base + size - 1;
for (lbase = base; !(lbase & 1) && (last & 1);
lbase = lbase >> 1, last = last >> 1)
;
if (lbase != last) {
pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
return -EINVAL;
}
return 0;
}
static int generic_have_wrcomb(void)
{
unsigned long config, dummy;
rdmsr(MSR_MTRRcap, config, dummy);
return config & (1 << 10);
}
int positive_have_wrcomb(void)
{
return 1;
}
/*
* Generic structure...
*/
const struct mtrr_ops generic_mtrr_ops = {
.use_intel_if = 1,
.set_all = generic_set_all,
.get = generic_get_mtrr,
.get_free_region = generic_get_free_region,
.set = generic_set_mtrr,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = generic_have_wrcomb,
};

View file

@ -0,0 +1,451 @@
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/init.h>
#define LINE_SIZE 80
#include <asm/mtrr.h>
#include "mtrr.h"
#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
static const char *const mtrr_strings[MTRR_NUM_TYPES] =
{
"uncachable", /* 0 */
"write-combining", /* 1 */
"?", /* 2 */
"?", /* 3 */
"write-through", /* 4 */
"write-protect", /* 5 */
"write-back", /* 6 */
};
const char *mtrr_attrib_to_str(int x)
{
return (x <= 6) ? mtrr_strings[x] : "?";
}
#ifdef CONFIG_PROC_FS
static int
mtrr_file_add(unsigned long base, unsigned long size,
unsigned int type, bool increment, struct file *file, int page)
{
unsigned int *fcount = FILE_FCOUNT(file);
int reg, max;
max = num_var_ranges;
if (fcount == NULL) {
fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
if (!fcount)
return -ENOMEM;
FILE_FCOUNT(file) = fcount;
}
if (!page) {
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
return -EINVAL;
base >>= PAGE_SHIFT;
size >>= PAGE_SHIFT;
}
reg = mtrr_add_page(base, size, type, true);
if (reg >= 0)
++fcount[reg];
return reg;
}
static int
mtrr_file_del(unsigned long base, unsigned long size,
struct file *file, int page)
{
unsigned int *fcount = FILE_FCOUNT(file);
int reg;
if (!page) {
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
return -EINVAL;
base >>= PAGE_SHIFT;
size >>= PAGE_SHIFT;
}
reg = mtrr_del_page(-1, base, size);
if (reg < 0)
return reg;
if (fcount == NULL)
return reg;
if (fcount[reg] < 1)
return -EINVAL;
--fcount[reg];
return reg;
}
/*
* seq_file can seek but we ignore it.
*
* Format of control line:
* "base=%Lx size=%Lx type=%s" or "disable=%d"
*/
static ssize_t
mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
{
int i, err;
unsigned long reg;
unsigned long long base, size;
char *ptr;
char line[LINE_SIZE];
int length;
size_t linelen;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
memset(line, 0, LINE_SIZE);
length = len;
length--;
if (length > LINE_SIZE - 1)
length = LINE_SIZE - 1;
if (length < 0)
return -EINVAL;
if (copy_from_user(line, buf, length))
return -EFAULT;
linelen = strlen(line);
ptr = line + linelen - 1;
if (linelen && *ptr == '\n')
*ptr = '\0';
if (!strncmp(line, "disable=", 8)) {
reg = simple_strtoul(line + 8, &ptr, 0);
err = mtrr_del_page(reg, 0, 0);
if (err < 0)
return err;
return len;
}
if (strncmp(line, "base=", 5))
return -EINVAL;
base = simple_strtoull(line + 5, &ptr, 0);
ptr = skip_spaces(ptr);
if (strncmp(ptr, "size=", 5))
return -EINVAL;
size = simple_strtoull(ptr + 5, &ptr, 0);
if ((base & 0xfff) || (size & 0xfff))
return -EINVAL;
ptr = skip_spaces(ptr);
if (strncmp(ptr, "type=", 5))
return -EINVAL;
ptr = skip_spaces(ptr + 5);
for (i = 0; i < MTRR_NUM_TYPES; ++i) {
if (strcmp(ptr, mtrr_strings[i]))
continue;
base >>= PAGE_SHIFT;
size >>= PAGE_SHIFT;
err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
if (err < 0)
return err;
return len;
}
return -EINVAL;
}
static long
mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
{
int err = 0;
mtrr_type type;
unsigned long base;
unsigned long size;
struct mtrr_sentry sentry;
struct mtrr_gentry gentry;
void __user *arg = (void __user *) __arg;
switch (cmd) {
case MTRRIOC_ADD_ENTRY:
case MTRRIOC_SET_ENTRY:
case MTRRIOC_DEL_ENTRY:
case MTRRIOC_KILL_ENTRY:
case MTRRIOC_ADD_PAGE_ENTRY:
case MTRRIOC_SET_PAGE_ENTRY:
case MTRRIOC_DEL_PAGE_ENTRY:
case MTRRIOC_KILL_PAGE_ENTRY:
if (copy_from_user(&sentry, arg, sizeof sentry))
return -EFAULT;
break;
case MTRRIOC_GET_ENTRY:
case MTRRIOC_GET_PAGE_ENTRY:
if (copy_from_user(&gentry, arg, sizeof gentry))
return -EFAULT;
break;
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_ENTRY:
case MTRRIOC32_SET_ENTRY:
case MTRRIOC32_DEL_ENTRY:
case MTRRIOC32_KILL_ENTRY:
case MTRRIOC32_ADD_PAGE_ENTRY:
case MTRRIOC32_SET_PAGE_ENTRY:
case MTRRIOC32_DEL_PAGE_ENTRY:
case MTRRIOC32_KILL_PAGE_ENTRY: {
struct mtrr_sentry32 __user *s32;
s32 = (struct mtrr_sentry32 __user *)__arg;
err = get_user(sentry.base, &s32->base);
err |= get_user(sentry.size, &s32->size);
err |= get_user(sentry.type, &s32->type);
if (err)
return err;
break;
}
case MTRRIOC32_GET_ENTRY:
case MTRRIOC32_GET_PAGE_ENTRY: {
struct mtrr_gentry32 __user *g32;
g32 = (struct mtrr_gentry32 __user *)__arg;
err = get_user(gentry.regnum, &g32->regnum);
err |= get_user(gentry.base, &g32->base);
err |= get_user(gentry.size, &g32->size);
err |= get_user(gentry.type, &g32->type);
if (err)
return err;
break;
}
#endif
}
switch (cmd) {
default:
return -ENOTTY;
case MTRRIOC_ADD_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 0);
break;
case MTRRIOC_SET_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
break;
case MTRRIOC_DEL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 0);
break;
case MTRRIOC_KILL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_del(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_GET_ENTRY:
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that go above 4GB */
if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
|| size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
gentry.base = gentry.size = gentry.type = 0;
else {
gentry.base = base << PAGE_SHIFT;
gentry.size = size << PAGE_SHIFT;
gentry.type = type;
}
break;
case MTRRIOC_ADD_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_PAGE_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 1);
break;
case MTRRIOC_SET_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_PAGE_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err =
mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
break;
case MTRRIOC_DEL_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_PAGE_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 1);
break;
case MTRRIOC_KILL_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_PAGE_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_del_page(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_GET_PAGE_ENTRY:
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that would overflow */
if (size != (__typeof__(gentry.size))size)
gentry.base = gentry.size = gentry.type = 0;
else {
gentry.base = base;
gentry.size = size;
gentry.type = type;
}
break;
}
if (err)
return err;
switch (cmd) {
case MTRRIOC_GET_ENTRY:
case MTRRIOC_GET_PAGE_ENTRY:
if (copy_to_user(arg, &gentry, sizeof gentry))
err = -EFAULT;
break;
#ifdef CONFIG_COMPAT
case MTRRIOC32_GET_ENTRY:
case MTRRIOC32_GET_PAGE_ENTRY: {
struct mtrr_gentry32 __user *g32;
g32 = (struct mtrr_gentry32 __user *)__arg;
err = put_user(gentry.base, &g32->base);
err |= put_user(gentry.size, &g32->size);
err |= put_user(gentry.regnum, &g32->regnum);
err |= put_user(gentry.type, &g32->type);
break;
}
#endif
}
return err;
}
static int mtrr_close(struct inode *ino, struct file *file)
{
unsigned int *fcount = FILE_FCOUNT(file);
int i, max;
if (fcount != NULL) {
max = num_var_ranges;
for (i = 0; i < max; ++i) {
while (fcount[i] > 0) {
mtrr_del(i, 0, 0);
--fcount[i];
}
}
kfree(fcount);
FILE_FCOUNT(file) = NULL;
}
return single_release(ino, file);
}
static int mtrr_seq_show(struct seq_file *seq, void *offset);
static int mtrr_open(struct inode *inode, struct file *file)
{
if (!mtrr_if)
return -EIO;
if (!mtrr_if->get)
return -ENXIO;
return single_open(file, mtrr_seq_show, NULL);
}
static const struct file_operations mtrr_fops = {
.owner = THIS_MODULE,
.open = mtrr_open,
.read = seq_read,
.llseek = seq_lseek,
.write = mtrr_write,
.unlocked_ioctl = mtrr_ioctl,
.compat_ioctl = mtrr_ioctl,
.release = mtrr_close,
};
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
int i, max, len;
mtrr_type type;
unsigned long base, size;
len = 0;
max = num_var_ranges;
for (i = 0; i < max; i++) {
mtrr_if->get(i, &base, &size, &type);
if (size == 0) {
mtrr_usage_table[i] = 0;
continue;
}
if (size < (0x100000 >> PAGE_SHIFT)) {
/* less than 1MB */
factor = 'K';
size <<= PAGE_SHIFT - 10;
} else {
factor = 'M';
size >>= 20 - PAGE_SHIFT;
}
/* Base can be > 32bit */
len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
"(%5luMB), size=%5lu%cB, count=%d: %s\n",
i, base, base >> (20 - PAGE_SHIFT), size,
factor, mtrr_usage_table[i],
mtrr_attrib_to_str(type));
}
return 0;
}
static int __init mtrr_if_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
(!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
(!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
(!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
return -ENODEV;
proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
return 0;
}
arch_initcall(mtrr_if_init);
#endif /* CONFIG_PROC_FS */

View file

@ -0,0 +1,842 @@
/* Generic MTRR (Memory Type Range Register) driver.
Copyright (C) 1997-2000 Richard Gooch
Copyright (c) 2002 Patrick Mochel
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
Richard Gooch may be reached by email at rgooch@atnf.csiro.au
The postal address is:
Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
Source: "Pentium Pro Family Developer's Manual, Volume 3:
Operating System Writer's Guide" (Intel document number 242692),
section 11.11.7
This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
on 6-7 March 2002.
Source: Intel Architecture Software Developers Manual, Volume 3:
System Programming Guide; Section 9.11. (1997 edition - PPro).
*/
#define DEBUG
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
#include <linux/stop_machine.h>
#include <linux/kvm_para.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/sort.h>
#include <linux/cpu.h>
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>
#include <asm/processor.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include "mtrr.h"
/* arch_phys_wc_add returns an MTRR register index plus this offset. */
#define MTRR_TO_PHYS_WC_OFFSET 1000
u32 num_var_ranges;
unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
static bool mtrr_aps_delayed_init;
static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
const struct mtrr_ops *mtrr_if;
static void set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
void set_mtrr_ops(const struct mtrr_ops *ops)
{
if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
mtrr_ops[ops->vendor] = ops;
}
/* Returns non-zero if we have the write-combining memory type */
static int have_wrcomb(void)
{
struct pci_dev *dev;
dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
if (dev != NULL) {
/*
* ServerWorks LE chipsets < rev 6 have problems with
* write-combining. Don't allow it and leave room for other
* chipsets to be tagged
*/
if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
dev->revision <= 5) {
pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
pci_dev_put(dev);
return 0;
}
/*
* Intel 450NX errata # 23. Non ascending cacheline evictions to
* write combining memory may resulting in data corruption
*/
if (dev->vendor == PCI_VENDOR_ID_INTEL &&
dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
pci_dev_put(dev);
return 0;
}
pci_dev_put(dev);
}
return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
}
/* This function returns the number of variable MTRRs */
static void __init set_num_var_ranges(void)
{
unsigned long config = 0, dummy;
if (use_intel())
rdmsr(MSR_MTRRcap, config, dummy);
else if (is_cpu(AMD))
config = 2;
else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
config = 8;
num_var_ranges = config & 0xff;
}
static void __init init_table(void)
{
int i, max;
max = num_var_ranges;
for (i = 0; i < max; i++)
mtrr_usage_table[i] = 1;
}
struct set_mtrr_data {
unsigned long smp_base;
unsigned long smp_size;
unsigned int smp_reg;
mtrr_type smp_type;
};
/**
* mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
* by all the CPUs.
* @info: pointer to mtrr configuration data
*
* Returns nothing.
*/
static int mtrr_rendezvous_handler(void *info)
{
struct set_mtrr_data *data = info;
/*
* We use this same function to initialize the mtrrs during boot,
* resume, runtime cpu online and on an explicit request to set a
* specific MTRR.
*
* During boot or suspend, the state of the boot cpu's mtrrs has been
* saved, and we want to replicate that across all the cpus that come
* online (either at the end of boot or resume or during a runtime cpu
* online). If we're doing that, @reg is set to something special and on
* all the cpu's we do mtrr_if->set_all() (On the logical cpu that
* started the boot/resume sequence, this might be a duplicate
* set_all()).
*/
if (data->smp_reg != ~0U) {
mtrr_if->set(data->smp_reg, data->smp_base,
data->smp_size, data->smp_type);
} else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
mtrr_if->set_all();
}
return 0;
}
static inline int types_compatible(mtrr_type type1, mtrr_type type2)
{
return type1 == MTRR_TYPE_UNCACHABLE ||
type2 == MTRR_TYPE_UNCACHABLE ||
(type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
(type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
}
/**
* set_mtrr - update mtrrs on all processors
* @reg: mtrr in question
* @base: mtrr base
* @size: mtrr size
* @type: mtrr type
*
* This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
*
* 1. Queue work to do the following on all processors:
* 2. Disable Interrupts
* 3. Wait for all procs to do so
* 4. Enter no-fill cache mode
* 5. Flush caches
* 6. Clear PGE bit
* 7. Flush all TLBs
* 8. Disable all range registers
* 9. Update the MTRRs
* 10. Enable all range registers
* 11. Flush all TLBs and caches again
* 12. Enter normal cache mode and reenable caching
* 13. Set PGE
* 14. Wait for buddies to catch up
* 15. Enable interrupts.
*
* What does that mean for us? Well, stop_machine() will ensure that
* the rendezvous handler is started on each CPU. And in lockstep they
* do the state transition of disabling interrupts, updating MTRR's
* (the CPU vendors may each do it differently, so we call mtrr_if->set()
* callback and let them take care of it.) and enabling interrupts.
*
* Note that the mechanism is the same for UP systems, too; all the SMP stuff
* becomes nops.
*/
static void
set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
{
struct set_mtrr_data data = { .smp_reg = reg,
.smp_base = base,
.smp_size = size,
.smp_type = type
};
stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
}
static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
struct set_mtrr_data data = { .smp_reg = reg,
.smp_base = base,
.smp_size = size,
.smp_type = type
};
stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
cpu_callout_mask);
}
/**
* mtrr_add_page - Add a memory type region
* @base: Physical base address of region in pages (in units of 4 kB!)
* @size: Physical size of region in pages (4 kB)
* @type: Type of MTRR desired
* @increment: If this is true do usage counting on the region
*
* Memory type region registers control the caching on newer Intel and
* non Intel processors. This function allows drivers to request an
* MTRR is added. The details and hardware specifics of each processor's
* implementation are hidden from the caller, but nevertheless the
* caller should expect to need to provide a power of two size on an
* equivalent power of two boundary.
*
* If the region cannot be added either because all regions are in use
* or the CPU cannot support it a negative value is returned. On success
* the register number for this entry is returned, but should be treated
* as a cookie only.
*
* On a multiprocessor machine the changes are made to all processors.
* This is required on x86 by the Intel processors.
*
* The available types are
*
* %MTRR_TYPE_UNCACHABLE - No caching
*
* %MTRR_TYPE_WRBACK - Write data back in bursts whenever
*
* %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
*
* %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
*
* BUGS: Needs a quiet flag for the cases where drivers do not mind
* failures and do not wish system log messages to be sent.
*/
int mtrr_add_page(unsigned long base, unsigned long size,
unsigned int type, bool increment)
{
unsigned long lbase, lsize;
int i, replace, error;
mtrr_type ltype;
if (!mtrr_if)
return -ENXIO;
error = mtrr_if->validate_add_page(base, size, type);
if (error)
return error;
if (type >= MTRR_NUM_TYPES) {
pr_warning("mtrr: type: %u invalid\n", type);
return -EINVAL;
}
/* If the type is WC, check that this processor supports it */
if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
pr_warning("mtrr: your processor doesn't support write-combining\n");
return -ENOSYS;
}
if (!size) {
pr_warning("mtrr: zero sized request\n");
return -EINVAL;
}
if ((base | (base + size - 1)) >>
(boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
pr_warning("mtrr: base or size exceeds the MTRR width\n");
return -EINVAL;
}
error = -EINVAL;
replace = -1;
/* No CPU hotplug when we change MTRR entries */
get_online_cpus();
/* Search for existing MTRR */
mutex_lock(&mtrr_mutex);
for (i = 0; i < num_var_ranges; ++i) {
mtrr_if->get(i, &lbase, &lsize, &ltype);
if (!lsize || base > lbase + lsize - 1 ||
base + size - 1 < lbase)
continue;
/*
* At this point we know there is some kind of
* overlap/enclosure
*/
if (base < lbase || base + size - 1 > lbase + lsize - 1) {
if (base <= lbase &&
base + size - 1 >= lbase + lsize - 1) {
/* New region encloses an existing region */
if (type == ltype) {
replace = replace == -1 ? i : -2;
continue;
} else if (types_compatible(type, ltype))
continue;
}
pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing"
" 0x%lx000,0x%lx000\n", base, size, lbase,
lsize);
goto out;
}
/* New region is enclosed by an existing region */
if (ltype != type) {
if (types_compatible(type, ltype))
continue;
pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
base, size, mtrr_attrib_to_str(ltype),
mtrr_attrib_to_str(type));
goto out;
}
if (increment)
++mtrr_usage_table[i];
error = i;
goto out;
}
/* Search for an empty MTRR */
i = mtrr_if->get_free_region(base, size, replace);
if (i >= 0) {
set_mtrr(i, base, size, type);
if (likely(replace < 0)) {
mtrr_usage_table[i] = 1;
} else {
mtrr_usage_table[i] = mtrr_usage_table[replace];
if (increment)
mtrr_usage_table[i]++;
if (unlikely(replace != i)) {
set_mtrr(replace, 0, 0, 0);
mtrr_usage_table[replace] = 0;
}
}
} else {
pr_info("mtrr: no more MTRRs available\n");
}
error = i;
out:
mutex_unlock(&mtrr_mutex);
put_online_cpus();
return error;
}
static int mtrr_check(unsigned long base, unsigned long size)
{
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
dump_stack();
return -1;
}
return 0;
}
/**
* mtrr_add - Add a memory type region
* @base: Physical base address of region
* @size: Physical size of region
* @type: Type of MTRR desired
* @increment: If this is true do usage counting on the region
*
* Memory type region registers control the caching on newer Intel and
* non Intel processors. This function allows drivers to request an
* MTRR is added. The details and hardware specifics of each processor's
* implementation are hidden from the caller, but nevertheless the
* caller should expect to need to provide a power of two size on an
* equivalent power of two boundary.
*
* If the region cannot be added either because all regions are in use
* or the CPU cannot support it a negative value is returned. On success
* the register number for this entry is returned, but should be treated
* as a cookie only.
*
* On a multiprocessor machine the changes are made to all processors.
* This is required on x86 by the Intel processors.
*
* The available types are
*
* %MTRR_TYPE_UNCACHABLE - No caching
*
* %MTRR_TYPE_WRBACK - Write data back in bursts whenever
*
* %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
*
* %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
*
* BUGS: Needs a quiet flag for the cases where drivers do not mind
* failures and do not wish system log messages to be sent.
*/
int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
bool increment)
{
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
increment);
}
EXPORT_SYMBOL(mtrr_add);
/**
* mtrr_del_page - delete a memory type region
* @reg: Register returned by mtrr_add
* @base: Physical base address
* @size: Size of region
*
* If register is supplied then base and size are ignored. This is
* how drivers should call it.
*
* Releases an MTRR region. If the usage count drops to zero the
* register is freed and the region returns to default state.
* On success the register is returned, on failure a negative error
* code.
*/
int mtrr_del_page(int reg, unsigned long base, unsigned long size)
{
int i, max;
mtrr_type ltype;
unsigned long lbase, lsize;
int error = -EINVAL;
if (!mtrr_if)
return -ENXIO;
max = num_var_ranges;
/* No CPU hotplug when we change MTRR entries */
get_online_cpus();
mutex_lock(&mtrr_mutex);
if (reg < 0) {
/* Search for existing MTRR */
for (i = 0; i < max; ++i) {
mtrr_if->get(i, &lbase, &lsize, &ltype);
if (lbase == base && lsize == size) {
reg = i;
break;
}
}
if (reg < 0) {
pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
base, size);
goto out;
}
}
if (reg >= max) {
pr_warning("mtrr: register: %d too big\n", reg);
goto out;
}
mtrr_if->get(reg, &lbase, &lsize, &ltype);
if (lsize < 1) {
pr_warning("mtrr: MTRR %d not used\n", reg);
goto out;
}
if (mtrr_usage_table[reg] < 1) {
pr_warning("mtrr: reg: %d has count=0\n", reg);
goto out;
}
if (--mtrr_usage_table[reg] < 1)
set_mtrr(reg, 0, 0, 0);
error = reg;
out:
mutex_unlock(&mtrr_mutex);
put_online_cpus();
return error;
}
/**
* mtrr_del - delete a memory type region
* @reg: Register returned by mtrr_add
* @base: Physical base address
* @size: Size of region
*
* If register is supplied then base and size are ignored. This is
* how drivers should call it.
*
* Releases an MTRR region. If the usage count drops to zero the
* register is freed and the region returns to default state.
* On success the register is returned, on failure a negative error
* code.
*/
int mtrr_del(int reg, unsigned long base, unsigned long size)
{
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
}
EXPORT_SYMBOL(mtrr_del);
/**
* arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
* @base: Physical base address
* @size: Size of region
*
* If PAT is available, this does nothing. If PAT is unavailable, it
* attempts to add a WC MTRR covering size bytes starting at base and
* logs an error if this fails.
*
* Drivers must store the return value to pass to mtrr_del_wc_if_needed,
* but drivers should not try to interpret that return value.
*/
int arch_phys_wc_add(unsigned long base, unsigned long size)
{
int ret;
if (pat_enabled)
return 0; /* Success! (We don't need to do anything.) */
ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
if (ret < 0) {
pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.",
(void *)base, (void *)(base + size - 1));
return ret;
}
return ret + MTRR_TO_PHYS_WC_OFFSET;
}
EXPORT_SYMBOL(arch_phys_wc_add);
/*
* arch_phys_wc_del - undoes arch_phys_wc_add
* @handle: Return value from arch_phys_wc_add
*
* This cleans up after mtrr_add_wc_if_needed.
*
* The API guarantees that mtrr_del_wc_if_needed(error code) and
* mtrr_del_wc_if_needed(0) do nothing.
*/
void arch_phys_wc_del(int handle)
{
if (handle >= 1) {
WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET);
mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0);
}
}
EXPORT_SYMBOL(arch_phys_wc_del);
/*
* phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
* @handle: Return value from arch_phys_wc_add
*
* This will turn the return value from arch_phys_wc_add into an mtrr
* index suitable for debugging.
*
* Note: There is no legitimate use for this function, except possibly
* in printk line. Alas there is an illegitimate use in some ancient
* drm ioctls.
*/
int phys_wc_to_mtrr_index(int handle)
{
if (handle < MTRR_TO_PHYS_WC_OFFSET)
return -1;
else
return handle - MTRR_TO_PHYS_WC_OFFSET;
}
EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
/*
* HACK ALERT!
* These should be called implicitly, but we can't yet until all the initcall
* stuff is done...
*/
static void __init init_ifs(void)
{
#ifndef CONFIG_X86_64
amd_init_mtrr();
cyrix_init_mtrr();
centaur_init_mtrr();
#endif
}
/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
* MTRR driver doesn't require this
*/
struct mtrr_value {
mtrr_type ltype;
unsigned long lbase;
unsigned long lsize;
};
static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
static int mtrr_save(void)
{
int i;
for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i, &mtrr_value[i].lbase,
&mtrr_value[i].lsize,
&mtrr_value[i].ltype);
}
return 0;
}
static void mtrr_restore(void)
{
int i;
for (i = 0; i < num_var_ranges; i++) {
if (mtrr_value[i].lsize) {
set_mtrr(i, mtrr_value[i].lbase,
mtrr_value[i].lsize,
mtrr_value[i].ltype);
}
}
}
static struct syscore_ops mtrr_syscore_ops = {
.suspend = mtrr_save,
.resume = mtrr_restore,
};
int __initdata changed_by_mtrr_cleanup;
#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1))
/**
* mtrr_bp_init - initialize mtrrs on the boot CPU
*
* This needs to be called early; before any of the other CPUs are
* initialized (i.e. before smp_init()).
*
*/
void __init mtrr_bp_init(void)
{
u32 phys_addr;
init_ifs();
phys_addr = 32;
if (cpu_has_mtrr) {
mtrr_if = &generic_mtrr_ops;
size_or_mask = SIZE_OR_MASK_BITS(36);
size_and_mask = 0x00f00000;
phys_addr = 36;
/*
* This is an AMD specific MSR, but we assume(hope?) that
* Intel will implement it too when they extend the address
* bus of the Xeon.
*/
if (cpuid_eax(0x80000000) >= 0x80000008) {
phys_addr = cpuid_eax(0x80000008) & 0xff;
/* CPUID workaround for Intel 0F33/0F34 CPU */
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
boot_cpu_data.x86 == 0xF &&
boot_cpu_data.x86_model == 0x3 &&
(boot_cpu_data.x86_mask == 0x3 ||
boot_cpu_data.x86_mask == 0x4))
phys_addr = 36;
size_or_mask = SIZE_OR_MASK_BITS(phys_addr);
size_and_mask = ~size_or_mask & 0xfffff00000ULL;
} else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
boot_cpu_data.x86 == 6) {
/*
* VIA C* family have Intel style MTRRs,
* but don't support PAE
*/
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
phys_addr = 32;
}
} else {
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) {
/* Pre-Athlon (K6) AMD CPU MTRRs */
mtrr_if = mtrr_ops[X86_VENDOR_AMD];
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CENTAUR:
if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) {
mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CYRIX:
if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) {
mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
default:
break;
}
}
if (mtrr_if) {
set_num_var_ranges();
init_table();
if (use_intel()) {
get_mtrr_state();
if (mtrr_cleanup(phys_addr)) {
changed_by_mtrr_cleanup = 1;
mtrr_if->set_all();
}
}
}
}
void mtrr_ap_init(void)
{
if (!use_intel() || mtrr_aps_delayed_init)
return;
/*
* Ideally we should hold mtrr_mutex here to avoid mtrr entries
* changed, but this routine will be called in cpu boot time,
* holding the lock breaks it.
*
* This routine is called in two cases:
*
* 1. very earily time of software resume, when there absolutely
* isn't mtrr entry changes;
*
* 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
* lock to prevent mtrr entry changes
*/
set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
}
/**
* Save current fixed-range MTRR state of the first cpu in cpu_online_mask.
*/
void mtrr_save_state(void)
{
int first_cpu;
get_online_cpus();
first_cpu = cpumask_first(cpu_online_mask);
smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
put_online_cpus();
}
void set_mtrr_aps_delayed_init(void)
{
if (!use_intel())
return;
mtrr_aps_delayed_init = true;
}
/*
* Delayed MTRR initialization for all AP's
*/
void mtrr_aps_init(void)
{
if (!use_intel())
return;
/*
* Check if someone has requested the delay of AP MTRR initialization,
* by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
* then we are done.
*/
if (!mtrr_aps_delayed_init)
return;
set_mtrr(~0U, 0, 0, 0);
mtrr_aps_delayed_init = false;
}
void mtrr_bp_restore(void)
{
if (!use_intel())
return;
mtrr_if->set_all();
}
static int __init mtrr_init_finialize(void)
{
if (!mtrr_if)
return 0;
if (use_intel()) {
if (!changed_by_mtrr_cleanup)
mtrr_state_warn();
return 0;
}
/*
* The CPU has no MTRR and seems to not support SMP. They have
* specific drivers, we use a tricky method to support
* suspend/resume for them.
*
* TBD: is there any system with such CPU which supports
* suspend/resume? If no, we should remove the code.
*/
register_syscore_ops(&mtrr_syscore_ops);
return 0;
}
subsys_initcall(mtrr_init_finialize);

View file

@ -0,0 +1,78 @@
/*
* local MTRR defines.
*/
#include <linux/types.h>
#include <linux/stddef.h>
#define MTRR_CHANGE_MASK_FIXED 0x01
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
struct mtrr_ops {
u32 vendor;
u32 use_intel_if;
void (*set)(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
void (*set_all)(void);
void (*get)(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type);
int (*get_free_region)(unsigned long base, unsigned long size,
int replace_reg);
int (*validate_add_page)(unsigned long base, unsigned long size,
unsigned int type);
int (*have_wrcomb)(void);
};
extern int generic_get_free_region(unsigned long base, unsigned long size,
int replace_reg);
extern int generic_validate_add_page(unsigned long base, unsigned long size,
unsigned int type);
extern const struct mtrr_ops generic_mtrr_ops;
extern int positive_have_wrcomb(void);
/* library functions for processor-specific routines */
struct set_mtrr_context {
unsigned long flags;
unsigned long cr4val;
u32 deftype_lo;
u32 deftype_hi;
u32 ccr3;
};
void set_mtrr_done(struct set_mtrr_context *ctxt);
void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
void get_mtrr_state(void);
extern void set_mtrr_ops(const struct mtrr_ops *ops);
extern u64 size_or_mask, size_and_mask;
extern const struct mtrr_ops *mtrr_if;
#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
extern unsigned int num_var_ranges;
extern u64 mtrr_tom2;
extern struct mtrr_state_type mtrr_state;
void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
void mtrr_wrmsr(unsigned, unsigned, unsigned);
/* CPU specific mtrr init functions */
int amd_init_mtrr(void);
int cyrix_init_mtrr(void);
int centaur_init_mtrr(void);
extern int changed_by_mtrr_cleanup;
extern int mtrr_cleanup(unsigned address_bits);

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,775 @@
/*
* Performance events x86 architecture header
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
* Copyright (C) 2009 Google, Inc., Stephane Eranian
*
* For licencing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
#if 0
#undef wrmsrl
#define wrmsrl(msr, val) \
do { \
unsigned int _msr = (msr); \
u64 _val = (val); \
trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \
(unsigned long long)(_val)); \
native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \
} while (0)
#endif
/*
* | NHM/WSM | SNB |
* register -------------------------------
* | HT | no HT | HT | no HT |
*-----------------------------------------
* offcore | core | core | cpu | core |
* lbr_sel | core | core | cpu | core |
* ld_lat | cpu | core | cpu | core |
*-----------------------------------------
*
* Given that there is a small number of shared regs,
* we can pre-allocate their slot in the per-cpu
* per-core reg tables.
*/
enum extra_reg_type {
EXTRA_REG_NONE = -1, /* not used */
EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
EXTRA_REG_LBR = 2, /* lbr_select */
EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
EXTRA_REG_MAX /* number of entries needed */
};
struct event_constraint {
union {
unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
u64 idxmsk64;
};
u64 code;
u64 cmask;
int weight;
int overlap;
int flags;
};
/*
* struct hw_perf_event.flags flags
*/
#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */
#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */
#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */
#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */
#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */
struct amd_nb {
int nb_id; /* NorthBridge id */
int refcnt; /* reference count */
struct perf_event *owners[X86_PMC_IDX_MAX];
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS 8
/*
* A debug store configuration.
*
* We only support architectures that use 64bit fields.
*/
struct debug_store {
u64 bts_buffer_base;
u64 bts_index;
u64 bts_absolute_maximum;
u64 bts_interrupt_threshold;
u64 pebs_buffer_base;
u64 pebs_index;
u64 pebs_absolute_maximum;
u64 pebs_interrupt_threshold;
u64 pebs_event_reset[MAX_PEBS_EVENTS];
};
/*
* Per register state.
*/
struct er_account {
raw_spinlock_t lock; /* per-core: protect structure */
u64 config; /* extra MSR config */
u64 reg; /* extra MSR number */
atomic_t ref; /* reference count */
};
/*
* Per core/cpu state
*
* Used to coordinate shared registers between HT threads or
* among events on a single PMU.
*/
struct intel_shared_regs {
struct er_account regs[EXTRA_REG_MAX];
int refcnt; /* per-core: #HT threads */
unsigned core_id; /* per-core: core id */
};
#define MAX_LBR_ENTRIES 16
struct cpu_hw_events {
/*
* Generic x86 PMC bits
*/
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
int n_events; /* the # of events in the below arrays */
int n_added; /* the # last events in the below arrays;
they've never been enabled yet */
int n_txn; /* the # last events in the below arrays;
added in the current transaction */
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
u64 tags[X86_PMC_IDX_MAX];
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
unsigned int group_flag;
int is_fake;
/*
* Intel DebugStore bits
*/
struct debug_store *ds;
u64 pebs_enabled;
/*
* Intel LBR bits
*/
int lbr_users;
void *lbr_context;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
struct er_account *lbr_sel;
u64 br_sel;
/*
* Intel host/guest exclude bits
*/
u64 intel_ctrl_guest_mask;
u64 intel_ctrl_host_mask;
struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX];
/*
* Intel checkpoint mask
*/
u64 intel_cp_status;
/*
* manage shared (per-core, per-cpu) registers
* used on Intel NHM/WSM/SNB
*/
struct intel_shared_regs *shared_regs;
/*
* AMD specific bits
*/
struct amd_nb *amd_nb;
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
u64 perf_ctr_virt_mask;
void *kfree_on_online;
};
#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
{ .idxmsk64 = (n) }, \
.code = (c), \
.cmask = (m), \
.weight = (w), \
.overlap = (o), \
.flags = f, \
}
#define EVENT_CONSTRAINT(c, n, m) \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
/*
* The overlap flag marks event constraints with overlapping counter
* masks. This is the case if the counter mask of such an event is not
* a subset of any other counter mask of a constraint with an equal or
* higher weight, e.g.:
*
* c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
* c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
* c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
*
* The event scheduler may not select the correct counter in the first
* cycle because it needs to know which subsequent events will be
* scheduled. It may fail to schedule the events then. So we set the
* overlap flag for such constraints to give the scheduler a hint which
* events to select for counter rescheduling.
*
* Care must be taken as the rescheduling algorithm is O(n!) which
* will increase scheduling cycles for an over-commited system
* dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
* and its counter masks must be kept at a minimum.
*/
#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
/*
* Constraint on the Event code.
*/
#define INTEL_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
/*
* Constraint on the Event code + UMask + fixed-mask
*
* filter mask to validate fixed counter events.
* the following filters disqualify for fixed counters:
* - inv
* - edge
* - cnt-mask
* - in_tx
* - in_tx_checkpointed
* The other filters are supported by fixed counters.
* The any-thread option is supported starting with v3.
*/
#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
#define FIXED_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
/*
* Constraint on the Event code + UMask
*/
#define INTEL_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
#define INTEL_PLD_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
#define INTEL_PST_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
/* Event constraint, but match on all event flags too. */
#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
/* Check only flags, but allow all event/umask */
#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \
EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
/* Check flags and event code, and set the HSW store flag */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
__EVENT_CONSTRAINT(code, n, \
ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
/* Check flags and event code, and set the HSW load flag */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
__EVENT_CONSTRAINT(code, n, \
ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
/* Check flags and event code/umask, and set the HSW store flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
/* Check flags and event code/umask, and set the HSW load flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
/* Check flags and event code/umask, and set the HSW N/A flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
/*
* We define the end marker as having a weight of -1
* to enable blacklisting of events using a counter bitmask
* of zero and thus a weight of zero.
* The end marker has a weight that cannot possibly be
* obtained from counting the bits in the bitmask.
*/
#define EVENT_CONSTRAINT_END { .weight = -1 }
/*
* Check for end marker with weight == -1
*/
#define for_each_event_constraint(e, c) \
for ((e) = (c); (e)->weight != -1; (e)++)
/*
* Extra registers for specific events.
*
* Some events need large masks and require external MSRs.
* Those extra MSRs end up being shared for all events on
* a PMU and sometimes between PMU of sibling HT threads.
* In either case, the kernel needs to handle conflicting
* accesses to those extra, shared, regs. The data structure
* to manage those registers is stored in cpu_hw_event.
*/
struct extra_reg {
unsigned int event;
unsigned int msr;
u64 config_mask;
u64 valid_mask;
int idx; /* per_xxx->regs[] reg index */
bool extra_msr_access;
};
#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
.event = (e), \
.msr = (ms), \
.config_mask = (m), \
.valid_mask = (vm), \
.idx = EXTRA_REG_##i, \
.extra_msr_access = true, \
}
#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
INTEL_UEVENT_EXTRA_REG(c, \
MSR_PEBS_LD_LAT_THRESHOLD, \
0xffff, \
LDLAT)
#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
union perf_capabilities {
struct {
u64 lbr_format:6;
u64 pebs_trap:1;
u64 pebs_arch_reg:1;
u64 pebs_format:4;
u64 smm_freeze:1;
/*
* PMU supports separate counter range for writing
* values > 32bit.
*/
u64 full_width_write:1;
};
u64 capabilities;
};
struct x86_pmu_quirk {
struct x86_pmu_quirk *next;
void (*func)(void);
};
union x86_pmu_config {
struct {
u64 event:8,
umask:8,
usr:1,
os:1,
edge:1,
pc:1,
interrupt:1,
__reserved1:1,
en:1,
inv:1,
cmask:8,
event2:4,
__reserved2:4,
go:1,
ho:1;
} bits;
u64 value;
};
#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
/*
* struct x86_pmu - generic x86 pmu
*/
struct x86_pmu {
/*
* Generic x86 PMC bits
*/
const char *name;
int version;
int (*handle_irq)(struct pt_regs *);
void (*disable_all)(void);
void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
int (*hw_config)(struct perf_event *event);
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
unsigned perfctr;
int (*addr_offset)(int index, bool eventsel);
int (*rdpmc_index)(int index);
u64 (*event_map)(int);
int max_events;
int num_counters;
int num_counters_fixed;
int cntval_bits;
u64 cntval_mask;
union {
unsigned long events_maskl;
unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
};
int events_mask_len;
int apic;
u64 max_period;
struct event_constraint *
(*get_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
struct event_constraint *event_constraints;
struct x86_pmu_quirk *quirks;
int perfctr_second_write;
bool late_ack;
/*
* sysfs attrs
*/
int attr_rdpmc_broken;
int attr_rdpmc;
struct attribute **format_attrs;
struct attribute **event_attrs;
ssize_t (*events_sysfs_show)(char *page, u64 config);
struct attribute **cpu_events;
/*
* CPU Hotplug hooks
*/
int (*cpu_prepare)(int cpu);
void (*cpu_starting)(int cpu);
void (*cpu_dying)(int cpu);
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
void (*flush_branch_stack)(void);
/*
* Intel Arch Perfmon v2+
*/
u64 intel_ctrl;
union perf_capabilities intel_cap;
/*
* Intel DebugStore bits
*/
unsigned int bts :1,
bts_active :1,
pebs :1,
pebs_active :1,
pebs_broken :1;
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
int max_pebs_events;
/*
* Intel LBR
*/
unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
int lbr_nr; /* hardware stack size */
u64 lbr_sel_mask; /* LBR_SELECT valid bits */
const int *lbr_sel_map; /* lbr_select mappings */
bool lbr_double_abort; /* duplicated lbr aborts */
/*
* Extra registers for events
*/
struct extra_reg *extra_regs;
unsigned int er_flags;
/*
* Intel host/guest support (KVM)
*/
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
};
#define x86_add_quirk(func_) \
do { \
static struct x86_pmu_quirk __quirk __initdata = { \
.func = func_, \
}; \
__quirk.next = x86_pmu.quirks; \
x86_pmu.quirks = &__quirk; \
} while (0)
#define ERF_NO_HT_SHARING 1
#define ERF_HAS_RSP_1 2
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
#define EVENT_ATTR(_name, _id) \
static struct perf_pmu_events_attr EVENT_VAR(_id) = { \
.attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
.id = PERF_COUNT_HW_##_id, \
.event_str = NULL, \
};
#define EVENT_ATTR_STR(_name, v, str) \
static struct perf_pmu_events_attr event_attr_##v = { \
.attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
.id = 0, \
.event_str = str, \
};
extern struct x86_pmu x86_pmu __read_mostly;
DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
int x86_perf_event_set_period(struct perf_event *event);
/*
* Generalized hw caching related hw_event table, filled
* in on a per model basis. A value of 0 means
* 'not supported', -1 means 'hw_event makes no sense on
* this CPU', any other value means the raw hw_event
* ID.
*/
#define C(x) PERF_COUNT_HW_CACHE_##x
extern u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
extern u64 __read_mostly hw_cache_extra_regs
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 x86_perf_event_update(struct perf_event *event);
static inline unsigned int x86_pmu_config_addr(int index)
{
return x86_pmu.eventsel + (x86_pmu.addr_offset ?
x86_pmu.addr_offset(index, true) : index);
}
static inline unsigned int x86_pmu_event_addr(int index)
{
return x86_pmu.perfctr + (x86_pmu.addr_offset ?
x86_pmu.addr_offset(index, false) : index);
}
static inline int x86_pmu_rdpmc_index(int index)
{
return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}
int x86_setup_perfctr(struct perf_event *event);
int x86_pmu_hw_config(struct perf_event *event);
void x86_pmu_disable_all(void);
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
u64 enable_mask)
{
u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
if (hwc->extra_reg.reg)
wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
}
void x86_pmu_enable_all(int added);
int perf_assign_events(struct perf_event **events, int n,
int wmin, int wmax, int *assign);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
void x86_pmu_stop(struct perf_event *event, int flags);
static inline void x86_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
wrmsrl(hwc->config_base, hwc->config);
}
void x86_pmu_enable_event(struct perf_event *event);
int x86_pmu_handle_irq(struct pt_regs *regs);
extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
static inline bool kernel_ip(unsigned long ip)
{
#ifdef CONFIG_X86_32
return ip > PAGE_OFFSET;
#else
return (long)ip < 0;
#endif
}
/*
* Not all PMUs provide the right context information to place the reported IP
* into full context. Specifically segment registers are typically not
* supplied.
*
* Assuming the address is a linear address (it is for IBS), we fake the CS and
* vm86 mode using the known zero-based code segment and 'fix up' the registers
* to reflect this.
*
* Intel PEBS/LBR appear to typically provide the effective address, nothing
* much we can do about that but pray and treat it like a linear address.
*/
static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
{
regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
if (regs->flags & X86_VM_MASK)
regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
regs->ip = ip;
}
ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
ssize_t intel_event_sysfs_show(char *page, u64 config);
#ifdef CONFIG_CPU_SUP_AMD
int amd_pmu_init(void);
#else /* CONFIG_CPU_SUP_AMD */
static inline int amd_pmu_init(void)
{
return 0;
}
#endif /* CONFIG_CPU_SUP_AMD */
#ifdef CONFIG_CPU_SUP_INTEL
int intel_pmu_save_and_restart(struct perf_event *event);
struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
struct intel_shared_regs *allocate_shared_regs(int cpu);
int intel_pmu_init(void);
void init_debug_store_on_cpu(int cpu);
void fini_debug_store_on_cpu(int cpu);
void release_ds_buffers(void);
void reserve_ds_buffers(void);
extern struct event_constraint bts_constraint;
void intel_pmu_enable_bts(u64 config);
void intel_pmu_disable_bts(void);
int intel_pmu_drain_bts_buffer(void);
extern struct event_constraint intel_core2_pebs_event_constraints[];
extern struct event_constraint intel_atom_pebs_event_constraints[];
extern struct event_constraint intel_slm_pebs_event_constraints[];
extern struct event_constraint intel_nehalem_pebs_event_constraints[];
extern struct event_constraint intel_westmere_pebs_event_constraints[];
extern struct event_constraint intel_snb_pebs_event_constraints[];
extern struct event_constraint intel_ivb_pebs_event_constraints[];
extern struct event_constraint intel_hsw_pebs_event_constraints[];
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_enable(struct perf_event *event);
void intel_pmu_pebs_disable(struct perf_event *event);
void intel_pmu_pebs_enable_all(void);
void intel_pmu_pebs_disable_all(void);
void intel_ds_init(void);
void intel_pmu_lbr_reset(void);
void intel_pmu_lbr_enable(struct perf_event *event);
void intel_pmu_lbr_disable(struct perf_event *event);
void intel_pmu_lbr_enable_all(void);
void intel_pmu_lbr_disable_all(void);
void intel_pmu_lbr_read(void);
void intel_pmu_lbr_init_core(void);
void intel_pmu_lbr_init_nhm(void);
void intel_pmu_lbr_init_atom(void);
void intel_pmu_lbr_init_snb(void);
int intel_pmu_setup_lbr_filter(struct perf_event *event);
int p4_pmu_init(void);
int p6_pmu_init(void);
int knc_pmu_init(void);
ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
{
}
static inline void release_ds_buffers(void)
{
}
static inline int intel_pmu_init(void)
{
return 0;
}
static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
{
return NULL;
}
#endif /* CONFIG_CPU_SUP_INTEL */

View file

@ -0,0 +1,728 @@
#include <linux/perf_event.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <asm/apicdef.h>
#include "perf_event.h"
static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
[ C(L1D) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
[ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
[ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
},
},
[ C(L1I ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
[ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
[ C(RESULT_MISS) ] = 0,
},
},
[ C(LL ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
[ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
[ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(ITLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
[ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
[ C(BPU ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
[ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
[ C(NODE) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
[ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
};
/*
* AMD Performance Monitor K7 and later.
*/
static const u64 amd_perfmon_event_map[] =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
[PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
[PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
};
static u64 amd_pmu_event_map(int hw_event)
{
return amd_perfmon_event_map[hw_event];
}
/*
* Previously calculated offsets
*/
static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
/*
* Legacy CPUs:
* 4 counters starting at 0xc0010000 each offset by 1
*
* CPUs with core performance counter extensions:
* 6 counters starting at 0xc0010200 each offset by 2
*/
static inline int amd_pmu_addr_offset(int index, bool eventsel)
{
int offset;
if (!index)
return index;
if (eventsel)
offset = event_offsets[index];
else
offset = count_offsets[index];
if (offset)
return offset;
if (!cpu_has_perfctr_core)
offset = index;
else
offset = index << 1;
if (eventsel)
event_offsets[index] = offset;
else
count_offsets[index] = offset;
return offset;
}
static int amd_core_hw_config(struct perf_event *event)
{
if (event->attr.exclude_host && event->attr.exclude_guest)
/*
* When HO == GO == 1 the hardware treats that as GO == HO == 0
* and will count in both modes. We don't want to count in that
* case so we emulate no-counting by setting US = OS = 0.
*/
event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
ARCH_PERFMON_EVENTSEL_OS);
else if (event->attr.exclude_host)
event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
else if (event->attr.exclude_guest)
event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
return 0;
}
/*
* AMD64 events are detected based on their event codes.
*/
static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
{
return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
}
static inline int amd_is_nb_event(struct hw_perf_event *hwc)
{
return (hwc->config & 0xe0) == 0xe0;
}
static inline int amd_has_nb(struct cpu_hw_events *cpuc)
{
struct amd_nb *nb = cpuc->amd_nb;
return nb && nb->nb_id != -1;
}
static int amd_pmu_hw_config(struct perf_event *event)
{
int ret;
/* pass precise event sampling to ibs: */
if (event->attr.precise_ip && get_ibs_caps())
return -ENOENT;
if (has_branch_stack(event))
return -EOPNOTSUPP;
ret = x86_pmu_hw_config(event);
if (ret)
return ret;
if (event->attr.type == PERF_TYPE_RAW)
event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
return amd_core_hw_config(event);
}
static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
struct amd_nb *nb = cpuc->amd_nb;
int i;
/*
* need to scan whole list because event may not have
* been assigned during scheduling
*
* no race condition possible because event can only
* be removed on one CPU at a time AND PMU is disabled
* when we come here
*/
for (i = 0; i < x86_pmu.num_counters; i++) {
if (cmpxchg(nb->owners + i, event, NULL) == event)
break;
}
}
/*
* AMD64 NorthBridge events need special treatment because
* counter access needs to be synchronized across all cores
* of a package. Refer to BKDG section 3.12
*
* NB events are events measuring L3 cache, Hypertransport
* traffic. They are identified by an event code >= 0xe00.
* They measure events on the NorthBride which is shared
* by all cores on a package. NB events are counted on a
* shared set of counters. When a NB event is programmed
* in a counter, the data actually comes from a shared
* counter. Thus, access to those counters needs to be
* synchronized.
*
* We implement the synchronization such that no two cores
* can be measuring NB events using the same counters. Thus,
* we maintain a per-NB allocation table. The available slot
* is propagated using the event_constraint structure.
*
* We provide only one choice for each NB event based on
* the fact that only NB events have restrictions. Consequently,
* if a counter is available, there is a guarantee the NB event
* will be assigned to it. If no slot is available, an empty
* constraint is returned and scheduling will eventually fail
* for this event.
*
* Note that all cores attached the same NB compete for the same
* counters to host NB events, this is why we use atomic ops. Some
* multi-chip CPUs may have more than one NB.
*
* Given that resources are allocated (cmpxchg), they must be
* eventually freed for others to use. This is accomplished by
* calling __amd_put_nb_event_constraints()
*
* Non NB events are not impacted by this restriction.
*/
static struct event_constraint *
__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
struct event_constraint *c)
{
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
struct perf_event *old;
int idx, new = -1;
if (!c)
c = &unconstrained;
if (cpuc->is_fake)
return c;
/*
* detect if already present, if so reuse
*
* cannot merge with actual allocation
* because of possible holes
*
* event can already be present yet not assigned (in hwc->idx)
* because of successive calls to x86_schedule_events() from
* hw_perf_group_sched_in() without hw_perf_enable()
*/
for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
if (new == -1 || hwc->idx == idx)
/* assign free slot, prefer hwc->idx */
old = cmpxchg(nb->owners + idx, NULL, event);
else if (nb->owners[idx] == event)
/* event already present */
old = event;
else
continue;
if (old && old != event)
continue;
/* reassign to this slot */
if (new != -1)
cmpxchg(nb->owners + new, event, NULL);
new = idx;
/* already present, reuse */
if (old == event)
break;
}
if (new == -1)
return &emptyconstraint;
return &nb->event_constraints[new];
}
static struct amd_nb *amd_alloc_nb(int cpu)
{
struct amd_nb *nb;
int i;
nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
if (!nb)
return NULL;
nb->nb_id = -1;
/*
* initialize all possible NB constraints
*/
for (i = 0; i < x86_pmu.num_counters; i++) {
__set_bit(i, nb->event_constraints[i].idxmsk);
nb->event_constraints[i].weight = 1;
}
return nb;
}
static int amd_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
WARN_ON_ONCE(cpuc->amd_nb);
if (boot_cpu_data.x86_max_cores < 2)
return NOTIFY_OK;
cpuc->amd_nb = amd_alloc_nb(cpu);
if (!cpuc->amd_nb)
return NOTIFY_BAD;
return NOTIFY_OK;
}
static void amd_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
struct amd_nb *nb;
int i, nb_id;
cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
if (boot_cpu_data.x86_max_cores < 2)
return;
nb_id = amd_get_nb_id(cpu);
WARN_ON_ONCE(nb_id == BAD_APICID);
for_each_online_cpu(i) {
nb = per_cpu(cpu_hw_events, i).amd_nb;
if (WARN_ON_ONCE(!nb))
continue;
if (nb->nb_id == nb_id) {
cpuc->kfree_on_online = cpuc->amd_nb;
cpuc->amd_nb = nb;
break;
}
}
cpuc->amd_nb->nb_id = nb_id;
cpuc->amd_nb->refcnt++;
}
static void amd_pmu_cpu_dead(int cpu)
{
struct cpu_hw_events *cpuhw;
if (boot_cpu_data.x86_max_cores < 2)
return;
cpuhw = &per_cpu(cpu_hw_events, cpu);
if (cpuhw->amd_nb) {
struct amd_nb *nb = cpuhw->amd_nb;
if (nb->nb_id == -1 || --nb->refcnt == 0)
kfree(nb);
cpuhw->amd_nb = NULL;
}
}
static struct event_constraint *
amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
{
/*
* if not NB event or no NB, then no constraints
*/
if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
return &unconstrained;
return __amd_get_nb_event_constraints(cpuc, event, NULL);
}
static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
__amd_put_nb_event_constraints(cpuc, event);
}
PMU_FORMAT_ATTR(event, "config:0-7,32-35");
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
PMU_FORMAT_ATTR(inv, "config:23" );
PMU_FORMAT_ATTR(cmask, "config:24-31" );
static struct attribute *amd_format_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
&format_attr_cmask.attr,
NULL,
};
/* AMD Family 15h */
#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
#define AMD_EVENT_EX_LS 0x000000C0ULL
#define AMD_EVENT_DE 0x000000D0ULL
#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
/*
* AMD family 15h event code/PMC mappings:
*
* type = event_code & 0x0F0:
*
* 0x000 FP PERF_CTL[5:3]
* 0x010 FP PERF_CTL[5:3]
* 0x020 LS PERF_CTL[5:0]
* 0x030 LS PERF_CTL[5:0]
* 0x040 DC PERF_CTL[5:0]
* 0x050 DC PERF_CTL[5:0]
* 0x060 CU PERF_CTL[2:0]
* 0x070 CU PERF_CTL[2:0]
* 0x080 IC/DE PERF_CTL[2:0]
* 0x090 IC/DE PERF_CTL[2:0]
* 0x0A0 ---
* 0x0B0 ---
* 0x0C0 EX/LS PERF_CTL[5:0]
* 0x0D0 DE PERF_CTL[2:0]
* 0x0E0 NB NB_PERF_CTL[3:0]
* 0x0F0 NB NB_PERF_CTL[3:0]
*
* Exceptions:
*
* 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
* 0x003 FP PERF_CTL[3]
* 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
* 0x00B FP PERF_CTL[3]
* 0x00D FP PERF_CTL[3]
* 0x023 DE PERF_CTL[2:0]
* 0x02D LS PERF_CTL[3]
* 0x02E LS PERF_CTL[3,0]
* 0x031 LS PERF_CTL[2:0] (**)
* 0x043 CU PERF_CTL[2:0]
* 0x045 CU PERF_CTL[2:0]
* 0x046 CU PERF_CTL[2:0]
* 0x054 CU PERF_CTL[2:0]
* 0x055 CU PERF_CTL[2:0]
* 0x08F IC PERF_CTL[0]
* 0x187 DE PERF_CTL[0]
* 0x188 DE PERF_CTL[0]
* 0x0DB EX PERF_CTL[5:0]
* 0x0DC LS PERF_CTL[5:0]
* 0x0DD LS PERF_CTL[5:0]
* 0x0DE LS PERF_CTL[5:0]
* 0x0DF LS PERF_CTL[5:0]
* 0x1C0 EX PERF_CTL[5:3]
* 0x1D6 EX PERF_CTL[5:0]
* 0x1D8 EX PERF_CTL[5:0]
*
* (*) depending on the umask all FPU counters may be used
* (**) only one unitmask enabled at a time
*/
static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
static struct event_constraint *
amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
unsigned int event_code = amd_get_event_code(hwc);
switch (event_code & AMD_EVENT_TYPE_MASK) {
case AMD_EVENT_FP:
switch (event_code) {
case 0x000:
if (!(hwc->config & 0x0000F000ULL))
break;
if (!(hwc->config & 0x00000F00ULL))
break;
return &amd_f15_PMC3;
case 0x004:
if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
break;
return &amd_f15_PMC3;
case 0x003:
case 0x00B:
case 0x00D:
return &amd_f15_PMC3;
}
return &amd_f15_PMC53;
case AMD_EVENT_LS:
case AMD_EVENT_DC:
case AMD_EVENT_EX_LS:
switch (event_code) {
case 0x023:
case 0x043:
case 0x045:
case 0x046:
case 0x054:
case 0x055:
return &amd_f15_PMC20;
case 0x02D:
return &amd_f15_PMC3;
case 0x02E:
return &amd_f15_PMC30;
case 0x031:
if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
return &amd_f15_PMC20;
return &emptyconstraint;
case 0x1C0:
return &amd_f15_PMC53;
default:
return &amd_f15_PMC50;
}
case AMD_EVENT_CU:
case AMD_EVENT_IC_DE:
case AMD_EVENT_DE:
switch (event_code) {
case 0x08F:
case 0x187:
case 0x188:
return &amd_f15_PMC0;
case 0x0DB ... 0x0DF:
case 0x1D6:
case 0x1D8:
return &amd_f15_PMC50;
default:
return &amd_f15_PMC20;
}
case AMD_EVENT_NB:
/* moved to perf_event_amd_uncore.c */
return &emptyconstraint;
default:
return &emptyconstraint;
}
}
static ssize_t amd_event_sysfs_show(char *page, u64 config)
{
u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
(config & AMD64_EVENTSEL_EVENT) >> 24;
return x86_event_sysfs_show(page, config, event);
}
static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
.handle_irq = x86_pmu_handle_irq,
.disable_all = x86_pmu_disable_all,
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
.disable = x86_pmu_disable_event,
.hw_config = amd_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_K7_EVNTSEL0,
.perfctr = MSR_K7_PERFCTR0,
.addr_offset = amd_pmu_addr_offset,
.event_map = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters = AMD64_NUM_COUNTERS,
.cntval_bits = 48,
.cntval_mask = (1ULL << 48) - 1,
.apic = 1,
/* use highest bit to detect overflow */
.max_period = (1ULL << 47) - 1,
.get_event_constraints = amd_get_event_constraints,
.put_event_constraints = amd_put_event_constraints,
.format_attrs = amd_format_attr,
.events_sysfs_show = amd_event_sysfs_show,
.cpu_prepare = amd_pmu_cpu_prepare,
.cpu_starting = amd_pmu_cpu_starting,
.cpu_dead = amd_pmu_cpu_dead,
};
static int __init amd_core_pmu_init(void)
{
if (!cpu_has_perfctr_core)
return 0;
switch (boot_cpu_data.x86) {
case 0x15:
pr_cont("Fam15h ");
x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
break;
default:
pr_err("core perfctr but no constraints; unknown hardware!\n");
return -ENODEV;
}
/*
* If core performance counter extensions exists, we must use
* MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
* amd_pmu_addr_offset().
*/
x86_pmu.eventsel = MSR_F15H_PERF_CTL;
x86_pmu.perfctr = MSR_F15H_PERF_CTR;
x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
pr_cont("core perfctr, ");
return 0;
}
__init int amd_pmu_init(void)
{
int ret;
/* Performance-monitoring supported from K7 and later: */
if (boot_cpu_data.x86 < 6)
return -ENODEV;
x86_pmu = amd_pmu;
ret = amd_core_pmu_init();
if (ret)
return ret;
/* Events are common for all AMDs */
memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
return 0;
}
void amd_pmu_enable_virt(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
cpuc->perf_ctr_virt_mask = 0;
/* Reload all events */
x86_pmu_disable_all();
x86_pmu_enable_all(0);
}
EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
void amd_pmu_disable_virt(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
/*
* We only mask out the Host-only bit so that host-only counting works
* when SVM is disabled. If someone sets up a guest-only counter when
* SVM is disabled the Guest-only bits still gets set and the counter
* will not count anything.
*/
cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
/* Reload all events */
x86_pmu_disable_all();
x86_pmu_enable_all(0);
}
EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);

View file

@ -0,0 +1,946 @@
/*
* Performance events - AMD IBS
*
* Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
*
* For licencing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/ptrace.h>
#include <linux/syscore_ops.h>
#include <asm/apic.h>
#include "perf_event.h"
static u32 ibs_caps;
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
#include <linux/kprobes.h>
#include <linux/hardirq.h>
#include <asm/nmi.h>
#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
enum ibs_states {
IBS_ENABLED = 0,
IBS_STARTED = 1,
IBS_STOPPING = 2,
IBS_MAX_STATES,
};
struct cpu_perf_ibs {
struct perf_event *event;
unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
};
struct perf_ibs {
struct pmu pmu;
unsigned int msr;
u64 config_mask;
u64 cnt_mask;
u64 enable_mask;
u64 valid_mask;
u64 max_period;
unsigned long offset_mask[1];
int offset_max;
struct cpu_perf_ibs __percpu *pcpu;
struct attribute **format_attrs;
struct attribute_group format_group;
const struct attribute_group *attr_groups[2];
u64 (*get_count)(u64 config);
};
struct perf_ibs_data {
u32 size;
union {
u32 data[0]; /* data buffer starts here */
u32 caps;
};
u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
};
static int
perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
{
s64 left = local64_read(&hwc->period_left);
s64 period = hwc->sample_period;
int overflow = 0;
/*
* If we are way outside a reasonable range then just skip forward:
*/
if (unlikely(left <= -period)) {
left = period;
local64_set(&hwc->period_left, left);
hwc->last_period = period;
overflow = 1;
}
if (unlikely(left < (s64)min)) {
left += period;
local64_set(&hwc->period_left, left);
hwc->last_period = period;
overflow = 1;
}
/*
* If the hw period that triggers the sw overflow is too short
* we might hit the irq handler. This biases the results.
* Thus we shorten the next-to-last period and set the last
* period to the max period.
*/
if (left > max) {
left -= max;
if (left > max)
left = max;
else if (left < min)
left = min;
}
*hw_period = (u64)left;
return overflow;
}
static int
perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
{
struct hw_perf_event *hwc = &event->hw;
int shift = 64 - width;
u64 prev_raw_count;
u64 delta;
/*
* Careful: an NMI might modify the previous event value.
*
* Our tactic to handle this is to first atomically read and
* exchange a new raw count - then add that new-prev delta
* count to the generic event atomically:
*/
prev_raw_count = local64_read(&hwc->prev_count);
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
return 0;
/*
* Now we have the new raw value and have updated the prev
* timestamp already. We can now calculate the elapsed delta
* (event-)time and add that to the generic event.
*
* Careful, not all hw sign-extends above the physical width
* of the count.
*/
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
local64_add(delta, &event->count);
local64_sub(delta, &hwc->period_left);
return 1;
}
static struct perf_ibs perf_ibs_fetch;
static struct perf_ibs perf_ibs_op;
static struct perf_ibs *get_ibs_pmu(int type)
{
if (perf_ibs_fetch.pmu.type == type)
return &perf_ibs_fetch;
if (perf_ibs_op.pmu.type == type)
return &perf_ibs_op;
return NULL;
}
/*
* Use IBS for precise event sampling:
*
* perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
* perf record -a -e r076:p ... # same as -e cpu-cycles:p
* perf record -a -e r0C1:p ... # use ibs op counting micro-ops
*
* IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
* MSRC001_1033) is used to select either cycle or micro-ops counting
* mode.
*
* The rip of IBS samples has skid 0. Thus, IBS supports precise
* levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
* rip is invalid when IBS was not able to record the rip correctly.
* We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
*
*/
static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
{
switch (event->attr.precise_ip) {
case 0:
return -ENOENT;
case 1:
case 2:
break;
default:
return -EOPNOTSUPP;
}
switch (event->attr.type) {
case PERF_TYPE_HARDWARE:
switch (event->attr.config) {
case PERF_COUNT_HW_CPU_CYCLES:
*config = 0;
return 0;
}
break;
case PERF_TYPE_RAW:
switch (event->attr.config) {
case 0x0076:
*config = 0;
return 0;
case 0x00C1:
*config = IBS_OP_CNT_CTL;
return 0;
}
break;
default:
return -ENOENT;
}
return -EOPNOTSUPP;
}
static const struct perf_event_attr ibs_notsupp = {
.exclude_user = 1,
.exclude_kernel = 1,
.exclude_hv = 1,
.exclude_idle = 1,
.exclude_host = 1,
.exclude_guest = 1,
};
static int perf_ibs_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs;
u64 max_cnt, config;
int ret;
perf_ibs = get_ibs_pmu(event->attr.type);
if (perf_ibs) {
config = event->attr.config;
} else {
perf_ibs = &perf_ibs_op;
ret = perf_ibs_precise_event(event, &config);
if (ret)
return ret;
}
if (event->pmu != &perf_ibs->pmu)
return -ENOENT;
if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
return -EINVAL;
if (config & ~perf_ibs->config_mask)
return -EINVAL;
if (hwc->sample_period) {
if (config & perf_ibs->cnt_mask)
/* raw max_cnt may not be set */
return -EINVAL;
if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
/*
* lower 4 bits can not be set in ibs max cnt,
* but allowing it in case we adjust the
* sample period to set a frequency.
*/
return -EINVAL;
hwc->sample_period &= ~0x0FULL;
if (!hwc->sample_period)
hwc->sample_period = 0x10;
} else {
max_cnt = config & perf_ibs->cnt_mask;
config &= ~perf_ibs->cnt_mask;
event->attr.sample_period = max_cnt << 4;
hwc->sample_period = event->attr.sample_period;
}
if (!hwc->sample_period)
return -EINVAL;
/*
* If we modify hwc->sample_period, we also need to update
* hwc->last_period and hwc->period_left.
*/
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
hwc->config_base = perf_ibs->msr;
hwc->config = config;
return 0;
}
static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 *period)
{
int overflow;
/* ignore lower 4 bits in min count: */
overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
local64_set(&hwc->prev_count, 0);
return overflow;
}
static u64 get_ibs_fetch_count(u64 config)
{
return (config & IBS_FETCH_CNT) >> 12;
}
static u64 get_ibs_op_count(u64 config)
{
u64 count = 0;
if (config & IBS_OP_VAL)
count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
if (ibs_caps & IBS_CAPS_RDWROPCNT)
count += (config & IBS_OP_CUR_CNT) >> 32;
return count;
}
static void
perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
u64 *config)
{
u64 count = perf_ibs->get_count(*config);
/*
* Set width to 64 since we do not overflow on max width but
* instead on max count. In perf_ibs_set_period() we clear
* prev count manually on overflow.
*/
while (!perf_event_try_update(event, count, 64)) {
rdmsrl(event->hw.config_base, *config);
count = perf_ibs->get_count(*config);
}
}
static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 config)
{
wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
}
/*
* Erratum #420 Instruction-Based Sampling Engine May Generate
* Interrupt that Cannot Be Cleared:
*
* Must clear counter mask first, then clear the enable bit. See
* Revision Guide for AMD Family 10h Processors, Publication #41322.
*/
static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 config)
{
config &= ~perf_ibs->cnt_mask;
wrmsrl(hwc->config_base, config);
config &= ~perf_ibs->enable_mask;
wrmsrl(hwc->config_base, config);
}
/*
* We cannot restore the ibs pmu state, so we always needs to update
* the event while stopping it and then reset the state when starting
* again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
* perf_ibs_start()/perf_ibs_stop() and instead always do it.
*/
static void perf_ibs_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
u64 period;
if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
return;
WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
hwc->state = 0;
perf_ibs_set_period(perf_ibs, hwc, &period);
set_bit(IBS_STARTED, pcpu->state);
perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
perf_event_update_userpage(event);
}
static void perf_ibs_stop(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
u64 config;
int stopping;
stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
if (!stopping && (hwc->state & PERF_HES_UPTODATE))
return;
rdmsrl(hwc->config_base, config);
if (stopping) {
set_bit(IBS_STOPPING, pcpu->state);
perf_ibs_disable_event(perf_ibs, hwc, config);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
}
if (hwc->state & PERF_HES_UPTODATE)
return;
/*
* Clear valid bit to not count rollovers on update, rollovers
* are only updated in the irq handler.
*/
config &= ~perf_ibs->valid_mask;
perf_ibs_event_update(perf_ibs, event, &config);
hwc->state |= PERF_HES_UPTODATE;
}
static int perf_ibs_add(struct perf_event *event, int flags)
{
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
if (test_and_set_bit(IBS_ENABLED, pcpu->state))
return -ENOSPC;
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
pcpu->event = event;
if (flags & PERF_EF_START)
perf_ibs_start(event, PERF_EF_RELOAD);
return 0;
}
static void perf_ibs_del(struct perf_event *event, int flags)
{
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
return;
perf_ibs_stop(event, PERF_EF_UPDATE);
pcpu->event = NULL;
perf_event_update_userpage(event);
}
static void perf_ibs_read(struct perf_event *event) { }
PMU_FORMAT_ATTR(rand_en, "config:57");
PMU_FORMAT_ATTR(cnt_ctl, "config:19");
static struct attribute *ibs_fetch_format_attrs[] = {
&format_attr_rand_en.attr,
NULL,
};
static struct attribute *ibs_op_format_attrs[] = {
NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
NULL,
};
static struct perf_ibs perf_ibs_fetch = {
.pmu = {
.task_ctx_nr = perf_invalid_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
.del = perf_ibs_del,
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
},
.msr = MSR_AMD64_IBSFETCHCTL,
.config_mask = IBS_FETCH_CONFIG_MASK,
.cnt_mask = IBS_FETCH_MAX_CNT,
.enable_mask = IBS_FETCH_ENABLE,
.valid_mask = IBS_FETCH_VAL,
.max_period = IBS_FETCH_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
.offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
.format_attrs = ibs_fetch_format_attrs,
.get_count = get_ibs_fetch_count,
};
static struct perf_ibs perf_ibs_op = {
.pmu = {
.task_ctx_nr = perf_invalid_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
.del = perf_ibs_del,
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
},
.msr = MSR_AMD64_IBSOPCTL,
.config_mask = IBS_OP_CONFIG_MASK,
.cnt_mask = IBS_OP_MAX_CNT,
.enable_mask = IBS_OP_ENABLE,
.valid_mask = IBS_OP_VAL,
.max_period = IBS_OP_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
.offset_max = MSR_AMD64_IBSOP_REG_COUNT,
.format_attrs = ibs_op_format_attrs,
.get_count = get_ibs_op_count,
};
static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
{
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
struct perf_event *event = pcpu->event;
struct hw_perf_event *hwc = &event->hw;
struct perf_sample_data data;
struct perf_raw_record raw;
struct pt_regs regs;
struct perf_ibs_data ibs_data;
int offset, size, check_rip, offset_max, throttle = 0;
unsigned int msr;
u64 *buf, *config, period;
if (!test_bit(IBS_STARTED, pcpu->state)) {
/*
* Catch spurious interrupts after stopping IBS: After
* disabling IBS there could be still incoming NMIs
* with samples that even have the valid bit cleared.
* Mark all this NMIs as handled.
*/
return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
}
msr = hwc->config_base;
buf = ibs_data.regs;
rdmsrl(msr, *buf);
if (!(*buf++ & perf_ibs->valid_mask))
return 0;
config = &ibs_data.regs[0];
perf_ibs_event_update(perf_ibs, event, config);
perf_sample_data_init(&data, 0, hwc->last_period);
if (!perf_ibs_set_period(perf_ibs, hwc, &period))
goto out; /* no sw counter overflow */
ibs_data.caps = ibs_caps;
size = 1;
offset = 1;
check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
if (event->attr.sample_type & PERF_SAMPLE_RAW)
offset_max = perf_ibs->offset_max;
else if (check_rip)
offset_max = 2;
else
offset_max = 1;
do {
rdmsrl(msr + offset, *buf++);
size++;
offset = find_next_bit(perf_ibs->offset_mask,
perf_ibs->offset_max,
offset + 1);
} while (offset < offset_max);
ibs_data.size = sizeof(u64) * size;
regs = *iregs;
if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
regs.flags &= ~PERF_EFLAGS_EXACT;
} else {
set_linear_ip(&regs, ibs_data.regs[1]);
regs.flags |= PERF_EFLAGS_EXACT;
}
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.size = sizeof(u32) + ibs_data.size;
raw.data = ibs_data.data;
data.raw = &raw;
}
throttle = perf_event_overflow(event, &data, &regs);
out:
if (throttle)
perf_ibs_disable_event(perf_ibs, hwc, *config);
else
perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
perf_event_update_userpage(event);
return 1;
}
static int
perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
int handled = 0;
handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
if (handled)
inc_irq_stat(apic_perf_irqs);
return handled;
}
NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
{
struct cpu_perf_ibs __percpu *pcpu;
int ret;
pcpu = alloc_percpu(struct cpu_perf_ibs);
if (!pcpu)
return -ENOMEM;
perf_ibs->pcpu = pcpu;
/* register attributes */
if (perf_ibs->format_attrs[0]) {
memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
perf_ibs->format_group.name = "format";
perf_ibs->format_group.attrs = perf_ibs->format_attrs;
memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
perf_ibs->attr_groups[0] = &perf_ibs->format_group;
perf_ibs->pmu.attr_groups = perf_ibs->attr_groups;
}
ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
if (ret) {
perf_ibs->pcpu = NULL;
free_percpu(pcpu);
}
return ret;
}
static __init int perf_event_ibs_init(void)
{
struct attribute **attr = ibs_op_format_attrs;
if (!ibs_caps)
return -ENODEV; /* ibs not supported by the cpu */
perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
if (ibs_caps & IBS_CAPS_OPCNT) {
perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
*attr++ = &format_attr_cnt_ctl.attr;
}
perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
return 0;
}
#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
static __init int perf_event_ibs_init(void) { return 0; }
#endif
/* IBS - apic initialization, for perf and oprofile */
static __init u32 __get_ibs_caps(void)
{
u32 caps;
unsigned int max_level;
if (!boot_cpu_has(X86_FEATURE_IBS))
return 0;
/* check IBS cpuid feature flags */
max_level = cpuid_eax(0x80000000);
if (max_level < IBS_CPUID_FEATURES)
return IBS_CAPS_DEFAULT;
caps = cpuid_eax(IBS_CPUID_FEATURES);
if (!(caps & IBS_CAPS_AVAIL))
/* cpuid flags not valid */
return IBS_CAPS_DEFAULT;
return caps;
}
u32 get_ibs_caps(void)
{
return ibs_caps;
}
EXPORT_SYMBOL(get_ibs_caps);
static inline int get_eilvt(int offset)
{
return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
}
static inline int put_eilvt(int offset)
{
return !setup_APIC_eilvt(offset, 0, 0, 1);
}
/*
* Check and reserve APIC extended interrupt LVT offset for IBS if available.
*/
static inline int ibs_eilvt_valid(void)
{
int offset;
u64 val;
int valid = 0;
preempt_disable();
rdmsrl(MSR_AMD64_IBSCTL, val);
offset = val & IBSCTL_LVT_OFFSET_MASK;
if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
goto out;
}
if (!get_eilvt(offset)) {
pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
goto out;
}
valid = 1;
out:
preempt_enable();
return valid;
}
static int setup_ibs_ctl(int ibs_eilvt_off)
{
struct pci_dev *cpu_cfg;
int nodes;
u32 value = 0;
nodes = 0;
cpu_cfg = NULL;
do {
cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
PCI_DEVICE_ID_AMD_10H_NB_MISC,
cpu_cfg);
if (!cpu_cfg)
break;
++nodes;
pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
| IBSCTL_LVT_OFFSET_VALID);
pci_read_config_dword(cpu_cfg, IBSCTL, &value);
if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
pci_dev_put(cpu_cfg);
printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
"IBSCTL = 0x%08x\n", value);
return -EINVAL;
}
} while (1);
if (!nodes) {
printk(KERN_DEBUG "No CPU node configured for IBS\n");
return -ENODEV;
}
return 0;
}
/*
* This runs only on the current cpu. We try to find an LVT offset and
* setup the local APIC. For this we must disable preemption. On
* success we initialize all nodes with this offset. This updates then
* the offset in the IBS_CTL per-node msr. The per-core APIC setup of
* the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
* is using the new offset.
*/
static int force_ibs_eilvt_setup(void)
{
int offset;
int ret;
preempt_disable();
/* find the next free available EILVT entry, skip offset 0 */
for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
if (get_eilvt(offset))
break;
}
preempt_enable();
if (offset == APIC_EILVT_NR_MAX) {
printk(KERN_DEBUG "No EILVT entry available\n");
return -EBUSY;
}
ret = setup_ibs_ctl(offset);
if (ret)
goto out;
if (!ibs_eilvt_valid()) {
ret = -EFAULT;
goto out;
}
pr_info("IBS: LVT offset %d assigned\n", offset);
return 0;
out:
preempt_disable();
put_eilvt(offset);
preempt_enable();
return ret;
}
static void ibs_eilvt_setup(void)
{
/*
* Force LVT offset assignment for family 10h: The offsets are
* not assigned by the BIOS for this family, so the OS is
* responsible for doing it. If the OS assignment fails, fall
* back to BIOS settings and try to setup this.
*/
if (boot_cpu_data.x86 == 0x10)
force_ibs_eilvt_setup();
}
static inline int get_ibs_lvt_offset(void)
{
u64 val;
rdmsrl(MSR_AMD64_IBSCTL, val);
if (!(val & IBSCTL_LVT_OFFSET_VALID))
return -EINVAL;
return val & IBSCTL_LVT_OFFSET_MASK;
}
static void setup_APIC_ibs(void *dummy)
{
int offset;
offset = get_ibs_lvt_offset();
if (offset < 0)
goto failed;
if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
return;
failed:
pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
smp_processor_id());
}
static void clear_APIC_ibs(void *dummy)
{
int offset;
offset = get_ibs_lvt_offset();
if (offset >= 0)
setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
}
#ifdef CONFIG_PM
static int perf_ibs_suspend(void)
{
clear_APIC_ibs(NULL);
return 0;
}
static void perf_ibs_resume(void)
{
ibs_eilvt_setup();
setup_APIC_ibs(NULL);
}
static struct syscore_ops perf_ibs_syscore_ops = {
.resume = perf_ibs_resume,
.suspend = perf_ibs_suspend,
};
static void perf_ibs_pm_init(void)
{
register_syscore_ops(&perf_ibs_syscore_ops);
}
#else
static inline void perf_ibs_pm_init(void) { }
#endif
static int
perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_STARTING:
setup_APIC_ibs(NULL);
break;
case CPU_DYING:
clear_APIC_ibs(NULL);
break;
default:
break;
}
return NOTIFY_OK;
}
static __init int amd_ibs_init(void)
{
u32 caps;
int ret = -EINVAL;
caps = __get_ibs_caps();
if (!caps)
return -ENODEV; /* ibs not supported by the cpu */
ibs_eilvt_setup();
if (!ibs_eilvt_valid())
goto out;
perf_ibs_pm_init();
cpu_notifier_register_begin();
ibs_caps = caps;
/* make ibs_caps visible to other cpus: */
smp_mb();
smp_call_function(setup_APIC_ibs, NULL, 1);
__perf_cpu_notifier(perf_ibs_cpu_notifier);
cpu_notifier_register_done();
ret = perf_event_ibs_init();
out:
if (ret)
pr_err("Failed to setup IBS, %d\n", ret);
return ret;
}
/* Since we need the pci subsystem to init ibs we can't do this earlier: */
device_initcall(amd_ibs_init);

View file

@ -0,0 +1,502 @@
/*
* Copyright (C) 2013 Advanced Micro Devices, Inc.
*
* Author: Steven Kinney <Steven.Kinney@amd.com>
* Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
*
* Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/perf_event.h>
#include <linux/module.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include "perf_event.h"
#include "perf_event_amd_iommu.h"
#define COUNTER_SHIFT 16
#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8))
#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg))
/* iommu pmu config masks */
#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL))
#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL)
#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL)
#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL)
#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL)
#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
static struct perf_amd_iommu __perf_iommu;
struct perf_amd_iommu {
struct pmu pmu;
u8 max_banks;
u8 max_counters;
u64 cntr_assign_mask;
raw_spinlock_t lock;
const struct attribute_group *attr_groups[4];
};
#define format_group attr_groups[0]
#define cpumask_group attr_groups[1]
#define events_group attr_groups[2]
#define null_group attr_groups[3]
/*---------------------------------------------
* sysfs format attributes
*---------------------------------------------*/
PMU_FORMAT_ATTR(csource, "config:0-7");
PMU_FORMAT_ATTR(devid, "config:8-23");
PMU_FORMAT_ATTR(pasid, "config:24-39");
PMU_FORMAT_ATTR(domid, "config:40-55");
PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
static struct attribute *iommu_format_attrs[] = {
&format_attr_csource.attr,
&format_attr_devid.attr,
&format_attr_pasid.attr,
&format_attr_domid.attr,
&format_attr_devid_mask.attr,
&format_attr_pasid_mask.attr,
&format_attr_domid_mask.attr,
NULL,
};
static struct attribute_group amd_iommu_format_group = {
.name = "format",
.attrs = iommu_format_attrs,
};
/*---------------------------------------------
* sysfs events attributes
*---------------------------------------------*/
struct amd_iommu_event_desc {
struct kobj_attribute attr;
const char *event;
};
static ssize_t _iommu_event_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct amd_iommu_event_desc *event =
container_of(attr, struct amd_iommu_event_desc, attr);
return sprintf(buf, "%s\n", event->event);
}
#define AMD_IOMMU_EVENT_DESC(_name, _event) \
{ \
.attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \
.event = _event, \
}
static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"),
AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"),
AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"),
AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"),
AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"),
AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"),
AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"),
AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"),
AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"),
AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"),
AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"),
AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"),
AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"),
AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"),
AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"),
AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"),
AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"),
AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"),
AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"),
{ /* end: all zeroes */ },
};
/*---------------------------------------------
* sysfs cpumask attributes
*---------------------------------------------*/
static cpumask_t iommu_cpumask;
static ssize_t _iommu_cpumask_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask);
buf[n++] = '\n';
buf[n] = '\0';
return n;
}
static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
static struct attribute *iommu_cpumask_attrs[] = {
&dev_attr_cpumask.attr,
NULL,
};
static struct attribute_group amd_iommu_cpumask_group = {
.attrs = iommu_cpumask_attrs,
};
/*---------------------------------------------*/
static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
{
unsigned long flags;
int shift, bank, cntr, retval;
int max_banks = perf_iommu->max_banks;
int max_cntrs = perf_iommu->max_counters;
raw_spin_lock_irqsave(&perf_iommu->lock, flags);
for (bank = 0, shift = 0; bank < max_banks; bank++) {
for (cntr = 0; cntr < max_cntrs; cntr++) {
shift = bank + (bank*3) + cntr;
if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
continue;
} else {
perf_iommu->cntr_assign_mask |= (1ULL<<shift);
retval = ((u16)((u16)bank<<8) | (u8)(cntr));
goto out;
}
}
}
retval = -ENOSPC;
out:
raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
return retval;
}
static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
u8 bank, u8 cntr)
{
unsigned long flags;
int max_banks, max_cntrs;
int shift = 0;
max_banks = perf_iommu->max_banks;
max_cntrs = perf_iommu->max_counters;
if ((bank > max_banks) || (cntr > max_cntrs))
return -EINVAL;
shift = bank + cntr + (bank*3);
raw_spin_lock_irqsave(&perf_iommu->lock, flags);
perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
return 0;
}
static int perf_iommu_event_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_amd_iommu *perf_iommu;
u64 config, config1;
/* test the event attr type check for PMU enumeration */
if (event->attr.type != event->pmu->type)
return -ENOENT;
/*
* IOMMU counters are shared across all cores.
* Therefore, it does not support per-process mode.
* Also, it does not support event sampling mode.
*/
if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
return -EINVAL;
/* IOMMU counters do not have usr/os/guest/host bits */
if (event->attr.exclude_user || event->attr.exclude_kernel ||
event->attr.exclude_host || event->attr.exclude_guest)
return -EINVAL;
if (event->cpu < 0)
return -EINVAL;
perf_iommu = &__perf_iommu;
if (event->pmu != &perf_iommu->pmu)
return -ENOENT;
if (perf_iommu) {
config = event->attr.config;
config1 = event->attr.config1;
} else {
return -EINVAL;
}
/* integrate with iommu base devid (0000), assume one iommu */
perf_iommu->max_banks =
amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
perf_iommu->max_counters =
amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
return -EINVAL;
/* update the hw_perf_event struct with the iommu config data */
hwc->config = config;
hwc->extra_reg.config = config1;
return 0;
}
static void perf_iommu_enable_event(struct perf_event *ev)
{
u8 csource = _GET_CSOURCE(ev);
u16 devid = _GET_DEVID(ev);
u64 reg = 0ULL;
reg = csource;
amd_iommu_pc_get_set_reg_val(devid,
_GET_BANK(ev), _GET_CNTR(ev) ,
IOMMU_PC_COUNTER_SRC_REG, &reg, true);
reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
if (reg)
reg |= (1UL << 31);
amd_iommu_pc_get_set_reg_val(devid,
_GET_BANK(ev), _GET_CNTR(ev) ,
IOMMU_PC_DEVID_MATCH_REG, &reg, true);
reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
if (reg)
reg |= (1UL << 31);
amd_iommu_pc_get_set_reg_val(devid,
_GET_BANK(ev), _GET_CNTR(ev) ,
IOMMU_PC_PASID_MATCH_REG, &reg, true);
reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
if (reg)
reg |= (1UL << 31);
amd_iommu_pc_get_set_reg_val(devid,
_GET_BANK(ev), _GET_CNTR(ev) ,
IOMMU_PC_DOMID_MATCH_REG, &reg, true);
}
static void perf_iommu_disable_event(struct perf_event *event)
{
u64 reg = 0ULL;
amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
_GET_BANK(event), _GET_CNTR(event),
IOMMU_PC_COUNTER_SRC_REG, &reg, true);
}
static void perf_iommu_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
pr_debug("perf: amd_iommu:perf_iommu_start\n");
if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
return;
WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
hwc->state = 0;
if (flags & PERF_EF_RELOAD) {
u64 prev_raw_count = local64_read(&hwc->prev_count);
amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
_GET_BANK(event), _GET_CNTR(event),
IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
}
perf_iommu_enable_event(event);
perf_event_update_userpage(event);
}
static void perf_iommu_read(struct perf_event *event)
{
u64 count = 0ULL;
u64 prev_raw_count = 0ULL;
u64 delta = 0ULL;
struct hw_perf_event *hwc = &event->hw;
pr_debug("perf: amd_iommu:perf_iommu_read\n");
amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
_GET_BANK(event), _GET_CNTR(event),
IOMMU_PC_COUNTER_REG, &count, false);
/* IOMMU pc counter register is only 48 bits */
count &= 0xFFFFFFFFFFFFULL;
prev_raw_count = local64_read(&hwc->prev_count);
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
count) != prev_raw_count)
return;
/* Handling 48-bit counter overflowing */
delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
delta >>= COUNTER_SHIFT;
local64_add(delta, &event->count);
}
static void perf_iommu_stop(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
u64 config;
pr_debug("perf: amd_iommu:perf_iommu_stop\n");
if (hwc->state & PERF_HES_UPTODATE)
return;
perf_iommu_disable_event(event);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
if (hwc->state & PERF_HES_UPTODATE)
return;
config = hwc->config;
perf_iommu_read(event);
hwc->state |= PERF_HES_UPTODATE;
}
static int perf_iommu_add(struct perf_event *event, int flags)
{
int retval;
struct perf_amd_iommu *perf_iommu =
container_of(event->pmu, struct perf_amd_iommu, pmu);
pr_debug("perf: amd_iommu:perf_iommu_add\n");
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
/* request an iommu bank/counter */
retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
if (retval != -ENOSPC)
event->hw.extra_reg.reg = (u16)retval;
else
return retval;
if (flags & PERF_EF_START)
perf_iommu_start(event, PERF_EF_RELOAD);
return 0;
}
static void perf_iommu_del(struct perf_event *event, int flags)
{
struct perf_amd_iommu *perf_iommu =
container_of(event->pmu, struct perf_amd_iommu, pmu);
pr_debug("perf: amd_iommu:perf_iommu_del\n");
perf_iommu_stop(event, PERF_EF_UPDATE);
/* clear the assigned iommu bank/counter */
clear_avail_iommu_bnk_cntr(perf_iommu,
_GET_BANK(event),
_GET_CNTR(event));
perf_event_update_userpage(event);
}
static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
{
struct attribute **attrs;
struct attribute_group *attr_group;
int i = 0, j;
while (amd_iommu_v2_event_descs[i].attr.attr.name)
i++;
attr_group = kzalloc(sizeof(struct attribute *)
* (i + 1) + sizeof(*attr_group), GFP_KERNEL);
if (!attr_group)
return -ENOMEM;
attrs = (struct attribute **)(attr_group + 1);
for (j = 0; j < i; j++)
attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
attr_group->name = "events";
attr_group->attrs = attrs;
perf_iommu->events_group = attr_group;
return 0;
}
static __init void amd_iommu_pc_exit(void)
{
if (__perf_iommu.events_group != NULL) {
kfree(__perf_iommu.events_group);
__perf_iommu.events_group = NULL;
}
}
static __init int _init_perf_amd_iommu(
struct perf_amd_iommu *perf_iommu, char *name)
{
int ret;
raw_spin_lock_init(&perf_iommu->lock);
/* Init format attributes */
perf_iommu->format_group = &amd_iommu_format_group;
/* Init cpumask attributes to only core 0 */
cpumask_set_cpu(0, &iommu_cpumask);
perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
/* Init events attributes */
if (_init_events_attrs(perf_iommu) != 0)
pr_err("perf: amd_iommu: Only support raw events.\n");
/* Init null attributes */
perf_iommu->null_group = NULL;
perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
if (ret) {
pr_err("perf: amd_iommu: Failed to initialized.\n");
amd_iommu_pc_exit();
} else {
pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
}
return ret;
}
static struct perf_amd_iommu __perf_iommu = {
.pmu = {
.event_init = perf_iommu_event_init,
.add = perf_iommu_add,
.del = perf_iommu_del,
.start = perf_iommu_start,
.stop = perf_iommu_stop,
.read = perf_iommu_read,
},
.max_banks = 0x00,
.max_counters = 0x00,
.cntr_assign_mask = 0ULL,
.format_group = NULL,
.cpumask_group = NULL,
.events_group = NULL,
.null_group = NULL,
};
static __init int amd_iommu_pc_init(void)
{
/* Make sure the IOMMU PC resource is available */
if (!amd_iommu_pc_supported())
return -ENODEV;
_init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
return 0;
}
device_initcall(amd_iommu_pc_init);

View file

@ -0,0 +1,40 @@
/*
* Copyright (C) 2013 Advanced Micro Devices, Inc.
*
* Author: Steven Kinney <Steven.Kinney@amd.com>
* Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#ifndef _PERF_EVENT_AMD_IOMMU_H_
#define _PERF_EVENT_AMD_IOMMU_H_
/* iommu pc mmio region register indexes */
#define IOMMU_PC_COUNTER_REG 0x00
#define IOMMU_PC_COUNTER_SRC_REG 0x08
#define IOMMU_PC_PASID_MATCH_REG 0x10
#define IOMMU_PC_DOMID_MATCH_REG 0x18
#define IOMMU_PC_DEVID_MATCH_REG 0x20
#define IOMMU_PC_COUNTER_REPORT_REG 0x28
/* maximun specified bank/counters */
#define PC_MAX_SPEC_BNKS 64
#define PC_MAX_SPEC_CNTRS 16
/* iommu pc reg masks*/
#define IOMMU_BASE_DEVID 0x0000
/* amd_iommu_init.c external support functions */
extern bool amd_iommu_pc_supported(void);
extern u8 amd_iommu_pc_get_max_banks(u16 devid);
extern u8 amd_iommu_pc_get_max_counters(u16 devid);
extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
u8 fxn, u64 *value, bool is_write);
#endif /*_PERF_EVENT_AMD_IOMMU_H_*/

View file

@ -0,0 +1,604 @@
/*
* Copyright (C) 2013 Advanced Micro Devices, Inc.
*
* Author: Jacob Shin <jacob.shin@amd.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <asm/cpufeature.h>
#include <asm/perf_event.h>
#include <asm/msr.h>
#define NUM_COUNTERS_NB 4
#define NUM_COUNTERS_L2 4
#define MAX_COUNTERS NUM_COUNTERS_NB
#define RDPMC_BASE_NB 6
#define RDPMC_BASE_L2 10
#define COUNTER_SHIFT 16
struct amd_uncore {
int id;
int refcnt;
int cpu;
int num_counters;
int rdpmc_base;
u32 msr_base;
cpumask_t *active_mask;
struct pmu *pmu;
struct perf_event *events[MAX_COUNTERS];
struct amd_uncore *free_when_cpu_online;
};
static struct amd_uncore * __percpu *amd_uncore_nb;
static struct amd_uncore * __percpu *amd_uncore_l2;
static struct pmu amd_nb_pmu;
static struct pmu amd_l2_pmu;
static cpumask_t amd_nb_active_mask;
static cpumask_t amd_l2_active_mask;
static bool is_nb_event(struct perf_event *event)
{
return event->pmu->type == amd_nb_pmu.type;
}
static bool is_l2_event(struct perf_event *event)
{
return event->pmu->type == amd_l2_pmu.type;
}
static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
{
if (is_nb_event(event) && amd_uncore_nb)
return *per_cpu_ptr(amd_uncore_nb, event->cpu);
else if (is_l2_event(event) && amd_uncore_l2)
return *per_cpu_ptr(amd_uncore_l2, event->cpu);
return NULL;
}
static void amd_uncore_read(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 prev, new;
s64 delta;
/*
* since we do not enable counter overflow interrupts,
* we do not have to worry about prev_count changing on us
*/
prev = local64_read(&hwc->prev_count);
rdpmcl(hwc->event_base_rdpmc, new);
local64_set(&hwc->prev_count, new);
delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
delta >>= COUNTER_SHIFT;
local64_add(delta, &event->count);
}
static void amd_uncore_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
if (flags & PERF_EF_RELOAD)
wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
hwc->state = 0;
wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
perf_event_update_userpage(event);
}
static void amd_uncore_stop(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
wrmsrl(hwc->config_base, hwc->config);
hwc->state |= PERF_HES_STOPPED;
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
amd_uncore_read(event);
hwc->state |= PERF_HES_UPTODATE;
}
}
static int amd_uncore_add(struct perf_event *event, int flags)
{
int i;
struct amd_uncore *uncore = event_to_amd_uncore(event);
struct hw_perf_event *hwc = &event->hw;
/* are we already assigned? */
if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
goto out;
for (i = 0; i < uncore->num_counters; i++) {
if (uncore->events[i] == event) {
hwc->idx = i;
goto out;
}
}
/* if not, take the first available counter */
hwc->idx = -1;
for (i = 0; i < uncore->num_counters; i++) {
if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
hwc->idx = i;
break;
}
}
out:
if (hwc->idx == -1)
return -EBUSY;
hwc->config_base = uncore->msr_base + (2 * hwc->idx);
hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (flags & PERF_EF_START)
amd_uncore_start(event, PERF_EF_RELOAD);
return 0;
}
static void amd_uncore_del(struct perf_event *event, int flags)
{
int i;
struct amd_uncore *uncore = event_to_amd_uncore(event);
struct hw_perf_event *hwc = &event->hw;
amd_uncore_stop(event, PERF_EF_UPDATE);
for (i = 0; i < uncore->num_counters; i++) {
if (cmpxchg(&uncore->events[i], event, NULL) == event)
break;
}
hwc->idx = -1;
}
static int amd_uncore_event_init(struct perf_event *event)
{
struct amd_uncore *uncore;
struct hw_perf_event *hwc = &event->hw;
if (event->attr.type != event->pmu->type)
return -ENOENT;
/*
* NB and L2 counters (MSRs) are shared across all cores that share the
* same NB / L2 cache. Interrupts can be directed to a single target
* core, however, event counts generated by processes running on other
* cores cannot be masked out. So we do not support sampling and
* per-thread events.
*/
if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
return -EINVAL;
/* NB and L2 counters do not have usr/os/guest/host bits */
if (event->attr.exclude_user || event->attr.exclude_kernel ||
event->attr.exclude_host || event->attr.exclude_guest)
return -EINVAL;
/* and we do not enable counter overflow interrupts */
hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
hwc->idx = -1;
if (event->cpu < 0)
return -EINVAL;
uncore = event_to_amd_uncore(event);
if (!uncore)
return -ENODEV;
/*
* since request can come in to any of the shared cores, we will remap
* to a single common cpu.
*/
event->cpu = uncore->cpu;
return 0;
}
static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf)
{
int n;
cpumask_t *active_mask;
struct pmu *pmu = dev_get_drvdata(dev);
if (pmu->type == amd_nb_pmu.type)
active_mask = &amd_nb_active_mask;
else if (pmu->type == amd_l2_pmu.type)
active_mask = &amd_l2_active_mask;
else
return 0;
n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask);
buf[n++] = '\n';
buf[n] = '\0';
return n;
}
static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
static struct attribute *amd_uncore_attrs[] = {
&dev_attr_cpumask.attr,
NULL,
};
static struct attribute_group amd_uncore_attr_group = {
.attrs = amd_uncore_attrs,
};
PMU_FORMAT_ATTR(event, "config:0-7,32-35");
PMU_FORMAT_ATTR(umask, "config:8-15");
static struct attribute *amd_uncore_format_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
NULL,
};
static struct attribute_group amd_uncore_format_group = {
.name = "format",
.attrs = amd_uncore_format_attr,
};
static const struct attribute_group *amd_uncore_attr_groups[] = {
&amd_uncore_attr_group,
&amd_uncore_format_group,
NULL,
};
static struct pmu amd_nb_pmu = {
.attr_groups = amd_uncore_attr_groups,
.name = "amd_nb",
.event_init = amd_uncore_event_init,
.add = amd_uncore_add,
.del = amd_uncore_del,
.start = amd_uncore_start,
.stop = amd_uncore_stop,
.read = amd_uncore_read,
};
static struct pmu amd_l2_pmu = {
.attr_groups = amd_uncore_attr_groups,
.name = "amd_l2",
.event_init = amd_uncore_event_init,
.add = amd_uncore_add,
.del = amd_uncore_del,
.start = amd_uncore_start,
.stop = amd_uncore_stop,
.read = amd_uncore_read,
};
static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
{
return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
cpu_to_node(cpu));
}
static int amd_uncore_cpu_up_prepare(unsigned int cpu)
{
struct amd_uncore *uncore_nb = NULL, *uncore_l2;
if (amd_uncore_nb) {
uncore_nb = amd_uncore_alloc(cpu);
if (!uncore_nb)
goto fail;
uncore_nb->cpu = cpu;
uncore_nb->num_counters = NUM_COUNTERS_NB;
uncore_nb->rdpmc_base = RDPMC_BASE_NB;
uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
uncore_nb->active_mask = &amd_nb_active_mask;
uncore_nb->pmu = &amd_nb_pmu;
*per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
}
if (amd_uncore_l2) {
uncore_l2 = amd_uncore_alloc(cpu);
if (!uncore_l2)
goto fail;
uncore_l2->cpu = cpu;
uncore_l2->num_counters = NUM_COUNTERS_L2;
uncore_l2->rdpmc_base = RDPMC_BASE_L2;
uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
uncore_l2->active_mask = &amd_l2_active_mask;
uncore_l2->pmu = &amd_l2_pmu;
*per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
}
return 0;
fail:
kfree(uncore_nb);
return -ENOMEM;
}
static struct amd_uncore *
amd_uncore_find_online_sibling(struct amd_uncore *this,
struct amd_uncore * __percpu *uncores)
{
unsigned int cpu;
struct amd_uncore *that;
for_each_online_cpu(cpu) {
that = *per_cpu_ptr(uncores, cpu);
if (!that)
continue;
if (this == that)
continue;
if (this->id == that->id) {
that->free_when_cpu_online = this;
this = that;
break;
}
}
this->refcnt++;
return this;
}
static void amd_uncore_cpu_starting(unsigned int cpu)
{
unsigned int eax, ebx, ecx, edx;
struct amd_uncore *uncore;
if (amd_uncore_nb) {
uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
uncore->id = ecx & 0xff;
uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
*per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
}
if (amd_uncore_l2) {
unsigned int apicid = cpu_data(cpu).apicid;
unsigned int nshared;
uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
nshared = ((eax >> 14) & 0xfff) + 1;
uncore->id = apicid - (apicid % nshared);
uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
*per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
}
}
static void uncore_online(unsigned int cpu,
struct amd_uncore * __percpu *uncores)
{
struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
kfree(uncore->free_when_cpu_online);
uncore->free_when_cpu_online = NULL;
if (cpu == uncore->cpu)
cpumask_set_cpu(cpu, uncore->active_mask);
}
static void amd_uncore_cpu_online(unsigned int cpu)
{
if (amd_uncore_nb)
uncore_online(cpu, amd_uncore_nb);
if (amd_uncore_l2)
uncore_online(cpu, amd_uncore_l2);
}
static void uncore_down_prepare(unsigned int cpu,
struct amd_uncore * __percpu *uncores)
{
unsigned int i;
struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
if (this->cpu != cpu)
return;
/* this cpu is going down, migrate to a shared sibling if possible */
for_each_online_cpu(i) {
struct amd_uncore *that = *per_cpu_ptr(uncores, i);
if (cpu == i)
continue;
if (this == that) {
perf_pmu_migrate_context(this->pmu, cpu, i);
cpumask_clear_cpu(cpu, that->active_mask);
cpumask_set_cpu(i, that->active_mask);
that->cpu = i;
break;
}
}
}
static void amd_uncore_cpu_down_prepare(unsigned int cpu)
{
if (amd_uncore_nb)
uncore_down_prepare(cpu, amd_uncore_nb);
if (amd_uncore_l2)
uncore_down_prepare(cpu, amd_uncore_l2);
}
static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
{
struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
if (cpu == uncore->cpu)
cpumask_clear_cpu(cpu, uncore->active_mask);
if (!--uncore->refcnt)
kfree(uncore);
*per_cpu_ptr(uncores, cpu) = NULL;
}
static void amd_uncore_cpu_dead(unsigned int cpu)
{
if (amd_uncore_nb)
uncore_dead(cpu, amd_uncore_nb);
if (amd_uncore_l2)
uncore_dead(cpu, amd_uncore_l2);
}
static int
amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
void *hcpu)
{
unsigned int cpu = (long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
if (amd_uncore_cpu_up_prepare(cpu))
return notifier_from_errno(-ENOMEM);
break;
case CPU_STARTING:
amd_uncore_cpu_starting(cpu);
break;
case CPU_ONLINE:
amd_uncore_cpu_online(cpu);
break;
case CPU_DOWN_PREPARE:
amd_uncore_cpu_down_prepare(cpu);
break;
case CPU_UP_CANCELED:
case CPU_DEAD:
amd_uncore_cpu_dead(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block amd_uncore_cpu_notifier_block = {
.notifier_call = amd_uncore_cpu_notifier,
.priority = CPU_PRI_PERF + 1,
};
static void __init init_cpu_already_online(void *dummy)
{
unsigned int cpu = smp_processor_id();
amd_uncore_cpu_starting(cpu);
amd_uncore_cpu_online(cpu);
}
static void cleanup_cpu_online(void *dummy)
{
unsigned int cpu = smp_processor_id();
amd_uncore_cpu_dead(cpu);
}
static int __init amd_uncore_init(void)
{
unsigned int cpu, cpu2;
int ret = -ENODEV;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
goto fail_nodev;
if (!cpu_has_topoext)
goto fail_nodev;
if (cpu_has_perfctr_nb) {
amd_uncore_nb = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_nb) {
ret = -ENOMEM;
goto fail_nb;
}
ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
if (ret)
goto fail_nb;
printk(KERN_INFO "perf: AMD NB counters detected\n");
ret = 0;
}
if (cpu_has_perfctr_l2) {
amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_l2) {
ret = -ENOMEM;
goto fail_l2;
}
ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
if (ret)
goto fail_l2;
printk(KERN_INFO "perf: AMD L2I counters detected\n");
ret = 0;
}
if (ret)
goto fail_nodev;
cpu_notifier_register_begin();
/* init cpus already online before registering for hotplug notifier */
for_each_online_cpu(cpu) {
ret = amd_uncore_cpu_up_prepare(cpu);
if (ret)
goto fail_online;
smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
}
__register_cpu_notifier(&amd_uncore_cpu_notifier_block);
cpu_notifier_register_done();
return 0;
fail_online:
for_each_online_cpu(cpu2) {
if (cpu2 == cpu)
break;
smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1);
}
cpu_notifier_register_done();
/* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
amd_uncore_nb = amd_uncore_l2 = NULL;
if (cpu_has_perfctr_l2)
perf_pmu_unregister(&amd_l2_pmu);
fail_l2:
if (cpu_has_perfctr_nb)
perf_pmu_unregister(&amd_nb_pmu);
if (amd_uncore_l2)
free_percpu(amd_uncore_l2);
fail_nb:
if (amd_uncore_nb)
free_percpu(amd_uncore_nb);
fail_nodev:
return ret;
}
device_initcall(amd_uncore_init);

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,779 @@
#include <linux/perf_event.h>
#include <linux/types.h>
#include <asm/perf_event.h>
#include <asm/msr.h>
#include <asm/insn.h>
#include "perf_event.h"
enum {
LBR_FORMAT_32 = 0x00,
LBR_FORMAT_LIP = 0x01,
LBR_FORMAT_EIP = 0x02,
LBR_FORMAT_EIP_FLAGS = 0x03,
LBR_FORMAT_EIP_FLAGS2 = 0x04,
LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2,
};
static enum {
LBR_EIP_FLAGS = 1,
LBR_TSX = 2,
} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
[LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS,
[LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
};
/*
* Intel LBR_SELECT bits
* Intel Vol3a, April 2011, Section 16.7 Table 16-10
*
* Hardware branch filter (not available on all CPUs)
*/
#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */
#define LBR_USER_BIT 1 /* do not capture at ring > 0 */
#define LBR_JCC_BIT 2 /* do not capture conditional branches */
#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */
#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */
#define LBR_RETURN_BIT 5 /* do not capture near returns */
#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
#define LBR_FAR_BIT 8 /* do not capture far branches */
#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
#define LBR_USER (1 << LBR_USER_BIT)
#define LBR_JCC (1 << LBR_JCC_BIT)
#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT)
#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT)
#define LBR_RETURN (1 << LBR_RETURN_BIT)
#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
#define LBR_FAR (1 << LBR_FAR_BIT)
#define LBR_PLM (LBR_KERNEL | LBR_USER)
#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */
#define LBR_NOT_SUPP -1 /* LBR filter not supported */
#define LBR_IGN 0 /* ignored */
#define LBR_ANY \
(LBR_JCC |\
LBR_REL_CALL |\
LBR_IND_CALL |\
LBR_RETURN |\
LBR_REL_JMP |\
LBR_IND_JMP |\
LBR_FAR)
#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
#define LBR_FROM_FLAG_IN_TX (1ULL << 62)
#define LBR_FROM_FLAG_ABORT (1ULL << 61)
#define for_each_branch_sample_type(x) \
for ((x) = PERF_SAMPLE_BRANCH_USER; \
(x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
/*
* x86control flow change classification
* x86control flow changes include branches, interrupts, traps, faults
*/
enum {
X86_BR_NONE = 0, /* unknown */
X86_BR_USER = 1 << 0, /* branch target is user */
X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
X86_BR_CALL = 1 << 2, /* call */
X86_BR_RET = 1 << 3, /* return */
X86_BR_SYSCALL = 1 << 4, /* syscall */
X86_BR_SYSRET = 1 << 5, /* syscall return */
X86_BR_INT = 1 << 6, /* sw interrupt */
X86_BR_IRET = 1 << 7, /* return from interrupt */
X86_BR_JCC = 1 << 8, /* conditional */
X86_BR_JMP = 1 << 9, /* jump */
X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
X86_BR_IND_CALL = 1 << 11,/* indirect calls */
X86_BR_ABORT = 1 << 12,/* transaction abort */
X86_BR_IN_TX = 1 << 13,/* in transaction */
X86_BR_NO_TX = 1 << 14,/* not in transaction */
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
#define X86_BR_ANY \
(X86_BR_CALL |\
X86_BR_RET |\
X86_BR_SYSCALL |\
X86_BR_SYSRET |\
X86_BR_INT |\
X86_BR_IRET |\
X86_BR_JCC |\
X86_BR_JMP |\
X86_BR_IRQ |\
X86_BR_ABORT |\
X86_BR_IND_CALL)
#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
#define X86_BR_ANY_CALL \
(X86_BR_CALL |\
X86_BR_IND_CALL |\
X86_BR_SYSCALL |\
X86_BR_IRQ |\
X86_BR_INT)
static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
/*
* We only support LBR implementations that have FREEZE_LBRS_ON_PMI
* otherwise it becomes near impossible to get a reliable stack.
*/
static void __intel_pmu_lbr_enable(void)
{
u64 debugctl;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_sel)
wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}
static void __intel_pmu_lbr_disable(void)
{
u64 debugctl;
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}
static void intel_pmu_lbr_reset_32(void)
{
int i;
for (i = 0; i < x86_pmu.lbr_nr; i++)
wrmsrl(x86_pmu.lbr_from + i, 0);
}
static void intel_pmu_lbr_reset_64(void)
{
int i;
for (i = 0; i < x86_pmu.lbr_nr; i++) {
wrmsrl(x86_pmu.lbr_from + i, 0);
wrmsrl(x86_pmu.lbr_to + i, 0);
}
}
void intel_pmu_lbr_reset(void)
{
if (!x86_pmu.lbr_nr)
return;
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
intel_pmu_lbr_reset_32();
else
intel_pmu_lbr_reset_64();
}
void intel_pmu_lbr_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu.lbr_nr)
return;
/*
* Reset the LBR stack if we changed task context to
* avoid data leaks.
*/
if (event->ctx->task && cpuc->lbr_context != event->ctx) {
intel_pmu_lbr_reset();
cpuc->lbr_context = event->ctx;
}
cpuc->br_sel = event->hw.branch_reg.reg;
cpuc->lbr_users++;
}
void intel_pmu_lbr_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu.lbr_nr)
return;
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
if (cpuc->enabled && !cpuc->lbr_users) {
__intel_pmu_lbr_disable();
/* avoid stale pointer */
cpuc->lbr_context = NULL;
}
}
void intel_pmu_lbr_enable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_users)
__intel_pmu_lbr_enable();
}
void intel_pmu_lbr_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_users)
__intel_pmu_lbr_disable();
}
/*
* TOS = most recently recorded branch
*/
static inline u64 intel_pmu_lbr_tos(void)
{
u64 tos;
rdmsrl(x86_pmu.lbr_tos, tos);
return tos;
}
static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
{
unsigned long mask = x86_pmu.lbr_nr - 1;
u64 tos = intel_pmu_lbr_tos();
int i;
for (i = 0; i < x86_pmu.lbr_nr; i++) {
unsigned long lbr_idx = (tos - i) & mask;
union {
struct {
u32 from;
u32 to;
};
u64 lbr;
} msr_lastbranch;
rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
cpuc->lbr_entries[i].from = msr_lastbranch.from;
cpuc->lbr_entries[i].to = msr_lastbranch.to;
cpuc->lbr_entries[i].mispred = 0;
cpuc->lbr_entries[i].predicted = 0;
cpuc->lbr_entries[i].reserved = 0;
}
cpuc->lbr_stack.nr = i;
}
/*
* Due to lack of segmentation in Linux the effective address (offset)
* is the same as the linear address, allowing us to merge the LIP and EIP
* LBR formats.
*/
static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
unsigned long mask = x86_pmu.lbr_nr - 1;
int lbr_format = x86_pmu.intel_cap.lbr_format;
u64 tos = intel_pmu_lbr_tos();
int i;
int out = 0;
for (i = 0; i < x86_pmu.lbr_nr; i++) {
unsigned long lbr_idx = (tos - i) & mask;
u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
int skip = 0;
int lbr_flags = lbr_desc[lbr_format];
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
if (lbr_flags & LBR_EIP_FLAGS) {
mis = !!(from & LBR_FROM_FLAG_MISPRED);
pred = !mis;
skip = 1;
}
if (lbr_flags & LBR_TSX) {
in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
abort = !!(from & LBR_FROM_FLAG_ABORT);
skip = 3;
}
from = (u64)((((s64)from) << skip) >> skip);
/*
* Some CPUs report duplicated abort records,
* with the second entry not having an abort bit set.
* Skip them here. This loop runs backwards,
* so we need to undo the previous record.
* If the abort just happened outside the window
* the extra entry cannot be removed.
*/
if (abort && x86_pmu.lbr_double_abort && out > 0)
out--;
cpuc->lbr_entries[out].from = from;
cpuc->lbr_entries[out].to = to;
cpuc->lbr_entries[out].mispred = mis;
cpuc->lbr_entries[out].predicted = pred;
cpuc->lbr_entries[out].in_tx = in_tx;
cpuc->lbr_entries[out].abort = abort;
cpuc->lbr_entries[out].reserved = 0;
out++;
}
cpuc->lbr_stack.nr = out;
}
void intel_pmu_lbr_read(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!cpuc->lbr_users)
return;
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
intel_pmu_lbr_read_32(cpuc);
else
intel_pmu_lbr_read_64(cpuc);
intel_pmu_lbr_filter(cpuc);
}
/*
* SW filter is used:
* - in case there is no HW filter
* - in case the HW filter has errata or limitations
*/
static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
{
u64 br_type = event->attr.branch_sample_type;
int mask = 0;
if (br_type & PERF_SAMPLE_BRANCH_USER)
mask |= X86_BR_USER;
if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
mask |= X86_BR_KERNEL;
/* we ignore BRANCH_HV here */
if (br_type & PERF_SAMPLE_BRANCH_ANY)
mask |= X86_BR_ANY;
if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
mask |= X86_BR_ANY_CALL;
if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
mask |= X86_BR_IND_CALL;
if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
mask |= X86_BR_ABORT;
if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
mask |= X86_BR_IN_TX;
if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
mask |= X86_BR_NO_TX;
if (br_type & PERF_SAMPLE_BRANCH_COND)
mask |= X86_BR_JCC;
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
*/
event->hw.branch_reg.reg = mask;
}
/*
* setup the HW LBR filter
* Used only when available, may not be enough to disambiguate
* all branches, may need the help of the SW filter
*/
static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
{
struct hw_perf_event_extra *reg;
u64 br_type = event->attr.branch_sample_type;
u64 mask = 0, m;
u64 v;
for_each_branch_sample_type(m) {
if (!(br_type & m))
continue;
v = x86_pmu.lbr_sel_map[m];
if (v == LBR_NOT_SUPP)
return -EOPNOTSUPP;
if (v != LBR_IGN)
mask |= v;
}
reg = &event->hw.branch_reg;
reg->idx = EXTRA_REG_LBR;
/* LBR_SELECT operates in suppress mode so invert mask */
reg->config = ~mask & x86_pmu.lbr_sel_mask;
return 0;
}
int intel_pmu_setup_lbr_filter(struct perf_event *event)
{
int ret = 0;
/*
* no LBR on this PMU
*/
if (!x86_pmu.lbr_nr)
return -EOPNOTSUPP;
/*
* setup SW LBR filter
*/
intel_pmu_setup_sw_lbr_filter(event);
/*
* setup HW LBR filter, if any
*/
if (x86_pmu.lbr_sel_map)
ret = intel_pmu_setup_hw_lbr_filter(event);
return ret;
}
/*
* return the type of control flow change at address "from"
* intruction is not necessarily a branch (in case of interrupt).
*
* The branch type returned also includes the priv level of the
* target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
*
* If a branch type is unknown OR the instruction cannot be
* decoded (e.g., text page not present), then X86_BR_NONE is
* returned.
*/
static int branch_type(unsigned long from, unsigned long to, int abort)
{
struct insn insn;
void *addr;
int bytes, size = MAX_INSN_SIZE;
int ret = X86_BR_NONE;
int ext, to_plm, from_plm;
u8 buf[MAX_INSN_SIZE];
int is64 = 0;
to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
/*
* maybe zero if lbr did not fill up after a reset by the time
* we get a PMU interrupt
*/
if (from == 0 || to == 0)
return X86_BR_NONE;
if (abort)
return X86_BR_ABORT | to_plm;
if (from_plm == X86_BR_USER) {
/*
* can happen if measuring at the user level only
* and we interrupt in a kernel thread, e.g., idle.
*/
if (!current->mm)
return X86_BR_NONE;
/* may fail if text not present */
bytes = copy_from_user_nmi(buf, (void __user *)from, size);
if (bytes != 0)
return X86_BR_NONE;
addr = buf;
} else {
/*
* The LBR logs any address in the IP, even if the IP just
* faulted. This means userspace can control the from address.
* Ensure we don't blindy read any address by validating it is
* a known text address.
*/
if (kernel_text_address(from))
addr = (void *)from;
else
return X86_BR_NONE;
}
/*
* decoder needs to know the ABI especially
* on 64-bit systems running 32-bit apps
*/
#ifdef CONFIG_X86_64
is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
#endif
insn_init(&insn, addr, is64);
insn_get_opcode(&insn);
switch (insn.opcode.bytes[0]) {
case 0xf:
switch (insn.opcode.bytes[1]) {
case 0x05: /* syscall */
case 0x34: /* sysenter */
ret = X86_BR_SYSCALL;
break;
case 0x07: /* sysret */
case 0x35: /* sysexit */
ret = X86_BR_SYSRET;
break;
case 0x80 ... 0x8f: /* conditional */
ret = X86_BR_JCC;
break;
default:
ret = X86_BR_NONE;
}
break;
case 0x70 ... 0x7f: /* conditional */
ret = X86_BR_JCC;
break;
case 0xc2: /* near ret */
case 0xc3: /* near ret */
case 0xca: /* far ret */
case 0xcb: /* far ret */
ret = X86_BR_RET;
break;
case 0xcf: /* iret */
ret = X86_BR_IRET;
break;
case 0xcc ... 0xce: /* int */
ret = X86_BR_INT;
break;
case 0xe8: /* call near rel */
case 0x9a: /* call far absolute */
ret = X86_BR_CALL;
break;
case 0xe0 ... 0xe3: /* loop jmp */
ret = X86_BR_JCC;
break;
case 0xe9 ... 0xeb: /* jmp */
ret = X86_BR_JMP;
break;
case 0xff: /* call near absolute, call far absolute ind */
insn_get_modrm(&insn);
ext = (insn.modrm.bytes[0] >> 3) & 0x7;
switch (ext) {
case 2: /* near ind call */
case 3: /* far ind call */
ret = X86_BR_IND_CALL;
break;
case 4:
case 5:
ret = X86_BR_JMP;
break;
}
break;
default:
ret = X86_BR_NONE;
}
/*
* interrupts, traps, faults (and thus ring transition) may
* occur on any instructions. Thus, to classify them correctly,
* we need to first look at the from and to priv levels. If they
* are different and to is in the kernel, then it indicates
* a ring transition. If the from instruction is not a ring
* transition instr (syscall, systenter, int), then it means
* it was a irq, trap or fault.
*
* we have no way of detecting kernel to kernel faults.
*/
if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
&& ret != X86_BR_SYSCALL && ret != X86_BR_INT)
ret = X86_BR_IRQ;
/*
* branch priv level determined by target as
* is done by HW when LBR_SELECT is implemented
*/
if (ret != X86_BR_NONE)
ret |= to_plm;
return ret;
}
/*
* implement actual branch filter based on user demand.
* Hardware may not exactly satisfy that request, thus
* we need to inspect opcodes. Mismatched branches are
* discarded. Therefore, the number of branches returned
* in PERF_SAMPLE_BRANCH_STACK sample may vary.
*/
static void
intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
{
u64 from, to;
int br_sel = cpuc->br_sel;
int i, j, type;
bool compress = false;
/* if sampling all branches, then nothing to filter */
if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
return;
for (i = 0; i < cpuc->lbr_stack.nr; i++) {
from = cpuc->lbr_entries[i].from;
to = cpuc->lbr_entries[i].to;
type = branch_type(from, to, cpuc->lbr_entries[i].abort);
if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
if (cpuc->lbr_entries[i].in_tx)
type |= X86_BR_IN_TX;
else
type |= X86_BR_NO_TX;
}
/* if type does not correspond, then discard */
if (type == X86_BR_NONE || (br_sel & type) != type) {
cpuc->lbr_entries[i].from = 0;
compress = true;
}
}
if (!compress)
return;
/* remove all entries with from=0 */
for (i = 0; i < cpuc->lbr_stack.nr; ) {
if (!cpuc->lbr_entries[i].from) {
j = i;
while (++j < cpuc->lbr_stack.nr)
cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
cpuc->lbr_stack.nr--;
if (!cpuc->lbr_entries[i].from)
continue;
}
i++;
}
}
/*
* Map interface branch filters onto LBR filters
*/
static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
[PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
[PERF_SAMPLE_BRANCH_USER] = LBR_USER,
[PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
[PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
[PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
| LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
*/
[PERF_SAMPLE_BRANCH_ANY_CALL] =
LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include IND_JMP to capture IND_CALL
*/
[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
[PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
};
static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
[PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
[PERF_SAMPLE_BRANCH_USER] = LBR_USER,
[PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
[PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
[PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
[PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
| LBR_FAR,
[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
[PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
};
/* core */
void __init intel_pmu_lbr_init_core(void)
{
x86_pmu.lbr_nr = 4;
x86_pmu.lbr_tos = MSR_LBR_TOS;
x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
x86_pmu.lbr_to = MSR_LBR_CORE_TO;
/*
* SW branch filter usage:
* - compensate for lack of HW filter
*/
pr_cont("4-deep LBR, ");
}
/* nehalem/westmere */
void __init intel_pmu_lbr_init_nhm(void)
{
x86_pmu.lbr_nr = 16;
x86_pmu.lbr_tos = MSR_LBR_TOS;
x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
x86_pmu.lbr_to = MSR_LBR_NHM_TO;
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
/*
* SW branch filter usage:
* - workaround LBR_SEL errata (see above)
* - support syscall, sysret capture.
* That requires LBR_FAR but that means far
* jmp need to be filtered out
*/
pr_cont("16-deep LBR, ");
}
/* sandy bridge */
void __init intel_pmu_lbr_init_snb(void)
{
x86_pmu.lbr_nr = 16;
x86_pmu.lbr_tos = MSR_LBR_TOS;
x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
x86_pmu.lbr_to = MSR_LBR_NHM_TO;
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = snb_lbr_sel_map;
/*
* SW branch filter usage:
* - support syscall, sysret capture.
* That requires LBR_FAR but that means far
* jmp need to be filtered out
*/
pr_cont("16-deep LBR, ");
}
/* atom */
void __init intel_pmu_lbr_init_atom(void)
{
/*
* only models starting at stepping 10 seems
* to have an operational LBR which can freeze
* on PMU interrupt
*/
if (boot_cpu_data.x86_model == 28
&& boot_cpu_data.x86_mask < 10) {
pr_cont("LBR disabled due to erratum");
return;
}
x86_pmu.lbr_nr = 8;
x86_pmu.lbr_tos = MSR_LBR_TOS;
x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
x86_pmu.lbr_to = MSR_LBR_CORE_TO;
/*
* SW branch filter usage:
* - compensate for lack of HW filter
*/
pr_cont("8-deep LBR, ");
}

View file

@ -0,0 +1,714 @@
/*
* perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
* Copyright (C) 2013 Google, Inc., Stephane Eranian
*
* Intel RAPL interface is specified in the IA-32 Manual Vol3b
* section 14.7.1 (September 2013)
*
* RAPL provides more controls than just reporting energy consumption
* however here we only expose the 3 energy consumption free running
* counters (pp0, pkg, dram).
*
* Each of those counters increments in a power unit defined by the
* RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
* but it can vary.
*
* Counter to rapl events mappings:
*
* pp0 counter: consumption of all physical cores (power plane 0)
* event: rapl_energy_cores
* perf code: 0x1
*
* pkg counter: consumption of the whole processor package
* event: rapl_energy_pkg
* perf code: 0x2
*
* dram counter: consumption of the dram domain (servers only)
* event: rapl_energy_dram
* perf code: 0x3
*
* dram counter: consumption of the builtin-gpu domain (client only)
* event: rapl_energy_gpu
* perf code: 0x4
*
* We manage those counters as free running (read-only). They may be
* use simultaneously by other tools, such as turbostat.
*
* The events only support system-wide mode counting. There is no
* sampling support because it does not make sense and is not
* supported by the RAPL hardware.
*
* Because we want to avoid floating-point operations in the kernel,
* the events are all reported in fixed point arithmetic (32.32).
* Tools must adjust the counts to convert them to Watts using
* the duration of the measurement. Tools may use a function such as
* ldexp(raw_count, -32);
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"
/*
* RAPL energy status counters
*/
#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */
#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */
#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */
#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
/* Clients have PP0, PKG */
#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_PP1_NRG_STAT)
/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_RAM_NRG_STAT)
/* Servers have PP0, PKG, RAM, PP1 */
#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_RAM_NRG_STAT|\
1<<RAPL_IDX_PP1_NRG_STAT)
/*
* event code: LSB 8 bits, passed in attr->config
* any other bit is reserved
*/
#define RAPL_EVENT_MASK 0xFFULL
#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
struct kobj_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __rapl_##_var##_show, NULL)
#define RAPL_EVENT_DESC(_name, _config) \
{ \
.attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
.config = _config, \
}
#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
struct rapl_pmu {
spinlock_t lock;
int hw_unit; /* 1/2^hw_unit Joule */
int n_active; /* number of active events */
struct list_head active_list;
struct pmu *pmu; /* pointer to rapl_pmu_class */
ktime_t timer_interval; /* in ktime_t unit */
struct hrtimer hrtimer;
};
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
static inline u64 rapl_read_counter(struct perf_event *event)
{
u64 raw;
rdmsrl(event->hw.event_base, raw);
return raw;
}
static inline u64 rapl_scale(u64 v)
{
/*
* scale delta to smallest unit (1/2^32)
* users must then scale back: count * 1/(1e9*2^32) to get Joules
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit);
}
static u64 rapl_event_update(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 prev_raw_count, new_raw_count;
s64 delta, sdelta;
int shift = RAPL_CNTR_WIDTH;
again:
prev_raw_count = local64_read(&hwc->prev_count);
rdmsrl(event->hw.event_base, new_raw_count);
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count) {
cpu_relax();
goto again;
}
/*
* Now we have the new raw value and have updated the prev
* timestamp already. We can now calculate the elapsed delta
* (event-)time and add that to the generic event.
*
* Careful, not all hw sign-extends above the physical width
* of the count.
*/
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
sdelta = rapl_scale(delta);
local64_add(sdelta, &event->count);
return new_raw_count;
}
static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
__hrtimer_start_range_ns(&pmu->hrtimer,
pmu->timer_interval, 0,
HRTIMER_MODE_REL_PINNED, 0);
}
static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
{
hrtimer_cancel(&pmu->hrtimer);
}
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
struct perf_event *event;
unsigned long flags;
if (!pmu->n_active)
return HRTIMER_NORESTART;
spin_lock_irqsave(&pmu->lock, flags);
list_for_each_entry(event, &pmu->active_list, active_entry) {
rapl_event_update(event);
}
spin_unlock_irqrestore(&pmu->lock, flags);
hrtimer_forward_now(hrtimer, pmu->timer_interval);
return HRTIMER_RESTART;
}
static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
struct hrtimer *hr = &pmu->hrtimer;
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hr->function = rapl_hrtimer_handle;
}
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
struct perf_event *event)
{
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
return;
event->hw.state = 0;
list_add_tail(&event->active_entry, &pmu->active_list);
local64_set(&event->hw.prev_count, rapl_read_counter(event));
pmu->n_active++;
if (pmu->n_active == 1)
rapl_start_hrtimer(pmu);
}
static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
unsigned long flags;
spin_lock_irqsave(&pmu->lock, flags);
__rapl_pmu_event_start(pmu, event);
spin_unlock_irqrestore(&pmu->lock, flags);
}
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
spin_lock_irqsave(&pmu->lock, flags);
/* mark event as deactivated and stopped */
if (!(hwc->state & PERF_HES_STOPPED)) {
WARN_ON_ONCE(pmu->n_active <= 0);
pmu->n_active--;
if (pmu->n_active == 0)
rapl_stop_hrtimer(pmu);
list_del(&event->active_entry);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
}
/* check if update of sw counter is necessary */
if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
/*
* Drain the remaining delta count out of a event
* that we are disabling:
*/
rapl_event_update(event);
hwc->state |= PERF_HES_UPTODATE;
}
spin_unlock_irqrestore(&pmu->lock, flags);
}
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
spin_lock_irqsave(&pmu->lock, flags);
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (mode & PERF_EF_START)
__rapl_pmu_event_start(pmu, event);
spin_unlock_irqrestore(&pmu->lock, flags);
return 0;
}
static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}
static int rapl_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, msr, ret = 0;
/* only look at RAPL events */
if (event->attr.type != rapl_pmu_class.type)
return -ENOENT;
/* check only supported bits are set */
if (event->attr.config & ~RAPL_EVENT_MASK)
return -EINVAL;
/*
* check event is known (determines counter)
*/
switch (cfg) {
case INTEL_RAPL_PP0:
bit = RAPL_IDX_PP0_NRG_STAT;
msr = MSR_PP0_ENERGY_STATUS;
break;
case INTEL_RAPL_PKG:
bit = RAPL_IDX_PKG_NRG_STAT;
msr = MSR_PKG_ENERGY_STATUS;
break;
case INTEL_RAPL_RAM:
bit = RAPL_IDX_RAM_NRG_STAT;
msr = MSR_DRAM_ENERGY_STATUS;
break;
case INTEL_RAPL_PP1:
bit = RAPL_IDX_PP1_NRG_STAT;
msr = MSR_PP1_ENERGY_STATUS;
break;
default:
return -EINVAL;
}
/* check event supported */
if (!(rapl_cntr_mask & (1 << bit)))
return -EINVAL;
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
event->attr.exclude_hv ||
event->attr.exclude_idle ||
event->attr.exclude_host ||
event->attr.exclude_guest ||
event->attr.sample_period) /* no sampling */
return -EINVAL;
/* must be done before validate_group */
event->hw.event_base = msr;
event->hw.config = cfg;
event->hw.idx = bit;
return ret;
}
static void rapl_pmu_event_read(struct perf_event *event)
{
rapl_event_update(event);
}
static ssize_t rapl_get_attr_cpumask(struct device *dev,
struct device_attribute *attr, char *buf)
{
int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
buf[n++] = '\n';
buf[n] = '\0';
return n;
}
static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
static struct attribute *rapl_pmu_attrs[] = {
&dev_attr_cpumask.attr,
NULL,
};
static struct attribute_group rapl_pmu_attr_group = {
.attrs = rapl_pmu_attrs,
};
EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
/*
* we compute in 0.23 nJ increments regardless of MSR
*/
EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
static struct attribute *rapl_events_srv_attr[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_ram),
EVENT_PTR(rapl_cores_unit),
EVENT_PTR(rapl_pkg_unit),
EVENT_PTR(rapl_ram_unit),
EVENT_PTR(rapl_cores_scale),
EVENT_PTR(rapl_pkg_scale),
EVENT_PTR(rapl_ram_scale),
NULL,
};
static struct attribute *rapl_events_cln_attr[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_gpu),
EVENT_PTR(rapl_cores_unit),
EVENT_PTR(rapl_pkg_unit),
EVENT_PTR(rapl_gpu_unit),
EVENT_PTR(rapl_cores_scale),
EVENT_PTR(rapl_pkg_scale),
EVENT_PTR(rapl_gpu_scale),
NULL,
};
static struct attribute *rapl_events_hsw_attr[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_gpu),
EVENT_PTR(rapl_ram),
EVENT_PTR(rapl_cores_unit),
EVENT_PTR(rapl_pkg_unit),
EVENT_PTR(rapl_gpu_unit),
EVENT_PTR(rapl_ram_unit),
EVENT_PTR(rapl_cores_scale),
EVENT_PTR(rapl_pkg_scale),
EVENT_PTR(rapl_gpu_scale),
EVENT_PTR(rapl_ram_scale),
NULL,
};
static struct attribute_group rapl_pmu_events_group = {
.name = "events",
.attrs = NULL, /* patched at runtime */
};
DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
&format_attr_event.attr,
NULL,
};
static struct attribute_group rapl_pmu_format_group = {
.name = "format",
.attrs = rapl_formats_attr,
};
const struct attribute_group *rapl_attr_groups[] = {
&rapl_pmu_attr_group,
&rapl_pmu_format_group,
&rapl_pmu_events_group,
NULL,
};
static struct pmu rapl_pmu_class = {
.attr_groups = rapl_attr_groups,
.task_ctx_nr = perf_invalid_context, /* system-wide only */
.event_init = rapl_pmu_event_init,
.add = rapl_pmu_event_add, /* must have */
.del = rapl_pmu_event_del, /* must have */
.start = rapl_pmu_event_start,
.stop = rapl_pmu_event_stop,
.read = rapl_pmu_event_read,
};
static void rapl_cpu_exit(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
int i, phys_id = topology_physical_package_id(cpu);
int target = -1;
/* find a new cpu on same package */
for_each_online_cpu(i) {
if (i == cpu)
continue;
if (phys_id == topology_physical_package_id(i)) {
target = i;
break;
}
}
/*
* clear cpu from cpumask
* if was set in cpumask and still some cpu on package,
* then move to new cpu
*/
if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
cpumask_set_cpu(target, &rapl_cpu_mask);
WARN_ON(cpumask_empty(&rapl_cpu_mask));
/*
* migrate events and context to new cpu
*/
if (target >= 0)
perf_pmu_migrate_context(pmu->pmu, cpu, target);
/* cancel overflow polling timer for CPU */
rapl_stop_hrtimer(pmu);
}
static void rapl_cpu_init(int cpu)
{
int i, phys_id = topology_physical_package_id(cpu);
/* check if phys_is is already covered */
for_each_cpu(i, &rapl_cpu_mask) {
if (phys_id == topology_physical_package_id(i))
return;
}
/* was not found, so add it */
cpumask_set_cpu(cpu, &rapl_cpu_mask);
}
static int rapl_cpu_prepare(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
int phys_id = topology_physical_package_id(cpu);
u64 ms;
u64 msr_rapl_power_unit_bits;
if (pmu)
return 0;
if (phys_id < 0)
return -1;
/* protect rdmsrl() to handle virtualization */
if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
return -1;
pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
if (!pmu)
return -1;
spin_lock_init(&pmu->lock);
INIT_LIST_HEAD(&pmu->active_list);
/*
* grab power unit as: 1/2^unit Joules
*
* we cache in local PMU instance
*/
pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
pmu->pmu = &rapl_pmu_class;
/*
* use reference of 200W for scaling the timeout
* to avoid missing counter overflows.
* 200W = 200 Joules/sec
* divide interval by 2 to avoid lockstep (2 * 100)
* if hw unit is 32, then we use 2 ms 1/200/2
*/
if (pmu->hw_unit < 32)
ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
else
ms = 2;
pmu->timer_interval = ms_to_ktime(ms);
rapl_hrtimer_init(pmu);
/* set RAPL pmu for this cpu for now */
per_cpu(rapl_pmu, cpu) = pmu;
per_cpu(rapl_pmu_to_free, cpu) = NULL;
return 0;
}
static void rapl_cpu_kfree(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
kfree(pmu);
per_cpu(rapl_pmu_to_free, cpu) = NULL;
}
static int rapl_cpu_dying(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
if (!pmu)
return 0;
per_cpu(rapl_pmu, cpu) = NULL;
per_cpu(rapl_pmu_to_free, cpu) = pmu;
return 0;
}
static int rapl_cpu_notifier(struct notifier_block *self,
unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
rapl_cpu_prepare(cpu);
break;
case CPU_STARTING:
rapl_cpu_init(cpu);
break;
case CPU_UP_CANCELED:
case CPU_DYING:
rapl_cpu_dying(cpu);
break;
case CPU_ONLINE:
case CPU_DEAD:
rapl_cpu_kfree(cpu);
break;
case CPU_DOWN_PREPARE:
rapl_cpu_exit(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
static const struct x86_cpu_id rapl_cpu_match[] = {
[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
[1] = {},
};
static int __init rapl_pmu_init(void)
{
struct rapl_pmu *pmu;
int cpu, ret;
/*
* check for Intel processor family 6
*/
if (!x86_match_cpu(rapl_cpu_match))
return 0;
/* check supported CPU */
switch (boot_cpu_data.x86_model) {
case 42: /* Sandy Bridge */
case 58: /* Ivy Bridge */
rapl_cntr_mask = RAPL_IDX_CLN;
rapl_pmu_events_group.attrs = rapl_events_cln_attr;
break;
case 60: /* Haswell */
case 69: /* Haswell-Celeron */
rapl_cntr_mask = RAPL_IDX_HSW;
rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
break;
case 45: /* Sandy Bridge-EP */
case 62: /* IvyTown */
rapl_cntr_mask = RAPL_IDX_SRV;
rapl_pmu_events_group.attrs = rapl_events_srv_attr;
break;
default:
/* unsupported */
return 0;
}
cpu_notifier_register_begin();
for_each_online_cpu(cpu) {
ret = rapl_cpu_prepare(cpu);
if (ret)
goto out;
rapl_cpu_init(cpu);
}
__perf_cpu_notifier(rapl_cpu_notifier);
ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
if (WARN_ON(ret)) {
pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
cpu_notifier_register_done();
return -1;
}
pmu = __this_cpu_read(rapl_pmu);
pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
" API unit is 2^-32 Joules,"
" %d fixed counters"
" %llu ms ovfl timer\n",
pmu->hw_unit,
hweight32(rapl_cntr_mask),
ktime_to_ms(pmu->timer_interval));
out:
cpu_notifier_register_done();
return 0;
}
device_initcall(rapl_pmu_init);

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,339 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/perf_event.h>
#include "perf_event.h"
#define UNCORE_PMU_NAME_LEN 32
#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
#define UNCORE_FIXED_EVENT 0xff
#define UNCORE_PMC_IDX_MAX_GENERIC 8
#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff)
#define UNCORE_PCI_DEV_IDX(data) (data & 0xff)
#define UNCORE_EXTRA_PCI_DEV 0xff
#define UNCORE_EXTRA_PCI_DEV_MAX 3
/* support up to 8 sockets */
#define UNCORE_SOCKET_MAX 8
#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
struct intel_uncore_ops;
struct intel_uncore_pmu;
struct intel_uncore_box;
struct uncore_event_desc;
struct intel_uncore_type {
const char *name;
int num_counters;
int num_boxes;
int perf_ctr_bits;
int fixed_ctr_bits;
unsigned perf_ctr;
unsigned event_ctl;
unsigned event_mask;
unsigned fixed_ctr;
unsigned fixed_ctl;
unsigned box_ctl;
unsigned msr_offset;
unsigned num_shared_regs:8;
unsigned single_fixed:1;
unsigned pair_ctr_ctl:1;
unsigned *msr_offsets;
struct event_constraint unconstrainted;
struct event_constraint *constraints;
struct intel_uncore_pmu *pmus;
struct intel_uncore_ops *ops;
struct uncore_event_desc *event_descs;
const struct attribute_group *attr_groups[4];
struct pmu *pmu; /* for custom pmu ops */
};
#define pmu_group attr_groups[0]
#define format_group attr_groups[1]
#define events_group attr_groups[2]
struct intel_uncore_ops {
void (*init_box)(struct intel_uncore_box *);
void (*disable_box)(struct intel_uncore_box *);
void (*enable_box)(struct intel_uncore_box *);
void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
struct perf_event *);
void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
};
struct intel_uncore_pmu {
struct pmu pmu;
char name[UNCORE_PMU_NAME_LEN];
int pmu_idx;
int func_id;
struct intel_uncore_type *type;
struct intel_uncore_box ** __percpu box;
struct list_head box_list;
};
struct intel_uncore_extra_reg {
raw_spinlock_t lock;
u64 config, config1, config2;
atomic_t ref;
};
struct intel_uncore_box {
int phys_id;
int n_active; /* number of active events */
int n_events;
int cpu; /* cpu to collect events */
unsigned long flags;
atomic_t refcnt;
struct perf_event *events[UNCORE_PMC_IDX_MAX];
struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
u64 tags[UNCORE_PMC_IDX_MAX];
struct pci_dev *pci_dev;
struct intel_uncore_pmu *pmu;
u64 hrtimer_duration; /* hrtimer timeout for this box */
struct hrtimer hrtimer;
struct list_head list;
struct list_head active_list;
void *io_addr;
struct intel_uncore_extra_reg shared_regs[0];
};
#define UNCORE_BOX_FLAG_INITIATED 0
struct uncore_event_desc {
struct kobj_attribute attr;
const char *config;
};
ssize_t uncore_event_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
{ \
.attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
.config = _config, \
}
#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
struct kobj_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
{
return box->pmu->type->box_ctl;
}
static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
{
return box->pmu->type->fixed_ctl;
}
static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
{
return box->pmu->type->fixed_ctr;
}
static inline
unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
{
return idx * 4 + box->pmu->type->event_ctl;
}
static inline
unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
{
return idx * 8 + box->pmu->type->perf_ctr;
}
static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
{
struct intel_uncore_pmu *pmu = box->pmu;
return pmu->type->msr_offsets ?
pmu->type->msr_offsets[pmu->pmu_idx] :
pmu->type->msr_offset * pmu->pmu_idx;
}
static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
{
if (!box->pmu->type->box_ctl)
return 0;
return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
}
static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
{
if (!box->pmu->type->fixed_ctl)
return 0;
return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
}
static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
{
return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
}
static inline
unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
{
return box->pmu->type->event_ctl +
(box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
uncore_msr_box_offset(box);
}
static inline
unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
{
return box->pmu->type->perf_ctr +
(box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
uncore_msr_box_offset(box);
}
static inline
unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
{
if (box->pci_dev)
return uncore_pci_fixed_ctl(box);
else
return uncore_msr_fixed_ctl(box);
}
static inline
unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
{
if (box->pci_dev)
return uncore_pci_fixed_ctr(box);
else
return uncore_msr_fixed_ctr(box);
}
static inline
unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
{
if (box->pci_dev)
return uncore_pci_event_ctl(box, idx);
else
return uncore_msr_event_ctl(box, idx);
}
static inline
unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
{
if (box->pci_dev)
return uncore_pci_perf_ctr(box, idx);
else
return uncore_msr_perf_ctr(box, idx);
}
static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
{
return box->pmu->type->perf_ctr_bits;
}
static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
{
return box->pmu->type->fixed_ctr_bits;
}
static inline int uncore_num_counters(struct intel_uncore_box *box)
{
return box->pmu->type->num_counters;
}
static inline void uncore_disable_box(struct intel_uncore_box *box)
{
if (box->pmu->type->ops->disable_box)
box->pmu->type->ops->disable_box(box);
}
static inline void uncore_enable_box(struct intel_uncore_box *box)
{
if (box->pmu->type->ops->enable_box)
box->pmu->type->ops->enable_box(box);
}
static inline void uncore_disable_event(struct intel_uncore_box *box,
struct perf_event *event)
{
box->pmu->type->ops->disable_event(box, event);
}
static inline void uncore_enable_event(struct intel_uncore_box *box,
struct perf_event *event)
{
box->pmu->type->ops->enable_event(box, event);
}
static inline u64 uncore_read_counter(struct intel_uncore_box *box,
struct perf_event *event)
{
return box->pmu->type->ops->read_counter(box, event);
}
static inline void uncore_box_init(struct intel_uncore_box *box)
{
if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
if (box->pmu->type->ops->init_box)
box->pmu->type->ops->init_box(box);
}
}
static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
{
return (box->phys_id < 0);
}
struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event);
struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
struct intel_uncore_box *uncore_event_to_box(struct perf_event *event);
u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
void uncore_pmu_event_read(struct perf_event *event);
void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
struct event_constraint *
uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
extern struct intel_uncore_type **uncore_msr_uncores;
extern struct intel_uncore_type **uncore_pci_uncores;
extern struct pci_driver *uncore_pci_driver;
extern int uncore_pcibus_to_physid[256];
extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
extern struct event_constraint uncore_constraint_empty;
/* perf_event_intel_uncore_snb.c */
int snb_uncore_pci_init(void);
int ivb_uncore_pci_init(void);
int hsw_uncore_pci_init(void);
void snb_uncore_cpu_init(void);
void nhm_uncore_cpu_init(void);
/* perf_event_intel_uncore_snbep.c */
int snbep_uncore_pci_init(void);
void snbep_uncore_cpu_init(void);
int ivbep_uncore_pci_init(void);
void ivbep_uncore_cpu_init(void);
int hswep_uncore_pci_init(void);
void hswep_uncore_cpu_init(void);
/* perf_event_intel_uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,636 @@
/* Nehalem/SandBridge/Haswell uncore support */
#include "perf_event_intel_uncore.h"
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
#define SNB_UNC_CTL_EDGE_DET (1 << 18)
#define SNB_UNC_CTL_EN (1 << 22)
#define SNB_UNC_CTL_INVERT (1 << 23)
#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
#define NHM_UNC_CTL_CMASK_MASK 0xff000000
#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
SNB_UNC_CTL_UMASK_MASK | \
SNB_UNC_CTL_EDGE_DET | \
SNB_UNC_CTL_INVERT | \
SNB_UNC_CTL_CMASK_MASK)
#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
SNB_UNC_CTL_UMASK_MASK | \
SNB_UNC_CTL_EDGE_DET | \
SNB_UNC_CTL_INVERT | \
NHM_UNC_CTL_CMASK_MASK)
/* SNB global control register */
#define SNB_UNC_PERF_GLOBAL_CTL 0x391
#define SNB_UNC_FIXED_CTR_CTRL 0x394
#define SNB_UNC_FIXED_CTR 0x395
/* SNB uncore global control */
#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
/* SNB Cbo register */
#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
#define SNB_UNC_CBO_0_PER_CTR0 0x706
#define SNB_UNC_CBO_MSR_OFFSET 0x10
/* NHM global control register */
#define NHM_UNC_PERF_GLOBAL_CTL 0x391
#define NHM_UNC_FIXED_CTR 0x394
#define NHM_UNC_FIXED_CTR_CTRL 0x395
/* NHM uncore global control */
#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
/* NHM uncore register */
#define NHM_UNC_PERFEVTSEL0 0x3c0
#define NHM_UNC_UNCORE_PMC0 0x3b0
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
/* Sandy Bridge uncore support */
static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
if (hwc->idx < UNCORE_PMC_IDX_FIXED)
wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
else
wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
}
static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
{
wrmsrl(event->hw.config_base, 0);
}
static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
{
if (box->pmu->pmu_idx == 0) {
wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
}
}
static struct uncore_event_desc snb_uncore_events[] = {
INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
{ /* end: all zeroes */ },
};
static struct attribute *snb_uncore_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
&format_attr_cmask5.attr,
NULL,
};
static struct attribute_group snb_uncore_format_group = {
.name = "format",
.attrs = snb_uncore_formats_attr,
};
static struct intel_uncore_ops snb_uncore_msr_ops = {
.init_box = snb_uncore_msr_init_box,
.disable_event = snb_uncore_msr_disable_event,
.enable_event = snb_uncore_msr_enable_event,
.read_counter = uncore_msr_read_counter,
};
static struct event_constraint snb_uncore_cbox_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
EVENT_CONSTRAINT_END
};
static struct intel_uncore_type snb_uncore_cbox = {
.name = "cbox",
.num_counters = 2,
.num_boxes = 4,
.perf_ctr_bits = 44,
.fixed_ctr_bits = 48,
.perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
.event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
.fixed_ctr = SNB_UNC_FIXED_CTR,
.fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
.single_fixed = 1,
.event_mask = SNB_UNC_RAW_EVENT_MASK,
.msr_offset = SNB_UNC_CBO_MSR_OFFSET,
.constraints = snb_uncore_cbox_constraints,
.ops = &snb_uncore_msr_ops,
.format_group = &snb_uncore_format_group,
.event_descs = snb_uncore_events,
};
static struct intel_uncore_type *snb_msr_uncores[] = {
&snb_uncore_cbox,
NULL,
};
void snb_uncore_cpu_init(void)
{
uncore_msr_uncores = snb_msr_uncores;
if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
}
enum {
SNB_PCI_UNCORE_IMC,
};
static struct uncore_event_desc snb_uncore_imc_events[] = {
INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
{ /* end: all zeroes */ },
};
#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
/* page size multiple covering all config regs */
#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
static struct attribute *snb_uncore_imc_formats_attr[] = {
&format_attr_event.attr,
NULL,
};
static struct attribute_group snb_uncore_imc_format_group = {
.name = "format",
.attrs = snb_uncore_imc_formats_attr,
};
static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
{
struct pci_dev *pdev = box->pci_dev;
int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
resource_size_t addr;
u32 pci_dword;
pci_read_config_dword(pdev, where, &pci_dword);
addr = pci_dword;
#ifdef CONFIG_PHYS_ADDR_T_64BIT
pci_read_config_dword(pdev, where + 4, &pci_dword);
addr |= ((resource_size_t)pci_dword << 32);
#endif
addr &= ~(PAGE_SIZE - 1);
box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
}
static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
{}
static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
{}
static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
{}
static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
{}
static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
}
/*
* custom event_init() function because we define our own fixed, free
* running counters, so we do not want to conflict with generic uncore
* logic. Also simplifies processing
*/
static int snb_uncore_imc_event_init(struct perf_event *event)
{
struct intel_uncore_pmu *pmu;
struct intel_uncore_box *box;
struct hw_perf_event *hwc = &event->hw;
u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
int idx, base;
if (event->attr.type != event->pmu->type)
return -ENOENT;
pmu = uncore_event_to_pmu(event);
/* no device found for this pmu */
if (pmu->func_id < 0)
return -ENOENT;
/* Sampling not supported yet */
if (hwc->sample_period)
return -EINVAL;
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
event->attr.exclude_hv ||
event->attr.exclude_idle ||
event->attr.exclude_host ||
event->attr.exclude_guest ||
event->attr.sample_period) /* no sampling */
return -EINVAL;
/*
* Place all uncore events for a particular physical package
* onto a single cpu
*/
if (event->cpu < 0)
return -EINVAL;
/* check only supported bits are set */
if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
return -EINVAL;
box = uncore_pmu_to_box(pmu, event->cpu);
if (!box || box->cpu < 0)
return -EINVAL;
event->cpu = box->cpu;
event->hw.idx = -1;
event->hw.last_tag = ~0ULL;
event->hw.extra_reg.idx = EXTRA_REG_NONE;
event->hw.branch_reg.idx = EXTRA_REG_NONE;
/*
* check event is known (whitelist, determines counter)
*/
switch (cfg) {
case SNB_UNCORE_PCI_IMC_DATA_READS:
base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
idx = UNCORE_PMC_IDX_FIXED;
break;
case SNB_UNCORE_PCI_IMC_DATA_WRITES:
base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
idx = UNCORE_PMC_IDX_FIXED + 1;
break;
default:
return -EINVAL;
}
/* must be done before validate_group */
event->hw.event_base = base;
event->hw.config = cfg;
event->hw.idx = idx;
/* no group validation needed, we have free running counters */
return 0;
}
static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
{
return 0;
}
static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
u64 count;
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
return;
event->hw.state = 0;
box->n_active++;
list_add_tail(&event->active_entry, &box->active_list);
count = snb_uncore_imc_read_counter(box, event);
local64_set(&event->hw.prev_count, count);
if (box->n_active == 1)
uncore_pmu_start_hrtimer(box);
}
static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
struct hw_perf_event *hwc = &event->hw;
if (!(hwc->state & PERF_HES_STOPPED)) {
box->n_active--;
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
list_del(&event->active_entry);
if (box->n_active == 0)
uncore_pmu_cancel_hrtimer(box);
}
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
/*
* Drain the remaining delta count out of a event
* that we are disabling:
*/
uncore_perf_event_update(box, event);
hwc->state |= PERF_HES_UPTODATE;
}
}
static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
struct hw_perf_event *hwc = &event->hw;
if (!box)
return -ENODEV;
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (!(flags & PERF_EF_START))
hwc->state |= PERF_HES_ARCH;
snb_uncore_imc_event_start(event, 0);
box->n_events++;
return 0;
}
static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
int i;
snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
for (i = 0; i < box->n_events; i++) {
if (event == box->event_list[i]) {
--box->n_events;
break;
}
}
}
static int snb_pci2phy_map_init(int devid)
{
struct pci_dev *dev = NULL;
int bus;
dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
if (!dev)
return -ENOTTY;
bus = dev->bus->number;
uncore_pcibus_to_physid[bus] = 0;
pci_dev_put(dev);
return 0;
}
static struct pmu snb_uncore_imc_pmu = {
.task_ctx_nr = perf_invalid_context,
.event_init = snb_uncore_imc_event_init,
.add = snb_uncore_imc_event_add,
.del = snb_uncore_imc_event_del,
.start = snb_uncore_imc_event_start,
.stop = snb_uncore_imc_event_stop,
.read = uncore_pmu_event_read,
};
static struct intel_uncore_ops snb_uncore_imc_ops = {
.init_box = snb_uncore_imc_init_box,
.enable_box = snb_uncore_imc_enable_box,
.disable_box = snb_uncore_imc_disable_box,
.disable_event = snb_uncore_imc_disable_event,
.enable_event = snb_uncore_imc_enable_event,
.hw_config = snb_uncore_imc_hw_config,
.read_counter = snb_uncore_imc_read_counter,
};
static struct intel_uncore_type snb_uncore_imc = {
.name = "imc",
.num_counters = 2,
.num_boxes = 1,
.fixed_ctr_bits = 32,
.fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE,
.event_descs = snb_uncore_imc_events,
.format_group = &snb_uncore_imc_format_group,
.perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
.event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK,
.ops = &snb_uncore_imc_ops,
.pmu = &snb_uncore_imc_pmu,
};
static struct intel_uncore_type *snb_pci_uncores[] = {
[SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
NULL,
};
static const struct pci_device_id snb_uncore_pci_ids[] = {
{ /* IMC */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
{ /* end: all zeroes */ },
};
static const struct pci_device_id ivb_uncore_pci_ids[] = {
{ /* IMC */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
{ /* IMC */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
{ /* end: all zeroes */ },
};
static const struct pci_device_id hsw_uncore_pci_ids[] = {
{ /* IMC */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
{ /* end: all zeroes */ },
};
static struct pci_driver snb_uncore_pci_driver = {
.name = "snb_uncore",
.id_table = snb_uncore_pci_ids,
};
static struct pci_driver ivb_uncore_pci_driver = {
.name = "ivb_uncore",
.id_table = ivb_uncore_pci_ids,
};
static struct pci_driver hsw_uncore_pci_driver = {
.name = "hsw_uncore",
.id_table = hsw_uncore_pci_ids,
};
struct imc_uncore_pci_dev {
__u32 pci_id;
struct pci_driver *driver;
};
#define IMC_DEV(a, d) \
{ .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) }
static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
IMC_DEV(SNB_IMC, &snb_uncore_pci_driver),
IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */
{ /* end marker */ }
};
#define for_each_imc_pci_id(x, t) \
for (x = (t); (x)->pci_id; x++)
static struct pci_driver *imc_uncore_find_dev(void)
{
const struct imc_uncore_pci_dev *p;
int ret;
for_each_imc_pci_id(p, desktop_imc_pci_ids) {
ret = snb_pci2phy_map_init(p->pci_id);
if (ret == 0)
return p->driver;
}
return NULL;
}
static int imc_uncore_pci_init(void)
{
struct pci_driver *imc_drv = imc_uncore_find_dev();
if (!imc_drv)
return -ENODEV;
uncore_pci_uncores = snb_pci_uncores;
uncore_pci_driver = imc_drv;
return 0;
}
int snb_uncore_pci_init(void)
{
return imc_uncore_pci_init();
}
int ivb_uncore_pci_init(void)
{
return imc_uncore_pci_init();
}
int hsw_uncore_pci_init(void)
{
return imc_uncore_pci_init();
}
/* end of Sandy Bridge uncore support */
/* Nehalem uncore support */
static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
{
wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
}
static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
{
wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
}
static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
if (hwc->idx < UNCORE_PMC_IDX_FIXED)
wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
else
wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
}
static struct attribute *nhm_uncore_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
&format_attr_cmask8.attr,
NULL,
};
static struct attribute_group nhm_uncore_format_group = {
.name = "format",
.attrs = nhm_uncore_formats_attr,
};
static struct uncore_event_desc nhm_uncore_events[] = {
INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
{ /* end: all zeroes */ },
};
static struct intel_uncore_ops nhm_uncore_msr_ops = {
.disable_box = nhm_uncore_msr_disable_box,
.enable_box = nhm_uncore_msr_enable_box,
.disable_event = snb_uncore_msr_disable_event,
.enable_event = nhm_uncore_msr_enable_event,
.read_counter = uncore_msr_read_counter,
};
static struct intel_uncore_type nhm_uncore = {
.name = "",
.num_counters = 8,
.num_boxes = 1,
.perf_ctr_bits = 48,
.fixed_ctr_bits = 48,
.event_ctl = NHM_UNC_PERFEVTSEL0,
.perf_ctr = NHM_UNC_UNCORE_PMC0,
.fixed_ctr = NHM_UNC_FIXED_CTR,
.fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
.event_mask = NHM_UNC_RAW_EVENT_MASK,
.event_descs = nhm_uncore_events,
.ops = &nhm_uncore_msr_ops,
.format_group = &nhm_uncore_format_group,
};
static struct intel_uncore_type *nhm_msr_uncores[] = {
&nhm_uncore,
NULL,
};
void nhm_uncore_cpu_init(void)
{
uncore_msr_uncores = nhm_msr_uncores;
}
/* end of Nehalem uncore support */

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,319 @@
/* Driver for Intel Xeon Phi "Knights Corner" PMU */
#include <linux/perf_event.h>
#include <linux/types.h>
#include <asm/hardirq.h>
#include "perf_event.h"
static const u64 knc_perfmon_event_map[] =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x002a,
[PERF_COUNT_HW_INSTRUCTIONS] = 0x0016,
[PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028,
[PERF_COUNT_HW_CACHE_MISSES] = 0x0029,
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012,
[PERF_COUNT_HW_BRANCH_MISSES] = 0x002b,
};
static const u64 __initconst knc_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
[ C(L1D) ] = {
[ C(OP_READ) ] = {
/* On Xeon Phi event "0" is a valid DATA_READ */
/* (L1 Data Cache Reads) Instruction. */
/* We code this as ARCH_PERFMON_EVENTSEL_INT as this */
/* bit will always be set in x86_pmu_hw_config(). */
[ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
/* DATA_READ */
[ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
[ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */
[ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */
},
},
[ C(L1I ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
[ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = 0x0,
},
},
[ C(LL ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */
[ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */
},
},
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
/* DATA_READ */
/* see note on L1 OP_READ */
[ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
[ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = 0x0,
},
},
[ C(ITLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
[ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
[ C(BPU ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */
[ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
};
static u64 knc_pmu_event_map(int hw_event)
{
return knc_perfmon_event_map[hw_event];
}
static struct event_constraint knc_event_constraints[] =
{
INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */
INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */
INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */
INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */
INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */
INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */
INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */
INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */
INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */
INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */
INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */
INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */
INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */
INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */
INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */
INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */
INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */
INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* L2_DATA_PF2_MISS */
INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */
EVENT_CONSTRAINT_END
};
#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d
#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e
#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f
#define KNC_ENABLE_COUNTER0 0x00000001
#define KNC_ENABLE_COUNTER1 0x00000002
static void knc_pmu_disable_all(void)
{
u64 val;
rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
}
static void knc_pmu_enable_all(int added)
{
u64 val;
rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
}
static inline void
knc_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 val;
val = hwc->config;
val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
(void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
}
static void knc_pmu_enable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 val;
val = hwc->config;
val |= ARCH_PERFMON_EVENTSEL_ENABLE;
(void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
}
static inline u64 knc_pmu_get_status(void)
{
u64 status;
rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
return status;
}
static inline void knc_pmu_ack_status(u64 ack)
{
wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
}
static int knc_pmu_handle_irq(struct pt_regs *regs)
{
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
int handled = 0;
int bit, loops;
u64 status;
cpuc = this_cpu_ptr(&cpu_hw_events);
knc_pmu_disable_all();
status = knc_pmu_get_status();
if (!status) {
knc_pmu_enable_all(0);
return handled;
}
loops = 0;
again:
knc_pmu_ack_status(status);
if (++loops > 100) {
WARN_ONCE(1, "perf: irq loop stuck!\n");
perf_event_print_debug();
goto done;
}
inc_irq_stat(apic_perf_irqs);
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[bit];
handled++;
if (!test_bit(bit, cpuc->active_mask))
continue;
if (!intel_pmu_save_and_restart(event))
continue;
perf_sample_data_init(&data, 0, event->hw.last_period);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
/*
* Repeat if there is more work to be done:
*/
status = knc_pmu_get_status();
if (status)
goto again;
done:
knc_pmu_enable_all(0);
return handled;
}
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
PMU_FORMAT_ATTR(inv, "config:23" );
PMU_FORMAT_ATTR(cmask, "config:24-31" );
static struct attribute *intel_knc_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
&format_attr_cmask.attr,
NULL,
};
static const struct x86_pmu knc_pmu __initconst = {
.name = "knc",
.handle_irq = knc_pmu_handle_irq,
.disable_all = knc_pmu_disable_all,
.enable_all = knc_pmu_enable_all,
.enable = knc_pmu_enable_event,
.disable = knc_pmu_disable_event,
.hw_config = x86_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_KNC_EVNTSEL0,
.perfctr = MSR_KNC_PERFCTR0,
.event_map = knc_pmu_event_map,
.max_events = ARRAY_SIZE(knc_perfmon_event_map),
.apic = 1,
.max_period = (1ULL << 39) - 1,
.version = 0,
.num_counters = 2,
.cntval_bits = 40,
.cntval_mask = (1ULL << 40) - 1,
.get_event_constraints = x86_get_event_constraints,
.event_constraints = knc_event_constraints,
.format_attrs = intel_knc_formats_attr,
};
__init int knc_pmu_init(void)
{
x86_pmu = knc_pmu;
memcpy(hw_cache_event_ids, knc_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
return 0;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,279 @@
#include <linux/perf_event.h>
#include <linux/types.h>
#include "perf_event.h"
/*
* Not sure about some of these
*/
static const u64 p6_perfmon_event_map[] =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */
[PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */
[PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */
[PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */
[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */
};
static const u64 __initconst p6_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
[ C(L1D) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
[ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(L1I ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
[ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(LL ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(ITLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
[ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
[ C(BPU ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */
[ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
};
static u64 p6_pmu_event_map(int hw_event)
{
return p6_perfmon_event_map[hw_event];
}
/*
* Event setting that is specified not to count anything.
* We use this to effectively disable a counter.
*
* L2_RQSTS with 0 MESI unit mask.
*/
#define P6_NOP_EVENT 0x0000002EULL
static struct event_constraint p6_event_constraints[] =
{
INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
EVENT_CONSTRAINT_END
};
static void p6_pmu_disable_all(void)
{
u64 val;
/* p6 only has one enable register */
rdmsrl(MSR_P6_EVNTSEL0, val);
val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsrl(MSR_P6_EVNTSEL0, val);
}
static void p6_pmu_enable_all(int added)
{
unsigned long val;
/* p6 only has one enable register */
rdmsrl(MSR_P6_EVNTSEL0, val);
val |= ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsrl(MSR_P6_EVNTSEL0, val);
}
static inline void
p6_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 val = P6_NOP_EVENT;
(void)wrmsrl_safe(hwc->config_base, val);
}
static void p6_pmu_enable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 val;
val = hwc->config;
/*
* p6 only has a global event enable, set on PerfEvtSel0
* We "disable" events by programming P6_NOP_EVENT
* and we rely on p6_pmu_enable_all() being called
* to actually enable the events.
*/
(void)wrmsrl_safe(hwc->config_base, val);
}
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
PMU_FORMAT_ATTR(pc, "config:19" );
PMU_FORMAT_ATTR(inv, "config:23" );
PMU_FORMAT_ATTR(cmask, "config:24-31" );
static struct attribute *intel_p6_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_pc.attr,
&format_attr_inv.attr,
&format_attr_cmask.attr,
NULL,
};
static __initconst const struct x86_pmu p6_pmu = {
.name = "p6",
.handle_irq = x86_pmu_handle_irq,
.disable_all = p6_pmu_disable_all,
.enable_all = p6_pmu_enable_all,
.enable = p6_pmu_enable_event,
.disable = p6_pmu_disable_event,
.hw_config = x86_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_P6_EVNTSEL0,
.perfctr = MSR_P6_PERFCTR0,
.event_map = p6_pmu_event_map,
.max_events = ARRAY_SIZE(p6_perfmon_event_map),
.apic = 1,
.max_period = (1ULL << 31) - 1,
.version = 0,
.num_counters = 2,
/*
* Events have 40 bits implemented. However they are designed such
* that bits [32-39] are sign extensions of bit 31. As such the
* effective width of a event for P6-like PMU is 32 bits only.
*
* See IA-32 Intel Architecture Software developer manual Vol 3B
*/
.cntval_bits = 32,
.cntval_mask = (1ULL << 32) - 1,
.get_event_constraints = x86_get_event_constraints,
.event_constraints = p6_event_constraints,
.format_attrs = intel_p6_formats_attr,
.events_sysfs_show = intel_event_sysfs_show,
};
static __init void p6_pmu_rdpmc_quirk(void)
{
if (boot_cpu_data.x86_mask < 9) {
/*
* PPro erratum 26; fixed in stepping 9 and above.
*/
pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
x86_pmu.attr_rdpmc_broken = 1;
x86_pmu.attr_rdpmc = 0;
}
}
__init int p6_pmu_init(void)
{
x86_pmu = p6_pmu;
switch (boot_cpu_data.x86_model) {
case 1: /* Pentium Pro */
x86_add_quirk(p6_pmu_rdpmc_quirk);
break;
case 3: /* Pentium II - Klamath */
case 5: /* Pentium II - Deschutes */
case 6: /* Pentium II - Mendocino */
break;
case 7: /* Pentium III - Katmai */
case 8: /* Pentium III - Coppermine */
case 10: /* Pentium III Xeon */
case 11: /* Pentium III - Tualatin */
break;
case 9: /* Pentium M - Banias */
case 13: /* Pentium M - Dothan */
break;
default:
pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
return -ENODEV;
}
memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
return 0;
}

View file

@ -0,0 +1,160 @@
/*
* local apic based NMI watchdog for various CPUs.
*
* This file also handles reservation of performance counters for coordination
* with other users (like oprofile).
*
* Note that these events normally don't tick when the CPU idles. This means
* the frequency varies with CPU load.
*
* Original code for K7/P6 written by Keith Owens
*
*/
#include <linux/percpu.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/smp.h>
#include <asm/nmi.h>
#include <linux/kprobes.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
/*
* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
* offset from MSR_P4_BSU_ESCR0.
*
* It will be the max for all platforms (for now)
*/
#define NMI_MAX_COUNTER_BITS 66
/*
* perfctr_nmi_owner tracks the ownership of the perfctr registers:
* evtsel_nmi_owner tracks the ownership of the event selection
* - different performance counters/ event selection may be reserved for
* different subsystems this reservation system just tries to coordinate
* things a little
*/
static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the performance counter register */
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if (msr >= MSR_F15H_PERF_CTR)
return (msr - MSR_F15H_PERF_CTR) >> 1;
return msr - MSR_K7_PERFCTR0;
case X86_VENDOR_INTEL:
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
return msr - MSR_ARCH_PERFMON_PERFCTR0;
switch (boot_cpu_data.x86) {
case 6:
return msr - MSR_P6_PERFCTR0;
case 11:
return msr - MSR_KNC_PERFCTR0;
case 15:
return msr - MSR_P4_BPU_PERFCTR0;
}
}
return 0;
}
/*
* converts an msr to an appropriate reservation bit
* returns the bit offset of the event selection register
*/
static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the event selection register */
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if (msr >= MSR_F15H_PERF_CTL)
return (msr - MSR_F15H_PERF_CTL) >> 1;
return msr - MSR_K7_EVNTSEL0;
case X86_VENDOR_INTEL:
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
return msr - MSR_ARCH_PERFMON_EVENTSEL0;
switch (boot_cpu_data.x86) {
case 6:
return msr - MSR_P6_EVNTSEL0;
case 11:
return msr - MSR_KNC_EVNTSEL0;
case 15:
return msr - MSR_P4_BSU_ESCR0;
}
}
return 0;
}
/* checks for a bit availability (hack for oprofile) */
int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
{
BUG_ON(counter > NMI_MAX_COUNTER_BITS);
return !test_bit(counter, perfctr_nmi_owner);
}
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
int reserve_perfctr_nmi(unsigned int msr)
{
unsigned int counter;
counter = nmi_perfctr_msr_to_bit(msr);
/* register not managed by the allocator? */
if (counter > NMI_MAX_COUNTER_BITS)
return 1;
if (!test_and_set_bit(counter, perfctr_nmi_owner))
return 1;
return 0;
}
EXPORT_SYMBOL(reserve_perfctr_nmi);
void release_perfctr_nmi(unsigned int msr)
{
unsigned int counter;
counter = nmi_perfctr_msr_to_bit(msr);
/* register not managed by the allocator? */
if (counter > NMI_MAX_COUNTER_BITS)
return;
clear_bit(counter, perfctr_nmi_owner);
}
EXPORT_SYMBOL(release_perfctr_nmi);
int reserve_evntsel_nmi(unsigned int msr)
{
unsigned int counter;
counter = nmi_evntsel_msr_to_bit(msr);
/* register not managed by the allocator? */
if (counter > NMI_MAX_COUNTER_BITS)
return 1;
if (!test_and_set_bit(counter, evntsel_nmi_owner))
return 1;
return 0;
}
EXPORT_SYMBOL(reserve_evntsel_nmi);
void release_evntsel_nmi(unsigned int msr)
{
unsigned int counter;
counter = nmi_evntsel_msr_to_bit(msr);
/* register not managed by the allocator? */
if (counter > NMI_MAX_COUNTER_BITS)
return;
clear_bit(counter, evntsel_nmi_owner);
}
EXPORT_SYMBOL(release_evntsel_nmi);

View file

@ -0,0 +1,21 @@
/*
* Strings for the various x86 power flags
*
* This file must not contain any executable code.
*/
#include <asm/cpufeature.h>
const char *const x86_power_flags[32] = {
"ts", /* temperature sensor */
"fid", /* frequency id control */
"vid", /* voltage id control */
"ttp", /* thermal trip */
"tm", /* hardware thermal control */
"stc", /* software thermal control */
"100mhzsteps", /* 100 MHz multiplier control */
"hwpstate", /* hardware P-state control */
"", /* tsc invariant mapped to constant_tsc */
"cpb", /* core performance boost */
"eff_freq_ro", /* Readonly aperf/mperf */
};

162
arch/x86/kernel/cpu/proc.c Normal file
View file

@ -0,0 +1,162 @@
#include <linux/smp.h>
#include <linux/timex.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
/*
* Get CPU information for use by the procfs.
*/
static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
unsigned int cpu)
{
#ifdef CONFIG_SMP
seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu)));
seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
seq_printf(m, "apicid\t\t: %d\n", c->apicid);
seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
#endif
}
#ifdef CONFIG_X86_32
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
{
seq_printf(m,
"fdiv_bug\t: %s\n"
"f00f_bug\t: %s\n"
"coma_bug\t: %s\n"
"fpu\t\t: %s\n"
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
"wp\t\t: %s\n",
static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
c->cpuid_level,
c->wp_works_ok ? "yes" : "no");
}
#else
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
{
seq_printf(m,
"fpu\t\t: yes\n"
"fpu_exception\t: yes\n"
"cpuid level\t: %d\n"
"wp\t\t: yes\n",
c->cpuid_level);
}
#endif
static int show_cpuinfo(struct seq_file *m, void *v)
{
struct cpuinfo_x86 *c = v;
unsigned int cpu;
int i;
cpu = c->cpu_index;
seq_printf(m, "processor\t: %u\n"
"vendor_id\t: %s\n"
"cpu family\t: %d\n"
"model\t\t: %u\n"
"model name\t: %s\n",
cpu,
c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
c->x86,
c->x86_model,
c->x86_model_id[0] ? c->x86_model_id : "unknown");
if (c->x86_mask || c->cpuid_level >= 0)
seq_printf(m, "stepping\t: %d\n", c->x86_mask);
else
seq_printf(m, "stepping\t: unknown\n");
if (c->microcode)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
unsigned int freq = cpufreq_quick_get(cpu);
if (!freq)
freq = cpu_khz;
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
freq / 1000, (freq % 1000));
}
/* Cache size */
if (c->x86_cache_size >= 0)
seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
show_cpuinfo_core(m, c, cpu);
show_cpuinfo_misc(m, c);
seq_printf(m, "flags\t\t:");
for (i = 0; i < 32*NCAPINTS; i++)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
seq_printf(m, "\nbugs\t\t:");
for (i = 0; i < 32*NBUGINTS; i++) {
unsigned int bug_bit = 32*NCAPINTS + i;
if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i])
seq_printf(m, " %s", x86_bug_flags[i]);
}
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
c->loops_per_jiffy/(500000/HZ),
(c->loops_per_jiffy/(5000/HZ)) % 100);
#ifdef CONFIG_X86_64
if (c->x86_tlbsize > 0)
seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
#endif
seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size);
seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
c->x86_phys_bits, c->x86_virt_bits);
seq_printf(m, "power management:");
for (i = 0; i < 32; i++) {
if (c->x86_power & (1 << i)) {
if (i < ARRAY_SIZE(x86_power_flags) &&
x86_power_flags[i])
seq_printf(m, "%s%s",
x86_power_flags[i][0] ? " " : "",
x86_power_flags[i]);
else
seq_printf(m, " [%d]", i);
}
}
seq_printf(m, "\n\n");
return 0;
}
static void *c_start(struct seq_file *m, loff_t *pos)
{
*pos = cpumask_next(*pos - 1, cpu_online_mask);
if ((*pos) < nr_cpu_ids)
return &cpu_data(*pos);
return NULL;
}
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
(*pos)++;
return c_start(m, pos);
}
static void c_stop(struct seq_file *m, void *v)
{
}
const struct seq_operations cpuinfo_op = {
.start = c_start,
.next = c_next,
.stop = c_stop,
.show = show_cpuinfo,
};

Some files were not shown because too many files have changed in this diff Show more