Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

View file

@ -0,0 +1,69 @@
#
# Makefile for x86-compatible CPU details, features and quirks
#
# Don't trace early stages of a secondary CPU boot
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_common.o = -pg
CFLAGS_REMOVE_perf_event.o = -pg
endif
# Make sure load_percpu_segment has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
obj-$(CONFIG_X86_32) += bugs.o
obj-$(CONFIG_X86_64) += bugs_64.o
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
ifdef CONFIG_PERF_EVENTS
obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o
ifdef CONFIG_AMD_IOMMU
obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
endif
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o
obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_snb.o \
perf_event_intel_uncore_snbep.o \
perf_event_intel_uncore_nhmex.o
endif
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_MICROCODE) += microcode/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
ifdef CONFIG_X86_FEATURE_NAMES
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
cpufeature = $(src)/../../include/asm/cpufeature.h
targets += capflags.c
$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
endif
clean-files += capflags.c

872
arch/x86/kernel/cpu/amd.c Normal file
View file

@ -0,0 +1,872 @@
#include <linux/export.h>
#include <linux/bitops.h>
#include <linux/elf.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/sched.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/smp.h>
#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
# include <asm/cacheflush.h>
#endif
#include "cpu.h"
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
int err;
WARN_ONCE((boot_cpu_data.x86 != 0xf),
"%s should only be used on K8!\n", __func__);
gprs[1] = msr;
gprs[7] = 0x9c5a203a;
err = rdmsr_safe_regs(gprs);
*p = gprs[0] | ((u64)gprs[2] << 32);
return err;
}
static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
{
u32 gprs[8] = { 0 };
WARN_ONCE((boot_cpu_data.x86 != 0xf),
"%s should only be used on K8!\n", __func__);
gprs[0] = (u32)val;
gprs[1] = msr;
gprs[2] = val >> 32;
gprs[7] = 0x9c5a203a;
return wrmsr_safe_regs(gprs);
}
/*
* B step AMD K6 before B 9730xxxx have hardware bugs that can cause
* misexecution of code under Linux. Owners of such processors should
* contact AMD for precise details and a CPU swap.
*
* See http://www.multimania.com/poulot/k6bug.html
* and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6"
* (Publication # 21266 Issue Date: August 1998)
*
* The following test is erm.. interesting. AMD neglected to up
* the chip setting when fixing the bug but they also tweaked some
* performance at the same time..
*/
extern __visible void vide(void);
__asm__(".globl vide\n\t.align 4\nvide: ret");
static void init_amd_k5(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
/*
* General Systems BIOSen alias the cpu frequency registers
* of the Elan at 0x000df000. Unfortuantly, one of the Linux
* drivers subsequently pokes it, and changes the CPU speed.
* Workaround : Remove the unneeded alias.
*/
#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
#define CBAR_ENB (0x80000000)
#define CBAR_KEY (0X000000CB)
if (c->x86_model == 9 || c->x86_model == 10) {
if (inl(CBAR) & CBAR_ENB)
outl(0 | CBAR_KEY, CBAR);
}
#endif
}
static void init_amd_k6(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
u32 l, h;
int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
if (c->x86_model < 6) {
/* Based on AMD doc 20734R - June 2000 */
if (c->x86_model == 0) {
clear_cpu_cap(c, X86_FEATURE_APIC);
set_cpu_cap(c, X86_FEATURE_PGE);
}
return;
}
if (c->x86_model == 6 && c->x86_mask == 1) {
const int K6_BUG_LOOP = 1000000;
int n;
void (*f_vide)(void);
unsigned long d, d2;
printk(KERN_INFO "AMD K6 stepping B detected - ");
/*
* It looks like AMD fixed the 2.6.2 bug and improved indirect
* calls at the same time.
*/
n = K6_BUG_LOOP;
f_vide = vide;
rdtscl(d);
while (n--)
f_vide();
rdtscl(d2);
d = d2-d;
if (d > 20*K6_BUG_LOOP)
printk(KERN_CONT
"system stability may be impaired when more than 32 MB are used.\n");
else
printk(KERN_CONT "probably OK (after B9730xxxx).\n");
}
/* K6 with old style WHCR */
if (c->x86_model < 8 ||
(c->x86_model == 8 && c->x86_mask < 8)) {
/* We can only write allocate on the low 508Mb */
if (mbytes > 508)
mbytes = 508;
rdmsr(MSR_K6_WHCR, l, h);
if ((l&0x0000FFFF) == 0) {
unsigned long flags;
l = (1<<0)|((mbytes/4)<<1);
local_irq_save(flags);
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
mbytes);
}
return;
}
if ((c->x86_model == 8 && c->x86_mask > 7) ||
c->x86_model == 9 || c->x86_model == 13) {
/* The more serious chips .. */
if (mbytes > 4092)
mbytes = 4092;
rdmsr(MSR_K6_WHCR, l, h);
if ((l&0xFFFF0000) == 0) {
unsigned long flags;
l = ((mbytes>>2)<<22)|(1<<16);
local_irq_save(flags);
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
mbytes);
}
return;
}
if (c->x86_model == 10) {
/* AMD Geode LX is model 10 */
/* placeholder for any needed mods */
return;
}
#endif
}
static void init_amd_k7(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
u32 l, h;
/*
* Bit 15 of Athlon specific MSR 15, needs to be 0
* to enable SSE on Palomino/Morgan/Barton CPU's.
* If the BIOS didn't enable it already, enable it here.
*/
if (c->x86_model >= 6 && c->x86_model <= 10) {
if (!cpu_has(c, X86_FEATURE_XMM)) {
printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
msr_clear_bit(MSR_K7_HWCR, 15);
set_cpu_cap(c, X86_FEATURE_XMM);
}
}
/*
* It's been determined by AMD that Athlons since model 8 stepping 1
* are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
* As per AMD technical note 27212 0.2
*/
if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
rdmsr(MSR_K7_CLK_CTL, l, h);
if ((l & 0xfff00000) != 0x20000000) {
printk(KERN_INFO
"CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
l, ((l & 0x000fffff)|0x20000000));
wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
}
}
set_cpu_cap(c, X86_FEATURE_K7);
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
/*
* Certain Athlons might work (for various values of 'work') in SMP
* but they are not certified as MP capable.
*/
/* Athlon 660/661 is valid. */
if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
(c->x86_mask == 1)))
return;
/* Duron 670 is valid */
if ((c->x86_model == 7) && (c->x86_mask == 0))
return;
/*
* Athlon 662, Duron 671, and Athlon >model 7 have capability
* bit. It's worth noting that the A5 stepping (662) of some
* Athlon XP's have the MP bit set.
* See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
* more.
*/
if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
((c->x86_model == 7) && (c->x86_mask >= 1)) ||
(c->x86_model > 7))
if (cpu_has(c, X86_FEATURE_MP))
return;
/* If we get here, not a certified SMP capable AMD system. */
/*
* Don't taint if we are running SMP kernel on a single non-MP
* approved Athlon
*/
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
#endif
}
#ifdef CONFIG_NUMA
/*
* To workaround broken NUMA config. Read the comment in
* srat_detect_node().
*/
static int nearby_node(int apicid)
{
int i, node;
for (i = apicid - 1; i >= 0; i--) {
node = __apicid_to_node[i];
if (node != NUMA_NO_NODE && node_online(node))
return node;
}
for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
node = __apicid_to_node[i];
if (node != NUMA_NO_NODE && node_online(node))
return node;
}
return first_node(node_online_map); /* Shouldn't happen */
}
#endif
/*
* Fixup core topology information for
* (1) AMD multi-node processors
* Assumption: Number of cores in each internal node is the same.
* (2) AMD processors supporting compute units
*/
#ifdef CONFIG_X86_HT
static void amd_get_topology(struct cpuinfo_x86 *c)
{
u32 nodes, cores_per_cu = 1;
u8 node_id;
int cpu = smp_processor_id();
/* get information required for multi-node processors */
if (cpu_has_topoext) {
u32 eax, ebx, ecx, edx;
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
nodes = ((ecx >> 8) & 7) + 1;
node_id = ecx & 7;
/* get compute unit information */
smp_num_siblings = ((ebx >> 8) & 3) + 1;
c->compute_unit_id = ebx & 0xff;
cores_per_cu += ((ebx >> 8) & 3);
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
nodes = ((value >> 3) & 7) + 1;
node_id = value & 7;
} else
return;
/* fixup multi-node processor information */
if (nodes > 1) {
u32 cores_per_node;
u32 cus_per_node;
set_cpu_cap(c, X86_FEATURE_AMD_DCM);
cores_per_node = c->x86_max_cores / nodes;
cus_per_node = cores_per_node / cores_per_cu;
/* store NodeID, use llc_shared_map to store sibling info */
per_cpu(cpu_llc_id, cpu) = node_id;
/* core id has to be in the [0 .. cores_per_node - 1] range */
c->cpu_core_id %= cores_per_node;
c->compute_unit_id %= cus_per_node;
}
}
#endif
/*
* On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
* Assumes number of cores is a power of two.
*/
static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_HT
unsigned bits;
int cpu = smp_processor_id();
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
/* Convert the initial APIC ID into the socket ID */
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
amd_get_topology(c);
#endif
}
u16 amd_get_nb_id(int cpu)
{
u16 id = 0;
#ifdef CONFIG_SMP
id = per_cpu(cpu_llc_id, cpu);
#endif
return id;
}
EXPORT_SYMBOL_GPL(amd_get_nb_id);
static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
int node;
unsigned apicid = c->apicid;
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
node = per_cpu(cpu_llc_id, cpu);
/*
* On multi-fabric platform (e.g. Numascale NumaChip) a
* platform-specific handler needs to be called to fixup some
* IDs of the CPU.
*/
if (x86_cpuinit.fixup_cpu_id)
x86_cpuinit.fixup_cpu_id(c, node);
if (!node_online(node)) {
/*
* Two possibilities here:
*
* - The CPU is missing memory and no node was created. In
* that case try picking one from a nearby CPU.
*
* - The APIC IDs differ from the HyperTransport node IDs
* which the K8 northbridge parsing fills in. Assume
* they are all increased by a constant offset, but in
* the same order as the HT nodeids. If that doesn't
* result in a usable node fall back to the path for the
* previous case.
*
* This workaround operates directly on the mapping between
* APIC ID and NUMA node, assuming certain relationship
* between APIC ID, HT node ID and NUMA topology. As going
* through CPU mapping may alter the outcome, directly
* access __apicid_to_node[].
*/
int ht_nodeid = c->initial_apicid;
if (ht_nodeid >= 0 &&
__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
/* Pick a nearby node */
if (!node_online(node))
node = nearby_node(apicid);
}
numa_set_node(cpu, node);
#endif
}
static void early_init_amd_mc(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_HT
unsigned bits, ecx;
/* Multi core CPU? */
if (c->extended_cpuid_level < 0x80000008)
return;
ecx = cpuid_ecx(0x80000008);
c->x86_max_cores = (ecx & 0xff) + 1;
/* CPU telling us the core id bits shift? */
bits = (ecx >> 12) & 0xF;
/* Otherwise recompute */
if (bits == 0) {
while ((1 << bits) < c->x86_max_cores)
bits++;
}
c->x86_coreid_bits = bits;
#endif
}
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
if (c->x86 >= 0xf) {
unsigned long long tseg;
/*
* Split up direct mapping around the TSEG SMM area.
* Don't do it for gbpages because there seems very little
* benefit in doing so.
*/
if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
unsigned long pfn = tseg >> PAGE_SHIFT;
printk(KERN_DEBUG "tseg: %010llx\n", tseg);
if (pfn_range_is_mapped(pfn, pfn + 1))
set_memory_4k((unsigned long)__va(tseg), 1);
}
}
#endif
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
if (c->x86 > 0x10 ||
(c->x86 == 0x10 && c->x86_model >= 0x2)) {
u64 val;
rdmsrl(MSR_K7_HWCR, val);
if (!(val & BIT(24)))
printk(KERN_WARNING FW_BUG "TSC doesn't count "
"with P0 frequency!\n");
}
}
if (c->x86 == 0x15) {
unsigned long upperbit;
u32 cpuid, assoc;
cpuid = cpuid_edx(0x80000005);
assoc = cpuid >> 16 & 0xff;
upperbit = ((cpuid >> 24) << 10) / assoc;
va_align.mask = (upperbit - 1) & PAGE_MASK;
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
}
}
static void early_init_amd(struct cpuinfo_x86 *c)
{
early_init_amd_mc(c);
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
* with P/T states and does not stop in deep C-states
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
set_sched_clock_stable();
}
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#else
/* Set MTRR capability flag if appropriate */
if (c->x86 == 5)
if (c->x86_model == 13 || c->x86_model == 9 ||
(c->x86_model == 8 && c->x86_mask >= 8))
set_cpu_cap(c, X86_FEATURE_K6_MTRR);
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
/* check CPU config space for extended APIC ID */
if (cpu_has_apic && c->x86 >= 0xf) {
unsigned int val;
val = read_pci_config(0, 24, 0, 0x68);
if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
}
#endif
/*
* This is only needed to tell the kernel whether to use VMCALL
* and VMMCALL. VMMCALL is never executed except under virt, so
* we can set it unconditionally.
*/
set_cpu_cap(c, X86_FEATURE_VMMCALL);
/* F16h erratum 793, CVE-2013-6885 */
if (c->x86 == 0x16 && c->x86_model <= 0xf)
msr_set_bit(MSR_AMD64_LS_CFG, 15);
}
static const int amd_erratum_383[];
static const int amd_erratum_400[];
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
static void init_amd_k8(struct cpuinfo_x86 *c)
{
u32 level;
u64 value;
/* On C+ stepping K8 rep microcode works well for copy/memset */
level = cpuid_eax(1);
if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/*
* Some BIOSes incorrectly force this feature, but only K8 revision D
* (model = 0x14) and later actually support it.
* (AMD Erratum #110, docId: 25759).
*/
if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
if (!rdmsrl_amd_safe(0xc001100d, &value)) {
value &= ~BIT_64(32);
wrmsrl_amd_safe(0xc001100d, value);
}
}
if (!c->x86_model_id[0])
strcpy(c->x86_model_id, "Hammer");
}
static void init_amd_gh(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
/* do this for boot cpu */
if (c == &boot_cpu_data)
check_enable_amd_mmconf_dmi();
fam10h_check_enable_mmcfg();
#endif
/*
* Disable GART TLB Walk Errors on Fam10h. We do this here because this
* is always needed when GART is enabled, even in a kernel which has no
* MCE support built in. BIOS should disable GartTlbWlk Errors already.
* If it doesn't, we do it here as suggested by the BKDG.
*
* Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
/*
* On family 10h BIOS may not have properly enabled WC+ support, causing
* it to be converted to CD memtype. This may result in performance
* degradation for certain nested-paging guests. Prevent this conversion
* by clearing bit 24 in MSR_AMD64_BU_CFG2.
*
* NOTE: we want to use the _safe accessors so as not to #GP kvm
* guests on older kvm hosts.
*/
msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
if (cpu_has_amd_erratum(c, amd_erratum_383))
set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
}
static void init_amd_bd(struct cpuinfo_x86 *c)
{
u64 value;
/* re-enable TopologyExtensions if switched off by BIOS */
if ((c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
if (msr_set_bit(0xc0011005, 54) > 0) {
rdmsrl(0xc0011005, value);
if (value & BIT_64(54)) {
set_cpu_cap(c, X86_FEATURE_TOPOEXT);
pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
}
}
}
/*
* The way access filter has a performance penalty on some workloads.
* Disable it on the affected CPUs.
*/
if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
value |= 0x1E;
wrmsrl_safe(0xc0011021, value);
}
}
}
static void init_amd(struct cpuinfo_x86 *c)
{
u32 dummy;
#ifdef CONFIG_SMP
/*
* Disable TLB flush filter by setting HWCR.FFDIS on K8
* bit 6 of msr C001_0015
*
* Errata 63 for SH-B3 steppings
* Errata 122 for all steppings (F+ have it disabled by default)
*/
if (c->x86 == 0xf)
msr_set_bit(MSR_K7_HWCR, 6);
#endif
early_init_amd(c);
/*
* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
* 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
*/
clear_cpu_cap(c, 0*32+31);
if (c->x86 >= 0x10)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/* get apicid instead of initial apic id from cpuid */
c->apicid = hard_smp_processor_id();
/* K6s reports MCEs but don't actually have all the MSRs */
if (c->x86 < 6)
clear_cpu_cap(c, X86_FEATURE_MCE);
switch (c->x86) {
case 4: init_amd_k5(c); break;
case 5: init_amd_k6(c); break;
case 6: init_amd_k7(c); break;
case 0xf: init_amd_k8(c); break;
case 0x10: init_amd_gh(c); break;
case 0x15: init_amd_bd(c); break;
}
/* Enable workaround for FXSAVE leak */
if (c->x86 >= 6)
set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
cpu_detect_cache_sizes(c);
/* Multi core CPU? */
if (c->extended_cpuid_level >= 0x80000008) {
amd_detect_cmp(c);
srat_detect_node(c);
}
#ifdef CONFIG_X86_32
detect_ht(c);
#endif
init_amd_cacheinfo(c);
if (c->x86 >= 0xf)
set_cpu_cap(c, X86_FEATURE_K8);
if (cpu_has_xmm2) {
/* MFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
}
/*
* Family 0x12 and above processors have APIC timer
* running in deep C states.
*/
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
}
#ifdef CONFIG_X86_32
static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* AMD errata T13 (order #21922) */
if ((c->x86 == 6)) {
/* Duron Rev A0 */
if (c->x86_model == 3 && c->x86_mask == 0)
size = 64;
/* Tbird rev A1/A2 */
if (c->x86_model == 4 &&
(c->x86_mask == 0 || c->x86_mask == 1))
size = 256;
}
return size;
}
#endif
static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
{
u32 ebx, eax, ecx, edx;
u16 mask = 0xfff;
if (c->x86 < 0xf)
return;
if (c->extended_cpuid_level < 0x80000006)
return;
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
tlb_lli_4k[ENTRIES] = ebx & mask;
/*
* K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
* characteristics from the CPUID function 0x80000005 instead.
*/
if (c->x86 == 0xf) {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
mask = 0xff;
}
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!((eax >> 16) & mask))
tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
else
tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
/* a 4M entry uses two 2M entries */
tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!(eax & mask)) {
/* Erratum 658 */
if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
tlb_lli_2m[ENTRIES] = 1024;
} else {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
tlb_lli_2m[ENTRIES] = eax & 0xff;
}
} else
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
}
static const struct cpu_dev amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
#ifdef CONFIG_X86_32
.legacy_models = {
{ .family = 4, .model_names =
{
[3] = "486 DX/2",
[7] = "486 DX/2-WB",
[8] = "486 DX/4",
[9] = "486 DX/4-WB",
[14] = "Am5x86-WT",
[15] = "Am5x86-WB"
}
},
},
.legacy_cache_size = amd_size_cache,
#endif
.c_early_init = early_init_amd,
.c_detect_tlb = cpu_detect_tlb_amd,
.c_bsp_init = bsp_init_amd,
.c_init = init_amd,
.c_x86_vendor = X86_VENDOR_AMD,
};
cpu_dev_register(amd_cpu_dev);
/*
* AMD errata checking
*
* Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
* AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
* have an OSVW id assigned, which it takes as first argument. Both take a
* variable number of family-specific model-stepping ranges created by
* AMD_MODEL_RANGE().
*
* Example:
*
* const int amd_erratum_319[] =
* AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
* AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
* AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
*/
#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
static const int amd_erratum_400[] =
AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
static const int amd_erratum_383[] =
AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
{
int osvw_id = *erratum++;
u32 range;
u32 ms;
if (osvw_id >= 0 && osvw_id < 65536 &&
cpu_has(cpu, X86_FEATURE_OSVW)) {
u64 osvw_len;
rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
if (osvw_id < osvw_len) {
u64 osvw_bits;
rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
osvw_bits);
return osvw_bits & (1ULL << (osvw_id & 0x3f));
}
}
/* OSVW unavailable or ID unknown, match family-model-stepping range */
ms = (cpu->x86_model << 4) | cpu->x86_mask;
while ((range = *erratum++))
if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
(ms >= AMD_MODEL_RANGE_START(range)) &&
(ms <= AMD_MODEL_RANGE_END(range)))
return true;
return false;
}

View file

@ -0,0 +1,94 @@
/*
* Copyright (C) 1994 Linus Torvalds
*
* Cyrix stuff, June 1998 by:
* - Rafael R. Reilova (moved everything from head.S),
* <rreilova@ececs.uc.edu>
* - Channing Corn (tests & fixes),
* - Andrew D. Balsa (code cleanup).
*/
#include <linux/init.h>
#include <linux/utsname.h>
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/processor-flags.h>
#include <asm/i387.h>
#include <asm/msr.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
static double __initdata x = 4195835.0;
static double __initdata y = 3145727.0;
/*
* This used to check for exceptions..
* However, it turns out that to support that,
* the XMM trap handlers basically had to
* be buggy. So let's have a correct XMM trap
* handler, and forget about printing out
* some status at boot.
*
* We should really only care about bugs here
* anyway. Not features.
*/
static void __init check_fpu(void)
{
s32 fdiv_bug;
kernel_fpu_begin();
/*
* trap_init() enabled FXSR and company _before_ testing for FP
* problems here.
*
* Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
*/
__asm__("fninit\n\t"
"fldl %1\n\t"
"fdivl %2\n\t"
"fmull %2\n\t"
"fldl %1\n\t"
"fsubp %%st,%%st(1)\n\t"
"fistpl %0\n\t"
"fwait\n\t"
"fninit"
: "=m" (*&fdiv_bug)
: "m" (*&x), "m" (*&y));
kernel_fpu_end();
if (fdiv_bug) {
set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
pr_warn("Hmm, FPU with FDIV bug\n");
}
}
void __init check_bugs(void)
{
identify_boot_cpu();
#ifndef CONFIG_SMP
pr_info("CPU: ");
print_cpu_info(&boot_cpu_data);
#endif
/*
* Check whether we are able to run this kernel safely on SMP.
*
* - i386 is no longer supported.
* - In order to run on anything without a TSC, we need to be
* compiled for a i486.
*/
if (boot_cpu_data.x86 < 4)
panic("Kernel requires i486+ for 'invlpg' and other features");
init_utsname()->machine[1] =
'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
alternative_instructions();
/*
* kernel_fpu_begin/end() in check_fpu() relies on the patched
* alternative instructions.
*/
if (cpu_has_fpu)
check_fpu();
}

View file

@ -0,0 +1,33 @@
/*
* Copyright (C) 1994 Linus Torvalds
* Copyright (C) 2000 SuSE
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <asm/alternative.h>
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/mtrr.h>
#include <asm/cacheflush.h>
void __init check_bugs(void)
{
identify_boot_cpu();
#if !defined(CONFIG_SMP)
printk(KERN_INFO "CPU: ");
print_cpu_info(&boot_cpu_data);
#endif
alternative_instructions();
/*
* Make sure the first 2MB area is not mapped by huge pages
* There are typically fixed size MTRRs in there and overlapping
* MTRRs into large pages causes slow downs.
*
* Right now we don't do that with gbpages because there seems
* very little benefit for that case.
*/
if (!direct_gbpages)
set_memory_4k((unsigned long)__va(0), 1);
}

View file

@ -0,0 +1,229 @@
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <asm/processor.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "cpu.h"
#define ACE_PRESENT (1 << 6)
#define ACE_ENABLED (1 << 7)
#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
#define RNG_PRESENT (1 << 2)
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
static void init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
/* Test for Centaur Extended Feature Flags presence */
if (cpuid_eax(0xC0000000) >= 0xC0000001) {
u32 tmp = cpuid_edx(0xC0000001);
/* enable ACE unit, if present and disabled */
if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= ACE_FCR; /* enable ACE unit */
wrmsr(MSR_VIA_FCR, lo, hi);
printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
}
/* enable RNG unit, if present and disabled */
if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
rdmsr(MSR_VIA_RNG, lo, hi);
lo |= RNG_ENABLE; /* enable RNG unit */
wrmsr(MSR_VIA_RNG, lo, hi);
printk(KERN_INFO "CPU: Enabled h/w RNG\n");
}
/* store Centaur Extended Feature Flags as
* word 5 of the CPU capability bit array
*/
c->x86_capability[5] = cpuid_edx(0xC0000001);
}
#ifdef CONFIG_X86_32
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
if (c->x86_model >= 6 && c->x86_model <= 13) {
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= (1<<1 | 1<<7);
wrmsr(MSR_VIA_FCR, lo, hi);
set_cpu_cap(c, X86_FEATURE_CX8);
}
/* Before Nehemiah, the C3's had 3dNOW! */
if (c->x86_model >= 6 && c->x86_model < 9)
set_cpu_cap(c, X86_FEATURE_3DNOW);
#endif
if (c->x86 == 0x6 && c->x86_model >= 0xf) {
c->x86_cache_alignment = c->x86_clflush_size * 2;
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
cpu_detect_cache_sizes(c);
}
enum {
ECX8 = 1<<1,
EIERRINT = 1<<2,
DPM = 1<<3,
DMCE = 1<<4,
DSTPCLK = 1<<5,
ELINEAR = 1<<6,
DSMC = 1<<7,
DTLOCK = 1<<8,
EDCTLB = 1<<8,
EMMX = 1<<9,
DPDC = 1<<11,
EBRPRED = 1<<12,
DIC = 1<<13,
DDC = 1<<14,
DNA = 1<<15,
ERETSTK = 1<<16,
E2MMX = 1<<19,
EAMD3D = 1<<20,
};
static void early_init_centaur(struct cpuinfo_x86 *c)
{
switch (c->x86) {
#ifdef CONFIG_X86_32
case 5:
/* Emulate MTRRs using Centaur's MCR. */
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
break;
#endif
case 6:
if (c->x86_model >= 0xf)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
break;
}
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
}
static void init_centaur(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
char *name;
u32 fcr_set = 0;
u32 fcr_clr = 0;
u32 lo, hi, newlo;
u32 aa, bb, cc, dd;
/*
* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
* 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
*/
clear_cpu_cap(c, 0*32+31);
#endif
early_init_centaur(c);
switch (c->x86) {
#ifdef CONFIG_X86_32
case 5:
switch (c->x86_model) {
case 4:
name = "C6";
fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
fcr_clr = DPDC;
printk(KERN_NOTICE "Disabling bugged TSC.\n");
clear_cpu_cap(c, X86_FEATURE_TSC);
break;
case 8:
switch (c->x86_mask) {
default:
name = "2";
break;
case 7 ... 9:
name = "2A";
break;
case 10 ... 15:
name = "2B";
break;
}
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
break;
case 9:
name = "3";
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
break;
default:
name = "??";
}
rdmsr(MSR_IDT_FCR1, lo, hi);
newlo = (lo|fcr_set) & (~fcr_clr);
if (newlo != lo) {
printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n",
lo, newlo);
wrmsr(MSR_IDT_FCR1, newlo, hi);
} else {
printk(KERN_INFO "Centaur FCR is 0x%X\n", lo);
}
/* Emulate MTRRs using Centaur's MCR. */
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
/* Report CX8 */
set_cpu_cap(c, X86_FEATURE_CX8);
/* Set 3DNow! on Winchip 2 and above. */
if (c->x86_model >= 8)
set_cpu_cap(c, X86_FEATURE_3DNOW);
/* See if we can find out some more. */
if (cpuid_eax(0x80000000) >= 0x80000005) {
/* Yes, we can. */
cpuid(0x80000005, &aa, &bb, &cc, &dd);
/* Add L1 data and code cache sizes. */
c->x86_cache_size = (cc>>24)+(dd>>24);
}
sprintf(c->x86_model_id, "WinChip %s", name);
break;
#endif
case 6:
init_c3(c);
break;
}
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
}
#ifdef CONFIG_X86_32
static unsigned int
centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
size >>= 8;
/*
* There's also an erratum in Nehemiah stepping 1, which
* returns '65KB' instead of '64KB'
* - Note, it seems this may only be in engineering samples.
*/
if ((c->x86 == 6) && (c->x86_model == 9) &&
(c->x86_mask == 1) && (size == 65))
size -= 1;
return size;
}
#endif
static const struct cpu_dev centaur_cpu_dev = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
.c_init = init_centaur,
#ifdef CONFIG_X86_32
.legacy_cache_size = centaur_size_cache,
#endif
.c_x86_vendor = X86_VENDOR_CENTAUR,
};
cpu_dev_register(centaur_cpu_dev);

1450
arch/x86/kernel/cpu/common.c Normal file

File diff suppressed because it is too large Load diff

48
arch/x86/kernel/cpu/cpu.h Normal file
View file

@ -0,0 +1,48 @@
#ifndef ARCH_X86_CPU_H
#define ARCH_X86_CPU_H
/* attempt to consolidate cpu attributes */
struct cpu_dev {
const char *c_vendor;
/* some have two possibilities for cpuid string */
const char *c_ident[2];
void (*c_early_init)(struct cpuinfo_x86 *);
void (*c_bsp_init)(struct cpuinfo_x86 *);
void (*c_init)(struct cpuinfo_x86 *);
void (*c_identify)(struct cpuinfo_x86 *);
void (*c_detect_tlb)(struct cpuinfo_x86 *);
int c_x86_vendor;
#ifdef CONFIG_X86_32
/* Optional vendor specific routine to obtain the cache size. */
unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *,
unsigned int);
/* Family/stepping-based lookup table for model names. */
struct legacy_cpu_model_info {
int family;
const char *model_names[16];
} legacy_models[5];
#endif
};
struct _tlb_table {
unsigned char descriptor;
char tlb_type;
unsigned int entries;
/* unsigned int ways; */
char info[128];
};
#define cpu_dev_register(cpu_devX) \
static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
__attribute__((__section__(".x86_cpu_dev.init"))) = \
&cpu_devX;
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
#endif /* ARCH_X86_CPU_H */

461
arch/x86/kernel/cpu/cyrix.c Normal file
View file

@ -0,0 +1,461 @@
#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/pci.h>
#include <asm/dma.h>
#include <linux/io.h>
#include <asm/processor-cyrix.h>
#include <asm/processor-flags.h>
#include <linux/timer.h>
#include <asm/pci-direct.h>
#include <asm/tsc.h>
#include "cpu.h"
/*
* Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
*/
static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned char ccr2, ccr3;
/* we test for DEVID by checking whether CCR3 is writable */
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, ccr3 ^ 0x80);
getCx86(0xc0); /* dummy to change bus */
if (getCx86(CX86_CCR3) == ccr3) { /* no DEVID regs. */
ccr2 = getCx86(CX86_CCR2);
setCx86(CX86_CCR2, ccr2 ^ 0x04);
getCx86(0xc0); /* dummy */
if (getCx86(CX86_CCR2) == ccr2) /* old Cx486SLC/DLC */
*dir0 = 0xfd;
else { /* Cx486S A step */
setCx86(CX86_CCR2, ccr2);
*dir0 = 0xfe;
}
} else {
setCx86(CX86_CCR3, ccr3); /* restore CCR3 */
/* read DIR0 and DIR1 CPU registers */
*dir0 = getCx86(CX86_DIR0);
*dir1 = getCx86(CX86_DIR1);
}
}
static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned long flags;
local_irq_save(flags);
__do_cyrix_devid(dir0, dir1);
local_irq_restore(flags);
}
/*
* Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
* order to identify the Cyrix CPU model after we're out of setup.c
*
* Actually since bugs.h doesn't even reference this perhaps someone should
* fix the documentation ???
*/
static unsigned char Cx86_dir0_msb = 0;
static const char Cx86_model[][9] = {
"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
"M II ", "Unknown"
};
static const char Cx486_name[][5] = {
"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
"SRx2", "DRx2"
};
static const char Cx486S_name[][4] = {
"S", "S2", "Se", "S2e"
};
static const char Cx486D_name[][4] = {
"DX", "DX2", "?", "?", "?", "DX4"
};
static char Cx86_cb[] = "?.5x Core/Bus Clock";
static const char cyrix_model_mult1[] = "12??43";
static const char cyrix_model_mult2[] = "12233445";
/*
* Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
* BIOSes for compatibility with DOS games. This makes the udelay loop
* work correctly, and improves performance.
*
* FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
*/
static void check_cx686_slop(struct cpuinfo_x86 *c)
{
unsigned long flags;
if (Cx86_dir0_msb == 3) {
unsigned char ccr3, ccr5;
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
ccr5 = getCx86(CX86_CCR5);
if (ccr5 & 2)
setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
local_irq_restore(flags);
if (ccr5 & 2) { /* possible wrong calibration done */
printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
calibrate_delay();
c->loops_per_jiffy = loops_per_jiffy;
}
}
}
static void set_cx86_reorder(void)
{
u8 ccr3;
printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* Load/Store Serialize to mem access disable (=reorder it) */
setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
/* set load/store serialize from 1GB to 4GB */
ccr3 |= 0xe0;
setCx86(CX86_CCR3, ccr3);
}
static void set_cx86_memwb(void)
{
printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
/* CCR2 bit 2: unlock NW bit */
setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
/* set 'Not Write-through' */
write_cr0(read_cr0() | X86_CR0_NW);
/* CCR2 bit 2: lock NW bit and set WT1 */
setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
}
/*
* Configure later MediaGX and/or Geode processor.
*/
static void geode_configure(void)
{
unsigned long flags;
u8 ccr3;
local_irq_save(flags);
/* Suspend on halt power saving and enable #SUSP pin */
setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* FPU fast, DTE cache, Mem bypass */
setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
set_cx86_memwb();
set_cx86_reorder();
local_irq_restore(flags);
}
static void early_init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir1 = 0;
__do_cyrix_devid(&dir0, &dir1);
dir0_msn = dir0 >> 4; /* identifies CPU "family" */
switch (dir0_msn) {
case 3: /* 6x86/6x86L */
/* Emulate MTRRs using Cyrix's ARRs. */
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
break;
case 5: /* 6x86MX/M II */
/* Emulate MTRRs using Cyrix's ARRs. */
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
break;
}
}
static void init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
char *buf = c->x86_model_id;
const char *p = NULL;
/*
* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
* 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
*/
clear_cpu_cap(c, 0*32+31);
/* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */
if (test_cpu_cap(c, 1*32+24)) {
clear_cpu_cap(c, 1*32+24);
set_cpu_cap(c, X86_FEATURE_CXMMX);
}
do_cyrix_devid(&dir0, &dir1);
check_cx686_slop(c);
Cx86_dir0_msb = dir0_msn = dir0 >> 4; /* identifies CPU "family" */
dir0_lsn = dir0 & 0xf; /* model or clock multiplier */
/* common case step number/rev -- exceptions handled below */
c->x86_model = (dir1 >> 4) + 1;
c->x86_mask = dir1 & 0xf;
/* Now cook; the original recipe is by Channing Corn, from Cyrix.
* We do the same thing for each generation: we work out
* the model, multiplier and stepping. Black magic included,
* to make the silicon step/rev numbers match the printed ones.
*/
switch (dir0_msn) {
unsigned char tmp;
case 0: /* Cx486SLC/DLC/SRx/DRx */
p = Cx486_name[dir0_lsn & 7];
break;
case 1: /* Cx486S/DX/DX2/DX4 */
p = (dir0_lsn & 8) ? Cx486D_name[dir0_lsn & 5]
: Cx486S_name[dir0_lsn & 3];
break;
case 2: /* 5x86 */
Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
p = Cx86_cb+2;
break;
case 3: /* 6x86/6x86L */
Cx86_cb[1] = ' ';
Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
if (dir1 > 0x21) { /* 686L */
Cx86_cb[0] = 'L';
p = Cx86_cb;
(c->x86_model)++;
} else /* 686 */
p = Cx86_cb+1;
/* Emulate MTRRs using Cyrix's ARRs. */
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
/* 6x86's contain this bug */
set_cpu_bug(c, X86_BUG_COMA);
break;
case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
#ifdef CONFIG_PCI
{
u32 vendor, device;
/*
* It isn't really a PCI quirk directly, but the cure is the
* same. The MediaGX has deep magic SMM stuff that handles the
* SB emulation. It throws away the fifo on disable_dma() which
* is wrong and ruins the audio.
*
* Bug2: VSA1 has a wrap bug so that using maximum sized DMA
* causes bad things. According to NatSemi VSA2 has another
* bug to do with 'hlt'. I've not seen any boards using VSA2
* and X doesn't seem to support it either so who cares 8).
* VSA1 we work around however.
*/
printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
isa_dma_bridge_buggy = 2;
/* We do this before the PCI layer is running. However we
are safe here as we know the bridge must be a Cyrix
companion and must be present */
vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);
/*
* The 5510/5520 companion chips have a funky PIT.
*/
if (vendor == PCI_VENDOR_ID_CYRIX &&
(device == PCI_DEVICE_ID_CYRIX_5510 ||
device == PCI_DEVICE_ID_CYRIX_5520))
mark_tsc_unstable("cyrix 5510/5520 detected");
}
#endif
c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
/* Enable cxMMX extensions (GX1 Datasheet 54) */
setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
/*
* GXm : 0x30 ... 0x5f GXm datasheet 51
* GXlv: 0x6x GXlv datasheet 54
* ? : 0x7x
* GX1 : 0x8x GX1 datasheet 56
*/
if ((0x30 <= dir1 && dir1 <= 0x6f) ||
(0x80 <= dir1 && dir1 <= 0x8f))
geode_configure();
return;
} else { /* MediaGX */
Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
p = Cx86_cb+2;
c->x86_model = (dir1 & 0x20) ? 1 : 2;
}
break;
case 5: /* 6x86MX/M II */
if (dir1 > 7) {
dir0_msn++; /* M II */
/* Enable MMX extensions (App note 108) */
setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
} else {
/* A 6x86MX - it has the bug. */
set_cpu_bug(c, X86_BUG_COMA);
}
tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
p = Cx86_cb+tmp;
if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20))
(c->x86_model)++;
/* Emulate MTRRs using Cyrix's ARRs. */
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
break;
case 0xf: /* Cyrix 486 without DEVID registers */
switch (dir0_lsn) {
case 0xd: /* either a 486SLC or DLC w/o DEVID */
dir0_msn = 0;
p = Cx486_name[(cpu_has_fpu ? 1 : 0)];
break;
case 0xe: /* a 486S A step */
dir0_msn = 0;
p = Cx486S_name[0];
break;
}
break;
default: /* unknown (shouldn't happen, we know everyone ;-) */
dir0_msn = 7;
break;
}
strcpy(buf, Cx86_model[dir0_msn & 7]);
if (p)
strcat(buf, p);
return;
}
/*
* Handle National Semiconductor branded processors
*/
static void init_nsc(struct cpuinfo_x86 *c)
{
/*
* There may be GX1 processors in the wild that are branded
* NSC and not Cyrix.
*
* This function only handles the GX processor, and kicks every
* thing else to the Cyrix init function above - that should
* cover any processors that might have been branded differently
* after NSC acquired Cyrix.
*
* If this breaks your GX1 horribly, please e-mail
* info-linux@ldcmail.amd.com to tell us.
*/
/* Handle the GX (Formally known as the GX2) */
if (c->x86 == 5 && c->x86_model == 5)
cpu_detect_cache_sizes(c);
else
init_cyrix(c);
}
/*
* Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected
* by the fact that they preserve the flags across the division of 5/2.
* PII and PPro exhibit this behavior too, but they have cpuid available.
*/
/*
* Perform the Cyrix 5/2 test. A Cyrix won't change
* the flags, while other 486 chips will.
*/
static inline int test_cyrix_52div(void)
{
unsigned int test;
__asm__ __volatile__(
"sahf\n\t" /* clear flags (%eax = 0x0005) */
"div %b2\n\t" /* divide 5 by 2 */
"lahf" /* store flags into %ah */
: "=a" (test)
: "0" (5), "q" (2)
: "cc");
/* AH is 0x02 on Cyrix after the divide.. */
return (unsigned char) (test >> 8) == 0x02;
}
static void cyrix_identify(struct cpuinfo_x86 *c)
{
/* Detect Cyrix with disabled CPUID */
if (c->x86 == 4 && test_cyrix_52div()) {
unsigned char dir0, dir1;
strcpy(c->x86_vendor_id, "CyrixInstead");
c->x86_vendor = X86_VENDOR_CYRIX;
/* Actually enable cpuid on the older cyrix */
/* Retrieve CPU revisions */
do_cyrix_devid(&dir0, &dir1);
dir0 >>= 4;
/* Check it is an affected model */
if (dir0 == 5 || dir0 == 3) {
unsigned char ccr3;
unsigned long flags;
printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
/* enable MAPEN */
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
/* enable cpuid */
setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80);
/* disable MAPEN */
setCx86(CX86_CCR3, ccr3);
local_irq_restore(flags);
}
}
}
static const struct cpu_dev cyrix_cpu_dev = {
.c_vendor = "Cyrix",
.c_ident = { "CyrixInstead" },
.c_early_init = early_init_cyrix,
.c_init = init_cyrix,
.c_identify = cyrix_identify,
.c_x86_vendor = X86_VENDOR_CYRIX,
};
cpu_dev_register(cyrix_cpu_dev);
static const struct cpu_dev nsc_cpu_dev = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
.c_x86_vendor = X86_VENDOR_NSC,
};
cpu_dev_register(nsc_cpu_dev);

View file

@ -0,0 +1,87 @@
/*
* Common hypervisor code
*
* Copyright (C) 2008, VMware, Inc.
* Author : Alok N Kataria <akataria@vmware.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
#include <linux/module.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
#ifdef CONFIG_XEN_PVHVM
&x86_hyper_xen_hvm,
#endif
&x86_hyper_vmware,
&x86_hyper_ms_hyperv,
#ifdef CONFIG_KVM_GUEST
&x86_hyper_kvm,
#endif
};
const struct hypervisor_x86 *x86_hyper;
EXPORT_SYMBOL(x86_hyper);
static inline void __init
detect_hypervisor_vendor(void)
{
const struct hypervisor_x86 *h, * const *p;
uint32_t pri, max_pri = 0;
for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
h = *p;
pri = h->detect();
if (pri != 0 && pri > max_pri) {
max_pri = pri;
x86_hyper = h;
}
}
if (max_pri)
printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name);
}
void init_hypervisor(struct cpuinfo_x86 *c)
{
if (x86_hyper && x86_hyper->set_cpu_features)
x86_hyper->set_cpu_features(c);
}
void __init init_hypervisor_platform(void)
{
detect_hypervisor_vendor();
if (!x86_hyper)
return;
init_hypervisor(&boot_cpu_data);
if (x86_hyper->init_platform)
x86_hyper->init_platform();
}
bool __init hypervisor_x2apic_available(void)
{
return x86_hyper &&
x86_hyper->x2apic_available &&
x86_hyper->x2apic_available();
}

756
arch/x86/kernel/cpu/intel.c Normal file
View file

@ -0,0 +1,756 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/bitops.h>
#include <linux/smp.h>
#include <linux/sched.h>
#include <linux/thread_info.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <asm/pgtable.h>
#include <asm/msr.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
#endif
#include "cpu.h"
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/mpspec.h>
#include <asm/apic.h>
#endif
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
/* Unmask CPUID levels if masked: */
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
c->cpuid_level = cpuid_eax(0);
get_cpu_cap(c);
}
}
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
unsigned lower_word;
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* Required by the SDM */
sync_core();
rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
}
/*
* Atom erratum AAE44/AAF40/AAG38/AAH41:
*
* A race condition between speculative fetches and invalidating
* a large page. This is worked around in microcode, but we
* need the microcode to have already been loaded... so if it is
* not, recommend a BIOS update and disable large pages.
*/
if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 &&
c->microcode < 0x20e) {
printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
clear_cpu_cap(c, X86_FEATURE_PSE);
}
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#else
/* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
if (c->x86 == 15 && c->x86_cache_alignment == 64)
c->x86_cache_alignment = 128;
#endif
/* CPUID workaround for 0F33/0F34 CPU */
if (c->x86 == 0xF && c->x86_model == 0x3
&& (c->x86_mask == 0x3 || c->x86_mask == 0x4))
c->x86_phys_bits = 36;
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
* with P/T states and does not stop in deep C-states.
*
* It is also reliable across cores and sockets. (but not across
* cabinets - we turn it off in that case explicitly.)
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
set_sched_clock_stable();
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
if (c->x86 == 6) {
switch (c->x86_model) {
case 0x27: /* Penwell */
case 0x35: /* Cloverview */
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
break;
default:
break;
}
}
/*
* There is a known erratum on Pentium III and Core Solo
* and Core Duo CPUs.
* " Page with PAT set to WC while associated MTRR is UC
* may consolidate to UC "
* Because of this erratum, it is better to stick with
* setting WC in MTRR rather than using PAT on these CPUs.
*
* Enable PAT WC only on P4, Core 2 or later CPUs.
*/
if (c->x86 == 6 && c->x86_model < 15)
clear_cpu_cap(c, X86_FEATURE_PAT);
#ifdef CONFIG_KMEMCHECK
/*
* P4s have a "fast strings" feature which causes single-
* stepping REP instructions to only generate a #DB on
* cache-line boundaries.
*
* Ingo Molnar reported a Pentium D (model 6) and a Xeon
* (model 2) with the same problem.
*/
if (c->x86 == 15)
if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
pr_info("kmemcheck: Disabling fast string operations\n");
#endif
/*
* If fast string is not enabled in IA32_MISC_ENABLE for any reason,
* clear the fast string and enhanced fast string CPU capabilities.
*/
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
printk(KERN_INFO "Disabled fast string operations\n");
setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
setup_clear_cpu_cap(X86_FEATURE_ERMS);
}
}
/*
* Intel Quark Core DevMan_001.pdf section 6.4.11
* "The operating system also is required to invalidate (i.e., flush)
* the TLB when any changes are made to any of the page table entries.
* The operating system must reload CR3 to cause the TLB to be flushed"
*
* As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should
* be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE
* to be modified
*/
if (c->x86 == 5 && c->x86_model == 9) {
pr_info("Disabling PGE capability bit\n");
setup_clear_cpu_cap(X86_FEATURE_PGE);
}
}
#ifdef CONFIG_X86_32
/*
* Early probe support logic for ppro memory erratum #50
*
* This is called before we do cpu ident work
*/
int ppro_with_ram_bug(void)
{
/* Uses data from early_cpu_detect now */
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
boot_cpu_data.x86 == 6 &&
boot_cpu_data.x86_model == 1 &&
boot_cpu_data.x86_mask < 8) {
printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
return 1;
}
return 0;
}
static void intel_smp_check(struct cpuinfo_x86 *c)
{
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
/*
* Mask B, Pentium, but not Pentium MMX
*/
if (c->x86 == 5 &&
c->x86_mask >= 1 && c->x86_mask <= 4 &&
c->x86_model <= 3) {
/*
* Remember we have B step Pentia with bugs
*/
WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
"with B stepping processors.\n");
}
}
static int forcepae;
static int __init forcepae_setup(char *__unused)
{
forcepae = 1;
return 1;
}
__setup("forcepae", forcepae_setup);
static void intel_workarounds(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_F00F_BUG
/*
* All models of Pentium and Pentium with MMX technology CPUs
* have the F0 0F bug, which lets nonprivileged users lock up the
* system. Announce that the fault handler will be checking for it.
* The Quark is also family 5, but does not have the same bug.
*/
clear_cpu_bug(c, X86_BUG_F00F);
if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) {
static int f00f_workaround_enabled;
set_cpu_bug(c, X86_BUG_F00F);
if (!f00f_workaround_enabled) {
printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
f00f_workaround_enabled = 1;
}
}
#endif
/*
* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
* model 3 mask 3
*/
if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
* PAE CPUID issue: many Pentium M report no PAE but may have a
* functionally usable PAE implementation.
* Forcefully enable PAE if kernel parameter "forcepae" is present.
*/
if (forcepae) {
printk(KERN_WARNING "PAE forced!\n");
set_cpu_cap(c, X86_FEATURE_PAE);
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
}
/*
* P4 Xeon errata 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
if (msr_set_bit(MSR_IA32_MISC_ENABLE,
MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
> 0) {
pr_info("CPU: C0 stepping P4 Xeon detected.\n");
pr_info("CPU: Disabling hardware prefetching (Errata 037)\n");
}
}
/*
* See if we have a good local APIC by checking for buggy Pentia,
* i.e. all B steppings and the C2 stepping of P54C when using their
* integrated APIC (see 11AP erratum in "Pentium Processor
* Specification Update").
*/
if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
(c->x86_mask < 0x6 || c->x86_mask == 0xb))
set_cpu_bug(c, X86_BUG_11AP);
#ifdef CONFIG_X86_INTEL_USERCOPY
/*
* Set up the preferred alignment for movsl bulk memory moves
*/
switch (c->x86) {
case 4: /* 486: untested */
break;
case 5: /* Old Pentia: untested */
break;
case 6: /* PII/PIII only like movsl with 8-byte alignment */
movsl_mask.mask = 7;
break;
case 15: /* P4 is OK down to 8-byte alignment */
movsl_mask.mask = 7;
break;
}
#endif
intel_smp_check(c);
}
#else
static void intel_workarounds(struct cpuinfo_x86 *c)
{
}
#endif
static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
unsigned node;
int cpu = smp_processor_id();
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE || !node_online(node)) {
/* reuse the value from init_cpu_to_node() */
node = cpu_to_node(cpu);
}
numa_set_node(cpu, node);
#endif
}
/*
* find out the number of processor cores on the die
*/
static int intel_num_cpu_cores(struct cpuinfo_x86 *c)
{
unsigned int eax, ebx, ecx, edx;
if (c->cpuid_level < 4)
return 1;
/* Intel has a non-standard dependency on %ecx for this CPUID level. */
cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
if (eax & 0x1f)
return (eax >> 26) + 1;
else
return 1;
}
static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
{
/* Intel VMX MSR indicated features */
#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
clear_cpu_cap(c, X86_FEATURE_VNMI);
clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
clear_cpu_cap(c, X86_FEATURE_EPT);
clear_cpu_cap(c, X86_FEATURE_VPID);
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
msr_ctl = vmx_msr_high | vmx_msr_low;
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
set_cpu_cap(c, X86_FEATURE_VNMI);
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
vmx_msr_low, vmx_msr_high);
msr_ctl2 = vmx_msr_high | vmx_msr_low;
if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
(msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
set_cpu_cap(c, X86_FEATURE_EPT);
if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
set_cpu_cap(c, X86_FEATURE_VPID);
}
}
static void init_intel(struct cpuinfo_x86 *c)
{
unsigned int l2 = 0;
early_init_intel(c);
intel_workarounds(c);
/*
* Detect the extended topology information if available. This
* will reinitialise the initial_apicid which will be used
* in init_intel_cacheinfo()
*/
detect_extended_topology(c);
if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
/*
* let's use the legacy cpuid vector 0x1 and 0x4 for topology
* detection.
*/
c->x86_max_cores = intel_num_cpu_cores(c);
#ifdef CONFIG_X86_32
detect_ht(c);
#endif
}
l2 = init_intel_cacheinfo(c);
/* Detect legacy cache sizes if init_intel_cacheinfo did not */
if (l2 == 0) {
cpu_detect_cache_sizes(c);
l2 = c->x86_cache_size;
}
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
/* Check for version and the number of counters */
if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
if (cpu_has_xmm2)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
if (cpu_has_ds) {
unsigned int l1;
rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
if (!(l1 & (1<<11)))
set_cpu_cap(c, X86_FEATURE_BTS);
if (!(l1 & (1<<12)))
set_cpu_cap(c, X86_FEATURE_PEBS);
}
if (c->x86 == 6 && cpu_has_clflush &&
(c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
#ifdef CONFIG_X86_64
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
#else
/*
* Names for the Pentium II/Celeron processors
* detectable only by also checking the cache size.
* Dixon is NOT a Celeron.
*/
if (c->x86 == 6) {
char *p = NULL;
switch (c->x86_model) {
case 5:
if (l2 == 0)
p = "Celeron (Covington)";
else if (l2 == 256)
p = "Mobile Pentium II (Dixon)";
break;
case 6:
if (l2 == 128)
p = "Celeron (Mendocino)";
else if (c->x86_mask == 0 || c->x86_mask == 5)
p = "Celeron-A";
break;
case 8:
if (l2 == 128)
p = "Celeron (Coppermine)";
break;
}
if (p)
strcpy(c->x86_model_id, p);
}
if (c->x86 == 15)
set_cpu_cap(c, X86_FEATURE_P4);
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_P3);
#endif
/* Work around errata */
srat_detect_node(c);
if (cpu_has(c, X86_FEATURE_VMX))
detect_vmx_virtcap(c);
/*
* Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
* x86_energy_perf_policy(8) is available to change it at run-time
*/
if (cpu_has(c, X86_FEATURE_EPB)) {
u64 epb;
rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
" Set to 'normal', was 'performance'\n"
"ENERGY_PERF_BIAS: View and update with"
" x86_energy_perf_policy(8)\n");
epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
}
}
}
#ifdef CONFIG_X86_32
static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/*
* Intel PIII Tualatin. This comes in two flavours.
* One has 256kb of cache, the other 512. We have no way
* to determine which, so we use a boottime override
* for the 512kb model, and assume 256 otherwise.
*/
if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
size = 256;
/*
* Intel Quark SoC X1000 contains a 4-way set associative
* 16K cache with a 16 byte cache line and 256 lines per tag
*/
if ((c->x86 == 5) && (c->x86_model == 9))
size = 16;
return size;
}
#endif
#define TLB_INST_4K 0x01
#define TLB_INST_4M 0x02
#define TLB_INST_2M_4M 0x03
#define TLB_INST_ALL 0x05
#define TLB_INST_1G 0x06
#define TLB_DATA_4K 0x11
#define TLB_DATA_4M 0x12
#define TLB_DATA_2M_4M 0x13
#define TLB_DATA_4K_4M 0x14
#define TLB_DATA_1G 0x16
#define TLB_DATA0_4K 0x21
#define TLB_DATA0_4M 0x22
#define TLB_DATA0_2M_4M 0x23
#define STLB_4K 0x41
#define STLB_4K_2M 0x42
static const struct _tlb_table intel_tlb_table[] = {
{ 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
{ 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
{ 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
{ 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
{ 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
{ 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
{ 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
{ 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
{ 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
{ 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
{ 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
{ 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
{ 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
{ 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
{ 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
{ 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
{ 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
{ 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
{ 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" },
{ 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" },
{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
{ 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
{ 0x00, 0, 0 }
};
static void intel_tlb_lookup(const unsigned char desc)
{
unsigned char k;
if (desc == 0)
return;
/* look up this descriptor in the table */
for (k = 0; intel_tlb_table[k].descriptor != desc && \
intel_tlb_table[k].descriptor != 0; k++)
;
if (intel_tlb_table[k].tlb_type == 0)
return;
switch (intel_tlb_table[k].tlb_type) {
case STLB_4K:
if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
break;
case STLB_4K_2M:
if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_INST_ALL:
if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_INST_4K:
if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_INST_4M:
if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_INST_2M_4M:
if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_DATA_4K:
case TLB_DATA0_4K:
if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_DATA_4M:
case TLB_DATA0_4M:
if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_DATA_2M_4M:
case TLB_DATA0_2M_4M:
if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_DATA_4K_4M:
if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
case TLB_DATA_1G:
if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
break;
}
}
static void intel_detect_tlb(struct cpuinfo_x86 *c)
{
int i, j, n;
unsigned int regs[4];
unsigned char *desc = (unsigned char *)regs;
if (c->cpuid_level < 2)
return;
/* Number of times to iterate */
n = cpuid_eax(2) & 0xFF;
for (i = 0 ; i < n ; i++) {
cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
/* If bit 31 is set, this is an unknown format */
for (j = 0 ; j < 3 ; j++)
if (regs[j] & (1 << 31))
regs[j] = 0;
/* Byte 0 is level count, not a descriptor */
for (j = 1 ; j < 16 ; j++)
intel_tlb_lookup(desc[j]);
}
}
static const struct cpu_dev intel_cpu_dev = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
#ifdef CONFIG_X86_32
.legacy_models = {
{ .family = 4, .model_names =
{
[0] = "486 DX-25/33",
[1] = "486 DX-50",
[2] = "486 SX",
[3] = "486 DX/2",
[4] = "486 SL",
[5] = "486 SX/2",
[7] = "486 DX/2-WB",
[8] = "486 DX/4",
[9] = "486 DX/4-WB"
}
},
{ .family = 5, .model_names =
{
[0] = "Pentium 60/66 A-step",
[1] = "Pentium 60/66",
[2] = "Pentium 75 - 200",
[3] = "OverDrive PODP5V83",
[4] = "Pentium MMX",
[7] = "Mobile Pentium 75 - 200",
[8] = "Mobile Pentium MMX",
[9] = "Quark SoC X1000",
}
},
{ .family = 6, .model_names =
{
[0] = "Pentium Pro A-step",
[1] = "Pentium Pro",
[3] = "Pentium II (Klamath)",
[4] = "Pentium II (Deschutes)",
[5] = "Pentium II (Deschutes)",
[6] = "Mobile Pentium II",
[7] = "Pentium III (Katmai)",
[8] = "Pentium III (Coppermine)",
[10] = "Pentium III (Cascades)",
[11] = "Pentium III (Tualatin)",
}
},
{ .family = 15, .model_names =
{
[0] = "Pentium 4 (Unknown)",
[1] = "Pentium 4 (Willamette)",
[2] = "Pentium 4 (Northwood)",
[4] = "Pentium 4 (Foster)",
[5] = "Pentium 4 (Foster)",
}
},
},
.legacy_cache_size = intel_size_cache,
#endif
.c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
.c_init = init_intel,
.c_x86_vendor = X86_VENDOR_INTEL,
};
cpu_dev_register(intel_cpu_dev);

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,49 @@
#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/slab.h>
/**
* x86_match_cpu - match current CPU again an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
*
* Return the entry if the current CPU matches the entries in the
* passed x86_cpu_id match table. Otherwise NULL. The match table
* contains vendor (X86_VENDOR_*), family, model and feature bits or
* respective wildcard entries.
*
* A typical table entry would be to match a specific CPU
* { X86_VENDOR_INTEL, 6, 0x12 }
* or to match a specific CPU feature
* { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
* %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
*
* This always matches against the boot cpu, assuming models and features are
* consistent over all CPUs.
*/
const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
{
const struct x86_cpu_id *m;
struct cpuinfo_x86 *c = &boot_cpu_data;
for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
continue;
if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
continue;
if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
continue;
if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
continue;
return m;
}
return NULL;
}
EXPORT_SYMBOL(x86_match_cpu);

View file

@ -0,0 +1,11 @@
obj-y = mce.o mce-severity.o
obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
obj-$(CONFIG_ACPI_APEI) += mce-apei.o

View file

@ -0,0 +1,155 @@
/*
* Bridge between MCE and APEI
*
* On some machine, corrected memory errors are reported via APEI
* generic hardware error source (GHES) instead of corrected Machine
* Check. These corrected memory errors can be reported to user space
* through /dev/mcelog via faking a corrected Machine Check, so that
* the error memory page can be offlined by /sbin/mcelog if the error
* count for one page is beyond the threshold.
*
* For fatal MCE, save MCE record into persistent storage via ERST, so
* that the MCE record can be logged after reboot via ERST.
*
* Copyright 2010 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/cper.h>
#include <acpi/apei.h>
#include <acpi/ghes.h>
#include <asm/mce.h>
#include "mce-internal.h"
void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
{
struct mce m;
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return;
mce_setup(&m);
m.bank = 1;
/* Fake a memory read error with unknown channel */
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
if (severity >= GHES_SEV_RECOVERABLE)
m.status |= MCI_STATUS_UC;
if (severity >= GHES_SEV_PANIC)
m.status |= MCI_STATUS_PCC;
m.addr = mem_err->physical_addr;
mce_log(&m);
mce_notify_irq();
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
#define CPER_CREATOR_MCE \
UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
0x64, 0x90, 0xb8, 0x9d)
#define CPER_SECTION_TYPE_MCE \
UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
0x04, 0x4a, 0x38, 0xfc)
/*
* CPER specification (in UEFI specification 2.3 appendix N) requires
* byte-packed.
*/
struct cper_mce_record {
struct cper_record_header hdr;
struct cper_section_descriptor sec_hdr;
struct mce mce;
} __packed;
int apei_write_mce(struct mce *m)
{
struct cper_mce_record rcd;
memset(&rcd, 0, sizeof(rcd));
memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
rcd.hdr.revision = CPER_RECORD_REV;
rcd.hdr.signature_end = CPER_SIG_END;
rcd.hdr.section_count = 1;
rcd.hdr.error_severity = CPER_SEV_FATAL;
/* timestamp, platform_id, partition_id are all invalid */
rcd.hdr.validation_bits = 0;
rcd.hdr.record_length = sizeof(rcd);
rcd.hdr.creator_id = CPER_CREATOR_MCE;
rcd.hdr.notification_type = CPER_NOTIFY_MCE;
rcd.hdr.record_id = cper_next_record_id();
rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
rcd.sec_hdr.section_length = sizeof(rcd.mce);
rcd.sec_hdr.revision = CPER_SEC_REV;
/* fru_id and fru_text is invalid */
rcd.sec_hdr.validation_bits = 0;
rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
memcpy(&rcd.mce, m, sizeof(*m));
return erst_write(&rcd.hdr);
}
ssize_t apei_read_mce(struct mce *m, u64 *record_id)
{
struct cper_mce_record rcd;
int rc, pos;
rc = erst_get_record_id_begin(&pos);
if (rc)
return rc;
retry:
rc = erst_get_record_id_next(&pos, record_id);
if (rc)
goto out;
/* no more record */
if (*record_id == APEI_ERST_INVALID_RECORD_ID)
goto out;
rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
/* someone else has cleared the record, try next one */
if (rc == -ENOENT)
goto retry;
else if (rc < 0)
goto out;
/* try to skip other type records in storage */
else if (rc != sizeof(rcd) ||
uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
goto retry;
memcpy(m, &rcd.mce, sizeof(*m));
rc = sizeof(*m);
out:
erst_get_record_id_end();
return rc;
}
/* Check whether there is record in ERST */
int apei_check_mce(void)
{
return erst_get_record_count();
}
int apei_clear_mce(u64 record_id)
{
return erst_clear(record_id);
}

View file

@ -0,0 +1,256 @@
/*
* Machine check injection support.
* Copyright 2008 Intel Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*
* Authors:
* Andi Kleen
* Ying Huang
*/
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/notifier.h>
#include <linux/kdebug.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <asm/mce.h>
#include <asm/apic.h>
#include <asm/nmi.h>
/* Update fake mce registers on current CPU. */
static void inject_mce(struct mce *m)
{
struct mce *i = &per_cpu(injectm, m->extcpu);
/* Make sure no one reads partially written injectm */
i->finished = 0;
mb();
m->finished = 0;
/* First set the fields after finished */
i->extcpu = m->extcpu;
mb();
/* Now write record in order, finished last (except above) */
memcpy(i, m, sizeof(struct mce));
/* Finally activate it */
mb();
i->finished = 1;
}
static void raise_poll(struct mce *m)
{
unsigned long flags;
mce_banks_t b;
memset(&b, 0xff, sizeof(mce_banks_t));
local_irq_save(flags);
machine_check_poll(0, &b);
local_irq_restore(flags);
m->finished = 0;
}
static void raise_exception(struct mce *m, struct pt_regs *pregs)
{
struct pt_regs regs;
unsigned long flags;
if (!pregs) {
memset(&regs, 0, sizeof(struct pt_regs));
regs.ip = m->ip;
regs.cs = m->cs;
pregs = &regs;
}
/* in mcheck exeception handler, irq will be disabled */
local_irq_save(flags);
do_machine_check(pregs, 0);
local_irq_restore(flags);
m->finished = 0;
}
static cpumask_var_t mce_inject_cpumask;
static DEFINE_MUTEX(mce_inject_mutex);
static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
{
int cpu = smp_processor_id();
struct mce *m = this_cpu_ptr(&injectm);
if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
return NMI_DONE;
cpumask_clear_cpu(cpu, mce_inject_cpumask);
if (m->inject_flags & MCJ_EXCEPTION)
raise_exception(m, regs);
else if (m->status)
raise_poll(m);
return NMI_HANDLED;
}
static void mce_irq_ipi(void *info)
{
int cpu = smp_processor_id();
struct mce *m = this_cpu_ptr(&injectm);
if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
m->inject_flags & MCJ_EXCEPTION) {
cpumask_clear_cpu(cpu, mce_inject_cpumask);
raise_exception(m, NULL);
}
}
/* Inject mce on current CPU */
static int raise_local(void)
{
struct mce *m = this_cpu_ptr(&injectm);
int context = MCJ_CTX(m->inject_flags);
int ret = 0;
int cpu = m->extcpu;
if (m->inject_flags & MCJ_EXCEPTION) {
printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
switch (context) {
case MCJ_CTX_IRQ:
/*
* Could do more to fake interrupts like
* calling irq_enter, but the necessary
* machinery isn't exported currently.
*/
/*FALL THROUGH*/
case MCJ_CTX_PROCESS:
raise_exception(m, NULL);
break;
default:
printk(KERN_INFO "Invalid MCE context\n");
ret = -EINVAL;
}
printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
} else if (m->status) {
printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
raise_poll(m);
mce_notify_irq();
printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
} else
m->finished = 0;
return ret;
}
static void raise_mce(struct mce *m)
{
int context = MCJ_CTX(m->inject_flags);
inject_mce(m);
if (context == MCJ_CTX_RANDOM)
return;
#ifdef CONFIG_X86_LOCAL_APIC
if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
unsigned long start;
int cpu;
get_online_cpus();
cpumask_copy(mce_inject_cpumask, cpu_online_mask);
cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
for_each_online_cpu(cpu) {
struct mce *mcpu = &per_cpu(injectm, cpu);
if (!mcpu->finished ||
MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
cpumask_clear_cpu(cpu, mce_inject_cpumask);
}
if (!cpumask_empty(mce_inject_cpumask)) {
if (m->inject_flags & MCJ_IRQ_BROADCAST) {
/*
* don't wait because mce_irq_ipi is necessary
* to be sync with following raise_local
*/
preempt_disable();
smp_call_function_many(mce_inject_cpumask,
mce_irq_ipi, NULL, 0);
preempt_enable();
} else if (m->inject_flags & MCJ_NMI_BROADCAST)
apic->send_IPI_mask(mce_inject_cpumask,
NMI_VECTOR);
}
start = jiffies;
while (!cpumask_empty(mce_inject_cpumask)) {
if (!time_before(jiffies, start + 2*HZ)) {
printk(KERN_ERR
"Timeout waiting for mce inject %lx\n",
*cpumask_bits(mce_inject_cpumask));
break;
}
cpu_relax();
}
raise_local();
put_cpu();
put_online_cpus();
} else
#endif
{
preempt_disable();
raise_local();
preempt_enable();
}
}
/* Error injection interface */
static ssize_t mce_write(struct file *filp, const char __user *ubuf,
size_t usize, loff_t *off)
{
struct mce m;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/*
* There are some cases where real MSR reads could slip
* through.
*/
if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
return -EIO;
if ((unsigned long)usize > sizeof(struct mce))
usize = sizeof(struct mce);
if (copy_from_user(&m, ubuf, usize))
return -EFAULT;
if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
return -EINVAL;
/*
* Need to give user space some time to set everything up,
* so do it a jiffie or two later everywhere.
*/
schedule_timeout(2);
mutex_lock(&mce_inject_mutex);
raise_mce(&m);
mutex_unlock(&mce_inject_mutex);
return usize;
}
static int inject_init(void)
{
if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
return -ENOMEM;
printk(KERN_INFO "Machine check injector initialized\n");
register_mce_write_callback(mce_write);
register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
"mce_notify");
return 0;
}
module_init(inject_init);
/*
* Cannot tolerate unloading currently because we cannot
* guarantee all openers of mce_chrdev will get a reference to us.
*/
MODULE_LICENSE("GPL");

View file

@ -0,0 +1,66 @@
#include <linux/device.h>
#include <asm/mce.h>
enum severity_level {
MCE_NO_SEVERITY,
MCE_KEEP_SEVERITY,
MCE_SOME_SEVERITY,
MCE_AO_SEVERITY,
MCE_UC_SEVERITY,
MCE_AR_SEVERITY,
MCE_PANIC_SEVERITY,
};
#define ATTR_LEN 16
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank {
u64 ctl; /* subevents to enable */
unsigned char init; /* initialise bank? */
struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
};
int mce_severity(struct mce *a, int tolerant, char **msg);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
unsigned long mce_intel_adjust_timer(unsigned long interval);
void mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
#else
# define mce_intel_adjust_timer mce_adjust_timer_default
static inline void mce_intel_cmci_poll(void) { }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
#endif
void mce_timer_kick(unsigned long interval);
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
ssize_t apei_read_mce(struct mce *m, u64 *record_id);
int apei_check_mce(void);
int apei_clear_mce(u64 record_id);
#else
static inline int apei_write_mce(struct mce *m)
{
return -EINVAL;
}
static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
{
return 0;
}
static inline int apei_check_mce(void)
{
return 0;
}
static inline int apei_clear_mce(u64 record_id)
{
return -EINVAL;
}
#endif

View file

@ -0,0 +1,283 @@
/*
* MCE grading rules.
* Copyright 2008, 2009 Intel Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*
* Author: Andi Kleen
*/
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/debugfs.h>
#include <asm/mce.h>
#include "mce-internal.h"
/*
* Grade an mce by severity. In general the most severe ones are processed
* first. Since there are quite a lot of combinations test the bits in a
* table-driven way. The rules are simply processed in order, first
* match wins.
*
* Note this is only used for machine check exceptions, the corrected
* errors use much simpler rules. The exceptions still check for the corrected
* errors, but only to leave them alone for the CMCI handler (except for
* panic situations)
*/
enum context { IN_KERNEL = 1, IN_USER = 2 };
enum ser { SER_REQUIRED = 1, NO_SER = 2 };
static struct severity {
u64 mask;
u64 result;
unsigned char sev;
unsigned char mcgmask;
unsigned char mcgres;
unsigned char ser;
unsigned char context;
unsigned char covered;
char *msg;
} severities[] = {
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define KERNEL .context = IN_KERNEL
#define USER .context = IN_USER
#define SER .ser = SER_REQUIRED
#define NOSER .ser = NO_SER
#define BITCLR(x) .mask = x, .result = 0
#define BITSET(x) .mask = x, .result = x
#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
#define MASK(x, y) .mask = x, .result = y
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
MCESEV(
NO, "Invalid",
BITCLR(MCI_STATUS_VAL)
),
MCESEV(
NO, "Not enabled",
BITCLR(MCI_STATUS_EN)
),
MCESEV(
PANIC, "Processor context corrupt",
BITSET(MCI_STATUS_PCC)
),
/* When MCIP is not set something is very confused */
MCESEV(
PANIC, "MCIP not set in MCA handler",
MCGMASK(MCG_STATUS_MCIP, 0)
),
/* Neither return not error IP -- no chance to recover -> PANIC */
MCESEV(
PANIC, "Neither restart nor error IP",
MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
),
MCESEV(
PANIC, "In kernel and no restart IP",
KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
),
MCESEV(
KEEP, "Corrected error",
NOSER, BITCLR(MCI_STATUS_UC)
),
/* ignore OVER for UCNA */
MCESEV(
KEEP, "Uncorrected no action required",
SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
),
MCESEV(
PANIC, "Illegal combination (UCNA with AR=1)",
SER,
MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
),
MCESEV(
KEEP, "Non signalled machine check",
SER, BITCLR(MCI_STATUS_S)
),
MCESEV(
PANIC, "Action required with lost events",
SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
),
/* known AR MCACODs: */
#ifdef CONFIG_MEMORY_FAILURE
MCESEV(
KEEP, "Action required but unaffected thread is continuable",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
),
MCESEV(
AR, "Action required: data load error in a user process",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
USER
),
MCESEV(
AR, "Action required: instruction fetch error in a user process",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
USER
),
#endif
MCESEV(
PANIC, "Action required: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
),
/* known AO MCACODs: */
MCESEV(
AO, "Action optional: memory scrubbing error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
),
MCESEV(
AO, "Action optional: last level cache writeback error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
),
MCESEV(
SOME, "Action optional: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
),
MCESEV(
SOME, "Action optional with lost events",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
),
MCESEV(
PANIC, "Overflowed uncorrected",
BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
),
MCESEV(
UC, "Uncorrected",
BITSET(MCI_STATUS_UC)
),
MCESEV(
SOME, "No match",
BITSET(0)
) /* always matches. keep at end */
};
/*
* If mcgstatus indicated that ip/cs on the stack were
* no good, then "m->cs" will be zero and we will have
* to assume the worst case (IN_KERNEL) as we actually
* have no idea what we were executing when the machine
* check hit.
* If we do have a good "m->cs" (or a faked one in the
* case we were executing in VM86 mode) we can use it to
* distinguish an exception taken in user from from one
* taken in the kernel.
*/
static int error_context(struct mce *m)
{
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
int mce_severity(struct mce *m, int tolerant, char **msg)
{
enum context ctx = error_context(m);
struct severity *s;
for (s = severities;; s++) {
if ((m->status & s->mask) != s->result)
continue;
if ((m->mcgstatus & s->mcgmask) != s->mcgres)
continue;
if (s->ser == SER_REQUIRED && !mca_cfg.ser)
continue;
if (s->ser == NO_SER && mca_cfg.ser)
continue;
if (s->context && ctx != s->context)
continue;
if (msg)
*msg = s->msg;
s->covered = 1;
if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
if (panic_on_oops || tolerant < 1)
return MCE_PANIC_SEVERITY;
}
return s->sev;
}
}
#ifdef CONFIG_DEBUG_FS
static void *s_start(struct seq_file *f, loff_t *pos)
{
if (*pos >= ARRAY_SIZE(severities))
return NULL;
return &severities[*pos];
}
static void *s_next(struct seq_file *f, void *data, loff_t *pos)
{
if (++(*pos) >= ARRAY_SIZE(severities))
return NULL;
return &severities[*pos];
}
static void s_stop(struct seq_file *f, void *data)
{
}
static int s_show(struct seq_file *f, void *data)
{
struct severity *ser = data;
seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
return 0;
}
static const struct seq_operations severities_seq_ops = {
.start = s_start,
.next = s_next,
.stop = s_stop,
.show = s_show,
};
static int severities_coverage_open(struct inode *inode, struct file *file)
{
return seq_open(file, &severities_seq_ops);
}
static ssize_t severities_coverage_write(struct file *file,
const char __user *ubuf,
size_t count, loff_t *ppos)
{
int i;
for (i = 0; i < ARRAY_SIZE(severities); i++)
severities[i].covered = 0;
return count;
}
static const struct file_operations severities_coverage_fops = {
.open = severities_coverage_open,
.release = seq_release,
.read = seq_read,
.write = severities_coverage_write,
.llseek = seq_lseek,
};
static int __init severities_debugfs_init(void)
{
struct dentry *dmce, *fsev;
dmce = mce_get_debugfs_dir();
if (!dmce)
goto err_out;
fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
&severities_coverage_fops);
if (!fsev)
goto err_out;
return 0;
err_out:
return -ENOMEM;
}
late_initcall(severities_debugfs_init);
#endif /* CONFIG_DEBUG_FS */

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,789 @@
/*
* (c) 2005-2012 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
*
* Written by Jacob Shin - AMD, Inc.
*
* Maintained by: Borislav Petkov <bp@alien8.de>
*
* April 2006
* - added support for AMD Family 0x10 processors
* May 2012
* - major scrubbing
*
* All MC4_MISCi registers are shared between multi-cores
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/kobject.h>
#include <linux/percpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <asm/amd_nb.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#define NR_BLOCKS 9
#define THRESHOLD_MAX 0xFFF
#define INT_TYPE_APIC 0x00020000
#define MASK_VALID_HI 0x80000000
#define MASK_CNTP_HI 0x40000000
#define MASK_LOCKED_HI 0x20000000
#define MASK_LVTOFF_HI 0x00F00000
#define MASK_COUNT_EN_HI 0x00080000
#define MASK_INT_TYPE_HI 0x00060000
#define MASK_OVERFLOW_HI 0x00010000
#define MASK_ERR_COUNT_HI 0x00000FFF
#define MASK_BLKPTR_LO 0xFF000000
#define MCG_XBLK_ADDR 0xC0000400
static const char * const th_names[] = {
"load_store",
"insn_fetch",
"combined_unit",
"",
"northbridge",
"execution_unit",
};
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
static void amd_threshold_interrupt(void);
/*
* CPU Initialization
*/
struct thresh_restart {
struct threshold_block *b;
int reset;
int set_lvt_off;
int lvt_off;
u16 old_limit;
};
static inline bool is_shared_bank(int bank)
{
/* Bank 4 is for northbridge reporting and is thus shared */
return (bank == 4);
}
static const char * const bank4_names(struct threshold_block *b)
{
switch (b->address) {
/* MSR4_MISC0 */
case 0x00000413:
return "dram";
case 0xc0000408:
return "ht_links";
case 0xc0000409:
return "l3_cache";
default:
WARN(1, "Funny MSR: 0x%08x\n", b->address);
return "";
}
};
static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
{
/*
* bank 4 supports APIC LVT interrupts implicitly since forever.
*/
if (bank == 4)
return true;
/*
* IntP: interrupt present; if this bit is set, the thresholding
* bank can generate APIC LVT interrupts
*/
return msr_high_bits & BIT(28);
}
static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
if (apic < 0) {
pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
b->bank, b->block, b->address, hi, lo);
return 0;
}
if (apic != msr) {
pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n",
b->cpu, apic, b->bank, b->block, b->address, hi, lo);
return 0;
}
return 1;
};
/*
* Called via smp_call_function_single(), must be called with correct
* cpu affinity.
*/
static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;
u32 hi, lo;
rdmsr(tr->b->address, lo, hi);
if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
tr->reset = 1; /* limit cannot be lower than err count */
if (tr->reset) { /* reset err count and overflow bit */
hi =
(hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
(THRESHOLD_MAX - tr->b->threshold_limit);
} else if (tr->old_limit) { /* change limit w/o reset */
int new_count = (hi & THRESHOLD_MAX) +
(tr->old_limit - tr->b->threshold_limit);
hi = (hi & ~MASK_ERR_COUNT_HI) |
(new_count & THRESHOLD_MAX);
}
/* clear IntType */
hi &= ~MASK_INT_TYPE_HI;
if (!tr->b->interrupt_capable)
goto done;
if (tr->set_lvt_off) {
if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
/* set new lvt offset */
hi &= ~MASK_LVTOFF_HI;
hi |= tr->lvt_off << 20;
}
}
if (tr->b->interrupt_enable)
hi |= INT_TYPE_APIC;
done:
hi |= MASK_COUNT_EN_HI;
wrmsr(tr->b->address, lo, hi);
}
static void mce_threshold_block_init(struct threshold_block *b, int offset)
{
struct thresh_restart tr = {
.b = b,
.set_lvt_off = 1,
.lvt_off = offset,
};
b->threshold_limit = THRESHOLD_MAX;
threshold_restart_bank(&tr);
};
static int setup_APIC_mce(int reserved, int new)
{
if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
APIC_EILVT_MSG_FIX, 0))
return new;
return reserved;
}
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
struct threshold_block b;
unsigned int cpu = smp_processor_id();
u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
int offset = -1;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
address = MSR_IA32_MC0_MISC + bank * 4;
else if (block == 1) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
break;
address += MCG_XBLK_ADDR;
} else
++address;
if (rdmsr_safe(address, &low, &high))
break;
if (!(high & MASK_VALID_HI))
continue;
if (!(high & MASK_CNTP_HI) ||
(high & MASK_LOCKED_HI))
continue;
if (!block)
per_cpu(bank_map, cpu) |= (1 << bank);
memset(&b, 0, sizeof(b));
b.cpu = cpu;
b.bank = bank;
b.block = block;
b.address = address;
b.interrupt_capable = lvt_interrupt_supported(bank, high);
if (b.interrupt_capable) {
int new = (high & MASK_LVTOFF_HI) >> 20;
offset = setup_APIC_mce(offset, new);
}
mce_threshold_block_init(&b, offset);
mce_threshold_vector = amd_threshold_interrupt;
}
}
}
/*
* APIC Interrupt Handler
*/
/*
* threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
* the interrupt goes off when error_count reaches threshold_limit.
* the handler will simply log mcelog w/ software defined bank number.
*/
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
struct mce m;
mce_setup(&m);
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0) {
address = MSR_IA32_MC0_MISC + bank * 4;
} else if (block == 1) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
break;
address += MCG_XBLK_ADDR;
} else {
++address;
}
if (rdmsr_safe(address, &low, &high))
break;
if (!(high & MASK_VALID_HI)) {
if (block)
continue;
else
break;
}
if (!(high & MASK_CNTP_HI) ||
(high & MASK_LOCKED_HI))
continue;
/*
* Log the machine check that caused the threshold
* event.
*/
machine_check_poll(MCP_TIMESTAMP,
this_cpu_ptr(&mce_poll_banks));
if (high & MASK_OVERFLOW_HI) {
rdmsrl(address, m.misc);
rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
m.status);
m.bank = K8_MCE_THRESHOLD_BASE
+ bank * NR_BLOCKS
+ block;
mce_log(&m);
return;
}
}
}
}
/*
* Sysfs Interface
*/
struct threshold_attr {
struct attribute attr;
ssize_t (*show) (struct threshold_block *, char *);
ssize_t (*store) (struct threshold_block *, const char *, size_t count);
};
#define SHOW_FIELDS(name) \
static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
{ \
return sprintf(buf, "%lu\n", (unsigned long) b->name); \
}
SHOW_FIELDS(interrupt_enable)
SHOW_FIELDS(threshold_limit)
static ssize_t
store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
{
struct thresh_restart tr;
unsigned long new;
if (!b->interrupt_capable)
return -EINVAL;
if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
b->interrupt_enable = !!new;
memset(&tr, 0, sizeof(tr));
tr.b = b;
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
return size;
}
static ssize_t
store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
{
struct thresh_restart tr;
unsigned long new;
if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
if (new > THRESHOLD_MAX)
new = THRESHOLD_MAX;
if (new < 1)
new = 1;
memset(&tr, 0, sizeof(tr));
tr.old_limit = b->threshold_limit;
b->threshold_limit = new;
tr.b = b;
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
return size;
}
static ssize_t show_error_count(struct threshold_block *b, char *buf)
{
u32 lo, hi;
rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
(THRESHOLD_MAX - b->threshold_limit)));
}
static struct threshold_attr error_count = {
.attr = {.name = __stringify(error_count), .mode = 0444 },
.show = show_error_count,
};
#define RW_ATTR(val) \
static struct threshold_attr val = { \
.attr = {.name = __stringify(val), .mode = 0644 }, \
.show = show_## val, \
.store = store_## val, \
};
RW_ATTR(interrupt_enable);
RW_ATTR(threshold_limit);
static struct attribute *default_attrs[] = {
&threshold_limit.attr,
&error_count.attr,
NULL, /* possibly interrupt_enable if supported, see below */
NULL,
};
#define to_block(k) container_of(k, struct threshold_block, kobj)
#define to_attr(a) container_of(a, struct threshold_attr, attr)
static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
{
struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
ret = a->show ? a->show(b, buf) : -EIO;
return ret;
}
static ssize_t store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
ret = a->store ? a->store(b, buf, count) : -EIO;
return ret;
}
static const struct sysfs_ops threshold_ops = {
.show = show,
.store = store,
};
static struct kobj_type threshold_ktype = {
.sysfs_ops = &threshold_ops,
.default_attrs = default_attrs,
};
static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
unsigned int block, u32 address)
{
struct threshold_block *b = NULL;
u32 low, high;
int err;
if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
return 0;
if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
return 0;
if (!(high & MASK_VALID_HI)) {
if (block)
goto recurse;
else
return 0;
}
if (!(high & MASK_CNTP_HI) ||
(high & MASK_LOCKED_HI))
goto recurse;
b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
if (!b)
return -ENOMEM;
b->block = block;
b->bank = bank;
b->cpu = cpu;
b->address = address;
b->interrupt_enable = 0;
b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
if (b->interrupt_capable)
threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
else
threshold_ktype.default_attrs[2] = NULL;
INIT_LIST_HEAD(&b->miscj);
if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
list_add(&b->miscj,
&per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
} else {
per_cpu(threshold_banks, cpu)[bank]->blocks = b;
}
err = kobject_init_and_add(&b->kobj, &threshold_ktype,
per_cpu(threshold_banks, cpu)[bank]->kobj,
(bank == 4 ? bank4_names(b) : th_names[bank]));
if (err)
goto out_free;
recurse:
if (!block) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
return 0;
address += MCG_XBLK_ADDR;
} else {
++address;
}
err = allocate_threshold_blocks(cpu, bank, ++block, address);
if (err)
goto out_free;
if (b)
kobject_uevent(&b->kobj, KOBJ_ADD);
return err;
out_free:
if (b) {
kobject_put(&b->kobj);
list_del(&b->miscj);
kfree(b);
}
return err;
}
static int __threshold_add_blocks(struct threshold_bank *b)
{
struct list_head *head = &b->blocks->miscj;
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
int err = 0;
err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
if (err)
return err;
list_for_each_entry_safe(pos, tmp, head, miscj) {
err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
if (err) {
list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
kobject_del(&pos->kobj);
return err;
}
}
return err;
}
static int threshold_create_bank(unsigned int cpu, unsigned int bank)
{
struct device *dev = per_cpu(mce_device, cpu);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
const char *name = th_names[bank];
int err = 0;
if (is_shared_bank(bank)) {
nb = node_to_amd_nb(amd_get_nb_id(cpu));
/* threshold descriptor already initialized on this node? */
if (nb && nb->bank4) {
/* yes, use it */
b = nb->bank4;
err = kobject_add(b->kobj, &dev->kobj, name);
if (err)
goto out;
per_cpu(threshold_banks, cpu)[bank] = b;
atomic_inc(&b->cpus);
err = __threshold_add_blocks(b);
goto out;
}
}
b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
if (!b) {
err = -ENOMEM;
goto out;
}
b->kobj = kobject_create_and_add(name, &dev->kobj);
if (!b->kobj) {
err = -EINVAL;
goto out_free;
}
per_cpu(threshold_banks, cpu)[bank] = b;
if (is_shared_bank(bank)) {
atomic_set(&b->cpus, 1);
/* nb is already initialized, see above */
if (nb) {
WARN_ON(nb->bank4);
nb->bank4 = b;
}
}
err = allocate_threshold_blocks(cpu, bank, 0,
MSR_IA32_MC0_MISC + bank * 4);
if (!err)
goto out;
out_free:
kfree(b);
out:
return err;
}
/* create dir/files for all valid threshold banks */
static int threshold_create_device(unsigned int cpu)
{
unsigned int bank;
struct threshold_bank **bp;
int err = 0;
bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
GFP_KERNEL);
if (!bp)
return -ENOMEM;
per_cpu(threshold_banks, cpu) = bp;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
err = threshold_create_bank(cpu, bank);
if (err)
return err;
}
return err;
}
static void deallocate_threshold_block(unsigned int cpu,
unsigned int bank)
{
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
if (!head)
return;
list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
kobject_put(&pos->kobj);
list_del(&pos->miscj);
kfree(pos);
}
kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
}
static void __threshold_remove_blocks(struct threshold_bank *b)
{
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
kobject_del(b->kobj);
list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
kobject_del(&pos->kobj);
}
static void threshold_remove_bank(unsigned int cpu, int bank)
{
struct amd_northbridge *nb;
struct threshold_bank *b;
b = per_cpu(threshold_banks, cpu)[bank];
if (!b)
return;
if (!b->blocks)
goto free_out;
if (is_shared_bank(bank)) {
if (!atomic_dec_and_test(&b->cpus)) {
__threshold_remove_blocks(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
} else {
/*
* the last CPU on this node using the shared bank is
* going away, remove that bank now.
*/
nb = node_to_amd_nb(amd_get_nb_id(cpu));
nb->bank4 = NULL;
}
}
deallocate_threshold_block(cpu, bank);
free_out:
kobject_del(b->kobj);
kobject_put(b->kobj);
kfree(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
}
static void threshold_remove_device(unsigned int cpu)
{
unsigned int bank;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
threshold_remove_bank(cpu, bank);
}
kfree(per_cpu(threshold_banks, cpu));
}
/* get notified when a cpu comes on/off */
static void
amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
{
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
threshold_create_device(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
threshold_remove_device(cpu);
break;
default:
break;
}
}
static __init int threshold_init_device(void)
{
unsigned lcpu = 0;
/* to hit CPUs online before the notifier is up */
for_each_online_cpu(lcpu) {
int err = threshold_create_device(lcpu);
if (err)
return err;
}
threshold_cpu_callback = amd_64_threshold_cpu_callback;
return 0;
}
/*
* there are 3 funcs which need to be _initcalled in a logic sequence:
* 1. xen_late_init_mcelog
* 2. mcheck_init_device
* 3. threshold_init_device
*
* xen_late_init_mcelog must register xen_mce_chrdev_device before
* native mce_chrdev_device registration if running under xen platform;
*
* mcheck_init_device should be inited before threshold_init_device to
* initialize mce_device, otherwise a NULL ptr dereference will cause panic.
*
* so we use following _initcalls
* 1. device_initcall(xen_late_init_mcelog);
* 2. device_initcall_sync(mcheck_init_device);
* 3. late_initcall(threshold_init_device);
*
* when running under xen, the initcall order is 1,2,3;
* on baremetal, we skip 1 and we do only 2 and 3.
*/
late_initcall(threshold_init_device);

View file

@ -0,0 +1,391 @@
/*
* Intel specific MCE features.
* Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
* Copyright (C) 2008, 2009 Intel Corporation
* Author: Andi Kleen
*/
#include <linux/gfp.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include "mce-internal.h"
/*
* Support for Intel Correct Machine Check Interrupts. This allows
* the CPU to raise an interrupt when a corrected machine check happened.
* Normally we pick those up using a regular polling timer.
* Also supports reliable discovery of shared banks.
*/
/*
* CMCI can be delivered to multiple cpus that share a machine check bank
* so we need to designate a single cpu to process errors logged in each bank
* in the interrupt handler (otherwise we would have many races and potential
* double reporting of the same error).
* Note that this can change when a cpu is offlined or brought online since
* some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
* disables CMCI on all banks owned by the cpu and clears this bitfield. At
* this point, cmci_rediscover() kicks in and a different cpu may end up
* taking ownership of some of the shared MCA banks that were previously
* owned by the offlined cpu.
*/
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
#define CMCI_STORM_INTERVAL (1 * HZ)
#define CMCI_STORM_THRESHOLD 15
static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
enum {
CMCI_STORM_NONE,
CMCI_STORM_ACTIVE,
CMCI_STORM_SUBSIDED,
};
static atomic_t cmci_storm_on_cpus;
static int cmci_supported(int *banks)
{
u64 cap;
if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
return 0;
/*
* Vendor check is not strictly needed, but the initial
* initialization is vendor keyed and this
* makes sure none of the backdoors are entered otherwise.
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
if (!cpu_has_apic || lapic_get_maxlvt() < 6)
return 0;
rdmsrl(MSR_IA32_MCG_CAP, cap);
*banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
return !!(cap & MCG_CMCI_P);
}
void mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
return;
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
}
void mce_intel_hcpu_update(unsigned long cpu)
{
if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
atomic_dec(&cmci_storm_on_cpus);
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}
unsigned long mce_intel_adjust_timer(unsigned long interval)
{
int r;
if (interval < CMCI_POLL_INTERVAL)
return interval;
switch (__this_cpu_read(cmci_storm_state)) {
case CMCI_STORM_ACTIVE:
/*
* We switch back to interrupt mode once the poll timer has
* silenced itself. That means no events recorded and the
* timer interval is back to our poll interval.
*/
__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
r = atomic_sub_return(1, &cmci_storm_on_cpus);
if (r == 0)
pr_notice("CMCI storm subsided: switching to interrupt mode\n");
/* FALLTHROUGH */
case CMCI_STORM_SUBSIDED:
/*
* We wait for all cpus to go back to SUBSIDED
* state. When that happens we switch back to
* interrupt mode.
*/
if (!atomic_read(&cmci_storm_on_cpus)) {
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
cmci_reenable();
cmci_recheck();
}
return CMCI_POLL_INTERVAL;
default:
/*
* We have shiny weather. Let the poll do whatever it
* thinks.
*/
return interval;
}
}
static void cmci_storm_disable_banks(void)
{
unsigned long flags, *owned;
int bank;
u64 val;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
owned = this_cpu_ptr(mce_banks_owned);
for_each_set_bit(bank, owned, MAX_NR_BANKS) {
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static bool cmci_storm_detect(void)
{
unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
unsigned long ts = __this_cpu_read(cmci_time_stamp);
unsigned long now = jiffies;
int r;
if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
return true;
if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
cnt++;
} else {
cnt = 1;
__this_cpu_write(cmci_time_stamp, now);
}
__this_cpu_write(cmci_storm_cnt, cnt);
if (cnt <= CMCI_STORM_THRESHOLD)
return false;
cmci_storm_disable_banks();
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
mce_timer_kick(CMCI_POLL_INTERVAL);
if (r == 1)
pr_notice("CMCI storm detected: switching to poll mode\n");
return true;
}
/*
* The interrupt handler. This is called on every event.
* Just call the poller directly to log any events.
* This could in theory increase the threshold under high load,
* but doesn't for now.
*/
static void intel_threshold_interrupt(void)
{
if (cmci_storm_detect())
return;
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
mce_notify_irq();
}
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
* banks.
*/
static void cmci_discover(int banks)
{
unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
unsigned long flags;
int i;
int bios_wrong_thresh = 0;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
if (test_bit(i, owned))
continue;
/* Skip banks in firmware first mode */
if (test_bit(i, mce_banks_ce_disabled))
continue;
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Already owned by someone else? */
if (val & MCI_CTL2_CMCI_EN) {
clear_bit(i, owned);
__clear_bit(i, this_cpu_ptr(mce_poll_banks));
continue;
}
if (!mca_cfg.bios_cmci_threshold) {
val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
val |= CMCI_THRESHOLD;
} else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
/*
* If bios_cmci_threshold boot option was specified
* but the threshold is zero, we'll try to initialize
* it to 1.
*/
bios_zero_thresh = 1;
val |= CMCI_THRESHOLD;
}
val |= MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & MCI_CTL2_CMCI_EN) {
set_bit(i, owned);
__clear_bit(i, this_cpu_ptr(mce_poll_banks));
/*
* We are able to set thresholds for some banks that
* had a threshold of 0. This means the BIOS has not
* set the thresholds properly or does not work with
* this boot option. Note down now and report later.
*/
if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
(val & MCI_CTL2_CMCI_THRESHOLD_MASK))
bios_wrong_thresh = 1;
} else {
WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
}
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
pr_info_once(
"bios_cmci_threshold: Some banks do not have valid thresholds set\n");
pr_info_once(
"bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
}
}
/*
* Just in case we missed an event during initialization check
* all the CMCI owned banks.
*/
void cmci_recheck(void)
{
unsigned long flags;
int banks;
if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
return;
local_irq_save(flags);
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
local_irq_restore(flags);
}
/* Caller must hold the lock on cmci_discover_lock */
static void __cmci_disable_bank(int bank)
{
u64 val;
if (!test_bit(bank, this_cpu_ptr(mce_banks_owned)))
return;
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
__clear_bit(bank, this_cpu_ptr(mce_banks_owned));
}
/*
* Disable CMCI on this CPU for all banks it owns when it goes down.
* This allows other CPUs to claim the banks on rediscovery.
*/
void cmci_clear(void)
{
unsigned long flags;
int i;
int banks;
if (!cmci_supported(&banks))
return;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++)
__cmci_disable_bank(i);
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void cmci_rediscover_work_func(void *arg)
{
int banks;
/* Recheck banks in case CPUs don't all have the same */
if (cmci_supported(&banks))
cmci_discover(banks);
}
/* After a CPU went down cycle through all the others and rediscover */
void cmci_rediscover(void)
{
int banks;
if (!cmci_supported(&banks))
return;
on_each_cpu(cmci_rediscover_work_func, NULL, 1);
}
/*
* Reenable CMCI on this CPU in case a CPU down failed.
*/
void cmci_reenable(void)
{
int banks;
if (cmci_supported(&banks))
cmci_discover(banks);
}
void cmci_disable_bank(int bank)
{
int banks;
unsigned long flags;
if (!cmci_supported(&banks))
return;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
__cmci_disable_bank(bank);
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void intel_init_cmci(void)
{
int banks;
if (!cmci_supported(&banks))
return;
mce_threshold_vector = intel_threshold_interrupt;
cmci_discover(banks);
/*
* For CPU #0 this runs with still disabled APIC, but that's
* ok because only the vector is set up. We still do another
* check for the banks later for CPU #0 just to make sure
* to not miss any events.
*/
apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
cmci_recheck();
}
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
}

View file

@ -0,0 +1,66 @@
/*
* P5 specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>
/* By default disabled */
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code)
{
u32 loaddr, hi, lotype;
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
printk(KERN_EMERG
"CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
smp_processor_id(), loaddr, lotype);
if (lotype & (1<<5)) {
printk(KERN_EMERG
"CPU#%d: Possible thermal failure (CPU on fire ?).\n",
smp_processor_id());
}
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
}
/* Set up machine check reporting for processors with Intel style MCE: */
void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
/* Default P5 to off as its often misconnected: */
if (!mce_p5_enabled)
return;
/* Check for MCE support: */
if (!cpu_has(c, X86_FEATURE_MCE))
return;
machine_check_vector = pentium_machine_check;
/* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
/* Read registers before enabling: */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
printk(KERN_INFO
"Intel old style machine check architecture supported.\n");
/* Enable MCE: */
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO
"Intel old style machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
}

View file

@ -0,0 +1,573 @@
/*
* Thermal throttle event support code (such as syslog messaging and rate
* limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
*
* This allows consistent reporting of CPU thermal throttle events.
*
* Maintains a counter in /sys that keeps track of the number of thermal
* events, such that the user knows how bad the thermal problem might be
* (since the logging to syslog and mcelog is rate limited).
*
* Author: Dmitriy Zavin (dmitriyz@google.com)
*
* Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
* Inspired by Ross Biro's and Al Borchers' counter code.
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/trace/irq_vectors.h>
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
#define THERMAL_THROTTLING_EVENT 0
#define POWER_LIMIT_EVENT 1
/*
* Current thermal event state:
*/
struct _thermal_state {
bool new_event;
int event;
u64 next_check;
unsigned long count;
unsigned long last_count;
};
struct thermal_state {
struct _thermal_state core_throttle;
struct _thermal_state core_power_limit;
struct _thermal_state package_throttle;
struct _thermal_state package_power_limit;
struct _thermal_state core_thresh0;
struct _thermal_state core_thresh1;
struct _thermal_state pkg_thresh0;
struct _thermal_state pkg_thresh1;
};
/* Callback to handle core threshold interrupts */
int (*platform_thermal_notify)(__u64 msr_val);
EXPORT_SYMBOL(platform_thermal_notify);
/* Callback to handle core package threshold_interrupts */
int (*platform_thermal_package_notify)(__u64 msr_val);
EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
/* Callback support of rate control, return true, if
* callback has rate control */
bool (*platform_thermal_package_rate_control)(void);
EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
static DEFINE_PER_CPU(struct thermal_state, thermal_state);
static atomic_t therm_throt_en = ATOMIC_INIT(0);
static u32 lvtthmr_init __read_mostly;
#ifdef CONFIG_SYSFS
#define define_therm_throt_device_one_ro(_name) \
static DEVICE_ATTR(_name, 0444, \
therm_throt_device_show_##_name, \
NULL) \
#define define_therm_throt_device_show_func(event, name) \
\
static ssize_t therm_throt_device_show_##event##_##name( \
struct device *dev, \
struct device_attribute *attr, \
char *buf) \
{ \
unsigned int cpu = dev->id; \
ssize_t ret; \
\
preempt_disable(); /* CPU hotplug */ \
if (cpu_online(cpu)) { \
ret = sprintf(buf, "%lu\n", \
per_cpu(thermal_state, cpu).event.name); \
} else \
ret = 0; \
preempt_enable(); \
\
return ret; \
}
define_therm_throt_device_show_func(core_throttle, count);
define_therm_throt_device_one_ro(core_throttle_count);
define_therm_throt_device_show_func(core_power_limit, count);
define_therm_throt_device_one_ro(core_power_limit_count);
define_therm_throt_device_show_func(package_throttle, count);
define_therm_throt_device_one_ro(package_throttle_count);
define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
static struct attribute *thermal_throttle_attrs[] = {
&dev_attr_core_throttle_count.attr,
NULL
};
static struct attribute_group thermal_attr_group = {
.attrs = thermal_throttle_attrs,
.name = "thermal_throttle"
};
#endif /* CONFIG_SYSFS */
#define CORE_LEVEL 0
#define PACKAGE_LEVEL 1
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
* thermal interrupt normally gets called both when the thermal
* event begins and once the event has ended.
*
* This function is called by the thermal interrupt after the
* IRQ has been acknowledged.
*
* It will take care of rate limiting and printing messages to the syslog.
*
* Returns: 0 : Event should NOT be further logged, i.e. still in
* "timeout" from previous log message.
* 1 : Event should be logged further, and a message has been
* printed to the syslog.
*/
static int therm_throt_process(bool new_event, int event, int level)
{
struct _thermal_state *state;
unsigned int this_cpu = smp_processor_id();
bool old_event;
u64 now;
struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
now = get_jiffies_64();
if (level == CORE_LEVEL) {
if (event == THERMAL_THROTTLING_EVENT)
state = &pstate->core_throttle;
else if (event == POWER_LIMIT_EVENT)
state = &pstate->core_power_limit;
else
return 0;
} else if (level == PACKAGE_LEVEL) {
if (event == THERMAL_THROTTLING_EVENT)
state = &pstate->package_throttle;
else if (event == POWER_LIMIT_EVENT)
state = &pstate->package_power_limit;
else
return 0;
} else
return 0;
old_event = state->new_event;
state->new_event = new_event;
if (new_event)
state->count++;
if (time_before64(now, state->next_check) &&
state->count != state->last_count)
return 0;
state->next_check = now + CHECK_INTERVAL;
state->last_count = state->count;
/* if we just entered the thermal event */
if (new_event) {
if (event == THERMAL_THROTTLING_EVENT)
printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
this_cpu,
level == CORE_LEVEL ? "Core" : "Package",
state->count);
return 1;
}
if (old_event) {
if (event == THERMAL_THROTTLING_EVENT)
printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
this_cpu,
level == CORE_LEVEL ? "Core" : "Package");
return 1;
}
return 0;
}
static int thresh_event_valid(int level, int event)
{
struct _thermal_state *state;
unsigned int this_cpu = smp_processor_id();
struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
u64 now = get_jiffies_64();
if (level == PACKAGE_LEVEL)
state = (event == 0) ? &pstate->pkg_thresh0 :
&pstate->pkg_thresh1;
else
state = (event == 0) ? &pstate->core_thresh0 :
&pstate->core_thresh1;
if (time_before64(now, state->next_check))
return 0;
state->next_check = now + CHECK_INTERVAL;
return 1;
}
static bool int_pln_enable;
static int __init int_pln_enable_setup(char *s)
{
int_pln_enable = true;
return 1;
}
__setup("int_pln_enable", int_pln_enable_setup);
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */
static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
{
int err;
struct cpuinfo_x86 *c = &cpu_data(cpu);
err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
if (err)
return err;
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PTS)) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
}
return err;
}
static void thermal_throttle_remove_dev(struct device *dev)
{
sysfs_remove_group(&dev->kobj, &thermal_attr_group);
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
thermal_throttle_cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct device *dev;
int err = 0;
dev = get_cpu_device(cpu);
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
err = thermal_throttle_add_dev(dev, cpu);
WARN_ON(err);
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
thermal_throttle_remove_dev(dev);
break;
}
return notifier_from_errno(err);
}
static struct notifier_block thermal_throttle_cpu_notifier =
{
.notifier_call = thermal_throttle_cpu_callback,
};
static __init int thermal_throttle_init_device(void)
{
unsigned int cpu = 0;
int err;
if (!atomic_read(&therm_throt_en))
return 0;
cpu_notifier_register_begin();
/* connect live CPUs to sysfs */
for_each_online_cpu(cpu) {
err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
WARN_ON(err);
}
__register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
cpu_notifier_register_done();
return 0;
}
device_initcall(thermal_throttle_init_device);
#endif /* CONFIG_SYSFS */
static void notify_package_thresholds(__u64 msr_val)
{
bool notify_thres_0 = false;
bool notify_thres_1 = false;
if (!platform_thermal_package_notify)
return;
/* lower threshold check */
if (msr_val & THERM_LOG_THRESHOLD0)
notify_thres_0 = true;
/* higher threshold check */
if (msr_val & THERM_LOG_THRESHOLD1)
notify_thres_1 = true;
if (!notify_thres_0 && !notify_thres_1)
return;
if (platform_thermal_package_rate_control &&
platform_thermal_package_rate_control()) {
/* Rate control is implemented in callback */
platform_thermal_package_notify(msr_val);
return;
}
/* lower threshold reached */
if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
platform_thermal_package_notify(msr_val);
/* higher threshold reached */
if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
platform_thermal_package_notify(msr_val);
}
static void notify_thresholds(__u64 msr_val)
{
/* check whether the interrupt handler is defined;
* otherwise simply return
*/
if (!platform_thermal_notify)
return;
/* lower threshold reached */
if ((msr_val & THERM_LOG_THRESHOLD0) &&
thresh_event_valid(CORE_LEVEL, 0))
platform_thermal_notify(msr_val);
/* higher threshold reached */
if ((msr_val & THERM_LOG_THRESHOLD1) &&
thresh_event_valid(CORE_LEVEL, 1))
platform_thermal_notify(msr_val);
}
/* Thermal transition interrupt handler */
static void intel_thermal_interrupt(void)
{
__u64 msr_val;
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
/* Check for violation of core thermal thresholds*/
notify_thresholds(msr_val);
if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
CORE_LEVEL) != 0)
mce_log_therm_throt_event(msr_val);
if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
CORE_LEVEL);
if (this_cpu_has(X86_FEATURE_PTS)) {
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
/* check violations of package thermal thresholds */
notify_package_thresholds(msr_val);
therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
PACKAGE_LEVEL);
if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
therm_throt_process(msr_val &
PACKAGE_THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
PACKAGE_LEVEL);
}
}
static void unexpected_thermal_interrupt(void)
{
printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
smp_processor_id());
}
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
static inline void __smp_thermal_interrupt(void)
{
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
}
asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
{
entering_irq();
__smp_thermal_interrupt();
exiting_ack_irq();
}
asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
{
entering_irq();
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
__smp_thermal_interrupt();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
exiting_ack_irq();
}
/* Thermal monitoring depends on APIC, ACPI and clock modulation */
static int intel_thermal_supported(struct cpuinfo_x86 *c)
{
if (!cpu_has_apic)
return 0;
if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
return 0;
return 1;
}
void __init mcheck_intel_therm_init(void)
{
/*
* This function is only called on boot CPU. Save the init thermal
* LVT value on BSP and use that value to restore APs' thermal LVT
* entry BIOS programmed later
*/
if (intel_thermal_supported(&boot_cpu_data))
lvtthmr_init = apic_read(APIC_LVTTHMR);
}
void intel_init_thermal(struct cpuinfo_x86 *c)
{
unsigned int cpu = smp_processor_id();
int tm2 = 0;
u32 l, h;
if (!intel_thermal_supported(c))
return;
/*
* First check if its enabled already, in which case there might
* be some SMM goo which handles it, so we can't even put a handler
* since it might be delivered via SMI already:
*/
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
h = lvtthmr_init;
/*
* The initial value of thermal LVT entries on all APs always reads
* 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
* sequence to them and LVT registers are reset to 0s except for
* the mask bits which are set to 1s when APs receive INIT IPI.
* If BIOS takes over the thermal interrupt and sets its interrupt
* delivery mode to SMI (not fixed), it restores the value that the
* BIOS has programmed on AP based on BSP's info we saved since BIOS
* is always setting the same value for all threads/cores.
*/
if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
apic_write(APIC_LVTTHMR, lvtthmr_init);
if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
if (system_state == SYSTEM_BOOTING)
printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu);
return;
}
/* Check whether a vector already exists */
if (h & APIC_VECTOR_MASK) {
printk(KERN_DEBUG
"CPU%d: Thermal LVT vector (%#x) already installed\n",
cpu, (h & APIC_VECTOR_MASK));
return;
}
/* early Pentium M models use different method for enabling TM2 */
if (cpu_has(c, X86_FEATURE_TM2)) {
if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
rdmsr(MSR_THERM2_CTL, l, h);
if (l & MSR_THERM2_CTL_TM_SELECT)
tm2 = 1;
} else if (l & MSR_IA32_MISC_ENABLE_TM2)
tm2 = 1;
}
/* We'll mask the thermal vector in the lapic till we're ready: */
h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
apic_write(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
wrmsr(MSR_IA32_THERM_INTERRUPT,
(l | (THERM_INT_LOW_ENABLE
| THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
wrmsr(MSR_IA32_THERM_INTERRUPT,
l | (THERM_INT_LOW_ENABLE
| THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
else
wrmsr(MSR_IA32_THERM_INTERRUPT,
l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
if (cpu_has(c, X86_FEATURE_PTS)) {
rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
(l | (PACKAGE_THERM_INT_LOW_ENABLE
| PACKAGE_THERM_INT_HIGH_ENABLE))
& ~PACKAGE_THERM_INT_PLN_ENABLE, h);
else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
l | (PACKAGE_THERM_INT_LOW_ENABLE
| PACKAGE_THERM_INT_HIGH_ENABLE
| PACKAGE_THERM_INT_PLN_ENABLE), h);
else
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
l | (PACKAGE_THERM_INT_LOW_ENABLE
| PACKAGE_THERM_INT_HIGH_ENABLE), h);
}
smp_thermal_vector = intel_thermal_interrupt;
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
/* Unmask the thermal vector: */
l = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
tm2 ? "TM2" : "TM1");
/* enable thermal throttle processing */
atomic_set(&therm_throt_en, 1);
}

View file

@ -0,0 +1,41 @@
/*
* Common corrected MCE threshold handler code:
*/
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <asm/irq_vectors.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
static void default_threshold_interrupt(void)
{
printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
THRESHOLD_APIC_VECTOR);
}
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
static inline void __smp_threshold_interrupt(void)
{
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
}
asmlinkage __visible void smp_threshold_interrupt(void)
{
entering_irq();
__smp_threshold_interrupt();
exiting_ack_irq();
}
asmlinkage __visible void smp_trace_threshold_interrupt(void)
{
entering_irq();
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
__smp_threshold_interrupt();
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
exiting_ack_irq();
}

View file

@ -0,0 +1,38 @@
/*
* IDT Winchip specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>
/* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
}
/* Set up machine check reporting on the Winchip C6 series */
void winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
machine_check_vector = winchip_machine_check;
/* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
rdmsr(MSR_IDT_FCR1, lo, hi);
lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO
"Winchip machine check reporting enabled on CPU#0.\n");
}

View file

@ -0,0 +1,7 @@
microcode-y := core.o
obj-$(CONFIG_MICROCODE) += microcode.o
microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
microcode-$(CONFIG_MICROCODE_AMD) += amd.o
obj-$(CONFIG_MICROCODE_EARLY) += core_early.o
obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o
obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o

View file

@ -0,0 +1,492 @@
/*
* AMD CPU Microcode Update Driver for Linux
* Copyright (C) 2008-2011 Advanced Micro Devices Inc.
*
* Author: Peter Oruba <peter.oruba@amd.com>
*
* Based on work by:
* Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
*
* Maintainers:
* Andreas Herrmann <herrmann.der.user@googlemail.com>
* Borislav Petkov <bp@alien8.de>
*
* This driver allows to upgrade microcode on F10h AMD
* CPUs and later.
*
* Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/firmware.h>
#include <linux/pci_ids.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/microcode_amd.h>
MODULE_DESCRIPTION("AMD Microcode Update Driver");
MODULE_AUTHOR("Peter Oruba");
MODULE_LICENSE("GPL v2");
static struct equiv_cpu_entry *equiv_cpu_table;
struct ucode_patch {
struct list_head plist;
void *data;
u32 patch_id;
u16 equiv_cpu;
};
static LIST_HEAD(pcache);
static u16 __find_equiv_id(unsigned int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig);
}
static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu)
{
int i = 0;
BUG_ON(!equiv_cpu_table);
while (equiv_cpu_table[i].equiv_cpu != 0) {
if (equiv_cpu == equiv_cpu_table[i].equiv_cpu)
return equiv_cpu_table[i].installed_cpu;
i++;
}
return 0;
}
/*
* a small, trivial cache of per-family ucode patches
*/
static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
{
struct ucode_patch *p;
list_for_each_entry(p, &pcache, plist)
if (p->equiv_cpu == equiv_cpu)
return p;
return NULL;
}
static void update_cache(struct ucode_patch *new_patch)
{
struct ucode_patch *p;
list_for_each_entry(p, &pcache, plist) {
if (p->equiv_cpu == new_patch->equiv_cpu) {
if (p->patch_id >= new_patch->patch_id)
/* we already have the latest patch */
return;
list_replace(&p->plist, &new_patch->plist);
kfree(p->data);
kfree(p);
return;
}
}
/* no patch found, add it */
list_add_tail(&new_patch->plist, &pcache);
}
static void free_cache(void)
{
struct ucode_patch *p, *tmp;
list_for_each_entry_safe(p, tmp, &pcache, plist) {
__list_del(p->plist.prev, p->plist.next);
kfree(p->data);
kfree(p);
}
}
static struct ucode_patch *find_patch(unsigned int cpu)
{
u16 equiv_id;
equiv_id = __find_equiv_id(cpu);
if (!equiv_id)
return NULL;
return cache_find_patch(equiv_id);
}
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct ucode_patch *p;
csig->sig = cpuid_eax(0x00000001);
csig->rev = c->microcode;
/*
* a patch could have been loaded early, set uci->mc so that
* mc_bp_resume() can call apply_microcode()
*/
p = find_patch(cpu);
if (p && (p->patch_id == csig->rev))
uci->mc = p->data;
pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
return 0;
}
static unsigned int verify_patch_size(u8 family, u32 patch_size,
unsigned int size)
{
u32 max_size;
#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
#define F15H_MPB_MAX_SIZE 4096
#define F16H_MPB_MAX_SIZE 3458
switch (family) {
case 0x14:
max_size = F14H_MPB_MAX_SIZE;
break;
case 0x15:
max_size = F15H_MPB_MAX_SIZE;
break;
case 0x16:
max_size = F16H_MPB_MAX_SIZE;
break;
default:
max_size = F1XH_MPB_MAX_SIZE;
break;
}
if (patch_size > min_t(u32, size, max_size)) {
pr_err("patch size mismatch\n");
return 0;
}
return patch_size;
}
int __apply_microcode_amd(struct microcode_amd *mc_amd)
{
u32 rev, dummy;
native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
/* verify patch application was successful */
native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
if (rev != mc_amd->hdr.patch_id)
return -1;
return 0;
}
int apply_microcode_amd(int cpu)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct microcode_amd *mc_amd;
struct ucode_cpu_info *uci;
struct ucode_patch *p;
u32 rev, dummy;
BUG_ON(raw_smp_processor_id() != cpu);
uci = ucode_cpu_info + cpu;
p = find_patch(cpu);
if (!p)
return 0;
mc_amd = p->data;
uci->mc = p->data;
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
/* need to apply patch? */
if (rev >= mc_amd->hdr.patch_id) {
c->microcode = rev;
uci->cpu_sig.rev = rev;
return 0;
}
if (__apply_microcode_amd(mc_amd)) {
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
cpu, mc_amd->hdr.patch_id);
return -1;
}
pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
mc_amd->hdr.patch_id);
uci->cpu_sig.rev = mc_amd->hdr.patch_id;
c->microcode = mc_amd->hdr.patch_id;
return 0;
}
static int install_equiv_cpu_table(const u8 *buf)
{
unsigned int *ibuf = (unsigned int *)buf;
unsigned int type = ibuf[1];
unsigned int size = ibuf[2];
if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
pr_err("empty section/"
"invalid type field in container file section header\n");
return -EINVAL;
}
equiv_cpu_table = vmalloc(size);
if (!equiv_cpu_table) {
pr_err("failed to allocate equivalent CPU table\n");
return -ENOMEM;
}
memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
/* add header length */
return size + CONTAINER_HDR_SZ;
}
static void free_equiv_cpu_table(void)
{
vfree(equiv_cpu_table);
equiv_cpu_table = NULL;
}
static void cleanup(void)
{
free_equiv_cpu_table();
free_cache();
}
/*
* We return the current size even if some of the checks failed so that
* we can skip over the next patch. If we return a negative value, we
* signal a grave error like a memory allocation has failed and the
* driver cannot continue functioning normally. In such cases, we tear
* down everything we've used up so far and exit.
*/
static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
{
struct microcode_header_amd *mc_hdr;
struct ucode_patch *patch;
unsigned int patch_size, crnt_size, ret;
u32 proc_fam;
u16 proc_id;
patch_size = *(u32 *)(fw + 4);
crnt_size = patch_size + SECTION_HDR_SIZE;
mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
proc_id = mc_hdr->processor_rev_id;
proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
if (!proc_fam) {
pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
return crnt_size;
}
/* check if patch is for the current family */
proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
if (proc_fam != family)
return crnt_size;
if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
mc_hdr->patch_id);
return crnt_size;
}
ret = verify_patch_size(family, patch_size, leftover);
if (!ret) {
pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
return crnt_size;
}
patch = kzalloc(sizeof(*patch), GFP_KERNEL);
if (!patch) {
pr_err("Patch allocation failure.\n");
return -EINVAL;
}
patch->data = kzalloc(patch_size, GFP_KERNEL);
if (!patch->data) {
pr_err("Patch data allocation failure.\n");
kfree(patch);
return -EINVAL;
}
/* All looks ok, copy patch... */
memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
INIT_LIST_HEAD(&patch->plist);
patch->patch_id = mc_hdr->patch_id;
patch->equiv_cpu = proc_id;
pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
__func__, patch->patch_id, proc_id);
/* ... and add to cache. */
update_cache(patch);
return crnt_size;
}
static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
size_t size)
{
enum ucode_state ret = UCODE_ERROR;
unsigned int leftover;
u8 *fw = (u8 *)data;
int crnt_size = 0;
int offset;
offset = install_equiv_cpu_table(data);
if (offset < 0) {
pr_err("failed to create equivalent cpu table\n");
return ret;
}
fw += offset;
leftover = size - offset;
if (*(u32 *)fw != UCODE_UCODE_TYPE) {
pr_err("invalid type field in container file section header\n");
free_equiv_cpu_table();
return ret;
}
while (leftover) {
crnt_size = verify_and_add_patch(family, fw, leftover);
if (crnt_size < 0)
return ret;
fw += crnt_size;
leftover -= crnt_size;
}
return UCODE_OK;
}
enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
/* free old equiv table */
free_equiv_cpu_table();
ret = __load_microcode_amd(family, data, size);
if (ret != UCODE_OK)
cleanup();
#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32)
/* save BSP's matching patch for early load */
if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
struct ucode_patch *p = find_patch(cpu);
if (p) {
memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
PATCH_MAX_SIZE));
}
}
#endif
return ret;
}
/*
* AMD microcode firmware naming convention, up to family 15h they are in
* the legacy file:
*
* amd-ucode/microcode_amd.bin
*
* This legacy file is always smaller than 2K in size.
*
* Beginning with family 15h, they are in family-specific firmware files:
*
* amd-ucode/microcode_amd_fam15h.bin
* amd-ucode/microcode_amd_fam16h.bin
* ...
*
* These might be larger than 2K.
*/
static enum ucode_state request_microcode_amd(int cpu, struct device *device,
bool refresh_fw)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
/* reload ucode container only on the boot cpu */
if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
return UCODE_OK;
if (c->x86 >= 0x15)
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
pr_debug("failed to load file %s\n", fw_name);
goto out;
}
ret = UCODE_ERROR;
if (*(u32 *)fw->data != UCODE_MAGIC) {
pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
goto fw_release;
}
ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
out:
return ret;
}
static enum ucode_state
request_microcode_user(int cpu, const void __user *buf, size_t size)
{
return UCODE_ERROR;
}
static void microcode_fini_cpu_amd(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
uci->mc = NULL;
}
static struct microcode_ops microcode_amd_ops = {
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_amd,
.collect_cpu_info = collect_cpu_info_amd,
.apply_microcode = apply_microcode_amd,
.microcode_fini_cpu = microcode_fini_cpu_amd,
};
struct microcode_ops * __init init_amd_microcode(void)
{
struct cpuinfo_x86 *c = &cpu_data(0);
if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
return NULL;
}
return &microcode_amd_ops;
}
void __exit exit_amd_microcode(void)
{
cleanup();
}

View file

@ -0,0 +1,422 @@
/*
* Copyright (C) 2013 Advanced Micro Devices, Inc.
*
* Author: Jacob Shin <jacob.shin@amd.com>
* Fixes: Borislav Petkov <bp@suse.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/earlycpio.h>
#include <linux/initrd.h>
#include <asm/cpu.h>
#include <asm/setup.h>
#include <asm/microcode_amd.h>
/*
* This points to the current valid container of microcode patches which we will
* save from the initrd before jettisoning its contents.
*/
static u8 *container;
static size_t container_size;
static u32 ucode_new_rev;
u8 amd_ucode_patch[PATCH_MAX_SIZE];
static u16 this_equiv_id;
static struct cpio_data ucode_cpio;
/*
* Microcode patch container file is prepended to the initrd in cpio format.
* See Documentation/x86/early-microcode.txt
*/
static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
static struct cpio_data __init find_ucode_in_initrd(void)
{
long offset = 0;
char *path;
void *start;
size_t size;
#ifdef CONFIG_X86_32
struct boot_params *p;
/*
* On 32-bit, early load occurs before paging is turned on so we need
* to use physical addresses.
*/
p = (struct boot_params *)__pa_nodebug(&boot_params);
path = (char *)__pa_nodebug(ucode_path);
start = (void *)p->hdr.ramdisk_image;
size = p->hdr.ramdisk_size;
#else
path = ucode_path;
start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
size = boot_params.hdr.ramdisk_size;
#endif
return find_cpio_data(path, start, size, &offset);
}
static size_t compute_container_size(u8 *data, u32 total_size)
{
size_t size = 0;
u32 *header = (u32 *)data;
if (header[0] != UCODE_MAGIC ||
header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
header[2] == 0) /* size */
return size;
size = header[2] + CONTAINER_HDR_SZ;
total_size -= size;
data += size;
while (total_size) {
u16 patch_size;
header = (u32 *)data;
if (header[0] != UCODE_UCODE_TYPE)
break;
/*
* Sanity-check patch size.
*/
patch_size = header[1];
if (patch_size > PATCH_MAX_SIZE)
break;
size += patch_size + SECTION_HDR_SIZE;
data += patch_size + SECTION_HDR_SIZE;
total_size -= patch_size + SECTION_HDR_SIZE;
}
return size;
}
/*
* Early load occurs before we can vmalloc(). So we look for the microcode
* patch container file in initrd, traverse equivalent cpu table, look for a
* matching microcode patch, and update, all in initrd memory in place.
* When vmalloc() is available for use later -- on 64-bit during first AP load,
* and on 32-bit during save_microcode_in_initrd_amd() -- we can call
* load_microcode_amd() to save equivalent cpu table and microcode patches in
* kernel heap memory.
*/
static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
{
struct equiv_cpu_entry *eq;
size_t *cont_sz;
u32 *header;
u8 *data, **cont;
u8 (*patch)[PATCH_MAX_SIZE];
u16 eq_id = 0;
int offset, left;
u32 rev, eax, ebx, ecx, edx;
u32 *new_rev;
#ifdef CONFIG_X86_32
new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
cont_sz = (size_t *)__pa_nodebug(&container_size);
cont = (u8 **)__pa_nodebug(&container);
patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
#else
new_rev = &ucode_new_rev;
cont_sz = &container_size;
cont = &container;
patch = &amd_ucode_patch;
#endif
data = ucode;
left = size;
header = (u32 *)data;
/* find equiv cpu table */
if (header[0] != UCODE_MAGIC ||
header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
header[2] == 0) /* size */
return;
eax = 0x00000001;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
while (left > 0) {
eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
*cont = data;
/* Advance past the container header */
offset = header[2] + CONTAINER_HDR_SZ;
data += offset;
left -= offset;
eq_id = find_equiv_id(eq, eax);
if (eq_id) {
this_equiv_id = eq_id;
*cont_sz = compute_container_size(*cont, left + offset);
/*
* truncate how much we need to iterate over in the
* ucode update loop below
*/
left = *cont_sz - offset;
break;
}
/*
* support multiple container files appended together. if this
* one does not have a matching equivalent cpu entry, we fast
* forward to the next container file.
*/
while (left > 0) {
header = (u32 *)data;
if (header[0] == UCODE_MAGIC &&
header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
break;
offset = header[1] + SECTION_HDR_SIZE;
data += offset;
left -= offset;
}
/* mark where the next microcode container file starts */
offset = data - (u8 *)ucode;
ucode = data;
}
if (!eq_id) {
*cont = NULL;
*cont_sz = 0;
return;
}
/* find ucode and update if needed */
native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
while (left > 0) {
struct microcode_amd *mc;
header = (u32 *)data;
if (header[0] != UCODE_UCODE_TYPE || /* type */
header[1] == 0) /* size */
break;
mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
if (!__apply_microcode_amd(mc)) {
rev = mc->hdr.patch_id;
*new_rev = rev;
if (save_patch)
memcpy(patch, mc,
min_t(u32, header[1], PATCH_MAX_SIZE));
}
}
offset = header[1] + SECTION_HDR_SIZE;
data += offset;
left -= offset;
}
}
void __init load_ucode_amd_bsp(void)
{
struct cpio_data cp;
void **data;
size_t *size;
#ifdef CONFIG_X86_32
data = (void **)__pa_nodebug(&ucode_cpio.data);
size = (size_t *)__pa_nodebug(&ucode_cpio.size);
#else
data = &ucode_cpio.data;
size = &ucode_cpio.size;
#endif
cp = find_ucode_in_initrd();
if (!cp.data)
return;
*data = cp.data;
*size = cp.size;
apply_ucode_in_initrd(cp.data, cp.size, true);
}
#ifdef CONFIG_X86_32
/*
* On 32-bit, since AP's early load occurs before paging is turned on, we
* cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
* cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
* save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
* which is used upon resume from suspend.
*/
void load_ucode_amd_ap(void)
{
struct microcode_amd *mc;
size_t *usize;
void **ucode;
mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
__apply_microcode_amd(mc);
return;
}
ucode = (void *)__pa_nodebug(&container);
usize = (size_t *)__pa_nodebug(&container_size);
if (!*ucode || !*usize)
return;
apply_ucode_in_initrd(*ucode, *usize, false);
}
static void __init collect_cpu_sig_on_bsp(void *arg)
{
unsigned int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
uci->cpu_sig.sig = cpuid_eax(0x00000001);
}
static void __init get_bsp_sig(void)
{
unsigned int bsp = boot_cpu_data.cpu_index;
struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
if (!uci->cpu_sig.sig)
smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
}
#else
void load_ucode_amd_ap(void)
{
unsigned int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct equiv_cpu_entry *eq;
struct microcode_amd *mc;
u32 rev, eax;
u16 eq_id;
/* Exit if called on the BSP. */
if (!cpu)
return;
if (!container)
return;
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
uci->cpu_sig.rev = rev;
uci->cpu_sig.sig = eax;
eax = cpuid_eax(0x00000001);
eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
eq_id = find_equiv_id(eq, eax);
if (!eq_id)
return;
if (eq_id == this_equiv_id) {
mc = (struct microcode_amd *)amd_ucode_patch;
if (mc && rev < mc->hdr.patch_id) {
if (!__apply_microcode_amd(mc))
ucode_new_rev = mc->hdr.patch_id;
}
} else {
if (!ucode_cpio.data)
return;
/*
* AP has a different equivalence ID than BSP, looks like
* mixed-steppings silicon so go through the ucode blob anew.
*/
apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
}
}
#endif
int __init save_microcode_in_initrd_amd(void)
{
unsigned long cont;
int retval = 0;
enum ucode_state ret;
u8 *cont_va;
u32 eax;
if (!container)
return -EINVAL;
#ifdef CONFIG_X86_32
get_bsp_sig();
cont = (unsigned long)container;
cont_va = __va(container);
#else
/*
* We need the physical address of the container for both bitness since
* boot_params.hdr.ramdisk_image is a physical address.
*/
cont = __pa(container);
cont_va = container;
#endif
/*
* Take into account the fact that the ramdisk might get relocated and
* therefore we need to recompute the container's position in virtual
* memory space.
*/
if (relocated_ramdisk)
container = (u8 *)(__va(relocated_ramdisk) +
(cont - boot_params.hdr.ramdisk_image));
else
container = cont_va;
if (ucode_new_rev)
pr_info("microcode: updated early to new patch_level=0x%08x\n",
ucode_new_rev);
eax = cpuid_eax(0x00000001);
eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
if (ret != UCODE_OK)
retval = -EINVAL;
/*
* This will be freed any msec now, stash patches for the current
* family and switch to patch cache for cpu hotplug, etc later.
*/
container = NULL;
container_size = 0;
return retval;
}
void reload_ucode_amd(void)
{
struct microcode_amd *mc;
u32 rev, eax;
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
mc = (struct microcode_amd *)amd_ucode_patch;
if (mc && rev < mc->hdr.patch_id) {
if (!__apply_microcode_amd(mc)) {
ucode_new_rev = mc->hdr.patch_id;
pr_info("microcode: reload patch_level=0x%08x\n",
ucode_new_rev);
}
}
}

View file

@ -0,0 +1,653 @@
/*
* Intel CPU Microcode Update Driver for Linux
*
* Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* This driver allows to upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
* Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
* Software Developer's Manual
* Order Number 253668 or free download from:
*
* http://developer.intel.com/Assets/PDF/manual/253668.pdf
*
* For more information, go to http://www.urbanmyth.org/microcode
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Initial release.
* 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Added read() support + cleanups.
* 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Added 'device trimming' support. open(O_WRONLY) zeroes
* and frees the saved copy of applied microcode.
* 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Made to use devfs (/dev/cpu/microcode) + cleanups.
* 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
* Added misc device support (now uses both devfs and misc).
* Added MICROCODE_IOCFREE ioctl to clear memory.
* 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
* Messages for error cases (non Intel & no suitable microcode).
* 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
* Removed ->release(). Removed exclusive open and status bitmap.
* Added microcode_rwsem to serialize read()/write()/ioctl().
* Removed global kernel lock usage.
* 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
* Write 0 to 0x8B msr and then cpuid before reading revision,
* so that it works even if there were no update done by the
* BIOS. Otherwise, reading from 0x8B gives junk (which happened
* to be 0 on my machine which is why it worked even when I
* disabled update by the BIOS)
* Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
* 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
* Tigran Aivazian <tigran@veritas.com>
* Intel Pentium 4 processor support and bugfixes.
* 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
* Bugfix for HT (Hyper-Threading) enabled processors
* whereby processor resources are shared by all logical processors
* in a single CPU package.
* 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
* Tigran Aivazian <tigran@veritas.com>,
* Serialize updates as required on HT processors due to
* speculative nature of implementation.
* 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
* Fix the panic when writing zero-length microcode chunk.
* 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
* Jun Nakajima <jun.nakajima@intel.com>
* Support for the microcode updates in the new format.
* 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
* Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
* because we no longer hold a copy of applied microcode
* in kernel memory.
* 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
* Fix sigmatch() macro to handle old CPUs with pf == 0.
* Thanks to Stuart Swales for pointing out this bug.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/platform_device.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/cpu.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/syscore_ops.h>
#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
MODULE_LICENSE("GPL");
#define MICROCODE_VERSION "2.00"
static struct microcode_ops *microcode_ops;
bool dis_ucode_ldr;
module_param(dis_ucode_ldr, bool, 0);
/*
* Synchronization.
*
* All non cpu-hotplug-callback call sites use:
*
* - microcode_mutex to synchronize with each other;
* - get/put_online_cpus() to synchronize with
* the cpu-hotplug-callback call sites.
*
* We guarantee that only a single cpu is being
* updated at any particular moment of time.
*/
static DEFINE_MUTEX(microcode_mutex);
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
EXPORT_SYMBOL_GPL(ucode_cpu_info);
/*
* Operations that are run on a target cpu:
*/
struct cpu_info_ctx {
struct cpu_signature *cpu_sig;
int err;
};
static void collect_cpu_info_local(void *arg)
{
struct cpu_info_ctx *ctx = arg;
ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
ctx->cpu_sig);
}
static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
{
struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
int ret;
ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
if (!ret)
ret = ctx.err;
return ret;
}
static int collect_cpu_info(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
int ret;
memset(uci, 0, sizeof(*uci));
ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
if (!ret)
uci->valid = 1;
return ret;
}
struct apply_microcode_ctx {
int err;
};
static void apply_microcode_local(void *arg)
{
struct apply_microcode_ctx *ctx = arg;
ctx->err = microcode_ops->apply_microcode(smp_processor_id());
}
static int apply_microcode_on_target(int cpu)
{
struct apply_microcode_ctx ctx = { .err = 0 };
int ret;
ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
if (!ret)
ret = ctx.err;
return ret;
}
#ifdef CONFIG_MICROCODE_OLD_INTERFACE
static int do_microcode_update(const void __user *buf, size_t size)
{
int error = 0;
int cpu;
for_each_online_cpu(cpu) {
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
enum ucode_state ustate;
if (!uci->valid)
continue;
ustate = microcode_ops->request_microcode_user(cpu, buf, size);
if (ustate == UCODE_ERROR) {
error = -1;
break;
} else if (ustate == UCODE_OK)
apply_microcode_on_target(cpu);
}
return error;
}
static int microcode_open(struct inode *inode, struct file *file)
{
return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
}
static ssize_t microcode_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
ssize_t ret = -EINVAL;
if ((len >> PAGE_SHIFT) > totalram_pages) {
pr_err("too much data (max %ld pages)\n", totalram_pages);
return ret;
}
get_online_cpus();
mutex_lock(&microcode_mutex);
if (do_microcode_update(buf, len) == 0)
ret = (ssize_t)len;
if (ret > 0)
perf_check_microcode();
mutex_unlock(&microcode_mutex);
put_online_cpus();
return ret;
}
static const struct file_operations microcode_fops = {
.owner = THIS_MODULE,
.write = microcode_write,
.open = microcode_open,
.llseek = no_llseek,
};
static struct miscdevice microcode_dev = {
.minor = MICROCODE_MINOR,
.name = "microcode",
.nodename = "cpu/microcode",
.fops = &microcode_fops,
};
static int __init microcode_dev_init(void)
{
int error;
error = misc_register(&microcode_dev);
if (error) {
pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
return error;
}
return 0;
}
static void __exit microcode_dev_exit(void)
{
misc_deregister(&microcode_dev);
}
MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
MODULE_ALIAS("devname:cpu/microcode");
#else
#define microcode_dev_init() 0
#define microcode_dev_exit() do { } while (0)
#endif
/* fake device for request_firmware */
static struct platform_device *microcode_pdev;
static int reload_for_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
enum ucode_state ustate;
int err = 0;
if (!uci->valid)
return err;
ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true);
if (ustate == UCODE_OK)
apply_microcode_on_target(cpu);
else
if (ustate == UCODE_ERROR)
err = -EINVAL;
return err;
}
static ssize_t reload_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t size)
{
unsigned long val;
int cpu;
ssize_t ret = 0, tmp_ret;
ret = kstrtoul(buf, 0, &val);
if (ret)
return ret;
if (val != 1)
return size;
get_online_cpus();
mutex_lock(&microcode_mutex);
for_each_online_cpu(cpu) {
tmp_ret = reload_for_cpu(cpu);
if (tmp_ret != 0)
pr_warn("Error reloading microcode on CPU %d\n", cpu);
/* save retval of the first encountered reload error */
if (!ret)
ret = tmp_ret;
}
if (!ret)
perf_check_microcode();
mutex_unlock(&microcode_mutex);
put_online_cpus();
if (!ret)
ret = size;
return ret;
}
static ssize_t version_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
}
static ssize_t pf_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
}
static DEVICE_ATTR(reload, 0200, NULL, reload_store);
static DEVICE_ATTR(version, 0400, version_show, NULL);
static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
static struct attribute *mc_default_attrs[] = {
&dev_attr_version.attr,
&dev_attr_processor_flags.attr,
NULL
};
static struct attribute_group mc_attr_group = {
.attrs = mc_default_attrs,
.name = "microcode",
};
static void microcode_fini_cpu(int cpu)
{
microcode_ops->microcode_fini_cpu(cpu);
}
static enum ucode_state microcode_resume_cpu(int cpu)
{
pr_debug("CPU%d updated upon resume\n", cpu);
if (apply_microcode_on_target(cpu))
return UCODE_ERROR;
return UCODE_OK;
}
static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
{
enum ucode_state ustate;
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (uci && uci->valid)
return UCODE_OK;
if (collect_cpu_info(cpu))
return UCODE_ERROR;
/* --dimm. Trigger a delayed update? */
if (system_state != SYSTEM_RUNNING)
return UCODE_NFOUND;
ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev,
refresh_fw);
if (ustate == UCODE_OK) {
pr_debug("CPU%d updated upon init\n", cpu);
apply_microcode_on_target(cpu);
}
return ustate;
}
static enum ucode_state microcode_update_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (uci->valid)
return microcode_resume_cpu(cpu);
return microcode_init_cpu(cpu, false);
}
static int mc_device_add(struct device *dev, struct subsys_interface *sif)
{
int err, cpu = dev->id;
if (!cpu_online(cpu))
return 0;
pr_debug("CPU%d added\n", cpu);
err = sysfs_create_group(&dev->kobj, &mc_attr_group);
if (err)
return err;
if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
return -EINVAL;
return err;
}
static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
{
int cpu = dev->id;
if (!cpu_online(cpu))
return 0;
pr_debug("CPU%d removed\n", cpu);
microcode_fini_cpu(cpu);
sysfs_remove_group(&dev->kobj, &mc_attr_group);
return 0;
}
static struct subsys_interface mc_cpu_interface = {
.name = "microcode",
.subsys = &cpu_subsys,
.add_dev = mc_device_add,
.remove_dev = mc_device_remove,
};
/**
* mc_bp_resume - Update boot CPU microcode during resume.
*/
static void mc_bp_resume(void)
{
int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (uci->valid && uci->mc)
microcode_ops->apply_microcode(cpu);
else if (!uci->mc)
reload_early_microcode();
}
static struct syscore_ops mc_syscore_ops = {
.resume = mc_bp_resume,
};
static int
mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct device *dev;
dev = get_cpu_device(cpu);
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
microcode_update_cpu(cpu);
pr_debug("CPU%d added\n", cpu);
/*
* "break" is missing on purpose here because we want to fall
* through in order to create the sysfs group.
*/
case CPU_DOWN_FAILED:
if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
break;
case CPU_DOWN_PREPARE:
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
pr_debug("CPU%d removed\n", cpu);
break;
/*
* case CPU_DEAD:
*
* When a CPU goes offline, don't free up or invalidate the copy of
* the microcode in kernel memory, so that we can reuse it when the
* CPU comes back online without unnecessarily requesting the userspace
* for it again.
*/
}
/* The CPU refused to come up during a system resume */
if (action == CPU_UP_CANCELED_FROZEN)
microcode_fini_cpu(cpu);
return NOTIFY_OK;
}
static struct notifier_block __refdata mc_cpu_notifier = {
.notifier_call = mc_cpu_callback,
};
#ifdef MODULE
/* Autoload on Intel and AMD systems */
static const struct x86_cpu_id __initconst microcode_id[] = {
#ifdef CONFIG_MICROCODE_INTEL
{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
#endif
#ifdef CONFIG_MICROCODE_AMD
{ X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
#endif
{}
};
MODULE_DEVICE_TABLE(x86cpu, microcode_id);
#endif
static struct attribute *cpu_root_microcode_attrs[] = {
&dev_attr_reload.attr,
NULL
};
static struct attribute_group cpu_root_microcode_group = {
.name = "microcode",
.attrs = cpu_root_microcode_attrs,
};
static int __init microcode_init(void)
{
struct cpuinfo_x86 *c = &cpu_data(0);
int error;
if (paravirt_enabled() || dis_ucode_ldr)
return -EINVAL;
if (c->x86_vendor == X86_VENDOR_INTEL)
microcode_ops = init_intel_microcode();
else if (c->x86_vendor == X86_VENDOR_AMD)
microcode_ops = init_amd_microcode();
else
pr_err("no support for this CPU vendor\n");
if (!microcode_ops)
return -ENODEV;
microcode_pdev = platform_device_register_simple("microcode", -1,
NULL, 0);
if (IS_ERR(microcode_pdev))
return PTR_ERR(microcode_pdev);
get_online_cpus();
mutex_lock(&microcode_mutex);
error = subsys_interface_register(&mc_cpu_interface);
if (!error)
perf_check_microcode();
mutex_unlock(&microcode_mutex);
put_online_cpus();
if (error)
goto out_pdev;
error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
&cpu_root_microcode_group);
if (error) {
pr_err("Error creating microcode group!\n");
goto out_driver;
}
error = microcode_dev_init();
if (error)
goto out_ucode_group;
register_syscore_ops(&mc_syscore_ops);
register_hotcpu_notifier(&mc_cpu_notifier);
pr_info("Microcode Update Driver: v" MICROCODE_VERSION
" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
return 0;
out_ucode_group:
sysfs_remove_group(&cpu_subsys.dev_root->kobj,
&cpu_root_microcode_group);
out_driver:
get_online_cpus();
mutex_lock(&microcode_mutex);
subsys_interface_unregister(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
out_pdev:
platform_device_unregister(microcode_pdev);
return error;
}
module_init(microcode_init);
static void __exit microcode_exit(void)
{
struct cpuinfo_x86 *c = &cpu_data(0);
microcode_dev_exit();
unregister_hotcpu_notifier(&mc_cpu_notifier);
unregister_syscore_ops(&mc_syscore_ops);
sysfs_remove_group(&cpu_subsys.dev_root->kobj,
&cpu_root_microcode_group);
get_online_cpus();
mutex_lock(&microcode_mutex);
subsys_interface_unregister(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
platform_device_unregister(microcode_pdev);
microcode_ops = NULL;
if (c->x86_vendor == X86_VENDOR_AMD)
exit_amd_microcode();
pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
}
module_exit(microcode_exit);

View file

@ -0,0 +1,199 @@
/*
* X86 CPU microcode early update for Linux
*
* Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
* H Peter Anvin" <hpa@zytor.com>
*
* This driver allows to early upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
* Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
* Software Developer's Manual.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <asm/microcode.h>
#include <asm/microcode_intel.h>
#include <asm/microcode_amd.h>
#include <asm/processor.h>
#include <asm/cmdline.h>
#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
#define CPUID_IS(a, b, c, ebx, ecx, edx) \
(!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
/*
* In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
* x86_vendor() gets vendor id for BSP.
*
* In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
* coding, we still use x86_vendor() to get vendor id for AP.
*
* x86_vendor() gets vendor information directly through cpuid.
*/
static int x86_vendor(void)
{
u32 eax = 0x00000000;
u32 ebx, ecx = 0, edx;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
return X86_VENDOR_INTEL;
if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
return X86_VENDOR_AMD;
return X86_VENDOR_UNKNOWN;
}
static int x86_family(void)
{
u32 eax = 0x00000001;
u32 ebx, ecx = 0, edx;
int x86;
native_cpuid(&eax, &ebx, &ecx, &edx);
x86 = (eax >> 8) & 0xf;
if (x86 == 15)
x86 += (eax >> 20) & 0xff;
return x86;
}
static bool __init check_loader_disabled_bsp(void)
{
#ifdef CONFIG_X86_32
const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
const char *opt = "dis_ucode_ldr";
const char *option = (const char *)__pa_nodebug(opt);
bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
#else /* CONFIG_X86_64 */
const char *cmdline = boot_command_line;
const char *option = "dis_ucode_ldr";
bool *res = &dis_ucode_ldr;
#endif
if (cmdline_find_option_bool(cmdline, option))
*res = true;
return *res;
}
void __init load_ucode_bsp(void)
{
int vendor, x86;
if (check_loader_disabled_bsp())
return;
if (!have_cpuid_p())
return;
vendor = x86_vendor();
x86 = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
if (x86 >= 6)
load_ucode_intel_bsp();
break;
case X86_VENDOR_AMD:
if (x86 >= 0x10)
load_ucode_amd_bsp();
break;
default:
break;
}
}
static bool check_loader_disabled_ap(void)
{
#ifdef CONFIG_X86_32
return *((bool *)__pa_nodebug(&dis_ucode_ldr));
#else
return dis_ucode_ldr;
#endif
}
void load_ucode_ap(void)
{
int vendor, x86;
if (check_loader_disabled_ap())
return;
if (!have_cpuid_p())
return;
vendor = x86_vendor();
x86 = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
if (x86 >= 6)
load_ucode_intel_ap();
break;
case X86_VENDOR_AMD:
if (x86 >= 0x10)
load_ucode_amd_ap();
break;
default:
break;
}
}
int __init save_microcode_in_initrd(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
if (c->x86 >= 6)
save_microcode_in_initrd_intel();
break;
case X86_VENDOR_AMD:
if (c->x86 >= 0x10)
save_microcode_in_initrd_amd();
break;
default:
break;
}
return 0;
}
void reload_early_microcode(void)
{
int vendor, x86;
vendor = x86_vendor();
x86 = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
if (x86 >= 6)
reload_ucode_intel();
break;
case X86_VENDOR_AMD:
if (x86 >= 0x10)
reload_ucode_amd();
break;
default:
break;
}
}

View file

@ -0,0 +1,333 @@
/*
* Intel CPU Microcode Update Driver for Linux
*
* Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* This driver allows to upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
* Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
* Software Developer's Manual
* Order Number 253668 or free download from:
*
* http://developer.intel.com/Assets/PDF/manual/253668.pdf
*
* For more information, go to http://www.urbanmyth.org/microcode
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Initial release.
* 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Added read() support + cleanups.
* 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Added 'device trimming' support. open(O_WRONLY) zeroes
* and frees the saved copy of applied microcode.
* 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Made to use devfs (/dev/cpu/microcode) + cleanups.
* 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
* Added misc device support (now uses both devfs and misc).
* Added MICROCODE_IOCFREE ioctl to clear memory.
* 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
* Messages for error cases (non Intel & no suitable microcode).
* 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
* Removed ->release(). Removed exclusive open and status bitmap.
* Added microcode_rwsem to serialize read()/write()/ioctl().
* Removed global kernel lock usage.
* 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
* Write 0 to 0x8B msr and then cpuid before reading revision,
* so that it works even if there were no update done by the
* BIOS. Otherwise, reading from 0x8B gives junk (which happened
* to be 0 on my machine which is why it worked even when I
* disabled update by the BIOS)
* Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
* 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
* Tigran Aivazian <tigran@veritas.com>
* Intel Pentium 4 processor support and bugfixes.
* 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
* Bugfix for HT (Hyper-Threading) enabled processors
* whereby processor resources are shared by all logical processors
* in a single CPU package.
* 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
* Tigran Aivazian <tigran@veritas.com>,
* Serialize updates as required on HT processors due to
* speculative nature of implementation.
* 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
* Fix the panic when writing zero-length microcode chunk.
* 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
* Jun Nakajima <jun.nakajima@intel.com>
* Support for the microcode updates in the new format.
* 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
* Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
* because we no longer hold a copy of applied microcode
* in kernel memory.
* 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
* Fix sigmatch() macro to handle old CPUs with pf == 0.
* Thanks to Stuart Swales for pointing out this bug.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/firmware.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <asm/microcode_intel.h>
#include <asm/processor.h>
#include <asm/msr.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
MODULE_LICENSE("GPL");
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
unsigned int val[2];
memset(csig, 0, sizeof(*csig));
csig->sig = cpuid_eax(0x00000001);
if ((c->x86_model >= 5) || (c->x86 > 6)) {
/* get processor flags from MSR 0x17 */
rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig->pf = 1 << ((val[1] >> 18) & 7);
}
csig->rev = c->microcode;
pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
cpu_num, csig->sig, csig->pf, csig->rev);
return 0;
}
/*
* return 0 - no update found
* return 1 - found update
*/
static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
{
struct cpu_signature cpu_sig;
unsigned int csig, cpf, crev;
collect_cpu_info(cpu, &cpu_sig);
csig = cpu_sig.sig;
cpf = cpu_sig.pf;
crev = cpu_sig.rev;
return get_matching_microcode(csig, cpf, mc_intel, crev);
}
static int apply_microcode_intel(int cpu)
{
struct microcode_intel *mc_intel;
struct ucode_cpu_info *uci;
unsigned int val[2];
int cpu_num = raw_smp_processor_id();
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
uci = ucode_cpu_info + cpu;
mc_intel = uci->mc;
/* We should bind the task to the CPU */
BUG_ON(cpu_num != cpu);
if (mc_intel == NULL)
return 0;
/*
* Microcode on this CPU could be updated earlier. Only apply the
* microcode patch in mc_intel when it is newer than the one on this
* CPU.
*/
if (get_matching_mc(mc_intel, cpu) == 0)
return 0;
/* write microcode via MSR 0x79 */
wrmsr(MSR_IA32_UCODE_WRITE,
(unsigned long) mc_intel->bits,
(unsigned long) mc_intel->bits >> 16 >> 16);
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
if (val[1] != mc_intel->hdr.rev) {
pr_err("CPU%d update to revision 0x%x failed\n",
cpu_num, mc_intel->hdr.rev);
return -1;
}
pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
cpu_num, val[1],
mc_intel->hdr.date & 0xffff,
mc_intel->hdr.date >> 24,
(mc_intel->hdr.date >> 16) & 0xff);
uci->cpu_sig.rev = val[1];
c->microcode = val[1];
return 0;
}
static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
int (*get_ucode_data)(void *, const void *, size_t))
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
int new_rev = uci->cpu_sig.rev;
unsigned int leftover = size;
enum ucode_state state = UCODE_OK;
unsigned int curr_mc_size = 0;
unsigned int csig, cpf;
while (leftover) {
struct microcode_header_intel mc_header;
unsigned int mc_size;
if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
break;
mc_size = get_totalsize(&mc_header);
if (!mc_size || mc_size > leftover) {
pr_err("error! Bad data in microcode data file\n");
break;
}
/* For performance reasons, reuse mc area when possible */
if (!mc || mc_size > curr_mc_size) {
vfree(mc);
mc = vmalloc(mc_size);
if (!mc)
break;
curr_mc_size = mc_size;
}
if (get_ucode_data(mc, ucode_ptr, mc_size) ||
microcode_sanity_check(mc, 1) < 0) {
break;
}
csig = uci->cpu_sig.sig;
cpf = uci->cpu_sig.pf;
if (get_matching_microcode(csig, cpf, mc, new_rev)) {
vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
mc = NULL; /* trigger new vmalloc */
}
ucode_ptr += mc_size;
leftover -= mc_size;
}
vfree(mc);
if (leftover) {
vfree(new_mc);
state = UCODE_ERROR;
goto out;
}
if (!new_mc) {
state = UCODE_NFOUND;
goto out;
}
vfree(uci->mc);
uci->mc = (struct microcode_intel *)new_mc;
/*
* If early loading microcode is supported, save this mc into
* permanent memory. So it will be loaded early when a CPU is hot added
* or resumes.
*/
save_mc_for_early(new_mc);
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
out:
return state;
}
static int get_ucode_fw(void *to, const void *from, size_t n)
{
memcpy(to, from, n);
return 0;
}
static enum ucode_state request_microcode_fw(int cpu, struct device *device,
bool refresh_fw)
{
char name[30];
struct cpuinfo_x86 *c = &cpu_data(cpu);
const struct firmware *firmware;
enum ucode_state ret;
sprintf(name, "intel-ucode/%02x-%02x-%02x",
c->x86, c->x86_model, c->x86_mask);
if (request_firmware_direct(&firmware, name, device)) {
pr_debug("data file %s load failed\n", name);
return UCODE_NFOUND;
}
ret = generic_load_microcode(cpu, (void *)firmware->data,
firmware->size, &get_ucode_fw);
release_firmware(firmware);
return ret;
}
static int get_ucode_user(void *to, const void *from, size_t n)
{
return copy_from_user(to, from, n);
}
static enum ucode_state
request_microcode_user(int cpu, const void __user *buf, size_t size)
{
return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
}
static void microcode_fini_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
vfree(uci->mc);
uci->mc = NULL;
}
static struct microcode_ops microcode_intel_ops = {
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_fw,
.collect_cpu_info = collect_cpu_info,
.apply_microcode = apply_microcode_intel,
.microcode_fini_cpu = microcode_fini_cpu,
};
struct microcode_ops * __init init_intel_microcode(void)
{
struct cpuinfo_x86 *c = &cpu_data(0);
if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
cpu_has(c, X86_FEATURE_IA64)) {
pr_err("Intel CPU family 0x%x not supported\n", c->x86);
return NULL;
}
return &microcode_intel_ops;
}

View file

@ -0,0 +1,813 @@
/*
* Intel CPU microcode early update for Linux
*
* Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
* H Peter Anvin" <hpa@zytor.com>
*
* This allows to early upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
* Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
* Software Developer's Manual.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/earlycpio.h>
#include <linux/initrd.h>
#include <linux/cpu.h>
#include <asm/msr.h>
#include <asm/microcode_intel.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
static struct mc_saved_data {
unsigned int mc_saved_count;
struct microcode_intel **mc_saved;
} mc_saved_data;
static enum ucode_state
generic_load_microcode_early(struct microcode_intel **mc_saved_p,
unsigned int mc_saved_count,
struct ucode_cpu_info *uci)
{
struct microcode_intel *ucode_ptr, *new_mc = NULL;
int new_rev = uci->cpu_sig.rev;
enum ucode_state state = UCODE_OK;
unsigned int mc_size;
struct microcode_header_intel *mc_header;
unsigned int csig = uci->cpu_sig.sig;
unsigned int cpf = uci->cpu_sig.pf;
int i;
for (i = 0; i < mc_saved_count; i++) {
ucode_ptr = mc_saved_p[i];
mc_header = (struct microcode_header_intel *)ucode_ptr;
mc_size = get_totalsize(mc_header);
if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
new_rev = mc_header->rev;
new_mc = ucode_ptr;
}
}
if (!new_mc) {
state = UCODE_NFOUND;
goto out;
}
uci->mc = (struct microcode_intel *)new_mc;
out:
return state;
}
static void
microcode_pointer(struct microcode_intel **mc_saved,
unsigned long *mc_saved_in_initrd,
unsigned long initrd_start, int mc_saved_count)
{
int i;
for (i = 0; i < mc_saved_count; i++)
mc_saved[i] = (struct microcode_intel *)
(mc_saved_in_initrd[i] + initrd_start);
}
#ifdef CONFIG_X86_32
static void
microcode_phys(struct microcode_intel **mc_saved_tmp,
struct mc_saved_data *mc_saved_data)
{
int i;
struct microcode_intel ***mc_saved;
mc_saved = (struct microcode_intel ***)
__pa_nodebug(&mc_saved_data->mc_saved);
for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
struct microcode_intel *p;
p = *(struct microcode_intel **)
__pa_nodebug(mc_saved_data->mc_saved + i);
mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
}
}
#endif
static enum ucode_state
load_microcode(struct mc_saved_data *mc_saved_data,
unsigned long *mc_saved_in_initrd,
unsigned long initrd_start,
struct ucode_cpu_info *uci)
{
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int count = mc_saved_data->mc_saved_count;
if (!mc_saved_data->mc_saved) {
microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
initrd_start, count);
return generic_load_microcode_early(mc_saved_tmp, count, uci);
} else {
#ifdef CONFIG_X86_32
microcode_phys(mc_saved_tmp, mc_saved_data);
return generic_load_microcode_early(mc_saved_tmp, count, uci);
#else
return generic_load_microcode_early(mc_saved_data->mc_saved,
count, uci);
#endif
}
}
static u8 get_x86_family(unsigned long sig)
{
u8 x86;
x86 = (sig >> 8) & 0xf;
if (x86 == 0xf)
x86 += (sig >> 20) & 0xff;
return x86;
}
static u8 get_x86_model(unsigned long sig)
{
u8 x86, x86_model;
x86 = get_x86_family(sig);
x86_model = (sig >> 4) & 0xf;
if (x86 == 0x6 || x86 == 0xf)
x86_model += ((sig >> 16) & 0xf) << 4;
return x86_model;
}
/*
* Given CPU signature and a microcode patch, this function finds if the
* microcode patch has matching family and model with the CPU.
*/
static enum ucode_state
matching_model_microcode(struct microcode_header_intel *mc_header,
unsigned long sig)
{
u8 x86, x86_model;
u8 x86_ucode, x86_model_ucode;
struct extended_sigtable *ext_header;
unsigned long total_size = get_totalsize(mc_header);
unsigned long data_size = get_datasize(mc_header);
int ext_sigcount, i;
struct extended_signature *ext_sig;
x86 = get_x86_family(sig);
x86_model = get_x86_model(sig);
x86_ucode = get_x86_family(mc_header->sig);
x86_model_ucode = get_x86_model(mc_header->sig);
if (x86 == x86_ucode && x86_model == x86_model_ucode)
return UCODE_OK;
/* Look for ext. headers: */
if (total_size <= data_size + MC_HEADER_SIZE)
return UCODE_NFOUND;
ext_header = (struct extended_sigtable *)
mc_header + data_size + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
for (i = 0; i < ext_sigcount; i++) {
x86_ucode = get_x86_family(ext_sig->sig);
x86_model_ucode = get_x86_model(ext_sig->sig);
if (x86 == x86_ucode && x86_model == x86_model_ucode)
return UCODE_OK;
ext_sig++;
}
return UCODE_NFOUND;
}
static int
save_microcode(struct mc_saved_data *mc_saved_data,
struct microcode_intel **mc_saved_src,
unsigned int mc_saved_count)
{
int i, j;
struct microcode_intel **mc_saved_p;
int ret;
if (!mc_saved_count)
return -EINVAL;
/*
* Copy new microcode data.
*/
mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
GFP_KERNEL);
if (!mc_saved_p)
return -ENOMEM;
for (i = 0; i < mc_saved_count; i++) {
struct microcode_intel *mc = mc_saved_src[i];
struct microcode_header_intel *mc_header = &mc->hdr;
unsigned long mc_size = get_totalsize(mc_header);
mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
if (!mc_saved_p[i]) {
ret = -ENOMEM;
goto err;
}
if (!mc_saved_src[i]) {
ret = -EINVAL;
goto err;
}
memcpy(mc_saved_p[i], mc, mc_size);
}
/*
* Point to newly saved microcode.
*/
mc_saved_data->mc_saved = mc_saved_p;
mc_saved_data->mc_saved_count = mc_saved_count;
return 0;
err:
for (j = 0; j <= i; j++)
kfree(mc_saved_p[j]);
kfree(mc_saved_p);
return ret;
}
/*
* A microcode patch in ucode_ptr is saved into mc_saved
* - if it has matching signature and newer revision compared to an existing
* patch mc_saved.
* - or if it is a newly discovered microcode patch.
*
* The microcode patch should have matching model with CPU.
*/
static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
unsigned int *mc_saved_count_p)
{
int i;
int found = 0;
unsigned int mc_saved_count = *mc_saved_count_p;
struct microcode_header_intel *mc_header;
mc_header = (struct microcode_header_intel *)ucode_ptr;
for (i = 0; i < mc_saved_count; i++) {
unsigned int sig, pf;
unsigned int new_rev;
struct microcode_header_intel *mc_saved_header =
(struct microcode_header_intel *)mc_saved[i];
sig = mc_saved_header->sig;
pf = mc_saved_header->pf;
new_rev = mc_header->rev;
if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
found = 1;
if (update_match_revision(mc_header, new_rev)) {
/*
* Found an older ucode saved before.
* Replace the older one with this newer
* one.
*/
mc_saved[i] =
(struct microcode_intel *)ucode_ptr;
break;
}
}
}
if (i >= mc_saved_count && !found)
/*
* This ucode is first time discovered in ucode file.
* Save it to memory.
*/
mc_saved[mc_saved_count++] =
(struct microcode_intel *)ucode_ptr;
*mc_saved_count_p = mc_saved_count;
}
/*
* Get microcode matching with BSP's model. Only CPUs with the same model as
* BSP can stay in the platform.
*/
static enum ucode_state __init
get_matching_model_microcode(int cpu, unsigned long start,
void *data, size_t size,
struct mc_saved_data *mc_saved_data,
unsigned long *mc_saved_in_initrd,
struct ucode_cpu_info *uci)
{
u8 *ucode_ptr = data;
unsigned int leftover = size;
enum ucode_state state = UCODE_OK;
unsigned int mc_size;
struct microcode_header_intel *mc_header;
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
int i;
while (leftover) {
mc_header = (struct microcode_header_intel *)ucode_ptr;
mc_size = get_totalsize(mc_header);
if (!mc_size || mc_size > leftover ||
microcode_sanity_check(ucode_ptr, 0) < 0)
break;
leftover -= mc_size;
/*
* Since APs with same family and model as the BSP may boot in
* the platform, we need to find and save microcode patches
* with the same family and model as the BSP.
*/
if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
UCODE_OK) {
ucode_ptr += mc_size;
continue;
}
_save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
ucode_ptr += mc_size;
}
if (leftover) {
state = UCODE_ERROR;
goto out;
}
if (mc_saved_count == 0) {
state = UCODE_NFOUND;
goto out;
}
for (i = 0; i < mc_saved_count; i++)
mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
mc_saved_data->mc_saved_count = mc_saved_count;
out:
return state;
}
static int collect_cpu_info_early(struct ucode_cpu_info *uci)
{
unsigned int val[2];
u8 x86, x86_model;
struct cpu_signature csig;
unsigned int eax, ebx, ecx, edx;
csig.sig = 0;
csig.pf = 0;
csig.rev = 0;
memset(uci, 0, sizeof(*uci));
eax = 0x00000001;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
csig.sig = eax;
x86 = get_x86_family(csig.sig);
x86_model = get_x86_model(csig.sig);
if ((x86_model >= 5) || (x86 > 6)) {
/* get processor flags from MSR 0x17 */
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig.pf = 1 << ((val[1] >> 18) & 7);
}
native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
csig.rev = val[1];
uci->cpu_sig = csig;
uci->valid = 1;
return 0;
}
#ifdef DEBUG
static void __ref show_saved_mc(void)
{
int i, j;
unsigned int sig, pf, rev, total_size, data_size, date;
struct ucode_cpu_info uci;
if (mc_saved_data.mc_saved_count == 0) {
pr_debug("no microcode data saved.\n");
return;
}
pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
collect_cpu_info_early(&uci);
sig = uci.cpu_sig.sig;
pf = uci.cpu_sig.pf;
rev = uci.cpu_sig.rev;
pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
smp_processor_id(), sig, pf, rev);
for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
struct microcode_header_intel *mc_saved_header;
struct extended_sigtable *ext_header;
int ext_sigcount;
struct extended_signature *ext_sig;
mc_saved_header = (struct microcode_header_intel *)
mc_saved_data.mc_saved[i];
sig = mc_saved_header->sig;
pf = mc_saved_header->pf;
rev = mc_saved_header->rev;
total_size = get_totalsize(mc_saved_header);
data_size = get_datasize(mc_saved_header);
date = mc_saved_header->date;
pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n",
i, sig, pf, rev, total_size,
date & 0xffff,
date >> 24,
(date >> 16) & 0xff);
/* Look for ext. headers: */
if (total_size <= data_size + MC_HEADER_SIZE)
continue;
ext_header = (struct extended_sigtable *)
mc_saved_header + data_size + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
for (j = 0; j < ext_sigcount; j++) {
sig = ext_sig->sig;
pf = ext_sig->pf;
pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
j, sig, pf);
ext_sig++;
}
}
}
#else
static inline void show_saved_mc(void)
{
}
#endif
#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
static DEFINE_MUTEX(x86_cpu_microcode_mutex);
/*
* Save this mc into mc_saved_data. So it will be loaded early when a CPU is
* hot added or resumes.
*
* Please make sure this mc should be a valid microcode patch before calling
* this function.
*/
int save_mc_for_early(u8 *mc)
{
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int mc_saved_count_init;
unsigned int mc_saved_count;
struct microcode_intel **mc_saved;
int ret = 0;
int i;
/*
* Hold hotplug lock so mc_saved_data is not accessed by a CPU in
* hotplug.
*/
mutex_lock(&x86_cpu_microcode_mutex);
mc_saved_count_init = mc_saved_data.mc_saved_count;
mc_saved_count = mc_saved_data.mc_saved_count;
mc_saved = mc_saved_data.mc_saved;
if (mc_saved && mc_saved_count)
memcpy(mc_saved_tmp, mc_saved,
mc_saved_count * sizeof(struct microcode_intel *));
/*
* Save the microcode patch mc in mc_save_tmp structure if it's a newer
* version.
*/
_save_mc(mc_saved_tmp, mc, &mc_saved_count);
/*
* Save the mc_save_tmp in global mc_saved_data.
*/
ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
if (ret) {
pr_err("Cannot save microcode patch.\n");
goto out;
}
show_saved_mc();
/*
* Free old saved microcode data.
*/
if (mc_saved) {
for (i = 0; i < mc_saved_count_init; i++)
kfree(mc_saved[i]);
kfree(mc_saved);
}
out:
mutex_unlock(&x86_cpu_microcode_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(save_mc_for_early);
#endif
static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
static __init enum ucode_state
scan_microcode(unsigned long start, unsigned long end,
struct mc_saved_data *mc_saved_data,
unsigned long *mc_saved_in_initrd,
struct ucode_cpu_info *uci)
{
unsigned int size = end - start + 1;
struct cpio_data cd;
long offset = 0;
#ifdef CONFIG_X86_32
char *p = (char *)__pa_nodebug(ucode_name);
#else
char *p = ucode_name;
#endif
cd.data = NULL;
cd.size = 0;
cd = find_cpio_data(p, (void *)start, size, &offset);
if (!cd.data)
return UCODE_ERROR;
return get_matching_model_microcode(0, start, cd.data, cd.size,
mc_saved_data, mc_saved_in_initrd,
uci);
}
/*
* Print ucode update info.
*/
static void
print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
{
int cpu = smp_processor_id();
pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
cpu,
uci->cpu_sig.rev,
date & 0xffff,
date >> 24,
(date >> 16) & 0xff);
}
#ifdef CONFIG_X86_32
static int delay_ucode_info;
static int current_mc_date;
/*
* Print early updated ucode info after printk works. This is delayed info dump.
*/
void show_ucode_info_early(void)
{
struct ucode_cpu_info uci;
if (delay_ucode_info) {
collect_cpu_info_early(&uci);
print_ucode_info(&uci, current_mc_date);
delay_ucode_info = 0;
}
}
/*
* At this point, we can not call printk() yet. Keep microcode patch number in
* mc_saved_data.mc_saved and delay printing microcode info in
* show_ucode_info_early() until printk() works.
*/
static void print_ucode(struct ucode_cpu_info *uci)
{
struct microcode_intel *mc_intel;
int *delay_ucode_info_p;
int *current_mc_date_p;
mc_intel = uci->mc;
if (mc_intel == NULL)
return;
delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
*delay_ucode_info_p = 1;
*current_mc_date_p = mc_intel->hdr.date;
}
#else
/*
* Flush global tlb. We only do this in x86_64 where paging has been enabled
* already and PGE should be enabled as well.
*/
static inline void flush_tlb_early(void)
{
__native_flush_tlb_global_irq_disabled();
}
static inline void print_ucode(struct ucode_cpu_info *uci)
{
struct microcode_intel *mc_intel;
mc_intel = uci->mc;
if (mc_intel == NULL)
return;
print_ucode_info(uci, mc_intel->hdr.date);
}
#endif
static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
{
struct microcode_intel *mc_intel;
unsigned int val[2];
mc_intel = uci->mc;
if (mc_intel == NULL)
return 0;
/* write microcode via MSR 0x79 */
native_wrmsr(MSR_IA32_UCODE_WRITE,
(unsigned long) mc_intel->bits,
(unsigned long) mc_intel->bits >> 16 >> 16);
native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
if (val[1] != mc_intel->hdr.rev)
return -1;
#ifdef CONFIG_X86_64
/* Flush global tlb. This is precaution. */
flush_tlb_early();
#endif
uci->cpu_sig.rev = val[1];
if (early)
print_ucode(uci);
else
print_ucode_info(uci, mc_intel->hdr.date);
return 0;
}
/*
* This function converts microcode patch offsets previously stored in
* mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
*/
int __init save_microcode_in_initrd_intel(void)
{
unsigned int count = mc_saved_data.mc_saved_count;
struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
int ret = 0;
if (count == 0)
return ret;
microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
ret = save_microcode(&mc_saved_data, mc_saved, count);
if (ret)
pr_err("Cannot save microcode patches from initrd.\n");
show_saved_mc();
return ret;
}
static void __init
_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
unsigned long *mc_saved_in_initrd,
unsigned long initrd_start_early,
unsigned long initrd_end_early,
struct ucode_cpu_info *uci)
{
enum ucode_state ret;
collect_cpu_info_early(uci);
scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
mc_saved_in_initrd, uci);
ret = load_microcode(mc_saved_data, mc_saved_in_initrd,
initrd_start_early, uci);
if (ret == UCODE_OK)
apply_microcode_early(uci, true);
}
void __init
load_ucode_intel_bsp(void)
{
u64 ramdisk_image, ramdisk_size;
unsigned long initrd_start_early, initrd_end_early;
struct ucode_cpu_info uci;
#ifdef CONFIG_X86_32
struct boot_params *boot_params_p;
boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
ramdisk_image = boot_params_p->hdr.ramdisk_image;
ramdisk_size = boot_params_p->hdr.ramdisk_size;
initrd_start_early = ramdisk_image;
initrd_end_early = initrd_start_early + ramdisk_size;
_load_ucode_intel_bsp(
(struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
(unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
initrd_start_early, initrd_end_early, &uci);
#else
ramdisk_image = boot_params.hdr.ramdisk_image;
ramdisk_size = boot_params.hdr.ramdisk_size;
initrd_start_early = ramdisk_image + PAGE_OFFSET;
initrd_end_early = initrd_start_early + ramdisk_size;
_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
initrd_start_early, initrd_end_early,
&uci);
#endif
}
void load_ucode_intel_ap(void)
{
struct mc_saved_data *mc_saved_data_p;
struct ucode_cpu_info uci;
unsigned long *mc_saved_in_initrd_p;
unsigned long initrd_start_addr;
#ifdef CONFIG_X86_32
unsigned long *initrd_start_p;
mc_saved_in_initrd_p =
(unsigned long *)__pa_nodebug(mc_saved_in_initrd);
mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
#else
mc_saved_data_p = &mc_saved_data;
mc_saved_in_initrd_p = mc_saved_in_initrd;
initrd_start_addr = initrd_start;
#endif
/*
* If there is no valid ucode previously saved in memory, no need to
* update ucode on this AP.
*/
if (mc_saved_data_p->mc_saved_count == 0)
return;
collect_cpu_info_early(&uci);
load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
initrd_start_addr, &uci);
apply_microcode_early(&uci, true);
}
void reload_ucode_intel(void)
{
struct ucode_cpu_info uci;
enum ucode_state ret;
if (!mc_saved_data.mc_saved_count)
return;
collect_cpu_info_early(&uci);
ret = generic_load_microcode_early(mc_saved_data.mc_saved,
mc_saved_data.mc_saved_count, &uci);
if (ret != UCODE_OK)
return;
apply_microcode_early(&uci, false);
}

View file

@ -0,0 +1,174 @@
/*
* Intel CPU Microcode Update Driver for Linux
*
* Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
* H Peter Anvin" <hpa@zytor.com>
*
* This driver allows to upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
* Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
* Software Developer's Manual
* Order Number 253668 or free download from:
*
* http://developer.intel.com/Assets/PDF/manual/253668.pdf
*
* For more information, go to http://www.urbanmyth.org/microcode
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/firmware.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/microcode_intel.h>
#include <asm/processor.h>
#include <asm/msr.h>
static inline int
update_match_cpu(unsigned int csig, unsigned int cpf,
unsigned int sig, unsigned int pf)
{
return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
}
int
update_match_revision(struct microcode_header_intel *mc_header, int rev)
{
return (mc_header->rev <= rev) ? 0 : 1;
}
int microcode_sanity_check(void *mc, int print_err)
{
unsigned long total_size, data_size, ext_table_size;
struct microcode_header_intel *mc_header = mc;
struct extended_sigtable *ext_header = NULL;
int sum, orig_sum, ext_sigcount = 0, i;
struct extended_signature *ext_sig;
total_size = get_totalsize(mc_header);
data_size = get_datasize(mc_header);
if (data_size + MC_HEADER_SIZE > total_size) {
if (print_err)
pr_err("error! Bad data size in microcode data file\n");
return -EINVAL;
}
if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
if (print_err)
pr_err("error! Unknown microcode update format\n");
return -EINVAL;
}
ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
if (ext_table_size) {
if ((ext_table_size < EXT_HEADER_SIZE)
|| ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
if (print_err)
pr_err("error! Small exttable size in microcode data file\n");
return -EINVAL;
}
ext_header = mc + MC_HEADER_SIZE + data_size;
if (ext_table_size != exttable_size(ext_header)) {
if (print_err)
pr_err("error! Bad exttable size in microcode data file\n");
return -EFAULT;
}
ext_sigcount = ext_header->count;
}
/* check extended table checksum */
if (ext_table_size) {
int ext_table_sum = 0;
int *ext_tablep = (int *)ext_header;
i = ext_table_size / DWSIZE;
while (i--)
ext_table_sum += ext_tablep[i];
if (ext_table_sum) {
if (print_err)
pr_warn("aborting, bad extended signature table checksum\n");
return -EINVAL;
}
}
/* calculate the checksum */
orig_sum = 0;
i = (MC_HEADER_SIZE + data_size) / DWSIZE;
while (i--)
orig_sum += ((int *)mc)[i];
if (orig_sum) {
if (print_err)
pr_err("aborting, bad checksum\n");
return -EINVAL;
}
if (!ext_table_size)
return 0;
/* check extended signature checksum */
for (i = 0; i < ext_sigcount; i++) {
ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
EXT_SIGNATURE_SIZE * i;
sum = orig_sum
- (mc_header->sig + mc_header->pf + mc_header->cksum)
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
if (sum) {
if (print_err)
pr_err("aborting, bad checksum\n");
return -EINVAL;
}
}
return 0;
}
EXPORT_SYMBOL_GPL(microcode_sanity_check);
/*
* return 0 - no update found
* return 1 - found update
*/
int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
{
struct microcode_header_intel *mc_header = mc;
struct extended_sigtable *ext_header;
unsigned long total_size = get_totalsize(mc_header);
int ext_sigcount, i;
struct extended_signature *ext_sig;
if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
return 1;
/* Look for ext. headers: */
if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
return 0;
ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
for (i = 0; i < ext_sigcount; i++) {
if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
return 1;
ext_sig++;
}
return 0;
}
/*
* return 0 - no update found
* return 1 - found update
*/
int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
{
struct microcode_header_intel *mc_header = mc;
if (!update_match_revision(mc_header, rev))
return 0;
return get_matching_sig(csig, cpf, mc, rev);
}
EXPORT_SYMBOL_GPL(get_matching_microcode);

View file

@ -0,0 +1,64 @@
#!/bin/sh
#
# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h
#
IN=$1
OUT=$2
function dump_array()
{
ARRAY=$1
SIZE=$2
PFX=$3
POSTFIX=$4
PFX_SZ=$(echo $PFX | wc -c)
TABS="$(printf '\t\t\t\t\t')"
echo "const char * const $ARRAY[$SIZE] = {"
# Iterate through any input lines starting with #define $PFX
sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
while read i
do
# Name is everything up to the first whitespace
NAME="$(echo "$i" | sed 's/ .*//')"
# If the /* comment */ starts with a quote string, grab that.
VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
[ -z "$VALUE" ] && VALUE="\"$NAME\""
[ "$VALUE" == '""' ] && continue
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
if [ -n "$POSTFIX" ]; then
T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
TABS="$(printf '\t\t\t\t\t\t')"
TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
else
TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
fi
done
echo "};"
}
trap 'rm "$OUT"' EXIT
(
echo "#ifndef _ASM_X86_CPUFEATURE_H"
echo "#include <asm/cpufeature.h>"
echo "#endif"
echo ""
dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
echo ""
dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
) > $OUT
trap - EXIT

View file

@ -0,0 +1,153 @@
/*
* HyperV Detection code.
*
* Copyright (C) 2010, Novell, Inc.
* Author : K. Y. Srinivasan <ksrinivasan@novell.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; version 2 of the License.
*
*/
#include <linux/types.h>
#include <linux/time.h>
#include <linux/clocksource.h>
#include <linux/module.h>
#include <linux/hardirq.h>
#include <linux/efi.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
#include <asm/idle.h>
#include <asm/irq_regs.h>
#include <asm/i8259.h>
#include <asm/apic.h>
#include <asm/timer.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
#if IS_ENABLED(CONFIG_HYPERV)
static void (*vmbus_handler)(void);
void hyperv_vector_handler(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
irq_enter();
exit_idle();
inc_irq_stat(irq_hv_callback_count);
if (vmbus_handler)
vmbus_handler();
irq_exit();
set_irq_regs(old_regs);
}
void hv_setup_vmbus_irq(void (*handler)(void))
{
vmbus_handler = handler;
/*
* Setup the IDT for hypervisor callback. Prevent reallocation
* at module reload.
*/
if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
hyperv_callback_vector);
}
void hv_remove_vmbus_irq(void)
{
/* We have no way to deallocate the interrupt gate */
vmbus_handler = NULL;
}
EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
#endif
static uint32_t __init ms_hyperv_platform(void)
{
u32 eax;
u32 hyp_signature[3];
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
return 0;
cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
&eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
if (eax >= HYPERV_CPUID_MIN &&
eax <= HYPERV_CPUID_MAX &&
!memcmp("Microsoft Hv", hyp_signature, 12))
return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
return 0;
}
static cycle_t read_hv_clock(struct clocksource *arg)
{
cycle_t current_tick;
/*
* Read the partition counter to get the current tick count. This count
* is set to 0 when the partition is created and is incremented in
* 100 nanosecond units.
*/
rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
return current_tick;
}
static struct clocksource hyperv_cs = {
.name = "hyperv_clocksource",
.rating = 400, /* use this when running on Hyperv*/
.read = read_hv_clock,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static void __init ms_hyperv_init_platform(void)
{
/*
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
/*
* Get the APIC frequency.
*/
u64 hv_lapic_frequency;
rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
lapic_timer_frequency = hv_lapic_frequency;
printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n",
lapic_timer_frequency);
}
#endif
if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
#ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
#endif
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft HyperV",
.detect = ms_hyperv_platform,
.init_platform = ms_hyperv_init_platform,
};
EXPORT_SYMBOL(x86_hyper_ms_hyperv);

View file

@ -0,0 +1,3 @@
obj-y := main.o if.o generic.o cleanup.o
obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o

View file

@ -0,0 +1,124 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "mtrr.h"
static void
amd_get_mtrr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type)
{
unsigned long low, high;
rdmsr(MSR_K6_UWCCR, low, high);
/* Upper dword is region 1, lower is region 0 */
if (reg == 1)
low = high;
/* The base masks off on the right alignment */
*base = (low & 0xFFFE0000) >> PAGE_SHIFT;
*type = 0;
if (low & 1)
*type = MTRR_TYPE_UNCACHABLE;
if (low & 2)
*type = MTRR_TYPE_WRCOMB;
if (!(low & 3)) {
*size = 0;
return;
}
/*
* This needs a little explaining. The size is stored as an
* inverted mask of bits of 128K granularity 15 bits long offset
* 2 bits.
*
* So to get a size we do invert the mask and add 1 to the lowest
* mask bit (4 as its 2 bits in). This gives us a size we then shift
* to turn into 128K blocks.
*
* eg 111 1111 1111 1100 is 512K
*
* invert 000 0000 0000 0011
* +1 000 0000 0000 0100
* *128K ...
*/
low = (~low) & 0x1FFFC;
*size = (low + 4) << (15 - PAGE_SHIFT);
}
/**
* amd_set_mtrr - Set variable MTRR register on the local CPU.
*
* @reg The register to set.
* @base The base address of the region.
* @size The size of the region. If this is 0 the region is disabled.
* @type The type of the region.
*
* Returns nothing.
*/
static void
amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
{
u32 regs[2];
/*
* Low is MTRR0, High MTRR 1
*/
rdmsr(MSR_K6_UWCCR, regs[0], regs[1]);
/*
* Blank to disable
*/
if (size == 0) {
regs[reg] = 0;
} else {
/*
* Set the register to the base, the type (off by one) and an
* inverted bitmask of the size The size is the only odd
* bit. We are fed say 512K We invert this and we get 111 1111
* 1111 1011 but if you subtract one and invert you get the
* desired 111 1111 1111 1100 mask
*
* But ~(x - 1) == ~x + 1 == -x. Two's complement rocks!
*/
regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC)
| (base << PAGE_SHIFT) | (type + 1);
}
/*
* The writeback rule is quite specific. See the manual. Its
* disable local interrupts, write back the cache, set the mtrr
*/
wbinvd();
wrmsr(MSR_K6_UWCCR, regs[0], regs[1]);
}
static int
amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
{
/*
* Apply the K6 block alignment and size rules
* In order
* o Uncached or gathering only
* o 128K or bigger block
* o Power of 2 block
* o base suitably aligned to the power
*/
if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT))
|| (size & ~(size - 1)) - size || (base & (size - 1)))
return -EINVAL;
return 0;
}
static const struct mtrr_ops amd_mtrr_ops = {
.vendor = X86_VENDOR_AMD,
.set = amd_set_mtrr,
.get = amd_get_mtrr,
.get_free_region = generic_get_free_region,
.validate_add_page = amd_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
int __init amd_init_mtrr(void)
{
set_mtrr_ops(&amd_mtrr_ops);
return 0;
}

View file

@ -0,0 +1,126 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "mtrr.h"
static struct {
unsigned long high;
unsigned long low;
} centaur_mcr[8];
static u8 centaur_mcr_reserved;
static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
/**
* centaur_get_free_region - Get a free MTRR.
*
* @base: The starting (base) address of the region.
* @size: The size (in bytes) of the region.
*
* Returns: the index of the region on success, else -1 on error.
*/
static int
centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
{
unsigned long lbase, lsize;
mtrr_type ltype;
int i, max;
max = num_var_ranges;
if (replace_reg >= 0 && replace_reg < max)
return replace_reg;
for (i = 0; i < max; ++i) {
if (centaur_mcr_reserved & (1 << i))
continue;
mtrr_if->get(i, &lbase, &lsize, &ltype);
if (lsize == 0)
return i;
}
return -ENOSPC;
}
/*
* Report boot time MCR setups
*/
void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
{
centaur_mcr[mcr].low = lo;
centaur_mcr[mcr].high = hi;
}
static void
centaur_get_mcr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type * type)
{
*base = centaur_mcr[reg].high >> PAGE_SHIFT;
*size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
*type = MTRR_TYPE_WRCOMB; /* write-combining */
if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2))
*type = MTRR_TYPE_UNCACHABLE;
if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25)
*type = MTRR_TYPE_WRBACK;
if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31)
*type = MTRR_TYPE_WRBACK;
}
static void
centaur_set_mcr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
unsigned long low, high;
if (size == 0) {
/* Disable */
high = low = 0;
} else {
high = base << PAGE_SHIFT;
if (centaur_mcr_type == 0) {
/* Only support write-combining... */
low = -size << PAGE_SHIFT | 0x1f;
} else {
if (type == MTRR_TYPE_UNCACHABLE)
low = -size << PAGE_SHIFT | 0x02; /* NC */
else
low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */
}
}
centaur_mcr[reg].high = high;
centaur_mcr[reg].low = low;
wrmsr(MSR_IDT_MCR0 + reg, low, high);
}
static int
centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
{
/*
* FIXME: Winchip2 supports uncached
*/
if (type != MTRR_TYPE_WRCOMB &&
(centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
pr_warning("mtrr: only write-combining%s supported\n",
centaur_mcr_type ? " and uncacheable are" : " is");
return -EINVAL;
}
return 0;
}
static const struct mtrr_ops centaur_mtrr_ops = {
.vendor = X86_VENDOR_CENTAUR,
.set = centaur_set_mcr,
.get = centaur_get_mcr,
.get_free_region = centaur_get_free_region,
.validate_add_page = centaur_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
int __init centaur_init_mtrr(void)
{
set_mtrr_ops(&centaur_mtrr_ops);
return 0;
}

View file

@ -0,0 +1,980 @@
/*
* MTRR (Memory Type Range Register) cleanup
*
* Copyright (C) 2009 Yinghai Lu
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/uaccess.h>
#include <linux/kvm_para.h>
#include <linux/range.h>
#include <asm/processor.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "mtrr.h"
struct var_mtrr_range_state {
unsigned long base_pfn;
unsigned long size_pfn;
mtrr_type type;
};
struct var_mtrr_state {
unsigned long range_startk;
unsigned long range_sizek;
unsigned long chunk_sizek;
unsigned long gran_sizek;
unsigned int reg;
};
/* Should be related to MTRR_VAR_RANGES nums */
#define RANGE_NUM 256
static struct range __initdata range[RANGE_NUM];
static int __initdata nr_range;
static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
static int __initdata debug_print;
#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
#define BIOS_BUG_MSG KERN_WARNING \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
static int __init
x86_get_mtrr_mem_range(struct range *range, int nr_range,
unsigned long extra_remove_base,
unsigned long extra_remove_size)
{
unsigned long base, size;
mtrr_type type;
int i;
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
if (type != MTRR_TYPE_WRBACK)
continue;
base = range_state[i].base_pfn;
size = range_state[i].size_pfn;
nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
base, base + size);
}
if (debug_print) {
printk(KERN_DEBUG "After WB checking\n");
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end);
}
/* Take out UC ranges: */
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
if (type != MTRR_TYPE_UNCACHABLE &&
type != MTRR_TYPE_WRPROT)
continue;
size = range_state[i].size_pfn;
if (!size)
continue;
base = range_state[i].base_pfn;
if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
(mtrr_state.enabled & 1)) {
/* Var MTRR contains UC entry below 1M? Skip it: */
printk(BIOS_BUG_MSG, i);
if (base + size <= (1<<(20-PAGE_SHIFT)))
continue;
size -= (1<<(20-PAGE_SHIFT)) - base;
base = 1<<(20-PAGE_SHIFT);
}
subtract_range(range, RANGE_NUM, base, base + size);
}
if (extra_remove_size)
subtract_range(range, RANGE_NUM, extra_remove_base,
extra_remove_base + extra_remove_size);
if (debug_print) {
printk(KERN_DEBUG "After UC checking\n");
for (i = 0; i < RANGE_NUM; i++) {
if (!range[i].end)
continue;
printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end);
}
}
/* sort the ranges */
nr_range = clean_sort_range(range, RANGE_NUM);
if (debug_print) {
printk(KERN_DEBUG "After sorting\n");
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end);
}
return nr_range;
}
#ifdef CONFIG_MTRR_SANITIZER
static unsigned long __init sum_ranges(struct range *range, int nr_range)
{
unsigned long sum = 0;
int i;
for (i = 0; i < nr_range; i++)
sum += range[i].end - range[i].start;
return sum;
}
static int enable_mtrr_cleanup __initdata =
CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
static int __init disable_mtrr_cleanup_setup(char *str)
{
enable_mtrr_cleanup = 0;
return 0;
}
early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
static int __init enable_mtrr_cleanup_setup(char *str)
{
enable_mtrr_cleanup = 1;
return 0;
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
static int __init mtrr_cleanup_debug_setup(char *str)
{
debug_print = 1;
return 0;
}
early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
static void __init
set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
unsigned char type, unsigned int address_bits)
{
u32 base_lo, base_hi, mask_lo, mask_hi;
u64 base, mask;
if (!sizek) {
fill_mtrr_var_range(reg, 0, 0, 0, 0);
return;
}
mask = (1ULL << address_bits) - 1;
mask &= ~((((u64)sizek) << 10) - 1);
base = ((u64)basek) << 10;
base |= type;
mask |= 0x800;
base_lo = base & ((1ULL<<32) - 1);
base_hi = base >> 32;
mask_lo = mask & ((1ULL<<32) - 1);
mask_hi = mask >> 32;
fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
}
static void __init
save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
unsigned char type)
{
range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
range_state[reg].type = type;
}
static void __init set_var_mtrr_all(unsigned int address_bits)
{
unsigned long basek, sizek;
unsigned char type;
unsigned int reg;
for (reg = 0; reg < num_var_ranges; reg++) {
basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
type = range_state[reg].type;
set_var_mtrr(reg, basek, sizek, type, address_bits);
}
}
static unsigned long to_size_factor(unsigned long sizek, char *factorp)
{
unsigned long base = sizek;
char factor;
if (base & ((1<<10) - 1)) {
/* Not MB-aligned: */
factor = 'K';
} else if (base & ((1<<20) - 1)) {
factor = 'M';
base >>= 10;
} else {
factor = 'G';
base >>= 20;
}
*factorp = factor;
return base;
}
static unsigned int __init
range_to_mtrr(unsigned int reg, unsigned long range_startk,
unsigned long range_sizek, unsigned char type)
{
if (!range_sizek || (reg >= num_var_ranges))
return reg;
while (range_sizek) {
unsigned long max_align, align;
unsigned long sizek;
/* Compute the maximum size with which we can make a range: */
if (range_startk)
max_align = __ffs(range_startk);
else
max_align = BITS_PER_LONG - 1;
align = __fls(range_sizek);
if (align > max_align)
align = max_align;
sizek = 1UL << align;
if (debug_print) {
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
start_base = to_size_factor(range_startk, &start_factor);
size_base = to_size_factor(sizek, &size_factor);
Dprintk("Setting variable MTRR %d, "
"base: %ld%cB, range: %ld%cB, type %s\n",
reg, start_base, start_factor,
size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
);
}
save_var_mtrr(reg++, range_startk, sizek, type);
range_startk += sizek;
range_sizek -= sizek;
if (reg >= num_var_ranges)
break;
}
return reg;
}
static unsigned __init
range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
unsigned long sizek)
{
unsigned long hole_basek, hole_sizek;
unsigned long second_basek, second_sizek;
unsigned long range0_basek, range0_sizek;
unsigned long range_basek, range_sizek;
unsigned long chunk_sizek;
unsigned long gran_sizek;
hole_basek = 0;
hole_sizek = 0;
second_basek = 0;
second_sizek = 0;
chunk_sizek = state->chunk_sizek;
gran_sizek = state->gran_sizek;
/* Align with gran size, prevent small block used up MTRRs: */
range_basek = ALIGN(state->range_startk, gran_sizek);
if ((range_basek > basek) && basek)
return second_sizek;
state->range_sizek -= (range_basek - state->range_startk);
range_sizek = ALIGN(state->range_sizek, gran_sizek);
while (range_sizek > state->range_sizek) {
range_sizek -= gran_sizek;
if (!range_sizek)
return 0;
}
state->range_sizek = range_sizek;
/* Try to append some small hole: */
range0_basek = state->range_startk;
range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
/* No increase: */
if (range0_sizek == state->range_sizek) {
Dprintk("rangeX: %016lx - %016lx\n",
range0_basek<<10,
(range0_basek + state->range_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range0_basek,
state->range_sizek, MTRR_TYPE_WRBACK);
return 0;
}
/* Only cut back when it is not the last: */
if (sizek) {
while (range0_basek + range0_sizek > (basek + sizek)) {
if (range0_sizek >= chunk_sizek)
range0_sizek -= chunk_sizek;
else
range0_sizek = 0;
if (!range0_sizek)
break;
}
}
second_try:
range_basek = range0_basek + range0_sizek;
/* One hole in the middle: */
if (range_basek > basek && range_basek <= (basek + sizek))
second_sizek = range_basek - basek;
if (range0_sizek > state->range_sizek) {
/* One hole in middle or at the end: */
hole_sizek = range0_sizek - state->range_sizek - second_sizek;
/* Hole size should be less than half of range0 size: */
if (hole_sizek >= (range0_sizek >> 1) &&
range0_sizek >= chunk_sizek) {
range0_sizek -= chunk_sizek;
second_sizek = 0;
hole_sizek = 0;
goto second_try;
}
}
if (range0_sizek) {
Dprintk("range0: %016lx - %016lx\n",
range0_basek<<10,
(range0_basek + range0_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range0_basek,
range0_sizek, MTRR_TYPE_WRBACK);
}
if (range0_sizek < state->range_sizek) {
/* Need to handle left over range: */
range_sizek = state->range_sizek - range0_sizek;
Dprintk("range: %016lx - %016lx\n",
range_basek<<10,
(range_basek + range_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range_basek,
range_sizek, MTRR_TYPE_WRBACK);
}
if (hole_sizek) {
hole_basek = range_basek - hole_sizek - second_sizek;
Dprintk("hole: %016lx - %016lx\n",
hole_basek<<10,
(hole_basek + hole_sizek)<<10);
state->reg = range_to_mtrr(state->reg, hole_basek,
hole_sizek, MTRR_TYPE_UNCACHABLE);
}
return second_sizek;
}
static void __init
set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
unsigned long size_pfn)
{
unsigned long basek, sizek;
unsigned long second_sizek = 0;
if (state->reg >= num_var_ranges)
return;
basek = base_pfn << (PAGE_SHIFT - 10);
sizek = size_pfn << (PAGE_SHIFT - 10);
/* See if I can merge with the last range: */
if ((basek <= 1024) ||
(state->range_startk + state->range_sizek == basek)) {
unsigned long endk = basek + sizek;
state->range_sizek = endk - state->range_startk;
return;
}
/* Write the range mtrrs: */
if (state->range_sizek != 0)
second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
/* Allocate an msr: */
state->range_startk = basek + second_sizek;
state->range_sizek = sizek - second_sizek;
}
/* Mininum size of mtrr block that can take hole: */
static u64 mtrr_chunk_size __initdata = (256ULL<<20);
static int __init parse_mtrr_chunk_size_opt(char *p)
{
if (!p)
return -EINVAL;
mtrr_chunk_size = memparse(p, &p);
return 0;
}
early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
/* Granularity of mtrr of block: */
static u64 mtrr_gran_size __initdata;
static int __init parse_mtrr_gran_size_opt(char *p)
{
if (!p)
return -EINVAL;
mtrr_gran_size = memparse(p, &p);
return 0;
}
early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
static unsigned long nr_mtrr_spare_reg __initdata =
CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
static int __init parse_mtrr_spare_reg(char *arg)
{
if (arg)
nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
return 0;
}
early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
static int __init
x86_setup_var_mtrrs(struct range *range, int nr_range,
u64 chunk_size, u64 gran_size)
{
struct var_mtrr_state var_state;
int num_reg;
int i;
var_state.range_startk = 0;
var_state.range_sizek = 0;
var_state.reg = 0;
var_state.chunk_sizek = chunk_size >> 10;
var_state.gran_sizek = gran_size >> 10;
memset(range_state, 0, sizeof(range_state));
/* Write the range: */
for (i = 0; i < nr_range; i++) {
set_var_mtrr_range(&var_state, range[i].start,
range[i].end - range[i].start);
}
/* Write the last range: */
if (var_state.range_sizek != 0)
range_to_mtrr_with_hole(&var_state, 0, 0);
num_reg = var_state.reg;
/* Clear out the extra MTRR's: */
while (var_state.reg < num_var_ranges) {
save_var_mtrr(var_state.reg, 0, 0, 0);
var_state.reg++;
}
return num_reg;
}
struct mtrr_cleanup_result {
unsigned long gran_sizek;
unsigned long chunk_sizek;
unsigned long lose_cover_sizek;
unsigned int num_reg;
int bad;
};
/*
* gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
* chunk size: gran_size, ..., 2G
* so we need (1+16)*8
*/
#define NUM_RESULT 136
#define PSHIFT (PAGE_SHIFT - 10)
static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
static unsigned long __initdata min_loss_pfn[RANGE_NUM];
static void __init print_out_mtrr_range_state(void)
{
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
mtrr_type type;
int i;
for (i = 0; i < num_var_ranges; i++) {
size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
if (!size_base)
continue;
size_base = to_size_factor(size_base, &size_factor),
start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
start_base = to_size_factor(start_base, &start_factor),
type = range_state[i].type;
printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
i, start_base, start_factor,
size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
((type == MTRR_TYPE_WRPROT) ? "WP" :
((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
);
}
}
static int __init mtrr_need_cleanup(void)
{
int i;
mtrr_type type;
unsigned long size;
/* Extra one for all 0: */
int num[MTRR_NUM_TYPES + 1];
/* Check entries number: */
memset(num, 0, sizeof(num));
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
size = range_state[i].size_pfn;
if (type >= MTRR_NUM_TYPES)
continue;
if (!size)
type = MTRR_NUM_TYPES;
num[type]++;
}
/* Check if we got UC entries: */
if (!num[MTRR_TYPE_UNCACHABLE])
return 0;
/* Check if we only had WB and UC */
if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
return 1;
}
static unsigned long __initdata range_sums;
static void __init
mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
unsigned long x_remove_base,
unsigned long x_remove_size, int i)
{
static struct range range_new[RANGE_NUM];
unsigned long range_sums_new;
static int nr_range_new;
int num_reg;
/* Convert ranges to var ranges state: */
num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
/* We got new setting in range_state, check it: */
memset(range_new, 0, sizeof(range_new));
nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
x_remove_base, x_remove_size);
range_sums_new = sum_ranges(range_new, nr_range_new);
result[i].chunk_sizek = chunk_size >> 10;
result[i].gran_sizek = gran_size >> 10;
result[i].num_reg = num_reg;
if (range_sums < range_sums_new) {
result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT;
result[i].bad = 1;
} else {
result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT;
}
/* Double check it: */
if (!result[i].bad && !result[i].lose_cover_sizek) {
if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range)))
result[i].bad = 1;
}
if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg]))
min_loss_pfn[num_reg] = range_sums - range_sums_new;
}
static void __init mtrr_print_out_one_result(int i)
{
unsigned long gran_base, chunk_base, lose_base;
char gran_factor, chunk_factor, lose_factor;
gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
result[i].bad ? "*BAD*" : " ",
gran_base, gran_factor, chunk_base, chunk_factor);
pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n",
result[i].num_reg, result[i].bad ? "-" : "",
lose_base, lose_factor);
}
static int __init mtrr_search_optimal_index(void)
{
int num_reg_good;
int index_good;
int i;
if (nr_mtrr_spare_reg >= num_var_ranges)
nr_mtrr_spare_reg = num_var_ranges - 1;
num_reg_good = -1;
for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
if (!min_loss_pfn[i])
num_reg_good = i;
}
index_good = -1;
if (num_reg_good != -1) {
for (i = 0; i < NUM_RESULT; i++) {
if (!result[i].bad &&
result[i].num_reg == num_reg_good &&
!result[i].lose_cover_sizek) {
index_good = i;
break;
}
}
}
return index_good;
}
int __init mtrr_cleanup(unsigned address_bits)
{
unsigned long x_remove_base, x_remove_size;
unsigned long base, size, def, dummy;
u64 chunk_size, gran_size;
mtrr_type type;
int index_good;
int i;
if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
return 0;
rdmsr(MSR_MTRRdefType, def, dummy);
def &= 0xff;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
/* Get it and store it aside: */
memset(range_state, 0, sizeof(range_state));
for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i, &base, &size, &type);
range_state[i].base_pfn = base;
range_state[i].size_pfn = size;
range_state[i].type = type;
}
/* Check if we need handle it and can handle it: */
if (!mtrr_need_cleanup())
return 0;
/* Print original var MTRRs at first, for debugging: */
printk(KERN_DEBUG "original variable MTRRs\n");
print_out_mtrr_range_state();
memset(range, 0, sizeof(range));
x_remove_size = 0;
x_remove_base = 1 << (32 - PAGE_SHIFT);
if (mtrr_tom2)
x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base;
/*
* [0, 1M) should always be covered by var mtrr with WB
* and fixed mtrrs should take effect before var mtrr for it:
*/
nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0,
1ULL<<(20 - PAGE_SHIFT));
/* add from var mtrr at last */
nr_range = x86_get_mtrr_mem_range(range, nr_range,
x_remove_base, x_remove_size);
range_sums = sum_ranges(range, nr_range);
printk(KERN_INFO "total RAM covered: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
i = 0;
mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
x_remove_base, x_remove_size, i);
mtrr_print_out_one_result(i);
if (!result[i].bad) {
set_var_mtrr_all(address_bits);
printk(KERN_DEBUG "New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
}
printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
"will find optimal one\n");
}
i = 0;
memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
memset(result, 0, sizeof(result));
for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
for (chunk_size = gran_size; chunk_size < (1ULL<<32);
chunk_size <<= 1) {
if (i >= NUM_RESULT)
continue;
mtrr_calc_range_state(chunk_size, gran_size,
x_remove_base, x_remove_size, i);
if (debug_print) {
mtrr_print_out_one_result(i);
printk(KERN_INFO "\n");
}
i++;
}
}
/* Try to find the optimal index: */
index_good = mtrr_search_optimal_index();
if (index_good != -1) {
printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
i = index_good;
mtrr_print_out_one_result(i);
/* Convert ranges to var ranges state: */
chunk_size = result[i].chunk_sizek;
chunk_size <<= 10;
gran_size = result[i].gran_sizek;
gran_size <<= 10;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
set_var_mtrr_all(address_bits);
printk(KERN_DEBUG "New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
} else {
/* print out all */
for (i = 0; i < NUM_RESULT; i++)
mtrr_print_out_one_result(i);
}
printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
return 0;
}
#else
int __init mtrr_cleanup(unsigned address_bits)
{
return 0;
}
#endif
static int disable_mtrr_trim;
static int __init disable_mtrr_trim_setup(char *str)
{
disable_mtrr_trim = 1;
return 0;
}
early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
/*
* Newer AMD K8s and later CPUs have a special magic MSR way to force WB
* for memory >4GB. Check for that here.
* Note this won't check if the MTRRs < 4GB where the magic bit doesn't
* apply to are wrong, but so far we don't know of any such case in the wild.
*/
#define Tom2Enabled (1U << 21)
#define Tom2ForceMemTypeWB (1U << 22)
int __init amd_special_default_mtrr(void)
{
u32 l, h;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return 0;
if (boot_cpu_data.x86 < 0xf)
return 0;
/* In case some hypervisor doesn't pass SYSCFG through: */
if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
return 0;
/*
* Memory between 4GB and top of mem is forced WB by this magic bit.
* Reserved before K8RevF, but should be zero there.
*/
if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
(Tom2Enabled | Tom2ForceMemTypeWB))
return 1;
return 0;
}
static u64 __init
real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
{
u64 trim_start, trim_size;
trim_start = start_pfn;
trim_start <<= PAGE_SHIFT;
trim_size = limit_pfn;
trim_size <<= PAGE_SHIFT;
trim_size -= trim_start;
return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED);
}
/**
* mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
* @end_pfn: ending page frame number
*
* Some buggy BIOSes don't setup the MTRRs properly for systems with certain
* memory configurations. This routine checks that the highest MTRR matches
* the end of memory, to make sure the MTRRs having a write back type cover
* all of the memory the kernel is intending to use. If not, it'll trim any
* memory off the end by adjusting end_pfn, removing it from the kernel's
* allocation pools, warning the user with an obnoxious message.
*/
int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
{
unsigned long i, base, size, highest_pfn = 0, def, dummy;
mtrr_type type;
u64 total_trim_size;
/* extra one for all 0 */
int num[MTRR_NUM_TYPES + 1];
/*
* Make sure we only trim uncachable memory on machines that
* support the Intel MTRR architecture:
*/
if (!is_cpu(INTEL) || disable_mtrr_trim)
return 0;
rdmsr(MSR_MTRRdefType, def, dummy);
def &= 0xff;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
/* Get it and store it aside: */
memset(range_state, 0, sizeof(range_state));
for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i, &base, &size, &type);
range_state[i].base_pfn = base;
range_state[i].size_pfn = size;
range_state[i].type = type;
}
/* Find highest cached pfn: */
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
if (type != MTRR_TYPE_WRBACK)
continue;
base = range_state[i].base_pfn;
size = range_state[i].size_pfn;
if (highest_pfn < base + size)
highest_pfn = base + size;
}
/* kvm/qemu doesn't have mtrr set right, don't trim them all: */
if (!highest_pfn) {
printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
return 0;
}
/* Check entries number: */
memset(num, 0, sizeof(num));
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
if (type >= MTRR_NUM_TYPES)
continue;
size = range_state[i].size_pfn;
if (!size)
type = MTRR_NUM_TYPES;
num[type]++;
}
/* No entry for WB? */
if (!num[MTRR_TYPE_WRBACK])
return 0;
/* Check if we only had WB and UC: */
if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
memset(range, 0, sizeof(range));
nr_range = 0;
if (mtrr_tom2) {
range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
if (highest_pfn < range[nr_range].end)
highest_pfn = range[nr_range].end;
nr_range++;
}
nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
/* Check the head: */
total_trim_size = 0;
if (range[0].start)
total_trim_size += real_trim_memory(0, range[0].start);
/* Check the holes: */
for (i = 0; i < nr_range - 1; i++) {
if (range[i].end < range[i+1].start)
total_trim_size += real_trim_memory(range[i].end,
range[i+1].start);
}
/* Check the top: */
i = nr_range - 1;
if (range[i].end < end_pfn)
total_trim_size += real_trim_memory(range[i].end,
end_pfn);
if (total_trim_size) {
pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20);
if (!changed_by_mtrr_cleanup)
WARN_ON(1);
pr_info("update e820 for mtrr\n");
update_e820();
return 1;
}
return 0;
}

View file

@ -0,0 +1,282 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <asm/processor-cyrix.h>
#include <asm/processor-flags.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "mtrr.h"
static void
cyrix_get_arr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type * type)
{
unsigned char arr, ccr3, rcr, shift;
unsigned long flags;
arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
((unsigned char *)base)[3] = getCx86(arr);
((unsigned char *)base)[2] = getCx86(arr + 1);
((unsigned char *)base)[1] = getCx86(arr + 2);
rcr = getCx86(CX86_RCR_BASE + reg);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
local_irq_restore(flags);
shift = ((unsigned char *) base)[1] & 0x0f;
*base >>= PAGE_SHIFT;
/*
* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7
* Note: shift==0xf means 4G, this is unsupported.
*/
if (shift)
*size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1);
else
*size = 0;
/* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */
if (reg < 7) {
switch (rcr) {
case 1:
*type = MTRR_TYPE_UNCACHABLE;
break;
case 8:
*type = MTRR_TYPE_WRBACK;
break;
case 9:
*type = MTRR_TYPE_WRCOMB;
break;
case 24:
default:
*type = MTRR_TYPE_WRTHROUGH;
break;
}
} else {
switch (rcr) {
case 0:
*type = MTRR_TYPE_UNCACHABLE;
break;
case 8:
*type = MTRR_TYPE_WRCOMB;
break;
case 9:
*type = MTRR_TYPE_WRBACK;
break;
case 25:
default:
*type = MTRR_TYPE_WRTHROUGH;
break;
}
}
}
/*
* cyrix_get_free_region - get a free ARR.
*
* @base: the starting (base) address of the region.
* @size: the size (in bytes) of the region.
*
* Returns: the index of the region on success, else -1 on error.
*/
static int
cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
{
unsigned long lbase, lsize;
mtrr_type ltype;
int i;
switch (replace_reg) {
case 7:
if (size < 0x40)
break;
case 6:
case 5:
case 4:
return replace_reg;
case 3:
case 2:
case 1:
case 0:
return replace_reg;
}
/* If we are to set up a region >32M then look at ARR7 immediately */
if (size > 0x2000) {
cyrix_get_arr(7, &lbase, &lsize, &ltype);
if (lsize == 0)
return 7;
/* Else try ARR0-ARR6 first */
} else {
for (i = 0; i < 7; i++) {
cyrix_get_arr(i, &lbase, &lsize, &ltype);
if (lsize == 0)
return i;
}
/*
* ARR0-ARR6 isn't free
* try ARR7 but its size must be at least 256K
*/
cyrix_get_arr(i, &lbase, &lsize, &ltype);
if ((lsize == 0) && (size >= 0x40))
return i;
}
return -ENOSPC;
}
static u32 cr4, ccr3;
static void prepare_set(void)
{
u32 cr0;
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
cr4 = read_cr4();
write_cr4(cr4 & ~X86_CR4_PGE);
}
/*
* Disable and flush caches.
* Note that wbinvd flushes the TLBs as a side-effect
*/
cr0 = read_cr0() | X86_CR0_CD;
wbinvd();
write_cr0(cr0);
wbinvd();
/* Cyrix ARRs - everything else was excluded at the top */
ccr3 = getCx86(CX86_CCR3);
/* Cyrix ARRs - everything else was excluded at the top */
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
}
static void post_set(void)
{
/* Flush caches and TLBs */
wbinvd();
/* Cyrix ARRs - everything else was excluded at the top */
setCx86(CX86_CCR3, ccr3);
/* Enable caches */
write_cr0(read_cr0() & ~X86_CR0_CD);
/* Restore value of CR4 */
if (cpu_has_pge)
write_cr4(cr4);
}
static void cyrix_set_arr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
unsigned char arr, arr_type, arr_size;
arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
/* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */
if (reg >= 7)
size >>= 6;
size &= 0x7fff; /* make sure arr_size <= 14 */
for (arr_size = 0; size; arr_size++, size >>= 1)
;
if (reg < 7) {
switch (type) {
case MTRR_TYPE_UNCACHABLE:
arr_type = 1;
break;
case MTRR_TYPE_WRCOMB:
arr_type = 9;
break;
case MTRR_TYPE_WRTHROUGH:
arr_type = 24;
break;
default:
arr_type = 8;
break;
}
} else {
switch (type) {
case MTRR_TYPE_UNCACHABLE:
arr_type = 0;
break;
case MTRR_TYPE_WRCOMB:
arr_type = 8;
break;
case MTRR_TYPE_WRTHROUGH:
arr_type = 25;
break;
default:
arr_type = 9;
break;
}
}
prepare_set();
base <<= PAGE_SHIFT;
setCx86(arr + 0, ((unsigned char *)&base)[3]);
setCx86(arr + 1, ((unsigned char *)&base)[2]);
setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size);
setCx86(CX86_RCR_BASE + reg, arr_type);
post_set();
}
typedef struct {
unsigned long base;
unsigned long size;
mtrr_type type;
} arr_state_t;
static arr_state_t arr_state[8] = {
{0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
{0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
};
static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
static void cyrix_set_all(void)
{
int i;
prepare_set();
/* the CCRs are not contiguous */
for (i = 0; i < 4; i++)
setCx86(CX86_CCR0 + i, ccr_state[i]);
for (; i < 7; i++)
setCx86(CX86_CCR4 + i, ccr_state[i]);
for (i = 0; i < 8; i++) {
cyrix_set_arr(i, arr_state[i].base,
arr_state[i].size, arr_state[i].type);
}
post_set();
}
static const struct mtrr_ops cyrix_mtrr_ops = {
.vendor = X86_VENDOR_CYRIX,
.set_all = cyrix_set_all,
.set = cyrix_set_arr,
.get = cyrix_get_arr,
.get_free_region = cyrix_get_free_region,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
int __init cyrix_init_mtrr(void)
{
set_mtrr_ops(&cyrix_mtrr_ops);
return 0;
}

View file

@ -0,0 +1,845 @@
/*
* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
* because MTRRs can span up to 40 bits (36bits on most modern x86)
*/
#define DEBUG
#include <linux/module.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <asm/processor-flags.h>
#include <asm/cpufeature.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include "mtrr.h"
struct fixed_range_block {
int base_msr; /* start address of an MTRR block */
int ranges; /* number of MTRRs in this block */
};
static struct fixed_range_block fixed_range_blocks[] = {
{ MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
{ MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
{ MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
{}
};
static unsigned long smp_changes_mask;
static int mtrr_state_set;
u64 mtrr_tom2;
struct mtrr_state_type mtrr_state;
EXPORT_SYMBOL_GPL(mtrr_state);
/*
* BIOS is expected to clear MtrrFixDramModEn bit, see for example
* "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
* Opteron Processors" (26094 Rev. 3.30 February 2006), section
* "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
* to 1 during BIOS initalization of the fixed MTRRs, then cleared to
* 0 for operation."
*/
static inline void k8_check_syscfg_dram_mod_en(void)
{
u32 lo, hi;
if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
(boot_cpu_data.x86 >= 0x0f)))
return;
rdmsr(MSR_K8_SYSCFG, lo, hi);
if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
" not cleared by BIOS, clearing this bit\n",
smp_processor_id());
lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
}
}
/* Get the size of contiguous MTRR range */
static u64 get_mtrr_size(u64 mask)
{
u64 size;
mask >>= PAGE_SHIFT;
mask |= size_or_mask;
size = -mask;
size <<= PAGE_SHIFT;
return size;
}
/*
* Check and return the effective type for MTRR-MTRR type overlap.
* Returns 1 if the effective type is UNCACHEABLE, else returns 0
*/
static int check_type_overlap(u8 *prev, u8 *curr)
{
if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
*prev = MTRR_TYPE_UNCACHABLE;
*curr = MTRR_TYPE_UNCACHABLE;
return 1;
}
if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
(*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
*prev = MTRR_TYPE_WRTHROUGH;
*curr = MTRR_TYPE_WRTHROUGH;
}
if (*prev != *curr) {
*prev = MTRR_TYPE_UNCACHABLE;
*curr = MTRR_TYPE_UNCACHABLE;
return 1;
}
return 0;
}
/*
* Error/Semi-error returns:
* 0xFF - when MTRR is not enabled
* *repeat == 1 implies [start:end] spanned across MTRR range and type returned
* corresponds only to [start:*partial_end].
* Caller has to lookup again for [*partial_end:end].
*/
static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
{
int i;
u64 base, mask;
u8 prev_match, curr_match;
*repeat = 0;
if (!mtrr_state_set)
return 0xFF;
if (!mtrr_state.enabled)
return 0xFF;
/* Make end inclusive end, instead of exclusive */
end--;
/* Look in fixed ranges. Just return the type as per start */
if (mtrr_state.have_fixed && (start < 0x100000)) {
int idx;
if (start < 0x80000) {
idx = 0;
idx += (start >> 16);
return mtrr_state.fixed_ranges[idx];
} else if (start < 0xC0000) {
idx = 1 * 8;
idx += ((start - 0x80000) >> 14);
return mtrr_state.fixed_ranges[idx];
} else if (start < 0x1000000) {
idx = 3 * 8;
idx += ((start - 0xC0000) >> 12);
return mtrr_state.fixed_ranges[idx];
}
}
/*
* Look in variable ranges
* Look of multiple ranges matching this address and pick type
* as per MTRR precedence
*/
if (!(mtrr_state.enabled & 2))
return mtrr_state.def_type;
prev_match = 0xFF;
for (i = 0; i < num_var_ranges; ++i) {
unsigned short start_state, end_state;
if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
continue;
base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) +
(mtrr_state.var_ranges[i].base_lo & PAGE_MASK);
mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) +
(mtrr_state.var_ranges[i].mask_lo & PAGE_MASK);
start_state = ((start & mask) == (base & mask));
end_state = ((end & mask) == (base & mask));
if (start_state != end_state) {
/*
* We have start:end spanning across an MTRR.
* We split the region into
* either
* (start:mtrr_end) (mtrr_end:end)
* or
* (start:mtrr_start) (mtrr_start:end)
* depending on kind of overlap.
* Return the type for first region and a pointer to
* the start of second region so that caller will
* lookup again on the second region.
* Note: This way we handle multiple overlaps as well.
*/
if (start_state)
*partial_end = base + get_mtrr_size(mask);
else
*partial_end = base;
if (unlikely(*partial_end <= start)) {
WARN_ON(1);
*partial_end = start + PAGE_SIZE;
}
end = *partial_end - 1; /* end is inclusive */
*repeat = 1;
}
if ((start & mask) != (base & mask))
continue;
curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
if (prev_match == 0xFF) {
prev_match = curr_match;
continue;
}
if (check_type_overlap(&prev_match, &curr_match))
return curr_match;
}
if (mtrr_tom2) {
if (start >= (1ULL<<32) && (end < mtrr_tom2))
return MTRR_TYPE_WRBACK;
}
if (prev_match != 0xFF)
return prev_match;
return mtrr_state.def_type;
}
/*
* Returns the effective MTRR type for the region
* Error return:
* 0xFF - when MTRR is not enabled
*/
u8 mtrr_type_lookup(u64 start, u64 end)
{
u8 type, prev_type;
int repeat;
u64 partial_end;
type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
/*
* Common path is with repeat = 0.
* However, we can have cases where [start:end] spans across some
* MTRR range. Do repeated lookups for that case here.
*/
while (repeat) {
prev_type = type;
start = partial_end;
type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
if (check_type_overlap(&prev_type, &type))
return type;
}
return type;
}
/* Get the MSR pair relating to a var range */
static void
get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
{
rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
}
/* Fill the MSR pair relating to a var range */
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
{
struct mtrr_var_range *vr;
vr = mtrr_state.var_ranges;
vr[index].base_lo = base_lo;
vr[index].base_hi = base_hi;
vr[index].mask_lo = mask_lo;
vr[index].mask_hi = mask_hi;
}
static void get_fixed_ranges(mtrr_type *frs)
{
unsigned int *p = (unsigned int *)frs;
int i;
k8_check_syscfg_dram_mod_en();
rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
for (i = 0; i < 2; i++)
rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
for (i = 0; i < 8; i++)
rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
}
void mtrr_save_fixed_ranges(void *info)
{
if (cpu_has_mtrr)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
static unsigned __initdata last_fixed_start;
static unsigned __initdata last_fixed_end;
static mtrr_type __initdata last_fixed_type;
static void __init print_fixed_last(void)
{
if (!last_fixed_end)
return;
pr_debug(" %05X-%05X %s\n", last_fixed_start,
last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
last_fixed_end = 0;
}
static void __init update_fixed_last(unsigned base, unsigned end,
mtrr_type type)
{
last_fixed_start = base;
last_fixed_end = end;
last_fixed_type = type;
}
static void __init
print_fixed(unsigned base, unsigned step, const mtrr_type *types)
{
unsigned i;
for (i = 0; i < 8; ++i, ++types, base += step) {
if (last_fixed_end == 0) {
update_fixed_last(base, base + step, *types);
continue;
}
if (last_fixed_end == base && last_fixed_type == *types) {
last_fixed_end = base + step;
continue;
}
/* new segments: gap or different type */
print_fixed_last();
update_fixed_last(base, base + step, *types);
}
}
static void prepare_set(void);
static void post_set(void);
static void __init print_mtrr_state(void)
{
unsigned int i;
int high_width;
pr_debug("MTRR default type: %s\n",
mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
pr_debug("MTRR fixed ranges %sabled:\n",
mtrr_state.enabled & 1 ? "en" : "dis");
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
print_fixed(0x80000 + i * 0x20000, 0x04000,
mtrr_state.fixed_ranges + (i + 1) * 8);
for (i = 0; i < 8; ++i)
print_fixed(0xC0000 + i * 0x08000, 0x01000,
mtrr_state.fixed_ranges + (i + 3) * 8);
/* tail */
print_fixed_last();
}
pr_debug("MTRR variable ranges %sabled:\n",
mtrr_state.enabled & 2 ? "en" : "dis");
high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
i,
high_width,
mtrr_state.var_ranges[i].base_hi,
mtrr_state.var_ranges[i].base_lo >> 12,
high_width,
mtrr_state.var_ranges[i].mask_hi,
mtrr_state.var_ranges[i].mask_lo >> 12,
mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
else
pr_debug(" %u disabled\n", i);
}
if (mtrr_tom2)
pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
}
/* Grab all of the MTRR state for this CPU into *state */
void __init get_mtrr_state(void)
{
struct mtrr_var_range *vrs;
unsigned long flags;
unsigned lo, dummy;
unsigned int i;
vrs = mtrr_state.var_ranges;
rdmsr(MSR_MTRRcap, lo, dummy);
mtrr_state.have_fixed = (lo >> 8) & 1;
for (i = 0; i < num_var_ranges; i++)
get_mtrr_var_range(i, &vrs[i]);
if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
rdmsr(MSR_MTRRdefType, lo, dummy);
mtrr_state.def_type = (lo & 0xff);
mtrr_state.enabled = (lo & 0xc00) >> 10;
if (amd_special_default_mtrr()) {
unsigned low, high;
/* TOP_MEM2 */
rdmsr(MSR_K8_TOP_MEM2, low, high);
mtrr_tom2 = high;
mtrr_tom2 <<= 32;
mtrr_tom2 |= low;
mtrr_tom2 &= 0xffffff800000ULL;
}
print_mtrr_state();
mtrr_state_set = 1;
/* PAT setup for BP. We need to go through sync steps here */
local_irq_save(flags);
prepare_set();
pat_init();
post_set();
local_irq_restore(flags);
}
/* Some BIOS's are messed up and don't set all MTRRs the same! */
void __init mtrr_state_warn(void)
{
unsigned long mask = smp_changes_mask;
if (!mask)
return;
if (mask & MTRR_CHANGE_MASK_FIXED)
pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_VARIABLE)
pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_DEFTYPE)
pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
printk(KERN_INFO "mtrr: corrected configuration.\n");
}
/*
* Doesn't attempt to pass an error out to MTRR users
* because it's quite complicated in some cases and probably not
* worth it because the best error handling is to ignore it.
*/
void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
{
if (wrmsr_safe(msr, a, b) < 0) {
printk(KERN_ERR
"MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
smp_processor_id(), msr, a, b);
}
}
/**
* set_fixed_range - checks & updates a fixed-range MTRR if it
* differs from the value it should have
* @msr: MSR address of the MTTR which should be checked and updated
* @changed: pointer which indicates whether the MTRR needed to be changed
* @msrwords: pointer to the MSR values which the MSR should have
*/
static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
{
unsigned lo, hi;
rdmsr(msr, lo, hi);
if (lo != msrwords[0] || hi != msrwords[1]) {
mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
*changed = true;
}
}
/**
* generic_get_free_region - Get a free MTRR.
* @base: The starting (base) address of the region.
* @size: The size (in bytes) of the region.
* @replace_reg: mtrr index to be replaced; set to invalid value if none.
*
* Returns: The index of the region on success, else negative on error.
*/
int
generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
{
unsigned long lbase, lsize;
mtrr_type ltype;
int i, max;
max = num_var_ranges;
if (replace_reg >= 0 && replace_reg < max)
return replace_reg;
for (i = 0; i < max; ++i) {
mtrr_if->get(i, &lbase, &lsize, &ltype);
if (lsize == 0)
return i;
}
return -ENOSPC;
}
static void generic_get_mtrr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type)
{
u32 mask_lo, mask_hi, base_lo, base_hi;
unsigned int hi;
u64 tmp, mask;
/*
* get_mtrr doesn't need to update mtrr_state, also it could be called
* from any cpu, so try to print it out directly.
*/
get_cpu();
rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
if ((mask_lo & 0x800) == 0) {
/* Invalid (i.e. free) range */
*base = 0;
*size = 0;
*type = 0;
goto out_put_cpu;
}
rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
/* Work out the shifted address mask: */
tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
mask = size_or_mask | tmp;
/* Expand tmp with high bits to all 1s: */
hi = fls64(tmp);
if (hi > 0) {
tmp |= ~((1ULL<<(hi - 1)) - 1);
if (tmp != mask) {
printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
mask = tmp;
}
}
/*
* This works correctly if size is a power of two, i.e. a
* contiguous range:
*/
*size = -mask;
*base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
*type = base_lo & 0xff;
out_put_cpu:
put_cpu();
}
/**
* set_fixed_ranges - checks & updates the fixed-range MTRRs if they
* differ from the saved set
* @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges()
*/
static int set_fixed_ranges(mtrr_type *frs)
{
unsigned long long *saved = (unsigned long long *)frs;
bool changed = false;
int block = -1, range;
k8_check_syscfg_dram_mod_en();
while (fixed_range_blocks[++block].ranges) {
for (range = 0; range < fixed_range_blocks[block].ranges; range++)
set_fixed_range(fixed_range_blocks[block].base_msr + range,
&changed, (unsigned int *)saved++);
}
return changed;
}
/*
* Set the MSR pair relating to a var range.
* Returns true if changes are made.
*/
static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
{
unsigned int lo, hi;
bool changed = false;
rdmsr(MTRRphysBase_MSR(index), lo, hi);
if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
|| (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
(hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
changed = true;
}
rdmsr(MTRRphysMask_MSR(index), lo, hi);
if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL)
|| (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
(hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
changed = true;
}
return changed;
}
static u32 deftype_lo, deftype_hi;
/**
* set_mtrr_state - Set the MTRR state for this CPU.
*
* NOTE: The CPU must already be in a safe state for MTRR changes.
* RETURNS: 0 if no changes made, else a mask indicating what was changed.
*/
static unsigned long set_mtrr_state(void)
{
unsigned long change_mask = 0;
unsigned int i;
for (i = 0; i < num_var_ranges; i++) {
if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
change_mask |= MTRR_CHANGE_MASK_VARIABLE;
}
if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
change_mask |= MTRR_CHANGE_MASK_FIXED;
/*
* Set_mtrr_restore restores the old value of MTRRdefType,
* so to set it we fiddle with the saved value:
*/
if ((deftype_lo & 0xff) != mtrr_state.def_type
|| ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
(mtrr_state.enabled << 10);
change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
}
return change_mask;
}
static unsigned long cr4;
static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
/*
* Since we are disabling the cache don't allow any interrupts,
* they would run extremely slow and would only increase the pain.
*
* The caller must ensure that local interrupts are disabled and
* are reenabled after post_set() has been called.
*/
static void prepare_set(void) __acquires(set_atomicity_lock)
{
unsigned long cr0;
/*
* Note that this is not ideal
* since the cache is only flushed/disabled for this CPU while the
* MTRRs are changed, but changing this requires more invasive
* changes to the way the kernel boots
*/
raw_spin_lock(&set_atomicity_lock);
/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
cr0 = read_cr0() | X86_CR0_CD;
write_cr0(cr0);
wbinvd();
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
cr4 = read_cr4();
write_cr4(cr4 & ~X86_CR4_PGE);
}
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Save MTRR state */
rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Disable MTRRs, and set the default type to uncached */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
wbinvd();
}
static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Enable caches */
write_cr0(read_cr0() & ~X86_CR0_CD);
/* Restore value of CR4 */
if (cpu_has_pge)
write_cr4(cr4);
raw_spin_unlock(&set_atomicity_lock);
}
static void generic_set_all(void)
{
unsigned long mask, count;
unsigned long flags;
local_irq_save(flags);
prepare_set();
/* Actually set the state */
mask = set_mtrr_state();
/* also set PAT */
pat_init();
post_set();
local_irq_restore(flags);
/* Use the atomic bitops to update the global mask */
for (count = 0; count < sizeof mask * 8; ++count) {
if (mask & 0x01)
set_bit(count, &smp_changes_mask);
mask >>= 1;
}
}
/**
* generic_set_mtrr - set variable MTRR register on the local CPU.
*
* @reg: The register to set.
* @base: The base address of the region.
* @size: The size of the region. If this is 0 the region is disabled.
* @type: The type of the region.
*
* Returns nothing.
*/
static void generic_set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
unsigned long flags;
struct mtrr_var_range *vr;
vr = &mtrr_state.var_ranges[reg];
local_irq_save(flags);
prepare_set();
if (size == 0) {
/*
* The invalid bit is kept in the mask, so we simply
* clear the relevant mask register to disable a range.
*/
mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0);
memset(vr, 0, sizeof(struct mtrr_var_range));
} else {
vr->base_lo = base << PAGE_SHIFT | type;
vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT);
vr->mask_lo = -size << PAGE_SHIFT | 0x800;
vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT);
mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi);
mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
}
post_set();
local_irq_restore(flags);
}
int generic_validate_add_page(unsigned long base, unsigned long size,
unsigned int type)
{
unsigned long lbase, last;
/*
* For Intel PPro stepping <= 7
* must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
*/
if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
boot_cpu_data.x86_model == 1 &&
boot_cpu_data.x86_mask <= 7) {
if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
return -EINVAL;
}
if (!(base + size < 0x70000 || base > 0x7003F) &&
(type == MTRR_TYPE_WRCOMB
|| type == MTRR_TYPE_WRBACK)) {
pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
return -EINVAL;
}
}
/*
* Check upper bits of base and last are equal and lower bits are 0
* for base and 1 for last
*/
last = base + size - 1;
for (lbase = base; !(lbase & 1) && (last & 1);
lbase = lbase >> 1, last = last >> 1)
;
if (lbase != last) {
pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
return -EINVAL;
}
return 0;
}
static int generic_have_wrcomb(void)
{
unsigned long config, dummy;
rdmsr(MSR_MTRRcap, config, dummy);
return config & (1 << 10);
}
int positive_have_wrcomb(void)
{
return 1;
}
/*
* Generic structure...
*/
const struct mtrr_ops generic_mtrr_ops = {
.use_intel_if = 1,
.set_all = generic_set_all,
.get = generic_get_mtrr,
.get_free_region = generic_get_free_region,
.set = generic_set_mtrr,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = generic_have_wrcomb,
};

View file

@ -0,0 +1,451 @@
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/init.h>
#define LINE_SIZE 80
#include <asm/mtrr.h>
#include "mtrr.h"
#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
static const char *const mtrr_strings[MTRR_NUM_TYPES] =
{
"uncachable", /* 0 */
"write-combining", /* 1 */
"?", /* 2 */
"?", /* 3 */
"write-through", /* 4 */
"write-protect", /* 5 */
"write-back", /* 6 */
};
const char *mtrr_attrib_to_str(int x)
{
return (x <= 6) ? mtrr_strings[x] : "?";
}
#ifdef CONFIG_PROC_FS
static int
mtrr_file_add(unsigned long base, unsigned long size,
unsigned int type, bool increment, struct file *file, int page)
{
unsigned int *fcount = FILE_FCOUNT(file);
int reg, max;
max = num_var_ranges;
if (fcount == NULL) {
fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
if (!fcount)
return -ENOMEM;
FILE_FCOUNT(file) = fcount;
}
if (!page) {
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
return -EINVAL;
base >>= PAGE_SHIFT;
size >>= PAGE_SHIFT;
}
reg = mtrr_add_page(base, size, type, true);
if (reg >= 0)
++fcount[reg];
return reg;
}
static int
mtrr_file_del(unsigned long base, unsigned long size,
struct file *file, int page)
{
unsigned int *fcount = FILE_FCOUNT(file);
int reg;
if (!page) {
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
return -EINVAL;
base >>= PAGE_SHIFT;
size >>= PAGE_SHIFT;
}
reg = mtrr_del_page(-1, base, size);
if (reg < 0)
return reg;
if (fcount == NULL)
return reg;
if (fcount[reg] < 1)
return -EINVAL;
--fcount[reg];
return reg;
}
/*
* seq_file can seek but we ignore it.
*
* Format of control line:
* "base=%Lx size=%Lx type=%s" or "disable=%d"
*/
static ssize_t
mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
{
int i, err;
unsigned long reg;
unsigned long long base, size;
char *ptr;
char line[LINE_SIZE];
int length;
size_t linelen;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
memset(line, 0, LINE_SIZE);
length = len;
length--;
if (length > LINE_SIZE - 1)
length = LINE_SIZE - 1;
if (length < 0)
return -EINVAL;
if (copy_from_user(line, buf, length))
return -EFAULT;
linelen = strlen(line);
ptr = line + linelen - 1;
if (linelen && *ptr == '\n')
*ptr = '\0';
if (!strncmp(line, "disable=", 8)) {
reg = simple_strtoul(line + 8, &ptr, 0);
err = mtrr_del_page(reg, 0, 0);
if (err < 0)
return err;
return len;
}
if (strncmp(line, "base=", 5))
return -EINVAL;
base = simple_strtoull(line + 5, &ptr, 0);
ptr = skip_spaces(ptr);
if (strncmp(ptr, "size=", 5))
return -EINVAL;
size = simple_strtoull(ptr + 5, &ptr, 0);
if ((base & 0xfff) || (size & 0xfff))
return -EINVAL;
ptr = skip_spaces(ptr);
if (strncmp(ptr, "type=", 5))
return -EINVAL;
ptr = skip_spaces(ptr + 5);
for (i = 0; i < MTRR_NUM_TYPES; ++i) {
if (strcmp(ptr, mtrr_strings[i]))
continue;
base >>= PAGE_SHIFT;
size >>= PAGE_SHIFT;
err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
if (err < 0)
return err;
return len;
}
return -EINVAL;
}
static long
mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
{
int err = 0;
mtrr_type type;
unsigned long base;
unsigned long size;
struct mtrr_sentry sentry;
struct mtrr_gentry gentry;
void __user *arg = (void __user *) __arg;
switch (cmd) {
case MTRRIOC_ADD_ENTRY:
case MTRRIOC_SET_ENTRY:
case MTRRIOC_DEL_ENTRY:
case MTRRIOC_KILL_ENTRY:
case MTRRIOC_ADD_PAGE_ENTRY:
case MTRRIOC_SET_PAGE_ENTRY:
case MTRRIOC_DEL_PAGE_ENTRY:
case MTRRIOC_KILL_PAGE_ENTRY:
if (copy_from_user(&sentry, arg, sizeof sentry))
return -EFAULT;
break;
case MTRRIOC_GET_ENTRY:
case MTRRIOC_GET_PAGE_ENTRY:
if (copy_from_user(&gentry, arg, sizeof gentry))
return -EFAULT;
break;
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_ENTRY:
case MTRRIOC32_SET_ENTRY:
case MTRRIOC32_DEL_ENTRY:
case MTRRIOC32_KILL_ENTRY:
case MTRRIOC32_ADD_PAGE_ENTRY:
case MTRRIOC32_SET_PAGE_ENTRY:
case MTRRIOC32_DEL_PAGE_ENTRY:
case MTRRIOC32_KILL_PAGE_ENTRY: {
struct mtrr_sentry32 __user *s32;
s32 = (struct mtrr_sentry32 __user *)__arg;
err = get_user(sentry.base, &s32->base);
err |= get_user(sentry.size, &s32->size);
err |= get_user(sentry.type, &s32->type);
if (err)
return err;
break;
}
case MTRRIOC32_GET_ENTRY:
case MTRRIOC32_GET_PAGE_ENTRY: {
struct mtrr_gentry32 __user *g32;
g32 = (struct mtrr_gentry32 __user *)__arg;
err = get_user(gentry.regnum, &g32->regnum);
err |= get_user(gentry.base, &g32->base);
err |= get_user(gentry.size, &g32->size);
err |= get_user(gentry.type, &g32->type);
if (err)
return err;
break;
}
#endif
}
switch (cmd) {
default:
return -ENOTTY;
case MTRRIOC_ADD_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 0);
break;
case MTRRIOC_SET_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
break;
case MTRRIOC_DEL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 0);
break;
case MTRRIOC_KILL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_del(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_GET_ENTRY:
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that go above 4GB */
if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
|| size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
gentry.base = gentry.size = gentry.type = 0;
else {
gentry.base = base << PAGE_SHIFT;
gentry.size = size << PAGE_SHIFT;
gentry.type = type;
}
break;
case MTRRIOC_ADD_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_PAGE_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 1);
break;
case MTRRIOC_SET_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_PAGE_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err =
mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
break;
case MTRRIOC_DEL_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_PAGE_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 1);
break;
case MTRRIOC_KILL_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_PAGE_ENTRY:
#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_del_page(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_GET_PAGE_ENTRY:
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that would overflow */
if (size != (__typeof__(gentry.size))size)
gentry.base = gentry.size = gentry.type = 0;
else {
gentry.base = base;
gentry.size = size;
gentry.type = type;
}
break;
}
if (err)
return err;
switch (cmd) {
case MTRRIOC_GET_ENTRY:
case MTRRIOC_GET_PAGE_ENTRY:
if (copy_to_user(arg, &gentry, sizeof gentry))
err = -EFAULT;
break;
#ifdef CONFIG_COMPAT
case MTRRIOC32_GET_ENTRY:
case MTRRIOC32_GET_PAGE_ENTRY: {
struct mtrr_gentry32 __user *g32;
g32 = (struct mtrr_gentry32 __user *)__arg;
err = put_user(gentry.base, &g32->base);
err |= put_user(gentry.size, &g32->size);
err |= put_user(gentry.regnum, &g32->regnum);
err |= put_user(gentry.type, &g32->type);
break;
}
#endif
}
return err;
}
static int mtrr_close(struct inode *ino, struct file *file)
{
unsigned int *fcount = FILE_FCOUNT(file);
int i, max;
if (fcount != NULL) {
max = num_var_ranges;
for (i = 0; i < max; ++i) {
while (fcount[i] > 0) {
mtrr_del(i, 0, 0);
--fcount[i];
}
}
kfree(fcount);
FILE_FCOUNT(file) = NULL;
}
return single_release(ino, file);
}
static int mtrr_seq_show(struct seq_file *seq, void *offset);
static int mtrr_open(struct inode *inode, struct file *file)
{
if (!mtrr_if)
return -EIO;
if (!mtrr_if->get)
return -ENXIO;
return single_open(file, mtrr_seq_show, NULL);
}
static const struct file_operations mtrr_fops = {
.owner = THIS_MODULE,
.open = mtrr_open,
.read = seq_read,
.llseek = seq_lseek,
.write = mtrr_write,
.unlocked_ioctl = mtrr_ioctl,
.compat_ioctl = mtrr_ioctl,
.release = mtrr_close,
};
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
int i, max, len;
mtrr_type type;
unsigned long base, size;
len = 0;
max = num_var_ranges;
for (i = 0; i < max; i++) {
mtrr_if->get(i, &base, &size, &type);
if (size == 0) {
mtrr_usage_table[i] = 0;
continue;
}
if (size < (0x100000 >> PAGE_SHIFT)) {
/* less than 1MB */
factor = 'K';
size <<= PAGE_SHIFT - 10;
} else {
factor = 'M';
size >>= 20 - PAGE_SHIFT;
}
/* Base can be > 32bit */
len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
"(%5luMB), size=%5lu%cB, count=%d: %s\n",
i, base, base >> (20 - PAGE_SHIFT), size,
factor, mtrr_usage_table[i],
mtrr_attrib_to_str(type));
}
return 0;
}
static int __init mtrr_if_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
(!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
(!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
(!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
return -ENODEV;
proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
return 0;
}
arch_initcall(mtrr_if_init);
#endif /* CONFIG_PROC_FS */

View file

@ -0,0 +1,842 @@
/* Generic MTRR (Memory Type Range Register) driver.
Copyright (C) 1997-2000 Richard Gooch
Copyright (c) 2002 Patrick Mochel
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
Richard Gooch may be reached by email at rgooch@atnf.csiro.au
The postal address is:
Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
Source: "Pentium Pro Family Developer's Manual, Volume 3:
Operating System Writer's Guide" (Intel document number 242692),
section 11.11.7
This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
on 6-7 March 2002.
Source: Intel Architecture Software Developers Manual, Volume 3:
System Programming Guide; Section 9.11. (1997 edition - PPro).
*/
#define DEBUG
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
#include <linux/stop_machine.h>
#include <linux/kvm_para.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/sort.h>
#include <linux/cpu.h>
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>
#include <asm/processor.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include "mtrr.h"
/* arch_phys_wc_add returns an MTRR register index plus this offset. */
#define MTRR_TO_PHYS_WC_OFFSET 1000
u32 num_var_ranges;
unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
static bool mtrr_aps_delayed_init;
static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
const struct mtrr_ops *mtrr_if;
static void set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
void set_mtrr_ops(const struct mtrr_ops *ops)
{
if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
mtrr_ops[ops->vendor] = ops;
}
/* Returns non-zero if we have the write-combining memory type */
static int have_wrcomb(void)
{
struct pci_dev *dev;
dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
if (dev != NULL) {
/*
* ServerWorks LE chipsets < rev 6 have problems with
* write-combining. Don't allow it and leave room for other
* chipsets to be tagged
*/
if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
dev->revision <= 5) {
pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
pci_dev_put(dev);
return 0;
}
/*
* Intel 450NX errata # 23. Non ascending cacheline evictions to
* write combining memory may resulting in data corruption
*/
if (dev->vendor == PCI_VENDOR_ID_INTEL &&
dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
pci_dev_put(dev);
return 0;
}
pci_dev_put(dev);
}
return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
}
/* This function returns the number of variable MTRRs */
static void __init set_num_var_ranges(void)
{
unsigned long config = 0, dummy;
if (use_intel())
rdmsr(MSR_MTRRcap, config, dummy);
else if (is_cpu(AMD))
config = 2;
else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
config = 8;
num_var_ranges = config & 0xff;
}
static void __init init_table(void)
{
int i, max;
max = num_var_ranges;
for (i = 0; i < max; i++)
mtrr_usage_table[i] = 1;
}
struct set_mtrr_data {
unsigned long smp_base;
unsigned long smp_size;
unsigned int smp_reg;
mtrr_type smp_type;
};
/**
* mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
* by all the CPUs.
* @info: pointer to mtrr configuration data
*
* Returns nothing.
*/
static int mtrr_rendezvous_handler(void *info)
{
struct set_mtrr_data *data = info;
/*
* We use this same function to initialize the mtrrs during boot,
* resume, runtime cpu online and on an explicit request to set a
* specific MTRR.
*
* During boot or suspend, the state of the boot cpu's mtrrs has been
* saved, and we want to replicate that across all the cpus that come
* online (either at the end of boot or resume or during a runtime cpu
* online). If we're doing that, @reg is set to something special and on
* all the cpu's we do mtrr_if->set_all() (On the logical cpu that
* started the boot/resume sequence, this might be a duplicate
* set_all()).
*/
if (data->smp_reg != ~0U) {
mtrr_if->set(data->smp_reg, data->smp_base,
data->smp_size, data->smp_type);
} else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
mtrr_if->set_all();
}
return 0;
}
static inline int types_compatible(mtrr_type type1, mtrr_type type2)
{
return type1 == MTRR_TYPE_UNCACHABLE ||
type2 == MTRR_TYPE_UNCACHABLE ||
(type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
(type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
}
/**
* set_mtrr - update mtrrs on all processors
* @reg: mtrr in question
* @base: mtrr base
* @size: mtrr size
* @type: mtrr type
*
* This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
*
* 1. Queue work to do the following on all processors:
* 2. Disable Interrupts
* 3. Wait for all procs to do so
* 4. Enter no-fill cache mode
* 5. Flush caches
* 6. Clear PGE bit
* 7. Flush all TLBs
* 8. Disable all range registers
* 9. Update the MTRRs
* 10. Enable all range registers
* 11. Flush all TLBs and caches again
* 12. Enter normal cache mode and reenable caching
* 13. Set PGE
* 14. Wait for buddies to catch up
* 15. Enable interrupts.
*
* What does that mean for us? Well, stop_machine() will ensure that
* the rendezvous handler is started on each CPU. And in lockstep they
* do the state transition of disabling interrupts, updating MTRR's
* (the CPU vendors may each do it differently, so we call mtrr_if->set()
* callback and let them take care of it.) and enabling interrupts.
*
* Note that the mechanism is the same for UP systems, too; all the SMP stuff
* becomes nops.
*/
static void
set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
{
struct set_mtrr_data data = { .smp_reg = reg,
.smp_base = base,
.smp_size = size,
.smp_type = type
};
stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
}
static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
struct set_mtrr_data data = { .smp_reg = reg,
.smp_base = base,
.smp_size = size,
.smp_type = type
};
stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
cpu_callout_mask);
}
/**
* mtrr_add_page - Add a memory type region
* @base: Physical base address of region in pages (in units of 4 kB!)
* @size: Physical size of region in pages (4 kB)
* @type: Type of MTRR desired
* @increment: If this is true do usage counting on the region
*
* Memory type region registers control the caching on newer Intel and
* non Intel processors. This function allows drivers to request an
* MTRR is added. The details and hardware specifics of each processor's
* implementation are hidden from the caller, but nevertheless the
* caller should expect to need to provide a power of two size on an
* equivalent power of two boundary.
*
* If the region cannot be added either because all regions are in use
* or the CPU cannot support it a negative value is returned. On success
* the register number for this entry is returned, but should be treated
* as a cookie only.
*
* On a multiprocessor machine the changes are made to all processors.
* This is required on x86 by the Intel processors.
*
* The available types are
*
* %MTRR_TYPE_UNCACHABLE - No caching
*
* %MTRR_TYPE_WRBACK - Write data back in bursts whenever
*
* %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
*
* %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
*
* BUGS: Needs a quiet flag for the cases where drivers do not mind
* failures and do not wish system log messages to be sent.
*/
int mtrr_add_page(unsigned long base, unsigned long size,
unsigned int type, bool increment)
{
unsigned long lbase, lsize;
int i, replace, error;
mtrr_type ltype;
if (!mtrr_if)
return -ENXIO;
error = mtrr_if->validate_add_page(base, size, type);
if (error)
return error;
if (type >= MTRR_NUM_TYPES) {
pr_warning("mtrr: type: %u invalid\n", type);
return -EINVAL;
}
/* If the type is WC, check that this processor supports it */
if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
pr_warning("mtrr: your processor doesn't support write-combining\n");
return -ENOSYS;
}
if (!size) {
pr_warning("mtrr: zero sized request\n");
return -EINVAL;
}
if ((base | (base + size - 1)) >>
(boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
pr_warning("mtrr: base or size exceeds the MTRR width\n");
return -EINVAL;
}
error = -EINVAL;
replace = -1;
/* No CPU hotplug when we change MTRR entries */
get_online_cpus();
/* Search for existing MTRR */
mutex_lock(&mtrr_mutex);
for (i = 0; i < num_var_ranges; ++i) {
mtrr_if->get(i, &lbase, &lsize, &ltype);
if (!lsize || base > lbase + lsize - 1 ||
base + size - 1 < lbase)
continue;
/*
* At this point we know there is some kind of
* overlap/enclosure
*/
if (base < lbase || base + size - 1 > lbase + lsize - 1) {
if (base <= lbase &&
base + size - 1 >= lbase + lsize - 1) {
/* New region encloses an existing region */
if (type == ltype) {
replace = replace == -1 ? i : -2;
continue;
} else if (types_compatible(type, ltype))
continue;
}
pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing"
" 0x%lx000,0x%lx000\n", base, size, lbase,
lsize);
goto out;
}
/* New region is enclosed by an existing region */
if (ltype != type) {
if (types_compatible(type, ltype))
continue;
pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
base, size, mtrr_attrib_to_str(ltype),
mtrr_attrib_to_str(type));
goto out;
}
if (increment)
++mtrr_usage_table[i];
error = i;
goto out;
}
/* Search for an empty MTRR */
i = mtrr_if->get_free_region(base, size, replace);
if (i >= 0) {
set_mtrr(i, base, size, type);
if (likely(replace < 0)) {
mtrr_usage_table[i] = 1;
} else {
mtrr_usage_table[i] = mtrr_usage_table[replace];
if (increment)
mtrr_usage_table[i]++;
if (unlikely(replace != i)) {
set_mtrr(replace, 0, 0, 0);
mtrr_usage_table[replace] = 0;
}
}
} else {
pr_info("mtrr: no more MTRRs available\n");
}
error = i;
out:
mutex_unlock(&mtrr_mutex);
put_online_cpus();
return error;
}
static int mtrr_check(unsigned long base, unsigned long size)
{
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
dump_stack();
return -1;
}
return 0;
}
/**
* mtrr_add - Add a memory type region
* @base: Physical base address of region
* @size: Physical size of region
* @type: Type of MTRR desired
* @increment: If this is true do usage counting on the region
*
* Memory type region registers control the caching on newer Intel and
* non Intel processors. This function allows drivers to request an
* MTRR is added. The details and hardware specifics of each processor's
* implementation are hidden from the caller, but nevertheless the
* caller should expect to need to provide a power of two size on an
* equivalent power of two boundary.
*
* If the region cannot be added either because all regions are in use
* or the CPU cannot support it a negative value is returned. On success
* the register number for this entry is returned, but should be treated
* as a cookie only.
*
* On a multiprocessor machine the changes are made to all processors.
* This is required on x86 by the Intel processors.
*
* The available types are
*
* %MTRR_TYPE_UNCACHABLE - No caching
*
* %MTRR_TYPE_WRBACK - Write data back in bursts whenever
*
* %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
*
* %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
*
* BUGS: Needs a quiet flag for the cases where drivers do not mind
* failures and do not wish system log messages to be sent.
*/
int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
bool increment)
{
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
increment);
}
EXPORT_SYMBOL(mtrr_add);
/**
* mtrr_del_page - delete a memory type region
* @reg: Register returned by mtrr_add
* @base: Physical base address
* @size: Size of region
*
* If register is supplied then base and size are ignored. This is
* how drivers should call it.
*
* Releases an MTRR region. If the usage count drops to zero the
* register is freed and the region returns to default state.
* On success the register is returned, on failure a negative error
* code.
*/
int mtrr_del_page(int reg, unsigned long base, unsigned long size)
{
int i, max;
mtrr_type ltype;
unsigned long lbase, lsize;
int error = -EINVAL;
if (!mtrr_if)
return -ENXIO;
max = num_var_ranges;
/* No CPU hotplug when we change MTRR entries */
get_online_cpus();
mutex_lock(&mtrr_mutex);
if (reg < 0) {
/* Search for existing MTRR */
for (i = 0; i < max; ++i) {
mtrr_if->get(i, &lbase, &lsize, &ltype);
if (lbase == base && lsize == size) {
reg = i;
break;
}
}
if (reg < 0) {
pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
base, size);
goto out;
}
}
if (reg >= max) {
pr_warning("mtrr: register: %d too big\n", reg);
goto out;
}
mtrr_if->get(reg, &lbase, &lsize, &ltype);
if (lsize < 1) {
pr_warning("mtrr: MTRR %d not used\n", reg);
goto out;
}
if (mtrr_usage_table[reg] < 1) {
pr_warning("mtrr: reg: %d has count=0\n", reg);
goto out;
}
if (--mtrr_usage_table[reg] < 1)
set_mtrr(reg, 0, 0, 0);
error = reg;
out:
mutex_unlock(&mtrr_mutex);
put_online_cpus();
return error;
}
/**
* mtrr_del - delete a memory type region
* @reg: Register returned by mtrr_add
* @base: Physical base address
* @size: Size of region
*
* If register is supplied then base and size are ignored. This is
* how drivers should call it.
*
* Releases an MTRR region. If the usage count drops to zero the
* register is freed and the region returns to default state.
* On success the register is returned, on failure a negative error
* code.
*/
int mtrr_del(int reg, unsigned long base, unsigned long size)
{
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
}
EXPORT_SYMBOL(mtrr_del);
/**
* arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
* @base: Physical base address
* @size: Size of region
*
* If PAT is available, this does nothing. If PAT is unavailable, it
* attempts to add a WC MTRR covering size bytes starting at base and
* logs an error if this fails.
*
* Drivers must store the return value to pass to mtrr_del_wc_if_needed,
* but drivers should not try to interpret that return value.
*/
int arch_phys_wc_add(unsigned long base, unsigned long size)
{
int ret;
if (pat_enabled)
return 0; /* Success! (We don't need to do anything.) */
ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
if (ret < 0) {
pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.",
(void *)base, (void *)(base + size - 1));
return ret;
}
return ret + MTRR_TO_PHYS_WC_OFFSET;
}
EXPORT_SYMBOL(arch_phys_wc_add);
/*
* arch_phys_wc_del - undoes arch_phys_wc_add
* @handle: Return value from arch_phys_wc_add
*
* This cleans up after mtrr_add_wc_if_needed.
*
* The API guarantees that mtrr_del_wc_if_needed(error code) and
* mtrr_del_wc_if_needed(0) do nothing.
*/
void arch_phys_wc_del(int handle)
{
if (handle >= 1) {
WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET);
mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0);
}
}
EXPORT_SYMBOL(arch_phys_wc_del);
/*
* phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
* @handle: Return value from arch_phys_wc_add
*
* This will turn the return value from arch_phys_wc_add into an mtrr
* index suitable for debugging.
*
* Note: There is no legitimate use for this function, except possibly
* in printk line. Alas there is an illegitimate use in some ancient
* drm ioctls.
*/
int phys_wc_to_mtrr_index(int handle)
{
if (handle < MTRR_TO_PHYS_WC_OFFSET)
return -1;
else
return handle - MTRR_TO_PHYS_WC_OFFSET;
}
EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
/*
* HACK ALERT!
* These should be called implicitly, but we can't yet until all the initcall
* stuff is done...
*/
static void __init init_ifs(void)
{
#ifndef CONFIG_X86_64
amd_init_mtrr();
cyrix_init_mtrr();
centaur_init_mtrr();
#endif
}
/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
* MTRR driver doesn't require this
*/
struct mtrr_value {
mtrr_type ltype;
unsigned long lbase;
unsigned long lsize;
};
static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
static int mtrr_save(void)
{
int i;
for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i, &mtrr_value[i].lbase,
&mtrr_value[i].lsize,
&mtrr_value[i].ltype);
}
return 0;
}
static void mtrr_restore(void)
{
int i;
for (i = 0; i < num_var_ranges; i++) {
if (mtrr_value[i].lsize) {
set_mtrr(i, mtrr_value[i].lbase,
mtrr_value[i].lsize,
mtrr_value[i].ltype);
}
}
}
static struct syscore_ops mtrr_syscore_ops = {
.suspend = mtrr_save,
.resume = mtrr_restore,
};
int __initdata changed_by_mtrr_cleanup;
#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1))
/**
* mtrr_bp_init - initialize mtrrs on the boot CPU
*
* This needs to be called early; before any of the other CPUs are
* initialized (i.e. before smp_init()).
*
*/
void __init mtrr_bp_init(void)
{
u32 phys_addr;
init_ifs();
phys_addr = 32;
if (cpu_has_mtrr) {
mtrr_if = &generic_mtrr_ops;
size_or_mask = SIZE_OR_MASK_BITS(36);
size_and_mask = 0x00f00000;
phys_addr = 36;
/*
* This is an AMD specific MSR, but we assume(hope?) that
* Intel will implement it too when they extend the address
* bus of the Xeon.
*/
if (cpuid_eax(0x80000000) >= 0x80000008) {
phys_addr = cpuid_eax(0x80000008) & 0xff;
/* CPUID workaround for Intel 0F33/0F34 CPU */
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
boot_cpu_data.x86 == 0xF &&
boot_cpu_data.x86_model == 0x3 &&
(boot_cpu_data.x86_mask == 0x3 ||
boot_cpu_data.x86_mask == 0x4))
phys_addr = 36;
size_or_mask = SIZE_OR_MASK_BITS(phys_addr);
size_and_mask = ~size_or_mask & 0xfffff00000ULL;
} else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
boot_cpu_data.x86 == 6) {
/*
* VIA C* family have Intel style MTRRs,
* but don't support PAE
*/
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
phys_addr = 32;
}
} else {
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) {
/* Pre-Athlon (K6) AMD CPU MTRRs */
mtrr_if = mtrr_ops[X86_VENDOR_AMD];
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CENTAUR:
if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) {
mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CYRIX:
if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) {
mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
default:
break;
}
}
if (mtrr_if) {
set_num_var_ranges();
init_table();
if (use_intel()) {
get_mtrr_state();
if (mtrr_cleanup(phys_addr)) {
changed_by_mtrr_cleanup = 1;
mtrr_if->set_all();
}
}
}
}
void mtrr_ap_init(void)
{
if (!use_intel() || mtrr_aps_delayed_init)
return;
/*
* Ideally we should hold mtrr_mutex here to avoid mtrr entries
* changed, but this routine will be called in cpu boot time,
* holding the lock breaks it.
*
* This routine is called in two cases:
*
* 1. very earily time of software resume, when there absolutely
* isn't mtrr entry changes;
*
* 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
* lock to prevent mtrr entry changes
*/
set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
}
/**
* Save current fixed-range MTRR state of the first cpu in cpu_online_mask.
*/
void mtrr_save_state(void)
{
int first_cpu;
get_online_cpus();
first_cpu = cpumask_first(cpu_online_mask);
smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
put_online_cpus();
}
void set_mtrr_aps_delayed_init(void)
{
if (!use_intel())
return;
mtrr_aps_delayed_init = true;
}
/*
* Delayed MTRR initialization for all AP's
*/
void mtrr_aps_init(void)
{
if (!use_intel())
return;
/*
* Check if someone has requested the delay of AP MTRR initialization,
* by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
* then we are done.
*/
if (!mtrr_aps_delayed_init)
return;
set_mtrr(~0U, 0, 0, 0);
mtrr_aps_delayed_init = false;
}
void mtrr_bp_restore(void)
{
if (!use_intel())
return;
mtrr_if->set_all();
}
static int __init mtrr_init_finialize(void)
{
if (!mtrr_if)
return 0;
if (use_intel()) {
if (!changed_by_mtrr_cleanup)
mtrr_state_warn();
return 0;
}
/*
* The CPU has no MTRR and seems to not support SMP. They have
* specific drivers, we use a tricky method to support
* suspend/resume for them.
*
* TBD: is there any system with such CPU which supports
* suspend/resume? If no, we should remove the code.
*/
register_syscore_ops(&mtrr_syscore_ops);
return 0;
}
subsys_initcall(mtrr_init_finialize);

View file

@ -0,0 +1,78 @@
/*
* local MTRR defines.
*/
#include <linux/types.h>
#include <linux/stddef.h>
#define MTRR_CHANGE_MASK_FIXED 0x01
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
struct mtrr_ops {
u32 vendor;
u32 use_intel_if;
void (*set)(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
void (*set_all)(void);
void (*get)(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type);
int (*get_free_region)(unsigned long base, unsigned long size,
int replace_reg);
int (*validate_add_page)(unsigned long base, unsigned long size,
unsigned int type);
int (*have_wrcomb)(void);
};
extern int generic_get_free_region(unsigned long base, unsigned long size,
int replace_reg);
extern int generic_validate_add_page(unsigned long base, unsigned long size,
unsigned int type);
extern const struct mtrr_ops generic_mtrr_ops;
extern int positive_have_wrcomb(void);
/* library functions for processor-specific routines */
struct set_mtrr_context {
unsigned long flags;
unsigned long cr4val;
u32 deftype_lo;
u32 deftype_hi;
u32 ccr3;
};
void set_mtrr_done(struct set_mtrr_context *ctxt);
void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
void get_mtrr_state(void);
extern void set_mtrr_ops(const struct mtrr_ops *ops);
extern u64 size_or_mask, size_and_mask;
extern const struct mtrr_ops *mtrr_if;
#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
extern unsigned int num_var_ranges;
extern u64 mtrr_tom2;
extern struct mtrr_state_type mtrr_state;
void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
void mtrr_wrmsr(unsigned, unsigned, unsigned);
/* CPU specific mtrr init functions */
int amd_init_mtrr(void);
int cyrix_init_mtrr(void);
int centaur_init_mtrr(void);
extern int changed_by_mtrr_cleanup;
extern int mtrr_cleanup(unsigned address_bits);

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,775 @@
/*
* Performance events x86 architecture header
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
* Copyright (C) 2009 Google, Inc., Stephane Eranian
*
* For licencing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
#if 0
#undef wrmsrl
#define wrmsrl(msr, val) \
do { \
unsigned int _msr = (msr); \
u64 _val = (val); \
trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \
(unsigned long long)(_val)); \
native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \
} while (0)
#endif
/*
* | NHM/WSM | SNB |
* register -------------------------------
* | HT | no HT | HT | no HT |
*-----------------------------------------
* offcore | core | core | cpu | core |
* lbr_sel | core | core | cpu | core |
* ld_lat | cpu | core | cpu | core |
*-----------------------------------------
*
* Given that there is a small number of shared regs,
* we can pre-allocate their slot in the per-cpu
* per-core reg tables.
*/
enum extra_reg_type {
EXTRA_REG_NONE = -1, /* not used */
EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
EXTRA_REG_LBR = 2, /* lbr_select */
EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
EXTRA_REG_MAX /* number of entries needed */
};
struct event_constraint {
union {
unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
u64 idxmsk64;
};
u64 code;
u64 cmask;
int weight;
int overlap;
int flags;
};
/*
* struct hw_perf_event.flags flags
*/
#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */
#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */
#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */
#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */
#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */
struct amd_nb {
int nb_id; /* NorthBridge id */
int refcnt; /* reference count */
struct perf_event *owners[X86_PMC_IDX_MAX];
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS 8
/*
* A debug store configuration.
*
* We only support architectures that use 64bit fields.
*/
struct debug_store {
u64 bts_buffer_base;
u64 bts_index;
u64 bts_absolute_maximum;
u64 bts_interrupt_threshold;
u64 pebs_buffer_base;
u64 pebs_index;
u64 pebs_absolute_maximum;
u64 pebs_interrupt_threshold;
u64 pebs_event_reset[MAX_PEBS_EVENTS];
};
/*
* Per register state.
*/
struct er_account {
raw_spinlock_t lock; /* per-core: protect structure */
u64 config; /* extra MSR config */
u64 reg; /* extra MSR number */
atomic_t ref; /* reference count */
};
/*
* Per core/cpu state
*
* Used to coordinate shared registers between HT threads or
* among events on a single PMU.
*/
struct intel_shared_regs {
struct er_account regs[EXTRA_REG_MAX];
int refcnt; /* per-core: #HT threads */
unsigned core_id; /* per-core: core id */
};
#define MAX_LBR_ENTRIES 16
struct cpu_hw_events {
/*
* Generic x86 PMC bits
*/
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
int n_events; /* the # of events in the below arrays */
int n_added; /* the # last events in the below arrays;
they've never been enabled yet */
int n_txn; /* the # last events in the below arrays;
added in the current transaction */
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
u64 tags[X86_PMC_IDX_MAX];
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
unsigned int group_flag;
int is_fake;
/*
* Intel DebugStore bits
*/
struct debug_store *ds;
u64 pebs_enabled;
/*
* Intel LBR bits
*/
int lbr_users;
void *lbr_context;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
struct er_account *lbr_sel;
u64 br_sel;
/*
* Intel host/guest exclude bits
*/
u64 intel_ctrl_guest_mask;
u64 intel_ctrl_host_mask;
struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX];
/*
* Intel checkpoint mask
*/
u64 intel_cp_status;
/*
* manage shared (per-core, per-cpu) registers
* used on Intel NHM/WSM/SNB
*/
struct intel_shared_regs *shared_regs;
/*
* AMD specific bits
*/
struct amd_nb *amd_nb;
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
u64 perf_ctr_virt_mask;
void *kfree_on_online;
};
#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
{ .idxmsk64 = (n) }, \
.code = (c), \
.cmask = (m), \
.weight = (w), \
.overlap = (o), \
.flags = f, \
}
#define EVENT_CONSTRAINT(c, n, m) \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
/*
* The overlap flag marks event constraints with overlapping counter
* masks. This is the case if the counter mask of such an event is not
* a subset of any other counter mask of a constraint with an equal or
* higher weight, e.g.:
*
* c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
* c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
* c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
*
* The event scheduler may not select the correct counter in the first
* cycle because it needs to know which subsequent events will be
* scheduled. It may fail to schedule the events then. So we set the
* overlap flag for such constraints to give the scheduler a hint which
* events to select for counter rescheduling.
*
* Care must be taken as the rescheduling algorithm is O(n!) which
* will increase scheduling cycles for an over-commited system
* dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
* and its counter masks must be kept at a minimum.
*/
#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
/*
* Constraint on the Event code.
*/
#define INTEL_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
/*
* Constraint on the Event code + UMask + fixed-mask
*
* filter mask to validate fixed counter events.
* the following filters disqualify for fixed counters:
* - inv
* - edge
* - cnt-mask
* - in_tx
* - in_tx_checkpointed
* The other filters are supported by fixed counters.
* The any-thread option is supported starting with v3.
*/
#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
#define FIXED_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
/*
* Constraint on the Event code + UMask
*/
#define INTEL_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
#define INTEL_PLD_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
#define INTEL_PST_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
/* Event constraint, but match on all event flags too. */
#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
/* Check only flags, but allow all event/umask */
#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \
EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
/* Check flags and event code, and set the HSW store flag */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
__EVENT_CONSTRAINT(code, n, \
ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
/* Check flags and event code, and set the HSW load flag */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
__EVENT_CONSTRAINT(code, n, \
ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
/* Check flags and event code/umask, and set the HSW store flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
/* Check flags and event code/umask, and set the HSW load flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
/* Check flags and event code/umask, and set the HSW N/A flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
/*
* We define the end marker as having a weight of -1
* to enable blacklisting of events using a counter bitmask
* of zero and thus a weight of zero.
* The end marker has a weight that cannot possibly be
* obtained from counting the bits in the bitmask.
*/
#define EVENT_CONSTRAINT_END { .weight = -1 }
/*
* Check for end marker with weight == -1
*/
#define for_each_event_constraint(e, c) \
for ((e) = (c); (e)->weight != -1; (e)++)
/*
* Extra registers for specific events.
*
* Some events need large masks and require external MSRs.
* Those extra MSRs end up being shared for all events on
* a PMU and sometimes between PMU of sibling HT threads.
* In either case, the kernel needs to handle conflicting
* accesses to those extra, shared, regs. The data structure
* to manage those registers is stored in cpu_hw_event.
*/
struct extra_reg {
unsigned int event;
unsigned int msr;
u64 config_mask;
u64 valid_mask;
int idx; /* per_xxx->regs[] reg index */
bool extra_msr_access;
};
#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
.event = (e), \
.msr = (ms), \
.config_mask = (m), \
.valid_mask = (vm), \
.idx = EXTRA_REG_##i, \
.extra_msr_access = true, \
}
#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
INTEL_UEVENT_EXTRA_REG(c, \
MSR_PEBS_LD_LAT_THRESHOLD, \
0xffff, \
LDLAT)
#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
union perf_capabilities {
struct {
u64 lbr_format:6;
u64 pebs_trap:1;
u64 pebs_arch_reg:1;
u64 pebs_format:4;
u64 smm_freeze:1;
/*
* PMU supports separate counter range for writing
* values > 32bit.
*/
u64 full_width_write:1;
};
u64 capabilities;
};
struct x86_pmu_quirk {
struct x86_pmu_quirk *next;
void (*func)(void);
};
union x86_pmu_config {
struct {
u64 event:8,
umask:8,
usr:1,
os:1,
edge:1,
pc:1,
interrupt:1,
__reserved1:1,
en:1,
inv:1,
cmask:8,
event2:4,
__reserved2:4,
go:1,
ho:1;
} bits;
u64 value;
};
#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
/*
* struct x86_pmu - generic x86 pmu
*/
struct x86_pmu {
/*
* Generic x86 PMC bits
*/
const char *name;
int version;
int (*handle_irq)(struct pt_regs *);
void (*disable_all)(void);
void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
int (*hw_config)(struct perf_event *event);
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
unsigned perfctr;
int (*addr_offset)(int index, bool eventsel);
int (*rdpmc_index)(int index);
u64 (*event_map)(int);
int max_events;
int num_counters;
int num_counters_fixed;
int cntval_bits;
u64 cntval_mask;
union {
unsigned long events_maskl;
unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
};
int events_mask_len;
int apic;
u64 max_period;
struct event_constraint *
(*get_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
struct event_constraint *event_constraints;
struct x86_pmu_quirk *quirks;
int perfctr_second_write;
bool late_ack;
/*
* sysfs attrs
*/
int attr_rdpmc_broken;
int attr_rdpmc;
struct attribute **format_attrs;
struct attribute **event_attrs;
ssize_t (*events_sysfs_show)(char *page, u64 config);
struct attribute **cpu_events;
/*
* CPU Hotplug hooks
*/
int (*cpu_prepare)(int cpu);
void (*cpu_starting)(int cpu);
void (*cpu_dying)(int cpu);
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
void (*flush_branch_stack)(void);
/*
* Intel Arch Perfmon v2+
*/
u64 intel_ctrl;
union perf_capabilities intel_cap;
/*
* Intel DebugStore bits
*/
unsigned int bts :1,
bts_active :1,
pebs :1,
pebs_active :1,
pebs_broken :1;
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
int max_pebs_events;
/*
* Intel LBR
*/
unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
int lbr_nr; /* hardware stack size */
u64 lbr_sel_mask; /* LBR_SELECT valid bits */
const int *lbr_sel_map; /* lbr_select mappings */
bool lbr_double_abort; /* duplicated lbr aborts */
/*
* Extra registers for events
*/
struct extra_reg *extra_regs;
unsigned int er_flags;
/*
* Intel host/guest support (KVM)
*/
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
};
#define x86_add_quirk(func_) \
do { \
static struct x86_pmu_quirk __quirk __initdata = { \
.func = func_, \
}; \
__quirk.next = x86_pmu.quirks; \
x86_pmu.quirks = &__quirk; \
} while (0)
#define ERF_NO_HT_SHARING 1
#define ERF_HAS_RSP_1 2
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
#define EVENT_ATTR(_name, _id) \
static struct perf_pmu_events_attr EVENT_VAR(_id) = { \
.attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
.id = PERF_COUNT_HW_##_id, \
.event_str = NULL, \
};
#define EVENT_ATTR_STR(_name, v, str) \
static struct perf_pmu_events_attr event_attr_##v = { \
.attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
.id = 0, \
.event_str = str, \
};
extern struct x86_pmu x86_pmu __read_mostly;
DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
int x86_perf_event_set_period(struct perf_event *event);
/*
* Generalized hw caching related hw_event table, filled
* in on a per model basis. A value of 0 means
* 'not supported', -1 means 'hw_event makes no sense on
* this CPU', any other value means the raw hw_event
* ID.
*/
#define C(x) PERF_COUNT_HW_CACHE_##x
extern u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
extern u64 __read_mostly hw_cache_extra_regs
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 x86_perf_event_update(struct perf_event *event);
static inline unsigned int x86_pmu_config_addr(int index)
{
return x86_pmu.eventsel + (x86_pmu.addr_offset ?
x86_pmu.addr_offset(index, true) : index);
}
static inline unsigned int x86_pmu_event_addr(int index)
{
return x86_pmu.perfctr + (x86_pmu.addr_offset ?
x86_pmu.addr_offset(index, false) : index);
}
static inline int x86_pmu_rdpmc_index(int index)
{
return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}
int x86_setup_perfctr(struct perf_event *event);
int x86_pmu_hw_config(struct perf_event *event);
void x86_pmu_disable_all(void);
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
u64 enable_mask)
{
u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
if (hwc->extra_reg.reg)
wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
}
void x86_pmu_enable_all(int added);
int perf_assign_events(struct perf_event **events, int n,
int wmin, int wmax, int *assign);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
void x86_pmu_stop(struct perf_event *event, int flags);
static inline void x86_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
wrmsrl(hwc->config_base, hwc->config);
}
void x86_pmu_enable_event(struct perf_event *event);
int x86_pmu_handle_irq(struct pt_regs *regs);
extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
static inline bool kernel_ip(unsigned long ip)
{
#ifdef CONFIG_X86_32
return ip > PAGE_OFFSET;
#else
return (long)ip < 0;
#endif
}
/*
* Not all PMUs provide the right context information to place the reported IP
* into full context. Specifically segment registers are typically not
* supplied.
*
* Assuming the address is a linear address (it is for IBS), we fake the CS and
* vm86 mode using the known zero-based code segment and 'fix up' the registers
* to reflect this.
*
* Intel PEBS/LBR appear to typically provide the effective address, nothing
* much we can do about that but pray and treat it like a linear address.
*/
static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
{
regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
if (regs->flags & X86_VM_MASK)
regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
regs->ip = ip;
}
ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
ssize_t intel_event_sysfs_show(char *page, u64 config);
#ifdef CONFIG_CPU_SUP_AMD
int amd_pmu_init(void);
#else /* CONFIG_CPU_SUP_AMD */
static inline int amd_pmu_init(void)
{
return 0;
}
#endif /* CONFIG_CPU_SUP_AMD */
#ifdef CONFIG_CPU_SUP_INTEL
int intel_pmu_save_and_restart(struct perf_event *event);
struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
struct intel_shared_regs *allocate_shared_regs(int cpu);
int intel_pmu_init(void);
void init_debug_store_on_cpu(int cpu);
void fini_debug_store_on_cpu(int cpu);
void release_ds_buffers(void);
void reserve_ds_buffers(void);
extern struct event_constraint bts_constraint;
void intel_pmu_enable_bts(u64 config);
void intel_pmu_disable_bts(void);
int intel_pmu_drain_bts_buffer(void);
extern struct event_constraint intel_core2_pebs_event_constraints[];
extern struct event_constraint intel_atom_pebs_event_constraints[];
extern struct event_constraint intel_slm_pebs_event_constraints[];
extern struct event_constraint intel_nehalem_pebs_event_constraints[];
extern struct event_constraint intel_westmere_pebs_event_constraints[];
extern struct event_constraint intel_snb_pebs_event_constraints[];
extern struct event_constraint intel_ivb_pebs_event_constraints[];
extern struct event_constraint intel_hsw_pebs_event_constraints[];
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_enable(struct perf_event *event);
void intel_pmu_pebs_disable(struct perf_event *event);
void intel_pmu_pebs_enable_all(void);
void intel_pmu_pebs_disable_all(void);
void intel_ds_init(void);
void intel_pmu_lbr_reset(void);
void intel_pmu_lbr_enable(struct perf_event *event);
void intel_pmu_lbr_disable(struct perf_event *event);
void intel_pmu_lbr_enable_all(void);
void intel_pmu_lbr_disable_all(void);
void intel_pmu_lbr_read(void);
void intel_pmu_lbr_init_core(void);
void intel_pmu_lbr_init_nhm(void);
void intel_pmu_lbr_init_atom(void);
void intel_pmu_lbr_init_snb(void);
int intel_pmu_setup_lbr_filter(struct perf_event *event);
int p4_pmu_init(void);
int p6_pmu_init(void);
int knc_pmu_init(void);
ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
{
}
static inline void release_ds_buffers(void)
{
}
static inline int intel_pmu_init(void)
{
return 0;
}
static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
{
return NULL;
}
#endif /* CONFIG_CPU_SUP_INTEL */

View file

@ -0,0 +1,728 @@
#include <linux/perf_event.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <asm/apicdef.h>
#include "perf_event.h"
static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
[ C(L1D) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
[ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
[ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
},
},
[ C(L1I ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
[ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
[ C(RESULT_MISS) ] = 0,
},
},
[ C(LL ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
[ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
[ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(ITLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
[ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
[ C(BPU ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
[ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
[ C(NODE) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
[ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
};
/*
* AMD Performance Monitor K7 and later.
*/
static const u64 amd_perfmon_event_map[] =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
[PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
[PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
};
static u64 amd_pmu_event_map(int hw_event)
{
return amd_perfmon_event_map[hw_event];
}
/*
* Previously calculated offsets
*/
static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
/*
* Legacy CPUs:
* 4 counters starting at 0xc0010000 each offset by 1
*
* CPUs with core performance counter extensions:
* 6 counters starting at 0xc0010200 each offset by 2
*/
static inline int amd_pmu_addr_offset(int index, bool eventsel)
{
int offset;
if (!index)
return index;
if (eventsel)
offset = event_offsets[index];
else
offset = count_offsets[index];
if (offset)
return offset;
if (!cpu_has_perfctr_core)
offset = index;
else
offset = index << 1;
if (eventsel)
event_offsets[index] = offset;
else
count_offsets[index] = offset;
return offset;
}
static int amd_core_hw_config(struct perf_event *event)
{
if (event->attr.exclude_host && event->attr.exclude_guest)
/*
* When HO == GO == 1 the hardware treats that as GO == HO == 0
* and will count in both modes. We don't want to count in that
* case so we emulate no-counting by setting US = OS = 0.
*/
event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
ARCH_PERFMON_EVENTSEL_OS);
else if (event->attr.exclude_host)
event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
else if (event->attr.exclude_guest)
event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
return 0;
}
/*
* AMD64 events are detected based on their event codes.
*/
static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
{
return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
}
static inline int amd_is_nb_event(struct hw_perf_event *hwc)
{
return (hwc->config & 0xe0) == 0xe0;
}
static inline int amd_has_nb(struct cpu_hw_events *cpuc)
{
struct amd_nb *nb = cpuc->amd_nb;
return nb && nb->nb_id != -1;
}
static int amd_pmu_hw_config(struct perf_event *event)
{
int ret;
/* pass precise event sampling to ibs: */
if (event->attr.precise_ip && get_ibs_caps())
return -ENOENT;
if (has_branch_stack(event))
return -EOPNOTSUPP;
ret = x86_pmu_hw_config(event);
if (ret)
return ret;
if (event->attr.type == PERF_TYPE_RAW)
event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
return amd_core_hw_config(event);
}
static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
struct amd_nb *nb = cpuc->amd_nb;
int i;
/*
* need to scan whole list because event may not have
* been assigned during scheduling
*
* no race condition possible because event can only
* be removed on one CPU at a time AND PMU is disabled
* when we come here
*/
for (i = 0; i < x86_pmu.num_counters; i++) {
if (cmpxchg(nb->owners + i, event, NULL) == event)
break;
}
}
/*
* AMD64 NorthBridge events need special treatment because
* counter access needs to be synchronized across all cores
* of a package. Refer to BKDG section 3.12
*
* NB events are events measuring L3 cache, Hypertransport
* traffic. They are identified by an event code >= 0xe00.
* They measure events on the NorthBride which is shared
* by all cores on a package. NB events are counted on a
* shared set of counters. When a NB event is programmed
* in a counter, the data actually comes from a shared
* counter. Thus, access to those counters needs to be
* synchronized.
*
* We implement the synchronization such that no two cores
* can be measuring NB events using the same counters. Thus,
* we maintain a per-NB allocation table. The available slot
* is propagated using the event_constraint structure.
*
* We provide only one choice for each NB event based on
* the fact that only NB events have restrictions. Consequently,
* if a counter is available, there is a guarantee the NB event
* will be assigned to it. If no slot is available, an empty
* constraint is returned and scheduling will eventually fail
* for this event.
*
* Note that all cores attached the same NB compete for the same
* counters to host NB events, this is why we use atomic ops. Some
* multi-chip CPUs may have more than one NB.
*
* Given that resources are allocated (cmpxchg), they must be
* eventually freed for others to use. This is accomplished by
* calling __amd_put_nb_event_constraints()
*
* Non NB events are not impacted by this restriction.
*/
static struct event_constraint *
__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
struct event_constraint *c)
{
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
struct perf_event *old;
int idx, new = -1;
if (!c)
c = &unconstrained;
if (cpuc->is_fake)
return c;
/*
* detect if already present, if so reuse
*
* cannot merge with actual allocation
* because of possible holes
*
* event can already be present yet not assigned (in hwc->idx)
* because of successive calls to x86_schedule_events() from
* hw_perf_group_sched_in() without hw_perf_enable()
*/
for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
if (new == -1 || hwc->idx == idx)
/* assign free slot, prefer hwc->idx */
old = cmpxchg(nb->owners + idx, NULL, event);
else if (nb->owners[idx] == event)
/* event already present */
old = event;
else
continue;
if (old && old != event)
continue;
/* reassign to this slot */
if (new != -1)
cmpxchg(nb->owners + new, event, NULL);
new = idx;
/* already present, reuse */
if (old == event)
break;
}
if (new == -1)
return &emptyconstraint;
return &nb->event_constraints[new];
}
static struct amd_nb *amd_alloc_nb(int cpu)
{
struct amd_nb *nb;
int i;
nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
if (!nb)
return NULL;
nb->nb_id = -1;
/*
* initialize all possible NB constraints
*/
for (i = 0; i < x86_pmu.num_counters; i++) {
__set_bit(i, nb->event_constraints[i].idxmsk);
nb->event_constraints[i].weight = 1;
}
return nb;
}
static int amd_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
WARN_ON_ONCE(cpuc->amd_nb);
if (boot_cpu_data.x86_max_cores < 2)
return NOTIFY_OK;
cpuc->amd_nb = amd_alloc_nb(cpu);
if (!cpuc->amd_nb)
return NOTIFY_BAD;
return NOTIFY_OK;
}
static void amd_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
struct amd_nb *nb;
int i, nb_id;
cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
if (boot_cpu_data.x86_max_cores < 2)
return;
nb_id = amd_get_nb_id(cpu);
WARN_ON_ONCE(nb_id == BAD_APICID);
for_each_online_cpu(i) {
nb = per_cpu(cpu_hw_events, i).amd_nb;
if (WARN_ON_ONCE(!nb))
continue;
if (nb->nb_id == nb_id) {
cpuc->kfree_on_online = cpuc->amd_nb;
cpuc->amd_nb = nb;
break;
}
}
cpuc->amd_nb->nb_id = nb_id;
cpuc->amd_nb->refcnt++;
}
static void amd_pmu_cpu_dead(int cpu)
{
struct cpu_hw_events *cpuhw;
if (boot_cpu_data.x86_max_cores < 2)
return;
cpuhw = &per_cpu(cpu_hw_events, cpu);
if (cpuhw->amd_nb) {
struct amd_nb *nb = cpuhw->amd_nb;
if (nb->nb_id == -1 || --nb->refcnt == 0)
kfree(nb);
cpuhw->amd_nb = NULL;
}
}
static struct event_constraint *
amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
{
/*
* if not NB event or no NB, then no constraints
*/
if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
return &unconstrained;
return __amd_get_nb_event_constraints(cpuc, event, NULL);
}
static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
__amd_put_nb_event_constraints(cpuc, event);
}
PMU_FORMAT_ATTR(event, "config:0-7,32-35");
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
PMU_FORMAT_ATTR(inv, "config:23" );
PMU_FORMAT_ATTR(cmask, "config:24-31" );
static struct attribute *amd_format_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
&format_attr_cmask.attr,
NULL,
};
/* AMD Family 15h */
#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
#define AMD_EVENT_EX_LS 0x000000C0ULL
#define AMD_EVENT_DE 0x000000D0ULL
#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
/*
* AMD family 15h event code/PMC mappings:
*
* type = event_code & 0x0F0:
*
* 0x000 FP PERF_CTL[5:3]
* 0x010 FP PERF_CTL[5:3]
* 0x020 LS PERF_CTL[5:0]
* 0x030 LS PERF_CTL[5:0]
* 0x040 DC PERF_CTL[5:0]
* 0x050 DC PERF_CTL[5:0]
* 0x060 CU PERF_CTL[2:0]
* 0x070 CU PERF_CTL[2:0]
* 0x080 IC/DE PERF_CTL[2:0]
* 0x090 IC/DE PERF_CTL[2:0]
* 0x0A0 ---
* 0x0B0 ---
* 0x0C0 EX/LS PERF_CTL[5:0]
* 0x0D0 DE PERF_CTL[2:0]
* 0x0E0 NB NB_PERF_CTL[3:0]
* 0x0F0 NB NB_PERF_CTL[3:0]
*
* Exceptions:
*
* 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
* 0x003 FP PERF_CTL[3]
* 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
* 0x00B FP PERF_CTL[3]
* 0x00D FP PERF_CTL[3]
* 0x023 DE PERF_CTL[2:0]
* 0x02D LS PERF_CTL[3]
* 0x02E LS PERF_CTL[3,0]
* 0x031 LS PERF_CTL[2:0] (**)
* 0x043 CU PERF_CTL[2:0]
* 0x045 CU PERF_CTL[2:0]
* 0x046 CU PERF_CTL[2:0]
* 0x054 CU PERF_CTL[2:0]
* 0x055 CU PERF_CTL[2:0]
* 0x08F IC PERF_CTL[0]
* 0x187 DE PERF_CTL[0]
* 0x188 DE PERF_CTL[0]
* 0x0DB EX PERF_CTL[5:0]
* 0x0DC LS PERF_CTL[5:0]
* 0x0DD LS PERF_CTL[5:0]
* 0x0DE LS PERF_CTL[5:0]
* 0x0DF LS PERF_CTL[5:0]
* 0x1C0 EX PERF_CTL[5:3]
* 0x1D6 EX PERF_CTL[5:0]
* 0x1D8 EX PERF_CTL[5:0]
*
* (*) depending on the umask all FPU counters may be used
* (**) only one unitmask enabled at a time
*/
static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
static struct event_constraint *
amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
unsigned int event_code = amd_get_event_code(hwc);
switch (event_code & AMD_EVENT_TYPE_MASK) {
case AMD_EVENT_FP:
switch (event_code) {
case 0x000:
if (!(hwc->config & 0x0000F000ULL))
break;
if (!(hwc->config & 0x00000F00ULL))
break;
return &amd_f15_PMC3;
case 0x004:
if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
break;
return &amd_f15_PMC3;
case 0x003:
case 0x00B:
case 0x00D:
return &amd_f15_PMC3;
}
return &amd_f15_PMC53;
case AMD_EVENT_LS:
case AMD_EVENT_DC:
case AMD_EVENT_EX_LS:
switch (event_code) {
case 0x023:
case 0x043:
case 0x045:
case 0x046:
case 0x054:
case 0x055:
return &amd_f15_PMC20;
case 0x02D:
return &amd_f15_PMC3;
case 0x02E:
return &amd_f15_PMC30;
case 0x031:
if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
return &amd_f15_PMC20;
return &emptyconstraint;
case 0x1C0:
return &amd_f15_PMC53;
default:
return &amd_f15_PMC50;
}
case AMD_EVENT_CU:
case AMD_EVENT_IC_DE:
case AMD_EVENT_DE:
switch (event_code) {
case 0x08F:
case 0x187:
case 0x188:
return &amd_f15_PMC0;
case 0x0DB ... 0x0DF:
case 0x1D6:
case 0x1D8:
return &amd_f15_PMC50;
default:
return &amd_f15_PMC20;
}
case AMD_EVENT_NB:
/* moved to perf_event_amd_uncore.c */
return &emptyconstraint;
default:
return &emptyconstraint;
}
}
static ssize_t amd_event_sysfs_show(char *page, u64 config)
{
u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
(config & AMD64_EVENTSEL_EVENT) >> 24;
return x86_event_sysfs_show(page, config, event);
}
static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
.handle_irq = x86_pmu_handle_irq,
.disable_all = x86_pmu_disable_all,
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
.disable = x86_pmu_disable_event,
.hw_config = amd_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_K7_EVNTSEL0,
.perfctr = MSR_K7_PERFCTR0,
.addr_offset = amd_pmu_addr_offset,
.event_map = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters = AMD64_NUM_COUNTERS,
.cntval_bits = 48,
.cntval_mask = (1ULL << 48) - 1,
.apic = 1,
/* use highest bit to detect overflow */
.max_period = (1ULL << 47) - 1,
.get_event_constraints = amd_get_event_constraints,
.put_event_constraints = amd_put_event_constraints,
.format_attrs = amd_format_attr,
.events_sysfs_show = amd_event_sysfs_show,
.cpu_prepare = amd_pmu_cpu_prepare,
.cpu_starting = amd_pmu_cpu_starting,
.cpu_dead = amd_pmu_cpu_dead,
};
static int __init amd_core_pmu_init(void)
{
if (!cpu_has_perfctr_core)
return 0;
switch (boot_cpu_data.x86) {
case 0x15:
pr_cont("Fam15h ");
x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
break;
default:
pr_err("core perfctr but no constraints; unknown hardware!\n");
return -ENODEV;
}
/*
* If core performance counter extensions exists, we must use
* MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
* amd_pmu_addr_offset().
*/
x86_pmu.eventsel = MSR_F15H_PERF_CTL;
x86_pmu.perfctr = MSR_F15H_PERF_CTR;
x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
pr_cont("core perfctr, ");
return 0;
}
__init int amd_pmu_init(void)
{
int ret;
/* Performance-monitoring supported from K7 and later: */
if (boot_cpu_data.x86 < 6)
return -ENODEV;
x86_pmu = amd_pmu;
ret = amd_core_pmu_init();
if (ret)
return ret;
/* Events are common for all AMDs */
memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
return 0;
}
void amd_pmu_enable_virt(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
cpuc->perf_ctr_virt_mask = 0;
/* Reload all events */
x86_pmu_disable_all();
x86_pmu_enable_all(0);
}
EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
void amd_pmu_disable_virt(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
/*
* We only mask out the Host-only bit so that host-only counting works
* when SVM is disabled. If someone sets up a guest-only counter when
* SVM is disabled the Guest-only bits still gets set and the counter
* will not count anything.
*/
cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
/* Reload all events */
x86_pmu_disable_all();
x86_pmu_enable_all(0);
}
EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);

View file

@ -0,0 +1,946 @@
/*
* Performance events - AMD IBS
*
* Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
*
* For licencing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/ptrace.h>
#include <linux/syscore_ops.h>
#include <asm/apic.h>
#include "perf_event.h"
static u32 ibs_caps;
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
#include <linux/kprobes.h>
#include <linux/hardirq.h>
#include <asm/nmi.h>
#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
enum ibs_states {
IBS_ENABLED = 0,
IBS_STARTED = 1,
IBS_STOPPING = 2,
IBS_MAX_STATES,
};
struct cpu_perf_ibs {
struct perf_event *event;
unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
};
struct perf_ibs {
struct pmu pmu;
unsigned int msr;
u64 config_mask;
u64 cnt_mask;
u64 enable_mask;
u64 valid_mask;
u64 max_period;
unsigned long offset_mask[1];
int offset_max;
struct cpu_perf_ibs __percpu *pcpu;
struct attribute **format_attrs;
struct attribute_group format_group;
const struct attribute_group *attr_groups[2];
u64 (*get_count)(u64 config);
};
struct perf_ibs_data {
u32 size;
union {
u32 data[0]; /* data buffer starts here */
u32 caps;
};
u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
};
static int
perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
{
s64 left = local64_read(&hwc->period_left);
s64 period = hwc->sample_period;
int overflow = 0;
/*
* If we are way outside a reasonable range then just skip forward:
*/
if (unlikely(left <= -period)) {
left = period;
local64_set(&hwc->period_left, left);
hwc->last_period = period;
overflow = 1;
}
if (unlikely(left < (s64)min)) {
left += period;
local64_set(&hwc->period_left, left);
hwc->last_period = period;
overflow = 1;
}
/*
* If the hw period that triggers the sw overflow is too short
* we might hit the irq handler. This biases the results.
* Thus we shorten the next-to-last period and set the last
* period to the max period.
*/
if (left > max) {
left -= max;
if (left > max)
left = max;
else if (left < min)
left = min;
}
*hw_period = (u64)left;
return overflow;
}
static int
perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
{
struct hw_perf_event *hwc = &event->hw;
int shift = 64 - width;
u64 prev_raw_count;
u64 delta;
/*
* Careful: an NMI might modify the previous event value.
*
* Our tactic to handle this is to first atomically read and
* exchange a new raw count - then add that new-prev delta
* count to the generic event atomically:
*/
prev_raw_count = local64_read(&hwc->prev_count);
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
return 0;
/*
* Now we have the new raw value and have updated the prev
* timestamp already. We can now calculate the elapsed delta
* (event-)time and add that to the generic event.
*
* Careful, not all hw sign-extends above the physical width
* of the count.
*/
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
local64_add(delta, &event->count);
local64_sub(delta, &hwc->period_left);
return 1;
}
static struct perf_ibs perf_ibs_fetch;
static struct perf_ibs perf_ibs_op;
static struct perf_ibs *get_ibs_pmu(int type)
{
if (perf_ibs_fetch.pmu.type == type)
return &perf_ibs_fetch;
if (perf_ibs_op.pmu.type == type)
return &perf_ibs_op;
return NULL;
}
/*
* Use IBS for precise event sampling:
*
* perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
* perf record -a -e r076:p ... # same as -e cpu-cycles:p
* perf record -a -e r0C1:p ... # use ibs op counting micro-ops
*
* IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
* MSRC001_1033) is used to select either cycle or micro-ops counting
* mode.
*
* The rip of IBS samples has skid 0. Thus, IBS supports precise
* levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
* rip is invalid when IBS was not able to record the rip correctly.
* We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
*
*/
static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
{
switch (event->attr.precise_ip) {
case 0:
return -ENOENT;
case 1:
case 2:
break;
default:
return -EOPNOTSUPP;
}
switch (event->attr.type) {
case PERF_TYPE_HARDWARE:
switch (event->attr.config) {
case PERF_COUNT_HW_CPU_CYCLES:
*config = 0;
return 0;
}
break;
case PERF_TYPE_RAW:
switch (event->attr.config) {
case 0x0076:
*config = 0;
return 0;
case 0x00C1:
*config = IBS_OP_CNT_CTL;
return 0;
}
break;
default:
return -ENOENT;
}
return -EOPNOTSUPP;
}
static const struct perf_event_attr ibs_notsupp = {
.exclude_user = 1,
.exclude_kernel = 1,
.exclude_hv = 1,
.exclude_idle = 1,
.exclude_host = 1,
.exclude_guest = 1,
};
static int perf_ibs_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs;
u64 max_cnt, config;
int ret;
perf_ibs = get_ibs_pmu(event->attr.type);
if (perf_ibs) {
config = event->attr.config;
} else {
perf_ibs = &perf_ibs_op;
ret = perf_ibs_precise_event(event, &config);
if (ret)
return ret;
}
if (event->pmu != &perf_ibs->pmu)
return -ENOENT;
if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
return -EINVAL;
if (config & ~perf_ibs->config_mask)
return -EINVAL;
if (hwc->sample_period) {
if (config & perf_ibs->cnt_mask)
/* raw max_cnt may not be set */
return -EINVAL;
if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
/*
* lower 4 bits can not be set in ibs max cnt,
* but allowing it in case we adjust the
* sample period to set a frequency.
*/
return -EINVAL;
hwc->sample_period &= ~0x0FULL;
if (!hwc->sample_period)
hwc->sample_period = 0x10;
} else {
max_cnt = config & perf_ibs->cnt_mask;
config &= ~perf_ibs->cnt_mask;
event->attr.sample_period = max_cnt << 4;
hwc->sample_period = event->attr.sample_period;
}
if (!hwc->sample_period)
return -EINVAL;
/*
* If we modify hwc->sample_period, we also need to update
* hwc->last_period and hwc->period_left.
*/
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
hwc->config_base = perf_ibs->msr;
hwc->config = config;
return 0;
}
static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 *period)
{
int overflow;
/* ignore lower 4 bits in min count: */
overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
local64_set(&hwc->prev_count, 0);
return overflow;
}
static u64 get_ibs_fetch_count(u64 config)
{
return (config & IBS_FETCH_CNT) >> 12;
}
static u64 get_ibs_op_count(u64 config)
{
u64 count = 0;
if (config & IBS_OP_VAL)
count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
if (ibs_caps & IBS_CAPS_RDWROPCNT)
count += (config & IBS_OP_CUR_CNT) >> 32;
return count;
}
static void
perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
u64 *config)
{
u64 count = perf_ibs->get_count(*config);
/*
* Set width to 64 since we do not overflow on max width but
* instead on max count. In perf_ibs_set_period() we clear
* prev count manually on overflow.
*/
while (!perf_event_try_update(event, count, 64)) {
rdmsrl(event->hw.config_base, *config);
count = perf_ibs->get_count(*config);
}
}
static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 config)
{
wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
}
/*
* Erratum #420 Instruction-Based Sampling Engine May Generate
* Interrupt that Cannot Be Cleared:
*
* Must clear counter mask first, then clear the enable bit. See
* Revision Guide for AMD Family 10h Processors, Publication #41322.
*/
static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 config)
{
config &= ~perf_ibs->cnt_mask;
wrmsrl(hwc->config_base, config);
config &= ~perf_ibs->enable_mask;
wrmsrl(hwc->config_base, config);
}
/*
* We cannot restore the ibs pmu state, so we always needs to update
* the event while stopping it and then reset the state when starting
* again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
* perf_ibs_start()/perf_ibs_stop() and instead always do it.
*/
static void perf_ibs_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
u64 period;
if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
return;
WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
hwc->state = 0;
perf_ibs_set_period(perf_ibs, hwc, &period);
set_bit(IBS_STARTED, pcpu->state);
perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
perf_event_update_userpage(event);
}
static void perf_ibs_stop(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
u64 config;
int stopping;
stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
if (!stopping && (hwc->state & PERF_HES_UPTODATE))
return;
rdmsrl(hwc->config_base, config);
if (stopping) {
set_bit(IBS_STOPPING, pcpu->state);
perf_ibs_disable_event(perf_ibs, hwc, config);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
}
if (hwc->state & PERF_HES_UPTODATE)
return;
/*
* Clear valid bit to not count rollovers on update, rollovers
* are only updated in the irq handler.
*/
config &= ~perf_ibs->valid_mask;
perf_ibs_event_update(perf_ibs, event, &config);
hwc->state |= PERF_HES_UPTODATE;
}
static int perf_ibs_add(struct perf_event *event, int flags)
{
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
if (test_and_set_bit(IBS_ENABLED, pcpu->state))
return -ENOSPC;
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
pcpu->event = event;
if (flags & PERF_EF_START)
perf_ibs_start(event, PERF_EF_RELOAD);
return 0;
}
static void perf_ibs_del(struct perf_event *event, int flags)
{
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
return;
perf_ibs_stop(event, PERF_EF_UPDATE);
pcpu->event = NULL;
perf_event_update_userpage(event);
}
static void perf_ibs_read(struct perf_event *event) { }
PMU_FORMAT_ATTR(rand_en, "config:57");
PMU_FORMAT_ATTR(cnt_ctl, "config:19");
static struct attribute *ibs_fetch_format_attrs[] = {
&format_attr_rand_en.attr,
NULL,
};
static struct attribute *ibs_op_format_attrs[] = {
NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
NULL,
};
static struct perf_ibs perf_ibs_fetch = {
.pmu = {
.task_ctx_nr = perf_invalid_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
.del = perf_ibs_del,
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
},
.msr = MSR_AMD64_IBSFETCHCTL,
.config_mask = IBS_FETCH_CONFIG_MASK,
.cnt_mask = IBS_FETCH_MAX_CNT,
.enable_mask = IBS_FETCH_ENABLE,
.valid_mask = IBS_FETCH_VAL,
.max_period = IBS_FETCH_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
.offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
.format_attrs = ibs_fetch_format_attrs,
.get_count = get_ibs_fetch_count,
};
static struct perf_ibs perf_ibs_op = {
.pmu = {
.task_ctx_nr = perf_invalid_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
.del = perf_ibs_del,
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
},
.msr = MSR_AMD64_IBSOPCTL,
.config_mask = IBS_OP_CONFIG_MASK,
.cnt_mask = IBS_OP_MAX_CNT,
.enable_mask = IBS_OP_ENABLE,
.valid_mask = IBS_OP_VAL,
.max_period = IBS_OP_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
.offset_max = MSR_AMD64_IBSOP_REG_COUNT,
.format_attrs = ibs_op_format_attrs,
.get_count = get_ibs_op_count,
};
static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
{
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
struct perf_event *event = pcpu->event;
struct hw_perf_event *hwc = &event->hw;
struct perf_sample_data data;
struct perf_raw_record raw;
struct pt_regs regs;
struct perf_ibs_data ibs_data;
int offset, size, check_rip, offset_max, throttle = 0;
unsigned int msr;
u64 *buf, *config, period;
if (!test_bit(IBS_STARTED, pcpu->state)) {
/*
* Catch spurious interrupts after stopping IBS: After
* disabling IBS there could be still incoming NMIs
* with samples that even have the valid bit cleared.
* Mark all this NMIs as handled.
*/
return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
}
msr = hwc->config_base;
buf = ibs_data.regs;
rdmsrl(msr, *buf);
if (!(*buf++ & perf_ibs->valid_mask))
return 0;
config = &ibs_data.regs[0];
perf_ibs_event_update(perf_ibs, event, config);
perf_sample_data_init(&data, 0, hwc->last_period);
if (!perf_ibs_set_period(perf_ibs, hwc, &period))
goto out; /* no sw counter overflow */
ibs_data.caps = ibs_caps;
size = 1;
offset = 1;
check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
if (event->attr.sample_type & PERF_SAMPLE_RAW)
offset_max = perf_ibs->offset_max;
else if (check_rip)
offset_max = 2;
else
offset_max = 1;
do {
rdmsrl(msr + offset, *buf++);
size++;
offset = find_next_bit(perf_ibs->offset_mask,
perf_ibs->offset_max,
offset + 1);
} while (offset < offset_max);
ibs_data.size = sizeof(u64) * size;
regs = *iregs;
if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
regs.flags &= ~PERF_EFLAGS_EXACT;
} else {
set_linear_ip(&regs, ibs_data.regs[1]);
regs.flags |= PERF_EFLAGS_EXACT;
}
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.size = sizeof(u32) + ibs_data.size;
raw.data = ibs_data.data;
data.raw = &raw;
}
throttle = perf_event_overflow(event, &data, &regs);
out:
if (throttle)
perf_ibs_disable_event(perf_ibs, hwc, *config);
else
perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
perf_event_update_userpage(event);
return 1;
}
static int
perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
int handled = 0;
handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
if (handled)
inc_irq_stat(apic_perf_irqs);
return handled;
}
NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
{
struct cpu_perf_ibs __percpu *pcpu;
int ret;
pcpu = alloc_percpu(struct cpu_perf_ibs);
if (!pcpu)
return -ENOMEM;
perf_ibs->pcpu = pcpu;
/* register attributes */
if (perf_ibs->format_attrs[0]) {
memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
perf_ibs->format_group.name = "format";
perf_ibs->format_group.attrs = perf_ibs->format_attrs;
memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
perf_ibs->attr_groups[0] = &perf_ibs->format_group;
perf_ibs->pmu.attr_groups = perf_ibs->attr_groups;
}
ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
if (ret) {
perf_ibs->pcpu = NULL;
free_percpu(pcpu);
}
return ret;
}
static __init int perf_event_ibs_init(void)
{
struct attribute **attr = ibs_op_format_attrs;
if (!ibs_caps)
return -ENODEV; /* ibs not supported by the cpu */
perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
if (ibs_caps & IBS_CAPS_OPCNT) {
perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
*attr++ = &format_attr_cnt_ctl.attr;
}
perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
return 0;
}
#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
static __init int perf_event_ibs_init(void) { return 0; }
#endif
/* IBS - apic initialization, for perf and oprofile */
static __init u32 __get_ibs_caps(void)
{
u32 caps;
unsigned int max_level;
if (!boot_cpu_has(X86_FEATURE_IBS))
return 0;
/* check IBS cpuid feature flags */
max_level = cpuid_eax(0x80000000);
if (max_level < IBS_CPUID_FEATURES)
return IBS_CAPS_DEFAULT;
caps = cpuid_eax(IBS_CPUID_FEATURES);
if (!(caps & IBS_CAPS_AVAIL))
/* cpuid flags not valid */
return IBS_CAPS_DEFAULT;
return caps;
}
u32 get_ibs_caps(void)
{
return ibs_caps;
}
EXPORT_SYMBOL(get_ibs_caps);
static inline int get_eilvt(int offset)
{
return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
}
static inline int put_eilvt(int offset)
{
return !setup_APIC_eilvt(offset, 0, 0, 1);
}
/*
* Check and reserve APIC extended interrupt LVT offset for IBS if available.
*/
static inline int ibs_eilvt_valid(void)
{
int offset;
u64 val;
int valid = 0;
preempt_disable();
rdmsrl(MSR_AMD64_IBSCTL, val);
offset = val & IBSCTL_LVT_OFFSET_MASK;
if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
goto out;
}
if (!get_eilvt(offset)) {
pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
goto out;
}
valid = 1;
out:
preempt_enable();
return valid;
}
static int setup_ibs_ctl(int ibs_eilvt_off)
{
struct pci_dev *cpu_cfg;
int nodes;
u32 value = 0;
nodes = 0;
cpu_cfg = NULL;
do {
cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
PCI_DEVICE_ID_AMD_10H_NB_MISC,
cpu_cfg);
if (!cpu_cfg)
break;
++nodes;
pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
| IBSCTL_LVT_OFFSET_VALID);
pci_read_config_dword(cpu_cfg, IBSCTL, &value);
if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
pci_dev_put(cpu_cfg);
printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
"IBSCTL = 0x%08x\n", value);
return -EINVAL;
}
} while (1);
if (!nodes) {
printk(KERN_DEBUG "No CPU node configured for IBS\n");
return -ENODEV;
}
return 0;
}
/*
* This runs only on the current cpu. We try to find an LVT offset and
* setup the local APIC. For this we must disable preemption. On
* success we initialize all nodes with this offset. This updates then
* the offset in the IBS_CTL per-node msr. The per-core APIC setup of
* the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
* is using the new offset.
*/
static int force_ibs_eilvt_setup(void)
{
int offset;
int ret;
preempt_disable();
/* find the next free available EILVT entry, skip offset 0 */
for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
if (get_eilvt(offset))
break;
}
preempt_enable();
if (offset == APIC_EILVT_NR_MAX) {
printk(KERN_DEBUG "No EILVT entry available\n");
return -EBUSY;
}
ret = setup_ibs_ctl(offset);
if (ret)
goto out;
if (!ibs_eilvt_valid()) {
ret = -EFAULT;
goto out;
}
pr_info("IBS: LVT offset %d assigned\n", offset);
return 0;
out:
preempt_disable();
put_eilvt(offset);
preempt_enable();
return ret;
}
static void ibs_eilvt_setup(void)
{
/*
* Force LVT offset assignment for family 10h: The offsets are
* not assigned by the BIOS for this family, so the OS is
* responsible for doing it. If the OS assignment fails, fall
* back to BIOS settings and try to setup this.
*/
if (boot_cpu_data.x86 == 0x10)
force_ibs_eilvt_setup();
}
static inline int get_ibs_lvt_offset(void)
{
u64 val;
rdmsrl(MSR_AMD64_IBSCTL, val);
if (!(val & IBSCTL_LVT_OFFSET_VALID))
return -EINVAL;
return val & IBSCTL_LVT_OFFSET_MASK;
}
static void setup_APIC_ibs(void *dummy)
{
int offset;
offset = get_ibs_lvt_offset();
if (offset < 0)
goto failed;
if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
return;
failed:
pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
smp_processor_id());
}
static void clear_APIC_ibs(void *dummy)
{
int offset;
offset = get_ibs_lvt_offset();
if (offset >= 0)
setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
}
#ifdef CONFIG_PM
static int perf_ibs_suspend(void)
{
clear_APIC_ibs(NULL);
return 0;
}
static void perf_ibs_resume(void)
{
ibs_eilvt_setup();
setup_APIC_ibs(NULL);
}
static struct syscore_ops perf_ibs_syscore_ops = {
.resume = perf_ibs_resume,
.suspend = perf_ibs_suspend,
};
static void perf_ibs_pm_init(void)
{
register_syscore_ops(&perf_ibs_syscore_ops);
}
#else
static inline void perf_ibs_pm_init(void) { }
#endif
static int
perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_STARTING:
setup_APIC_ibs(NULL);
break;
case CPU_DYING:
clear_APIC_ibs(NULL);
break;
default:
break;
}
return NOTIFY_OK;
}
static __init int amd_ibs_init(void)
{
u32 caps;
int ret = -EINVAL;
caps = __get_ibs_caps();
if (!caps)
return -ENODEV; /* ibs not supported by the cpu */
ibs_eilvt_setup();
if (!ibs_eilvt_valid())
goto out;
perf_ibs_pm_init();
cpu_notifier_register_begin();
ibs_caps = caps;
/* make ibs_caps visible to other cpus: */
smp_mb();
smp_call_function(setup_APIC_ibs, NULL, 1);
__perf_cpu_notifier(perf_ibs_cpu_notifier);
cpu_notifier_register_done();
ret = perf_event_ibs_init();
out:
if (ret)
pr_err("Failed to setup IBS, %d\n", ret);
return ret;
}
/* Since we need the pci subsystem to init ibs we can't do this earlier: */
device_initcall(amd_ibs_init);

View file

@ -0,0 +1,502 @@
/*
* Copyright (C) 2013 Advanced Micro Devices, Inc.
*
* Author: Steven Kinney <Steven.Kinney@amd.com>
* Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
*
* Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/perf_event.h>
#include <linux/module.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include "perf_event.h"
#include "perf_event_amd_iommu.h"
#define COUNTER_SHIFT 16
#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8))
#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg))
/* iommu pmu config masks */
#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL))
#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL)
#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL)
#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL)
#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL)
#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
static struct perf_amd_iommu __perf_iommu;
struct perf_amd_iommu {
struct pmu pmu;
u8 max_banks;
u8 max_counters;
u64 cntr_assign_mask;
raw_spinlock_t lock;
const struct attribute_group *attr_groups[4];
};
#define format_group attr_groups[0]
#define cpumask_group attr_groups[1]
#define events_group attr_groups[2]
#define null_group attr_groups[3]
/*---------------------------------------------
* sysfs format attributes
*---------------------------------------------*/
PMU_FORMAT_ATTR(csource, "config:0-7");
PMU_FORMAT_ATTR(devid, "config:8-23");
PMU_FORMAT_ATTR(pasid, "config:24-39");
PMU_FORMAT_ATTR(domid, "config:40-55");
PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
static struct attribute *iommu_format_attrs[] = {
&format_attr_csource.attr,
&format_attr_devid.attr,
&format_attr_pasid.attr,
&format_attr_domid.attr,
&format_attr_devid_mask.attr,
&format_attr_pasid_mask.attr,
&format_attr_domid_mask.attr,
NULL,
};
static struct attribute_group amd_iommu_format_group = {
.name = "format",
.attrs = iommu_format_attrs,
};
/*---------------------------------------------
* sysfs events attributes
*---------------------------------------------*/
struct amd_iommu_event_desc {
struct kobj_attribute attr;
const char *event;
};
static ssize_t _iommu_event_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct amd_iommu_event_desc *event =
container_of(attr, struct amd_iommu_event_desc, attr);
return sprintf(buf, "%s\n", event->event);
}
#define AMD_IOMMU_EVENT_DESC(_name, _event) \
{ \
.attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \
.event = _event, \
}
static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"),
AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"),
AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"),
AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"),
AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"),
AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"),
AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"),
AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"),
AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"),
AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"),
AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"),
AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"),
AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"),
AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"),
AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"),
AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"),
AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"),
AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"),
AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"),
{ /* end: all zeroes */ },
};
/*---------------------------------------------
* sysfs cpumask attributes
*---------------------------------------------*/
static cpumask_t iommu_cpumask;
static ssize_t _iommu_cpumask_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask);
buf[n++] = '\n';
buf[n] = '\0';
return n;
}
static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
static struct attribute *iommu_cpumask_attrs[] = {
&dev_attr_cpumask.attr,
NULL,
};
static struct attribute_group amd_iommu_cpumask_group = {
.attrs = iommu_cpumask_attrs,
};
/*---------------------------------------------*/
static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
{
unsigned long flags;
int shift, bank, cntr, retval;
int max_banks = perf_iommu->max_banks;
int max_cntrs = perf_iommu->max_counters;
raw_spin_lock_irqsave(&perf_iommu->lock, flags);
for (bank = 0, shift = 0; bank < max_banks; bank++) {
for (cntr = 0; cntr < max_cntrs; cntr++) {
shift = bank + (bank*3) + cntr;
if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
continue;
} else {
perf_iommu->cntr_assign_mask |= (1ULL<<shift);
retval = ((u16)((u16)bank<<8) | (u8)(cntr));
goto out;
}
}
}
retval = -ENOSPC;
out:
raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
return retval;
}
static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
u8 bank, u8 cntr)
{
unsigned long flags;
int max_banks, max_cntrs;
int shift = 0;
max_banks = perf_iommu->max_banks;
max_cntrs = perf_iommu->max_counters;
if ((bank > max_banks) || (cntr > max_cntrs))
return -EINVAL;
shift = bank + cntr + (bank*3);
raw_spin_lock_irqsave(&perf_iommu->lock, flags);
perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
return 0;
}
static int perf_iommu_event_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_amd_iommu *perf_iommu;
u64 config, config1;
/* test the event attr type check for PMU enumeration */
if (event->attr.type != event->pmu->type)
return -ENOENT;
/*
* IOMMU counters are shared across all cores.
* Therefore, it does not support per-process mode.
* Also, it does not support event sampling mode.
*/
if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
return -EINVAL;
/* IOMMU counters do not have usr/os/guest/host bits */
if (event->attr.exclude_user || event->attr.exclude_kernel ||
event->attr.exclude_host || event->attr.exclude_guest)
return -EINVAL;
if (event->cpu < 0)
return -EINVAL;
perf_iommu = &__perf_iommu;
if (event->pmu != &perf_iommu->pmu)
return -ENOENT;
if (perf_iommu) {
config = event->attr.config;
config1 = event->attr.config1;
} else {
return -EINVAL;
}
/* integrate with iommu base devid (0000), assume one iommu */
perf_iommu->max_banks =
amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
perf_iommu->max_counters =
amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
return -EINVAL;
/* update the hw_perf_event struct with the iommu config data */
hwc->config = config;
hwc->extra_reg.config = config1;
return 0;
}
static void perf_iommu_enable_event(struct perf_event *ev)
{
u8 csource = _GET_CSOURCE(ev);
u16 devid = _GET_DEVID(ev);
u64 reg = 0ULL;
reg = csource;
amd_iommu_pc_get_set_reg_val(devid,
_GET_BANK(ev), _GET_CNTR(ev) ,
IOMMU_PC_COUNTER_SRC_REG, &reg, true);
reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
if (reg)
reg |= (1UL << 31);
amd_iommu_pc_get_set_reg_val(devid,
_GET_BANK(ev), _GET_CNTR(ev) ,
IOMMU_PC_DEVID_MATCH_REG, &reg, true);
reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
if (reg)
reg |= (1UL << 31);
amd_iommu_pc_get_set_reg_val(devid,
_GET_BANK(ev), _GET_CNTR(ev) ,
IOMMU_PC_PASID_MATCH_REG, &reg, true);
reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
if (reg)
reg |= (1UL << 31);
amd_iommu_pc_get_set_reg_val(devid,
_GET_BANK(ev), _GET_CNTR(ev) ,
IOMMU_PC_DOMID_MATCH_REG, &reg, true);
}
static void perf_iommu_disable_event(struct perf_event *event)
{
u64 reg = 0ULL;
amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
_GET_BANK(event), _GET_CNTR(event),
IOMMU_PC_COUNTER_SRC_REG, &reg, true);
}
static void perf_iommu_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
pr_debug("perf: amd_iommu:perf_iommu_start\n");
if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
return;
WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
hwc->state = 0;
if (flags & PERF_EF_RELOAD) {
u64 prev_raw_count = local64_read(&hwc->prev_count);
amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
_GET_BANK(event), _GET_CNTR(event),
IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
}
perf_iommu_enable_event(event);
perf_event_update_userpage(event);
}
static void perf_iommu_read(struct perf_event *event)
{
u64 count = 0ULL;
u64 prev_raw_count = 0ULL;
u64 delta = 0ULL;
struct hw_perf_event *hwc = &event->hw;
pr_debug("perf: amd_iommu:perf_iommu_read\n");
amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
_GET_BANK(event), _GET_CNTR(event),
IOMMU_PC_COUNTER_REG, &count, false);
/* IOMMU pc counter register is only 48 bits */
count &= 0xFFFFFFFFFFFFULL;
prev_raw_count = local64_read(&hwc->prev_count);
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
count) != prev_raw_count)
return;
/* Handling 48-bit counter overflowing */
delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
delta >>= COUNTER_SHIFT;
local64_add(delta, &event->count);
}
static void perf_iommu_stop(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
u64 config;
pr_debug("perf: amd_iommu:perf_iommu_stop\n");
if (hwc->state & PERF_HES_UPTODATE)
return;
perf_iommu_disable_event(event);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
if (hwc->state & PERF_HES_UPTODATE)
return;
config = hwc->config;
perf_iommu_read(event);
hwc->state |= PERF_HES_UPTODATE;
}
static int perf_iommu_add(struct perf_event *event, int flags)
{
int retval;
struct perf_amd_iommu *perf_iommu =
container_of(event->pmu, struct perf_amd_iommu, pmu);
pr_debug("perf: amd_iommu:perf_iommu_add\n");
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
/* request an iommu bank/counter */
retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
if (retval != -ENOSPC)
event->hw.extra_reg.reg = (u16)retval;
else
return retval;
if (flags & PERF_EF_START)
perf_iommu_start(event, PERF_EF_RELOAD);
return 0;
}
static void perf_iommu_del(struct perf_event *event, int flags)
{
struct perf_amd_iommu *perf_iommu =
container_of(event->pmu, struct perf_amd_iommu, pmu);
pr_debug("perf: amd_iommu:perf_iommu_del\n");
perf_iommu_stop(event, PERF_EF_UPDATE);
/* clear the assigned iommu bank/counter */
clear_avail_iommu_bnk_cntr(perf_iommu,
_GET_BANK(event),
_GET_CNTR(event));
perf_event_update_userpage(event);
}
static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
{
struct attribute **attrs;
struct attribute_group *attr_group;
int i = 0, j;
while (amd_iommu_v2_event_descs[i].attr.attr.name)
i++;
attr_group = kzalloc(sizeof(struct attribute *)
* (i + 1) + sizeof(*attr_group), GFP_KERNEL);
if (!attr_group)
return -ENOMEM;
attrs = (struct attribute **)(attr_group + 1);
for (j = 0; j < i; j++)
attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
attr_group->name = "events";
attr_group->attrs = attrs;
perf_iommu->events_group = attr_group;
return 0;
}
static __init void amd_iommu_pc_exit(void)
{
if (__perf_iommu.events_group != NULL) {
kfree(__perf_iommu.events_group);
__perf_iommu.events_group = NULL;
}
}
static __init int _init_perf_amd_iommu(
struct perf_amd_iommu *perf_iommu, char *name)
{
int ret;
raw_spin_lock_init(&perf_iommu->lock);
/* Init format attributes */
perf_iommu->format_group = &amd_iommu_format_group;
/* Init cpumask attributes to only core 0 */
cpumask_set_cpu(0, &iommu_cpumask);
perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
/* Init events attributes */
if (_init_events_attrs(perf_iommu) != 0)
pr_err("perf: amd_iommu: Only support raw events.\n");
/* Init null attributes */
perf_iommu->null_group = NULL;
perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
if (ret) {
pr_err("perf: amd_iommu: Failed to initialized.\n");
amd_iommu_pc_exit();
} else {
pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
}
return ret;
}
static struct perf_amd_iommu __perf_iommu = {
.pmu = {
.event_init = perf_iommu_event_init,
.add = perf_iommu_add,
.del = perf_iommu_del,
.start = perf_iommu_start,
.stop = perf_iommu_stop,
.read = perf_iommu_read,
},
.max_banks = 0x00,
.max_counters = 0x00,
.cntr_assign_mask = 0ULL,
.format_group = NULL,
.cpumask_group = NULL,
.events_group = NULL,
.null_group = NULL,
};
static __init int amd_iommu_pc_init(void)
{
/* Make sure the IOMMU PC resource is available */
if (!amd_iommu_pc_supported())
return -ENODEV;
_init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
return 0;
}
device_initcall(amd_iommu_pc_init);

View file

@ -0,0 +1,40 @@
/*
* Copyright (C) 2013 Advanced Micro Devices, Inc.
*
* Author: Steven Kinney <Steven.Kinney@amd.com>
* Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#ifndef _PERF_EVENT_AMD_IOMMU_H_
#define _PERF_EVENT_AMD_IOMMU_H_
/* iommu pc mmio region register indexes */
#define IOMMU_PC_COUNTER_REG 0x00
#define IOMMU_PC_COUNTER_SRC_REG 0x08
#define IOMMU_PC_PASID_MATCH_REG 0x10
#define IOMMU_PC_DOMID_MATCH_REG 0x18
#define IOMMU_PC_DEVID_MATCH_REG 0x20
#define IOMMU_PC_COUNTER_REPORT_REG 0x28
/* maximun specified bank/counters */
#define PC_MAX_SPEC_BNKS 64
#define PC_MAX_SPEC_CNTRS 16
/* iommu pc reg masks*/
#define IOMMU_BASE_DEVID 0x0000
/* amd_iommu_init.c external support functions */
extern bool amd_iommu_pc_supported(void);
extern u8 amd_iommu_pc_get_max_banks(u16 devid);
extern u8 amd_iommu_pc_get_max_counters(u16 devid);
extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
u8 fxn, u64 *value, bool is_write);
#endif /*_PERF_EVENT_AMD_IOMMU_H_*/

View file

@ -0,0 +1,604 @@
/*
* Copyright (C) 2013 Advanced Micro Devices, Inc.
*
* Author: Jacob Shin <jacob.shin@amd.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <asm/cpufeature.h>
#include <asm/perf_event.h>
#include <asm/msr.h>
#define NUM_COUNTERS_NB 4
#define NUM_COUNTERS_L2 4
#define MAX_COUNTERS NUM_COUNTERS_NB
#define RDPMC_BASE_NB 6
#define RDPMC_BASE_L2 10
#define COUNTER_SHIFT 16
struct amd_uncore {
int id;
int refcnt;
int cpu;
int num_counters;
int rdpmc_base;
u32 msr_base;
cpumask_t *active_mask;
struct pmu *pmu;
struct perf_event *events[MAX_COUNTERS];
struct amd_uncore *free_when_cpu_online;
};
static struct amd_uncore * __percpu *amd_uncore_nb;
static struct amd_uncore * __percpu *amd_uncore_l2;
static struct pmu amd_nb_pmu;
static struct pmu amd_l2_pmu;
static cpumask_t amd_nb_active_mask;
static cpumask_t amd_l2_active_mask;
static bool is_nb_event(struct perf_event *event)
{
return event->pmu->type == amd_nb_pmu.type;
}
static bool is_l2_event(struct perf_event *event)
{
return event->pmu->type == amd_l2_pmu.type;
}
static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
{
if (is_nb_event(event) && amd_uncore_nb)
return *per_cpu_ptr(amd_uncore_nb, event->cpu);
else if (is_l2_event(event) && amd_uncore_l2)
return *per_cpu_ptr(amd_uncore_l2, event->cpu);
return NULL;
}
static void amd_uncore_read(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 prev, new;
s64 delta;
/*
* since we do not enable counter overflow interrupts,
* we do not have to worry about prev_count changing on us
*/
prev = local64_read(&hwc->prev_count);
rdpmcl(hwc->event_base_rdpmc, new);
local64_set(&hwc->prev_count, new);
delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
delta >>= COUNTER_SHIFT;
local64_add(delta, &event->count);
}
static void amd_uncore_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
if (flags & PERF_EF_RELOAD)
wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
hwc->state = 0;
wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
perf_event_update_userpage(event);
}
static void amd_uncore_stop(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
wrmsrl(hwc->config_base, hwc->config);
hwc->state |= PERF_HES_STOPPED;
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
amd_uncore_read(event);
hwc->state |= PERF_HES_UPTODATE;
}
}
static int amd_uncore_add(struct perf_event *event, int flags)
{
int i;
struct amd_uncore *uncore = event_to_amd_uncore(event);
struct hw_perf_event *hwc = &event->hw;
/* are we already assigned? */
if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
goto out;
for (i = 0; i < uncore->num_counters; i++) {
if (uncore->events[i] == event) {
hwc->idx = i;
goto out;
}
}
/* if not, take the first available counter */
hwc->idx = -1;
for (i = 0; i < uncore->num_counters; i++) {
if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
hwc->idx = i;
break;
}
}
out:
if (hwc->idx == -1)
return -EBUSY;
hwc->config_base = uncore->msr_base + (2 * hwc->idx);
hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (flags & PERF_EF_START)
amd_uncore_start(event, PERF_EF_RELOAD);
return 0;
}
static void amd_uncore_del(struct perf_event *event, int flags)
{
int i;
struct amd_uncore *uncore = event_to_amd_uncore(event);
struct hw_perf_event *hwc = &event->hw;
amd_uncore_stop(event, PERF_EF_UPDATE);
for (i = 0; i < uncore->num_counters; i++) {
if (cmpxchg(&uncore->events[i], event, NULL) == event)
break;
}
hwc->idx = -1;
}
static int amd_uncore_event_init(struct perf_event *event)
{
struct amd_uncore *uncore;
struct hw_perf_event *hwc = &event->hw;
if (event->attr.type != event->pmu->type)
return -ENOENT;
/*
* NB and L2 counters (MSRs) are shared across all cores that share the
* same NB / L2 cache. Interrupts can be directed to a single target
* core, however, event counts generated by processes running on other
* cores cannot be masked out. So we do not support sampling and
* per-thread events.
*/
if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
return -EINVAL;
/* NB and L2 counters do not have usr/os/guest/host bits */
if (event->attr.exclude_user || event->attr.exclude_kernel ||
event->attr.exclude_host || event->attr.exclude_guest)
return -EINVAL;
/* and we do not enable counter overflow interrupts */
hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
hwc->idx = -1;
if (event->cpu < 0)
return -EINVAL;
uncore = event_to_amd_uncore(event);
if (!uncore)
return -ENODEV;
/*
* since request can come in to any of the shared cores, we will remap
* to a single common cpu.
*/
event->cpu = uncore->cpu;
return 0;
}
static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf)
{
int n;
cpumask_t *active_mask;
struct pmu *pmu = dev_get_drvdata(dev);
if (pmu->type == amd_nb_pmu.type)
active_mask = &amd_nb_active_mask;
else if (pmu->type == amd_l2_pmu.type)
active_mask = &amd_l2_active_mask;
else
return 0;
n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask);
buf[n++] = '\n';
buf[n] = '\0';
return n;
}
static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
static struct attribute *amd_uncore_attrs[] = {
&dev_attr_cpumask.attr,
NULL,
};
static struct attribute_group amd_uncore_attr_group = {
.attrs = amd_uncore_attrs,
};
PMU_FORMAT_ATTR(event, "config:0-7,32-35");
PMU_FORMAT_ATTR(umask, "config:8-15");
static struct attribute *amd_uncore_format_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
NULL,
};
static struct attribute_group amd_uncore_format_group = {
.name = "format",
.attrs = amd_uncore_format_attr,
};
static const struct attribute_group *amd_uncore_attr_groups[] = {
&amd_uncore_attr_group,
&amd_uncore_format_group,
NULL,
};
static struct pmu amd_nb_pmu = {
.attr_groups = amd_uncore_attr_groups,
.name = "amd_nb",
.event_init = amd_uncore_event_init,
.add = amd_uncore_add,
.del = amd_uncore_del,
.start = amd_uncore_start,
.stop = amd_uncore_stop,
.read = amd_uncore_read,
};
static struct pmu amd_l2_pmu = {
.attr_groups = amd_uncore_attr_groups,
.name = "amd_l2",
.event_init = amd_uncore_event_init,
.add = amd_uncore_add,
.del = amd_uncore_del,
.start = amd_uncore_start,
.stop = amd_uncore_stop,
.read = amd_uncore_read,
};
static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
{
return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
cpu_to_node(cpu));
}
static int amd_uncore_cpu_up_prepare(unsigned int cpu)
{
struct amd_uncore *uncore_nb = NULL, *uncore_l2;
if (amd_uncore_nb) {
uncore_nb = amd_uncore_alloc(cpu);
if (!uncore_nb)
goto fail;
uncore_nb->cpu = cpu;
uncore_nb->num_counters = NUM_COUNTERS_NB;
uncore_nb->rdpmc_base = RDPMC_BASE_NB;
uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
uncore_nb->active_mask = &amd_nb_active_mask;
uncore_nb->pmu = &amd_nb_pmu;
*per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
}
if (amd_uncore_l2) {
uncore_l2 = amd_uncore_alloc(cpu);
if (!uncore_l2)
goto fail;
uncore_l2->cpu = cpu;
uncore_l2->num_counters = NUM_COUNTERS_L2;
uncore_l2->rdpmc_base = RDPMC_BASE_L2;
uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
uncore_l2->active_mask = &amd_l2_active_mask;
uncore_l2->pmu = &amd_l2_pmu;
*per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
}
return 0;
fail:
kfree(uncore_nb);
return -ENOMEM;
}
static struct amd_uncore *
amd_uncore_find_online_sibling(struct amd_uncore *this,
struct amd_uncore * __percpu *uncores)
{
unsigned int cpu;
struct amd_uncore *that;
for_each_online_cpu(cpu) {
that = *per_cpu_ptr(uncores, cpu);
if (!that)
continue;
if (this == that)
continue;
if (this->id == that->id) {
that->free_when_cpu_online = this;
this = that;
break;
}
}
this->refcnt++;
return this;
}
static void amd_uncore_cpu_starting(unsigned int cpu)
{
unsigned int eax, ebx, ecx, edx;
struct amd_uncore *uncore;
if (amd_uncore_nb) {
uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
uncore->id = ecx & 0xff;
uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
*per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
}
if (amd_uncore_l2) {
unsigned int apicid = cpu_data(cpu).apicid;
unsigned int nshared;
uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
nshared = ((eax >> 14) & 0xfff) + 1;
uncore->id = apicid - (apicid % nshared);
uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
*per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
}
}
static void uncore_online(unsigned int cpu,
struct amd_uncore * __percpu *uncores)
{
struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
kfree(uncore->free_when_cpu_online);
uncore->free_when_cpu_online = NULL;
if (cpu == uncore->cpu)
cpumask_set_cpu(cpu, uncore->active_mask);
}
static void amd_uncore_cpu_online(unsigned int cpu)
{
if (amd_uncore_nb)
uncore_online(cpu, amd_uncore_nb);
if (amd_uncore_l2)
uncore_online(cpu, amd_uncore_l2);
}
static void uncore_down_prepare(unsigned int cpu,
struct amd_uncore * __percpu *uncores)
{
unsigned int i;
struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
if (this->cpu != cpu)
return;
/* this cpu is going down, migrate to a shared sibling if possible */
for_each_online_cpu(i) {
struct amd_uncore *that = *per_cpu_ptr(uncores, i);
if (cpu == i)
continue;
if (this == that) {
perf_pmu_migrate_context(this->pmu, cpu, i);
cpumask_clear_cpu(cpu, that->active_mask);
cpumask_set_cpu(i, that->active_mask);
that->cpu = i;
break;
}
}
}
static void amd_uncore_cpu_down_prepare(unsigned int cpu)
{
if (amd_uncore_nb)
uncore_down_prepare(cpu, amd_uncore_nb);
if (amd_uncore_l2)
uncore_down_prepare(cpu, amd_uncore_l2);
}
static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
{
struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
if (cpu == uncore->cpu)
cpumask_clear_cpu(cpu, uncore->active_mask);
if (!--uncore->refcnt)
kfree(uncore);
*per_cpu_ptr(uncores, cpu) = NULL;
}
static void amd_uncore_cpu_dead(unsigned int cpu)
{
if (amd_uncore_nb)
uncore_dead(cpu, amd_uncore_nb);
if (amd_uncore_l2)
uncore_dead(cpu, amd_uncore_l2);
}
static int
amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
void *hcpu)
{
unsigned int cpu = (long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
if (amd_uncore_cpu_up_prepare(cpu))
return notifier_from_errno(-ENOMEM);
break;
case CPU_STARTING:
amd_uncore_cpu_starting(cpu);
break;
case CPU_ONLINE:
amd_uncore_cpu_online(cpu);
break;
case CPU_DOWN_PREPARE:
amd_uncore_cpu_down_prepare(cpu);
break;
case CPU_UP_CANCELED:
case CPU_DEAD:
amd_uncore_cpu_dead(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block amd_uncore_cpu_notifier_block = {
.notifier_call = amd_uncore_cpu_notifier,
.priority = CPU_PRI_PERF + 1,
};
static void __init init_cpu_already_online(void *dummy)
{
unsigned int cpu = smp_processor_id();
amd_uncore_cpu_starting(cpu);
amd_uncore_cpu_online(cpu);
}
static void cleanup_cpu_online(void *dummy)
{
unsigned int cpu = smp_processor_id();
amd_uncore_cpu_dead(cpu);
}
static int __init amd_uncore_init(void)
{
unsigned int cpu, cpu2;
int ret = -ENODEV;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
goto fail_nodev;
if (!cpu_has_topoext)
goto fail_nodev;
if (cpu_has_perfctr_nb) {
amd_uncore_nb = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_nb) {
ret = -ENOMEM;
goto fail_nb;
}
ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
if (ret)
goto fail_nb;
printk(KERN_INFO "perf: AMD NB counters detected\n");
ret = 0;
}
if (cpu_has_perfctr_l2) {
amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_l2) {
ret = -ENOMEM;
goto fail_l2;
}
ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
if (ret)
goto fail_l2;
printk(KERN_INFO "perf: AMD L2I counters detected\n");
ret = 0;
}
if (ret)
goto fail_nodev;
cpu_notifier_register_begin();
/* init cpus already online before registering for hotplug notifier */
for_each_online_cpu(cpu) {
ret = amd_uncore_cpu_up_prepare(cpu);
if (ret)
goto fail_online;
smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
}
__register_cpu_notifier(&amd_uncore_cpu_notifier_block);
cpu_notifier_register_done();
return 0;
fail_online:
for_each_online_cpu(cpu2) {
if (cpu2 == cpu)
break;
smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1);
}
cpu_notifier_register_done();
/* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
amd_uncore_nb = amd_uncore_l2 = NULL;
if (cpu_has_perfctr_l2)
perf_pmu_unregister(&amd_l2_pmu);
fail_l2:
if (cpu_has_perfctr_nb)
perf_pmu_unregister(&amd_nb_pmu);
if (amd_uncore_l2)
free_percpu(amd_uncore_l2);
fail_nb:
if (amd_uncore_nb)
free_percpu(amd_uncore_nb);
fail_nodev:
return ret;
}
device_initcall(amd_uncore_init);

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,779 @@
#include <linux/perf_event.h>
#include <linux/types.h>
#include <asm/perf_event.h>
#include <asm/msr.h>
#include <asm/insn.h>
#include "perf_event.h"
enum {
LBR_FORMAT_32 = 0x00,
LBR_FORMAT_LIP = 0x01,
LBR_FORMAT_EIP = 0x02,
LBR_FORMAT_EIP_FLAGS = 0x03,
LBR_FORMAT_EIP_FLAGS2 = 0x04,
LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2,
};
static enum {
LBR_EIP_FLAGS = 1,
LBR_TSX = 2,
} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
[LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS,
[LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
};
/*
* Intel LBR_SELECT bits
* Intel Vol3a, April 2011, Section 16.7 Table 16-10
*
* Hardware branch filter (not available on all CPUs)
*/
#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */
#define LBR_USER_BIT 1 /* do not capture at ring > 0 */
#define LBR_JCC_BIT 2 /* do not capture conditional branches */
#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */
#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */
#define LBR_RETURN_BIT 5 /* do not capture near returns */
#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
#define LBR_FAR_BIT 8 /* do not capture far branches */
#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
#define LBR_USER (1 << LBR_USER_BIT)
#define LBR_JCC (1 << LBR_JCC_BIT)
#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT)
#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT)
#define LBR_RETURN (1 << LBR_RETURN_BIT)
#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
#define LBR_FAR (1 << LBR_FAR_BIT)
#define LBR_PLM (LBR_KERNEL | LBR_USER)
#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */
#define LBR_NOT_SUPP -1 /* LBR filter not supported */
#define LBR_IGN 0 /* ignored */
#define LBR_ANY \
(LBR_JCC |\
LBR_REL_CALL |\
LBR_IND_CALL |\
LBR_RETURN |\
LBR_REL_JMP |\
LBR_IND_JMP |\
LBR_FAR)
#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
#define LBR_FROM_FLAG_IN_TX (1ULL << 62)
#define LBR_FROM_FLAG_ABORT (1ULL << 61)
#define for_each_branch_sample_type(x) \
for ((x) = PERF_SAMPLE_BRANCH_USER; \
(x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
/*
* x86control flow change classification
* x86control flow changes include branches, interrupts, traps, faults
*/
enum {
X86_BR_NONE = 0, /* unknown */
X86_BR_USER = 1 << 0, /* branch target is user */
X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
X86_BR_CALL = 1 << 2, /* call */
X86_BR_RET = 1 << 3, /* return */
X86_BR_SYSCALL = 1 << 4, /* syscall */
X86_BR_SYSRET = 1 << 5, /* syscall return */
X86_BR_INT = 1 << 6, /* sw interrupt */
X86_BR_IRET = 1 << 7, /* return from interrupt */
X86_BR_JCC = 1 << 8, /* conditional */
X86_BR_JMP = 1 << 9, /* jump */
X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
X86_BR_IND_CALL = 1 << 11,/* indirect calls */
X86_BR_ABORT = 1 << 12,/* transaction abort */
X86_BR_IN_TX = 1 << 13,/* in transaction */
X86_BR_NO_TX = 1 << 14,/* not in transaction */
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
#define X86_BR_ANY \
(X86_BR_CALL |\
X86_BR_RET |\
X86_BR_SYSCALL |\
X86_BR_SYSRET |\
X86_BR_INT |\
X86_BR_IRET |\
X86_BR_JCC |\
X86_BR_JMP |\
X86_BR_IRQ |\
X86_BR_ABORT |\
X86_BR_IND_CALL)
#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
#define X86_BR_ANY_CALL \
(X86_BR_CALL |\
X86_BR_IND_CALL |\
X86_BR_SYSCALL |\
X86_BR_IRQ |\
X86_BR_INT)
static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
/*
* We only support LBR implementations that have FREEZE_LBRS_ON_PMI
* otherwise it becomes near impossible to get a reliable stack.
*/
static void __intel_pmu_lbr_enable(void)
{
u64 debugctl;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_sel)
wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}
static void __intel_pmu_lbr_disable(void)
{
u64 debugctl;
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}
static void intel_pmu_lbr_reset_32(void)
{
int i;
for (i = 0; i < x86_pmu.lbr_nr; i++)
wrmsrl(x86_pmu.lbr_from + i, 0);
}
static void intel_pmu_lbr_reset_64(void)
{
int i;
for (i = 0; i < x86_pmu.lbr_nr; i++) {
wrmsrl(x86_pmu.lbr_from + i, 0);
wrmsrl(x86_pmu.lbr_to + i, 0);
}
}
void intel_pmu_lbr_reset(void)
{
if (!x86_pmu.lbr_nr)
return;
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
intel_pmu_lbr_reset_32();
else
intel_pmu_lbr_reset_64();
}
void intel_pmu_lbr_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu.lbr_nr)
return;
/*
* Reset the LBR stack if we changed task context to
* avoid data leaks.
*/
if (event->ctx->task && cpuc->lbr_context != event->ctx) {
intel_pmu_lbr_reset();
cpuc->lbr_context = event->ctx;
}
cpuc->br_sel = event->hw.branch_reg.reg;
cpuc->lbr_users++;
}
void intel_pmu_lbr_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu.lbr_nr)
return;
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
if (cpuc->enabled && !cpuc->lbr_users) {
__intel_pmu_lbr_disable();
/* avoid stale pointer */
cpuc->lbr_context = NULL;
}
}
void intel_pmu_lbr_enable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_users)
__intel_pmu_lbr_enable();
}
void intel_pmu_lbr_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_users)
__intel_pmu_lbr_disable();
}
/*
* TOS = most recently recorded branch
*/
static inline u64 intel_pmu_lbr_tos(void)
{
u64 tos;
rdmsrl(x86_pmu.lbr_tos, tos);
return tos;
}
static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
{
unsigned long mask = x86_pmu.lbr_nr - 1;
u64 tos = intel_pmu_lbr_tos();
int i;
for (i = 0; i < x86_pmu.lbr_nr; i++) {
unsigned long lbr_idx = (tos - i) & mask;
union {
struct {
u32 from;
u32 to;
};
u64 lbr;
} msr_lastbranch;
rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
cpuc->lbr_entries[i].from = msr_lastbranch.from;
cpuc->lbr_entries[i].to = msr_lastbranch.to;
cpuc->lbr_entries[i].mispred = 0;
cpuc->lbr_entries[i].predicted = 0;
cpuc->lbr_entries[i].reserved = 0;
}
cpuc->lbr_stack.nr = i;
}
/*
* Due to lack of segmentation in Linux the effective address (offset)
* is the same as the linear address, allowing us to merge the LIP and EIP
* LBR formats.
*/
static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
unsigned long mask = x86_pmu.lbr_nr - 1;
int lbr_format = x86_pmu.intel_cap.lbr_format;
u64 tos = intel_pmu_lbr_tos();
int i;
int out = 0;
for (i = 0; i < x86_pmu.lbr_nr; i++) {
unsigned long lbr_idx = (tos - i) & mask;
u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
int skip = 0;
int lbr_flags = lbr_desc[lbr_format];
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
if (lbr_flags & LBR_EIP_FLAGS) {
mis = !!(from & LBR_FROM_FLAG_MISPRED);
pred = !mis;
skip = 1;
}
if (lbr_flags & LBR_TSX) {
in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
abort = !!(from & LBR_FROM_FLAG_ABORT);
skip = 3;
}
from = (u64)((((s64)from) << skip) >> skip);
/*
* Some CPUs report duplicated abort records,
* with the second entry not having an abort bit set.
* Skip them here. This loop runs backwards,
* so we need to undo the previous record.
* If the abort just happened outside the window
* the extra entry cannot be removed.
*/
if (abort && x86_pmu.lbr_double_abort && out > 0)
out--;
cpuc->lbr_entries[out].from = from;
cpuc->lbr_entries[out].to = to;
cpuc->lbr_entries[out].mispred = mis;
cpuc->lbr_entries[out].predicted = pred;
cpuc->lbr_entries[out].in_tx = in_tx;
cpuc->lbr_entries[out].abort = abort;
cpuc->lbr_entries[out].reserved = 0;
out++;
}
cpuc->lbr_stack.nr = out;
}
void intel_pmu_lbr_read(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!cpuc->lbr_users)
return;
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
intel_pmu_lbr_read_32(cpuc);
else
intel_pmu_lbr_read_64(cpuc);
intel_pmu_lbr_filter(cpuc);
}
/*
* SW filter is used:
* - in case there is no HW filter
* - in case the HW filter has errata or limitations
*/
static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
{
u64 br_type = event->attr.branch_sample_type;
int mask = 0;
if (br_type & PERF_SAMPLE_BRANCH_USER)
mask |= X86_BR_USER;
if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
mask |= X86_BR_KERNEL;
/* we ignore BRANCH_HV here */
if (br_type & PERF_SAMPLE_BRANCH_ANY)
mask |= X86_BR_ANY;
if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
mask |= X86_BR_ANY_CALL;
if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
mask |= X86_BR_IND_CALL;
if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
mask |= X86_BR_ABORT;
if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
mask |= X86_BR_IN_TX;
if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
mask |= X86_BR_NO_TX;
if (br_type & PERF_SAMPLE_BRANCH_COND)
mask |= X86_BR_JCC;
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
*/
event->hw.branch_reg.reg = mask;
}
/*
* setup the HW LBR filter
* Used only when available, may not be enough to disambiguate
* all branches, may need the help of the SW filter
*/
static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
{
struct hw_perf_event_extra *reg;
u64 br_type = event->attr.branch_sample_type;
u64 mask = 0, m;
u64 v;
for_each_branch_sample_type(m) {
if (!(br_type & m))
continue;
v = x86_pmu.lbr_sel_map[m];
if (v == LBR_NOT_SUPP)
return -EOPNOTSUPP;
if (v != LBR_IGN)
mask |= v;
}
reg = &event->hw.branch_reg;
reg->idx = EXTRA_REG_LBR;
/* LBR_SELECT operates in suppress mode so invert mask */
reg->config = ~mask & x86_pmu.lbr_sel_mask;
return 0;
}
int intel_pmu_setup_lbr_filter(struct perf_event *event)
{
int ret = 0;
/*
* no LBR on this PMU
*/
if (!x86_pmu.lbr_nr)
return -EOPNOTSUPP;
/*
* setup SW LBR filter
*/
intel_pmu_setup_sw_lbr_filter(event);
/*
* setup HW LBR filter, if any
*/
if (x86_pmu.lbr_sel_map)
ret = intel_pmu_setup_hw_lbr_filter(event);
return ret;
}
/*
* return the type of control flow change at address "from"
* intruction is not necessarily a branch (in case of interrupt).
*
* The branch type returned also includes the priv level of the
* target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
*
* If a branch type is unknown OR the instruction cannot be
* decoded (e.g., text page not present), then X86_BR_NONE is
* returned.
*/
static int branch_type(unsigned long from, unsigned long to, int abort)
{
struct insn insn;
void *addr;
int bytes, size = MAX_INSN_SIZE;
int ret = X86_BR_NONE;
int ext, to_plm, from_plm;
u8 buf[MAX_INSN_SIZE];
int is64 = 0;
to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
/*
* maybe zero if lbr did not fill up after a reset by the time
* we get a PMU interrupt
*/
if (from == 0 || to == 0)
return X86_BR_NONE;
if (abort)
return X86_BR_ABORT | to_plm;
if (from_plm == X86_BR_USER) {
/*
* can happen if measuring at the user level only
* and we interrupt in a kernel thread, e.g., idle.
*/
if (!current->mm)
return X86_BR_NONE;
/* may fail if text not present */
bytes = copy_from_user_nmi(buf, (void __user *)from, size);
if (bytes != 0)
return X86_BR_NONE;
addr = buf;
} else {
/*
* The LBR logs any address in the IP, even if the IP just
* faulted. This means userspace can control the from address.
* Ensure we don't blindy read any address by validating it is
* a known text address.
*/
if (kernel_text_address(from))
addr = (void *)from;
else
return X86_BR_NONE;
}
/*
* decoder needs to know the ABI especially
* on 64-bit systems running 32-bit apps
*/
#ifdef CONFIG_X86_64
is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
#endif
insn_init(&insn, addr, is64);
insn_get_opcode(&insn);
switch (insn.opcode.bytes[0]) {
case 0xf:
switch (insn.opcode.bytes[1]) {
case 0x05: /* syscall */
case 0x34: /* sysenter */
ret = X86_BR_SYSCALL;
break;
case 0x07: /* sysret */
case 0x35: /* sysexit */
ret = X86_BR_SYSRET;
break;
case 0x80 ... 0x8f: /* conditional */
ret = X86_BR_JCC;
break;
default:
ret = X86_BR_NONE;
}
break;
case 0x70 ... 0x7f: /* conditional */
ret = X86_BR_JCC;
break;
case 0xc2: /* near ret */
case 0xc3: /* near ret */
case 0xca: /* far ret */
case 0xcb: /* far ret */
ret = X86_BR_RET;
break;
case 0xcf: /* iret */
ret = X86_BR_IRET;
break;
case 0xcc ... 0xce: /* int */
ret = X86_BR_INT;
break;
case 0xe8: /* call near rel */
case 0x9a: /* call far absolute */
ret = X86_BR_CALL;
break;
case 0xe0 ... 0xe3: /* loop jmp */
ret = X86_BR_JCC;
break;
case 0xe9 ... 0xeb: /* jmp */
ret = X86_BR_JMP;
break;
case 0xff: /* call near absolute, call far absolute ind */
insn_get_modrm(&insn);
ext = (insn.modrm.bytes[0] >> 3) & 0x7;
switch (ext) {
case 2: /* near ind call */
case 3: /* far ind call */
ret = X86_BR_IND_CALL;
break;
case 4:
case 5:
ret = X86_BR_JMP;
break;
}
break;
default:
ret = X86_BR_NONE;
}
/*
* interrupts, traps, faults (and thus ring transition) may
* occur on any instructions. Thus, to classify them correctly,
* we need to first look at the from and to priv levels. If they
* are different and to is in the kernel, then it indicates
* a ring transition. If the from instruction is not a ring
* transition instr (syscall, systenter, int), then it means
* it was a irq, trap or fault.
*
* we have no way of detecting kernel to kernel faults.
*/
if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
&& ret != X86_BR_SYSCALL && ret != X86_BR_INT)
ret = X86_BR_IRQ;
/*
* branch priv level determined by target as
* is done by HW when LBR_SELECT is implemented
*/
if (ret != X86_BR_NONE)
ret |= to_plm;
return ret;
}
/*
* implement actual branch filter based on user demand.
* Hardware may not exactly satisfy that request, thus
* we need to inspect opcodes. Mismatched branches are
* discarded. Therefore, the number of branches returned
* in PERF_SAMPLE_BRANCH_STACK sample may vary.
*/
static void
intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
{
u64 from, to;
int br_sel = cpuc->br_sel;
int i, j, type;
bool compress = false;
/* if sampling all branches, then nothing to filter */
if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
return;
for (i = 0; i < cpuc->lbr_stack.nr; i++) {
from = cpuc->lbr_entries[i].from;
to = cpuc->lbr_entries[i].to;
type = branch_type(from, to, cpuc->lbr_entries[i].abort);
if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
if (cpuc->lbr_entries[i].in_tx)
type |= X86_BR_IN_TX;
else
type |= X86_BR_NO_TX;
}
/* if type does not correspond, then discard */
if (type == X86_BR_NONE || (br_sel & type) != type) {
cpuc->lbr_entries[i].from = 0;
compress = true;
}
}
if (!compress)
return;
/* remove all entries with from=0 */
for (i = 0; i < cpuc->lbr_stack.nr; ) {
if (!cpuc->lbr_entries[i].from) {
j = i;
while (++j < cpuc->lbr_stack.nr)
cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
cpuc->lbr_stack.nr--;
if (!cpuc->lbr_entries[i].from)
continue;
}
i++;
}
}
/*
* Map interface branch filters onto LBR filters
*/
static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
[PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
[PERF_SAMPLE_BRANCH_USER] = LBR_USER,
[PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
[PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
[PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
| LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
*/
[PERF_SAMPLE_BRANCH_ANY_CALL] =
LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include IND_JMP to capture IND_CALL
*/
[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
[PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
};
static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
[PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
[PERF_SAMPLE_BRANCH_USER] = LBR_USER,
[PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
[PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
[PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
[PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
| LBR_FAR,
[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
[PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
};
/* core */
void __init intel_pmu_lbr_init_core(void)
{
x86_pmu.lbr_nr = 4;
x86_pmu.lbr_tos = MSR_LBR_TOS;
x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
x86_pmu.lbr_to = MSR_LBR_CORE_TO;
/*
* SW branch filter usage:
* - compensate for lack of HW filter
*/
pr_cont("4-deep LBR, ");
}
/* nehalem/westmere */
void __init intel_pmu_lbr_init_nhm(void)
{
x86_pmu.lbr_nr = 16;
x86_pmu.lbr_tos = MSR_LBR_TOS;
x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
x86_pmu.lbr_to = MSR_LBR_NHM_TO;
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
/*
* SW branch filter usage:
* - workaround LBR_SEL errata (see above)
* - support syscall, sysret capture.
* That requires LBR_FAR but that means far
* jmp need to be filtered out
*/
pr_cont("16-deep LBR, ");
}
/* sandy bridge */
void __init intel_pmu_lbr_init_snb(void)
{
x86_pmu.lbr_nr = 16;
x86_pmu.lbr_tos = MSR_LBR_TOS;
x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
x86_pmu.lbr_to = MSR_LBR_NHM_TO;
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = snb_lbr_sel_map;
/*
* SW branch filter usage:
* - support syscall, sysret capture.
* That requires LBR_FAR but that means far
* jmp need to be filtered out
*/
pr_cont("16-deep LBR, ");
}
/* atom */
void __init intel_pmu_lbr_init_atom(void)
{
/*
* only models starting at stepping 10 seems
* to have an operational LBR which can freeze
* on PMU interrupt
*/
if (boot_cpu_data.x86_model == 28
&& boot_cpu_data.x86_mask < 10) {
pr_cont("LBR disabled due to erratum");
return;
}
x86_pmu.lbr_nr = 8;
x86_pmu.lbr_tos = MSR_LBR_TOS;
x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
x86_pmu.lbr_to = MSR_LBR_CORE_TO;
/*
* SW branch filter usage:
* - compensate for lack of HW filter
*/
pr_cont("8-deep LBR, ");
}

View file

@ -0,0 +1,714 @@
/*
* perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
* Copyright (C) 2013 Google, Inc., Stephane Eranian
*
* Intel RAPL interface is specified in the IA-32 Manual Vol3b
* section 14.7.1 (September 2013)
*
* RAPL provides more controls than just reporting energy consumption
* however here we only expose the 3 energy consumption free running
* counters (pp0, pkg, dram).
*
* Each of those counters increments in a power unit defined by the
* RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
* but it can vary.
*
* Counter to rapl events mappings:
*
* pp0 counter: consumption of all physical cores (power plane 0)
* event: rapl_energy_cores
* perf code: 0x1
*
* pkg counter: consumption of the whole processor package
* event: rapl_energy_pkg
* perf code: 0x2
*
* dram counter: consumption of the dram domain (servers only)
* event: rapl_energy_dram
* perf code: 0x3
*
* dram counter: consumption of the builtin-gpu domain (client only)
* event: rapl_energy_gpu
* perf code: 0x4
*
* We manage those counters as free running (read-only). They may be
* use simultaneously by other tools, such as turbostat.
*
* The events only support system-wide mode counting. There is no
* sampling support because it does not make sense and is not
* supported by the RAPL hardware.
*
* Because we want to avoid floating-point operations in the kernel,
* the events are all reported in fixed point arithmetic (32.32).
* Tools must adjust the counts to convert them to Watts using
* the duration of the measurement. Tools may use a function such as
* ldexp(raw_count, -32);
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"
/*
* RAPL energy status counters
*/
#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */
#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */
#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */
#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
/* Clients have PP0, PKG */
#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_PP1_NRG_STAT)
/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_RAM_NRG_STAT)
/* Servers have PP0, PKG, RAM, PP1 */
#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_RAM_NRG_STAT|\
1<<RAPL_IDX_PP1_NRG_STAT)
/*
* event code: LSB 8 bits, passed in attr->config
* any other bit is reserved
*/
#define RAPL_EVENT_MASK 0xFFULL
#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
struct kobj_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __rapl_##_var##_show, NULL)
#define RAPL_EVENT_DESC(_name, _config) \
{ \
.attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
.config = _config, \
}
#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
struct rapl_pmu {
spinlock_t lock;
int hw_unit; /* 1/2^hw_unit Joule */
int n_active; /* number of active events */
struct list_head active_list;
struct pmu *pmu; /* pointer to rapl_pmu_class */
ktime_t timer_interval; /* in ktime_t unit */
struct hrtimer hrtimer;
};
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
static inline u64 rapl_read_counter(struct perf_event *event)
{
u64 raw;
rdmsrl(event->hw.event_base, raw);
return raw;
}
static inline u64 rapl_scale(u64 v)
{
/*
* scale delta to smallest unit (1/2^32)
* users must then scale back: count * 1/(1e9*2^32) to get Joules
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit);
}
static u64 rapl_event_update(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 prev_raw_count, new_raw_count;
s64 delta, sdelta;
int shift = RAPL_CNTR_WIDTH;
again:
prev_raw_count = local64_read(&hwc->prev_count);
rdmsrl(event->hw.event_base, new_raw_count);
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count) {
cpu_relax();
goto again;
}
/*
* Now we have the new raw value and have updated the prev
* timestamp already. We can now calculate the elapsed delta
* (event-)time and add that to the generic event.
*
* Careful, not all hw sign-extends above the physical width
* of the count.
*/
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
sdelta = rapl_scale(delta);
local64_add(sdelta, &event->count);
return new_raw_count;
}
static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
__hrtimer_start_range_ns(&pmu->hrtimer,
pmu->timer_interval, 0,
HRTIMER_MODE_REL_PINNED, 0);
}
static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
{
hrtimer_cancel(&pmu->hrtimer);
}
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
struct perf_event *event;
unsigned long flags;
if (!pmu->n_active)
return HRTIMER_NORESTART;
spin_lock_irqsave(&pmu->lock, flags);
list_for_each_entry(event, &pmu->active_list, active_entry) {
rapl_event_update(event);
}
spin_unlock_irqrestore(&pmu->lock, flags);
hrtimer_forward_now(hrtimer, pmu->timer_interval);
return HRTIMER_RESTART;
}
static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
struct hrtimer *hr = &pmu->hrtimer;
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hr->function = rapl_hrtimer_handle;
}
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
struct perf_event *event)
{
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
return;
event->hw.state = 0;
list_add_tail(&event->active_entry, &pmu->active_list);
local64_set(&event->hw.prev_count, rapl_read_counter(event));
pmu->n_active++;
if (pmu->n_active == 1)
rapl_start_hrtimer(pmu);
}
static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
unsigned long flags;
spin_lock_irqsave(&pmu->lock, flags);
__rapl_pmu_event_start(pmu, event);
spin_unlock_irqrestore(&pmu->lock, flags);
}
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
spin_lock_irqsave(&pmu->lock, flags);
/* mark event as deactivated and stopped */
if (!(hwc->state & PERF_HES_STOPPED)) {
WARN_ON_ONCE(pmu->n_active <= 0);
pmu->n_active--;
if (pmu->n_active == 0)
rapl_stop_hrtimer(pmu);
list_del(&event->active_entry);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
}
/* check if update of sw counter is necessary */
if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
/*
* Drain the remaining delta count out of a event
* that we are disabling:
*/
rapl_event_update(event);
hwc->state |= PERF_HES_UPTODATE;
}
spin_unlock_irqrestore(&pmu->lock, flags);
}
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
spin_lock_irqsave(&pmu->lock, flags);
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (mode & PERF_EF_START)
__rapl_pmu_event_start(pmu, event);
spin_unlock_irqrestore(&pmu->lock, flags);
return 0;
}
static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}
static int rapl_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, msr, ret = 0;
/* only look at RAPL events */
if (event->attr.type != rapl_pmu_class.type)
return -ENOENT;
/* check only supported bits are set */
if (event->attr.config & ~RAPL_EVENT_MASK)
return -EINVAL;
/*
* check event is known (determines counter)
*/
switch (cfg) {
case INTEL_RAPL_PP0:
bit = RAPL_IDX_PP0_NRG_STAT;
msr = MSR_PP0_ENERGY_STATUS;
break;
case INTEL_RAPL_PKG:
bit = RAPL_IDX_PKG_NRG_STAT;
msr = MSR_PKG_ENERGY_STATUS;
break;
case INTEL_RAPL_RAM:
bit = RAPL_IDX_RAM_NRG_STAT;
msr = MSR_DRAM_ENERGY_STATUS;
break;
case INTEL_RAPL_PP1:
bit = RAPL_IDX_PP1_NRG_STAT;
msr = MSR_PP1_ENERGY_STATUS;
break;
default:
return -EINVAL;
}
/* check event supported */
if (!(rapl_cntr_mask & (1 << bit)))
return -EINVAL;
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
event->attr.exclude_hv ||
event->attr.exclude_idle ||
event->attr.exclude_host ||
event->attr.exclude_guest ||
event->attr.sample_period) /* no sampling */
return -EINVAL;
/* must be done before validate_group */
event->hw.event_base = msr;
event->hw.config = cfg;
event->hw.idx = bit;
return ret;
}
static void rapl_pmu_event_read(struct perf_event *event)
{
rapl_event_update(event);
}
static ssize_t rapl_get_attr_cpumask(struct device *dev,
struct device_attribute *attr, char *buf)
{
int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
buf[n++] = '\n';
buf[n] = '\0';
return n;
}
static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
static struct attribute *rapl_pmu_attrs[] = {
&dev_attr_cpumask.attr,
NULL,
};
static struct attribute_group rapl_pmu_attr_group = {
.attrs = rapl_pmu_attrs,
};
EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
/*
* we compute in 0.23 nJ increments regardless of MSR
*/
EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
static struct attribute *rapl_events_srv_attr[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_ram),
EVENT_PTR(rapl_cores_unit),
EVENT_PTR(rapl_pkg_unit),
EVENT_PTR(rapl_ram_unit),
EVENT_PTR(rapl_cores_scale),
EVENT_PTR(rapl_pkg_scale),
EVENT_PTR(rapl_ram_scale),
NULL,
};
static struct attribute *rapl_events_cln_attr[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_gpu),
EVENT_PTR(rapl_cores_unit),
EVENT_PTR(rapl_pkg_unit),
EVENT_PTR(rapl_gpu_unit),
EVENT_PTR(rapl_cores_scale),
EVENT_PTR(rapl_pkg_scale),
EVENT_PTR(rapl_gpu_scale),
NULL,
};
static struct attribute *rapl_events_hsw_attr[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_gpu),
EVENT_PTR(rapl_ram),
EVENT_PTR(rapl_cores_unit),
EVENT_PTR(rapl_pkg_unit),
EVENT_PTR(rapl_gpu_unit),
EVENT_PTR(rapl_ram_unit),
EVENT_PTR(rapl_cores_scale),
EVENT_PTR(rapl_pkg_scale),
EVENT_PTR(rapl_gpu_scale),
EVENT_PTR(rapl_ram_scale),
NULL,
};
static struct attribute_group rapl_pmu_events_group = {
.name = "events",
.attrs = NULL, /* patched at runtime */
};
DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
&format_attr_event.attr,
NULL,
};
static struct attribute_group rapl_pmu_format_group = {
.name = "format",
.attrs = rapl_formats_attr,
};
const struct attribute_group *rapl_attr_groups[] = {
&rapl_pmu_attr_group,
&rapl_pmu_format_group,
&rapl_pmu_events_group,
NULL,
};
static struct pmu rapl_pmu_class = {
.attr_groups = rapl_attr_groups,
.task_ctx_nr = perf_invalid_context, /* system-wide only */
.event_init = rapl_pmu_event_init,
.add = rapl_pmu_event_add, /* must have */
.del = rapl_pmu_event_del, /* must have */
.start = rapl_pmu_event_start,
.stop = rapl_pmu_event_stop,
.read = rapl_pmu_event_read,
};
static void rapl_cpu_exit(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
int i, phys_id = topology_physical_package_id(cpu);
int target = -1;
/* find a new cpu on same package */
for_each_online_cpu(i) {
if (i == cpu)
continue;
if (phys_id == topology_physical_package_id(i)) {
target = i;
break;
}
}
/*
* clear cpu from cpumask
* if was set in cpumask and still some cpu on package,
* then move to new cpu
*/
if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
cpumask_set_cpu(target, &rapl_cpu_mask);
WARN_ON(cpumask_empty(&rapl_cpu_mask));
/*
* migrate events and context to new cpu
*/
if (target >= 0)
perf_pmu_migrate_context(pmu->pmu, cpu, target);
/* cancel overflow polling timer for CPU */
rapl_stop_hrtimer(pmu);
}
static void rapl_cpu_init(int cpu)
{
int i, phys_id = topology_physical_package_id(cpu);
/* check if phys_is is already covered */
for_each_cpu(i, &rapl_cpu_mask) {
if (phys_id == topology_physical_package_id(i))
return;
}
/* was not found, so add it */
cpumask_set_cpu(cpu, &rapl_cpu_mask);
}
static int rapl_cpu_prepare(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
int phys_id = topology_physical_package_id(cpu);
u64 ms;
u64 msr_rapl_power_unit_bits;
if (pmu)
return 0;
if (phys_id < 0)
return -1;
/* protect rdmsrl() to handle virtualization */
if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
return -1;
pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
if (!pmu)
return -1;
spin_lock_init(&pmu->lock);
INIT_LIST_HEAD(&pmu->active_list);
/*
* grab power unit as: 1/2^unit Joules
*
* we cache in local PMU instance
*/
pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
pmu->pmu = &rapl_pmu_class;
/*
* use reference of 200W for scaling the timeout
* to avoid missing counter overflows.
* 200W = 200 Joules/sec
* divide interval by 2 to avoid lockstep (2 * 100)
* if hw unit is 32, then we use 2 ms 1/200/2
*/
if (pmu->hw_unit < 32)
ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
else
ms = 2;
pmu->timer_interval = ms_to_ktime(ms);
rapl_hrtimer_init(pmu);
/* set RAPL pmu for this cpu for now */
per_cpu(rapl_pmu, cpu) = pmu;
per_cpu(rapl_pmu_to_free, cpu) = NULL;
return 0;
}
static void rapl_cpu_kfree(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
kfree(pmu);
per_cpu(rapl_pmu_to_free, cpu) = NULL;
}
static int rapl_cpu_dying(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
if (!pmu)
return 0;
per_cpu(rapl_pmu, cpu) = NULL;
per_cpu(rapl_pmu_to_free, cpu) = pmu;
return 0;
}
static int rapl_cpu_notifier(struct notifier_block *self,
unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
rapl_cpu_prepare(cpu);
break;
case CPU_STARTING:
rapl_cpu_init(cpu);
break;
case CPU_UP_CANCELED:
case CPU_DYING:
rapl_cpu_dying(cpu);
break;
case CPU_ONLINE:
case CPU_DEAD:
rapl_cpu_kfree(cpu);
break;
case CPU_DOWN_PREPARE:
rapl_cpu_exit(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
static const struct x86_cpu_id rapl_cpu_match[] = {
[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
[1] = {},
};
static int __init rapl_pmu_init(void)
{
struct rapl_pmu *pmu;
int cpu, ret;
/*
* check for Intel processor family 6
*/
if (!x86_match_cpu(rapl_cpu_match))
return 0;
/* check supported CPU */
switch (boot_cpu_data.x86_model) {
case 42: /* Sandy Bridge */
case 58: /* Ivy Bridge */
rapl_cntr_mask = RAPL_IDX_CLN;
rapl_pmu_events_group.attrs = rapl_events_cln_attr;
break;
case 60: /* Haswell */
case 69: /* Haswell-Celeron */
rapl_cntr_mask = RAPL_IDX_HSW;
rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
break;
case 45: /* Sandy Bridge-EP */
case 62: /* IvyTown */
rapl_cntr_mask = RAPL_IDX_SRV;
rapl_pmu_events_group.attrs = rapl_events_srv_attr;
break;
default:
/* unsupported */
return 0;
}
cpu_notifier_register_begin();
for_each_online_cpu(cpu) {
ret = rapl_cpu_prepare(cpu);
if (ret)
goto out;
rapl_cpu_init(cpu);
}
__perf_cpu_notifier(rapl_cpu_notifier);
ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
if (WARN_ON(ret)) {
pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
cpu_notifier_register_done();
return -1;
}
pmu = __this_cpu_read(rapl_pmu);
pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
" API unit is 2^-32 Joules,"
" %d fixed counters"
" %llu ms ovfl timer\n",
pmu->hw_unit,
hweight32(rapl_cntr_mask),
ktime_to_ms(pmu->timer_interval));
out:
cpu_notifier_register_done();
return 0;
}
device_initcall(rapl_pmu_init);

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,339 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/perf_event.h>
#include "perf_event.h"
#define UNCORE_PMU_NAME_LEN 32
#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
#define UNCORE_FIXED_EVENT 0xff
#define UNCORE_PMC_IDX_MAX_GENERIC 8
#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff)
#define UNCORE_PCI_DEV_IDX(data) (data & 0xff)
#define UNCORE_EXTRA_PCI_DEV 0xff
#define UNCORE_EXTRA_PCI_DEV_MAX 3
/* support up to 8 sockets */
#define UNCORE_SOCKET_MAX 8
#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
struct intel_uncore_ops;
struct intel_uncore_pmu;
struct intel_uncore_box;
struct uncore_event_desc;
struct intel_uncore_type {
const char *name;
int num_counters;
int num_boxes;
int perf_ctr_bits;
int fixed_ctr_bits;
unsigned perf_ctr;
unsigned event_ctl;
unsigned event_mask;
unsigned fixed_ctr;
unsigned fixed_ctl;
unsigned box_ctl;
unsigned msr_offset;
unsigned num_shared_regs:8;
unsigned single_fixed:1;
unsigned pair_ctr_ctl:1;
unsigned *msr_offsets;
struct event_constraint unconstrainted;
struct event_constraint *constraints;
struct intel_uncore_pmu *pmus;
struct intel_uncore_ops *ops;
struct uncore_event_desc *event_descs;
const struct attribute_group *attr_groups[4];
struct pmu *pmu; /* for custom pmu ops */
};
#define pmu_group attr_groups[0]
#define format_group attr_groups[1]
#define events_group attr_groups[2]
struct intel_uncore_ops {
void (*init_box)(struct intel_uncore_box *);
void (*disable_box)(struct intel_uncore_box *);
void (*enable_box)(struct intel_uncore_box *);
void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
struct perf_event *);
void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
};
struct intel_uncore_pmu {
struct pmu pmu;
char name[UNCORE_PMU_NAME_LEN];
int pmu_idx;
int func_id;
struct intel_uncore_type *type;
struct intel_uncore_box ** __percpu box;
struct list_head box_list;
};
struct intel_uncore_extra_reg {
raw_spinlock_t lock;
u64 config, config1, config2;
atomic_t ref;
};
struct intel_uncore_box {
int phys_id;
int n_active; /* number of active events */
int n_events;
int cpu; /* cpu to collect events */
unsigned long flags;
atomic_t refcnt;
struct perf_event *events[UNCORE_PMC_IDX_MAX];
struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
u64 tags[UNCORE_PMC_IDX_MAX];
struct pci_dev *pci_dev;
struct intel_uncore_pmu *pmu;
u64 hrtimer_duration; /* hrtimer timeout for this box */
struct hrtimer hrtimer;
struct list_head list;
struct list_head active_list;
void *io_addr;
struct intel_uncore_extra_reg shared_regs[0];
};
#define UNCORE_BOX_FLAG_INITIATED 0
struct uncore_event_desc {
struct kobj_attribute attr;
const char *config;
};
ssize_t uncore_event_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
{ \
.attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
.config = _config, \
}
#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
struct kobj_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
{
return box->pmu->type->box_ctl;
}
static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
{
return box->pmu->type->fixed_ctl;
}
static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
{
return box->pmu->type->fixed_ctr;
}
static inline
unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
{
return idx * 4 + box->pmu->type->event_ctl;
}
static inline
unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
{
return idx * 8 + box->pmu->type->perf_ctr;
}
static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
{
struct intel_uncore_pmu *pmu = box->pmu;
return pmu->type->msr_offsets ?
pmu->type->msr_offsets[pmu->pmu_idx] :
pmu->type->msr_offset * pmu->pmu_idx;
}
static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
{
if (!box->pmu->type->box_ctl)
return 0;
return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
}
static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
{
if (!box->pmu->type->fixed_ctl)
return 0;
return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
}
static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
{
return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
}
static inline
unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
{
return box->pmu->type->event_ctl +
(box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
uncore_msr_box_offset(box);
}
static inline
unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
{
return box->pmu->type->perf_ctr +
(box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
uncore_msr_box_offset(box);
}
static inline
unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
{
if (box->pci_dev)
return uncore_pci_fixed_ctl(box);
else
return uncore_msr_fixed_ctl(box);
}
static inline
unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
{
if (box->pci_dev)
return uncore_pci_fixed_ctr(box);
else
return uncore_msr_fixed_ctr(box);
}
static inline
unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
{
if (box->pci_dev)
return uncore_pci_event_ctl(box, idx);
else
return uncore_msr_event_ctl(box, idx);
}
static inline
unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
{
if (box->pci_dev)
return uncore_pci_perf_ctr(box, idx);
else
return uncore_msr_perf_ctr(box, idx);
}
static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
{
return box->pmu->type->perf_ctr_bits;
}
static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
{
return box->pmu->type->fixed_ctr_bits;
}
static inline int uncore_num_counters(struct intel_uncore_box *box)
{
return box->pmu->type->num_counters;
}
static inline void uncore_disable_box(struct intel_uncore_box *box)
{
if (box->pmu->type->ops->disable_box)
box->pmu->type->ops->disable_box(box);
}
static inline void uncore_enable_box(struct intel_uncore_box *box)
{
if (box->pmu->type->ops->enable_box)
box->pmu->type->ops->enable_box(box);
}
static inline void uncore_disable_event(struct intel_uncore_box *box,
struct perf_event *event)
{
box->pmu->type->ops->disable_event(box, event);
}
static inline void uncore_enable_event(struct intel_uncore_box *box,
struct perf_event *event)
{
box->pmu->type->ops->enable_event(box, event);
}
static inline u64 uncore_read_counter(struct intel_uncore_box *box,
struct perf_event *event)
{
return box->pmu->type->ops->read_counter(box, event);
}
static inline void uncore_box_init(struct intel_uncore_box *box)
{
if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
if (box->pmu->type->ops->init_box)
box->pmu->type->ops->init_box(box);
}
}
static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
{
return (box->phys_id < 0);
}
struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event);
struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
struct intel_uncore_box *uncore_event_to_box(struct perf_event *event);
u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
void uncore_pmu_event_read(struct perf_event *event);
void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
struct event_constraint *
uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
extern struct intel_uncore_type **uncore_msr_uncores;
extern struct intel_uncore_type **uncore_pci_uncores;
extern struct pci_driver *uncore_pci_driver;
extern int uncore_pcibus_to_physid[256];
extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
extern struct event_constraint uncore_constraint_empty;
/* perf_event_intel_uncore_snb.c */
int snb_uncore_pci_init(void);
int ivb_uncore_pci_init(void);
int hsw_uncore_pci_init(void);
void snb_uncore_cpu_init(void);
void nhm_uncore_cpu_init(void);
/* perf_event_intel_uncore_snbep.c */
int snbep_uncore_pci_init(void);
void snbep_uncore_cpu_init(void);
int ivbep_uncore_pci_init(void);
void ivbep_uncore_cpu_init(void);
int hswep_uncore_pci_init(void);
void hswep_uncore_cpu_init(void);
/* perf_event_intel_uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,636 @@
/* Nehalem/SandBridge/Haswell uncore support */
#include "perf_event_intel_uncore.h"
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
#define SNB_UNC_CTL_EDGE_DET (1 << 18)
#define SNB_UNC_CTL_EN (1 << 22)
#define SNB_UNC_CTL_INVERT (1 << 23)
#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
#define NHM_UNC_CTL_CMASK_MASK 0xff000000
#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
SNB_UNC_CTL_UMASK_MASK | \
SNB_UNC_CTL_EDGE_DET | \
SNB_UNC_CTL_INVERT | \
SNB_UNC_CTL_CMASK_MASK)
#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
SNB_UNC_CTL_UMASK_MASK | \
SNB_UNC_CTL_EDGE_DET | \
SNB_UNC_CTL_INVERT | \
NHM_UNC_CTL_CMASK_MASK)
/* SNB global control register */
#define SNB_UNC_PERF_GLOBAL_CTL 0x391
#define SNB_UNC_FIXED_CTR_CTRL 0x394
#define SNB_UNC_FIXED_CTR 0x395
/* SNB uncore global control */
#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
/* SNB Cbo register */
#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
#define SNB_UNC_CBO_0_PER_CTR0 0x706
#define SNB_UNC_CBO_MSR_OFFSET 0x10
/* NHM global control register */
#define NHM_UNC_PERF_GLOBAL_CTL 0x391
#define NHM_UNC_FIXED_CTR 0x394
#define NHM_UNC_FIXED_CTR_CTRL 0x395
/* NHM uncore global control */
#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
/* NHM uncore register */
#define NHM_UNC_PERFEVTSEL0 0x3c0
#define NHM_UNC_UNCORE_PMC0 0x3b0
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
/* Sandy Bridge uncore support */
static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
if (hwc->idx < UNCORE_PMC_IDX_FIXED)
wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
else
wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
}
static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
{
wrmsrl(event->hw.config_base, 0);
}
static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
{
if (box->pmu->pmu_idx == 0) {
wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
}
}
static struct uncore_event_desc snb_uncore_events[] = {
INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
{ /* end: all zeroes */ },
};
static struct attribute *snb_uncore_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
&format_attr_cmask5.attr,
NULL,
};
static struct attribute_group snb_uncore_format_group = {
.name = "format",
.attrs = snb_uncore_formats_attr,
};
static struct intel_uncore_ops snb_uncore_msr_ops = {
.init_box = snb_uncore_msr_init_box,
.disable_event = snb_uncore_msr_disable_event,
.enable_event = snb_uncore_msr_enable_event,
.read_counter = uncore_msr_read_counter,
};
static struct event_constraint snb_uncore_cbox_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
EVENT_CONSTRAINT_END
};
static struct intel_uncore_type snb_uncore_cbox = {
.name = "cbox",
.num_counters = 2,
.num_boxes = 4,
.perf_ctr_bits = 44,
.fixed_ctr_bits = 48,
.perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
.event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
.fixed_ctr = SNB_UNC_FIXED_CTR,
.fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
.single_fixed = 1,
.event_mask = SNB_UNC_RAW_EVENT_MASK,
.msr_offset = SNB_UNC_CBO_MSR_OFFSET,
.constraints = snb_uncore_cbox_constraints,
.ops = &snb_uncore_msr_ops,
.format_group = &snb_uncore_format_group,
.event_descs = snb_uncore_events,
};
static struct intel_uncore_type *snb_msr_uncores[] = {
&snb_uncore_cbox,
NULL,
};
void snb_uncore_cpu_init(void)
{
uncore_msr_uncores = snb_msr_uncores;
if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
}
enum {
SNB_PCI_UNCORE_IMC,
};
static struct uncore_event_desc snb_uncore_imc_events[] = {
INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
{ /* end: all zeroes */ },
};
#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
/* page size multiple covering all config regs */
#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
static struct attribute *snb_uncore_imc_formats_attr[] = {
&format_attr_event.attr,
NULL,
};
static struct attribute_group snb_uncore_imc_format_group = {
.name = "format",
.attrs = snb_uncore_imc_formats_attr,
};
static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
{
struct pci_dev *pdev = box->pci_dev;
int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
resource_size_t addr;
u32 pci_dword;
pci_read_config_dword(pdev, where, &pci_dword);
addr = pci_dword;
#ifdef CONFIG_PHYS_ADDR_T_64BIT
pci_read_config_dword(pdev, where + 4, &pci_dword);
addr |= ((resource_size_t)pci_dword << 32);
#endif
addr &= ~(PAGE_SIZE - 1);
box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
}
static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
{}
static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
{}
static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
{}
static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
{}
static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
}
/*
* custom event_init() function because we define our own fixed, free
* running counters, so we do not want to conflict with generic uncore
* logic. Also simplifies processing
*/
static int snb_uncore_imc_event_init(struct perf_event *event)
{
struct intel_uncore_pmu *pmu;
struct intel_uncore_box *box;
struct hw_perf_event *hwc = &event->hw;
u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
int idx, base;
if (event->attr.type != event->pmu->type)
return -ENOENT;
pmu = uncore_event_to_pmu(event);
/* no device found for this pmu */
if (pmu->func_id < 0)
return -ENOENT;
/* Sampling not supported yet */
if (hwc->sample_period)
return -EINVAL;
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
event->attr.exclude_hv ||
event->attr.exclude_idle ||
event->attr.exclude_host ||
event->attr.exclude_guest ||
event->attr.sample_period) /* no sampling */
return -EINVAL;
/*
* Place all uncore events for a particular physical package
* onto a single cpu
*/
if (event->cpu < 0)
return -EINVAL;
/* check only supported bits are set */
if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
return -EINVAL;
box = uncore_pmu_to_box(pmu, event->cpu);
if (!box || box->cpu < 0)
return -EINVAL;
event->cpu = box->cpu;
event->hw.idx = -1;
event->hw.last_tag = ~0ULL;
event->hw.extra_reg.idx = EXTRA_REG_NONE;
event->hw.branch_reg.idx = EXTRA_REG_NONE;
/*
* check event is known (whitelist, determines counter)
*/
switch (cfg) {
case SNB_UNCORE_PCI_IMC_DATA_READS:
base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
idx = UNCORE_PMC_IDX_FIXED;
break;
case SNB_UNCORE_PCI_IMC_DATA_WRITES:
base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
idx = UNCORE_PMC_IDX_FIXED + 1;
break;
default:
return -EINVAL;
}
/* must be done before validate_group */
event->hw.event_base = base;
event->hw.config = cfg;
event->hw.idx = idx;
/* no group validation needed, we have free running counters */
return 0;
}
static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
{
return 0;
}
static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
u64 count;
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
return;
event->hw.state = 0;
box->n_active++;
list_add_tail(&event->active_entry, &box->active_list);
count = snb_uncore_imc_read_counter(box, event);
local64_set(&event->hw.prev_count, count);
if (box->n_active == 1)
uncore_pmu_start_hrtimer(box);
}
static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
struct hw_perf_event *hwc = &event->hw;
if (!(hwc->state & PERF_HES_STOPPED)) {
box->n_active--;
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
list_del(&event->active_entry);
if (box->n_active == 0)
uncore_pmu_cancel_hrtimer(box);
}
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
/*
* Drain the remaining delta count out of a event
* that we are disabling:
*/
uncore_perf_event_update(box, event);
hwc->state |= PERF_HES_UPTODATE;
}
}
static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
struct hw_perf_event *hwc = &event->hw;
if (!box)
return -ENODEV;
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (!(flags & PERF_EF_START))
hwc->state |= PERF_HES_ARCH;
snb_uncore_imc_event_start(event, 0);
box->n_events++;
return 0;
}
static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
int i;
snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
for (i = 0; i < box->n_events; i++) {
if (event == box->event_list[i]) {
--box->n_events;
break;
}
}
}
static int snb_pci2phy_map_init(int devid)
{
struct pci_dev *dev = NULL;
int bus;
dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
if (!dev)
return -ENOTTY;
bus = dev->bus->number;
uncore_pcibus_to_physid[bus] = 0;
pci_dev_put(dev);
return 0;
}
static struct pmu snb_uncore_imc_pmu = {
.task_ctx_nr = perf_invalid_context,
.event_init = snb_uncore_imc_event_init,
.add = snb_uncore_imc_event_add,
.del = snb_uncore_imc_event_del,
.start = snb_uncore_imc_event_start,
.stop = snb_uncore_imc_event_stop,
.read = uncore_pmu_event_read,
};
static struct intel_uncore_ops snb_uncore_imc_ops = {
.init_box = snb_uncore_imc_init_box,
.enable_box = snb_uncore_imc_enable_box,
.disable_box = snb_uncore_imc_disable_box,
.disable_event = snb_uncore_imc_disable_event,
.enable_event = snb_uncore_imc_enable_event,
.hw_config = snb_uncore_imc_hw_config,
.read_counter = snb_uncore_imc_read_counter,
};
static struct intel_uncore_type snb_uncore_imc = {
.name = "imc",
.num_counters = 2,
.num_boxes = 1,
.fixed_ctr_bits = 32,
.fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE,
.event_descs = snb_uncore_imc_events,
.format_group = &snb_uncore_imc_format_group,
.perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
.event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK,
.ops = &snb_uncore_imc_ops,
.pmu = &snb_uncore_imc_pmu,
};
static struct intel_uncore_type *snb_pci_uncores[] = {
[SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
NULL,
};
static const struct pci_device_id snb_uncore_pci_ids[] = {
{ /* IMC */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
{ /* end: all zeroes */ },
};
static const struct pci_device_id ivb_uncore_pci_ids[] = {
{ /* IMC */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
{ /* IMC */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
{ /* end: all zeroes */ },
};
static const struct pci_device_id hsw_uncore_pci_ids[] = {
{ /* IMC */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
{ /* end: all zeroes */ },
};
static struct pci_driver snb_uncore_pci_driver = {
.name = "snb_uncore",
.id_table = snb_uncore_pci_ids,
};
static struct pci_driver ivb_uncore_pci_driver = {
.name = "ivb_uncore",
.id_table = ivb_uncore_pci_ids,
};
static struct pci_driver hsw_uncore_pci_driver = {
.name = "hsw_uncore",
.id_table = hsw_uncore_pci_ids,
};
struct imc_uncore_pci_dev {
__u32 pci_id;
struct pci_driver *driver;
};
#define IMC_DEV(a, d) \
{ .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) }
static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
IMC_DEV(SNB_IMC, &snb_uncore_pci_driver),
IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */
{ /* end marker */ }
};
#define for_each_imc_pci_id(x, t) \
for (x = (t); (x)->pci_id; x++)
static struct pci_driver *imc_uncore_find_dev(void)
{
const struct imc_uncore_pci_dev *p;
int ret;
for_each_imc_pci_id(p, desktop_imc_pci_ids) {
ret = snb_pci2phy_map_init(p->pci_id);
if (ret == 0)
return p->driver;
}
return NULL;
}
static int imc_uncore_pci_init(void)
{
struct pci_driver *imc_drv = imc_uncore_find_dev();
if (!imc_drv)
return -ENODEV;
uncore_pci_uncores = snb_pci_uncores;
uncore_pci_driver = imc_drv;
return 0;
}
int snb_uncore_pci_init(void)
{
return imc_uncore_pci_init();
}
int ivb_uncore_pci_init(void)
{
return imc_uncore_pci_init();
}
int hsw_uncore_pci_init(void)
{
return imc_uncore_pci_init();
}
/* end of Sandy Bridge uncore support */
/* Nehalem uncore support */
static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
{
wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
}
static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
{
wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
}
static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
if (hwc->idx < UNCORE_PMC_IDX_FIXED)
wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
else
wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
}
static struct attribute *nhm_uncore_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
&format_attr_cmask8.attr,
NULL,
};
static struct attribute_group nhm_uncore_format_group = {
.name = "format",
.attrs = nhm_uncore_formats_attr,
};
static struct uncore_event_desc nhm_uncore_events[] = {
INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
{ /* end: all zeroes */ },
};
static struct intel_uncore_ops nhm_uncore_msr_ops = {
.disable_box = nhm_uncore_msr_disable_box,
.enable_box = nhm_uncore_msr_enable_box,
.disable_event = snb_uncore_msr_disable_event,
.enable_event = nhm_uncore_msr_enable_event,
.read_counter = uncore_msr_read_counter,
};
static struct intel_uncore_type nhm_uncore = {
.name = "",
.num_counters = 8,
.num_boxes = 1,
.perf_ctr_bits = 48,
.fixed_ctr_bits = 48,
.event_ctl = NHM_UNC_PERFEVTSEL0,
.perf_ctr = NHM_UNC_UNCORE_PMC0,
.fixed_ctr = NHM_UNC_FIXED_CTR,
.fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
.event_mask = NHM_UNC_RAW_EVENT_MASK,
.event_descs = nhm_uncore_events,
.ops = &nhm_uncore_msr_ops,
.format_group = &nhm_uncore_format_group,
};
static struct intel_uncore_type *nhm_msr_uncores[] = {
&nhm_uncore,
NULL,
};
void nhm_uncore_cpu_init(void)
{
uncore_msr_uncores = nhm_msr_uncores;
}
/* end of Nehalem uncore support */

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,319 @@
/* Driver for Intel Xeon Phi "Knights Corner" PMU */
#include <linux/perf_event.h>
#include <linux/types.h>
#include <asm/hardirq.h>
#include "perf_event.h"
static const u64 knc_perfmon_event_map[] =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x002a,
[PERF_COUNT_HW_INSTRUCTIONS] = 0x0016,
[PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028,
[PERF_COUNT_HW_CACHE_MISSES] = 0x0029,
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012,
[PERF_COUNT_HW_BRANCH_MISSES] = 0x002b,
};
static const u64 __initconst knc_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
[ C(L1D) ] = {
[ C(OP_READ) ] = {
/* On Xeon Phi event "0" is a valid DATA_READ */
/* (L1 Data Cache Reads) Instruction. */
/* We code this as ARCH_PERFMON_EVENTSEL_INT as this */
/* bit will always be set in x86_pmu_hw_config(). */
[ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
/* DATA_READ */
[ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
[ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */
[ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */
},
},
[ C(L1I ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
[ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = 0x0,
},
},
[ C(LL ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */
[ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */
},
},
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
/* DATA_READ */
/* see note on L1 OP_READ */
[ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
[ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = 0x0,
},
},
[ C(ITLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
[ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
[ C(BPU ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */
[ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
};
static u64 knc_pmu_event_map(int hw_event)
{
return knc_perfmon_event_map[hw_event];
}
static struct event_constraint knc_event_constraints[] =
{
INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */
INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */
INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */
INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */
INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */
INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */
INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */
INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */
INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */
INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */
INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */
INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */
INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */
INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */
INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */
INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */
INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */
INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* L2_DATA_PF2_MISS */
INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */
EVENT_CONSTRAINT_END
};
#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d
#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e
#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f
#define KNC_ENABLE_COUNTER0 0x00000001
#define KNC_ENABLE_COUNTER1 0x00000002
static void knc_pmu_disable_all(void)
{
u64 val;
rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
}
static void knc_pmu_enable_all(int added)
{
u64 val;
rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
}
static inline void
knc_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 val;
val = hwc->config;
val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
(void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
}
static void knc_pmu_enable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 val;
val = hwc->config;
val |= ARCH_PERFMON_EVENTSEL_ENABLE;
(void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
}
static inline u64 knc_pmu_get_status(void)
{
u64 status;
rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
return status;
}
static inline void knc_pmu_ack_status(u64 ack)
{
wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
}
static int knc_pmu_handle_irq(struct pt_regs *regs)
{
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
int handled = 0;
int bit, loops;
u64 status;
cpuc = this_cpu_ptr(&cpu_hw_events);
knc_pmu_disable_all();
status = knc_pmu_get_status();
if (!status) {
knc_pmu_enable_all(0);
return handled;
}
loops = 0;
again:
knc_pmu_ack_status(status);
if (++loops > 100) {
WARN_ONCE(1, "perf: irq loop stuck!\n");
perf_event_print_debug();
goto done;
}
inc_irq_stat(apic_perf_irqs);
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[bit];
handled++;
if (!test_bit(bit, cpuc->active_mask))
continue;
if (!intel_pmu_save_and_restart(event))
continue;
perf_sample_data_init(&data, 0, event->hw.last_period);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
/*
* Repeat if there is more work to be done:
*/
status = knc_pmu_get_status();
if (status)
goto again;
done:
knc_pmu_enable_all(0);
return handled;
}
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
PMU_FORMAT_ATTR(inv, "config:23" );
PMU_FORMAT_ATTR(cmask, "config:24-31" );
static struct attribute *intel_knc_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
&format_attr_cmask.attr,
NULL,
};
static const struct x86_pmu knc_pmu __initconst = {
.name = "knc",
.handle_irq = knc_pmu_handle_irq,
.disable_all = knc_pmu_disable_all,
.enable_all = knc_pmu_enable_all,
.enable = knc_pmu_enable_event,
.disable = knc_pmu_disable_event,
.hw_config = x86_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_KNC_EVNTSEL0,
.perfctr = MSR_KNC_PERFCTR0,
.event_map = knc_pmu_event_map,
.max_events = ARRAY_SIZE(knc_perfmon_event_map),
.apic = 1,
.max_period = (1ULL << 39) - 1,
.version = 0,
.num_counters = 2,
.cntval_bits = 40,
.cntval_mask = (1ULL << 40) - 1,
.get_event_constraints = x86_get_event_constraints,
.event_constraints = knc_event_constraints,
.format_attrs = intel_knc_formats_attr,
};
__init int knc_pmu_init(void)
{
x86_pmu = knc_pmu;
memcpy(hw_cache_event_ids, knc_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
return 0;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,279 @@
#include <linux/perf_event.h>
#include <linux/types.h>
#include "perf_event.h"
/*
* Not sure about some of these
*/
static const u64 p6_perfmon_event_map[] =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */
[PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */
[PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */
[PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */
[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */
};
static const u64 __initconst p6_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
[ C(L1D) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
[ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(L1I ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
[ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(LL ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
},
[ C(ITLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
[ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
[ C(BPU ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */
[ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = -1,
[ C(RESULT_MISS) ] = -1,
},
},
};
static u64 p6_pmu_event_map(int hw_event)
{
return p6_perfmon_event_map[hw_event];
}
/*
* Event setting that is specified not to count anything.
* We use this to effectively disable a counter.
*
* L2_RQSTS with 0 MESI unit mask.
*/
#define P6_NOP_EVENT 0x0000002EULL
static struct event_constraint p6_event_constraints[] =
{
INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
EVENT_CONSTRAINT_END
};
static void p6_pmu_disable_all(void)
{
u64 val;
/* p6 only has one enable register */
rdmsrl(MSR_P6_EVNTSEL0, val);
val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsrl(MSR_P6_EVNTSEL0, val);
}
static void p6_pmu_enable_all(int added)
{
unsigned long val;
/* p6 only has one enable register */
rdmsrl(MSR_P6_EVNTSEL0, val);
val |= ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsrl(MSR_P6_EVNTSEL0, val);
}
static inline void
p6_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 val = P6_NOP_EVENT;
(void)wrmsrl_safe(hwc->config_base, val);
}
static void p6_pmu_enable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 val;
val = hwc->config;
/*
* p6 only has a global event enable, set on PerfEvtSel0
* We "disable" events by programming P6_NOP_EVENT
* and we rely on p6_pmu_enable_all() being called
* to actually enable the events.
*/
(void)wrmsrl_safe(hwc->config_base, val);
}
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
PMU_FORMAT_ATTR(pc, "config:19" );
PMU_FORMAT_ATTR(inv, "config:23" );
PMU_FORMAT_ATTR(cmask, "config:24-31" );
static struct attribute *intel_p6_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_pc.attr,
&format_attr_inv.attr,
&format_attr_cmask.attr,
NULL,
};
static __initconst const struct x86_pmu p6_pmu = {
.name = "p6",
.handle_irq = x86_pmu_handle_irq,
.disable_all = p6_pmu_disable_all,
.enable_all = p6_pmu_enable_all,
.enable = p6_pmu_enable_event,
.disable = p6_pmu_disable_event,
.hw_config = x86_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_P6_EVNTSEL0,
.perfctr = MSR_P6_PERFCTR0,
.event_map = p6_pmu_event_map,
.max_events = ARRAY_SIZE(p6_perfmon_event_map),
.apic = 1,
.max_period = (1ULL << 31) - 1,
.version = 0,
.num_counters = 2,
/*
* Events have 40 bits implemented. However they are designed such
* that bits [32-39] are sign extensions of bit 31. As such the
* effective width of a event for P6-like PMU is 32 bits only.
*
* See IA-32 Intel Architecture Software developer manual Vol 3B
*/
.cntval_bits = 32,
.cntval_mask = (1ULL << 32) - 1,
.get_event_constraints = x86_get_event_constraints,
.event_constraints = p6_event_constraints,
.format_attrs = intel_p6_formats_attr,
.events_sysfs_show = intel_event_sysfs_show,
};
static __init void p6_pmu_rdpmc_quirk(void)
{
if (boot_cpu_data.x86_mask < 9) {
/*
* PPro erratum 26; fixed in stepping 9 and above.
*/
pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
x86_pmu.attr_rdpmc_broken = 1;
x86_pmu.attr_rdpmc = 0;
}
}
__init int p6_pmu_init(void)
{
x86_pmu = p6_pmu;
switch (boot_cpu_data.x86_model) {
case 1: /* Pentium Pro */
x86_add_quirk(p6_pmu_rdpmc_quirk);
break;
case 3: /* Pentium II - Klamath */
case 5: /* Pentium II - Deschutes */
case 6: /* Pentium II - Mendocino */
break;
case 7: /* Pentium III - Katmai */
case 8: /* Pentium III - Coppermine */
case 10: /* Pentium III Xeon */
case 11: /* Pentium III - Tualatin */
break;
case 9: /* Pentium M - Banias */
case 13: /* Pentium M - Dothan */
break;
default:
pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
return -ENODEV;
}
memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
return 0;
}

View file

@ -0,0 +1,160 @@
/*
* local apic based NMI watchdog for various CPUs.
*
* This file also handles reservation of performance counters for coordination
* with other users (like oprofile).
*
* Note that these events normally don't tick when the CPU idles. This means
* the frequency varies with CPU load.
*
* Original code for K7/P6 written by Keith Owens
*
*/
#include <linux/percpu.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/smp.h>
#include <asm/nmi.h>
#include <linux/kprobes.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
/*
* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
* offset from MSR_P4_BSU_ESCR0.
*
* It will be the max for all platforms (for now)
*/
#define NMI_MAX_COUNTER_BITS 66
/*
* perfctr_nmi_owner tracks the ownership of the perfctr registers:
* evtsel_nmi_owner tracks the ownership of the event selection
* - different performance counters/ event selection may be reserved for
* different subsystems this reservation system just tries to coordinate
* things a little
*/
static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the performance counter register */
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if (msr >= MSR_F15H_PERF_CTR)
return (msr - MSR_F15H_PERF_CTR) >> 1;
return msr - MSR_K7_PERFCTR0;
case X86_VENDOR_INTEL:
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
return msr - MSR_ARCH_PERFMON_PERFCTR0;
switch (boot_cpu_data.x86) {
case 6:
return msr - MSR_P6_PERFCTR0;
case 11:
return msr - MSR_KNC_PERFCTR0;
case 15:
return msr - MSR_P4_BPU_PERFCTR0;
}
}
return 0;
}
/*
* converts an msr to an appropriate reservation bit
* returns the bit offset of the event selection register
*/
static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the event selection register */
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if (msr >= MSR_F15H_PERF_CTL)
return (msr - MSR_F15H_PERF_CTL) >> 1;
return msr - MSR_K7_EVNTSEL0;
case X86_VENDOR_INTEL:
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
return msr - MSR_ARCH_PERFMON_EVENTSEL0;
switch (boot_cpu_data.x86) {
case 6:
return msr - MSR_P6_EVNTSEL0;
case 11:
return msr - MSR_KNC_EVNTSEL0;
case 15:
return msr - MSR_P4_BSU_ESCR0;
}
}
return 0;
}
/* checks for a bit availability (hack for oprofile) */
int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
{
BUG_ON(counter > NMI_MAX_COUNTER_BITS);
return !test_bit(counter, perfctr_nmi_owner);
}
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
int reserve_perfctr_nmi(unsigned int msr)
{
unsigned int counter;
counter = nmi_perfctr_msr_to_bit(msr);
/* register not managed by the allocator? */
if (counter > NMI_MAX_COUNTER_BITS)
return 1;
if (!test_and_set_bit(counter, perfctr_nmi_owner))
return 1;
return 0;
}
EXPORT_SYMBOL(reserve_perfctr_nmi);
void release_perfctr_nmi(unsigned int msr)
{
unsigned int counter;
counter = nmi_perfctr_msr_to_bit(msr);
/* register not managed by the allocator? */
if (counter > NMI_MAX_COUNTER_BITS)
return;
clear_bit(counter, perfctr_nmi_owner);
}
EXPORT_SYMBOL(release_perfctr_nmi);
int reserve_evntsel_nmi(unsigned int msr)
{
unsigned int counter;
counter = nmi_evntsel_msr_to_bit(msr);
/* register not managed by the allocator? */
if (counter > NMI_MAX_COUNTER_BITS)
return 1;
if (!test_and_set_bit(counter, evntsel_nmi_owner))
return 1;
return 0;
}
EXPORT_SYMBOL(reserve_evntsel_nmi);
void release_evntsel_nmi(unsigned int msr)
{
unsigned int counter;
counter = nmi_evntsel_msr_to_bit(msr);
/* register not managed by the allocator? */
if (counter > NMI_MAX_COUNTER_BITS)
return;
clear_bit(counter, evntsel_nmi_owner);
}
EXPORT_SYMBOL(release_evntsel_nmi);

View file

@ -0,0 +1,21 @@
/*
* Strings for the various x86 power flags
*
* This file must not contain any executable code.
*/
#include <asm/cpufeature.h>
const char *const x86_power_flags[32] = {
"ts", /* temperature sensor */
"fid", /* frequency id control */
"vid", /* voltage id control */
"ttp", /* thermal trip */
"tm", /* hardware thermal control */
"stc", /* software thermal control */
"100mhzsteps", /* 100 MHz multiplier control */
"hwpstate", /* hardware P-state control */
"", /* tsc invariant mapped to constant_tsc */
"cpb", /* core performance boost */
"eff_freq_ro", /* Readonly aperf/mperf */
};

162
arch/x86/kernel/cpu/proc.c Normal file
View file

@ -0,0 +1,162 @@
#include <linux/smp.h>
#include <linux/timex.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
/*
* Get CPU information for use by the procfs.
*/
static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
unsigned int cpu)
{
#ifdef CONFIG_SMP
seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu)));
seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
seq_printf(m, "apicid\t\t: %d\n", c->apicid);
seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
#endif
}
#ifdef CONFIG_X86_32
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
{
seq_printf(m,
"fdiv_bug\t: %s\n"
"f00f_bug\t: %s\n"
"coma_bug\t: %s\n"
"fpu\t\t: %s\n"
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
"wp\t\t: %s\n",
static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
c->cpuid_level,
c->wp_works_ok ? "yes" : "no");
}
#else
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
{
seq_printf(m,
"fpu\t\t: yes\n"
"fpu_exception\t: yes\n"
"cpuid level\t: %d\n"
"wp\t\t: yes\n",
c->cpuid_level);
}
#endif
static int show_cpuinfo(struct seq_file *m, void *v)
{
struct cpuinfo_x86 *c = v;
unsigned int cpu;
int i;
cpu = c->cpu_index;
seq_printf(m, "processor\t: %u\n"
"vendor_id\t: %s\n"
"cpu family\t: %d\n"
"model\t\t: %u\n"
"model name\t: %s\n",
cpu,
c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
c->x86,
c->x86_model,
c->x86_model_id[0] ? c->x86_model_id : "unknown");
if (c->x86_mask || c->cpuid_level >= 0)
seq_printf(m, "stepping\t: %d\n", c->x86_mask);
else
seq_printf(m, "stepping\t: unknown\n");
if (c->microcode)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
unsigned int freq = cpufreq_quick_get(cpu);
if (!freq)
freq = cpu_khz;
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
freq / 1000, (freq % 1000));
}
/* Cache size */
if (c->x86_cache_size >= 0)
seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
show_cpuinfo_core(m, c, cpu);
show_cpuinfo_misc(m, c);
seq_printf(m, "flags\t\t:");
for (i = 0; i < 32*NCAPINTS; i++)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
seq_printf(m, "\nbugs\t\t:");
for (i = 0; i < 32*NBUGINTS; i++) {
unsigned int bug_bit = 32*NCAPINTS + i;
if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i])
seq_printf(m, " %s", x86_bug_flags[i]);
}
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
c->loops_per_jiffy/(500000/HZ),
(c->loops_per_jiffy/(5000/HZ)) % 100);
#ifdef CONFIG_X86_64
if (c->x86_tlbsize > 0)
seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
#endif
seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size);
seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
c->x86_phys_bits, c->x86_virt_bits);
seq_printf(m, "power management:");
for (i = 0; i < 32; i++) {
if (c->x86_power & (1 << i)) {
if (i < ARRAY_SIZE(x86_power_flags) &&
x86_power_flags[i])
seq_printf(m, "%s%s",
x86_power_flags[i][0] ? " " : "",
x86_power_flags[i]);
else
seq_printf(m, " [%d]", i);
}
}
seq_printf(m, "\n\n");
return 0;
}
static void *c_start(struct seq_file *m, loff_t *pos)
{
*pos = cpumask_next(*pos - 1, cpu_online_mask);
if ((*pos) < nr_cpu_ids)
return &cpu_data(*pos);
return NULL;
}
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
(*pos)++;
return c_start(m, pos);
}
static void c_stop(struct seq_file *m, void *v)
{
}
const struct seq_operations cpuinfo_op = {
.start = c_start,
.next = c_next,
.stop = c_stop,
.show = show_cpuinfo,
};

View file

@ -0,0 +1,60 @@
/*
* This file is part of the Linux kernel.
*
* Copyright (c) 2011, Intel Corporation
* Authors: Fenghua Yu <fenghua.yu@intel.com>,
* H. Peter Anvin <hpa@linux.intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
#include <asm/processor.h>
#include <asm/archrandom.h>
#include <asm/sections.h>
static int __init x86_rdrand_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_RDRAND);
setup_clear_cpu_cap(X86_FEATURE_RDSEED);
return 1;
}
__setup("nordrand", x86_rdrand_setup);
/*
* Force a reseed cycle; we are architecturally guaranteed a reseed
* after no more than 512 128-bit chunks of random data. This also
* acts as a test of the CPU capability.
*/
#define RESEED_LOOP ((512*128)/sizeof(unsigned long))
void x86_init_rdrand(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_ARCH_RANDOM
unsigned long tmp;
int i, count, ok;
if (!cpu_has(c, X86_FEATURE_RDRAND))
return; /* Nothing to do */
for (count = i = 0; i < RESEED_LOOP; i++) {
ok = rdrand_long(&tmp);
if (ok)
count++;
}
if (count != RESEED_LOOP)
clear_cpu_cap(c, X86_FEATURE_RDRAND);
#endif
}

View file

@ -0,0 +1,71 @@
/*
* Routines to identify additional cpu features that are scattered in
* cpuid space.
*/
#include <linux/cpu.h>
#include <asm/pat.h>
#include <asm/processor.h>
#include <asm/apic.h>
struct cpuid_bit {
u16 feature;
u8 reg;
u8 bit;
u32 level;
u32 sub_leaf;
};
enum cpuid_regs {
CR_EAX = 0,
CR_ECX,
CR_EDX,
CR_EBX
};
void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{
u32 max_level;
u32 regs[4];
const struct cpuid_bit *cb;
static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 },
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
{ X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
{ X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 },
{ X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
{ X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
{ X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
{ X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
{ X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
{ X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
{ X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
{ X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
{ X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
{ X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
{ 0, 0, 0, 0, 0 }
};
for (cb = cpuid_bits; cb->feature; cb++) {
/* Verify that the level is valid */
max_level = cpuid_eax(cb->level & 0xffff0000);
if (max_level < cb->level ||
max_level > (cb->level | 0xffff))
continue;
cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
&regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
if (regs[cb->reg] & (1 << cb->bit))
set_cpu_cap(c, cb->feature);
}
}

View file

@ -0,0 +1,99 @@
/*
* Check for extended topology enumeration cpuid leaf 0xb and if it
* exists, use it for populating initial_apicid and cpu topology
* detection.
*/
#include <linux/cpu.h>
#include <asm/apic.h>
#include <asm/pat.h>
#include <asm/processor.h>
/* leaf 0xb SMT level */
#define SMT_LEVEL 0
/* leaf 0xb sub-leaf types */
#define INVALID_TYPE 0
#define SMT_TYPE 1
#define CORE_TYPE 2
#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
/*
* Check for extended topology enumeration cpuid leaf 0xb and if it
* exists, use it for populating initial_apicid and cpu topology
* detection.
*/
void detect_extended_topology(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
unsigned int eax, ebx, ecx, edx, sub_index;
unsigned int ht_mask_width, core_plus_mask_width;
unsigned int core_select_mask, core_level_siblings;
static bool printed;
if (c->cpuid_level < 0xb)
return;
cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
/*
* check if the cpuid leaf 0xb is actually implemented.
*/
if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
return;
set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
/*
* initial apic id, which also represents 32-bit extended x2apic id.
*/
c->initial_apicid = edx;
/*
* Populate HT related information from sub-leaf level 0.
*/
core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
sub_index = 1;
do {
cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
/*
* Check for the Core type in the implemented sub leaves.
*/
if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
break;
}
sub_index++;
} while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
& core_select_mask;
c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
/*
* Reinit the apicid, now that we have extended initial_apicid.
*/
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
if (!printed) {
printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
c->phys_proc_id);
if (c->x86_max_cores > 1)
printk(KERN_INFO "CPU: Processor Core ID: %d\n",
c->cpu_core_id);
printed = 1;
}
return;
#endif
}

View file

@ -0,0 +1,108 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include "cpu.h"
static void early_init_transmeta(struct cpuinfo_x86 *c)
{
u32 xlvl;
/* Transmeta-defined flags: level 0x80860001 */
xlvl = cpuid_eax(0x80860000);
if ((xlvl & 0xffff0000) == 0x80860000) {
if (xlvl >= 0x80860001)
c->x86_capability[2] = cpuid_edx(0x80860001);
}
}
static void init_transmeta(struct cpuinfo_x86 *c)
{
unsigned int cap_mask, uk, max, dummy;
unsigned int cms_rev1, cms_rev2;
unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
char cpu_info[65];
early_init_transmeta(c);
cpu_detect_cache_sizes(c);
/* Print CMS and CPU revision */
max = cpuid_eax(0x80860000);
cpu_rev = 0;
if (max >= 0x80860001) {
cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
if (cpu_rev != 0x02000000) {
printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
(cpu_rev >> 24) & 0xff,
(cpu_rev >> 16) & 0xff,
(cpu_rev >> 8) & 0xff,
cpu_rev & 0xff,
cpu_freq);
}
}
if (max >= 0x80860002) {
cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
if (cpu_rev == 0x02000000) {
printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
new_cpu_rev, cpu_freq);
}
printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
(cms_rev1 >> 24) & 0xff,
(cms_rev1 >> 16) & 0xff,
(cms_rev1 >> 8) & 0xff,
cms_rev1 & 0xff,
cms_rev2);
}
if (max >= 0x80860006) {
cpuid(0x80860003,
(void *)&cpu_info[0],
(void *)&cpu_info[4],
(void *)&cpu_info[8],
(void *)&cpu_info[12]);
cpuid(0x80860004,
(void *)&cpu_info[16],
(void *)&cpu_info[20],
(void *)&cpu_info[24],
(void *)&cpu_info[28]);
cpuid(0x80860005,
(void *)&cpu_info[32],
(void *)&cpu_info[36],
(void *)&cpu_info[40],
(void *)&cpu_info[44]);
cpuid(0x80860006,
(void *)&cpu_info[48],
(void *)&cpu_info[52],
(void *)&cpu_info[56],
(void *)&cpu_info[60]);
cpu_info[64] = '\0';
printk(KERN_INFO "CPU: %s\n", cpu_info);
}
/* Unhide possibly hidden capability flags */
rdmsr(0x80860004, cap_mask, uk);
wrmsr(0x80860004, ~0, uk);
c->x86_capability[0] = cpuid_edx(0x00000001);
wrmsr(0x80860004, cap_mask, uk);
/* All Transmeta CPUs have a constant TSC */
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
#ifdef CONFIG_SYSCTL
/*
* randomize_va_space slows us down enormously;
* it probably triggers retranslation of x86->native bytecode
*/
randomize_va_space = 0;
#endif
}
static const struct cpu_dev transmeta_cpu_dev = {
.c_vendor = "Transmeta",
.c_ident = { "GenuineTMx86", "TransmetaCPU" },
.c_early_init = early_init_transmeta,
.c_init = init_transmeta,
.c_x86_vendor = X86_VENDOR_TRANSMETA,
};
cpu_dev_register(transmeta_cpu_dev);

25
arch/x86/kernel/cpu/umc.c Normal file
View file

@ -0,0 +1,25 @@
#include <linux/kernel.h>
#include <asm/processor.h>
#include "cpu.h"
/*
* UMC chips appear to be only either 386 or 486,
* so no special init takes place.
*/
static const struct cpu_dev umc_cpu_dev = {
.c_vendor = "UMC",
.c_ident = { "UMC UMC UMC" },
.legacy_models = {
{ .family = 4, .model_names =
{
[1] = "U5D",
[2] = "U5S",
}
},
},
.c_x86_vendor = X86_VENDOR_UMC,
};
cpu_dev_register(umc_cpu_dev);

View file

@ -0,0 +1,147 @@
/*
* VMware Detection code.
*
* Copyright (C) 2008, VMware, Inc.
* Author : Alok N Kataria <akataria@vmware.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
#include <linux/dmi.h>
#include <linux/module.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
#define VMWARE_HYPERVISOR_PORT 0x5658
#define VMWARE_PORT_CMD_GETVERSION 10
#define VMWARE_PORT_CMD_GETHZ 45
#define VMWARE_PORT_CMD_GETVCPU_INFO 68
#define VMWARE_PORT_CMD_LEGACY_X2APIC 3
#define VMWARE_PORT_CMD_VCPU_RESERVED 31
#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
__asm__("inl (%%dx)" : \
"=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
"0"(VMWARE_HYPERVISOR_MAGIC), \
"1"(VMWARE_PORT_CMD_##cmd), \
"2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
"memory");
static inline int __vmware_platform(void)
{
uint32_t eax, ebx, ecx, edx;
VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
}
static unsigned long vmware_get_tsc_khz(void)
{
uint64_t tsc_hz, lpj;
uint32_t eax, ebx, ecx, edx;
VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
tsc_hz = eax | (((uint64_t)ebx) << 32);
do_div(tsc_hz, 1000);
BUG_ON(tsc_hz >> 32);
printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
(unsigned long) tsc_hz / 1000,
(unsigned long) tsc_hz % 1000);
if (!preset_lpj) {
lpj = ((u64)tsc_hz * 1000);
do_div(lpj, HZ);
preset_lpj = lpj;
}
return tsc_hz;
}
static void __init vmware_platform_setup(void)
{
uint32_t eax, ebx, ecx, edx;
VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
if (ebx != UINT_MAX)
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
else
printk(KERN_WARNING
"Failed to get TSC freq from the hypervisor\n");
}
/*
* While checking the dmi string information, just checking the product
* serial key should be enough, as this will always have a VMware
* specific string when running under VMware hypervisor.
*/
static uint32_t __init vmware_platform(void)
{
if (cpu_has_hypervisor) {
unsigned int eax;
unsigned int hyper_vendor_id[3];
cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
&hyper_vendor_id[1], &hyper_vendor_id[2]);
if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
return CPUID_VMWARE_INFO_LEAF;
} else if (dmi_available && dmi_name_in_serial("VMware") &&
__vmware_platform())
return 1;
return 0;
}
/*
* VMware hypervisor takes care of exporting a reliable TSC to the guest.
* Still, due to timing difference when running on virtual cpus, the TSC can
* be marked as unstable in some cases. For example, the TSC sync check at
* bootup can fail due to a marginal offset between vcpus' TSCs (though the
* TSCs do not drift from each other). Also, the ACPI PM timer clocksource
* is not suitable as a watchdog when running on a hypervisor because the
* kernel may miss a wrap of the counter if the vcpu is descheduled for a
* long time. To skip these checks at runtime we set these capability bits,
* so that the kernel could just trust the hypervisor with providing a
* reliable virtual TSC that is suitable for timekeeping.
*/
static void vmware_set_cpu_features(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
}
/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
static bool __init vmware_legacy_x2apic_available(void)
{
uint32_t eax, ebx, ecx, edx;
VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx);
return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 &&
(eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
}
const __refconst struct hypervisor_x86 x86_hyper_vmware = {
.name = "VMware",
.detect = vmware_platform,
.set_cpu_features = vmware_set_cpu_features,
.init_platform = vmware_platform_setup,
.x2apic_available = vmware_legacy_x2apic_available,
};
EXPORT_SYMBOL(x86_hyper_vmware);