Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

View file

@ -0,0 +1,135 @@
config PPC_PSERIES
depends on PPC64 && PPC_BOOK3S
bool "IBM pSeries & new (POWER5-based) iSeries"
select HAVE_PCSPKR_PLATFORM
select MPIC
select OF_DYNAMIC
select PCI_MSI
select PPC_XICS
select PPC_ICP_NATIVE
select PPC_ICP_HV
select PPC_ICS_RTAS
select PPC_I8259
select PPC_RTAS
select PPC_RTAS_DAEMON
select RTAS_ERROR_LOGGING
select PPC_UDBG_16550
select PPC_NATIVE
select PPC_PCI_CHOICE if EXPERT
select ZLIB_DEFLATE
select PPC_DOORBELL
select HAVE_CONTEXT_TRACKING
select HOTPLUG_CPU if SMP
select ARCH_RANDOM
select PPC_DOORBELL
default y
config PPC_SPLPAR
depends on PPC_PSERIES
bool "Support for shared-processor logical partitions"
default n
help
Enabling this option will make the kernel run more efficiently
on logically-partitioned pSeries systems which use shared
processors, that is, which share physical processors between
two or more partitions.
config PSERIES_MSI
bool
depends on PCI_MSI && PPC_PSERIES && EEH
default y
config PSERIES_ENERGY
tristate "pSeries energy management capabilities driver"
depends on PPC_PSERIES
default y
help
Provides interface to platform energy management capabilities
on supported PSERIES platforms.
Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list
and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint
config SCANLOG
tristate "Scanlog dump interface"
depends on RTAS_PROC && PPC_PSERIES
config IO_EVENT_IRQ
bool "IO Event Interrupt support"
depends on PPC_PSERIES
default y
help
Select this option, if you want to enable support for IO Event
interrupts. IO event interrupt is a mechanism provided by RTAS
to return information about hardware error and non-error events
which may need OS attention. RTAS returns events for multiple
event types and scopes. Device drivers can register their handlers
to receive events.
This option will only enable the IO event platform code. You
will still need to enable or compile the actual drivers
that use this infrastructure to handle IO event interrupts.
Say Y if you are unsure.
config LPARCFG
bool "LPAR Configuration Data"
depends on PPC_PSERIES
help
Provide system capacity information via human readable
<key word>=<value> pairs through a /proc/ppc64/lparcfg interface.
config PPC_PSERIES_DEBUG
depends on PPC_PSERIES && PPC_EARLY_DEBUG
bool "Enable extra debug logging in platforms/pseries"
help
Say Y here if you want the pseries core to produce a bunch of
debug messages to the system log. Select this if you are having a
problem with the pseries core and want to see more of what is
going on. This does not enable debugging in lpar.c, which must
be manually done due to its verbosity.
default y
config PPC_SMLPAR
bool "Support for shared-memory logical partitions"
depends on PPC_PSERIES
select LPARCFG
default n
help
Select this option to enable shared memory partition support.
With this option a system running in an LPAR can be given more
memory than physically available and will allow firmware to
balance memory across many LPARs.
config CMM
tristate "Collaborative memory management"
depends on PPC_SMLPAR
default y
help
Select this option, if you want to enable the kernel interface
to reduce the memory size of the system. This is accomplished
by allocating pages of memory and put them "on hold". This only
makes sense for a system running in an LPAR where the unused pages
will be reused for other LPARs. The interface allows firmware to
balance memory across many LPARs.
config HV_PERF_CTRS
bool "Hypervisor supplied PMU events (24x7 & GPCI)"
default y
depends on PERF_EVENTS && PPC_PSERIES
help
Enable access to hypervisor supplied counters in perf. Currently,
this enables code that uses the hcall GetPerfCounterInfo and 24x7
interfaces to retrieve counters. GPCI exists on Power 6 and later
systems. 24x7 is available on Power 8 systems.
If unsure, select Y.
config DTL
bool "Dispatch Trace Log"
depends on PPC_SPLPAR && DEBUG_FS
help
SPLPAR machines can log hypervisor preempt & dispatch events to a
kernel buffer. Saying Y here will enable logging these events,
which are accessible through a debugfs file.
Say N if you are unsure.

View file

@ -0,0 +1,28 @@
ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG
obj-y := lpar.o hvCall.o nvram.o reconfig.o \
setup.o iommu.o event_sources.o ras.o \
firmware.o power.o dlpar.o mobility.o rng.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_SCANLOG) += scanlog.o
obj-$(CONFIG_EEH) += eeh_pseries.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_PCI) += pci.o pci_dlpar.o
obj-$(CONFIG_PSERIES_MSI) += msi.o
obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o
obj-$(CONFIG_HOTPLUG_CPU) += hotplug-cpu.o
obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o
obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o
obj-$(CONFIG_HVCS) += hvcserver.o
obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_DTL) += dtl.o
obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o
obj-$(CONFIG_LPARCFG) += lparcfg.o
ifeq ($(CONFIG_PPC_PSERIES),y)
obj-$(CONFIG_SUSPEND) += suspend.o
endif

View file

@ -0,0 +1,739 @@
/*
* Collaborative memory management interface.
*
* Copyright (C) 2008 IBM Corporation
* Author(s): Brian King (brking@linux.vnet.ibm.com),
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/oom.h>
#include <linux/reboot.h>
#include <linux/sched.h>
#include <linux/stringify.h>
#include <linux/swap.h>
#include <linux/device.h>
#include <asm/firmware.h>
#include <asm/hvcall.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <linux/memory.h>
#include <asm/plpar_wrappers.h>
#define CMM_DRIVER_VERSION "1.0.0"
#define CMM_DEFAULT_DELAY 1
#define CMM_HOTPLUG_DELAY 5
#define CMM_DEBUG 0
#define CMM_DISABLE 0
#define CMM_OOM_KB 1024
#define CMM_MIN_MEM_MB 256
#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10))
#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
/*
* The priority level tries to ensure that this notifier is called as
* late as possible to reduce thrashing in the shared memory pool.
*/
#define CMM_MEM_HOTPLUG_PRI 1
#define CMM_MEM_ISOLATE_PRI 15
static unsigned int delay = CMM_DEFAULT_DELAY;
static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
static unsigned int oom_kb = CMM_OOM_KB;
static unsigned int cmm_debug = CMM_DEBUG;
static unsigned int cmm_disabled = CMM_DISABLE;
static unsigned long min_mem_mb = CMM_MIN_MEM_MB;
static struct device cmm_dev;
MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager");
MODULE_LICENSE("GPL");
MODULE_VERSION(CMM_DRIVER_VERSION);
module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
"[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove "
"before loaning resumes. "
"[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
"[Default=" __stringify(CMM_OOM_KB) "]");
module_param_named(min_mem_mb, min_mem_mb, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. "
"[Default=" __stringify(CMM_MIN_MEM_MB) "]");
module_param_named(debug, cmm_debug, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. "
"[Default=" __stringify(CMM_DEBUG) "]");
#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long))
#define cmm_dbg(...) if (cmm_debug) { printk(KERN_INFO "cmm: "__VA_ARGS__); }
struct cmm_page_array {
struct cmm_page_array *next;
unsigned long index;
unsigned long page[CMM_NR_PAGES];
};
static unsigned long loaned_pages;
static unsigned long loaned_pages_target;
static unsigned long oom_freed_pages;
static struct cmm_page_array *cmm_page_list;
static DEFINE_SPINLOCK(cmm_lock);
static DEFINE_MUTEX(hotplug_mutex);
static int hotplug_occurred; /* protected by the hotplug mutex */
static struct task_struct *cmm_thread_ptr;
/**
* cmm_alloc_pages - Allocate pages and mark them as loaned
* @nr: number of pages to allocate
*
* Return value:
* number of pages requested to be allocated which were not
**/
static long cmm_alloc_pages(long nr)
{
struct cmm_page_array *pa, *npa;
unsigned long addr;
long rc;
cmm_dbg("Begin request for %ld pages\n", nr);
while (nr) {
/* Exit if a hotplug operation is in progress or occurred */
if (mutex_trylock(&hotplug_mutex)) {
if (hotplug_occurred) {
mutex_unlock(&hotplug_mutex);
break;
}
mutex_unlock(&hotplug_mutex);
} else {
break;
}
addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY | __GFP_NOMEMALLOC);
if (!addr)
break;
spin_lock(&cmm_lock);
pa = cmm_page_list;
if (!pa || pa->index >= CMM_NR_PAGES) {
/* Need a new page for the page list. */
spin_unlock(&cmm_lock);
npa = (struct cmm_page_array *)__get_free_page(
GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY | __GFP_NOMEMALLOC);
if (!npa) {
pr_info("%s: Can not allocate new page list\n", __func__);
free_page(addr);
break;
}
spin_lock(&cmm_lock);
pa = cmm_page_list;
if (!pa || pa->index >= CMM_NR_PAGES) {
npa->next = pa;
npa->index = 0;
pa = npa;
cmm_page_list = pa;
} else
free_page((unsigned long) npa);
}
if ((rc = plpar_page_set_loaned(__pa(addr)))) {
pr_err("%s: Can not set page to loaned. rc=%ld\n", __func__, rc);
spin_unlock(&cmm_lock);
free_page(addr);
break;
}
pa->page[pa->index++] = addr;
loaned_pages++;
totalram_pages--;
spin_unlock(&cmm_lock);
nr--;
}
cmm_dbg("End request with %ld pages unfulfilled\n", nr);
return nr;
}
/**
* cmm_free_pages - Free pages and mark them as active
* @nr: number of pages to free
*
* Return value:
* number of pages requested to be freed which were not
**/
static long cmm_free_pages(long nr)
{
struct cmm_page_array *pa;
unsigned long addr;
cmm_dbg("Begin free of %ld pages.\n", nr);
spin_lock(&cmm_lock);
pa = cmm_page_list;
while (nr) {
if (!pa || pa->index <= 0)
break;
addr = pa->page[--pa->index];
if (pa->index == 0) {
pa = pa->next;
free_page((unsigned long) cmm_page_list);
cmm_page_list = pa;
}
plpar_page_set_active(__pa(addr));
free_page(addr);
loaned_pages--;
nr--;
totalram_pages++;
}
spin_unlock(&cmm_lock);
cmm_dbg("End request with %ld pages unfulfilled\n", nr);
return nr;
}
/**
* cmm_oom_notify - OOM notifier
* @self: notifier block struct
* @dummy: not used
* @parm: returned - number of pages freed
*
* Return value:
* NOTIFY_OK
**/
static int cmm_oom_notify(struct notifier_block *self,
unsigned long dummy, void *parm)
{
unsigned long *freed = parm;
long nr = KB2PAGES(oom_kb);
cmm_dbg("OOM processing started\n");
nr = cmm_free_pages(nr);
loaned_pages_target = loaned_pages;
*freed += KB2PAGES(oom_kb) - nr;
oom_freed_pages += KB2PAGES(oom_kb) - nr;
cmm_dbg("OOM processing complete\n");
return NOTIFY_OK;
}
/**
* cmm_get_mpp - Read memory performance parameters
*
* Makes hcall to query the current page loan request from the hypervisor.
*
* Return value:
* nothing
**/
static void cmm_get_mpp(void)
{
int rc;
struct hvcall_mpp_data mpp_data;
signed long active_pages_target, page_loan_request, target;
signed long total_pages = totalram_pages + loaned_pages;
signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE;
rc = h_get_mpp(&mpp_data);
if (rc != H_SUCCESS)
return;
page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
target = page_loan_request + (signed long)loaned_pages;
if (target < 0 || total_pages < min_mem_pages)
target = 0;
if (target > oom_freed_pages)
target -= oom_freed_pages;
else
target = 0;
active_pages_target = total_pages - target;
if (min_mem_pages > active_pages_target)
target = total_pages - min_mem_pages;
if (target < 0)
target = 0;
loaned_pages_target = target;
cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
page_loan_request, loaned_pages, loaned_pages_target,
oom_freed_pages, totalram_pages);
}
static struct notifier_block cmm_oom_nb = {
.notifier_call = cmm_oom_notify
};
/**
* cmm_thread - CMM task thread
* @dummy: not used
*
* Return value:
* 0
**/
static int cmm_thread(void *dummy)
{
unsigned long timeleft;
while (1) {
timeleft = msleep_interruptible(delay * 1000);
if (kthread_should_stop() || timeleft)
break;
if (mutex_trylock(&hotplug_mutex)) {
if (hotplug_occurred) {
hotplug_occurred = 0;
mutex_unlock(&hotplug_mutex);
cmm_dbg("Hotplug operation has occurred, "
"loaning activity suspended "
"for %d seconds.\n",
hotplug_delay);
timeleft = msleep_interruptible(hotplug_delay *
1000);
if (kthread_should_stop() || timeleft)
break;
continue;
}
mutex_unlock(&hotplug_mutex);
} else {
cmm_dbg("Hotplug operation in progress, activity "
"suspended\n");
continue;
}
cmm_get_mpp();
if (loaned_pages_target > loaned_pages) {
if (cmm_alloc_pages(loaned_pages_target - loaned_pages))
loaned_pages_target = loaned_pages;
} else if (loaned_pages_target < loaned_pages)
cmm_free_pages(loaned_pages - loaned_pages_target);
}
return 0;
}
#define CMM_SHOW(name, format, args...) \
static ssize_t show_##name(struct device *dev, \
struct device_attribute *attr, \
char *buf) \
{ \
return sprintf(buf, format, ##args); \
} \
static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages));
CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target));
static ssize_t show_oom_pages(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages));
}
static ssize_t store_oom_pages(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
unsigned long val = simple_strtoul (buf, NULL, 10);
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (val != 0)
return -EBADMSG;
oom_freed_pages = 0;
return count;
}
static DEVICE_ATTR(oom_freed_kb, S_IWUSR | S_IRUGO,
show_oom_pages, store_oom_pages);
static struct device_attribute *cmm_attrs[] = {
&dev_attr_loaned_kb,
&dev_attr_loaned_target_kb,
&dev_attr_oom_freed_kb,
};
static struct bus_type cmm_subsys = {
.name = "cmm",
.dev_name = "cmm",
};
/**
* cmm_sysfs_register - Register with sysfs
*
* Return value:
* 0 on success / other on failure
**/
static int cmm_sysfs_register(struct device *dev)
{
int i, rc;
if ((rc = subsys_system_register(&cmm_subsys, NULL)))
return rc;
dev->id = 0;
dev->bus = &cmm_subsys;
if ((rc = device_register(dev)))
goto subsys_unregister;
for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) {
if ((rc = device_create_file(dev, cmm_attrs[i])))
goto fail;
}
return 0;
fail:
while (--i >= 0)
device_remove_file(dev, cmm_attrs[i]);
device_unregister(dev);
subsys_unregister:
bus_unregister(&cmm_subsys);
return rc;
}
/**
* cmm_unregister_sysfs - Unregister from sysfs
*
**/
static void cmm_unregister_sysfs(struct device *dev)
{
int i;
for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++)
device_remove_file(dev, cmm_attrs[i]);
device_unregister(dev);
bus_unregister(&cmm_subsys);
}
/**
* cmm_reboot_notifier - Make sure pages are not still marked as "loaned"
*
**/
static int cmm_reboot_notifier(struct notifier_block *nb,
unsigned long action, void *unused)
{
if (action == SYS_RESTART) {
if (cmm_thread_ptr)
kthread_stop(cmm_thread_ptr);
cmm_thread_ptr = NULL;
cmm_free_pages(loaned_pages);
}
return NOTIFY_DONE;
}
static struct notifier_block cmm_reboot_nb = {
.notifier_call = cmm_reboot_notifier,
};
/**
* cmm_count_pages - Count the number of pages loaned in a particular range.
*
* @arg: memory_isolate_notify structure with address range and count
*
* Return value:
* 0 on success
**/
static unsigned long cmm_count_pages(void *arg)
{
struct memory_isolate_notify *marg = arg;
struct cmm_page_array *pa;
unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn);
unsigned long end = start + (marg->nr_pages << PAGE_SHIFT);
unsigned long idx;
spin_lock(&cmm_lock);
pa = cmm_page_list;
while (pa) {
if ((unsigned long)pa >= start && (unsigned long)pa < end)
marg->pages_found++;
for (idx = 0; idx < pa->index; idx++)
if (pa->page[idx] >= start && pa->page[idx] < end)
marg->pages_found++;
pa = pa->next;
}
spin_unlock(&cmm_lock);
return 0;
}
/**
* cmm_memory_isolate_cb - Handle memory isolation notifier calls
* @self: notifier block struct
* @action: action to take
* @arg: struct memory_isolate_notify data for handler
*
* Return value:
* NOTIFY_OK or notifier error based on subfunction return value
**/
static int cmm_memory_isolate_cb(struct notifier_block *self,
unsigned long action, void *arg)
{
int ret = 0;
if (action == MEM_ISOLATE_COUNT)
ret = cmm_count_pages(arg);
return notifier_from_errno(ret);
}
static struct notifier_block cmm_mem_isolate_nb = {
.notifier_call = cmm_memory_isolate_cb,
.priority = CMM_MEM_ISOLATE_PRI
};
/**
* cmm_mem_going_offline - Unloan pages where memory is to be removed
* @arg: memory_notify structure with page range to be offlined
*
* Return value:
* 0 on success
**/
static int cmm_mem_going_offline(void *arg)
{
struct memory_notify *marg = arg;
unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
struct cmm_page_array *pa_curr, *pa_last, *npa;
unsigned long idx;
unsigned long freed = 0;
cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
start_page, marg->nr_pages);
spin_lock(&cmm_lock);
/* Search the page list for pages in the range to be offlined */
pa_last = pa_curr = cmm_page_list;
while (pa_curr) {
for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
if ((pa_curr->page[idx] < start_page) ||
(pa_curr->page[idx] >= end_page))
continue;
plpar_page_set_active(__pa(pa_curr->page[idx]));
free_page(pa_curr->page[idx]);
freed++;
loaned_pages--;
totalram_pages++;
pa_curr->page[idx] = pa_last->page[--pa_last->index];
if (pa_last->index == 0) {
if (pa_curr == pa_last)
pa_curr = pa_last->next;
pa_last = pa_last->next;
free_page((unsigned long)cmm_page_list);
cmm_page_list = pa_last;
}
}
pa_curr = pa_curr->next;
}
/* Search for page list structures in the range to be offlined */
pa_last = NULL;
pa_curr = cmm_page_list;
while (pa_curr) {
if (((unsigned long)pa_curr >= start_page) &&
((unsigned long)pa_curr < end_page)) {
npa = (struct cmm_page_array *)__get_free_page(
GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY | __GFP_NOMEMALLOC);
if (!npa) {
spin_unlock(&cmm_lock);
cmm_dbg("Failed to allocate memory for list "
"management. Memory hotplug "
"failed.\n");
return ENOMEM;
}
memcpy(npa, pa_curr, PAGE_SIZE);
if (pa_curr == cmm_page_list)
cmm_page_list = npa;
if (pa_last)
pa_last->next = npa;
free_page((unsigned long) pa_curr);
freed++;
pa_curr = npa;
}
pa_last = pa_curr;
pa_curr = pa_curr->next;
}
spin_unlock(&cmm_lock);
cmm_dbg("Released %ld pages in the search range.\n", freed);
return 0;
}
/**
* cmm_memory_cb - Handle memory hotplug notifier calls
* @self: notifier block struct
* @action: action to take
* @arg: struct memory_notify data for handler
*
* Return value:
* NOTIFY_OK or notifier error based on subfunction return value
*
**/
static int cmm_memory_cb(struct notifier_block *self,
unsigned long action, void *arg)
{
int ret = 0;
switch (action) {
case MEM_GOING_OFFLINE:
mutex_lock(&hotplug_mutex);
hotplug_occurred = 1;
ret = cmm_mem_going_offline(arg);
break;
case MEM_OFFLINE:
case MEM_CANCEL_OFFLINE:
mutex_unlock(&hotplug_mutex);
cmm_dbg("Memory offline operation complete.\n");
break;
case MEM_GOING_ONLINE:
case MEM_ONLINE:
case MEM_CANCEL_ONLINE:
break;
}
return notifier_from_errno(ret);
}
static struct notifier_block cmm_mem_nb = {
.notifier_call = cmm_memory_cb,
.priority = CMM_MEM_HOTPLUG_PRI
};
/**
* cmm_init - Module initialization
*
* Return value:
* 0 on success / other on failure
**/
static int cmm_init(void)
{
int rc = -ENOMEM;
if (!firmware_has_feature(FW_FEATURE_CMO))
return -EOPNOTSUPP;
if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0)
return rc;
if ((rc = register_reboot_notifier(&cmm_reboot_nb)))
goto out_oom_notifier;
if ((rc = cmm_sysfs_register(&cmm_dev)))
goto out_reboot_notifier;
if (register_memory_notifier(&cmm_mem_nb) ||
register_memory_isolate_notifier(&cmm_mem_isolate_nb))
goto out_unregister_notifier;
if (cmm_disabled)
return rc;
cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (IS_ERR(cmm_thread_ptr)) {
rc = PTR_ERR(cmm_thread_ptr);
goto out_unregister_notifier;
}
return rc;
out_unregister_notifier:
unregister_memory_notifier(&cmm_mem_nb);
unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_unregister_sysfs(&cmm_dev);
out_reboot_notifier:
unregister_reboot_notifier(&cmm_reboot_nb);
out_oom_notifier:
unregister_oom_notifier(&cmm_oom_nb);
return rc;
}
/**
* cmm_exit - Module exit
*
* Return value:
* nothing
**/
static void cmm_exit(void)
{
if (cmm_thread_ptr)
kthread_stop(cmm_thread_ptr);
unregister_oom_notifier(&cmm_oom_nb);
unregister_reboot_notifier(&cmm_reboot_nb);
unregister_memory_notifier(&cmm_mem_nb);
unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_free_pages(loaned_pages);
cmm_unregister_sysfs(&cmm_dev);
}
/**
* cmm_set_disable - Disable/Enable CMM
*
* Return value:
* 0 on success / other on failure
**/
static int cmm_set_disable(const char *val, struct kernel_param *kp)
{
int disable = simple_strtoul(val, NULL, 10);
if (disable != 0 && disable != 1)
return -EINVAL;
if (disable && !cmm_disabled) {
if (cmm_thread_ptr)
kthread_stop(cmm_thread_ptr);
cmm_thread_ptr = NULL;
cmm_free_pages(loaned_pages);
} else if (!disable && cmm_disabled) {
cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (IS_ERR(cmm_thread_ptr))
return PTR_ERR(cmm_thread_ptr);
}
cmm_disabled = disable;
return 0;
}
module_param_call(disable, cmm_set_disable, param_get_uint,
&cmm_disabled, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable, "Disable CMM. Set to 1 to disable. "
"[Default=" __stringify(CMM_DISABLE) "]");
module_init(cmm_init);
module_exit(cmm_exit);

View file

@ -0,0 +1,547 @@
/*
* Support for dynamic reconfiguration for PCI, Memory, and CPU
* Hotplug and Dynamic Logical Partitioning on RPA platforms.
*
* Copyright (C) 2009 Nathan Fontenot
* Copyright (C) 2009 IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/of.h>
#include "offline_states.h"
#include "pseries.h"
#include <asm/prom.h>
#include <asm/machdep.h>
#include <asm/uaccess.h>
#include <asm/rtas.h>
struct cc_workarea {
__be32 drc_index;
__be32 zero;
__be32 name_offset;
__be32 prop_length;
__be32 prop_offset;
};
void dlpar_free_cc_property(struct property *prop)
{
kfree(prop->name);
kfree(prop->value);
kfree(prop);
}
static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa)
{
struct property *prop;
char *name;
char *value;
prop = kzalloc(sizeof(*prop), GFP_KERNEL);
if (!prop)
return NULL;
name = (char *)ccwa + be32_to_cpu(ccwa->name_offset);
prop->name = kstrdup(name, GFP_KERNEL);
prop->length = be32_to_cpu(ccwa->prop_length);
value = (char *)ccwa + be32_to_cpu(ccwa->prop_offset);
prop->value = kmemdup(value, prop->length, GFP_KERNEL);
if (!prop->value) {
dlpar_free_cc_property(prop);
return NULL;
}
return prop;
}
static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa,
const char *path)
{
struct device_node *dn;
char *name;
/* If parent node path is "/" advance path to NULL terminator to
* prevent double leading slashs in full_name.
*/
if (!path[1])
path++;
dn = kzalloc(sizeof(*dn), GFP_KERNEL);
if (!dn)
return NULL;
name = (char *)ccwa + be32_to_cpu(ccwa->name_offset);
dn->full_name = kasprintf(GFP_KERNEL, "%s/%s", path, name);
if (!dn->full_name) {
kfree(dn);
return NULL;
}
of_node_set_flag(dn, OF_DYNAMIC);
of_node_init(dn);
return dn;
}
static void dlpar_free_one_cc_node(struct device_node *dn)
{
struct property *prop;
while (dn->properties) {
prop = dn->properties;
dn->properties = prop->next;
dlpar_free_cc_property(prop);
}
kfree(dn->full_name);
kfree(dn);
}
void dlpar_free_cc_nodes(struct device_node *dn)
{
if (dn->child)
dlpar_free_cc_nodes(dn->child);
if (dn->sibling)
dlpar_free_cc_nodes(dn->sibling);
dlpar_free_one_cc_node(dn);
}
#define COMPLETE 0
#define NEXT_SIBLING 1
#define NEXT_CHILD 2
#define NEXT_PROPERTY 3
#define PREV_PARENT 4
#define MORE_MEMORY 5
#define CALL_AGAIN -2
#define ERR_CFG_USE -9003
struct device_node *dlpar_configure_connector(__be32 drc_index,
struct device_node *parent)
{
struct device_node *dn;
struct device_node *first_dn = NULL;
struct device_node *last_dn = NULL;
struct property *property;
struct property *last_property = NULL;
struct cc_workarea *ccwa;
char *data_buf;
const char *parent_path = parent->full_name;
int cc_token;
int rc = -1;
cc_token = rtas_token("ibm,configure-connector");
if (cc_token == RTAS_UNKNOWN_SERVICE)
return NULL;
data_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
if (!data_buf)
return NULL;
ccwa = (struct cc_workarea *)&data_buf[0];
ccwa->drc_index = drc_index;
ccwa->zero = 0;
do {
/* Since we release the rtas_data_buf lock between configure
* connector calls we want to re-populate the rtas_data_buffer
* with the contents of the previous call.
*/
spin_lock(&rtas_data_buf_lock);
memcpy(rtas_data_buf, data_buf, RTAS_DATA_BUF_SIZE);
rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL);
memcpy(data_buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
spin_unlock(&rtas_data_buf_lock);
switch (rc) {
case COMPLETE:
break;
case NEXT_SIBLING:
dn = dlpar_parse_cc_node(ccwa, parent_path);
if (!dn)
goto cc_error;
dn->parent = last_dn->parent;
last_dn->sibling = dn;
last_dn = dn;
break;
case NEXT_CHILD:
if (first_dn)
parent_path = last_dn->full_name;
dn = dlpar_parse_cc_node(ccwa, parent_path);
if (!dn)
goto cc_error;
if (!first_dn) {
dn->parent = parent;
first_dn = dn;
} else {
dn->parent = last_dn;
if (last_dn)
last_dn->child = dn;
}
last_dn = dn;
break;
case NEXT_PROPERTY:
property = dlpar_parse_cc_property(ccwa);
if (!property)
goto cc_error;
if (!last_dn->properties)
last_dn->properties = property;
else
last_property->next = property;
last_property = property;
break;
case PREV_PARENT:
last_dn = last_dn->parent;
parent_path = last_dn->parent->full_name;
break;
case CALL_AGAIN:
break;
case MORE_MEMORY:
case ERR_CFG_USE:
default:
printk(KERN_ERR "Unexpected Error (%d) "
"returned from configure-connector\n", rc);
goto cc_error;
}
} while (rc);
cc_error:
kfree(data_buf);
if (rc) {
if (first_dn)
dlpar_free_cc_nodes(first_dn);
return NULL;
}
return first_dn;
}
static struct device_node *derive_parent(const char *path)
{
struct device_node *parent;
char *last_slash;
last_slash = strrchr(path, '/');
if (last_slash == path) {
parent = of_find_node_by_path("/");
} else {
char *parent_path;
int parent_path_len = last_slash - path + 1;
parent_path = kmalloc(parent_path_len, GFP_KERNEL);
if (!parent_path)
return NULL;
strlcpy(parent_path, path, parent_path_len);
parent = of_find_node_by_path(parent_path);
kfree(parent_path);
}
return parent;
}
int dlpar_attach_node(struct device_node *dn)
{
int rc;
dn->parent = derive_parent(dn->full_name);
if (!dn->parent)
return -ENOMEM;
rc = of_attach_node(dn);
if (rc) {
printk(KERN_ERR "Failed to add device node %s\n",
dn->full_name);
return rc;
}
of_node_put(dn->parent);
return 0;
}
int dlpar_detach_node(struct device_node *dn)
{
struct device_node *child;
int rc;
child = of_get_next_child(dn, NULL);
while (child) {
dlpar_detach_node(child);
child = of_get_next_child(dn, child);
}
rc = of_detach_node(dn);
if (rc)
return rc;
of_node_put(dn); /* Must decrement the refcount */
return 0;
}
#define DR_ENTITY_SENSE 9003
#define DR_ENTITY_PRESENT 1
#define DR_ENTITY_UNUSABLE 2
#define ALLOCATION_STATE 9003
#define ALLOC_UNUSABLE 0
#define ALLOC_USABLE 1
#define ISOLATION_STATE 9001
#define ISOLATE 0
#define UNISOLATE 1
int dlpar_acquire_drc(u32 drc_index)
{
int dr_status, rc;
rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
DR_ENTITY_SENSE, drc_index);
if (rc || dr_status != DR_ENTITY_UNUSABLE)
return -1;
rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE);
if (rc)
return rc;
rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
if (rc) {
rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
return rc;
}
return 0;
}
int dlpar_release_drc(u32 drc_index)
{
int dr_status, rc;
rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
DR_ENTITY_SENSE, drc_index);
if (rc || dr_status != DR_ENTITY_PRESENT)
return -1;
rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE);
if (rc)
return rc;
rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
if (rc) {
rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
return rc;
}
return 0;
}
#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
static int dlpar_online_cpu(struct device_node *dn)
{
int rc = 0;
unsigned int cpu;
int len, nthreads, i;
const __be32 *intserv;
u32 thread;
intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
if (!intserv)
return -EINVAL;
nthreads = len / sizeof(u32);
cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
thread = be32_to_cpu(intserv[i]);
for_each_present_cpu(cpu) {
if (get_hard_smp_processor_id(cpu) != thread)
continue;
BUG_ON(get_cpu_current_state(cpu)
!= CPU_STATE_OFFLINE);
cpu_maps_update_done();
rc = device_online(get_cpu_device(cpu));
if (rc)
goto out;
cpu_maps_update_begin();
break;
}
if (cpu == num_possible_cpus())
printk(KERN_WARNING "Could not find cpu to online "
"with physical id 0x%x\n", thread);
}
cpu_maps_update_done();
out:
return rc;
}
static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
{
struct device_node *dn, *parent;
u32 drc_index;
int rc;
rc = kstrtou32(buf, 0, &drc_index);
if (rc)
return -EINVAL;
parent = of_find_node_by_path("/cpus");
if (!parent)
return -ENODEV;
dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
if (!dn)
return -EINVAL;
of_node_put(parent);
rc = dlpar_acquire_drc(drc_index);
if (rc) {
dlpar_free_cc_nodes(dn);
return -EINVAL;
}
rc = dlpar_attach_node(dn);
if (rc) {
dlpar_release_drc(drc_index);
dlpar_free_cc_nodes(dn);
return rc;
}
rc = dlpar_online_cpu(dn);
if (rc)
return rc;
return count;
}
static int dlpar_offline_cpu(struct device_node *dn)
{
int rc = 0;
unsigned int cpu;
int len, nthreads, i;
const __be32 *intserv;
u32 thread;
intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
if (!intserv)
return -EINVAL;
nthreads = len / sizeof(u32);
cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
thread = be32_to_cpu(intserv[i]);
for_each_present_cpu(cpu) {
if (get_hard_smp_processor_id(cpu) != thread)
continue;
if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE)
break;
if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
cpu_maps_update_done();
rc = device_offline(get_cpu_device(cpu));
if (rc)
goto out;
cpu_maps_update_begin();
break;
}
/*
* The cpu is in CPU_STATE_INACTIVE.
* Upgrade it's state to CPU_STATE_OFFLINE.
*/
set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
BUG_ON(plpar_hcall_norets(H_PROD, thread)
!= H_SUCCESS);
__cpu_die(cpu);
break;
}
if (cpu == num_possible_cpus())
printk(KERN_WARNING "Could not find cpu to offline "
"with physical id 0x%x\n", thread);
}
cpu_maps_update_done();
out:
return rc;
}
static ssize_t dlpar_cpu_release(const char *buf, size_t count)
{
struct device_node *dn;
u32 drc_index;
int rc;
dn = of_find_node_by_path(buf);
if (!dn)
return -EINVAL;
rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index);
if (rc) {
of_node_put(dn);
return -EINVAL;
}
rc = dlpar_offline_cpu(dn);
if (rc) {
of_node_put(dn);
return -EINVAL;
}
rc = dlpar_release_drc(drc_index);
if (rc) {
of_node_put(dn);
return rc;
}
rc = dlpar_detach_node(dn);
if (rc) {
dlpar_acquire_drc(drc_index);
return rc;
}
of_node_put(dn);
return count;
}
static int __init pseries_dlpar_init(void)
{
ppc_md.cpu_probe = dlpar_cpu_probe;
ppc_md.cpu_release = dlpar_cpu_release;
return 0;
}
machine_device_initcall(pseries, pseries_dlpar_init);
#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */

View file

@ -0,0 +1,395 @@
/*
* Virtual Processor Dispatch Trace Log
*
* (C) Copyright IBM Corporation 2009
*
* Author: Jeremy Kerr <jk@ozlabs.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/spinlock.h>
#include <asm/smp.h>
#include <asm/uaccess.h>
#include <asm/firmware.h>
#include <asm/lppaca.h>
#include <asm/debug.h>
#include <asm/plpar_wrappers.h>
#include <asm/machdep.h>
struct dtl {
struct dtl_entry *buf;
struct dentry *file;
int cpu;
int buf_entries;
u64 last_idx;
spinlock_t lock;
};
static DEFINE_PER_CPU(struct dtl, cpu_dtl);
/*
* Dispatch trace log event mask:
* 0x7: 0x1: voluntary virtual processor waits
* 0x2: time-slice preempts
* 0x4: virtual partition memory page faults
*/
static u8 dtl_event_mask = 0x7;
/*
* Size of per-cpu log buffers. Firmware requires that the buffer does
* not cross a 4k boundary.
*/
static int dtl_buf_entries = N_DISPATCH_LOG;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
struct dtl_ring {
u64 write_index;
struct dtl_entry *write_ptr;
struct dtl_entry *buf;
struct dtl_entry *buf_end;
u8 saved_dtl_mask;
};
static DEFINE_PER_CPU(struct dtl_ring, dtl_rings);
static atomic_t dtl_count;
/*
* The cpu accounting code controls the DTL ring buffer, and we get
* given entries as they are processed.
*/
static void consume_dtle(struct dtl_entry *dtle, u64 index)
{
struct dtl_ring *dtlr = &__get_cpu_var(dtl_rings);
struct dtl_entry *wp = dtlr->write_ptr;
struct lppaca *vpa = local_paca->lppaca_ptr;
if (!wp)
return;
*wp = *dtle;
barrier();
/* check for hypervisor ring buffer overflow, ignore this entry if so */
if (index + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx))
return;
++wp;
if (wp == dtlr->buf_end)
wp = dtlr->buf;
dtlr->write_ptr = wp;
/* incrementing write_index makes the new entry visible */
smp_wmb();
++dtlr->write_index;
}
static int dtl_start(struct dtl *dtl)
{
struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
dtlr->buf = dtl->buf;
dtlr->buf_end = dtl->buf + dtl->buf_entries;
dtlr->write_index = 0;
/* setting write_ptr enables logging into our buffer */
smp_wmb();
dtlr->write_ptr = dtl->buf;
/* enable event logging */
dtlr->saved_dtl_mask = lppaca_of(dtl->cpu).dtl_enable_mask;
lppaca_of(dtl->cpu).dtl_enable_mask |= dtl_event_mask;
dtl_consumer = consume_dtle;
atomic_inc(&dtl_count);
return 0;
}
static void dtl_stop(struct dtl *dtl)
{
struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
dtlr->write_ptr = NULL;
smp_wmb();
dtlr->buf = NULL;
/* restore dtl_enable_mask */
lppaca_of(dtl->cpu).dtl_enable_mask = dtlr->saved_dtl_mask;
if (atomic_dec_and_test(&dtl_count))
dtl_consumer = NULL;
}
static u64 dtl_current_index(struct dtl *dtl)
{
return per_cpu(dtl_rings, dtl->cpu).write_index;
}
#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static int dtl_start(struct dtl *dtl)
{
unsigned long addr;
int ret, hwcpu;
/* Register our dtl buffer with the hypervisor. The HV expects the
* buffer size to be passed in the second word of the buffer */
((u32 *)dtl->buf)[1] = DISPATCH_LOG_BYTES;
hwcpu = get_hard_smp_processor_id(dtl->cpu);
addr = __pa(dtl->buf);
ret = register_dtl(hwcpu, addr);
if (ret) {
printk(KERN_WARNING "%s: DTL registration for cpu %d (hw %d) "
"failed with %d\n", __func__, dtl->cpu, hwcpu, ret);
return -EIO;
}
/* set our initial buffer indices */
lppaca_of(dtl->cpu).dtl_idx = 0;
/* ensure that our updates to the lppaca fields have occurred before
* we actually enable the logging */
smp_wmb();
/* enable event logging */
lppaca_of(dtl->cpu).dtl_enable_mask = dtl_event_mask;
return 0;
}
static void dtl_stop(struct dtl *dtl)
{
int hwcpu = get_hard_smp_processor_id(dtl->cpu);
lppaca_of(dtl->cpu).dtl_enable_mask = 0x0;
unregister_dtl(hwcpu);
}
static u64 dtl_current_index(struct dtl *dtl)
{
return lppaca_of(dtl->cpu).dtl_idx;
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static int dtl_enable(struct dtl *dtl)
{
long int n_entries;
long int rc;
struct dtl_entry *buf = NULL;
if (!dtl_cache)
return -ENOMEM;
/* only allow one reader */
if (dtl->buf)
return -EBUSY;
n_entries = dtl_buf_entries;
buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(dtl->cpu));
if (!buf) {
printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n",
__func__, dtl->cpu);
return -ENOMEM;
}
spin_lock(&dtl->lock);
rc = -EBUSY;
if (!dtl->buf) {
/* store the original allocation size for use during read */
dtl->buf_entries = n_entries;
dtl->buf = buf;
dtl->last_idx = 0;
rc = dtl_start(dtl);
if (rc)
dtl->buf = NULL;
}
spin_unlock(&dtl->lock);
if (rc)
kmem_cache_free(dtl_cache, buf);
return rc;
}
static void dtl_disable(struct dtl *dtl)
{
spin_lock(&dtl->lock);
dtl_stop(dtl);
kmem_cache_free(dtl_cache, dtl->buf);
dtl->buf = NULL;
dtl->buf_entries = 0;
spin_unlock(&dtl->lock);
}
/* file interface */
static int dtl_file_open(struct inode *inode, struct file *filp)
{
struct dtl *dtl = inode->i_private;
int rc;
rc = dtl_enable(dtl);
if (rc)
return rc;
filp->private_data = dtl;
return 0;
}
static int dtl_file_release(struct inode *inode, struct file *filp)
{
struct dtl *dtl = inode->i_private;
dtl_disable(dtl);
return 0;
}
static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len,
loff_t *pos)
{
long int rc, n_read, n_req, read_size;
struct dtl *dtl;
u64 cur_idx, last_idx, i;
if ((len % sizeof(struct dtl_entry)) != 0)
return -EINVAL;
dtl = filp->private_data;
/* requested number of entries to read */
n_req = len / sizeof(struct dtl_entry);
/* actual number of entries read */
n_read = 0;
spin_lock(&dtl->lock);
cur_idx = dtl_current_index(dtl);
last_idx = dtl->last_idx;
if (last_idx + dtl->buf_entries <= cur_idx)
last_idx = cur_idx - dtl->buf_entries + 1;
if (last_idx + n_req > cur_idx)
n_req = cur_idx - last_idx;
if (n_req > 0)
dtl->last_idx = last_idx + n_req;
spin_unlock(&dtl->lock);
if (n_req <= 0)
return 0;
i = last_idx % dtl->buf_entries;
/* read the tail of the buffer if we've wrapped */
if (i + n_req > dtl->buf_entries) {
read_size = dtl->buf_entries - i;
rc = copy_to_user(buf, &dtl->buf[i],
read_size * sizeof(struct dtl_entry));
if (rc)
return -EFAULT;
i = 0;
n_req -= read_size;
n_read += read_size;
buf += read_size * sizeof(struct dtl_entry);
}
/* .. and now the head */
rc = copy_to_user(buf, &dtl->buf[i], n_req * sizeof(struct dtl_entry));
if (rc)
return -EFAULT;
n_read += n_req;
return n_read * sizeof(struct dtl_entry);
}
static const struct file_operations dtl_fops = {
.open = dtl_file_open,
.release = dtl_file_release,
.read = dtl_file_read,
.llseek = no_llseek,
};
static struct dentry *dtl_dir;
static int dtl_setup_file(struct dtl *dtl)
{
char name[10];
sprintf(name, "cpu-%d", dtl->cpu);
dtl->file = debugfs_create_file(name, 0400, dtl_dir, dtl, &dtl_fops);
if (!dtl->file)
return -ENOMEM;
return 0;
}
static int dtl_init(void)
{
struct dentry *event_mask_file, *buf_entries_file;
int rc, i;
if (!firmware_has_feature(FW_FEATURE_SPLPAR))
return -ENODEV;
/* set up common debugfs structure */
rc = -ENOMEM;
dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root);
if (!dtl_dir) {
printk(KERN_WARNING "%s: can't create dtl root dir\n",
__func__);
goto err;
}
event_mask_file = debugfs_create_x8("dtl_event_mask", 0600,
dtl_dir, &dtl_event_mask);
buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0400,
dtl_dir, &dtl_buf_entries);
if (!event_mask_file || !buf_entries_file) {
printk(KERN_WARNING "%s: can't create dtl files\n", __func__);
goto err_remove_dir;
}
/* set up the per-cpu log structures */
for_each_possible_cpu(i) {
struct dtl *dtl = &per_cpu(cpu_dtl, i);
spin_lock_init(&dtl->lock);
dtl->cpu = i;
rc = dtl_setup_file(dtl);
if (rc)
goto err_remove_dir;
}
return 0;
err_remove_dir:
debugfs_remove_recursive(dtl_dir);
err:
return rc;
}
machine_arch_initcall(pseries, dtl_init);

View file

@ -0,0 +1,745 @@
/*
* The file intends to implement the platform dependent EEH operations on pseries.
* Actually, the pseries platform is built based on RTAS heavily. That means the
* pseries platform dependent EEH operations will be built on RTAS calls. The functions
* are devired from arch/powerpc/platforms/pseries/eeh.c and necessary cleanup has
* been done.
*
* Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2011.
* Copyright IBM Corporation 2001, 2005, 2006
* Copyright Dave Engebretsen & Todd Inglett 2001
* Copyright Linas Vepstas 2005, 2006
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/atomic.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/of.h>
#include <linux/pci.h>
#include <linux/proc_fs.h>
#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
#include <asm/io.h>
#include <asm/machdep.h>
#include <asm/ppc-pci.h>
#include <asm/rtas.h>
/* RTAS tokens */
static int ibm_set_eeh_option;
static int ibm_set_slot_reset;
static int ibm_read_slot_reset_state;
static int ibm_read_slot_reset_state2;
static int ibm_slot_error_detail;
static int ibm_get_config_addr_info;
static int ibm_get_config_addr_info2;
static int ibm_configure_bridge;
static int ibm_configure_pe;
/*
* Buffer for reporting slot-error-detail rtas calls. Its here
* in BSS, and not dynamically alloced, so that it ends up in
* RMO where RTAS can access it.
*/
static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(slot_errbuf_lock);
static int eeh_error_buf_size;
/**
* pseries_eeh_init - EEH platform dependent initialization
*
* EEH platform dependent initialization on pseries.
*/
static int pseries_eeh_init(void)
{
/* figure out EEH RTAS function call tokens */
ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");
ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2");
ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");
ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2");
ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info");
ibm_configure_pe = rtas_token("ibm,configure-pe");
ibm_configure_bridge = rtas_token("ibm,configure-bridge");
/*
* Necessary sanity check. We needn't check "get-config-addr-info"
* and its variant since the old firmware probably support address
* of domain/bus/slot/function for EEH RTAS operations.
*/
if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE ||
ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE ||
(ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE &&
ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) ||
ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE ||
(ibm_configure_pe == RTAS_UNKNOWN_SERVICE &&
ibm_configure_bridge == RTAS_UNKNOWN_SERVICE)) {
pr_info("EEH functionality not supported\n");
return -EINVAL;
}
/* Initialize error log lock and size */
spin_lock_init(&slot_errbuf_lock);
eeh_error_buf_size = rtas_token("rtas-error-log-max");
if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {
pr_info("%s: unknown EEH error log size\n",
__func__);
eeh_error_buf_size = 1024;
} else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {
pr_info("%s: EEH error log size %d exceeds the maximal %d\n",
__func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX);
eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
}
/* Set EEH probe mode */
eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG);
return 0;
}
static int pseries_eeh_cap_start(struct device_node *dn)
{
struct pci_dn *pdn = PCI_DN(dn);
u32 status;
if (!pdn)
return 0;
rtas_read_config(pdn, PCI_STATUS, 2, &status);
if (!(status & PCI_STATUS_CAP_LIST))
return 0;
return PCI_CAPABILITY_LIST;
}
static int pseries_eeh_find_cap(struct device_node *dn, int cap)
{
struct pci_dn *pdn = PCI_DN(dn);
int pos = pseries_eeh_cap_start(dn);
int cnt = 48; /* Maximal number of capabilities */
u32 id;
if (!pos)
return 0;
while (cnt--) {
rtas_read_config(pdn, pos, 1, &pos);
if (pos < 0x40)
break;
pos &= ~3;
rtas_read_config(pdn, pos + PCI_CAP_LIST_ID, 1, &id);
if (id == 0xff)
break;
if (id == cap)
return pos;
pos += PCI_CAP_LIST_NEXT;
}
return 0;
}
static int pseries_eeh_find_ecap(struct device_node *dn, int cap)
{
struct pci_dn *pdn = PCI_DN(dn);
struct eeh_dev *edev = of_node_to_eeh_dev(dn);
u32 header;
int pos = 256;
int ttl = (4096 - 256) / 8;
if (!edev || !edev->pcie_cap)
return 0;
if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
return 0;
else if (!header)
return 0;
while (ttl-- > 0) {
if (PCI_EXT_CAP_ID(header) == cap && pos)
return pos;
pos = PCI_EXT_CAP_NEXT(header);
if (pos < 256)
break;
if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
break;
}
return 0;
}
/**
* pseries_eeh_of_probe - EEH probe on the given device
* @dn: OF node
* @flag: Unused
*
* When EEH module is installed during system boot, all PCI devices
* are checked one by one to see if it supports EEH. The function
* is introduced for the purpose.
*/
static void *pseries_eeh_of_probe(struct device_node *dn, void *flag)
{
struct eeh_dev *edev;
struct eeh_pe pe;
struct pci_dn *pdn = PCI_DN(dn);
const __be32 *classp, *vendorp, *devicep;
u32 class_code;
const __be32 *regs;
u32 pcie_flags;
int enable = 0;
int ret;
/* Retrieve OF node and eeh device */
edev = of_node_to_eeh_dev(dn);
if (edev->pe || !of_device_is_available(dn))
return NULL;
/* Retrieve class/vendor/device IDs */
classp = of_get_property(dn, "class-code", NULL);
vendorp = of_get_property(dn, "vendor-id", NULL);
devicep = of_get_property(dn, "device-id", NULL);
/* Skip for bad OF node or PCI-ISA bridge */
if (!classp || !vendorp || !devicep)
return NULL;
if (dn->type && !strcmp(dn->type, "isa"))
return NULL;
class_code = of_read_number(classp, 1);
/*
* Update class code and mode of eeh device. We need
* correctly reflects that current device is root port
* or PCIe switch downstream port.
*/
edev->class_code = class_code;
edev->pcix_cap = pseries_eeh_find_cap(dn, PCI_CAP_ID_PCIX);
edev->pcie_cap = pseries_eeh_find_cap(dn, PCI_CAP_ID_EXP);
edev->aer_cap = pseries_eeh_find_ecap(dn, PCI_EXT_CAP_ID_ERR);
edev->mode &= 0xFFFFFF00;
if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
edev->mode |= EEH_DEV_BRIDGE;
if (edev->pcie_cap) {
rtas_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS,
2, &pcie_flags);
pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4;
if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT)
edev->mode |= EEH_DEV_ROOT_PORT;
else if (pcie_flags == PCI_EXP_TYPE_DOWNSTREAM)
edev->mode |= EEH_DEV_DS_PORT;
}
}
/* Retrieve the device address */
regs = of_get_property(dn, "reg", NULL);
if (!regs) {
pr_warn("%s: OF node property %s::reg not found\n",
__func__, dn->full_name);
return NULL;
}
/* Initialize the fake PE */
memset(&pe, 0, sizeof(struct eeh_pe));
pe.phb = edev->phb;
pe.config_addr = of_read_number(regs, 1);
/* Enable EEH on the device */
ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
if (!ret) {
edev->config_addr = of_read_number(regs, 1);
/* Retrieve PE address */
edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
pe.addr = edev->pe_config_addr;
/* Some older systems (Power4) allow the ibm,set-eeh-option
* call to succeed even on nodes where EEH is not supported.
* Verify support explicitly.
*/
ret = eeh_ops->get_state(&pe, NULL);
if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
enable = 1;
if (enable) {
eeh_add_flag(EEH_ENABLED);
eeh_add_to_parent_pe(edev);
pr_debug("%s: EEH enabled on %s PHB#%d-PE#%x, config addr#%x\n",
__func__, dn->full_name, pe.phb->global_number,
pe.addr, pe.config_addr);
} else if (dn->parent && of_node_to_eeh_dev(dn->parent) &&
(of_node_to_eeh_dev(dn->parent))->pe) {
/* This device doesn't support EEH, but it may have an
* EEH parent, in which case we mark it as supported.
*/
edev->config_addr = of_node_to_eeh_dev(dn->parent)->config_addr;
edev->pe_config_addr = of_node_to_eeh_dev(dn->parent)->pe_config_addr;
eeh_add_to_parent_pe(edev);
}
}
/* Save memory bars */
eeh_save_bars(edev);
return NULL;
}
/**
* pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable
* @pe: EEH PE
* @option: operation to be issued
*
* The function is used to control the EEH functionality globally.
* Currently, following options are support according to PAPR:
* Enable EEH, Disable EEH, Enable MMIO and Enable DMA
*/
static int pseries_eeh_set_option(struct eeh_pe *pe, int option)
{
int ret = 0;
int config_addr;
/*
* When we're enabling or disabling EEH functioality on
* the particular PE, the PE config address is possibly
* unavailable. Therefore, we have to figure it out from
* the FDT node.
*/
switch (option) {
case EEH_OPT_DISABLE:
case EEH_OPT_ENABLE:
case EEH_OPT_THAW_MMIO:
case EEH_OPT_THAW_DMA:
config_addr = pe->config_addr;
if (pe->addr)
config_addr = pe->addr;
break;
case EEH_OPT_FREEZE_PE:
/* Not support */
return 0;
default:
pr_err("%s: Invalid option %d\n",
__func__, option);
return -EINVAL;
}
ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
config_addr, BUID_HI(pe->phb->buid),
BUID_LO(pe->phb->buid), option);
return ret;
}
/**
* pseries_eeh_get_pe_addr - Retrieve PE address
* @pe: EEH PE
*
* Retrieve the assocated PE address. Actually, there're 2 RTAS
* function calls dedicated for the purpose. We need implement
* it through the new function and then the old one. Besides,
* you should make sure the config address is figured out from
* FDT node before calling the function.
*
* It's notable that zero'ed return value means invalid PE config
* address.
*/
static int pseries_eeh_get_pe_addr(struct eeh_pe *pe)
{
int ret = 0;
int rets[3];
if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
/*
* First of all, we need to make sure there has one PE
* associated with the device. Otherwise, PE address is
* meaningless.
*/
ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
pe->config_addr, BUID_HI(pe->phb->buid),
BUID_LO(pe->phb->buid), 1);
if (ret || (rets[0] == 0))
return 0;
/* Retrieve the associated PE config address */
ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
pe->config_addr, BUID_HI(pe->phb->buid),
BUID_LO(pe->phb->buid), 0);
if (ret) {
pr_warn("%s: Failed to get address for PHB#%d-PE#%x\n",
__func__, pe->phb->global_number, pe->config_addr);
return 0;
}
return rets[0];
}
if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
pe->config_addr, BUID_HI(pe->phb->buid),
BUID_LO(pe->phb->buid), 0);
if (ret) {
pr_warn("%s: Failed to get address for PHB#%d-PE#%x\n",
__func__, pe->phb->global_number, pe->config_addr);
return 0;
}
return rets[0];
}
return ret;
}
/**
* pseries_eeh_get_state - Retrieve PE state
* @pe: EEH PE
* @state: return value
*
* Retrieve the state of the specified PE. On RTAS compliant
* pseries platform, there already has one dedicated RTAS function
* for the purpose. It's notable that the associated PE config address
* might be ready when calling the function. Therefore, endeavour to
* use the PE config address if possible. Further more, there're 2
* RTAS calls for the purpose, we need to try the new one and back
* to the old one if the new one couldn't work properly.
*/
static int pseries_eeh_get_state(struct eeh_pe *pe, int *state)
{
int config_addr;
int ret;
int rets[4];
int result;
/* Figure out PE config address if possible */
config_addr = pe->config_addr;
if (pe->addr)
config_addr = pe->addr;
if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets,
config_addr, BUID_HI(pe->phb->buid),
BUID_LO(pe->phb->buid));
} else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) {
/* Fake PE unavailable info */
rets[2] = 0;
ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets,
config_addr, BUID_HI(pe->phb->buid),
BUID_LO(pe->phb->buid));
} else {
return EEH_STATE_NOT_SUPPORT;
}
if (ret)
return ret;
/* Parse the result out */
result = 0;
if (rets[1]) {
switch(rets[0]) {
case 0:
result &= ~EEH_STATE_RESET_ACTIVE;
result |= EEH_STATE_MMIO_ACTIVE;
result |= EEH_STATE_DMA_ACTIVE;
break;
case 1:
result |= EEH_STATE_RESET_ACTIVE;
result |= EEH_STATE_MMIO_ACTIVE;
result |= EEH_STATE_DMA_ACTIVE;
break;
case 2:
result &= ~EEH_STATE_RESET_ACTIVE;
result &= ~EEH_STATE_MMIO_ACTIVE;
result &= ~EEH_STATE_DMA_ACTIVE;
break;
case 4:
result &= ~EEH_STATE_RESET_ACTIVE;
result &= ~EEH_STATE_MMIO_ACTIVE;
result &= ~EEH_STATE_DMA_ACTIVE;
result |= EEH_STATE_MMIO_ENABLED;
break;
case 5:
if (rets[2]) {
if (state) *state = rets[2];
result = EEH_STATE_UNAVAILABLE;
} else {
result = EEH_STATE_NOT_SUPPORT;
}
break;
default:
result = EEH_STATE_NOT_SUPPORT;
}
} else {
result = EEH_STATE_NOT_SUPPORT;
}
return result;
}
/**
* pseries_eeh_reset - Reset the specified PE
* @pe: EEH PE
* @option: reset option
*
* Reset the specified PE
*/
static int pseries_eeh_reset(struct eeh_pe *pe, int option)
{
int config_addr;
int ret;
/* Figure out PE address */
config_addr = pe->config_addr;
if (pe->addr)
config_addr = pe->addr;
/* Reset PE through RTAS call */
ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
config_addr, BUID_HI(pe->phb->buid),
BUID_LO(pe->phb->buid), option);
/* If fundamental-reset not supported, try hot-reset */
if (option == EEH_RESET_FUNDAMENTAL &&
ret == -8) {
option = EEH_RESET_HOT;
ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
config_addr, BUID_HI(pe->phb->buid),
BUID_LO(pe->phb->buid), option);
}
/* We need reset hold or settlement delay */
if (option == EEH_RESET_FUNDAMENTAL ||
option == EEH_RESET_HOT)
msleep(EEH_PE_RST_HOLD_TIME);
else
msleep(EEH_PE_RST_SETTLE_TIME);
return ret;
}
/**
* pseries_eeh_wait_state - Wait for PE state
* @pe: EEH PE
* @max_wait: maximal period in microsecond
*
* Wait for the state of associated PE. It might take some time
* to retrieve the PE's state.
*/
static int pseries_eeh_wait_state(struct eeh_pe *pe, int max_wait)
{
int ret;
int mwait;
/*
* According to PAPR, the state of PE might be temporarily
* unavailable. Under the circumstance, we have to wait
* for indicated time determined by firmware. The maximal
* wait time is 5 minutes, which is acquired from the original
* EEH implementation. Also, the original implementation
* also defined the minimal wait time as 1 second.
*/
#define EEH_STATE_MIN_WAIT_TIME (1000)
#define EEH_STATE_MAX_WAIT_TIME (300 * 1000)
while (1) {
ret = pseries_eeh_get_state(pe, &mwait);
/*
* If the PE's state is temporarily unavailable,
* we have to wait for the specified time. Otherwise,
* the PE's state will be returned immediately.
*/
if (ret != EEH_STATE_UNAVAILABLE)
return ret;
if (max_wait <= 0) {
pr_warn("%s: Timeout when getting PE's state (%d)\n",
__func__, max_wait);
return EEH_STATE_NOT_SUPPORT;
}
if (mwait <= 0) {
pr_warn("%s: Firmware returned bad wait value %d\n",
__func__, mwait);
mwait = EEH_STATE_MIN_WAIT_TIME;
} else if (mwait > EEH_STATE_MAX_WAIT_TIME) {
pr_warn("%s: Firmware returned too long wait value %d\n",
__func__, mwait);
mwait = EEH_STATE_MAX_WAIT_TIME;
}
max_wait -= mwait;
msleep(mwait);
}
return EEH_STATE_NOT_SUPPORT;
}
/**
* pseries_eeh_get_log - Retrieve error log
* @pe: EEH PE
* @severity: temporary or permanent error log
* @drv_log: driver log to be combined with retrieved error log
* @len: length of driver log
*
* Retrieve the temporary or permanent error from the PE.
* Actually, the error will be retrieved through the dedicated
* RTAS call.
*/
static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len)
{
int config_addr;
unsigned long flags;
int ret;
spin_lock_irqsave(&slot_errbuf_lock, flags);
memset(slot_errbuf, 0, eeh_error_buf_size);
/* Figure out the PE address */
config_addr = pe->config_addr;
if (pe->addr)
config_addr = pe->addr;
ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr,
BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid),
virt_to_phys(drv_log), len,
virt_to_phys(slot_errbuf), eeh_error_buf_size,
severity);
if (!ret)
log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
spin_unlock_irqrestore(&slot_errbuf_lock, flags);
return ret;
}
/**
* pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE
* @pe: EEH PE
*
* The function will be called to reconfigure the bridges included
* in the specified PE so that the mulfunctional PE would be recovered
* again.
*/
static int pseries_eeh_configure_bridge(struct eeh_pe *pe)
{
int config_addr;
int ret;
/* Figure out the PE address */
config_addr = pe->config_addr;
if (pe->addr)
config_addr = pe->addr;
/* Use new configure-pe function, if supported */
if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) {
ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
config_addr, BUID_HI(pe->phb->buid),
BUID_LO(pe->phb->buid));
} else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) {
ret = rtas_call(ibm_configure_bridge, 3, 1, NULL,
config_addr, BUID_HI(pe->phb->buid),
BUID_LO(pe->phb->buid));
} else {
return -EFAULT;
}
if (ret)
pr_warn("%s: Unable to configure bridge PHB#%d-PE#%x (%d)\n",
__func__, pe->phb->global_number, pe->addr, ret);
return ret;
}
/**
* pseries_eeh_read_config - Read PCI config space
* @dn: device node
* @where: PCI address
* @size: size to read
* @val: return value
*
* Read config space from the speicifed device
*/
static int pseries_eeh_read_config(struct device_node *dn, int where, int size, u32 *val)
{
struct pci_dn *pdn;
pdn = PCI_DN(dn);
return rtas_read_config(pdn, where, size, val);
}
/**
* pseries_eeh_write_config - Write PCI config space
* @dn: device node
* @where: PCI address
* @size: size to write
* @val: value to be written
*
* Write config space to the specified device
*/
static int pseries_eeh_write_config(struct device_node *dn, int where, int size, u32 val)
{
struct pci_dn *pdn;
pdn = PCI_DN(dn);
return rtas_write_config(pdn, where, size, val);
}
static struct eeh_ops pseries_eeh_ops = {
.name = "pseries",
.init = pseries_eeh_init,
.of_probe = pseries_eeh_of_probe,
.dev_probe = NULL,
.set_option = pseries_eeh_set_option,
.get_pe_addr = pseries_eeh_get_pe_addr,
.get_state = pseries_eeh_get_state,
.reset = pseries_eeh_reset,
.wait_state = pseries_eeh_wait_state,
.get_log = pseries_eeh_get_log,
.configure_bridge = pseries_eeh_configure_bridge,
.err_inject = NULL,
.read_config = pseries_eeh_read_config,
.write_config = pseries_eeh_write_config,
.next_error = NULL,
.restore_config = NULL
};
/**
* eeh_pseries_init - Register platform dependent EEH operations
*
* EEH initialization on pseries platform. This function should be
* called before any EEH related functions.
*/
static int __init eeh_pseries_init(void)
{
int ret;
ret = eeh_ops_register(&pseries_eeh_ops);
if (!ret)
pr_info("EEH: pSeries platform initialized\n");
else
pr_info("EEH: pSeries platform initialization failure (%d)\n",
ret);
return ret;
}
machine_early_initcall(pseries, eeh_pseries_init);

View file

@ -0,0 +1,84 @@
/*
* Copyright (C) 2001 Dave Engebretsen IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <asm/prom.h>
#include "pseries.h"
void request_event_sources_irqs(struct device_node *np,
irq_handler_t handler,
const char *name)
{
int i, index, count = 0;
struct of_phandle_args oirq;
const u32 *opicprop;
unsigned int opicplen;
unsigned int virqs[16];
/* Check for obsolete "open-pic-interrupt" property. If present, then
* map those interrupts using the default interrupt host and default
* trigger
*/
opicprop = of_get_property(np, "open-pic-interrupt", &opicplen);
if (opicprop) {
opicplen /= sizeof(u32);
for (i = 0; i < opicplen; i++) {
if (count > 15)
break;
virqs[count] = irq_create_mapping(NULL, *(opicprop++));
if (virqs[count] == NO_IRQ) {
pr_err("event-sources: Unable to allocate "
"interrupt number for %s\n",
np->full_name);
WARN_ON(1);
}
else
count++;
}
}
/* Else use normal interrupt tree parsing */
else {
/* First try to do a proper OF tree parsing */
for (index = 0; of_irq_parse_one(np, index, &oirq) == 0;
index++) {
if (count > 15)
break;
virqs[count] = irq_create_of_mapping(&oirq);
if (virqs[count] == NO_IRQ) {
pr_err("event-sources: Unable to allocate "
"interrupt number for %s\n",
np->full_name);
WARN_ON(1);
}
else
count++;
}
}
/* Now request them */
for (i = 0; i < count; i++) {
if (request_irq(virqs[i], handler, 0, name, NULL)) {
pr_err("event-sources: Unable to request interrupt "
"%d for %s\n", virqs[i], np->full_name);
WARN_ON(1);
return;
}
}
}

View file

@ -0,0 +1,133 @@
/*
* pSeries firmware setup code.
*
* Portions from arch/powerpc/platforms/pseries/setup.c:
* Copyright (C) 1995 Linus Torvalds
* Adapted from 'alpha' version by Gary Thomas
* Modified by Cort Dougan (cort@cs.nmt.edu)
* Modified by PPC64 Team, IBM Corp
*
* Portions from arch/powerpc/kernel/firmware.c
* Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org)
* Modifications for ppc64:
* Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
* Copyright (C) 2005 Stephen Rothwell, IBM Corporation
*
* Copyright 2006 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/firmware.h>
#include <asm/prom.h>
#include <asm/udbg.h>
#include "pseries.h"
struct hypertas_fw_feature {
unsigned long val;
char * name;
};
/*
* The names in this table match names in rtas/ibm,hypertas-functions. If the
* entry ends in a '*', only upto the '*' is matched. Otherwise the entire
* string must match.
*/
static __initdata struct hypertas_fw_feature
hypertas_fw_features_table[] = {
{FW_FEATURE_PFT, "hcall-pft"},
{FW_FEATURE_TCE, "hcall-tce"},
{FW_FEATURE_SPRG0, "hcall-sprg0"},
{FW_FEATURE_DABR, "hcall-dabr"},
{FW_FEATURE_COPY, "hcall-copy"},
{FW_FEATURE_ASR, "hcall-asr"},
{FW_FEATURE_DEBUG, "hcall-debug"},
{FW_FEATURE_PERF, "hcall-perf"},
{FW_FEATURE_DUMP, "hcall-dump"},
{FW_FEATURE_INTERRUPT, "hcall-interrupt"},
{FW_FEATURE_MIGRATE, "hcall-migrate"},
{FW_FEATURE_PERFMON, "hcall-perfmon"},
{FW_FEATURE_CRQ, "hcall-crq"},
{FW_FEATURE_VIO, "hcall-vio"},
{FW_FEATURE_RDMA, "hcall-rdma"},
{FW_FEATURE_LLAN, "hcall-lLAN"},
{FW_FEATURE_BULK_REMOVE, "hcall-bulk"},
{FW_FEATURE_XDABR, "hcall-xdabr"},
{FW_FEATURE_MULTITCE, "hcall-multi-tce"},
{FW_FEATURE_SPLPAR, "hcall-splpar"},
{FW_FEATURE_VPHN, "hcall-vphn"},
{FW_FEATURE_SET_MODE, "hcall-set-mode"},
{FW_FEATURE_BEST_ENERGY, "hcall-best-energy-1*"},
};
/* Build up the firmware features bitmask using the contents of
* device-tree/ibm,hypertas-functions. Ultimately this functionality may
* be moved into prom.c prom_init().
*/
void __init fw_hypertas_feature_init(const char *hypertas, unsigned long len)
{
const char *s;
int i;
pr_debug(" -> fw_hypertas_feature_init()\n");
for (s = hypertas; s < hypertas + len; s += strlen(s) + 1) {
for (i = 0; i < ARRAY_SIZE(hypertas_fw_features_table); i++) {
const char *name = hypertas_fw_features_table[i].name;
size_t size;
/*
* If there is a '*' at the end of name, only check
* upto there
*/
size = strlen(name);
if (size && name[size - 1] == '*') {
if (strncmp(name, s, size - 1))
continue;
} else if (strcmp(name, s))
continue;
/* we have a match */
powerpc_firmware_features |=
hypertas_fw_features_table[i].val;
break;
}
}
pr_debug(" <- fw_hypertas_feature_init()\n");
}
struct vec5_fw_feature {
unsigned long val;
unsigned int feature;
};
static __initdata struct vec5_fw_feature
vec5_fw_features_table[] = {
{FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY},
{FW_FEATURE_PRRN, OV5_PRRN},
};
void __init fw_vec5_feature_init(const char *vec5, unsigned long len)
{
unsigned int index, feat;
int i;
pr_debug(" -> fw_vec5_feature_init()\n");
for (i = 0; i < ARRAY_SIZE(vec5_fw_features_table); i++) {
index = OV5_INDX(vec5_fw_features_table[i].feature);
feat = OV5_FEAT(vec5_fw_features_table[i].feature);
if (vec5[index] & feat)
powerpc_firmware_features |=
vec5_fw_features_table[i].val;
}
pr_debug(" <- fw_vec5_feature_init()\n");
}

View file

@ -0,0 +1,426 @@
/*
* pseries CPU Hotplug infrastructure.
*
* Split out from arch/powerpc/platforms/pseries/setup.c
* arch/powerpc/kernel/rtas.c, and arch/powerpc/platforms/pseries/smp.c
*
* Peter Bergner, IBM March 2001.
* Copyright (C) 2001 IBM.
* Dave Engebretsen, Peter Bergner, and
* Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
* Plus various changes from other IBM teams...
*
* Copyright (C) 2006 Michael Ellerman, IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/sched.h> /* for idle_task_exit */
#include <linux/cpu.h>
#include <linux/of.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/firmware.h>
#include <asm/machdep.h>
#include <asm/vdso_datapage.h>
#include <asm/xics.h>
#include <asm/plpar_wrappers.h>
#include "offline_states.h"
/* This version can't take the spinlock, because it never returns */
static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE;
static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) =
CPU_STATE_OFFLINE;
static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE;
static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE;
static int cede_offline_enabled __read_mostly = 1;
/*
* Enable/disable cede_offline when available.
*/
static int __init setup_cede_offline(char *str)
{
if (!strcmp(str, "off"))
cede_offline_enabled = 0;
else if (!strcmp(str, "on"))
cede_offline_enabled = 1;
else
return 0;
return 1;
}
__setup("cede_offline=", setup_cede_offline);
enum cpu_state_vals get_cpu_current_state(int cpu)
{
return per_cpu(current_state, cpu);
}
void set_cpu_current_state(int cpu, enum cpu_state_vals state)
{
per_cpu(current_state, cpu) = state;
}
enum cpu_state_vals get_preferred_offline_state(int cpu)
{
return per_cpu(preferred_offline_state, cpu);
}
void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
{
per_cpu(preferred_offline_state, cpu) = state;
}
void set_default_offline_state(int cpu)
{
per_cpu(preferred_offline_state, cpu) = default_offline_state;
}
static void rtas_stop_self(void)
{
static struct rtas_args args = {
.nargs = 0,
.nret = cpu_to_be32(1),
.rets = &args.args[0],
};
args.token = cpu_to_be32(rtas_stop_self_token);
local_irq_disable();
BUG_ON(rtas_stop_self_token == RTAS_UNKNOWN_SERVICE);
printk("cpu %u (hwid %u) Ready to die...\n",
smp_processor_id(), hard_smp_processor_id());
enter_rtas(__pa(&args));
panic("Alas, I survived.\n");
}
static void pseries_mach_cpu_die(void)
{
unsigned int cpu = smp_processor_id();
unsigned int hwcpu = hard_smp_processor_id();
u8 cede_latency_hint = 0;
local_irq_disable();
idle_task_exit();
xics_teardown_cpu();
if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
set_cpu_current_state(cpu, CPU_STATE_INACTIVE);
if (ppc_md.suspend_disable_cpu)
ppc_md.suspend_disable_cpu();
cede_latency_hint = 2;
get_lppaca()->idle = 1;
if (!lppaca_shared_proc(get_lppaca()))
get_lppaca()->donate_dedicated_cpu = 1;
while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
while (!prep_irq_for_idle()) {
local_irq_enable();
local_irq_disable();
}
extended_cede_processor(cede_latency_hint);
}
local_irq_disable();
if (!lppaca_shared_proc(get_lppaca()))
get_lppaca()->donate_dedicated_cpu = 0;
get_lppaca()->idle = 0;
if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) {
unregister_slb_shadow(hwcpu);
hard_irq_disable();
/*
* Call to start_secondary_resume() will not return.
* Kernel stack will be reset and start_secondary()
* will be called to continue the online operation.
*/
start_secondary_resume();
}
}
/* Requested state is CPU_STATE_OFFLINE at this point */
WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE);
set_cpu_current_state(cpu, CPU_STATE_OFFLINE);
unregister_slb_shadow(hwcpu);
rtas_stop_self();
/* Should never get here... */
BUG();
for(;;);
}
static int pseries_cpu_disable(void)
{
int cpu = smp_processor_id();
set_cpu_online(cpu, false);
vdso_data->processorCount--;
/*fix boot_cpuid here*/
if (cpu == boot_cpuid)
boot_cpuid = cpumask_any(cpu_online_mask);
/* FIXME: abstract this to not be platform specific later on */
xics_migrate_irqs_away();
return 0;
}
/*
* pseries_cpu_die: Wait for the cpu to die.
* @cpu: logical processor id of the CPU whose death we're awaiting.
*
* This function is called from the context of the thread which is performing
* the cpu-offline. Here we wait for long enough to allow the cpu in question
* to self-destroy so that the cpu-offline thread can send the CPU_DEAD
* notifications.
*
* OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to
* self-destruct.
*/
static void pseries_cpu_die(unsigned int cpu)
{
int tries;
int cpu_status = 1;
unsigned int pcpu = get_hard_smp_processor_id(cpu);
if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
cpu_status = 1;
for (tries = 0; tries < 5000; tries++) {
if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) {
cpu_status = 0;
break;
}
msleep(1);
}
} else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) {
for (tries = 0; tries < 25; tries++) {
cpu_status = smp_query_cpu_stopped(pcpu);
if (cpu_status == QCSS_STOPPED ||
cpu_status == QCSS_HARDWARE_ERROR)
break;
cpu_relax();
}
}
if (cpu_status != 0) {
printk("Querying DEAD? cpu %i (%i) shows %i\n",
cpu, pcpu, cpu_status);
}
/* Isolation and deallocation are definitely done by
* drslot_chrp_cpu. If they were not they would be
* done here. Change isolate state to Isolate and
* change allocation-state to Unusable.
*/
paca[cpu].cpu_start = 0;
}
/*
* Update cpu_present_mask and paca(s) for a new cpu node. The wrinkle
* here is that a cpu device node may represent up to two logical cpus
* in the SMT case. We must honor the assumption in other code that
* the logical ids for sibling SMT threads x and y are adjacent, such
* that x^1 == y and y^1 == x.
*/
static int pseries_add_processor(struct device_node *np)
{
unsigned int cpu;
cpumask_var_t candidate_mask, tmp;
int err = -ENOSPC, len, nthreads, i;
const __be32 *intserv;
intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
if (!intserv)
return 0;
zalloc_cpumask_var(&candidate_mask, GFP_KERNEL);
zalloc_cpumask_var(&tmp, GFP_KERNEL);
nthreads = len / sizeof(u32);
for (i = 0; i < nthreads; i++)
cpumask_set_cpu(i, tmp);
cpu_maps_update_begin();
BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask));
/* Get a bitmap of unoccupied slots. */
cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
if (cpumask_empty(candidate_mask)) {
/* If we get here, it most likely means that NR_CPUS is
* less than the partition's max processors setting.
*/
printk(KERN_ERR "Cannot add cpu %s; this system configuration"
" supports %d logical cpus.\n", np->full_name,
cpumask_weight(cpu_possible_mask));
goto out_unlock;
}
while (!cpumask_empty(tmp))
if (cpumask_subset(tmp, candidate_mask))
/* Found a range where we can insert the new cpu(s) */
break;
else
cpumask_shift_left(tmp, tmp, nthreads);
if (cpumask_empty(tmp)) {
printk(KERN_ERR "Unable to find space in cpu_present_mask for"
" processor %s with %d thread(s)\n", np->name,
nthreads);
goto out_unlock;
}
for_each_cpu(cpu, tmp) {
BUG_ON(cpu_present(cpu));
set_cpu_present(cpu, true);
set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++));
}
err = 0;
out_unlock:
cpu_maps_update_done();
free_cpumask_var(candidate_mask);
free_cpumask_var(tmp);
return err;
}
/*
* Update the present map for a cpu node which is going away, and set
* the hard id in the paca(s) to -1 to be consistent with boot time
* convention for non-present cpus.
*/
static void pseries_remove_processor(struct device_node *np)
{
unsigned int cpu;
int len, nthreads, i;
const __be32 *intserv;
u32 thread;
intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
if (!intserv)
return;
nthreads = len / sizeof(u32);
cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
thread = be32_to_cpu(intserv[i]);
for_each_present_cpu(cpu) {
if (get_hard_smp_processor_id(cpu) != thread)
continue;
BUG_ON(cpu_online(cpu));
set_cpu_present(cpu, false);
set_hard_smp_processor_id(cpu, -1);
break;
}
if (cpu >= nr_cpu_ids)
printk(KERN_WARNING "Could not find cpu to remove "
"with physical id 0x%x\n", thread);
}
cpu_maps_update_done();
}
static int pseries_smp_notifier(struct notifier_block *nb,
unsigned long action, void *node)
{
int err = 0;
switch (action) {
case OF_RECONFIG_ATTACH_NODE:
err = pseries_add_processor(node);
break;
case OF_RECONFIG_DETACH_NODE:
pseries_remove_processor(node);
break;
}
return notifier_from_errno(err);
}
static struct notifier_block pseries_smp_nb = {
.notifier_call = pseries_smp_notifier,
};
#define MAX_CEDE_LATENCY_LEVELS 4
#define CEDE_LATENCY_PARAM_LENGTH 10
#define CEDE_LATENCY_PARAM_MAX_LENGTH \
(MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char))
#define CEDE_LATENCY_TOKEN 45
static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH];
static int parse_cede_parameters(void)
{
memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH);
return rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
NULL,
CEDE_LATENCY_TOKEN,
__pa(cede_parameters),
CEDE_LATENCY_PARAM_MAX_LENGTH);
}
static int __init pseries_cpu_hotplug_init(void)
{
struct device_node *np;
const char *typep;
int cpu;
int qcss_tok;
for_each_node_by_name(np, "interrupt-controller") {
typep = of_get_property(np, "compatible", NULL);
if (strstr(typep, "open-pic")) {
of_node_put(np);
printk(KERN_INFO "CPU Hotplug not supported on "
"systems using MPIC\n");
return 0;
}
}
rtas_stop_self_token = rtas_token("stop-self");
qcss_tok = rtas_token("query-cpu-stopped-state");
if (rtas_stop_self_token == RTAS_UNKNOWN_SERVICE ||
qcss_tok == RTAS_UNKNOWN_SERVICE) {
printk(KERN_INFO "CPU Hotplug not supported by firmware "
"- disabling.\n");
return 0;
}
ppc_md.cpu_die = pseries_mach_cpu_die;
smp_ops->cpu_disable = pseries_cpu_disable;
smp_ops->cpu_die = pseries_cpu_die;
/* Processors can be added/removed only on LPAR */
if (firmware_has_feature(FW_FEATURE_LPAR)) {
of_reconfig_notifier_register(&pseries_smp_nb);
cpu_maps_update_begin();
if (cede_offline_enabled && parse_cede_parameters() == 0) {
default_offline_state = CPU_STATE_INACTIVE;
for_each_online_cpu(cpu)
set_default_offline_state(cpu);
}
cpu_maps_update_done();
}
return 0;
}
machine_arch_initcall(pseries, pseries_cpu_hotplug_init);

View file

@ -0,0 +1,271 @@
/*
* pseries Memory Hotplug infrastructure.
*
* Copyright (C) 2008 Badari Pulavarty, IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/memblock.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <asm/firmware.h>
#include <asm/machdep.h>
#include <asm/prom.h>
#include <asm/sparsemem.h>
#include "pseries.h"
unsigned long pseries_memory_block_size(void)
{
struct device_node *np;
unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE;
struct resource r;
np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (np) {
const __be64 *size;
size = of_get_property(np, "ibm,lmb-size", NULL);
if (size)
memblock_size = be64_to_cpup(size);
of_node_put(np);
} else if (machine_is(pseries)) {
/* This fallback really only applies to pseries */
unsigned int memzero_size = 0;
np = of_find_node_by_path("/memory@0");
if (np) {
if (!of_address_to_resource(np, 0, &r))
memzero_size = resource_size(&r);
of_node_put(np);
}
if (memzero_size) {
/* We now know the size of memory@0, use this to find
* the first memoryblock and get its size.
*/
char buf[64];
sprintf(buf, "/memory@%x", memzero_size);
np = of_find_node_by_path(buf);
if (np) {
if (!of_address_to_resource(np, 0, &r))
memblock_size = resource_size(&r);
of_node_put(np);
}
}
}
return memblock_size;
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static int pseries_remove_memory(u64 start, u64 size)
{
int ret;
/* Remove htab bolted mappings for this section of memory */
start = (unsigned long)__va(start);
ret = remove_section_mapping(start, start + size);
/* Ensure all vmalloc mappings are flushed in case they also
* hit that section of memory
*/
vm_unmap_aliases();
return ret;
}
static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size)
{
unsigned long block_sz, start_pfn;
int sections_per_block;
int i, nid;
start_pfn = base >> PAGE_SHIFT;
lock_device_hotplug();
if (!pfn_valid(start_pfn))
goto out;
block_sz = pseries_memory_block_size();
sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
nid = memory_add_physaddr_to_nid(base);
for (i = 0; i < sections_per_block; i++) {
remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
base += MIN_MEMORY_BLOCK_SIZE;
}
out:
/* Update memory regions for memory remove */
memblock_remove(base, memblock_size);
unlock_device_hotplug();
return 0;
}
static int pseries_remove_mem_node(struct device_node *np)
{
const char *type;
const __be32 *regs;
unsigned long base;
unsigned int lmb_size;
int ret = -EINVAL;
/*
* Check to see if we are actually removing memory
*/
type = of_get_property(np, "device_type", NULL);
if (type == NULL || strcmp(type, "memory") != 0)
return 0;
/*
* Find the base address and size of the memblock
*/
regs = of_get_property(np, "reg", NULL);
if (!regs)
return ret;
base = be64_to_cpu(*(unsigned long *)regs);
lmb_size = be32_to_cpu(regs[3]);
pseries_remove_memblock(base, lmb_size);
return 0;
}
#else
static inline int pseries_remove_memblock(unsigned long base,
unsigned int memblock_size)
{
return -EOPNOTSUPP;
}
static inline int pseries_remove_mem_node(struct device_node *np)
{
return 0;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
static int pseries_add_mem_node(struct device_node *np)
{
const char *type;
const __be32 *regs;
unsigned long base;
unsigned int lmb_size;
int ret = -EINVAL;
/*
* Check to see if we are actually adding memory
*/
type = of_get_property(np, "device_type", NULL);
if (type == NULL || strcmp(type, "memory") != 0)
return 0;
/*
* Find the base and size of the memblock
*/
regs = of_get_property(np, "reg", NULL);
if (!regs)
return ret;
base = be64_to_cpu(*(unsigned long *)regs);
lmb_size = be32_to_cpu(regs[3]);
/*
* Update memory region to represent the memory add
*/
ret = memblock_add(base, lmb_size);
return (ret < 0) ? -EINVAL : 0;
}
static int pseries_update_drconf_memory(struct of_prop_reconfig *pr)
{
struct of_drconf_cell *new_drmem, *old_drmem;
unsigned long memblock_size;
u32 entries;
__be32 *p;
int i, rc = -EINVAL;
memblock_size = pseries_memory_block_size();
if (!memblock_size)
return -EINVAL;
p = (__be32 *) pr->old_prop->value;
if (!p)
return -EINVAL;
/* The first int of the property is the number of lmb's described
* by the property. This is followed by an array of of_drconf_cell
* entries. Get the number of entries and skip to the array of
* of_drconf_cell's.
*/
entries = be32_to_cpu(*p++);
old_drmem = (struct of_drconf_cell *)p;
p = (__be32 *)pr->prop->value;
p++;
new_drmem = (struct of_drconf_cell *)p;
for (i = 0; i < entries; i++) {
if ((be32_to_cpu(old_drmem[i].flags) & DRCONF_MEM_ASSIGNED) &&
(!(be32_to_cpu(new_drmem[i].flags) & DRCONF_MEM_ASSIGNED))) {
rc = pseries_remove_memblock(
be64_to_cpu(old_drmem[i].base_addr),
memblock_size);
break;
} else if ((!(be32_to_cpu(old_drmem[i].flags) &
DRCONF_MEM_ASSIGNED)) &&
(be32_to_cpu(new_drmem[i].flags) &
DRCONF_MEM_ASSIGNED)) {
rc = memblock_add(be64_to_cpu(old_drmem[i].base_addr),
memblock_size);
rc = (rc < 0) ? -EINVAL : 0;
break;
}
}
return rc;
}
static int pseries_memory_notifier(struct notifier_block *nb,
unsigned long action, void *node)
{
struct of_prop_reconfig *pr;
int err = 0;
switch (action) {
case OF_RECONFIG_ATTACH_NODE:
err = pseries_add_mem_node(node);
break;
case OF_RECONFIG_DETACH_NODE:
err = pseries_remove_mem_node(node);
break;
case OF_RECONFIG_UPDATE_PROPERTY:
pr = (struct of_prop_reconfig *)node;
if (!strcmp(pr->prop->name, "ibm,dynamic-memory"))
err = pseries_update_drconf_memory(pr);
break;
}
return notifier_from_errno(err);
}
static struct notifier_block pseries_mem_nb = {
.notifier_call = pseries_memory_notifier,
};
static int __init pseries_memory_hotplug_init(void)
{
if (firmware_has_feature(FW_FEATURE_LPAR))
of_reconfig_notifier_register(&pseries_mem_nb);
#ifdef CONFIG_MEMORY_HOTREMOVE
ppc_md.remove_memory = pseries_remove_memory;
#endif
return 0;
}
machine_device_initcall(pseries, pseries_memory_hotplug_init);

View file

@ -0,0 +1,338 @@
/*
* This file contains the generic code to perform a call to the
* pSeries LPAR hypervisor.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/hvcall.h>
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/ptrace.h>
#include <asm/jump_label.h>
.section ".text"
#ifdef CONFIG_TRACEPOINTS
#ifndef CONFIG_JUMP_LABEL
.section ".toc","aw"
.globl hcall_tracepoint_refcount
hcall_tracepoint_refcount:
.llong 0
.section ".text"
#endif
/*
* precall must preserve all registers. use unused STK_PARAM()
* areas to save snapshots and opcode.
*/
#define HCALL_INST_PRECALL(FIRST_REG) \
mflr r0; \
std r3,STK_PARAM(R3)(r1); \
std r4,STK_PARAM(R4)(r1); \
std r5,STK_PARAM(R5)(r1); \
std r6,STK_PARAM(R6)(r1); \
std r7,STK_PARAM(R7)(r1); \
std r8,STK_PARAM(R8)(r1); \
std r9,STK_PARAM(R9)(r1); \
std r10,STK_PARAM(R10)(r1); \
std r0,16(r1); \
addi r4,r1,STK_PARAM(FIRST_REG); \
stdu r1,-STACK_FRAME_OVERHEAD(r1); \
bl __trace_hcall_entry; \
ld r3,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \
ld r4,STACK_FRAME_OVERHEAD+STK_PARAM(R4)(r1); \
ld r5,STACK_FRAME_OVERHEAD+STK_PARAM(R5)(r1); \
ld r6,STACK_FRAME_OVERHEAD+STK_PARAM(R6)(r1); \
ld r7,STACK_FRAME_OVERHEAD+STK_PARAM(R7)(r1); \
ld r8,STACK_FRAME_OVERHEAD+STK_PARAM(R8)(r1); \
ld r9,STACK_FRAME_OVERHEAD+STK_PARAM(R9)(r1); \
ld r10,STACK_FRAME_OVERHEAD+STK_PARAM(R10)(r1)
/*
* postcall is performed immediately before function return which
* allows liberal use of volatile registers.
*/
#define __HCALL_INST_POSTCALL \
ld r0,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \
std r3,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \
mr r4,r3; \
mr r3,r0; \
bl __trace_hcall_exit; \
ld r0,STACK_FRAME_OVERHEAD+16(r1); \
addi r1,r1,STACK_FRAME_OVERHEAD; \
ld r3,STK_PARAM(R3)(r1); \
mtlr r0
#define HCALL_INST_POSTCALL_NORETS \
li r5,0; \
__HCALL_INST_POSTCALL
#define HCALL_INST_POSTCALL(BUFREG) \
mr r5,BUFREG; \
__HCALL_INST_POSTCALL
#ifdef CONFIG_JUMP_LABEL
#define HCALL_BRANCH(LABEL) \
ARCH_STATIC_BRANCH(LABEL, hcall_tracepoint_key)
#else
/*
* We branch around this in early init (eg when populating the MMU
* hashtable) by using an unconditional cpu feature.
*/
#define HCALL_BRANCH(LABEL) \
BEGIN_FTR_SECTION; \
b 1f; \
END_FTR_SECTION(0, 1); \
ld r12,hcall_tracepoint_refcount@toc(r2); \
std r12,32(r1); \
cmpdi r12,0; \
bne- LABEL; \
1:
#endif
#else
#define HCALL_INST_PRECALL(FIRST_ARG)
#define HCALL_INST_POSTCALL_NORETS
#define HCALL_INST_POSTCALL(BUFREG)
#define HCALL_BRANCH(LABEL)
#endif
_GLOBAL_TOC(plpar_hcall_norets)
HMT_MEDIUM
mfcr r0
stw r0,8(r1)
HCALL_BRANCH(plpar_hcall_norets_trace)
HVSC /* invoke the hypervisor */
lwz r0,8(r1)
mtcrf 0xff,r0
blr /* return r3 = status */
#ifdef CONFIG_TRACEPOINTS
plpar_hcall_norets_trace:
HCALL_INST_PRECALL(R4)
HVSC
HCALL_INST_POSTCALL_NORETS
lwz r0,8(r1)
mtcrf 0xff,r0
blr
#endif
_GLOBAL_TOC(plpar_hcall)
HMT_MEDIUM
mfcr r0
stw r0,8(r1)
HCALL_BRANCH(plpar_hcall_trace)
std r4,STK_PARAM(R4)(r1) /* Save ret buffer */
mr r4,r5
mr r5,r6
mr r6,r7
mr r7,r8
mr r8,r9
mr r9,r10
HVSC /* invoke the hypervisor */
ld r12,STK_PARAM(R4)(r1)
std r4, 0(r12)
std r5, 8(r12)
std r6, 16(r12)
std r7, 24(r12)
lwz r0,8(r1)
mtcrf 0xff,r0
blr /* return r3 = status */
#ifdef CONFIG_TRACEPOINTS
plpar_hcall_trace:
HCALL_INST_PRECALL(R5)
std r4,STK_PARAM(R4)(r1)
mr r0,r4
mr r4,r5
mr r5,r6
mr r6,r7
mr r7,r8
mr r8,r9
mr r9,r10
HVSC
ld r12,STK_PARAM(R4)(r1)
std r4,0(r12)
std r5,8(r12)
std r6,16(r12)
std r7,24(r12)
HCALL_INST_POSTCALL(r12)
lwz r0,8(r1)
mtcrf 0xff,r0
blr
#endif
/*
* plpar_hcall_raw can be called in real mode. kexec/kdump need some
* hypervisor calls to be executed in real mode. So plpar_hcall_raw
* does not access the per cpu hypervisor call statistics variables,
* since these variables may not be present in the RMO region.
*/
_GLOBAL(plpar_hcall_raw)
HMT_MEDIUM
mfcr r0
stw r0,8(r1)
std r4,STK_PARAM(R4)(r1) /* Save ret buffer */
mr r4,r5
mr r5,r6
mr r6,r7
mr r7,r8
mr r8,r9
mr r9,r10
HVSC /* invoke the hypervisor */
ld r12,STK_PARAM(R4)(r1)
std r4, 0(r12)
std r5, 8(r12)
std r6, 16(r12)
std r7, 24(r12)
lwz r0,8(r1)
mtcrf 0xff,r0
blr /* return r3 = status */
_GLOBAL_TOC(plpar_hcall9)
HMT_MEDIUM
mfcr r0
stw r0,8(r1)
HCALL_BRANCH(plpar_hcall9_trace)
std r4,STK_PARAM(R4)(r1) /* Save ret buffer */
mr r4,r5
mr r5,r6
mr r6,r7
mr r7,r8
mr r8,r9
mr r9,r10
ld r10,STK_PARAM(R11)(r1) /* put arg7 in R10 */
ld r11,STK_PARAM(R12)(r1) /* put arg8 in R11 */
ld r12,STK_PARAM(R13)(r1) /* put arg9 in R12 */
HVSC /* invoke the hypervisor */
mr r0,r12
ld r12,STK_PARAM(R4)(r1)
std r4, 0(r12)
std r5, 8(r12)
std r6, 16(r12)
std r7, 24(r12)
std r8, 32(r12)
std r9, 40(r12)
std r10,48(r12)
std r11,56(r12)
std r0, 64(r12)
lwz r0,8(r1)
mtcrf 0xff,r0
blr /* return r3 = status */
#ifdef CONFIG_TRACEPOINTS
plpar_hcall9_trace:
HCALL_INST_PRECALL(R5)
std r4,STK_PARAM(R4)(r1)
mr r0,r4
mr r4,r5
mr r5,r6
mr r6,r7
mr r7,r8
mr r8,r9
mr r9,r10
ld r10,STACK_FRAME_OVERHEAD+STK_PARAM(R11)(r1)
ld r11,STACK_FRAME_OVERHEAD+STK_PARAM(R12)(r1)
ld r12,STACK_FRAME_OVERHEAD+STK_PARAM(R13)(r1)
HVSC
mr r0,r12
ld r12,STACK_FRAME_OVERHEAD+STK_PARAM(R4)(r1)
std r4,0(r12)
std r5,8(r12)
std r6,16(r12)
std r7,24(r12)
std r8,32(r12)
std r9,40(r12)
std r10,48(r12)
std r11,56(r12)
std r0,64(r12)
HCALL_INST_POSTCALL(r12)
lwz r0,8(r1)
mtcrf 0xff,r0
blr
#endif
/* See plpar_hcall_raw to see why this is needed */
_GLOBAL(plpar_hcall9_raw)
HMT_MEDIUM
mfcr r0
stw r0,8(r1)
std r4,STK_PARAM(R4)(r1) /* Save ret buffer */
mr r4,r5
mr r5,r6
mr r6,r7
mr r7,r8
mr r8,r9
mr r9,r10
ld r10,STK_PARAM(R11)(r1) /* put arg7 in R10 */
ld r11,STK_PARAM(R12)(r1) /* put arg8 in R11 */
ld r12,STK_PARAM(R13)(r1) /* put arg9 in R12 */
HVSC /* invoke the hypervisor */
mr r0,r12
ld r12,STK_PARAM(R4)(r1)
std r4, 0(r12)
std r5, 8(r12)
std r6, 16(r12)
std r7, 24(r12)
std r8, 32(r12)
std r9, 40(r12)
std r10,48(r12)
std r11,56(r12)
std r0, 64(r12)
lwz r0,8(r1)
mtcrf 0xff,r0
blr /* return r3 = status */

View file

@ -0,0 +1,166 @@
/*
* Copyright (C) 2006 Mike Kravetz IBM Corporation
*
* Hypervisor Call Instrumentation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/cpumask.h>
#include <asm/hvcall.h>
#include <asm/firmware.h>
#include <asm/cputable.h>
#include <asm/trace.h>
#include <asm/machdep.h>
DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
/*
* Routines for displaying the statistics in debugfs
*/
static void *hc_start(struct seq_file *m, loff_t *pos)
{
if ((int)*pos < (HCALL_STAT_ARRAY_SIZE-1))
return (void *)(unsigned long)(*pos + 1);
return NULL;
}
static void *hc_next(struct seq_file *m, void *p, loff_t * pos)
{
++*pos;
return hc_start(m, pos);
}
static void hc_stop(struct seq_file *m, void *p)
{
}
static int hc_show(struct seq_file *m, void *p)
{
unsigned long h_num = (unsigned long)p;
struct hcall_stats *hs = m->private;
if (hs[h_num].num_calls) {
if (cpu_has_feature(CPU_FTR_PURR))
seq_printf(m, "%lu %lu %lu %lu\n", h_num<<2,
hs[h_num].num_calls,
hs[h_num].tb_total,
hs[h_num].purr_total);
else
seq_printf(m, "%lu %lu %lu\n", h_num<<2,
hs[h_num].num_calls,
hs[h_num].tb_total);
}
return 0;
}
static const struct seq_operations hcall_inst_seq_ops = {
.start = hc_start,
.next = hc_next,
.stop = hc_stop,
.show = hc_show
};
static int hcall_inst_seq_open(struct inode *inode, struct file *file)
{
int rc;
struct seq_file *seq;
rc = seq_open(file, &hcall_inst_seq_ops);
seq = file->private_data;
seq->private = file_inode(file)->i_private;
return rc;
}
static const struct file_operations hcall_inst_seq_fops = {
.open = hcall_inst_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
#define HCALL_ROOT_DIR "hcall_inst"
#define CPU_NAME_BUF_SIZE 32
static void probe_hcall_entry(void *ignored, unsigned long opcode, unsigned long *args)
{
struct hcall_stats *h;
if (opcode > MAX_HCALL_OPCODE)
return;
h = &__get_cpu_var(hcall_stats)[opcode / 4];
h->tb_start = mftb();
h->purr_start = mfspr(SPRN_PURR);
}
static void probe_hcall_exit(void *ignored, unsigned long opcode, unsigned long retval,
unsigned long *retbuf)
{
struct hcall_stats *h;
if (opcode > MAX_HCALL_OPCODE)
return;
h = &__get_cpu_var(hcall_stats)[opcode / 4];
h->num_calls++;
h->tb_total += mftb() - h->tb_start;
h->purr_total += mfspr(SPRN_PURR) - h->purr_start;
}
static int __init hcall_inst_init(void)
{
struct dentry *hcall_root;
struct dentry *hcall_file;
char cpu_name_buf[CPU_NAME_BUF_SIZE];
int cpu;
if (!firmware_has_feature(FW_FEATURE_LPAR))
return 0;
if (register_trace_hcall_entry(probe_hcall_entry, NULL))
return -EINVAL;
if (register_trace_hcall_exit(probe_hcall_exit, NULL)) {
unregister_trace_hcall_entry(probe_hcall_entry, NULL);
return -EINVAL;
}
hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL);
if (!hcall_root)
return -ENOMEM;
for_each_possible_cpu(cpu) {
snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu);
hcall_file = debugfs_create_file(cpu_name_buf, S_IRUGO,
hcall_root,
per_cpu(hcall_stats, cpu),
&hcall_inst_seq_fops);
if (!hcall_file)
return -ENOMEM;
}
return 0;
}
machine_device_initcall(pseries, hcall_inst_init);

View file

@ -0,0 +1,88 @@
/*
* hvconsole.c
* Copyright (C) 2004 Hollis Blanchard, IBM Corporation
* Copyright (C) 2004 IBM Corporation
*
* Additional Author(s):
* Ryan S. Arnold <rsa@us.ibm.com>
*
* LPAR console support.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/errno.h>
#include <asm/hvcall.h>
#include <asm/hvconsole.h>
#include <asm/plpar_wrappers.h>
/**
* hvc_get_chars - retrieve characters from firmware for denoted vterm adatper
* @vtermno: The vtermno or unit_address of the adapter from which to fetch the
* data.
* @buf: The character buffer into which to put the character data fetched from
* firmware.
* @count: not used?
*/
int hvc_get_chars(uint32_t vtermno, char *buf, int count)
{
long ret;
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
unsigned long *lbuf = (unsigned long *)buf;
ret = plpar_hcall(H_GET_TERM_CHAR, retbuf, vtermno);
lbuf[0] = be64_to_cpu(retbuf[1]);
lbuf[1] = be64_to_cpu(retbuf[2]);
if (ret == H_SUCCESS)
return retbuf[0];
return 0;
}
EXPORT_SYMBOL(hvc_get_chars);
/**
* hvc_put_chars: send characters to firmware for denoted vterm adapter
* @vtermno: The vtermno or unit_address of the adapter from which the data
* originated.
* @buf: The character buffer that contains the character data to send to
* firmware.
* @count: Send this number of characters.
*/
int hvc_put_chars(uint32_t vtermno, const char *buf, int count)
{
unsigned long *lbuf = (unsigned long *) buf;
long ret;
/* hcall will ret H_PARAMETER if 'count' exceeds firmware max.*/
if (count > MAX_VIO_PUT_CHARS)
count = MAX_VIO_PUT_CHARS;
ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count,
cpu_to_be64(lbuf[0]),
cpu_to_be64(lbuf[1]));
if (ret == H_SUCCESS)
return count;
if (ret == H_BUSY)
return -EAGAIN;
return -EIO;
}
EXPORT_SYMBOL(hvc_put_chars);

View file

@ -0,0 +1,252 @@
/*
* hvcserver.c
* Copyright (C) 2004 Ryan S Arnold, IBM Corporation
*
* PPC64 virtual I/O console server support.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <asm/hvcall.h>
#include <asm/hvcserver.h>
#include <asm/io.h>
#define HVCS_ARCH_VERSION "1.0.0"
MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>");
MODULE_DESCRIPTION("IBM hvcs ppc64 API");
MODULE_LICENSE("GPL");
MODULE_VERSION(HVCS_ARCH_VERSION);
/*
* Convert arch specific return codes into relevant errnos. The hvcs
* functions aren't performance sensitive, so this conversion isn't an
* issue.
*/
static int hvcs_convert(long to_convert)
{
switch (to_convert) {
case H_SUCCESS:
return 0;
case H_PARAMETER:
return -EINVAL;
case H_HARDWARE:
return -EIO;
case H_BUSY:
case H_LONG_BUSY_ORDER_1_MSEC:
case H_LONG_BUSY_ORDER_10_MSEC:
case H_LONG_BUSY_ORDER_100_MSEC:
case H_LONG_BUSY_ORDER_1_SEC:
case H_LONG_BUSY_ORDER_10_SEC:
case H_LONG_BUSY_ORDER_100_SEC:
return -EBUSY;
case H_FUNCTION: /* fall through */
default:
return -EPERM;
}
}
/**
* hvcs_free_partner_info - free pi allocated by hvcs_get_partner_info
* @head: list_head pointer for an allocated list of partner info structs to
* free.
*
* This function is used to free the partner info list that was returned by
* calling hvcs_get_partner_info().
*/
int hvcs_free_partner_info(struct list_head *head)
{
struct hvcs_partner_info *pi;
struct list_head *element;
if (!head)
return -EINVAL;
while (!list_empty(head)) {
element = head->next;
pi = list_entry(element, struct hvcs_partner_info, node);
list_del(element);
kfree(pi);
}
return 0;
}
EXPORT_SYMBOL(hvcs_free_partner_info);
/* Helper function for hvcs_get_partner_info */
static int hvcs_next_partner(uint32_t unit_address,
unsigned long last_p_partition_ID,
unsigned long last_p_unit_address, unsigned long *pi_buff)
{
long retval;
retval = plpar_hcall_norets(H_VTERM_PARTNER_INFO, unit_address,
last_p_partition_ID,
last_p_unit_address, virt_to_phys(pi_buff));
return hvcs_convert(retval);
}
/**
* hvcs_get_partner_info - Get all of the partner info for a vty-server adapter
* @unit_address: The unit_address of the vty-server adapter for which this
* function is fetching partner info.
* @head: An initialized list_head pointer to an empty list to use to return the
* list of partner info fetched from the hypervisor to the caller.
* @pi_buff: A page sized buffer pre-allocated prior to calling this function
* that is to be used to be used by firmware as an iterator to keep track
* of the partner info retrieval.
*
* This function returns non-zero on success, or if there is no partner info.
*
* The pi_buff is pre-allocated prior to calling this function because this
* function may be called with a spin_lock held and kmalloc of a page is not
* recommended as GFP_ATOMIC.
*
* The first long of this buffer is used to store a partner unit address. The
* second long is used to store a partner partition ID and starting at
* pi_buff[2] is the 79 character Converged Location Code (diff size than the
* unsigned longs, hence the casting mumbo jumbo you see later).
*
* Invocation of this function should always be followed by an invocation of
* hvcs_free_partner_info() using a pointer to the SAME list head instance
* that was passed as a parameter to this function.
*/
int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
unsigned long *pi_buff)
{
/*
* Dealt with as longs because of the hcall interface even though the
* values are uint32_t.
*/
unsigned long last_p_partition_ID;
unsigned long last_p_unit_address;
struct hvcs_partner_info *next_partner_info = NULL;
int more = 1;
int retval;
memset(pi_buff, 0x00, PAGE_SIZE);
/* invalid parameters */
if (!head || !pi_buff)
return -EINVAL;
last_p_partition_ID = last_p_unit_address = ~0UL;
INIT_LIST_HEAD(head);
do {
retval = hvcs_next_partner(unit_address, last_p_partition_ID,
last_p_unit_address, pi_buff);
if (retval) {
/*
* Don't indicate that we've failed if we have
* any list elements.
*/
if (!list_empty(head))
return 0;
return retval;
}
last_p_partition_ID = be64_to_cpu(pi_buff[0]);
last_p_unit_address = be64_to_cpu(pi_buff[1]);
/* This indicates that there are no further partners */
if (last_p_partition_ID == ~0UL
&& last_p_unit_address == ~0UL)
break;
/* This is a very small struct and will be freed soon in
* hvcs_free_partner_info(). */
next_partner_info = kmalloc(sizeof(struct hvcs_partner_info),
GFP_ATOMIC);
if (!next_partner_info) {
printk(KERN_WARNING "HVCONSOLE: kmalloc() failed to"
" allocate partner info struct.\n");
hvcs_free_partner_info(head);
return -ENOMEM;
}
next_partner_info->unit_address
= (unsigned int)last_p_unit_address;
next_partner_info->partition_ID
= (unsigned int)last_p_partition_ID;
/* copy the Null-term char too */
strlcpy(&next_partner_info->location_code[0],
(char *)&pi_buff[2],
sizeof(next_partner_info->location_code));
list_add_tail(&(next_partner_info->node), head);
next_partner_info = NULL;
} while (more);
return 0;
}
EXPORT_SYMBOL(hvcs_get_partner_info);
/**
* hvcs_register_connection - establish a connection between this vty-server and
* a vty.
* @unit_address: The unit address of the vty-server adapter that is to be
* establish a connection.
* @p_partition_ID: The partition ID of the vty adapter that is to be connected.
* @p_unit_address: The unit address of the vty adapter to which the vty-server
* is to be connected.
*
* If this function is called once and -EINVAL is returned it may
* indicate that the partner info needs to be refreshed for the
* target unit address at which point the caller must invoke
* hvcs_get_partner_info() and then call this function again. If,
* for a second time, -EINVAL is returned then it indicates that
* there is probably already a partner connection registered to a
* different vty-server adapter. It is also possible that a second
* -EINVAL may indicate that one of the parms is not valid, for
* instance if the link was removed between the vty-server adapter
* and the vty adapter that you are trying to open. Don't shoot the
* messenger. Firmware implemented it this way.
*/
int hvcs_register_connection( uint32_t unit_address,
uint32_t p_partition_ID, uint32_t p_unit_address)
{
long retval;
retval = plpar_hcall_norets(H_REGISTER_VTERM, unit_address,
p_partition_ID, p_unit_address);
return hvcs_convert(retval);
}
EXPORT_SYMBOL(hvcs_register_connection);
/**
* hvcs_free_connection - free the connection between a vty-server and vty
* @unit_address: The unit address of the vty-server that is to have its
* connection severed.
*
* This function is used to free the partner connection between a vty-server
* adapter and a vty adapter.
*
* If -EBUSY is returned continue to call this function until 0 is returned.
*/
int hvcs_free_connection(uint32_t unit_address)
{
long retval;
retval = plpar_hcall_norets(H_FREE_VTERM, unit_address);
return hvcs_convert(retval);
}
EXPORT_SYMBOL(hvcs_free_connection);

View file

@ -0,0 +1,165 @@
/*
* Copyright 2010 2011 Mark Nelson and Tseng-Hui (Frank) Lin, IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/of.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <asm/machdep.h>
#include <asm/rtas.h>
#include <asm/irq.h>
#include <asm/io_event_irq.h>
#include "pseries.h"
/*
* IO event interrupt is a mechanism provided by RTAS to return
* information about hardware error and non-error events. Device
* drivers can register their event handlers to receive events.
* Device drivers are expected to use atomic_notifier_chain_register()
* and atomic_notifier_chain_unregister() to register and unregister
* their event handlers. Since multiple IO event types and scopes
* share an IO event interrupt, the event handlers are called one
* by one until the IO event is claimed by one of the handlers.
* The event handlers are expected to return NOTIFY_OK if the
* event is handled by the event handler or NOTIFY_DONE if the
* event does not belong to the handler.
*
* Usage:
*
* Notifier function:
* #include <asm/io_event_irq.h>
* int event_handler(struct notifier_block *nb, unsigned long val, void *data) {
* p = (struct pseries_io_event_sect_data *) data;
* if (! is_my_event(p->scope, p->event_type)) return NOTIFY_DONE;
* :
* :
* return NOTIFY_OK;
* }
* struct notifier_block event_nb = {
* .notifier_call = event_handler,
* }
*
* Registration:
* atomic_notifier_chain_register(&pseries_ioei_notifier_list, &event_nb);
*
* Unregistration:
* atomic_notifier_chain_unregister(&pseries_ioei_notifier_list, &event_nb);
*/
ATOMIC_NOTIFIER_HEAD(pseries_ioei_notifier_list);
EXPORT_SYMBOL_GPL(pseries_ioei_notifier_list);
static int ioei_check_exception_token;
static char ioei_rtas_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned;
/**
* Find the data portion of an IO Event section from event log.
* @elog: RTAS error/event log.
*
* Return:
* pointer to a valid IO event section data. NULL if not found.
*/
static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog)
{
struct pseries_errorlog *sect;
/* We should only ever get called for io-event interrupts, but if
* we do get called for another type then something went wrong so
* make some noise about it.
* RTAS_TYPE_IO only exists in extended event log version 6 or later.
* No need to check event log version.
*/
if (unlikely(rtas_error_type(elog) != RTAS_TYPE_IO)) {
printk_once(KERN_WARNING"io_event_irq: Unexpected event type %d",
rtas_error_type(elog));
return NULL;
}
sect = get_pseries_errorlog(elog, PSERIES_ELOG_SECT_ID_IO_EVENT);
if (unlikely(!sect)) {
printk_once(KERN_WARNING "io_event_irq: RTAS extended event "
"log does not contain an IO Event section. "
"Could be a bug in system firmware!\n");
return NULL;
}
return (struct pseries_io_event *) &sect->data;
}
/*
* PAPR:
* - check-exception returns the first found error or event and clear that
* error or event so it is reported once.
* - Each interrupt returns one event. If a plateform chooses to report
* multiple events through a single interrupt, it must ensure that the
* interrupt remains asserted until check-exception has been used to
* process all out-standing events for that interrupt.
*
* Implementation notes:
* - Events must be processed in the order they are returned. Hence,
* sequential in nature.
* - The owner of an event is determined by combinations of scope,
* event type, and sub-type. There is no easy way to pre-sort clients
* by scope or event type alone. For example, Torrent ISR route change
* event is reported with scope 0x00 (Not Applicatable) rather than
* 0x3B (Torrent-hub). It is better to let the clients to identify
* who owns the event.
*/
static irqreturn_t ioei_interrupt(int irq, void *dev_id)
{
struct pseries_io_event *event;
int rtas_rc;
for (;;) {
rtas_rc = rtas_call(ioei_check_exception_token, 6, 1, NULL,
RTAS_VECTOR_EXTERNAL_INTERRUPT,
virq_to_hw(irq),
RTAS_IO_EVENTS, 1 /* Time Critical */,
__pa(ioei_rtas_buf),
RTAS_DATA_BUF_SIZE);
if (rtas_rc != 0)
break;
event = ioei_find_event((struct rtas_error_log *)ioei_rtas_buf);
if (!event)
continue;
atomic_notifier_call_chain(&pseries_ioei_notifier_list,
0, event);
}
return IRQ_HANDLED;
}
static int __init ioei_init(void)
{
struct device_node *np;
ioei_check_exception_token = rtas_token("check-exception");
if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE)
return -ENODEV;
np = of_find_node_by_path("/event-sources/ibm,io-events");
if (np) {
request_event_sources_irqs(np, ioei_interrupt, "IO_EVENT");
pr_info("IBM I/O event interrupts enabled\n");
of_node_put(np);
} else {
return -ENODEV;
}
return 0;
}
machine_subsys_initcall(pseries, ioei_init);

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,76 @@
/*
* Copyright 2006 Michael Ellerman, IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <asm/machdep.h>
#include <asm/page.h>
#include <asm/firmware.h>
#include <asm/kexec.h>
#include <asm/mpic.h>
#include <asm/xics.h>
#include <asm/smp.h>
#include <asm/plpar_wrappers.h>
#include "pseries.h"
static void pseries_kexec_cpu_down(int crash_shutdown, int secondary)
{
/* Don't risk a hypervisor call if we're crashing */
if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) {
int ret;
int cpu = smp_processor_id();
int hwcpu = hard_smp_processor_id();
if (get_lppaca()->dtl_enable_mask) {
ret = unregister_dtl(hwcpu);
if (ret) {
pr_err("WARNING: DTL deregistration for cpu "
"%d (hw %d) failed with %d\n",
cpu, hwcpu, ret);
}
}
ret = unregister_slb_shadow(hwcpu);
if (ret) {
pr_err("WARNING: SLB shadow buffer deregistration "
"for cpu %d (hw %d) failed with %d\n",
cpu, hwcpu, ret);
}
ret = unregister_vpa(hwcpu);
if (ret) {
pr_err("WARNING: VPA deregistration for cpu %d "
"(hw %d) failed with %d\n", cpu, hwcpu, ret);
}
}
}
static void pseries_kexec_cpu_down_mpic(int crash_shutdown, int secondary)
{
pseries_kexec_cpu_down(crash_shutdown, secondary);
mpic_teardown_this_cpu(secondary);
}
void __init setup_kexec_cpu_down_mpic(void)
{
ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_mpic;
}
static void pseries_kexec_cpu_down_xics(int crash_shutdown, int secondary)
{
pseries_kexec_cpu_down(crash_shutdown, secondary);
xics_kexec_teardown_cpu(secondary);
}
void __init setup_kexec_cpu_down_xics(void)
{
ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_xics;
}

View file

@ -0,0 +1,790 @@
/*
* pSeries_lpar.c
* Copyright (C) 2001 Todd Inglett, IBM Corporation
*
* pSeries LPAR support.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/* Enables debugging of low-level hash table routines - careful! */
#undef DEBUG
#include <linux/kernel.h>
#include <linux/dma-mapping.h>
#include <linux/console.h>
#include <linux/export.h>
#include <linux/static_key.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/machdep.h>
#include <asm/mmu_context.h>
#include <asm/iommu.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/prom.h>
#include <asm/cputable.h>
#include <asm/udbg.h>
#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/firmware.h>
#include <asm/plpar_wrappers.h>
#include <asm/fadump.h>
#include "pseries.h"
/* Flag bits for H_BULK_REMOVE */
#define HBR_REQUEST 0x4000000000000000UL
#define HBR_RESPONSE 0x8000000000000000UL
#define HBR_END 0xc000000000000000UL
#define HBR_AVPN 0x0200000000000000UL
#define HBR_ANDCOND 0x0100000000000000UL
/* in hvCall.S */
EXPORT_SYMBOL(plpar_hcall);
EXPORT_SYMBOL(plpar_hcall9);
EXPORT_SYMBOL(plpar_hcall_norets);
void vpa_init(int cpu)
{
int hwcpu = get_hard_smp_processor_id(cpu);
unsigned long addr;
long ret;
struct paca_struct *pp;
struct dtl_entry *dtl;
/*
* The spec says it "may be problematic" if CPU x registers the VPA of
* CPU y. We should never do that, but wail if we ever do.
*/
WARN_ON(cpu != smp_processor_id());
if (cpu_has_feature(CPU_FTR_ALTIVEC))
lppaca_of(cpu).vmxregs_in_use = 1;
if (cpu_has_feature(CPU_FTR_ARCH_207S))
lppaca_of(cpu).ebb_regs_in_use = 1;
addr = __pa(&lppaca_of(cpu));
ret = register_vpa(hwcpu, addr);
if (ret) {
pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
"%lx failed with %ld\n", cpu, hwcpu, addr, ret);
return;
}
/*
* PAPR says this feature is SLB-Buffer but firmware never
* reports that. All SPLPAR support SLB shadow buffer.
*/
addr = __pa(paca[cpu].slb_shadow_ptr);
if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
ret = register_slb_shadow(hwcpu, addr);
if (ret)
pr_err("WARNING: SLB shadow buffer registration for "
"cpu %d (hw %d) of area %lx failed with %ld\n",
cpu, hwcpu, addr, ret);
}
/*
* Register dispatch trace log, if one has been allocated.
*/
pp = &paca[cpu];
dtl = pp->dispatch_log;
if (dtl) {
pp->dtl_ridx = 0;
pp->dtl_curr = dtl;
lppaca_of(cpu).dtl_idx = 0;
/* hypervisor reads buffer length from this field */
dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
ret = register_dtl(hwcpu, __pa(dtl));
if (ret)
pr_err("WARNING: DTL registration of cpu %d (hw %d) "
"failed with %ld\n", smp_processor_id(),
hwcpu, ret);
lppaca_of(cpu).dtl_enable_mask = 2;
}
}
static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
unsigned long vpn, unsigned long pa,
unsigned long rflags, unsigned long vflags,
int psize, int apsize, int ssize)
{
unsigned long lpar_rc;
unsigned long flags;
unsigned long slot;
unsigned long hpte_v, hpte_r;
if (!(vflags & HPTE_V_BOLTED))
pr_devel("hpte_insert(group=%lx, vpn=%016lx, "
"pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n",
hpte_group, vpn, pa, rflags, vflags, psize);
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
if (!(vflags & HPTE_V_BOLTED))
pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
/* Now fill in the actual HPTE */
/* Set CEC cookie to 0 */
/* Zero page = 0 */
/* I-cache Invalidate = 0 */
/* I-cache synchronize = 0 */
/* Exact = 0 */
flags = 0;
/* Make pHyp happy */
if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
hpte_r &= ~HPTE_R_M;
if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
flags |= H_COALESCE_CAND;
lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
if (unlikely(lpar_rc == H_PTEG_FULL)) {
if (!(vflags & HPTE_V_BOLTED))
pr_devel(" full\n");
return -1;
}
/*
* Since we try and ioremap PHBs we don't own, the pte insert
* will fail. However we must catch the failure in hash_page
* or we will loop forever, so return -2 in this case.
*/
if (unlikely(lpar_rc != H_SUCCESS)) {
if (!(vflags & HPTE_V_BOLTED))
pr_devel(" lpar err %ld\n", lpar_rc);
return -2;
}
if (!(vflags & HPTE_V_BOLTED))
pr_devel(" -> slot: %lu\n", slot & 7);
/* Because of iSeries, we have to pass down the secondary
* bucket bit here as well
*/
return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
}
static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
{
unsigned long slot_offset;
unsigned long lpar_rc;
int i;
unsigned long dummy1, dummy2;
/* pick a random slot to start at */
slot_offset = mftb() & 0x7;
for (i = 0; i < HPTES_PER_GROUP; i++) {
/* don't remove a bolted entry */
lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
(0x1UL << 4), &dummy1, &dummy2);
if (lpar_rc == H_SUCCESS)
return i;
/*
* The test for adjunct partition is performed before the
* ANDCOND test. H_RESOURCE may be returned, so we need to
* check for that as well.
*/
BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE);
slot_offset++;
slot_offset &= 0x7;
}
return -1;
}
static void pSeries_lpar_hptab_clear(void)
{
unsigned long size_bytes = 1UL << ppc64_pft_size;
unsigned long hpte_count = size_bytes >> 4;
struct {
unsigned long pteh;
unsigned long ptel;
} ptes[4];
long lpar_rc;
unsigned long i, j;
/* Read in batches of 4,
* invalidate only valid entries not in the VRMA
* hpte_count will be a multiple of 4
*/
for (i = 0; i < hpte_count; i += 4) {
lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes);
if (lpar_rc != H_SUCCESS)
continue;
for (j = 0; j < 4; j++){
if ((ptes[j].pteh & HPTE_V_VRMA_MASK) ==
HPTE_V_VRMA_MASK)
continue;
if (ptes[j].pteh & HPTE_V_VALID)
plpar_pte_remove_raw(0, i + j, 0,
&(ptes[j].pteh), &(ptes[j].ptel));
}
}
#ifdef __LITTLE_ENDIAN__
/*
* Reset exceptions to big endian.
*
* FIXME this is a hack for kexec, we need to reset the exception
* endian before starting the new kernel and this is a convenient place
* to do it.
*
* This is also called on boot when a fadump happens. In that case we
* must not change the exception endian mode.
*/
if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active()) {
long rc;
rc = pseries_big_endian_exceptions();
/*
* At this point it is unlikely panic() will get anything
* out to the user, but at least this will stop us from
* continuing on further and creating an even more
* difficult to debug situation.
*/
if (rc)
panic("Could not enable big endian exceptions");
}
#endif
}
/*
* NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
* the low 3 bits of flags happen to line up. So no transform is needed.
* We can probably optimize here and assume the high bits of newpp are
* already zero. For now I am paranoid.
*/
static long pSeries_lpar_hpte_updatepp(unsigned long slot,
unsigned long newpp,
unsigned long vpn,
int psize, int apsize,
int ssize, int local)
{
unsigned long lpar_rc;
unsigned long flags = (newpp & 7) | H_AVPN;
unsigned long want_v;
want_v = hpte_encode_avpn(vpn, psize, ssize);
pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
want_v, slot, flags, psize);
lpar_rc = plpar_pte_protect(flags, slot, want_v);
if (lpar_rc == H_NOT_FOUND) {
pr_devel("not found !\n");
return -1;
}
pr_devel("ok\n");
BUG_ON(lpar_rc != H_SUCCESS);
return 0;
}
static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot)
{
unsigned long dword0;
unsigned long lpar_rc;
unsigned long dummy_word1;
unsigned long flags;
/* Read 1 pte at a time */
/* Do not need RPN to logical page translation */
/* No cross CEC PFT access */
flags = 0;
lpar_rc = plpar_pte_read(flags, slot, &dword0, &dummy_word1);
BUG_ON(lpar_rc != H_SUCCESS);
return dword0;
}
static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
{
unsigned long hash;
unsigned long i;
long slot;
unsigned long want_v, hpte_v;
hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
want_v = hpte_encode_avpn(vpn, psize, ssize);
/* Bolted entries are always in the primary group */
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
for (i = 0; i < HPTES_PER_GROUP; i++) {
hpte_v = pSeries_lpar_hpte_getword0(slot);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
/* HPTE matches */
return slot;
++slot;
}
return -1;
}
static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
unsigned long ea,
int psize, int ssize)
{
unsigned long vpn;
unsigned long lpar_rc, slot, vsid, flags;
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
BUG_ON(slot == -1);
flags = newpp & 7;
lpar_rc = plpar_pte_protect(flags, slot, 0);
BUG_ON(lpar_rc != H_SUCCESS);
}
static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
int psize, int apsize,
int ssize, int local)
{
unsigned long want_v;
unsigned long lpar_rc;
unsigned long dummy1, dummy2;
pr_devel(" inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
slot, vpn, psize, local);
want_v = hpte_encode_avpn(vpn, psize, ssize);
lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2);
if (lpar_rc == H_NOT_FOUND)
return;
BUG_ON(lpar_rc != H_SUCCESS);
}
/*
* Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
* to make sure that we avoid bouncing the hypervisor tlbie lock.
*/
#define PPC64_HUGE_HPTE_BATCH 12
static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
unsigned long *vpn, int count,
int psize, int ssize)
{
unsigned long param[8];
int i = 0, pix = 0, rc;
unsigned long flags = 0;
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
if (lock_tlbie)
spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
for (i = 0; i < count; i++) {
if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
ssize, 0);
} else {
param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
pix += 2;
if (pix == 8) {
rc = plpar_hcall9(H_BULK_REMOVE, param,
param[0], param[1], param[2],
param[3], param[4], param[5],
param[6], param[7]);
BUG_ON(rc != H_SUCCESS);
pix = 0;
}
}
}
if (pix) {
param[pix] = HBR_END;
rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
param[2], param[3], param[4], param[5],
param[6], param[7]);
BUG_ON(rc != H_SUCCESS);
}
if (lock_tlbie)
spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
}
static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
unsigned long addr,
unsigned char *hpte_slot_array,
int psize, int ssize)
{
int i, index = 0;
unsigned long s_addr = addr;
unsigned int max_hpte_count, valid;
unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
unsigned long shift, hidx, vpn = 0, hash, slot;
shift = mmu_psize_defs[psize].shift;
max_hpte_count = 1U << (PMD_SHIFT - shift);
for (i = 0; i < max_hpte_count; i++) {
valid = hpte_valid(hpte_slot_array, i);
if (!valid)
continue;
hidx = hpte_hash_index(hpte_slot_array, i);
/* get the vpn */
addr = s_addr + (i * (1ul << shift));
vpn = hpt_vpn(addr, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
slot_array[index] = slot;
vpn_array[index] = vpn;
if (index == PPC64_HUGE_HPTE_BATCH - 1) {
/*
* Now do a bluk invalidate
*/
__pSeries_lpar_hugepage_invalidate(slot_array,
vpn_array,
PPC64_HUGE_HPTE_BATCH,
psize, ssize);
index = 0;
} else
index++;
}
if (index)
__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
index, psize, ssize);
}
static void pSeries_lpar_hpte_removebolted(unsigned long ea,
int psize, int ssize)
{
unsigned long vpn;
unsigned long slot, vsid;
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
BUG_ON(slot == -1);
/*
* lpar doesn't use the passed actual page size
*/
pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
}
/*
* Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
* lock.
*/
static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
{
unsigned long vpn;
unsigned long i, pix, rc;
unsigned long flags = 0;
struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
unsigned long param[9];
unsigned long hash, index, shift, hidx, slot;
real_pte_t pte;
int psize, ssize;
if (lock_tlbie)
spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
psize = batch->psize;
ssize = batch->ssize;
pix = 0;
for (i = 0; i < number; i++) {
vpn = batch->vpn[i];
pte = batch->pte[i];
pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
hash = hpt_hash(vpn, shift, ssize);
hidx = __rpte_to_hidx(pte, index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
/*
* lpar doesn't use the passed actual page size
*/
pSeries_lpar_hpte_invalidate(slot, vpn, psize,
0, ssize, local);
} else {
param[pix] = HBR_REQUEST | HBR_AVPN | slot;
param[pix+1] = hpte_encode_avpn(vpn, psize,
ssize);
pix += 2;
if (pix == 8) {
rc = plpar_hcall9(H_BULK_REMOVE, param,
param[0], param[1], param[2],
param[3], param[4], param[5],
param[6], param[7]);
BUG_ON(rc != H_SUCCESS);
pix = 0;
}
}
} pte_iterate_hashed_end();
}
if (pix) {
param[pix] = HBR_END;
rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
param[2], param[3], param[4], param[5],
param[6], param[7]);
BUG_ON(rc != H_SUCCESS);
}
if (lock_tlbie)
spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
}
static int __init disable_bulk_remove(char *str)
{
if (strcmp(str, "off") == 0 &&
firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
printk(KERN_INFO "Disabling BULK_REMOVE firmware feature");
powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
}
return 1;
}
__setup("bulk_remove=", disable_bulk_remove);
void __init hpte_init_lpar(void)
{
ppc_md.hpte_invalidate = pSeries_lpar_hpte_invalidate;
ppc_md.hpte_updatepp = pSeries_lpar_hpte_updatepp;
ppc_md.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
ppc_md.hpte_insert = pSeries_lpar_hpte_insert;
ppc_md.hpte_remove = pSeries_lpar_hpte_remove;
ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted;
ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range;
ppc_md.hpte_clear_all = pSeries_lpar_hptab_clear;
ppc_md.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
}
#ifdef CONFIG_PPC_SMLPAR
#define CMO_FREE_HINT_DEFAULT 1
static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
static int __init cmo_free_hint(char *str)
{
char *parm;
parm = strstrip(str);
if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) {
printk(KERN_INFO "cmo_free_hint: CMO free page hinting is not active.\n");
cmo_free_hint_flag = 0;
return 1;
}
cmo_free_hint_flag = 1;
printk(KERN_INFO "cmo_free_hint: CMO free page hinting is active.\n");
if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0)
return 1;
return 0;
}
__setup("cmo_free_hint=", cmo_free_hint);
static void pSeries_set_page_state(struct page *page, int order,
unsigned long state)
{
int i, j;
unsigned long cmo_page_sz, addr;
cmo_page_sz = cmo_get_page_size();
addr = __pa((unsigned long)page_address(page));
for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) {
for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
}
}
void arch_free_page(struct page *page, int order)
{
if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
return;
pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
}
EXPORT_SYMBOL(arch_free_page);
#endif
#ifdef CONFIG_TRACEPOINTS
#ifdef HAVE_JUMP_LABEL
struct static_key hcall_tracepoint_key = STATIC_KEY_INIT;
void hcall_tracepoint_regfunc(void)
{
static_key_slow_inc(&hcall_tracepoint_key);
}
void hcall_tracepoint_unregfunc(void)
{
static_key_slow_dec(&hcall_tracepoint_key);
}
#else
/*
* We optimise our hcall path by placing hcall_tracepoint_refcount
* directly in the TOC so we can check if the hcall tracepoints are
* enabled via a single load.
*/
/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
extern long hcall_tracepoint_refcount;
void hcall_tracepoint_regfunc(void)
{
hcall_tracepoint_refcount++;
}
void hcall_tracepoint_unregfunc(void)
{
hcall_tracepoint_refcount--;
}
#endif
/*
* Since the tracing code might execute hcalls we need to guard against
* recursion. One example of this are spinlocks calling H_YIELD on
* shared processor partitions.
*/
static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
{
unsigned long flags;
unsigned int *depth;
/*
* We cannot call tracepoints inside RCU idle regions which
* means we must not trace H_CEDE.
*/
if (opcode == H_CEDE)
return;
local_irq_save(flags);
depth = &__get_cpu_var(hcall_trace_depth);
if (*depth)
goto out;
(*depth)++;
preempt_disable();
trace_hcall_entry(opcode, args);
(*depth)--;
out:
local_irq_restore(flags);
}
void __trace_hcall_exit(long opcode, unsigned long retval,
unsigned long *retbuf)
{
unsigned long flags;
unsigned int *depth;
if (opcode == H_CEDE)
return;
local_irq_save(flags);
depth = &__get_cpu_var(hcall_trace_depth);
if (*depth)
goto out;
(*depth)++;
trace_hcall_exit(opcode, retval, retbuf);
preempt_enable();
(*depth)--;
out:
local_irq_restore(flags);
}
#endif
/**
* h_get_mpp
* H_GET_MPP hcall returns info in 7 parms
*/
int h_get_mpp(struct hvcall_mpp_data *mpp_data)
{
int rc;
unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
rc = plpar_hcall9(H_GET_MPP, retbuf);
mpp_data->entitled_mem = retbuf[0];
mpp_data->mapped_mem = retbuf[1];
mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
mpp_data->pool_num = retbuf[2] & 0xffff;
mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL;
mpp_data->pool_size = retbuf[4];
mpp_data->loan_request = retbuf[5];
mpp_data->backing_mem = retbuf[6];
return rc;
}
EXPORT_SYMBOL(h_get_mpp);
int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
{
int rc;
unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };
rc = plpar_hcall9(H_GET_MPP_X, retbuf);
mpp_x_data->coalesced_bytes = retbuf[0];
mpp_x_data->pool_coalesced_bytes = retbuf[1];
mpp_x_data->pool_purr_cycles = retbuf[2];
mpp_x_data->pool_spurr_cycles = retbuf[3];
return rc;
}

View file

@ -0,0 +1,710 @@
/*
* PowerPC64 LPAR Configuration Information Driver
*
* Dave Engebretsen engebret@us.ibm.com
* Copyright (c) 2003 Dave Engebretsen
* Will Schmidt willschm@us.ibm.com
* SPLPAR updates, Copyright (c) 2003 Will Schmidt IBM Corporation.
* seq_file updates, Copyright (c) 2004 Will Schmidt IBM Corporation.
* Nathan Lynch nathanl@austin.ibm.com
* Added lparcfg_write, Copyright (C) 2004 Nathan Lynch IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* This driver creates a proc file at /proc/ppc64/lparcfg which contains
* keyword - value pairs that specify the configuration of the partition.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/lppaca.h>
#include <asm/hvcall.h>
#include <asm/firmware.h>
#include <asm/rtas.h>
#include <asm/time.h>
#include <asm/prom.h>
#include <asm/vdso_datapage.h>
#include <asm/vio.h>
#include <asm/mmu.h>
#include <asm/machdep.h>
/*
* This isn't a module but we expose that to userspace
* via /proc so leave the definitions here
*/
#define MODULE_VERS "1.9"
#define MODULE_NAME "lparcfg"
/* #define LPARCFG_DEBUG */
/*
* Track sum of all purrs across all processors. This is used to further
* calculate usage values by different applications
*/
static unsigned long get_purr(void)
{
unsigned long sum_purr = 0;
int cpu;
for_each_possible_cpu(cpu) {
struct cpu_usage *cu;
cu = &per_cpu(cpu_usage_array, cpu);
sum_purr += cu->current_tb;
}
return sum_purr;
}
/*
* Methods used to fetch LPAR data when running on a pSeries platform.
*/
struct hvcall_ppp_data {
u64 entitlement;
u64 unallocated_entitlement;
u16 group_num;
u16 pool_num;
u8 capped;
u8 weight;
u8 unallocated_weight;
u16 active_procs_in_pool;
u16 active_system_procs;
u16 phys_platform_procs;
u32 max_proc_cap_avail;
u32 entitled_proc_cap_avail;
};
/*
* H_GET_PPP hcall returns info in 4 parms.
* entitled_capacity,unallocated_capacity,
* aggregation, resource_capability).
*
* R4 = Entitled Processor Capacity Percentage.
* R5 = Unallocated Processor Capacity Percentage.
* R6 (AABBCCDDEEFFGGHH).
* XXXX - reserved (0)
* XXXX - reserved (0)
* XXXX - Group Number
* XXXX - Pool Number.
* R7 (IIJJKKLLMMNNOOPP).
* XX - reserved. (0)
* XX - bit 0-6 reserved (0). bit 7 is Capped indicator.
* XX - variable processor Capacity Weight
* XX - Unallocated Variable Processor Capacity Weight.
* XXXX - Active processors in Physical Processor Pool.
* XXXX - Processors active on platform.
* R8 (QQQQRRRRRRSSSSSS). if ibm,partition-performance-parameters-level >= 1
* XXXX - Physical platform procs allocated to virtualization.
* XXXXXX - Max procs capacity % available to the partitions pool.
* XXXXXX - Entitled procs capacity % available to the
* partitions pool.
*/
static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
{
unsigned long rc;
unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
rc = plpar_hcall9(H_GET_PPP, retbuf);
ppp_data->entitlement = retbuf[0];
ppp_data->unallocated_entitlement = retbuf[1];
ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
ppp_data->pool_num = retbuf[2] & 0xffff;
ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01;
ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff;
ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff;
ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff;
ppp_data->active_system_procs = retbuf[3] & 0xffff;
ppp_data->phys_platform_procs = retbuf[4] >> 6 * 8;
ppp_data->max_proc_cap_avail = (retbuf[4] >> 3 * 8) & 0xffffff;
ppp_data->entitled_proc_cap_avail = retbuf[4] & 0xffffff;
return rc;
}
static unsigned h_pic(unsigned long *pool_idle_time,
unsigned long *num_procs)
{
unsigned long rc;
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
rc = plpar_hcall(H_PIC, retbuf);
*pool_idle_time = retbuf[0];
*num_procs = retbuf[1];
return rc;
}
/*
* parse_ppp_data
* Parse out the data returned from h_get_ppp and h_pic
*/
static void parse_ppp_data(struct seq_file *m)
{
struct hvcall_ppp_data ppp_data;
struct device_node *root;
const __be32 *perf_level;
int rc;
rc = h_get_ppp(&ppp_data);
if (rc)
return;
seq_printf(m, "partition_entitled_capacity=%lld\n",
ppp_data.entitlement);
seq_printf(m, "group=%d\n", ppp_data.group_num);
seq_printf(m, "system_active_processors=%d\n",
ppp_data.active_system_procs);
/* pool related entries are appropriate for shared configs */
if (lppaca_shared_proc(get_lppaca())) {
unsigned long pool_idle_time, pool_procs;
seq_printf(m, "pool=%d\n", ppp_data.pool_num);
/* report pool_capacity in percentage */
seq_printf(m, "pool_capacity=%d\n",
ppp_data.active_procs_in_pool * 100);
h_pic(&pool_idle_time, &pool_procs);
seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
}
seq_printf(m, "unallocated_capacity_weight=%d\n",
ppp_data.unallocated_weight);
seq_printf(m, "capacity_weight=%d\n", ppp_data.weight);
seq_printf(m, "capped=%d\n", ppp_data.capped);
seq_printf(m, "unallocated_capacity=%lld\n",
ppp_data.unallocated_entitlement);
/* The last bits of information returned from h_get_ppp are only
* valid if the ibm,partition-performance-parameters-level
* property is >= 1.
*/
root = of_find_node_by_path("/");
if (root) {
perf_level = of_get_property(root,
"ibm,partition-performance-parameters-level",
NULL);
if (perf_level && (be32_to_cpup(perf_level) >= 1)) {
seq_printf(m,
"physical_procs_allocated_to_virtualization=%d\n",
ppp_data.phys_platform_procs);
seq_printf(m, "max_proc_capacity_available=%d\n",
ppp_data.max_proc_cap_avail);
seq_printf(m, "entitled_proc_capacity_available=%d\n",
ppp_data.entitled_proc_cap_avail);
}
of_node_put(root);
}
}
/**
* parse_mpp_data
* Parse out data returned from h_get_mpp
*/
static void parse_mpp_data(struct seq_file *m)
{
struct hvcall_mpp_data mpp_data;
int rc;
rc = h_get_mpp(&mpp_data);
if (rc)
return;
seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem);
if (mpp_data.mapped_mem != -1)
seq_printf(m, "mapped_entitled_memory=%ld\n",
mpp_data.mapped_mem);
seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num);
seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num);
seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight);
seq_printf(m, "unallocated_entitled_memory_weight=%d\n",
mpp_data.unallocated_mem_weight);
seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n",
mpp_data.unallocated_entitlement);
if (mpp_data.pool_size != -1)
seq_printf(m, "entitled_memory_pool_size=%ld bytes\n",
mpp_data.pool_size);
seq_printf(m, "entitled_memory_loan_request=%ld\n",
mpp_data.loan_request);
seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem);
}
/**
* parse_mpp_x_data
* Parse out data returned from h_get_mpp_x
*/
static void parse_mpp_x_data(struct seq_file *m)
{
struct hvcall_mpp_x_data mpp_x_data;
if (!firmware_has_feature(FW_FEATURE_XCMO))
return;
if (h_get_mpp_x(&mpp_x_data))
return;
seq_printf(m, "coalesced_bytes=%ld\n", mpp_x_data.coalesced_bytes);
if (mpp_x_data.pool_coalesced_bytes)
seq_printf(m, "pool_coalesced_bytes=%ld\n",
mpp_x_data.pool_coalesced_bytes);
if (mpp_x_data.pool_purr_cycles)
seq_printf(m, "coalesce_pool_purr=%ld\n", mpp_x_data.pool_purr_cycles);
if (mpp_x_data.pool_spurr_cycles)
seq_printf(m, "coalesce_pool_spurr=%ld\n", mpp_x_data.pool_spurr_cycles);
}
#define SPLPAR_CHARACTERISTICS_TOKEN 20
#define SPLPAR_MAXLENGTH 1026*(sizeof(char))
/*
* parse_system_parameter_string()
* Retrieve the potential_processors, max_entitled_capacity and friends
* through the get-system-parameter rtas call. Replace keyword strings as
* necessary.
*/
static void parse_system_parameter_string(struct seq_file *m)
{
int call_status;
unsigned char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
if (!local_buffer) {
printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
__FILE__, __func__, __LINE__);
return;
}
spin_lock(&rtas_data_buf_lock);
memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH);
call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
NULL,
SPLPAR_CHARACTERISTICS_TOKEN,
__pa(rtas_data_buf),
RTAS_DATA_BUF_SIZE);
memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH);
local_buffer[SPLPAR_MAXLENGTH - 1] = '\0';
spin_unlock(&rtas_data_buf_lock);
if (call_status != 0) {
printk(KERN_INFO
"%s %s Error calling get-system-parameter (0x%x)\n",
__FILE__, __func__, call_status);
} else {
int splpar_strlen;
int idx, w_idx;
char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
if (!workbuffer) {
printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
__FILE__, __func__, __LINE__);
kfree(local_buffer);
return;
}
#ifdef LPARCFG_DEBUG
printk(KERN_INFO "success calling get-system-parameter\n");
#endif
splpar_strlen = local_buffer[0] * 256 + local_buffer[1];
local_buffer += 2; /* step over strlen value */
w_idx = 0;
idx = 0;
while ((*local_buffer) && (idx < splpar_strlen)) {
workbuffer[w_idx++] = local_buffer[idx++];
if ((local_buffer[idx] == ',')
|| (local_buffer[idx] == '\0')) {
workbuffer[w_idx] = '\0';
if (w_idx) {
/* avoid the empty string */
seq_printf(m, "%s\n", workbuffer);
}
memset(workbuffer, 0, SPLPAR_MAXLENGTH);
idx++; /* skip the comma */
w_idx = 0;
} else if (local_buffer[idx] == '=') {
/* code here to replace workbuffer contents
with different keyword strings */
if (0 == strcmp(workbuffer, "MaxEntCap")) {
strcpy(workbuffer,
"partition_max_entitled_capacity");
w_idx = strlen(workbuffer);
}
if (0 == strcmp(workbuffer, "MaxPlatProcs")) {
strcpy(workbuffer,
"system_potential_processors");
w_idx = strlen(workbuffer);
}
}
}
kfree(workbuffer);
local_buffer -= 2; /* back up over strlen value */
}
kfree(local_buffer);
}
/* Return the number of processors in the system.
* This function reads through the device tree and counts
* the virtual processors, this does not include threads.
*/
static int lparcfg_count_active_processors(void)
{
struct device_node *cpus_dn = NULL;
int count = 0;
while ((cpus_dn = of_find_node_by_type(cpus_dn, "cpu"))) {
#ifdef LPARCFG_DEBUG
printk(KERN_ERR "cpus_dn %p\n", cpus_dn);
#endif
count++;
}
return count;
}
static void pseries_cmo_data(struct seq_file *m)
{
int cpu;
unsigned long cmo_faults = 0;
unsigned long cmo_fault_time = 0;
seq_printf(m, "cmo_enabled=%d\n", firmware_has_feature(FW_FEATURE_CMO));
if (!firmware_has_feature(FW_FEATURE_CMO))
return;
for_each_possible_cpu(cpu) {
cmo_faults += be64_to_cpu(lppaca_of(cpu).cmo_faults);
cmo_fault_time += be64_to_cpu(lppaca_of(cpu).cmo_fault_time);
}
seq_printf(m, "cmo_faults=%lu\n", cmo_faults);
seq_printf(m, "cmo_fault_time_usec=%lu\n",
cmo_fault_time / tb_ticks_per_usec);
seq_printf(m, "cmo_primary_psp=%d\n", cmo_get_primary_psp());
seq_printf(m, "cmo_secondary_psp=%d\n", cmo_get_secondary_psp());
seq_printf(m, "cmo_page_size=%lu\n", cmo_get_page_size());
}
static void splpar_dispatch_data(struct seq_file *m)
{
int cpu;
unsigned long dispatches = 0;
unsigned long dispatch_dispersions = 0;
for_each_possible_cpu(cpu) {
dispatches += be32_to_cpu(lppaca_of(cpu).yield_count);
dispatch_dispersions +=
be32_to_cpu(lppaca_of(cpu).dispersion_count);
}
seq_printf(m, "dispatches=%lu\n", dispatches);
seq_printf(m, "dispatch_dispersions=%lu\n", dispatch_dispersions);
}
static void parse_em_data(struct seq_file *m)
{
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
if (firmware_has_feature(FW_FEATURE_LPAR) &&
plpar_hcall(H_GET_EM_PARMS, retbuf) == H_SUCCESS)
seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]);
}
static int pseries_lparcfg_data(struct seq_file *m, void *v)
{
int partition_potential_processors;
int partition_active_processors;
struct device_node *rtas_node;
const __be32 *lrdrp = NULL;
rtas_node = of_find_node_by_path("/rtas");
if (rtas_node)
lrdrp = of_get_property(rtas_node, "ibm,lrdr-capacity", NULL);
if (lrdrp == NULL) {
partition_potential_processors = vdso_data->processorCount;
} else {
partition_potential_processors = be32_to_cpup(lrdrp + 4);
}
of_node_put(rtas_node);
partition_active_processors = lparcfg_count_active_processors();
if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
/* this call handles the ibm,get-system-parameter contents */
parse_system_parameter_string(m);
parse_ppp_data(m);
parse_mpp_data(m);
parse_mpp_x_data(m);
pseries_cmo_data(m);
splpar_dispatch_data(m);
seq_printf(m, "purr=%ld\n", get_purr());
} else { /* non SPLPAR case */
seq_printf(m, "system_active_processors=%d\n",
partition_potential_processors);
seq_printf(m, "system_potential_processors=%d\n",
partition_potential_processors);
seq_printf(m, "partition_max_entitled_capacity=%d\n",
partition_potential_processors * 100);
seq_printf(m, "partition_entitled_capacity=%d\n",
partition_active_processors * 100);
}
seq_printf(m, "partition_active_processors=%d\n",
partition_active_processors);
seq_printf(m, "partition_potential_processors=%d\n",
partition_potential_processors);
seq_printf(m, "shared_processor_mode=%d\n",
lppaca_shared_proc(get_lppaca()));
seq_printf(m, "slb_size=%d\n", mmu_slb_size);
parse_em_data(m);
return 0;
}
static ssize_t update_ppp(u64 *entitlement, u8 *weight)
{
struct hvcall_ppp_data ppp_data;
u8 new_weight;
u64 new_entitled;
ssize_t retval;
/* Get our current parameters */
retval = h_get_ppp(&ppp_data);
if (retval)
return retval;
if (entitlement) {
new_weight = ppp_data.weight;
new_entitled = *entitlement;
} else if (weight) {
new_weight = *weight;
new_entitled = ppp_data.entitlement;
} else
return -EINVAL;
pr_debug("%s: current_entitled = %llu, current_weight = %u\n",
__func__, ppp_data.entitlement, ppp_data.weight);
pr_debug("%s: new_entitled = %llu, new_weight = %u\n",
__func__, new_entitled, new_weight);
retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight);
return retval;
}
/**
* update_mpp
*
* Update the memory entitlement and weight for the partition. Caller must
* specify either a new entitlement or weight, not both, to be updated
* since the h_set_mpp call takes both entitlement and weight as parameters.
*/
static ssize_t update_mpp(u64 *entitlement, u8 *weight)
{
struct hvcall_mpp_data mpp_data;
u64 new_entitled;
u8 new_weight;
ssize_t rc;
if (entitlement) {
/* Check with vio to ensure the new memory entitlement
* can be handled.
*/
rc = vio_cmo_entitlement_update(*entitlement);
if (rc)
return rc;
}
rc = h_get_mpp(&mpp_data);
if (rc)
return rc;
if (entitlement) {
new_weight = mpp_data.mem_weight;
new_entitled = *entitlement;
} else if (weight) {
new_weight = *weight;
new_entitled = mpp_data.entitled_mem;
} else
return -EINVAL;
pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
__func__, mpp_data.entitled_mem, mpp_data.mem_weight);
pr_debug("%s: new_entitled = %llu, new_weight = %u\n",
__func__, new_entitled, new_weight);
rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight);
return rc;
}
/*
* Interface for changing system parameters (variable capacity weight
* and entitled capacity). Format of input is "param_name=value";
* anything after value is ignored. Valid parameters at this time are
* "partition_entitled_capacity" and "capacity_weight". We use
* H_SET_PPP to alter parameters.
*
* This function should be invoked only on systems with
* FW_FEATURE_SPLPAR.
*/
static ssize_t lparcfg_write(struct file *file, const char __user * buf,
size_t count, loff_t * off)
{
int kbuf_sz = 64;
char kbuf[kbuf_sz];
char *tmp;
u64 new_entitled, *new_entitled_ptr = &new_entitled;
u8 new_weight, *new_weight_ptr = &new_weight;
ssize_t retval;
if (!firmware_has_feature(FW_FEATURE_SPLPAR))
return -EINVAL;
if (count > kbuf_sz)
return -EINVAL;
if (copy_from_user(kbuf, buf, count))
return -EFAULT;
kbuf[count - 1] = '\0';
tmp = strchr(kbuf, '=');
if (!tmp)
return -EINVAL;
*tmp++ = '\0';
if (!strcmp(kbuf, "partition_entitled_capacity")) {
char *endp;
*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
if (endp == tmp)
return -EINVAL;
retval = update_ppp(new_entitled_ptr, NULL);
} else if (!strcmp(kbuf, "capacity_weight")) {
char *endp;
*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
if (endp == tmp)
return -EINVAL;
retval = update_ppp(NULL, new_weight_ptr);
} else if (!strcmp(kbuf, "entitled_memory")) {
char *endp;
*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
if (endp == tmp)
return -EINVAL;
retval = update_mpp(new_entitled_ptr, NULL);
} else if (!strcmp(kbuf, "entitled_memory_weight")) {
char *endp;
*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
if (endp == tmp)
return -EINVAL;
retval = update_mpp(NULL, new_weight_ptr);
} else
return -EINVAL;
if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
retval = count;
} else if (retval == H_BUSY) {
retval = -EBUSY;
} else if (retval == H_HARDWARE) {
retval = -EIO;
} else if (retval == H_PARAMETER) {
retval = -EINVAL;
}
return retval;
}
static int lparcfg_data(struct seq_file *m, void *v)
{
struct device_node *rootdn;
const char *model = "";
const char *system_id = "";
const char *tmp;
const __be32 *lp_index_ptr;
unsigned int lp_index = 0;
seq_printf(m, "%s %s\n", MODULE_NAME, MODULE_VERS);
rootdn = of_find_node_by_path("/");
if (rootdn) {
tmp = of_get_property(rootdn, "model", NULL);
if (tmp)
model = tmp;
tmp = of_get_property(rootdn, "system-id", NULL);
if (tmp)
system_id = tmp;
lp_index_ptr = of_get_property(rootdn, "ibm,partition-no",
NULL);
if (lp_index_ptr)
lp_index = be32_to_cpup(lp_index_ptr);
of_node_put(rootdn);
}
seq_printf(m, "serial_number=%s\n", system_id);
seq_printf(m, "system_type=%s\n", model);
seq_printf(m, "partition_id=%d\n", (int)lp_index);
return pseries_lparcfg_data(m, v);
}
static int lparcfg_open(struct inode *inode, struct file *file)
{
return single_open(file, lparcfg_data, NULL);
}
static const struct file_operations lparcfg_fops = {
.read = seq_read,
.write = lparcfg_write,
.open = lparcfg_open,
.release = single_release,
.llseek = seq_lseek,
};
static int __init lparcfg_init(void)
{
umode_t mode = S_IRUSR | S_IRGRP | S_IROTH;
/* Allow writing if we have FW_FEATURE_SPLPAR */
if (firmware_has_feature(FW_FEATURE_SPLPAR))
mode |= S_IWUSR;
if (!proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops)) {
printk(KERN_ERR "Failed to create powerpc/lparcfg\n");
return -EIO;
}
return 0;
}
machine_device_initcall(pseries, lparcfg_init);

View file

@ -0,0 +1,368 @@
/*
* Support for Partition Mobility/Migration
*
* Copyright (C) 2010 Nathan Fontenot
* Copyright (C) 2010 IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/smp.h>
#include <linux/stat.h>
#include <linux/completion.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <asm/machdep.h>
#include <asm/rtas.h>
#include "pseries.h"
static struct kobject *mobility_kobj;
struct update_props_workarea {
__be32 phandle;
__be32 state;
__be64 reserved;
__be32 nprops;
} __packed;
#define NODE_ACTION_MASK 0xff000000
#define NODE_COUNT_MASK 0x00ffffff
#define DELETE_DT_NODE 0x01000000
#define UPDATE_DT_NODE 0x02000000
#define ADD_DT_NODE 0x03000000
#define MIGRATION_SCOPE (1)
static int mobility_rtas_call(int token, char *buf, s32 scope)
{
int rc;
spin_lock(&rtas_data_buf_lock);
memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
spin_unlock(&rtas_data_buf_lock);
return rc;
}
static int delete_dt_node(__be32 phandle)
{
struct device_node *dn;
dn = of_find_node_by_phandle(be32_to_cpu(phandle));
if (!dn)
return -ENOENT;
dlpar_detach_node(dn);
of_node_put(dn);
return 0;
}
static int update_dt_property(struct device_node *dn, struct property **prop,
const char *name, u32 vd, char *value)
{
struct property *new_prop = *prop;
int more = 0;
/* A negative 'vd' value indicates that only part of the new property
* value is contained in the buffer and we need to call
* ibm,update-properties again to get the rest of the value.
*
* A negative value is also the two's compliment of the actual value.
*/
if (vd & 0x80000000) {
vd = ~vd + 1;
more = 1;
}
if (new_prop) {
/* partial property fixup */
char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
if (!new_data)
return -ENOMEM;
memcpy(new_data, new_prop->value, new_prop->length);
memcpy(new_data + new_prop->length, value, vd);
kfree(new_prop->value);
new_prop->value = new_data;
new_prop->length += vd;
} else {
new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
if (!new_prop)
return -ENOMEM;
new_prop->name = kstrdup(name, GFP_KERNEL);
if (!new_prop->name) {
kfree(new_prop);
return -ENOMEM;
}
new_prop->length = vd;
new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
if (!new_prop->value) {
kfree(new_prop->name);
kfree(new_prop);
return -ENOMEM;
}
memcpy(new_prop->value, value, vd);
*prop = new_prop;
}
if (!more) {
of_update_property(dn, new_prop);
*prop = NULL;
}
return 0;
}
static int update_dt_node(__be32 phandle, s32 scope)
{
struct update_props_workarea *upwa;
struct device_node *dn;
struct property *prop = NULL;
int i, rc, rtas_rc;
char *prop_data;
char *rtas_buf;
int update_properties_token;
u32 nprops;
u32 vd;
update_properties_token = rtas_token("ibm,update-properties");
if (update_properties_token == RTAS_UNKNOWN_SERVICE)
return -EINVAL;
rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
if (!rtas_buf)
return -ENOMEM;
dn = of_find_node_by_phandle(be32_to_cpu(phandle));
if (!dn) {
kfree(rtas_buf);
return -ENOENT;
}
upwa = (struct update_props_workarea *)&rtas_buf[0];
upwa->phandle = phandle;
do {
rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf,
scope);
if (rtas_rc < 0)
break;
prop_data = rtas_buf + sizeof(*upwa);
nprops = be32_to_cpu(upwa->nprops);
/* On the first call to ibm,update-properties for a node the
* the first property value descriptor contains an empty
* property name, the property value length encoded as u32,
* and the property value is the node path being updated.
*/
if (*prop_data == 0) {
prop_data++;
vd = be32_to_cpu(*(__be32 *)prop_data);
prop_data += vd + sizeof(vd);
nprops--;
}
for (i = 0; i < nprops; i++) {
char *prop_name;
prop_name = prop_data;
prop_data += strlen(prop_name) + 1;
vd = be32_to_cpu(*(__be32 *)prop_data);
prop_data += sizeof(vd);
switch (vd) {
case 0x00000000:
/* name only property, nothing to do */
break;
case 0x80000000:
prop = of_find_property(dn, prop_name, NULL);
of_remove_property(dn, prop);
prop = NULL;
break;
default:
rc = update_dt_property(dn, &prop, prop_name,
vd, prop_data);
if (rc) {
printk(KERN_ERR "Could not update %s"
" property\n", prop_name);
}
prop_data += vd;
}
}
} while (rtas_rc == 1);
of_node_put(dn);
kfree(rtas_buf);
return 0;
}
static int add_dt_node(__be32 parent_phandle, __be32 drc_index)
{
struct device_node *dn;
struct device_node *parent_dn;
int rc;
parent_dn = of_find_node_by_phandle(be32_to_cpu(parent_phandle));
if (!parent_dn)
return -ENOENT;
dn = dlpar_configure_connector(drc_index, parent_dn);
if (!dn)
return -ENOENT;
rc = dlpar_attach_node(dn);
if (rc)
dlpar_free_cc_nodes(dn);
of_node_put(parent_dn);
return rc;
}
int pseries_devicetree_update(s32 scope)
{
char *rtas_buf;
__be32 *data;
int update_nodes_token;
int rc;
update_nodes_token = rtas_token("ibm,update-nodes");
if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
return -EINVAL;
rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
if (!rtas_buf)
return -ENOMEM;
do {
rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
if (rc && rc != 1)
break;
data = (__be32 *)rtas_buf + 4;
while (be32_to_cpu(*data) & NODE_ACTION_MASK) {
int i;
u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;
u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;
data++;
for (i = 0; i < node_count; i++) {
__be32 phandle = *data++;
__be32 drc_index;
switch (action) {
case DELETE_DT_NODE:
delete_dt_node(phandle);
break;
case UPDATE_DT_NODE:
update_dt_node(phandle, scope);
break;
case ADD_DT_NODE:
drc_index = *data++;
add_dt_node(phandle, drc_index);
break;
}
}
}
} while (rc == 1);
kfree(rtas_buf);
return rc;
}
void post_mobility_fixup(void)
{
int rc;
int activate_fw_token;
activate_fw_token = rtas_token("ibm,activate-firmware");
if (activate_fw_token == RTAS_UNKNOWN_SERVICE) {
printk(KERN_ERR "Could not make post-mobility "
"activate-fw call.\n");
return;
}
do {
rc = rtas_call(activate_fw_token, 0, 1, NULL);
} while (rtas_busy_delay(rc));
if (rc)
printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc);
rc = pseries_devicetree_update(MIGRATION_SCOPE);
if (rc)
printk(KERN_ERR "Post-mobility device tree update "
"failed: %d\n", rc);
return;
}
static ssize_t migrate_store(struct class *class, struct class_attribute *attr,
const char *buf, size_t count)
{
struct rtas_args args;
u64 streamid;
int rc;
rc = kstrtou64(buf, 0, &streamid);
if (rc)
return rc;
memset(&args, 0, sizeof(args));
args.token = rtas_token("ibm,suspend-me");
args.nargs = 2;
args.nret = 1;
args.args[0] = streamid >> 32 ;
args.args[1] = streamid & 0xffffffff;
args.rets = &args.args[args.nargs];
do {
args.rets[0] = 0;
rc = rtas_ibm_suspend_me(&args);
if (!rc && args.rets[0] == RTAS_NOT_SUSPENDABLE)
ssleep(1);
} while (!rc && args.rets[0] == RTAS_NOT_SUSPENDABLE);
if (rc)
return rc;
else if (args.rets[0])
return args.rets[0];
post_mobility_fixup();
return count;
}
static CLASS_ATTR(migration, S_IWUSR, NULL, migrate_store);
static int __init mobility_sysfs_init(void)
{
int rc;
mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
if (!mobility_kobj)
return -ENOMEM;
rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
return rc;
}
machine_device_initcall(pseries, mobility_sysfs_init);

View file

@ -0,0 +1,526 @@
/*
* Copyright 2006 Jake Moilanen <moilanen@austin.ibm.com>, IBM Corp.
* Copyright 2006-2007 Michael Ellerman, IBM Corp.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2 of the
* License.
*
*/
#include <linux/device.h>
#include <linux/irq.h>
#include <linux/msi.h>
#include <asm/rtas.h>
#include <asm/hw_irq.h>
#include <asm/ppc-pci.h>
#include <asm/machdep.h>
static int query_token, change_token;
#define RTAS_QUERY_FN 0
#define RTAS_CHANGE_FN 1
#define RTAS_RESET_FN 2
#define RTAS_CHANGE_MSI_FN 3
#define RTAS_CHANGE_MSIX_FN 4
#define RTAS_CHANGE_32MSI_FN 5
/* RTAS Helpers */
static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs)
{
u32 addr, seq_num, rtas_ret[3];
unsigned long buid;
int rc;
addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
buid = pdn->phb->buid;
seq_num = 1;
do {
if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN ||
func == RTAS_CHANGE_32MSI_FN)
rc = rtas_call(change_token, 6, 4, rtas_ret, addr,
BUID_HI(buid), BUID_LO(buid),
func, num_irqs, seq_num);
else
rc = rtas_call(change_token, 6, 3, rtas_ret, addr,
BUID_HI(buid), BUID_LO(buid),
func, num_irqs, seq_num);
seq_num = rtas_ret[1];
} while (rtas_busy_delay(rc));
/*
* If the RTAS call succeeded, return the number of irqs allocated.
* If not, make sure we return a negative error code.
*/
if (rc == 0)
rc = rtas_ret[0];
else if (rc > 0)
rc = -rc;
pr_debug("rtas_msi: ibm,change_msi(func=%d,num=%d), got %d rc = %d\n",
func, num_irqs, rtas_ret[0], rc);
return rc;
}
static void rtas_disable_msi(struct pci_dev *pdev)
{
struct pci_dn *pdn;
pdn = pci_get_pdn(pdev);
if (!pdn)
return;
/*
* disabling MSI with the explicit interface also disables MSI-X
*/
if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) {
/*
* may have failed because explicit interface is not
* present
*/
if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) {
pr_debug("rtas_msi: Setting MSIs to 0 failed!\n");
}
}
}
static int rtas_query_irq_number(struct pci_dn *pdn, int offset)
{
u32 addr, rtas_ret[2];
unsigned long buid;
int rc;
addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
buid = pdn->phb->buid;
do {
rc = rtas_call(query_token, 4, 3, rtas_ret, addr,
BUID_HI(buid), BUID_LO(buid), offset);
} while (rtas_busy_delay(rc));
if (rc) {
pr_debug("rtas_msi: error (%d) querying source number\n", rc);
return rc;
}
return rtas_ret[0];
}
static void rtas_teardown_msi_irqs(struct pci_dev *pdev)
{
struct msi_desc *entry;
list_for_each_entry(entry, &pdev->msi_list, list) {
if (entry->irq == NO_IRQ)
continue;
irq_set_msi_desc(entry->irq, NULL);
irq_dispose_mapping(entry->irq);
}
rtas_disable_msi(pdev);
}
static int check_req(struct pci_dev *pdev, int nvec, char *prop_name)
{
struct device_node *dn;
struct pci_dn *pdn;
const __be32 *p;
u32 req_msi;
pdn = pci_get_pdn(pdev);
if (!pdn)
return -ENODEV;
dn = pdn->node;
p = of_get_property(dn, prop_name, NULL);
if (!p) {
pr_debug("rtas_msi: No %s on %s\n", prop_name, dn->full_name);
return -ENOENT;
}
req_msi = be32_to_cpup(p);
if (req_msi < nvec) {
pr_debug("rtas_msi: %s requests < %d MSIs\n", prop_name, nvec);
if (req_msi == 0) /* Be paranoid */
return -ENOSPC;
return req_msi;
}
return 0;
}
static int check_req_msi(struct pci_dev *pdev, int nvec)
{
return check_req(pdev, nvec, "ibm,req#msi");
}
static int check_req_msix(struct pci_dev *pdev, int nvec)
{
return check_req(pdev, nvec, "ibm,req#msi-x");
}
/* Quota calculation */
static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
{
struct device_node *dn;
const __be32 *p;
dn = of_node_get(pci_device_to_OF_node(dev));
while (dn) {
p = of_get_property(dn, "ibm,pe-total-#msi", NULL);
if (p) {
pr_debug("rtas_msi: found prop on dn %s\n",
dn->full_name);
*total = be32_to_cpup(p);
return dn;
}
dn = of_get_next_parent(dn);
}
return NULL;
}
static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
{
struct device_node *dn;
struct eeh_dev *edev;
/* Found our PE and assume 8 at that point. */
dn = pci_device_to_OF_node(dev);
if (!dn)
return NULL;
/* Get the top level device in the PE */
edev = of_node_to_eeh_dev(dn);
if (edev->pe)
edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list);
dn = eeh_dev_to_of_node(edev);
if (!dn)
return NULL;
/* We actually want the parent */
dn = of_get_parent(dn);
if (!dn)
return NULL;
/* Hardcode of 8 for old firmwares */
*total = 8;
pr_debug("rtas_msi: using PE dn %s\n", dn->full_name);
return dn;
}
struct msi_counts {
struct device_node *requestor;
int num_devices;
int request;
int quota;
int spare;
int over_quota;
};
static void *count_non_bridge_devices(struct device_node *dn, void *data)
{
struct msi_counts *counts = data;
const __be32 *p;
u32 class;
pr_debug("rtas_msi: counting %s\n", dn->full_name);
p = of_get_property(dn, "class-code", NULL);
class = p ? be32_to_cpup(p) : 0;
if ((class >> 8) != PCI_CLASS_BRIDGE_PCI)
counts->num_devices++;
return NULL;
}
static void *count_spare_msis(struct device_node *dn, void *data)
{
struct msi_counts *counts = data;
const __be32 *p;
int req;
if (dn == counts->requestor)
req = counts->request;
else {
/* We don't know if a driver will try to use MSI or MSI-X,
* so we just have to punt and use the larger of the two. */
req = 0;
p = of_get_property(dn, "ibm,req#msi", NULL);
if (p)
req = be32_to_cpup(p);
p = of_get_property(dn, "ibm,req#msi-x", NULL);
if (p)
req = max(req, (int)be32_to_cpup(p));
}
if (req < counts->quota)
counts->spare += counts->quota - req;
else if (req > counts->quota)
counts->over_quota++;
return NULL;
}
static int msi_quota_for_device(struct pci_dev *dev, int request)
{
struct device_node *pe_dn;
struct msi_counts counts;
int total;
pr_debug("rtas_msi: calc quota for %s, request %d\n", pci_name(dev),
request);
pe_dn = find_pe_total_msi(dev, &total);
if (!pe_dn)
pe_dn = find_pe_dn(dev, &total);
if (!pe_dn) {
pr_err("rtas_msi: couldn't find PE for %s\n", pci_name(dev));
goto out;
}
pr_debug("rtas_msi: found PE %s\n", pe_dn->full_name);
memset(&counts, 0, sizeof(struct msi_counts));
/* Work out how many devices we have below this PE */
traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts);
if (counts.num_devices == 0) {
pr_err("rtas_msi: found 0 devices under PE for %s\n",
pci_name(dev));
goto out;
}
counts.quota = total / counts.num_devices;
if (request <= counts.quota)
goto out;
/* else, we have some more calculating to do */
counts.requestor = pci_device_to_OF_node(dev);
counts.request = request;
traverse_pci_devices(pe_dn, count_spare_msis, &counts);
/* If the quota isn't an integer multiple of the total, we can
* use the remainder as spare MSIs for anyone that wants them. */
counts.spare += total % counts.num_devices;
/* Divide any spare by the number of over-quota requestors */
if (counts.over_quota)
counts.quota += counts.spare / counts.over_quota;
/* And finally clamp the request to the possibly adjusted quota */
request = min(counts.quota, request);
pr_debug("rtas_msi: request clamped to quota %d\n", request);
out:
of_node_put(pe_dn);
return request;
}
static int check_msix_entries(struct pci_dev *pdev)
{
struct msi_desc *entry;
int expected;
/* There's no way for us to express to firmware that we want
* a discontiguous, or non-zero based, range of MSI-X entries.
* So we must reject such requests. */
expected = 0;
list_for_each_entry(entry, &pdev->msi_list, list) {
if (entry->msi_attrib.entry_nr != expected) {
pr_debug("rtas_msi: bad MSI-X entries.\n");
return -EINVAL;
}
expected++;
}
return 0;
}
static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev)
{
u32 addr_hi, addr_lo;
/*
* We should only get in here for IODA1 configs. This is based on the
* fact that we using RTAS for MSIs, we don't have the 32 bit MSI RTAS
* support, and we are in a PCIe Gen2 slot.
*/
dev_info(&pdev->dev,
"rtas_msi: No 32 bit MSI firmware support, forcing 32 bit MSI\n");
pci_read_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, &addr_hi);
addr_lo = 0xffff0000 | ((addr_hi >> (48 - 32)) << 4);
pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, addr_lo);
pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0);
}
static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
{
struct pci_dn *pdn;
int hwirq, virq, i, quota, rc;
struct msi_desc *entry;
struct msi_msg msg;
int nvec = nvec_in;
int use_32bit_msi_hack = 0;
if (type == PCI_CAP_ID_MSIX)
rc = check_req_msix(pdev, nvec);
else
rc = check_req_msi(pdev, nvec);
if (rc)
return rc;
quota = msi_quota_for_device(pdev, nvec);
if (quota && quota < nvec)
return quota;
if (type == PCI_CAP_ID_MSIX && check_msix_entries(pdev))
return -EINVAL;
/*
* Firmware currently refuse any non power of two allocation
* so we round up if the quota will allow it.
*/
if (type == PCI_CAP_ID_MSIX) {
int m = roundup_pow_of_two(nvec);
quota = msi_quota_for_device(pdev, m);
if (quota >= m)
nvec = m;
}
pdn = pci_get_pdn(pdev);
/*
* Try the new more explicit firmware interface, if that fails fall
* back to the old interface. The old interface is known to never
* return MSI-Xs.
*/
again:
if (type == PCI_CAP_ID_MSI) {
if (pdev->no_64bit_msi) {
rc = rtas_change_msi(pdn, RTAS_CHANGE_32MSI_FN, nvec);
if (rc < 0) {
/*
* We only want to run the 32 bit MSI hack below if
* the max bus speed is Gen2 speed
*/
if (pdev->bus->max_bus_speed != PCIE_SPEED_5_0GT)
return rc;
use_32bit_msi_hack = 1;
}
} else
rc = -1;
if (rc < 0)
rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec);
if (rc < 0) {
pr_debug("rtas_msi: trying the old firmware call.\n");
rc = rtas_change_msi(pdn, RTAS_CHANGE_FN, nvec);
}
if (use_32bit_msi_hack && rc > 0)
rtas_hack_32bit_msi_gen2(pdev);
} else
rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec);
if (rc != nvec) {
if (nvec != nvec_in) {
nvec = nvec_in;
goto again;
}
pr_debug("rtas_msi: rtas_change_msi() failed\n");
return rc;
}
i = 0;
list_for_each_entry(entry, &pdev->msi_list, list) {
hwirq = rtas_query_irq_number(pdn, i++);
if (hwirq < 0) {
pr_debug("rtas_msi: error (%d) getting hwirq\n", rc);
return hwirq;
}
virq = irq_create_mapping(NULL, hwirq);
if (virq == NO_IRQ) {
pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq);
return -ENOSPC;
}
dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq);
irq_set_msi_desc(virq, entry);
/* Read config space back so we can restore after reset */
__read_msi_msg(entry, &msg);
entry->msg = msg;
}
return 0;
}
static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
{
/* No LSI -> leave MSIs (if any) configured */
if (pdev->irq == NO_IRQ) {
dev_dbg(&pdev->dev, "rtas_msi: no LSI, nothing to do.\n");
return;
}
/* No MSI -> MSIs can't have been assigned by fw, leave LSI */
if (check_req_msi(pdev, 1) && check_req_msix(pdev, 1)) {
dev_dbg(&pdev->dev, "rtas_msi: no req#msi/x, nothing to do.\n");
return;
}
dev_dbg(&pdev->dev, "rtas_msi: disabling existing MSI.\n");
rtas_disable_msi(pdev);
}
static int rtas_msi_init(void)
{
query_token = rtas_token("ibm,query-interrupt-source-number");
change_token = rtas_token("ibm,change-msi");
if ((query_token == RTAS_UNKNOWN_SERVICE) ||
(change_token == RTAS_UNKNOWN_SERVICE)) {
pr_debug("rtas_msi: no RTAS tokens, no MSI support.\n");
return -1;
}
pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n");
WARN_ON(ppc_md.setup_msi_irqs);
ppc_md.setup_msi_irqs = rtas_setup_msi_irqs;
ppc_md.teardown_msi_irqs = rtas_teardown_msi_irqs;
WARN_ON(ppc_md.pci_irq_fixup);
ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup;
return 0;
}
machine_arch_initcall(pseries, rtas_msi_init);

View file

@ -0,0 +1,903 @@
/*
* c 2001 PPC 64 Team, IBM Corp
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* /dev/nvram driver for PPC64
*
* This perhaps should live in drivers/char
*/
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/kmsg_dump.h>
#include <linux/pstore.h>
#include <linux/ctype.h>
#include <linux/zlib.h>
#include <asm/uaccess.h>
#include <asm/nvram.h>
#include <asm/rtas.h>
#include <asm/prom.h>
#include <asm/machdep.h>
/* Max bytes to read/write in one go */
#define NVRW_CNT 0x20
/*
* Set oops header version to distinguish between old and new format header.
* lnx,oops-log partition max size is 4000, header version > 4000 will
* help in identifying new header.
*/
#define OOPS_HDR_VERSION 5000
static unsigned int nvram_size;
static int nvram_fetch, nvram_store;
static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */
static DEFINE_SPINLOCK(nvram_lock);
struct err_log_info {
__be32 error_type;
__be32 seq_num;
};
struct nvram_os_partition {
const char *name;
int req_size; /* desired size, in bytes */
int min_size; /* minimum acceptable size (0 means req_size) */
long size; /* size of data portion (excluding err_log_info) */
long index; /* offset of data portion of partition */
bool os_partition; /* partition initialized by OS, not FW */
};
static struct nvram_os_partition rtas_log_partition = {
.name = "ibm,rtas-log",
.req_size = 2079,
.min_size = 1055,
.index = -1,
.os_partition = true
};
static struct nvram_os_partition oops_log_partition = {
.name = "lnx,oops-log",
.req_size = 4000,
.min_size = 2000,
.index = -1,
.os_partition = true
};
static const char *pseries_nvram_os_partitions[] = {
"ibm,rtas-log",
"lnx,oops-log",
NULL
};
struct oops_log_info {
__be16 version;
__be16 report_length;
__be64 timestamp;
} __attribute__((packed));
static void oops_to_nvram(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason);
static struct kmsg_dumper nvram_kmsg_dumper = {
.dump = oops_to_nvram
};
/* See clobbering_unread_rtas_event() */
#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */
static unsigned long last_unread_rtas_event; /* timestamp */
/*
* For capturing and compressing an oops or panic report...
* big_oops_buf[] holds the uncompressed text we're capturing.
*
* oops_buf[] holds the compressed text, preceded by a oops header.
* oops header has u16 holding the version of oops header (to differentiate
* between old and new format header) followed by u16 holding the length of
* the compressed* text (*Or uncompressed, if compression fails.) and u64
* holding the timestamp. oops_buf[] gets written to NVRAM.
*
* oops_log_info points to the header. oops_data points to the compressed text.
*
* +- oops_buf
* | +- oops_data
* v v
* +-----------+-----------+-----------+------------------------+
* | version | length | timestamp | text |
* | (2 bytes) | (2 bytes) | (8 bytes) | (oops_data_sz bytes) |
* +-----------+-----------+-----------+------------------------+
* ^
* +- oops_log_info
*
* We preallocate these buffers during init to avoid kmalloc during oops/panic.
*/
static size_t big_oops_buf_sz;
static char *big_oops_buf, *oops_buf;
static char *oops_data;
static size_t oops_data_sz;
/* Compression parameters */
#define COMPR_LEVEL 6
#define WINDOW_BITS 12
#define MEM_LEVEL 4
static struct z_stream_s stream;
#ifdef CONFIG_PSTORE
static struct nvram_os_partition of_config_partition = {
.name = "of-config",
.index = -1,
.os_partition = false
};
static struct nvram_os_partition common_partition = {
.name = "common",
.index = -1,
.os_partition = false
};
static enum pstore_type_id nvram_type_ids[] = {
PSTORE_TYPE_DMESG,
PSTORE_TYPE_PPC_RTAS,
PSTORE_TYPE_PPC_OF,
PSTORE_TYPE_PPC_COMMON,
-1
};
static int read_type;
static unsigned long last_rtas_event;
#endif
static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
{
unsigned int i;
unsigned long len;
int done;
unsigned long flags;
char *p = buf;
if (nvram_size == 0 || nvram_fetch == RTAS_UNKNOWN_SERVICE)
return -ENODEV;
if (*index >= nvram_size)
return 0;
i = *index;
if (i + count > nvram_size)
count = nvram_size - i;
spin_lock_irqsave(&nvram_lock, flags);
for (; count != 0; count -= len) {
len = count;
if (len > NVRW_CNT)
len = NVRW_CNT;
if ((rtas_call(nvram_fetch, 3, 2, &done, i, __pa(nvram_buf),
len) != 0) || len != done) {
spin_unlock_irqrestore(&nvram_lock, flags);
return -EIO;
}
memcpy(p, nvram_buf, len);
p += len;
i += len;
}
spin_unlock_irqrestore(&nvram_lock, flags);
*index = i;
return p - buf;
}
static ssize_t pSeries_nvram_write(char *buf, size_t count, loff_t *index)
{
unsigned int i;
unsigned long len;
int done;
unsigned long flags;
const char *p = buf;
if (nvram_size == 0 || nvram_store == RTAS_UNKNOWN_SERVICE)
return -ENODEV;
if (*index >= nvram_size)
return 0;
i = *index;
if (i + count > nvram_size)
count = nvram_size - i;
spin_lock_irqsave(&nvram_lock, flags);
for (; count != 0; count -= len) {
len = count;
if (len > NVRW_CNT)
len = NVRW_CNT;
memcpy(nvram_buf, p, len);
if ((rtas_call(nvram_store, 3, 2, &done, i, __pa(nvram_buf),
len) != 0) || len != done) {
spin_unlock_irqrestore(&nvram_lock, flags);
return -EIO;
}
p += len;
i += len;
}
spin_unlock_irqrestore(&nvram_lock, flags);
*index = i;
return p - buf;
}
static ssize_t pSeries_nvram_get_size(void)
{
return nvram_size ? nvram_size : -ENODEV;
}
/* nvram_write_os_partition, nvram_write_error_log
*
* We need to buffer the error logs into nvram to ensure that we have
* the failure information to decode. If we have a severe error there
* is no way to guarantee that the OS or the machine is in a state to
* get back to user land and write the error to disk. For example if
* the SCSI device driver causes a Machine Check by writing to a bad
* IO address, there is no way of guaranteeing that the device driver
* is in any state that is would also be able to write the error data
* captured to disk, thus we buffer it in NVRAM for analysis on the
* next boot.
*
* In NVRAM the partition containing the error log buffer will looks like:
* Header (in bytes):
* +-----------+----------+--------+------------+------------------+
* | signature | checksum | length | name | data |
* |0 |1 |2 3|4 15|16 length-1|
* +-----------+----------+--------+------------+------------------+
*
* The 'data' section would look like (in bytes):
* +--------------+------------+-----------------------------------+
* | event_logged | sequence # | error log |
* |0 3|4 7|8 error_log_size-1|
* +--------------+------------+-----------------------------------+
*
* event_logged: 0 if event has not been logged to syslog, 1 if it has
* sequence #: The unique sequence # for each event. (until it wraps)
* error log: The error log from event_scan
*/
static int nvram_write_os_partition(struct nvram_os_partition *part,
char *buff, int length,
unsigned int err_type,
unsigned int error_log_cnt)
{
int rc;
loff_t tmp_index;
struct err_log_info info;
if (part->index == -1) {
return -ESPIPE;
}
if (length > part->size) {
length = part->size;
}
info.error_type = cpu_to_be32(err_type);
info.seq_num = cpu_to_be32(error_log_cnt);
tmp_index = part->index;
rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index);
if (rc <= 0) {
pr_err("%s: Failed nvram_write (%d)\n", __func__, rc);
return rc;
}
rc = ppc_md.nvram_write(buff, length, &tmp_index);
if (rc <= 0) {
pr_err("%s: Failed nvram_write (%d)\n", __func__, rc);
return rc;
}
return 0;
}
int nvram_write_error_log(char * buff, int length,
unsigned int err_type, unsigned int error_log_cnt)
{
int rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
err_type, error_log_cnt);
if (!rc) {
last_unread_rtas_event = get_seconds();
#ifdef CONFIG_PSTORE
last_rtas_event = get_seconds();
#endif
}
return rc;
}
/* nvram_read_partition
*
* Reads nvram partition for at most 'length'
*/
static int nvram_read_partition(struct nvram_os_partition *part, char *buff,
int length, unsigned int *err_type,
unsigned int *error_log_cnt)
{
int rc;
loff_t tmp_index;
struct err_log_info info;
if (part->index == -1)
return -1;
if (length > part->size)
length = part->size;
tmp_index = part->index;
if (part->os_partition) {
rc = ppc_md.nvram_read((char *)&info,
sizeof(struct err_log_info),
&tmp_index);
if (rc <= 0) {
pr_err("%s: Failed nvram_read (%d)\n", __func__, rc);
return rc;
}
}
rc = ppc_md.nvram_read(buff, length, &tmp_index);
if (rc <= 0) {
pr_err("%s: Failed nvram_read (%d)\n", __func__, rc);
return rc;
}
if (part->os_partition) {
*error_log_cnt = be32_to_cpu(info.seq_num);
*err_type = be32_to_cpu(info.error_type);
}
return 0;
}
/* nvram_read_error_log
*
* Reads nvram for error log for at most 'length'
*/
int nvram_read_error_log(char *buff, int length,
unsigned int *err_type, unsigned int *error_log_cnt)
{
return nvram_read_partition(&rtas_log_partition, buff, length,
err_type, error_log_cnt);
}
/* This doesn't actually zero anything, but it sets the event_logged
* word to tell that this event is safely in syslog.
*/
int nvram_clear_error_log(void)
{
loff_t tmp_index;
int clear_word = ERR_FLAG_ALREADY_LOGGED;
int rc;
if (rtas_log_partition.index == -1)
return -1;
tmp_index = rtas_log_partition.index;
rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index);
if (rc <= 0) {
printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc);
return rc;
}
last_unread_rtas_event = 0;
return 0;
}
/* pseries_nvram_init_os_partition
*
* This sets up a partition with an "OS" signature.
*
* The general strategy is the following:
* 1.) If a partition with the indicated name already exists...
* - If it's large enough, use it.
* - Otherwise, recycle it and keep going.
* 2.) Search for a free partition that is large enough.
* 3.) If there's not a free partition large enough, recycle any obsolete
* OS partitions and try again.
* 4.) Will first try getting a chunk that will satisfy the requested size.
* 5.) If a chunk of the requested size cannot be allocated, then try finding
* a chunk that will satisfy the minum needed.
*
* Returns 0 on success, else -1.
*/
static int __init pseries_nvram_init_os_partition(struct nvram_os_partition
*part)
{
loff_t p;
int size;
/* Look for ours */
p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size);
/* Found one but too small, remove it */
if (p && size < part->min_size) {
pr_info("nvram: Found too small %s partition,"
" removing it...\n", part->name);
nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL);
p = 0;
}
/* Create one if we didn't find */
if (!p) {
p = nvram_create_partition(part->name, NVRAM_SIG_OS,
part->req_size, part->min_size);
if (p == -ENOSPC) {
pr_info("nvram: No room to create %s partition, "
"deleting any obsolete OS partitions...\n",
part->name);
nvram_remove_partition(NULL, NVRAM_SIG_OS,
pseries_nvram_os_partitions);
p = nvram_create_partition(part->name, NVRAM_SIG_OS,
part->req_size, part->min_size);
}
}
if (p <= 0) {
pr_err("nvram: Failed to find or create %s"
" partition, err %d\n", part->name, (int)p);
return -1;
}
part->index = p;
part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info);
return 0;
}
/*
* Are we using the ibm,rtas-log for oops/panic reports? And if so,
* would logging this oops/panic overwrite an RTAS event that rtas_errd
* hasn't had a chance to read and process? Return 1 if so, else 0.
*
* We assume that if rtas_errd hasn't read the RTAS event in
* NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
*/
static int clobbering_unread_rtas_event(void)
{
return (oops_log_partition.index == rtas_log_partition.index
&& last_unread_rtas_event
&& get_seconds() - last_unread_rtas_event <=
NVRAM_RTAS_READ_TIMEOUT);
}
/* Derived from logfs_compress() */
static int nvram_compress(const void *in, void *out, size_t inlen,
size_t outlen)
{
int err, ret;
ret = -EIO;
err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
MEM_LEVEL, Z_DEFAULT_STRATEGY);
if (err != Z_OK)
goto error;
stream.next_in = in;
stream.avail_in = inlen;
stream.total_in = 0;
stream.next_out = out;
stream.avail_out = outlen;
stream.total_out = 0;
err = zlib_deflate(&stream, Z_FINISH);
if (err != Z_STREAM_END)
goto error;
err = zlib_deflateEnd(&stream);
if (err != Z_OK)
goto error;
if (stream.total_out >= stream.total_in)
goto error;
ret = stream.total_out;
error:
return ret;
}
/* Compress the text from big_oops_buf into oops_buf. */
static int zip_oops(size_t text_len)
{
struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
oops_data_sz);
if (zipped_len < 0) {
pr_err("nvram: compression failed; returned %d\n", zipped_len);
pr_err("nvram: logging uncompressed oops/panic report\n");
return -1;
}
oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
oops_hdr->report_length = cpu_to_be16(zipped_len);
oops_hdr->timestamp = cpu_to_be64(get_seconds());
return 0;
}
#ifdef CONFIG_PSTORE
static int nvram_pstore_open(struct pstore_info *psi)
{
/* Reset the iterator to start reading partitions again */
read_type = -1;
return 0;
}
/**
* nvram_pstore_write - pstore write callback for nvram
* @type: Type of message logged
* @reason: reason behind dump (oops/panic)
* @id: identifier to indicate the write performed
* @part: pstore writes data to registered buffer in parts,
* part number will indicate the same.
* @count: Indicates oops count
* @compressed: Flag to indicate the log is compressed
* @size: number of bytes written to the registered buffer
* @psi: registered pstore_info structure
*
* Called by pstore_dump() when an oops or panic report is logged in the
* printk buffer.
* Returns 0 on successful write.
*/
static int nvram_pstore_write(enum pstore_type_id type,
enum kmsg_dump_reason reason,
u64 *id, unsigned int part, int count,
bool compressed, size_t size,
struct pstore_info *psi)
{
int rc;
unsigned int err_type = ERR_TYPE_KERNEL_PANIC;
struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf;
/* part 1 has the recent messages from printk buffer */
if (part > 1 || type != PSTORE_TYPE_DMESG ||
clobbering_unread_rtas_event())
return -1;
oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
oops_hdr->report_length = cpu_to_be16(size);
oops_hdr->timestamp = cpu_to_be64(get_seconds());
if (compressed)
err_type = ERR_TYPE_KERNEL_PANIC_GZ;
rc = nvram_write_os_partition(&oops_log_partition, oops_buf,
(int) (sizeof(*oops_hdr) + size), err_type, count);
if (rc != 0)
return rc;
*id = part;
return 0;
}
/*
* Reads the oops/panic report, rtas, of-config and common partition.
* Returns the length of the data we read from each partition.
* Returns 0 if we've been called before.
*/
static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
int *count, struct timespec *time, char **buf,
bool *compressed, struct pstore_info *psi)
{
struct oops_log_info *oops_hdr;
unsigned int err_type, id_no, size = 0;
struct nvram_os_partition *part = NULL;
char *buff = NULL;
int sig = 0;
loff_t p;
read_type++;
switch (nvram_type_ids[read_type]) {
case PSTORE_TYPE_DMESG:
part = &oops_log_partition;
*type = PSTORE_TYPE_DMESG;
break;
case PSTORE_TYPE_PPC_RTAS:
part = &rtas_log_partition;
*type = PSTORE_TYPE_PPC_RTAS;
time->tv_sec = last_rtas_event;
time->tv_nsec = 0;
break;
case PSTORE_TYPE_PPC_OF:
sig = NVRAM_SIG_OF;
part = &of_config_partition;
*type = PSTORE_TYPE_PPC_OF;
*id = PSTORE_TYPE_PPC_OF;
time->tv_sec = 0;
time->tv_nsec = 0;
break;
case PSTORE_TYPE_PPC_COMMON:
sig = NVRAM_SIG_SYS;
part = &common_partition;
*type = PSTORE_TYPE_PPC_COMMON;
*id = PSTORE_TYPE_PPC_COMMON;
time->tv_sec = 0;
time->tv_nsec = 0;
break;
default:
return 0;
}
if (!part->os_partition) {
p = nvram_find_partition(part->name, sig, &size);
if (p <= 0) {
pr_err("nvram: Failed to find partition %s, "
"err %d\n", part->name, (int)p);
return 0;
}
part->index = p;
part->size = size;
}
buff = kmalloc(part->size, GFP_KERNEL);
if (!buff)
return -ENOMEM;
if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) {
kfree(buff);
return 0;
}
*count = 0;
if (part->os_partition)
*id = id_no;
if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) {
size_t length, hdr_size;
oops_hdr = (struct oops_log_info *)buff;
if (be16_to_cpu(oops_hdr->version) < OOPS_HDR_VERSION) {
/* Old format oops header had 2-byte record size */
hdr_size = sizeof(u16);
length = be16_to_cpu(oops_hdr->version);
time->tv_sec = 0;
time->tv_nsec = 0;
} else {
hdr_size = sizeof(*oops_hdr);
length = be16_to_cpu(oops_hdr->report_length);
time->tv_sec = be64_to_cpu(oops_hdr->timestamp);
time->tv_nsec = 0;
}
*buf = kmalloc(length, GFP_KERNEL);
if (*buf == NULL)
return -ENOMEM;
memcpy(*buf, buff + hdr_size, length);
kfree(buff);
if (err_type == ERR_TYPE_KERNEL_PANIC_GZ)
*compressed = true;
else
*compressed = false;
return length;
}
*buf = buff;
return part->size;
}
static struct pstore_info nvram_pstore_info = {
.owner = THIS_MODULE,
.name = "nvram",
.open = nvram_pstore_open,
.read = nvram_pstore_read,
.write = nvram_pstore_write,
};
static int nvram_pstore_init(void)
{
int rc = 0;
nvram_pstore_info.buf = oops_data;
nvram_pstore_info.bufsize = oops_data_sz;
rc = pstore_register(&nvram_pstore_info);
if (rc != 0)
pr_err("nvram: pstore_register() failed, defaults to "
"kmsg_dump; returned %d\n", rc);
return rc;
}
#else
static int nvram_pstore_init(void)
{
return -1;
}
#endif
static void __init nvram_init_oops_partition(int rtas_partition_exists)
{
int rc;
rc = pseries_nvram_init_os_partition(&oops_log_partition);
if (rc != 0) {
if (!rtas_partition_exists)
return;
pr_notice("nvram: Using %s partition to log both"
" RTAS errors and oops/panic reports\n",
rtas_log_partition.name);
memcpy(&oops_log_partition, &rtas_log_partition,
sizeof(rtas_log_partition));
}
oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL);
if (!oops_buf) {
pr_err("nvram: No memory for %s partition\n",
oops_log_partition.name);
return;
}
oops_data = oops_buf + sizeof(struct oops_log_info);
oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info);
rc = nvram_pstore_init();
if (!rc)
return;
/*
* Figure compression (preceded by elimination of each line's <n>
* severity prefix) will reduce the oops/panic report to at most
* 45% of its original size.
*/
big_oops_buf_sz = (oops_data_sz * 100) / 45;
big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
if (big_oops_buf) {
stream.workspace = kmalloc(zlib_deflate_workspacesize(
WINDOW_BITS, MEM_LEVEL), GFP_KERNEL);
if (!stream.workspace) {
pr_err("nvram: No memory for compression workspace; "
"skipping compression of %s partition data\n",
oops_log_partition.name);
kfree(big_oops_buf);
big_oops_buf = NULL;
}
} else {
pr_err("No memory for uncompressed %s data; "
"skipping compression\n", oops_log_partition.name);
stream.workspace = NULL;
}
rc = kmsg_dump_register(&nvram_kmsg_dumper);
if (rc != 0) {
pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
kfree(oops_buf);
kfree(big_oops_buf);
kfree(stream.workspace);
}
}
static int __init pseries_nvram_init_log_partitions(void)
{
int rc;
/* Scan nvram for partitions */
nvram_scan_partitions();
rc = pseries_nvram_init_os_partition(&rtas_log_partition);
nvram_init_oops_partition(rc == 0);
return 0;
}
machine_arch_initcall(pseries, pseries_nvram_init_log_partitions);
int __init pSeries_nvram_init(void)
{
struct device_node *nvram;
const __be32 *nbytes_p;
unsigned int proplen;
nvram = of_find_node_by_type(NULL, "nvram");
if (nvram == NULL)
return -ENODEV;
nbytes_p = of_get_property(nvram, "#bytes", &proplen);
if (nbytes_p == NULL || proplen != sizeof(unsigned int)) {
of_node_put(nvram);
return -EIO;
}
nvram_size = be32_to_cpup(nbytes_p);
nvram_fetch = rtas_token("nvram-fetch");
nvram_store = rtas_token("nvram-store");
printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size);
of_node_put(nvram);
ppc_md.nvram_read = pSeries_nvram_read;
ppc_md.nvram_write = pSeries_nvram_write;
ppc_md.nvram_size = pSeries_nvram_get_size;
return 0;
}
/*
* This is our kmsg_dump callback, called after an oops or panic report
* has been written to the printk buffer. We want to capture as much
* of the printk buffer as possible. First, capture as much as we can
* that we think will compress sufficiently to fit in the lnx,oops-log
* partition. If that's too much, go back and capture uncompressed text.
*/
static void oops_to_nvram(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason)
{
struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
static unsigned int oops_count = 0;
static bool panicking = false;
static DEFINE_SPINLOCK(lock);
unsigned long flags;
size_t text_len;
unsigned int err_type = ERR_TYPE_KERNEL_PANIC_GZ;
int rc = -1;
switch (reason) {
case KMSG_DUMP_RESTART:
case KMSG_DUMP_HALT:
case KMSG_DUMP_POWEROFF:
/* These are almost always orderly shutdowns. */
return;
case KMSG_DUMP_OOPS:
break;
case KMSG_DUMP_PANIC:
panicking = true;
break;
case KMSG_DUMP_EMERG:
if (panicking)
/* Panic report already captured. */
return;
break;
default:
pr_err("%s: ignoring unrecognized KMSG_DUMP_* reason %d\n",
__func__, (int) reason);
return;
}
if (clobbering_unread_rtas_event())
return;
if (!spin_trylock_irqsave(&lock, flags))
return;
if (big_oops_buf) {
kmsg_dump_get_buffer(dumper, false,
big_oops_buf, big_oops_buf_sz, &text_len);
rc = zip_oops(text_len);
}
if (rc != 0) {
kmsg_dump_rewind(dumper);
kmsg_dump_get_buffer(dumper, false,
oops_data, oops_data_sz, &text_len);
err_type = ERR_TYPE_KERNEL_PANIC;
oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
oops_hdr->report_length = cpu_to_be16(text_len);
oops_hdr->timestamp = cpu_to_be64(get_seconds());
}
(void) nvram_write_os_partition(&oops_log_partition, oops_buf,
(int) (sizeof(*oops_hdr) + text_len), err_type,
++oops_count);
spin_unlock_irqrestore(&lock, flags);
}

View file

@ -0,0 +1,37 @@
#ifndef _OFFLINE_STATES_H_
#define _OFFLINE_STATES_H_
/* Cpu offline states go here */
enum cpu_state_vals {
CPU_STATE_OFFLINE,
CPU_STATE_INACTIVE,
CPU_STATE_ONLINE,
CPU_MAX_OFFLINE_STATES
};
#ifdef CONFIG_HOTPLUG_CPU
extern enum cpu_state_vals get_cpu_current_state(int cpu);
extern void set_cpu_current_state(int cpu, enum cpu_state_vals state);
extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state);
extern void set_default_offline_state(int cpu);
#else
static inline enum cpu_state_vals get_cpu_current_state(int cpu)
{
return CPU_STATE_ONLINE;
}
static inline void set_cpu_current_state(int cpu, enum cpu_state_vals state)
{
}
static inline void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
{
}
static inline void set_default_offline_state(int cpu)
{
}
#endif
extern enum cpu_state_vals get_preferred_offline_state(int cpu);
#endif

View file

@ -0,0 +1,172 @@
/*
* Copyright (C) 2001 Dave Engebretsen, IBM Corporation
* Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
*
* pSeries specific routines for PCI.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/string.h>
#include <asm/eeh.h>
#include <asm/pci-bridge.h>
#include <asm/prom.h>
#include <asm/ppc-pci.h>
#include "pseries.h"
#if 0
void pcibios_name_device(struct pci_dev *dev)
{
struct device_node *dn;
/*
* Add IBM loc code (slot) as a prefix to the device names for service
*/
dn = pci_device_to_OF_node(dev);
if (dn) {
const char *loc_code = of_get_property(dn, "ibm,loc-code",
NULL);
if (loc_code) {
int loc_len = strlen(loc_code);
if (loc_len < sizeof(dev->dev.name)) {
memmove(dev->dev.name+loc_len+1, dev->dev.name,
sizeof(dev->dev.name)-loc_len-1);
memcpy(dev->dev.name, loc_code, loc_len);
dev->dev.name[loc_len] = ' ';
dev->dev.name[sizeof(dev->dev.name)-1] = '\0';
}
}
}
}
DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device);
#endif
static void __init pSeries_request_regions(void)
{
if (!isa_io_base)
return;
request_region(0x20,0x20,"pic1");
request_region(0xa0,0x20,"pic2");
request_region(0x00,0x20,"dma1");
request_region(0x40,0x20,"timer");
request_region(0x80,0x10,"dma page reg");
request_region(0xc0,0x20,"dma2");
}
void __init pSeries_final_fixup(void)
{
pSeries_request_regions();
eeh_addr_cache_build();
}
/*
* Assume the winbond 82c105 is the IDE controller on a
* p610/p615/p630. We should probably be more careful in case
* someone tries to plug in a similar adapter.
*/
static void fixup_winbond_82c105(struct pci_dev* dev)
{
int i;
unsigned int reg;
if (!machine_is(pseries))
return;
printk("Using INTC for W82c105 IDE controller.\n");
pci_read_config_dword(dev, 0x40, &reg);
/* Enable LEGIRQ to use INTC instead of ISA interrupts */
pci_write_config_dword(dev, 0x40, reg | (1<<11));
for (i = 0; i < DEVICE_COUNT_RESOURCE; ++i) {
/* zap the 2nd function of the winbond chip */
if (dev->resource[i].flags & IORESOURCE_IO
&& dev->bus->number == 0 && dev->devfn == 0x81)
dev->resource[i].flags &= ~IORESOURCE_IO;
if (dev->resource[i].start == 0 && dev->resource[i].end) {
dev->resource[i].flags = 0;
dev->resource[i].end = 0;
}
}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105,
fixup_winbond_82c105);
int pseries_root_bridge_prepare(struct pci_host_bridge *bridge)
{
struct device_node *dn, *pdn;
struct pci_bus *bus;
u32 pcie_link_speed_stats[2];
int rc;
bus = bridge->bus;
dn = pcibios_get_phb_of_node(bus);
if (!dn)
return 0;
for (pdn = dn; pdn != NULL; pdn = of_get_next_parent(pdn)) {
rc = of_property_read_u32_array(pdn,
"ibm,pcie-link-speed-stats",
&pcie_link_speed_stats[0], 2);
if (!rc)
break;
}
of_node_put(pdn);
if (rc) {
pr_err("no ibm,pcie-link-speed-stats property\n");
return 0;
}
switch (pcie_link_speed_stats[0]) {
case 0x01:
bus->max_bus_speed = PCIE_SPEED_2_5GT;
break;
case 0x02:
bus->max_bus_speed = PCIE_SPEED_5_0GT;
break;
case 0x04:
bus->max_bus_speed = PCIE_SPEED_8_0GT;
break;
default:
bus->max_bus_speed = PCI_SPEED_UNKNOWN;
break;
}
switch (pcie_link_speed_stats[1]) {
case 0x01:
bus->cur_bus_speed = PCIE_SPEED_2_5GT;
break;
case 0x02:
bus->cur_bus_speed = PCIE_SPEED_5_0GT;
break;
case 0x04:
bus->cur_bus_speed = PCIE_SPEED_8_0GT;
break;
default:
bus->cur_bus_speed = PCI_SPEED_UNKNOWN;
break;
}
return 0;
}

View file

@ -0,0 +1,143 @@
/*
* PCI Dynamic LPAR, PCI Hot Plug and PCI EEH recovery code
* for RPA-compliant PPC64 platform.
* Copyright (C) 2003 Linda Xie <lxie@us.ibm.com>
* Copyright (C) 2005 International Business Machines
*
* Updates, 2005, John Rose <johnrose@austin.ibm.com>
* Updates, 2005, Linas Vepstas <linas@austin.ibm.com>
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/pci.h>
#include <linux/export.h>
#include <asm/pci-bridge.h>
#include <asm/ppc-pci.h>
#include <asm/firmware.h>
#include <asm/eeh.h>
static struct pci_bus *
find_bus_among_children(struct pci_bus *bus,
struct device_node *dn)
{
struct pci_bus *child = NULL;
struct pci_bus *tmp;
struct device_node *busdn;
busdn = pci_bus_to_OF_node(bus);
if (busdn == dn)
return bus;
list_for_each_entry(tmp, &bus->children, node) {
child = find_bus_among_children(tmp, dn);
if (child)
break;
};
return child;
}
struct pci_bus *
pcibios_find_pci_bus(struct device_node *dn)
{
struct pci_dn *pdn = dn->data;
if (!pdn || !pdn->phb || !pdn->phb->bus)
return NULL;
return find_bus_among_children(pdn->phb->bus, dn);
}
EXPORT_SYMBOL_GPL(pcibios_find_pci_bus);
struct pci_controller *init_phb_dynamic(struct device_node *dn)
{
struct pci_controller *phb;
pr_debug("PCI: Initializing new hotplug PHB %s\n", dn->full_name);
phb = pcibios_alloc_controller(dn);
if (!phb)
return NULL;
rtas_setup_phb(phb);
pci_process_bridge_OF_ranges(phb, dn, 0);
pci_devs_phb_init_dynamic(phb);
/* Create EEH devices for the PHB */
eeh_dev_phb_init_dynamic(phb);
if (dn->child)
eeh_add_device_tree_early(dn);
pcibios_scan_phb(phb);
pcibios_finish_adding_to_bus(phb->bus);
return phb;
}
EXPORT_SYMBOL_GPL(init_phb_dynamic);
/* RPA-specific bits for removing PHBs */
int remove_phb_dynamic(struct pci_controller *phb)
{
struct pci_bus *b = phb->bus;
struct resource *res;
int rc, i;
pr_debug("PCI: Removing PHB %04x:%02x...\n",
pci_domain_nr(b), b->number);
/* We cannot to remove a root bus that has children */
if (!(list_empty(&b->children) && list_empty(&b->devices)))
return -EBUSY;
/* We -know- there aren't any child devices anymore at this stage
* and thus, we can safely unmap the IO space as it's not in use
*/
res = &phb->io_resource;
if (res->flags & IORESOURCE_IO) {
rc = pcibios_unmap_io_space(b);
if (rc) {
printk(KERN_ERR "%s: failed to unmap IO on bus %s\n",
__func__, b->name);
return 1;
}
}
/* Remove the PCI bus and unregister the bridge device from sysfs */
phb->bus = NULL;
pci_remove_bus(b);
device_unregister(b->bridge);
/* Now release the IO resource */
if (res->flags & IORESOURCE_IO)
release_resource(res);
/* Release memory resources */
for (i = 0; i < 3; ++i) {
res = &phb->mem_resources[i];
if (!(res->flags & IORESOURCE_MEM))
continue;
release_resource(res);
}
/* Free pci_controller data structure */
pcibios_free_controller(phb);
return 0;
}
EXPORT_SYMBOL_GPL(remove_phb_dynamic);

View file

@ -0,0 +1,82 @@
/*
* Interface for power-management for ppc64 compliant platform
*
* Manish Ahuja <mahuja@us.ibm.com>
*
* Feb 2007
*
* Copyright (C) 2007 IBM Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <asm/machdep.h>
unsigned long rtas_poweron_auto; /* default and normal state is 0 */
static ssize_t auto_poweron_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%lu\n", rtas_poweron_auto);
}
static ssize_t auto_poweron_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t n)
{
int ret;
unsigned long ups_restart;
ret = sscanf(buf, "%lu", &ups_restart);
if ((ret == 1) && ((ups_restart == 1) || (ups_restart == 0))){
rtas_poweron_auto = ups_restart;
return n;
}
return -EINVAL;
}
static struct kobj_attribute auto_poweron_attr =
__ATTR(auto_poweron, 0644, auto_poweron_show, auto_poweron_store);
#ifndef CONFIG_PM
struct kobject *power_kobj;
static struct attribute *g[] = {
&auto_poweron_attr.attr,
NULL,
};
static struct attribute_group attr_group = {
.attrs = g,
};
static int __init pm_init(void)
{
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
return sysfs_create_group(power_kobj, &attr_group);
}
machine_core_initcall(pseries, pm_init);
#else
static int __init apo_pm_init(void)
{
return (sysfs_create_file(power_kobj, &auto_poweron_attr.attr));
}
machine_device_initcall(pseries, apo_pm_init);
#endif

View file

@ -0,0 +1,70 @@
/*
* Copyright 2006 IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _PSERIES_PSERIES_H
#define _PSERIES_PSERIES_H
#include <linux/interrupt.h>
struct device_node;
extern void request_event_sources_irqs(struct device_node *np,
irq_handler_t handler, const char *name);
#include <linux/of.h>
extern void __init fw_hypertas_feature_init(const char *hypertas,
unsigned long len);
extern void __init fw_vec5_feature_init(const char *hypertas,
unsigned long len);
struct pt_regs;
extern int pSeries_system_reset_exception(struct pt_regs *regs);
extern int pSeries_machine_check_exception(struct pt_regs *regs);
#ifdef CONFIG_SMP
extern void smp_init_pseries_mpic(void);
extern void smp_init_pseries_xics(void);
#else
static inline void smp_init_pseries_mpic(void) { };
static inline void smp_init_pseries_xics(void) { };
#endif
#ifdef CONFIG_KEXEC
extern void setup_kexec_cpu_down_xics(void);
extern void setup_kexec_cpu_down_mpic(void);
#else
static inline void setup_kexec_cpu_down_xics(void) { }
static inline void setup_kexec_cpu_down_mpic(void) { }
#endif
extern void pSeries_final_fixup(void);
/* Poweron flag used for enabling auto ups restart */
extern unsigned long rtas_poweron_auto;
/* Provided by HVC VIO */
extern void hvc_vio_init_early(void);
/* Dynamic logical Partitioning/Mobility */
extern void dlpar_free_cc_nodes(struct device_node *);
extern void dlpar_free_cc_property(struct property *);
extern struct device_node *dlpar_configure_connector(__be32,
struct device_node *);
extern int dlpar_attach_node(struct device_node *);
extern int dlpar_detach_node(struct device_node *);
/* PCI root bridge prepare function override for pseries */
struct pci_host_bridge;
int pseries_root_bridge_prepare(struct pci_host_bridge *bridge);
unsigned long pseries_memory_block_size(void);
#endif /* _PSERIES_PSERIES_H */

View file

@ -0,0 +1,290 @@
/*
* POWER platform energy management driver
* Copyright (C) 2010 IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*
* This pseries platform device driver provides access to
* platform energy management capabilities.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/of.h>
#include <asm/cputhreads.h>
#include <asm/page.h>
#include <asm/hvcall.h>
#include <asm/firmware.h>
#define MODULE_VERS "1.0"
#define MODULE_NAME "pseries_energy"
/* Driver flags */
static int sysfs_entries;
/* Helper routines */
/* Helper Routines to convert between drc_index to cpu numbers */
static u32 cpu_to_drc_index(int cpu)
{
struct device_node *dn = NULL;
const int *indexes;
int i;
int rc = 1;
u32 ret = 0;
dn = of_find_node_by_path("/cpus");
if (dn == NULL)
goto err;
indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
if (indexes == NULL)
goto err_of_node_put;
/* Convert logical cpu number to core number */
i = cpu_core_index_of_thread(cpu);
/*
* The first element indexes[0] is the number of drc_indexes
* returned in the list. Hence i+1 will get the drc_index
* corresponding to core number i.
*/
WARN_ON(i > indexes[0]);
ret = indexes[i + 1];
rc = 0;
err_of_node_put:
of_node_put(dn);
err:
if (rc)
printk(KERN_WARNING "cpu_to_drc_index(%d) failed", cpu);
return ret;
}
static int drc_index_to_cpu(u32 drc_index)
{
struct device_node *dn = NULL;
const int *indexes;
int i, cpu = 0;
int rc = 1;
dn = of_find_node_by_path("/cpus");
if (dn == NULL)
goto err;
indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
if (indexes == NULL)
goto err_of_node_put;
/*
* First element in the array is the number of drc_indexes
* returned. Search through the list to find the matching
* drc_index and get the core number
*/
for (i = 0; i < indexes[0]; i++) {
if (indexes[i + 1] == drc_index)
break;
}
/* Convert core number to logical cpu number */
cpu = cpu_first_thread_of_core(i);
rc = 0;
err_of_node_put:
of_node_put(dn);
err:
if (rc)
printk(KERN_WARNING "drc_index_to_cpu(%d) failed", drc_index);
return cpu;
}
/*
* pseries hypervisor call H_BEST_ENERGY provides hints to OS on
* preferred logical cpus to activate or deactivate for optimized
* energy consumption.
*/
#define FLAGS_MODE1 0x004E200000080E01UL
#define FLAGS_MODE2 0x004E200000080401UL
#define FLAGS_ACTIVATE 0x100
static ssize_t get_best_energy_list(char *page, int activate)
{
int rc, cnt, i, cpu;
unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
unsigned long flags = 0;
u32 *buf_page;
char *s = page;
buf_page = (u32 *) get_zeroed_page(GFP_KERNEL);
if (!buf_page)
return -ENOMEM;
flags = FLAGS_MODE1;
if (activate)
flags |= FLAGS_ACTIVATE;
rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, 0, __pa(buf_page),
0, 0, 0, 0, 0, 0);
if (rc != H_SUCCESS) {
free_page((unsigned long) buf_page);
return -EINVAL;
}
cnt = retbuf[0];
for (i = 0; i < cnt; i++) {
cpu = drc_index_to_cpu(buf_page[2*i+1]);
if ((cpu_online(cpu) && !activate) ||
(!cpu_online(cpu) && activate))
s += sprintf(s, "%d,", cpu);
}
if (s > page) { /* Something to show */
s--; /* Suppress last comma */
s += sprintf(s, "\n");
}
free_page((unsigned long) buf_page);
return s-page;
}
static ssize_t get_best_energy_data(struct device *dev,
char *page, int activate)
{
int rc;
unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
unsigned long flags = 0;
flags = FLAGS_MODE2;
if (activate)
flags |= FLAGS_ACTIVATE;
rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags,
cpu_to_drc_index(dev->id),
0, 0, 0, 0, 0, 0, 0);
if (rc != H_SUCCESS)
return -EINVAL;
return sprintf(page, "%lu\n", retbuf[1] >> 32);
}
/* Wrapper functions */
static ssize_t cpu_activate_hint_list_show(struct device *dev,
struct device_attribute *attr, char *page)
{
return get_best_energy_list(page, 1);
}
static ssize_t cpu_deactivate_hint_list_show(struct device *dev,
struct device_attribute *attr, char *page)
{
return get_best_energy_list(page, 0);
}
static ssize_t percpu_activate_hint_show(struct device *dev,
struct device_attribute *attr, char *page)
{
return get_best_energy_data(dev, page, 1);
}
static ssize_t percpu_deactivate_hint_show(struct device *dev,
struct device_attribute *attr, char *page)
{
return get_best_energy_data(dev, page, 0);
}
/*
* Create sysfs interface:
* /sys/devices/system/cpu/pseries_activate_hint_list
* /sys/devices/system/cpu/pseries_deactivate_hint_list
* Comma separated list of cpus to activate or deactivate
* /sys/devices/system/cpu/cpuN/pseries_activate_hint
* /sys/devices/system/cpu/cpuN/pseries_deactivate_hint
* Per-cpu value of the hint
*/
struct device_attribute attr_cpu_activate_hint_list =
__ATTR(pseries_activate_hint_list, 0444,
cpu_activate_hint_list_show, NULL);
struct device_attribute attr_cpu_deactivate_hint_list =
__ATTR(pseries_deactivate_hint_list, 0444,
cpu_deactivate_hint_list_show, NULL);
struct device_attribute attr_percpu_activate_hint =
__ATTR(pseries_activate_hint, 0444,
percpu_activate_hint_show, NULL);
struct device_attribute attr_percpu_deactivate_hint =
__ATTR(pseries_deactivate_hint, 0444,
percpu_deactivate_hint_show, NULL);
static int __init pseries_energy_init(void)
{
int cpu, err;
struct device *cpu_dev;
if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY)) {
printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n");
return 0;
}
/* Create the sysfs files */
err = device_create_file(cpu_subsys.dev_root,
&attr_cpu_activate_hint_list);
if (!err)
err = device_create_file(cpu_subsys.dev_root,
&attr_cpu_deactivate_hint_list);
if (err)
return err;
for_each_possible_cpu(cpu) {
cpu_dev = get_cpu_device(cpu);
err = device_create_file(cpu_dev,
&attr_percpu_activate_hint);
if (err)
break;
err = device_create_file(cpu_dev,
&attr_percpu_deactivate_hint);
if (err)
break;
}
if (err)
return err;
sysfs_entries = 1; /* Removed entries on cleanup */
return 0;
}
static void __exit pseries_energy_cleanup(void)
{
int cpu;
struct device *cpu_dev;
if (!sysfs_entries)
return;
/* Remove the sysfs files */
device_remove_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list);
device_remove_file(cpu_subsys.dev_root, &attr_cpu_deactivate_hint_list);
for_each_possible_cpu(cpu) {
cpu_dev = get_cpu_device(cpu);
sysfs_remove_file(&cpu_dev->kobj,
&attr_percpu_activate_hint.attr);
sysfs_remove_file(&cpu_dev->kobj,
&attr_percpu_deactivate_hint.attr);
}
}
module_init(pseries_energy_init);
module_exit(pseries_energy_cleanup);
MODULE_DESCRIPTION("Driver for pSeries platform energy management");
MODULE_AUTHOR("Vaidyanathan Srinivasan");
MODULE_LICENSE("GPL");

View file

@ -0,0 +1,414 @@
/*
* Copyright (C) 2001 Dave Engebretsen IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/of.h>
#include <linux/fs.h>
#include <linux/reboot.h>
#include <asm/machdep.h>
#include <asm/rtas.h>
#include <asm/firmware.h>
#include "pseries.h"
static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(ras_log_buf_lock);
static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_PER_CPU(__u64, mce_data_buf);
static int ras_check_exception_token;
#define EPOW_SENSOR_TOKEN 9
#define EPOW_SENSOR_INDEX 0
static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
/*
* Initialize handlers for the set of interrupts caused by hardware errors
* and power system events.
*/
static int __init init_ras_IRQ(void)
{
struct device_node *np;
ras_check_exception_token = rtas_token("check-exception");
/* Internal Errors */
np = of_find_node_by_path("/event-sources/internal-errors");
if (np != NULL) {
request_event_sources_irqs(np, ras_error_interrupt,
"RAS_ERROR");
of_node_put(np);
}
/* EPOW Events */
np = of_find_node_by_path("/event-sources/epow-events");
if (np != NULL) {
request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW");
of_node_put(np);
}
return 0;
}
machine_subsys_initcall(pseries, init_ras_IRQ);
#define EPOW_SHUTDOWN_NORMAL 1
#define EPOW_SHUTDOWN_ON_UPS 2
#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3
#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4
static void handle_system_shutdown(char event_modifier)
{
switch (event_modifier) {
case EPOW_SHUTDOWN_NORMAL:
pr_emerg("Firmware initiated power off");
orderly_poweroff(true);
break;
case EPOW_SHUTDOWN_ON_UPS:
pr_emerg("Loss of power reported by firmware, system is "
"running on UPS/battery");
break;
case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
pr_emerg("Loss of system critical functions reported by "
"firmware");
pr_emerg("Check RTAS error log for details");
orderly_poweroff(true);
break;
case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
pr_emerg("Ambient temperature too high reported by firmware");
pr_emerg("Check RTAS error log for details");
orderly_poweroff(true);
break;
default:
pr_err("Unknown power/cooling shutdown event (modifier %d)",
event_modifier);
}
}
struct epow_errorlog {
unsigned char sensor_value;
unsigned char event_modifier;
unsigned char extended_modifier;
unsigned char reserved;
unsigned char platform_reason;
};
#define EPOW_RESET 0
#define EPOW_WARN_COOLING 1
#define EPOW_WARN_POWER 2
#define EPOW_SYSTEM_SHUTDOWN 3
#define EPOW_SYSTEM_HALT 4
#define EPOW_MAIN_ENCLOSURE 5
#define EPOW_POWER_OFF 7
static void rtas_parse_epow_errlog(struct rtas_error_log *log)
{
struct pseries_errorlog *pseries_log;
struct epow_errorlog *epow_log;
char action_code;
char modifier;
pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW);
if (pseries_log == NULL)
return;
epow_log = (struct epow_errorlog *)pseries_log->data;
action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */
modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */
switch (action_code) {
case EPOW_RESET:
pr_err("Non critical power or cooling issue cleared");
break;
case EPOW_WARN_COOLING:
pr_err("Non critical cooling issue reported by firmware");
pr_err("Check RTAS error log for details");
break;
case EPOW_WARN_POWER:
pr_err("Non critical power issue reported by firmware");
pr_err("Check RTAS error log for details");
break;
case EPOW_SYSTEM_SHUTDOWN:
handle_system_shutdown(epow_log->event_modifier);
break;
case EPOW_SYSTEM_HALT:
pr_emerg("Firmware initiated power off");
orderly_poweroff(true);
break;
case EPOW_MAIN_ENCLOSURE:
case EPOW_POWER_OFF:
pr_emerg("Critical power/cooling issue reported by firmware");
pr_emerg("Check RTAS error log for details");
pr_emerg("Immediate power off");
emergency_sync();
kernel_power_off();
break;
default:
pr_err("Unknown power/cooling event (action code %d)",
action_code);
}
}
/* Handle environmental and power warning (EPOW) interrupts. */
static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
{
int status;
int state;
int critical;
status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state);
if (state > 3)
critical = 1; /* Time Critical */
else
critical = 0;
spin_lock(&ras_log_buf_lock);
status = rtas_call(ras_check_exception_token, 6, 1, NULL,
RTAS_VECTOR_EXTERNAL_INTERRUPT,
virq_to_hw(irq),
RTAS_EPOW_WARNING,
critical, __pa(&ras_log_buf),
rtas_get_error_log_max());
log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf);
spin_unlock(&ras_log_buf_lock);
return IRQ_HANDLED;
}
/*
* Handle hardware error interrupts.
*
* RTAS check-exception is called to collect data on the exception. If
* the error is deemed recoverable, we log a warning and return.
* For nonrecoverable errors, an error is logged and we stop all processing
* as quickly as possible in order to prevent propagation of the failure.
*/
static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
{
struct rtas_error_log *rtas_elog;
int status;
int fatal;
spin_lock(&ras_log_buf_lock);
status = rtas_call(ras_check_exception_token, 6, 1, NULL,
RTAS_VECTOR_EXTERNAL_INTERRUPT,
virq_to_hw(irq),
RTAS_INTERNAL_ERROR, 1 /* Time Critical */,
__pa(&ras_log_buf),
rtas_get_error_log_max());
rtas_elog = (struct rtas_error_log *)ras_log_buf;
if (status == 0 &&
rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC)
fatal = 1;
else
fatal = 0;
/* format and print the extended information */
log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
if (fatal) {
pr_emerg("Fatal hardware error reported by firmware");
pr_emerg("Check RTAS error log for details");
pr_emerg("Immediate power off");
emergency_sync();
kernel_power_off();
} else {
pr_err("Recoverable hardware error reported by firmware");
}
spin_unlock(&ras_log_buf_lock);
return IRQ_HANDLED;
}
/*
* Some versions of FWNMI place the buffer inside the 4kB page starting at
* 0x7000. Other versions place it inside the rtas buffer. We check both.
*/
#define VALID_FWNMI_BUFFER(A) \
((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
(((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
/*
* Get the error information for errors coming through the
* FWNMI vectors. The pt_regs' r3 will be updated to reflect
* the actual r3 if possible, and a ptr to the error log entry
* will be returned if found.
*
* If the RTAS error is not of the extended type, then we put it in a per
* cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
*
* The global_mce_data_buf does not have any locks or protection around it,
* if a second machine check comes in, or a system reset is done
* before we have logged the error, then we will get corruption in the
* error log. This is preferable over holding off on calling
* ibm,nmi-interlock which would result in us checkstopping if a
* second machine check did come in.
*/
static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
{
unsigned long *savep;
struct rtas_error_log *h, *errhdr = NULL;
/* Mask top two bits */
regs->gpr[3] &= ~(0x3UL << 62);
if (!VALID_FWNMI_BUFFER(regs->gpr[3])) {
printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
return NULL;
}
savep = __va(regs->gpr[3]);
regs->gpr[3] = savep[0]; /* restore original r3 */
/* If it isn't an extended log we can use the per cpu 64bit buffer */
h = (struct rtas_error_log *)&savep[1];
if (!rtas_error_extended(h)) {
memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64));
errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf);
} else {
int len, error_log_length;
error_log_length = 8 + rtas_error_extended_log_length(h);
len = max_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
memcpy(global_mce_data_buf, h, len);
errhdr = (struct rtas_error_log *)global_mce_data_buf;
}
return errhdr;
}
/* Call this when done with the data returned by FWNMI_get_errinfo.
* It will release the saved data area for other CPUs in the
* partition to receive FWNMI errors.
*/
static void fwnmi_release_errinfo(void)
{
int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
if (ret != 0)
printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
}
int pSeries_system_reset_exception(struct pt_regs *regs)
{
if (fwnmi_active) {
struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs);
if (errhdr) {
/* XXX Should look at FWNMI information */
}
fwnmi_release_errinfo();
}
return 0; /* need to perform reset */
}
/*
* See if we can recover from a machine check exception.
* This is only called on power4 (or above) and only via
* the Firmware Non-Maskable Interrupts (fwnmi) handler
* which provides the error analysis for us.
*
* Return 1 if corrected (or delivered a signal).
* Return 0 if there is nothing we can do.
*/
static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
{
int recovered = 0;
int disposition = rtas_error_disposition(err);
if (!(regs->msr & MSR_RI)) {
/* If MSR_RI isn't set, we cannot recover */
recovered = 0;
} else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
/* Platform corrected itself */
recovered = 1;
} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
/* Platform corrected itself but could be degraded */
printk(KERN_ERR "MCE: limited recovery, system may "
"be degraded\n");
recovered = 1;
} else if (user_mode(regs) && !is_global_init(current) &&
rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) {
/*
* If we received a synchronous error when in userspace
* kill the task. Firmware may report details of the fail
* asynchronously, so we can't rely on the target and type
* fields being valid here.
*/
printk(KERN_ERR "MCE: uncorrectable error, killing task "
"%s:%d\n", current->comm, current->pid);
_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
recovered = 1;
}
log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
return recovered;
}
/*
* Handle a machine check.
*
* Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi)
* should be present. If so the handler which called us tells us if the
* error was recovered (never true if RI=0).
*
* On hardware prior to Power 4 these exceptions were asynchronous which
* means we can't tell exactly where it occurred and so we can't recover.
*/
int pSeries_machine_check_exception(struct pt_regs *regs)
{
struct rtas_error_log *errp;
if (fwnmi_active) {
errp = fwnmi_get_errinfo(regs);
fwnmi_release_errinfo();
if (errp && recover_mce(regs, errp))
return 1;
}
return 0;
}

View file

@ -0,0 +1,455 @@
/*
* pSeries_reconfig.c - support for dynamic reconfiguration (including PCI
* Hotplug and Dynamic Logical Partitioning on RPA platforms).
*
* Copyright (C) 2005 Nathan Lynch
* Copyright (C) 2005 IBM Corporation
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/proc_fs.h>
#include <linux/slab.h>
#include <linux/of.h>
#include <asm/prom.h>
#include <asm/machdep.h>
#include <asm/uaccess.h>
#include <asm/mmu.h>
/**
* derive_parent - basically like dirname(1)
* @path: the full_name of a node to be added to the tree
*
* Returns the node which should be the parent of the node
* described by path. E.g., for path = "/foo/bar", returns
* the node with full_name = "/foo".
*/
static struct device_node *derive_parent(const char *path)
{
struct device_node *parent = NULL;
char *parent_path = "/";
size_t parent_path_len = strrchr(path, '/') - path + 1;
/* reject if path is "/" */
if (!strcmp(path, "/"))
return ERR_PTR(-EINVAL);
if (strrchr(path, '/') != path) {
parent_path = kmalloc(parent_path_len, GFP_KERNEL);
if (!parent_path)
return ERR_PTR(-ENOMEM);
strlcpy(parent_path, path, parent_path_len);
}
parent = of_find_node_by_path(parent_path);
if (!parent)
return ERR_PTR(-EINVAL);
if (strcmp(parent_path, "/"))
kfree(parent_path);
return parent;
}
static int pSeries_reconfig_add_node(const char *path, struct property *proplist)
{
struct device_node *np;
int err = -ENOMEM;
np = kzalloc(sizeof(*np), GFP_KERNEL);
if (!np)
goto out_err;
np->full_name = kstrdup(path, GFP_KERNEL);
if (!np->full_name)
goto out_err;
np->properties = proplist;
of_node_set_flag(np, OF_DYNAMIC);
of_node_init(np);
np->parent = derive_parent(path);
if (IS_ERR(np->parent)) {
err = PTR_ERR(np->parent);
goto out_err;
}
err = of_attach_node(np);
if (err) {
printk(KERN_ERR "Failed to add device node %s\n", path);
goto out_err;
}
of_node_put(np->parent);
return 0;
out_err:
if (np) {
of_node_put(np->parent);
kfree(np->full_name);
kfree(np);
}
return err;
}
static int pSeries_reconfig_remove_node(struct device_node *np)
{
struct device_node *parent, *child;
parent = of_get_parent(np);
if (!parent)
return -EINVAL;
if ((child = of_get_next_child(np, NULL))) {
of_node_put(child);
of_node_put(parent);
return -EBUSY;
}
of_detach_node(np);
of_node_put(parent);
of_node_put(np); /* Must decrement the refcount */
return 0;
}
/*
* /proc/powerpc/ofdt - yucky binary interface for adding and removing
* OF device nodes. Should be deprecated as soon as we get an
* in-kernel wrapper for the RTAS ibm,configure-connector call.
*/
static void release_prop_list(const struct property *prop)
{
struct property *next;
for (; prop; prop = next) {
next = prop->next;
kfree(prop->name);
kfree(prop->value);
kfree(prop);
}
}
/**
* parse_next_property - process the next property from raw input buffer
* @buf: input buffer, must be nul-terminated
* @end: end of the input buffer + 1, for validation
* @name: return value; set to property name in buf
* @length: return value; set to length of value
* @value: return value; set to the property value in buf
*
* Note that the caller must make copies of the name and value returned,
* this function does no allocation or copying of the data. Return value
* is set to the next name in buf, or NULL on error.
*/
static char * parse_next_property(char *buf, char *end, char **name, int *length,
unsigned char **value)
{
char *tmp;
*name = buf;
tmp = strchr(buf, ' ');
if (!tmp) {
printk(KERN_ERR "property parse failed in %s at line %d\n",
__func__, __LINE__);
return NULL;
}
*tmp = '\0';
if (++tmp >= end) {
printk(KERN_ERR "property parse failed in %s at line %d\n",
__func__, __LINE__);
return NULL;
}
/* now we're on the length */
*length = -1;
*length = simple_strtoul(tmp, &tmp, 10);
if (*length == -1) {
printk(KERN_ERR "property parse failed in %s at line %d\n",
__func__, __LINE__);
return NULL;
}
if (*tmp != ' ' || ++tmp >= end) {
printk(KERN_ERR "property parse failed in %s at line %d\n",
__func__, __LINE__);
return NULL;
}
/* now we're on the value */
*value = tmp;
tmp += *length;
if (tmp > end) {
printk(KERN_ERR "property parse failed in %s at line %d\n",
__func__, __LINE__);
return NULL;
}
else if (tmp < end && *tmp != ' ' && *tmp != '\0') {
printk(KERN_ERR "property parse failed in %s at line %d\n",
__func__, __LINE__);
return NULL;
}
tmp++;
/* and now we should be on the next name, or the end */
return tmp;
}
static struct property *new_property(const char *name, const int length,
const unsigned char *value, struct property *last)
{
struct property *new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new)
return NULL;
if (!(new->name = kstrdup(name, GFP_KERNEL)))
goto cleanup;
if (!(new->value = kmalloc(length + 1, GFP_KERNEL)))
goto cleanup;
memcpy(new->value, value, length);
*(((char *)new->value) + length) = 0;
new->length = length;
new->next = last;
return new;
cleanup:
kfree(new->name);
kfree(new->value);
kfree(new);
return NULL;
}
static int do_add_node(char *buf, size_t bufsize)
{
char *path, *end, *name;
struct device_node *np;
struct property *prop = NULL;
unsigned char* value;
int length, rv = 0;
end = buf + bufsize;
path = buf;
buf = strchr(buf, ' ');
if (!buf)
return -EINVAL;
*buf = '\0';
buf++;
if ((np = of_find_node_by_path(path))) {
of_node_put(np);
return -EINVAL;
}
/* rv = build_prop_list(tmp, bufsize - (tmp - buf), &proplist); */
while (buf < end &&
(buf = parse_next_property(buf, end, &name, &length, &value))) {
struct property *last = prop;
prop = new_property(name, length, value, last);
if (!prop) {
rv = -ENOMEM;
prop = last;
goto out;
}
}
if (!buf) {
rv = -EINVAL;
goto out;
}
rv = pSeries_reconfig_add_node(path, prop);
out:
if (rv)
release_prop_list(prop);
return rv;
}
static int do_remove_node(char *buf)
{
struct device_node *node;
int rv = -ENODEV;
if ((node = of_find_node_by_path(buf)))
rv = pSeries_reconfig_remove_node(node);
of_node_put(node);
return rv;
}
static char *parse_node(char *buf, size_t bufsize, struct device_node **npp)
{
char *handle_str;
phandle handle;
*npp = NULL;
handle_str = buf;
buf = strchr(buf, ' ');
if (!buf)
return NULL;
*buf = '\0';
buf++;
handle = simple_strtoul(handle_str, NULL, 0);
*npp = of_find_node_by_phandle(handle);
return buf;
}
static int do_add_property(char *buf, size_t bufsize)
{
struct property *prop = NULL;
struct device_node *np;
unsigned char *value;
char *name, *end;
int length;
end = buf + bufsize;
buf = parse_node(buf, bufsize, &np);
if (!np)
return -ENODEV;
if (parse_next_property(buf, end, &name, &length, &value) == NULL)
return -EINVAL;
prop = new_property(name, length, value, NULL);
if (!prop)
return -ENOMEM;
of_add_property(np, prop);
return 0;
}
static int do_remove_property(char *buf, size_t bufsize)
{
struct device_node *np;
char *tmp;
struct property *prop;
buf = parse_node(buf, bufsize, &np);
if (!np)
return -ENODEV;
tmp = strchr(buf,' ');
if (tmp)
*tmp = '\0';
if (strlen(buf) == 0)
return -EINVAL;
prop = of_find_property(np, buf, NULL);
return of_remove_property(np, prop);
}
static int do_update_property(char *buf, size_t bufsize)
{
struct device_node *np;
unsigned char *value;
char *name, *end, *next_prop;
int length;
struct property *newprop;
buf = parse_node(buf, bufsize, &np);
end = buf + bufsize;
if (!np)
return -ENODEV;
next_prop = parse_next_property(buf, end, &name, &length, &value);
if (!next_prop)
return -EINVAL;
if (!strlen(name))
return -ENODEV;
newprop = new_property(name, length, value, NULL);
if (!newprop)
return -ENOMEM;
if (!strcmp(name, "slb-size") || !strcmp(name, "ibm,slb-size"))
slb_set_size(*(int *)value);
return of_update_property(np, newprop);
}
/**
* ofdt_write - perform operations on the Open Firmware device tree
*
* @file: not used
* @buf: command and arguments
* @count: size of the command buffer
* @off: not used
*
* Operations supported at this time are addition and removal of
* whole nodes along with their properties. Operations on individual
* properties are not implemented (yet).
*/
static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t count,
loff_t *off)
{
int rv = 0;
char *kbuf;
char *tmp;
if (!(kbuf = kmalloc(count + 1, GFP_KERNEL))) {
rv = -ENOMEM;
goto out;
}
if (copy_from_user(kbuf, buf, count)) {
rv = -EFAULT;
goto out;
}
kbuf[count] = '\0';
tmp = strchr(kbuf, ' ');
if (!tmp) {
rv = -EINVAL;
goto out;
}
*tmp = '\0';
tmp++;
if (!strcmp(kbuf, "add_node"))
rv = do_add_node(tmp, count - (tmp - kbuf));
else if (!strcmp(kbuf, "remove_node"))
rv = do_remove_node(tmp);
else if (!strcmp(kbuf, "add_property"))
rv = do_add_property(tmp, count - (tmp - kbuf));
else if (!strcmp(kbuf, "remove_property"))
rv = do_remove_property(tmp, count - (tmp - kbuf));
else if (!strcmp(kbuf, "update_property"))
rv = do_update_property(tmp, count - (tmp - kbuf));
else
rv = -EINVAL;
out:
kfree(kbuf);
return rv ? rv : count;
}
static const struct file_operations ofdt_fops = {
.write = ofdt_write,
.llseek = noop_llseek,
};
/* create /proc/powerpc/ofdt write-only by root */
static int proc_ppc64_create_ofdt(void)
{
struct proc_dir_entry *ent;
ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops);
if (ent)
proc_set_size(ent, 0);
return 0;
}
machine_device_initcall(pseries, proc_ppc64_create_ofdt);

View file

@ -0,0 +1,45 @@
/*
* Copyright 2013, Michael Ellerman, IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "pseries-rng: " fmt
#include <linux/kernel.h>
#include <linux/of.h>
#include <asm/archrandom.h>
#include <asm/machdep.h>
#include <asm/plpar_wrappers.h>
static int pseries_get_random_long(unsigned long *v)
{
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
if (plpar_hcall(H_RANDOM, retbuf) == H_SUCCESS) {
*v = retbuf[0];
return 1;
}
return 0;
}
static __init int rng_init(void)
{
struct device_node *dn;
dn = of_find_compatible_node(NULL, NULL, "ibm,random");
if (!dn)
return -ENODEV;
pr_info("Registering arch random hook.\n");
ppc_md.get_random_long = pseries_get_random_long;
return 0;
}
machine_subsys_initcall(pseries, rng_init);

View file

@ -0,0 +1,200 @@
/*
* c 2001 PPC 64 Team, IBM Corp
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* scan-log-data driver for PPC64 Todd Inglett <tinglett@vnet.ibm.com>
*
* When ppc64 hardware fails the service processor dumps internal state
* of the system. After a reboot the operating system can access a dump
* of this data using this driver. A dump exists if the device-tree
* /chosen/ibm,scan-log-data property exists.
*
* This driver exports /proc/powerpc/scan-log-dump which can be read.
* The driver supports only sequential reads.
*
* The driver looks at a write to the driver for the single word "reset".
* If given, the driver will reset the scanlog so the platform can free it.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/rtas.h>
#include <asm/prom.h>
#define MODULE_VERS "1.0"
#define MODULE_NAME "scanlog"
/* Status returns from ibm,scan-log-dump */
#define SCANLOG_COMPLETE 0
#define SCANLOG_HWERROR -1
#define SCANLOG_CONTINUE 1
static unsigned int ibm_scan_log_dump; /* RTAS token */
static unsigned int *scanlog_buffer; /* The data buffer */
static ssize_t scanlog_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
unsigned int *data = scanlog_buffer;
int status;
unsigned long len, off;
unsigned int wait_time;
if (count > RTAS_DATA_BUF_SIZE)
count = RTAS_DATA_BUF_SIZE;
if (count < 1024) {
/* This is the min supported by this RTAS call. Rather
* than do all the buffering we insist the user code handle
* larger reads. As long as cp works... :)
*/
printk(KERN_ERR "scanlog: cannot perform a small read (%ld)\n", count);
return -EINVAL;
}
if (!access_ok(VERIFY_WRITE, buf, count))
return -EFAULT;
for (;;) {
wait_time = 500; /* default wait if no data */
spin_lock(&rtas_data_buf_lock);
memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE);
status = rtas_call(ibm_scan_log_dump, 2, 1, NULL,
(u32) __pa(rtas_data_buf), (u32) count);
memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE);
spin_unlock(&rtas_data_buf_lock);
pr_debug("scanlog: status=%d, data[0]=%x, data[1]=%x, " \
"data[2]=%x\n", status, data[0], data[1], data[2]);
switch (status) {
case SCANLOG_COMPLETE:
pr_debug("scanlog: hit eof\n");
return 0;
case SCANLOG_HWERROR:
pr_debug("scanlog: hardware error reading data\n");
return -EIO;
case SCANLOG_CONTINUE:
/* We may or may not have data yet */
len = data[1];
off = data[2];
if (len > 0) {
if (copy_to_user(buf, ((char *)data)+off, len))
return -EFAULT;
return len;
}
/* Break to sleep default time */
break;
default:
/* Assume extended busy */
wait_time = rtas_busy_delay_time(status);
if (!wait_time) {
printk(KERN_ERR "scanlog: unknown error " \
"from rtas: %d\n", status);
return -EIO;
}
}
/* Apparently no data yet. Wait and try again. */
msleep_interruptible(wait_time);
}
/*NOTREACHED*/
}
static ssize_t scanlog_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
char stkbuf[20];
int status;
if (count > 19) count = 19;
if (copy_from_user (stkbuf, buf, count)) {
return -EFAULT;
}
stkbuf[count] = 0;
if (buf) {
if (strncmp(stkbuf, "reset", 5) == 0) {
pr_debug("scanlog: reset scanlog\n");
status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0);
pr_debug("scanlog: rtas returns %d\n", status);
}
}
return count;
}
static int scanlog_open(struct inode * inode, struct file * file)
{
unsigned int *data = scanlog_buffer;
if (data[0] != 0) {
/* This imperfect test stops a second copy of the
* data (or a reset while data is being copied)
*/
return -EBUSY;
}
data[0] = 0; /* re-init so we restart the scan */
return 0;
}
static int scanlog_release(struct inode * inode, struct file * file)
{
unsigned int *data = scanlog_buffer;
data[0] = 0;
return 0;
}
const struct file_operations scanlog_fops = {
.owner = THIS_MODULE,
.read = scanlog_read,
.write = scanlog_write,
.open = scanlog_open,
.release = scanlog_release,
.llseek = noop_llseek,
};
static int __init scanlog_init(void)
{
struct proc_dir_entry *ent;
int err = -ENOMEM;
ibm_scan_log_dump = rtas_token("ibm,scan-log-dump");
if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE)
return -ENODEV;
/* Ideally we could allocate a buffer < 4G */
scanlog_buffer = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
if (!scanlog_buffer)
goto err;
ent = proc_create("powerpc/rtas/scan-log-dump", S_IRUSR, NULL,
&scanlog_fops);
if (!ent)
goto err;
return 0;
err:
kfree(scanlog_buffer);
return err;
}
static void __exit scanlog_cleanup(void)
{
remove_proc_entry("powerpc/rtas/scan-log-dump", NULL);
kfree(scanlog_buffer);
}
module_init(scanlog_init);
module_exit(scanlog_cleanup);
MODULE_LICENSE("GPL");

View file

@ -0,0 +1,815 @@
/*
* 64-bit pSeries and RS/6000 setup code.
*
* Copyright (C) 1995 Linus Torvalds
* Adapted from 'alpha' version by Gary Thomas
* Modified by Cort Dougan (cort@cs.nmt.edu)
* Modified by PPC64 Team, IBM Corp
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
/*
* bootup setup stuff..
*/
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/user.h>
#include <linux/tty.h>
#include <linux/major.h>
#include <linux/interrupt.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/console.h>
#include <linux/pci.h>
#include <linux/utsname.h>
#include <linux/adb.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/irq.h>
#include <linux/seq_file.h>
#include <linux/root_dev.h>
#include <linux/of.h>
#include <linux/kexec.h>
#include <asm/mmu.h>
#include <asm/processor.h>
#include <asm/io.h>
#include <asm/pgtable.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/pci-bridge.h>
#include <asm/iommu.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/irq.h>
#include <asm/time.h>
#include <asm/nvram.h>
#include <asm/pmc.h>
#include <asm/mpic.h>
#include <asm/xics.h>
#include <asm/ppc-pci.h>
#include <asm/i8259.h>
#include <asm/udbg.h>
#include <asm/smp.h>
#include <asm/firmware.h>
#include <asm/eeh.h>
#include <asm/reg.h>
#include <asm/plpar_wrappers.h>
#include "pseries.h"
int CMO_PrPSP = -1;
int CMO_SecPSP = -1;
unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K);
EXPORT_SYMBOL(CMO_PageSize);
int fwnmi_active; /* TRUE if an FWNMI handler is present */
static struct device_node *pSeries_mpic_node;
static void pSeries_show_cpuinfo(struct seq_file *m)
{
struct device_node *root;
const char *model = "";
root = of_find_node_by_path("/");
if (root)
model = of_get_property(root, "model", NULL);
seq_printf(m, "machine\t\t: CHRP %s\n", model);
of_node_put(root);
}
/* Initialize firmware assisted non-maskable interrupts if
* the firmware supports this feature.
*/
static void __init fwnmi_init(void)
{
unsigned long system_reset_addr, machine_check_addr;
int ibm_nmi_register = rtas_token("ibm,nmi-register");
if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
return;
/* If the kernel's not linked at zero we point the firmware at low
* addresses anyway, and use a trampoline to get to the real code. */
system_reset_addr = __pa(system_reset_fwnmi) - PHYSICAL_START;
machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START;
if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr,
machine_check_addr))
fwnmi_active = 1;
}
static void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
unsigned int cascade_irq = i8259_irq();
if (cascade_irq != NO_IRQ)
generic_handle_irq(cascade_irq);
chip->irq_eoi(&desc->irq_data);
}
static void __init pseries_setup_i8259_cascade(void)
{
struct device_node *np, *old, *found = NULL;
unsigned int cascade;
const u32 *addrp;
unsigned long intack = 0;
int naddr;
for_each_node_by_type(np, "interrupt-controller") {
if (of_device_is_compatible(np, "chrp,iic")) {
found = np;
break;
}
}
if (found == NULL) {
printk(KERN_DEBUG "pic: no ISA interrupt controller\n");
return;
}
cascade = irq_of_parse_and_map(found, 0);
if (cascade == NO_IRQ) {
printk(KERN_ERR "pic: failed to map cascade interrupt");
return;
}
pr_debug("pic: cascade mapped to irq %d\n", cascade);
for (old = of_node_get(found); old != NULL ; old = np) {
np = of_get_parent(old);
of_node_put(old);
if (np == NULL)
break;
if (strcmp(np->name, "pci") != 0)
continue;
addrp = of_get_property(np, "8259-interrupt-acknowledge", NULL);
if (addrp == NULL)
continue;
naddr = of_n_addr_cells(np);
intack = addrp[naddr-1];
if (naddr > 1)
intack |= ((unsigned long)addrp[naddr-2]) << 32;
}
if (intack)
printk(KERN_DEBUG "pic: PCI 8259 intack at 0x%016lx\n", intack);
i8259_init(found, intack);
of_node_put(found);
irq_set_chained_handler(cascade, pseries_8259_cascade);
}
static void __init pseries_mpic_init_IRQ(void)
{
struct device_node *np;
const unsigned int *opprop;
unsigned long openpic_addr = 0;
int naddr, n, i, opplen;
struct mpic *mpic;
np = of_find_node_by_path("/");
naddr = of_n_addr_cells(np);
opprop = of_get_property(np, "platform-open-pic", &opplen);
if (opprop != NULL) {
openpic_addr = of_read_number(opprop, naddr);
printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr);
}
of_node_put(np);
BUG_ON(openpic_addr == 0);
/* Setup the openpic driver */
mpic = mpic_alloc(pSeries_mpic_node, openpic_addr,
MPIC_NO_RESET, 16, 0, " MPIC ");
BUG_ON(mpic == NULL);
/* Add ISUs */
opplen /= sizeof(u32);
for (n = 0, i = naddr; i < opplen; i += naddr, n++) {
unsigned long isuaddr = of_read_number(opprop + i, naddr);
mpic_assign_isu(mpic, n, isuaddr);
}
/* Setup top-level get_irq */
ppc_md.get_irq = mpic_get_irq;
/* All ISUs are setup, complete initialization */
mpic_init(mpic);
/* Look for cascade */
pseries_setup_i8259_cascade();
}
static void __init pseries_xics_init_IRQ(void)
{
xics_init();
pseries_setup_i8259_cascade();
}
static void pseries_lpar_enable_pmcs(void)
{
unsigned long set, reset;
set = 1UL << 63;
reset = 0;
plpar_hcall_norets(H_PERFMON, set, reset);
}
static void __init pseries_discover_pic(void)
{
struct device_node *np;
const char *typep;
for_each_node_by_name(np, "interrupt-controller") {
typep = of_get_property(np, "compatible", NULL);
if (strstr(typep, "open-pic")) {
pSeries_mpic_node = of_node_get(np);
ppc_md.init_IRQ = pseries_mpic_init_IRQ;
setup_kexec_cpu_down_mpic();
smp_init_pseries_mpic();
return;
} else if (strstr(typep, "ppc-xicp")) {
ppc_md.init_IRQ = pseries_xics_init_IRQ;
setup_kexec_cpu_down_xics();
smp_init_pseries_xics();
return;
}
}
printk(KERN_ERR "pSeries_discover_pic: failed to recognize"
" interrupt-controller\n");
}
static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
{
struct device_node *np = node;
struct pci_dn *pci = NULL;
int err = NOTIFY_OK;
switch (action) {
case OF_RECONFIG_ATTACH_NODE:
pci = np->parent->data;
if (pci) {
update_dn_pci_info(np, pci->phb);
/* Create EEH device for the OF node */
eeh_dev_init(np, pci->phb);
}
break;
default:
err = NOTIFY_DONE;
break;
}
return err;
}
static struct notifier_block pci_dn_reconfig_nb = {
.notifier_call = pci_dn_reconfig_notifier,
};
struct kmem_cache *dtl_cache;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Allocate space for the dispatch trace log for all possible cpus
* and register the buffers with the hypervisor. This is used for
* computing time stolen by the hypervisor.
*/
static int alloc_dispatch_logs(void)
{
int cpu, ret;
struct paca_struct *pp;
struct dtl_entry *dtl;
if (!firmware_has_feature(FW_FEATURE_SPLPAR))
return 0;
if (!dtl_cache)
return 0;
for_each_possible_cpu(cpu) {
pp = &paca[cpu];
dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
if (!dtl) {
pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
cpu);
pr_warn("Stolen time statistics will be unreliable\n");
break;
}
pp->dtl_ridx = 0;
pp->dispatch_log = dtl;
pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
pp->dtl_curr = dtl;
}
/* Register the DTL for the current (boot) cpu */
dtl = get_paca()->dispatch_log;
get_paca()->dtl_ridx = 0;
get_paca()->dtl_curr = dtl;
get_paca()->lppaca_ptr->dtl_idx = 0;
/* hypervisor reads buffer length from this field */
dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
ret = register_dtl(hard_smp_processor_id(), __pa(dtl));
if (ret)
pr_err("WARNING: DTL registration of cpu %d (hw %d) failed "
"with %d\n", smp_processor_id(),
hard_smp_processor_id(), ret);
get_paca()->lppaca_ptr->dtl_enable_mask = 2;
return 0;
}
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static inline int alloc_dispatch_logs(void)
{
return 0;
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static int alloc_dispatch_log_kmem_cache(void)
{
dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
DISPATCH_LOG_BYTES, 0, NULL);
if (!dtl_cache) {
pr_warn("Failed to create dispatch trace log buffer cache\n");
pr_warn("Stolen time statistics will be unreliable\n");
return 0;
}
return alloc_dispatch_logs();
}
machine_early_initcall(pseries, alloc_dispatch_log_kmem_cache);
static void pseries_lpar_idle(void)
{
/*
* Default handler to go into low thread priority and possibly
* low power mode by cedeing processor to hypervisor
*/
/* Indicate to hypervisor that we are idle. */
get_lppaca()->idle = 1;
/*
* Yield the processor to the hypervisor. We return if
* an external interrupt occurs (which are driven prior
* to returning here) or if a prod occurs from another
* processor. When returning here, external interrupts
* are enabled.
*/
cede_processor();
get_lppaca()->idle = 0;
}
/*
* Enable relocation on during exceptions. This has partition wide scope and
* may take a while to complete, if it takes longer than one second we will
* just give up rather than wasting any more time on this - if that turns out
* to ever be a problem in practice we can move this into a kernel thread to
* finish off the process later in boot.
*/
long pSeries_enable_reloc_on_exc(void)
{
long rc;
unsigned int delay, total_delay = 0;
while (1) {
rc = enable_reloc_on_exceptions();
if (!H_IS_LONG_BUSY(rc))
return rc;
delay = get_longbusy_msecs(rc);
total_delay += delay;
if (total_delay > 1000) {
pr_warn("Warning: Giving up waiting to enable "
"relocation on exceptions (%u msec)!\n",
total_delay);
return rc;
}
mdelay(delay);
}
}
EXPORT_SYMBOL(pSeries_enable_reloc_on_exc);
long pSeries_disable_reloc_on_exc(void)
{
long rc;
while (1) {
rc = disable_reloc_on_exceptions();
if (!H_IS_LONG_BUSY(rc))
return rc;
mdelay(get_longbusy_msecs(rc));
}
}
EXPORT_SYMBOL(pSeries_disable_reloc_on_exc);
#ifdef CONFIG_KEXEC
static void pSeries_machine_kexec(struct kimage *image)
{
long rc;
if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
rc = pSeries_disable_reloc_on_exc();
if (rc != H_SUCCESS)
pr_warning("Warning: Failed to disable relocation on "
"exceptions: %ld\n", rc);
}
default_machine_kexec(image);
}
#endif
#ifdef __LITTLE_ENDIAN__
long pseries_big_endian_exceptions(void)
{
long rc;
while (1) {
rc = enable_big_endian_exceptions();
if (!H_IS_LONG_BUSY(rc))
return rc;
mdelay(get_longbusy_msecs(rc));
}
}
static long pseries_little_endian_exceptions(void)
{
long rc;
while (1) {
rc = enable_little_endian_exceptions();
if (!H_IS_LONG_BUSY(rc))
return rc;
mdelay(get_longbusy_msecs(rc));
}
}
#endif
static void __init pSeries_setup_arch(void)
{
set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
/* Discover PIC type and setup ppc_md accordingly */
pseries_discover_pic();
/* openpic global configuration register (64-bit format). */
/* openpic Interrupt Source Unit pointer (64-bit format). */
/* python0 facility area (mmio) (64-bit format) REAL address. */
/* init to some ~sane value until calibrate_delay() runs */
loops_per_jiffy = 50000000;
fwnmi_init();
/* By default, only probe PCI (can be overriden by rtas_pci) */
pci_add_flags(PCI_PROBE_ONLY);
/* Find and initialize PCI host bridges */
init_pci_config_tokens();
find_and_init_phbs();
of_reconfig_notifier_register(&pci_dn_reconfig_nb);
pSeries_nvram_init();
if (firmware_has_feature(FW_FEATURE_LPAR)) {
vpa_init(boot_cpuid);
ppc_md.power_save = pseries_lpar_idle;
ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
} else {
/* No special idle routine */
ppc_md.enable_pmcs = power4_enable_pmcs;
}
ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare;
if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
long rc;
if ((rc = pSeries_enable_reloc_on_exc()) != H_SUCCESS) {
pr_warn("Unable to enable relocation on exceptions: "
"%ld\n", rc);
}
}
}
static int __init pSeries_init_panel(void)
{
/* Manually leave the kernel version on the panel. */
#ifdef __BIG_ENDIAN__
ppc_md.progress("Linux ppc64\n", 0);
#else
ppc_md.progress("Linux ppc64le\n", 0);
#endif
ppc_md.progress(init_utsname()->version, 0);
return 0;
}
machine_arch_initcall(pseries, pSeries_init_panel);
static int pseries_set_dabr(unsigned long dabr, unsigned long dabrx)
{
return plpar_hcall_norets(H_SET_DABR, dabr);
}
static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx)
{
/* Have to set at least one bit in the DABRX according to PAPR */
if (dabrx == 0 && dabr == 0)
dabrx = DABRX_USER;
/* PAPR says we can only set kernel and user bits */
dabrx &= DABRX_KERNEL | DABRX_USER;
return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx);
}
static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx)
{
/* PAPR says we can't set HYP */
dawrx &= ~DAWRX_HYP;
return plapr_set_watchpoint0(dawr, dawrx);
}
#define CMO_CHARACTERISTICS_TOKEN 44
#define CMO_MAXLENGTH 1026
void pSeries_coalesce_init(void)
{
struct hvcall_mpp_x_data mpp_x_data;
if (firmware_has_feature(FW_FEATURE_CMO) && !h_get_mpp_x(&mpp_x_data))
powerpc_firmware_features |= FW_FEATURE_XCMO;
else
powerpc_firmware_features &= ~FW_FEATURE_XCMO;
}
/**
* fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions,
* handle that here. (Stolen from parse_system_parameter_string)
*/
static void pSeries_cmo_feature_init(void)
{
char *ptr, *key, *value, *end;
int call_status;
int page_order = IOMMU_PAGE_SHIFT_4K;
pr_debug(" -> fw_cmo_feature_init()\n");
spin_lock(&rtas_data_buf_lock);
memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
NULL,
CMO_CHARACTERISTICS_TOKEN,
__pa(rtas_data_buf),
RTAS_DATA_BUF_SIZE);
if (call_status != 0) {
spin_unlock(&rtas_data_buf_lock);
pr_debug("CMO not available\n");
pr_debug(" <- fw_cmo_feature_init()\n");
return;
}
end = rtas_data_buf + CMO_MAXLENGTH - 2;
ptr = rtas_data_buf + 2; /* step over strlen value */
key = value = ptr;
while (*ptr && (ptr <= end)) {
/* Separate the key and value by replacing '=' with '\0' and
* point the value at the string after the '='
*/
if (ptr[0] == '=') {
ptr[0] = '\0';
value = ptr + 1;
} else if (ptr[0] == '\0' || ptr[0] == ',') {
/* Terminate the string containing the key/value pair */
ptr[0] = '\0';
if (key == value) {
pr_debug("Malformed key/value pair\n");
/* Never found a '=', end processing */
break;
}
if (0 == strcmp(key, "CMOPageSize"))
page_order = simple_strtol(value, NULL, 10);
else if (0 == strcmp(key, "PrPSP"))
CMO_PrPSP = simple_strtol(value, NULL, 10);
else if (0 == strcmp(key, "SecPSP"))
CMO_SecPSP = simple_strtol(value, NULL, 10);
value = key = ptr + 1;
}
ptr++;
}
/* Page size is returned as the power of 2 of the page size,
* convert to the page size in bytes before returning
*/
CMO_PageSize = 1 << page_order;
pr_debug("CMO_PageSize = %lu\n", CMO_PageSize);
if (CMO_PrPSP != -1 || CMO_SecPSP != -1) {
pr_info("CMO enabled\n");
pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
CMO_SecPSP);
powerpc_firmware_features |= FW_FEATURE_CMO;
pSeries_coalesce_init();
} else
pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
CMO_SecPSP);
spin_unlock(&rtas_data_buf_lock);
pr_debug(" <- fw_cmo_feature_init()\n");
}
/*
* Early initialization. Relocation is on but do not reference unbolted pages
*/
static void __init pSeries_init_early(void)
{
pr_debug(" -> pSeries_init_early()\n");
#ifdef CONFIG_HVC_CONSOLE
if (firmware_has_feature(FW_FEATURE_LPAR))
hvc_vio_init_early();
#endif
if (firmware_has_feature(FW_FEATURE_XDABR))
ppc_md.set_dabr = pseries_set_xdabr;
else if (firmware_has_feature(FW_FEATURE_DABR))
ppc_md.set_dabr = pseries_set_dabr;
if (firmware_has_feature(FW_FEATURE_SET_MODE))
ppc_md.set_dawr = pseries_set_dawr;
pSeries_cmo_feature_init();
iommu_init_early_pSeries();
pr_debug(" <- pSeries_init_early()\n");
}
/*
* Called very early, MMU is off, device-tree isn't unflattened
*/
static int __init pseries_probe_fw_features(unsigned long node,
const char *uname, int depth,
void *data)
{
const char *prop;
int len;
static int hypertas_found;
static int vec5_found;
if (depth != 1)
return 0;
if (!strcmp(uname, "rtas") || !strcmp(uname, "rtas@0")) {
prop = of_get_flat_dt_prop(node, "ibm,hypertas-functions",
&len);
if (prop) {
powerpc_firmware_features |= FW_FEATURE_LPAR;
fw_hypertas_feature_init(prop, len);
}
hypertas_found = 1;
}
if (!strcmp(uname, "chosen")) {
prop = of_get_flat_dt_prop(node, "ibm,architecture-vec-5",
&len);
if (prop)
fw_vec5_feature_init(prop, len);
vec5_found = 1;
}
return hypertas_found && vec5_found;
}
static int __init pSeries_probe(void)
{
unsigned long root = of_get_flat_dt_root();
const char *dtype = of_get_flat_dt_prop(root, "device_type", NULL);
if (dtype == NULL)
return 0;
if (strcmp(dtype, "chrp"))
return 0;
/* Cell blades firmware claims to be chrp while it's not. Until this
* is fixed, we need to avoid those here.
*/
if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0") ||
of_flat_dt_is_compatible(root, "IBM,CBEA"))
return 0;
pr_debug("pSeries detected, looking for LPAR capability...\n");
/* Now try to figure out if we are running on LPAR */
of_scan_flat_dt(pseries_probe_fw_features, NULL);
#ifdef __LITTLE_ENDIAN__
if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
long rc;
/*
* Tell the hypervisor that we want our exceptions to
* be taken in little endian mode. If this fails we don't
* want to use BUG() because it will trigger an exception.
*/
rc = pseries_little_endian_exceptions();
if (rc) {
ppc_md.progress("H_SET_MODE LE exception fail", 0);
panic("Could not enable little endian exceptions");
}
}
#endif
if (firmware_has_feature(FW_FEATURE_LPAR))
hpte_init_lpar();
else
hpte_init_native();
pr_debug("Machine is%s LPAR !\n",
(powerpc_firmware_features & FW_FEATURE_LPAR) ? "" : " not");
return 1;
}
static int pSeries_pci_probe_mode(struct pci_bus *bus)
{
if (firmware_has_feature(FW_FEATURE_LPAR))
return PCI_PROBE_DEVTREE;
return PCI_PROBE_NORMAL;
}
/**
* pSeries_power_off - tell firmware about how to power off the system.
*
* This function calls either the power-off rtas token in normal cases
* or the ibm,power-off-ups token (if present & requested) in case of
* a power failure. If power-off token is used, power on will only be
* possible with power button press. If ibm,power-off-ups token is used
* it will allow auto poweron after power is restored.
*/
static void pSeries_power_off(void)
{
int rc;
int rtas_poweroff_ups_token = rtas_token("ibm,power-off-ups");
if (rtas_flash_term_hook)
rtas_flash_term_hook(SYS_POWER_OFF);
if (rtas_poweron_auto == 0 ||
rtas_poweroff_ups_token == RTAS_UNKNOWN_SERVICE) {
rc = rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1);
printk(KERN_INFO "RTAS power-off returned %d\n", rc);
} else {
rc = rtas_call(rtas_poweroff_ups_token, 0, 1, NULL);
printk(KERN_INFO "RTAS ibm,power-off-ups returned %d\n", rc);
}
for (;;);
}
#ifndef CONFIG_PCI
void pSeries_final_fixup(void) { }
#endif
define_machine(pseries) {
.name = "pSeries",
.probe = pSeries_probe,
.setup_arch = pSeries_setup_arch,
.init_early = pSeries_init_early,
.show_cpuinfo = pSeries_show_cpuinfo,
.log_error = pSeries_log_error,
.pcibios_fixup = pSeries_final_fixup,
.pci_probe_mode = pSeries_pci_probe_mode,
.restart = rtas_restart,
.power_off = pSeries_power_off,
.halt = rtas_halt,
.panic = rtas_os_term,
.get_boot_time = rtas_get_boot_time,
.get_rtc_time = rtas_get_rtc_time,
.set_rtc_time = rtas_set_rtc_time,
.calibrate_decr = generic_calibrate_decr,
.progress = rtas_progress,
.system_reset_exception = pSeries_system_reset_exception,
.machine_check_exception = pSeries_machine_check_exception,
#ifdef CONFIG_KEXEC
.machine_kexec = pSeries_machine_kexec,
#endif
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
.memory_block_size = pseries_memory_block_size,
#endif
};

View file

@ -0,0 +1,276 @@
/*
* SMP support for pSeries machines.
*
* Dave Engebretsen, Peter Bergner, and
* Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
*
* Plus various changes from other IBM teams...
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <asm/ptrace.h>
#include <linux/atomic.h>
#include <asm/irq.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/paca.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/firmware.h>
#include <asm/rtas.h>
#include <asm/mpic.h>
#include <asm/vdso_datapage.h>
#include <asm/cputhreads.h>
#include <asm/xics.h>
#include <asm/dbell.h>
#include <asm/plpar_wrappers.h>
#include <asm/code-patching.h>
#include "pseries.h"
#include "offline_states.h"
/*
* The Primary thread of each non-boot processor was started from the OF client
* interface by prom_hold_cpus and is spinning on secondary_hold_spinloop.
*/
static cpumask_var_t of_spin_mask;
/*
* If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here
*/
static void (*xics_cause_ipi)(int cpu, unsigned long data);
/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */
int smp_query_cpu_stopped(unsigned int pcpu)
{
int cpu_status, status;
int qcss_tok = rtas_token("query-cpu-stopped-state");
if (qcss_tok == RTAS_UNKNOWN_SERVICE) {
printk_once(KERN_INFO
"Firmware doesn't support query-cpu-stopped-state\n");
return QCSS_HARDWARE_ERROR;
}
status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu);
if (status != 0) {
printk(KERN_ERR
"RTAS query-cpu-stopped-state failed: %i\n", status);
return status;
}
return cpu_status;
}
/**
* smp_startup_cpu() - start the given cpu
*
* At boot time, there is nothing to do for primary threads which were
* started from Open Firmware. For anything else, call RTAS with the
* appropriate start location.
*
* Returns:
* 0 - failure
* 1 - success
*/
static inline int smp_startup_cpu(unsigned int lcpu)
{
int status;
unsigned long start_here =
__pa(ppc_function_entry(generic_secondary_smp_init));
unsigned int pcpu;
int start_cpu;
if (cpumask_test_cpu(lcpu, of_spin_mask))
/* Already started by OF and sitting in spin loop */
return 1;
pcpu = get_hard_smp_processor_id(lcpu);
/* Check to see if the CPU out of FW already for kexec */
if (smp_query_cpu_stopped(pcpu) == QCSS_NOT_STOPPED){
cpumask_set_cpu(lcpu, of_spin_mask);
return 1;
}
/* Fixup atomic count: it exited inside IRQ handler. */
task_thread_info(paca[lcpu].__current)->preempt_count = 0;
#ifdef CONFIG_HOTPLUG_CPU
if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE)
goto out;
#endif
/*
* If the RTAS start-cpu token does not exist then presume the
* cpu is already spinning.
*/
start_cpu = rtas_token("start-cpu");
if (start_cpu == RTAS_UNKNOWN_SERVICE)
return 1;
status = rtas_call(start_cpu, 3, 1, NULL, pcpu, start_here, pcpu);
if (status != 0) {
printk(KERN_ERR "start-cpu failed: %i\n", status);
return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
out:
#endif
return 1;
}
static void smp_xics_setup_cpu(int cpu)
{
if (cpu != boot_cpuid)
xics_setup_cpu();
if (cpu_has_feature(CPU_FTR_DBELL))
doorbell_setup_this_cpu();
if (firmware_has_feature(FW_FEATURE_SPLPAR))
vpa_init(cpu);
cpumask_clear_cpu(cpu, of_spin_mask);
#ifdef CONFIG_HOTPLUG_CPU
set_cpu_current_state(cpu, CPU_STATE_ONLINE);
set_default_offline_state(cpu);
#endif
}
static int smp_pSeries_kick_cpu(int nr)
{
BUG_ON(nr < 0 || nr >= NR_CPUS);
if (!smp_startup_cpu(nr))
return -ENOENT;
/*
* The processor is currently spinning, waiting for the
* cpu_start field to become non-zero After we set cpu_start,
* the processor will continue on to secondary_start
*/
paca[nr].cpu_start = 1;
#ifdef CONFIG_HOTPLUG_CPU
set_preferred_offline_state(nr, CPU_STATE_ONLINE);
if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) {
long rc;
unsigned long hcpuid;
hcpuid = get_hard_smp_processor_id(nr);
rc = plpar_hcall_norets(H_PROD, hcpuid);
if (rc != H_SUCCESS)
printk(KERN_ERR "Error: Prod to wake up processor %d "
"Ret= %ld\n", nr, rc);
}
#endif
return 0;
}
/* Only used on systems that support multiple IPI mechanisms */
static void pSeries_cause_ipi_mux(int cpu, unsigned long data)
{
if (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id())))
doorbell_cause_ipi(cpu, data);
else
xics_cause_ipi(cpu, data);
}
static __init int pSeries_smp_probe(void)
{
int ret = xics_smp_probe();
if (cpu_has_feature(CPU_FTR_DBELL)) {
xics_cause_ipi = smp_ops->cause_ipi;
smp_ops->cause_ipi = pSeries_cause_ipi_mux;
}
return ret;
}
static struct smp_ops_t pSeries_mpic_smp_ops = {
.message_pass = smp_mpic_message_pass,
.probe = smp_mpic_probe,
.kick_cpu = smp_pSeries_kick_cpu,
.setup_cpu = smp_mpic_setup_cpu,
};
static struct smp_ops_t pSeries_xics_smp_ops = {
.message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
.cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */
.probe = pSeries_smp_probe,
.kick_cpu = smp_pSeries_kick_cpu,
.setup_cpu = smp_xics_setup_cpu,
.cpu_bootable = smp_generic_cpu_bootable,
};
/* This is called very early */
static void __init smp_init_pseries(void)
{
int i;
pr_debug(" -> smp_init_pSeries()\n");
alloc_bootmem_cpumask_var(&of_spin_mask);
/*
* Mark threads which are still spinning in hold loops
*
* We know prom_init will not have started them if RTAS supports
* query-cpu-stopped-state.
*/
if (rtas_token("query-cpu-stopped-state") == RTAS_UNKNOWN_SERVICE) {
if (cpu_has_feature(CPU_FTR_SMT)) {
for_each_present_cpu(i) {
if (cpu_thread_in_core(i) == 0)
cpumask_set_cpu(i, of_spin_mask);
}
} else
cpumask_copy(of_spin_mask, cpu_present_mask);
cpumask_clear_cpu(boot_cpuid, of_spin_mask);
}
/* Non-lpar has additional take/give timebase */
if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
smp_ops->give_timebase = rtas_give_timebase;
smp_ops->take_timebase = rtas_take_timebase;
}
pr_debug(" <- smp_init_pSeries()\n");
}
void __init smp_init_pseries_mpic(void)
{
smp_ops = &pSeries_mpic_smp_ops;
smp_init_pseries();
}
void __init smp_init_pseries_xics(void)
{
smp_ops = &pSeries_xics_smp_ops;
smp_init_pseries();
}

View file

@ -0,0 +1,283 @@
/*
* Copyright (C) 2010 Brian King IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/suspend.h>
#include <linux/stat.h>
#include <asm/firmware.h>
#include <asm/hvcall.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/rtas.h>
#include <asm/topology.h>
#include "../../kernel/cacheinfo.h"
static u64 stream_id;
static struct device suspend_dev;
static DECLARE_COMPLETION(suspend_work);
static struct rtas_suspend_me_data suspend_data;
static atomic_t suspending;
/**
* pseries_suspend_begin - First phase of hibernation
*
* Check to ensure we are in a valid state to hibernate
*
* Return value:
* 0 on success / other on failure
**/
static int pseries_suspend_begin(suspend_state_t state)
{
long vasi_state, rc;
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
/* Make sure the state is valid */
rc = plpar_hcall(H_VASI_STATE, retbuf, stream_id);
vasi_state = retbuf[0];
if (rc) {
pr_err("pseries_suspend_begin: vasi_state returned %ld\n",rc);
return rc;
} else if (vasi_state == H_VASI_ENABLED) {
return -EAGAIN;
} else if (vasi_state != H_VASI_SUSPENDING) {
pr_err("pseries_suspend_begin: vasi_state returned state %ld\n",
vasi_state);
return -EIO;
}
return 0;
}
/**
* pseries_suspend_cpu - Suspend a single CPU
*
* Makes the H_JOIN call to suspend the CPU
*
**/
static int pseries_suspend_cpu(void)
{
if (atomic_read(&suspending))
return rtas_suspend_cpu(&suspend_data);
return 0;
}
/**
* pseries_suspend_enable_irqs
*
* Post suspend configuration updates
*
**/
static void pseries_suspend_enable_irqs(void)
{
/*
* Update configuration which can be modified based on device tree
* changes during resume.
*/
cacheinfo_cpu_offline(smp_processor_id());
post_mobility_fixup();
cacheinfo_cpu_online(smp_processor_id());
}
/**
* pseries_suspend_enter - Final phase of hibernation
*
* Return value:
* 0 on success / other on failure
**/
static int pseries_suspend_enter(suspend_state_t state)
{
int rc = rtas_suspend_last_cpu(&suspend_data);
atomic_set(&suspending, 0);
atomic_set(&suspend_data.done, 1);
return rc;
}
/**
* pseries_prepare_late - Prepare to suspend all other CPUs
*
* Return value:
* 0 on success / other on failure
**/
static int pseries_prepare_late(void)
{
atomic_set(&suspending, 1);
atomic_set(&suspend_data.working, 0);
atomic_set(&suspend_data.done, 0);
atomic_set(&suspend_data.error, 0);
suspend_data.complete = &suspend_work;
reinit_completion(&suspend_work);
return 0;
}
/**
* store_hibernate - Initiate partition hibernation
* @dev: subsys root device
* @attr: device attribute struct
* @buf: buffer
* @count: buffer size
*
* Write the stream ID received from the HMC to this file
* to trigger hibernating the partition
*
* Return value:
* number of bytes printed to buffer / other on failure
**/
static ssize_t store_hibernate(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
cpumask_var_t offline_mask;
int rc;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY))
return -ENOMEM;
stream_id = simple_strtoul(buf, NULL, 16);
do {
rc = pseries_suspend_begin(PM_SUSPEND_MEM);
if (rc == -EAGAIN)
ssleep(1);
} while (rc == -EAGAIN);
if (!rc) {
/* All present CPUs must be online */
cpumask_andnot(offline_mask, cpu_present_mask,
cpu_online_mask);
rc = rtas_online_cpus_mask(offline_mask);
if (rc) {
pr_err("%s: Could not bring present CPUs online.\n",
__func__);
goto out;
}
stop_topology_update();
rc = pm_suspend(PM_SUSPEND_MEM);
start_topology_update();
/* Take down CPUs not online prior to suspend */
if (!rtas_offline_cpus_mask(offline_mask))
pr_warn("%s: Could not restore CPUs to offline "
"state.\n", __func__);
}
stream_id = 0;
if (!rc)
rc = count;
out:
free_cpumask_var(offline_mask);
return rc;
}
#define USER_DT_UPDATE 0
#define KERN_DT_UPDATE 1
/**
* show_hibernate - Report device tree update responsibilty
* @dev: subsys root device
* @attr: device attribute struct
* @buf: buffer
*
* Report whether a device tree update is performed by the kernel after a
* resume, or if drmgr must coordinate the update from user space.
*
* Return value:
* 0 if drmgr is to initiate update, and 1 otherwise
**/
static ssize_t show_hibernate(struct device *dev,
struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%d\n", KERN_DT_UPDATE);
}
static DEVICE_ATTR(hibernate, S_IWUSR | S_IRUGO,
show_hibernate, store_hibernate);
static struct bus_type suspend_subsys = {
.name = "power",
.dev_name = "power",
};
static const struct platform_suspend_ops pseries_suspend_ops = {
.valid = suspend_valid_only_mem,
.begin = pseries_suspend_begin,
.prepare_late = pseries_prepare_late,
.enter = pseries_suspend_enter,
};
/**
* pseries_suspend_sysfs_register - Register with sysfs
*
* Return value:
* 0 on success / other on failure
**/
static int pseries_suspend_sysfs_register(struct device *dev)
{
int rc;
if ((rc = subsys_system_register(&suspend_subsys, NULL)))
return rc;
dev->id = 0;
dev->bus = &suspend_subsys;
if ((rc = device_create_file(suspend_subsys.dev_root, &dev_attr_hibernate)))
goto subsys_unregister;
return 0;
subsys_unregister:
bus_unregister(&suspend_subsys);
return rc;
}
/**
* pseries_suspend_init - initcall for pSeries suspend
*
* Return value:
* 0 on success / other on failure
**/
static int __init pseries_suspend_init(void)
{
int rc;
if (!firmware_has_feature(FW_FEATURE_LPAR))
return 0;
suspend_data.token = rtas_token("ibm,suspend-me");
if (suspend_data.token == RTAS_UNKNOWN_SERVICE)
return 0;
if ((rc = pseries_suspend_sysfs_register(&suspend_dev)))
return rc;
ppc_md.suspend_disable_cpu = pseries_suspend_cpu;
ppc_md.suspend_enable_irqs = pseries_suspend_enable_irqs;
suspend_set_ops(&pseries_suspend_ops);
return 0;
}
machine_device_initcall(pseries, pseries_suspend_init);