Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

2
kernel/Kconfig.freezer Normal file

@@ -0,0 +1,2 @@
config FREEZER
def_bool PM_SLEEP || CGROUP_FREEZER

58
kernel/Kconfig.hz Normal file

@@ -0,0 +1,58 @@
#
# Timer Interrupt Frequency Configuration
#
choice
prompt "Timer frequency"
default HZ_250
help
Allows the configuration of the timer frequency. It is customary
to have the timer interrupt run at 1000 Hz but 100 Hz may be more
beneficial for servers and NUMA systems that do not need to have
a fast response for user interaction and that may experience bus
contention and cacheline bounces as a result of timer interrupts.
Note that the timer interrupt occurs on each processor in an SMP
environment leading to NR_CPUS * HZ number of timer interrupts
per second.
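For example, a 16-CPU SMP system running at 1000 Hz services
16 * 1000 = 16000 timer interrupts every second, versus 1600 at 100 Hz.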
config HZ_100
bool "100 HZ"
help
100 Hz is a typical choice for servers, SMP and NUMA systems
with lots of processors that may show reduced performance if
too many timer interrupts are occurring.
config HZ_250
bool "250 HZ"
help
250 Hz is a good compromise choice allowing server performance
while also showing good interactive responsiveness even
on SMP and NUMA systems. If you are going to be using NTSC video
or multimedia, select 300 Hz instead.
config HZ_300
bool "300 HZ"
help
300 Hz is a good compromise choice allowing server performance
while also showing good interactive responsiveness even
on SMP and NUMA systems and exactly dividing by both PAL and
NTSC frame rates for video and multimedia work.
config HZ_1000
bool "1000 HZ"
help
1000 Hz is the preferred choice for desktop systems and other
systems requiring fast interactive responses to events.
endchoice
config HZ
int
default 100 if HZ_100
default 250 if HZ_250
default 300 if HZ_300
default 1000 if HZ_1000
config SCHED_HRTICK
def_bool HIGH_RES_TIMERS

239
kernel/Kconfig.locks Normal file

@@ -0,0 +1,239 @@
#
# The ARCH_INLINE foo is necessary because select ignores "depends on"
#
config ARCH_INLINE_SPIN_TRYLOCK
bool
config ARCH_INLINE_SPIN_TRYLOCK_BH
bool
config ARCH_INLINE_SPIN_LOCK
bool
config ARCH_INLINE_SPIN_LOCK_BH
bool
config ARCH_INLINE_SPIN_LOCK_IRQ
bool
config ARCH_INLINE_SPIN_LOCK_IRQSAVE
bool
config ARCH_INLINE_SPIN_UNLOCK
bool
config ARCH_INLINE_SPIN_UNLOCK_BH
bool
config ARCH_INLINE_SPIN_UNLOCK_IRQ
bool
config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
bool
config ARCH_INLINE_READ_TRYLOCK
bool
config ARCH_INLINE_READ_LOCK
bool
config ARCH_INLINE_READ_LOCK_BH
bool
config ARCH_INLINE_READ_LOCK_IRQ
bool
config ARCH_INLINE_READ_LOCK_IRQSAVE
bool
config ARCH_INLINE_READ_UNLOCK
bool
config ARCH_INLINE_READ_UNLOCK_BH
bool
config ARCH_INLINE_READ_UNLOCK_IRQ
bool
config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
bool
config ARCH_INLINE_WRITE_TRYLOCK
bool
config ARCH_INLINE_WRITE_LOCK
bool
config ARCH_INLINE_WRITE_LOCK_BH
bool
config ARCH_INLINE_WRITE_LOCK_IRQ
bool
config ARCH_INLINE_WRITE_LOCK_IRQSAVE
bool
config ARCH_INLINE_WRITE_UNLOCK
bool
config ARCH_INLINE_WRITE_UNLOCK_BH
bool
config ARCH_INLINE_WRITE_UNLOCK_IRQ
bool
config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
bool
config UNINLINE_SPIN_UNLOCK
bool
#
# lock_* functions are inlined when:
# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
#
# trylock_* functions are inlined when:
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
#
# unlock and unlock_irq functions are inlined when:
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
# or
# - DEBUG_SPINLOCK=n and PREEMPT=n
#
# unlock_bh and unlock_irqrestore functions are inlined when:
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
#
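# For example, on an architecture that selects ARCH_INLINE_SPIN_LOCK, the
# INLINE_SPIN_LOCK entry below defaults to y as long as GENERIC_LOCKBREAK
# and DEBUG_SPINLOCK are both disabled.
#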
if !DEBUG_SPINLOCK
config INLINE_SPIN_TRYLOCK
def_bool y
depends on ARCH_INLINE_SPIN_TRYLOCK
config INLINE_SPIN_TRYLOCK_BH
def_bool y
depends on ARCH_INLINE_SPIN_TRYLOCK_BH
config INLINE_SPIN_LOCK
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
config INLINE_SPIN_LOCK_BH
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH
config INLINE_SPIN_LOCK_IRQ
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ
config INLINE_SPIN_LOCK_IRQSAVE
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE
config INLINE_SPIN_UNLOCK_BH
def_bool y
depends on ARCH_INLINE_SPIN_UNLOCK_BH
config INLINE_SPIN_UNLOCK_IRQ
def_bool y
depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
config INLINE_SPIN_UNLOCK_IRQRESTORE
def_bool y
depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
config INLINE_READ_TRYLOCK
def_bool y
depends on ARCH_INLINE_READ_TRYLOCK
config INLINE_READ_LOCK
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
config INLINE_READ_LOCK_BH
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH
config INLINE_READ_LOCK_IRQ
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ
config INLINE_READ_LOCK_IRQSAVE
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE
config INLINE_READ_UNLOCK
def_bool y
depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
config INLINE_READ_UNLOCK_BH
def_bool y
depends on ARCH_INLINE_READ_UNLOCK_BH
config INLINE_READ_UNLOCK_IRQ
def_bool y
depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
config INLINE_READ_UNLOCK_IRQRESTORE
def_bool y
depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE
config INLINE_WRITE_TRYLOCK
def_bool y
depends on ARCH_INLINE_WRITE_TRYLOCK
config INLINE_WRITE_LOCK
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
config INLINE_WRITE_LOCK_BH
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH
config INLINE_WRITE_LOCK_IRQ
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ
config INLINE_WRITE_LOCK_IRQSAVE
def_bool y
depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE
config INLINE_WRITE_UNLOCK
def_bool y
depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
config INLINE_WRITE_UNLOCK_BH
def_bool y
depends on ARCH_INLINE_WRITE_UNLOCK_BH
config INLINE_WRITE_UNLOCK_IRQ
def_bool y
depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
config INLINE_WRITE_UNLOCK_IRQRESTORE
def_bool y
depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
endif
config ARCH_SUPPORTS_ATOMIC_RMW
bool
config MUTEX_SPIN_ON_OWNER
def_bool y
depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
config RWSEM_SPIN_ON_OWNER
def_bool y
depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
config ARCH_USE_QUEUE_RWLOCK
bool
config QUEUE_RWLOCK
def_bool y if ARCH_USE_QUEUE_RWLOCK
depends on SMP

58
kernel/Kconfig.preempt Normal file

@@ -0,0 +1,58 @@
choice
prompt "Preemption Model"
default PREEMPT_NONE
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
time, but there are no guarantees and occasional longer delays
are possible.
Select this option if you are building a kernel for a server or
scientific/computation system, or if you want to maximize the
raw processing power of the kernel, irrespective of scheduling
latencies.
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
preemption points have been selected to reduce the maximum
latency of rescheduling, providing faster application reactions,
at the cost of slightly lower throughput.
This allows reaction to interactive events by allowing a
low priority process to voluntarily preempt itself even if it
is in kernel mode executing a system call. This allows
applications to run more 'smoothly' even when the system is
under load.
Select this if you are building a kernel for a desktop system.
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
select PREEMPT_COUNT
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
preemptible. This allows reaction to interactive events by
permitting a low priority process to be preempted involuntarily
even if it is in kernel mode executing a system call and would
otherwise not be about to reach a natural preemption point.
This allows applications to run more 'smoothly' even when the
system is under load, at the cost of slightly lower throughput
and a slight runtime overhead to kernel code.
Select this if you are building a kernel for a desktop or
embedded system with latency requirements in the milliseconds
range.
endchoice
config PREEMPT_COUNT
bool

207
kernel/Makefile Normal file

@@ -0,0 +1,207 @@
#
# Makefile for the linux kernel.
#
obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o groups.o smpboot.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_irq_work.o = -pg
endif
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
obj-y += sched/
obj-y += locking/
obj-y += power/
obj-y += printk/
obj-y += irq/
obj-y += rcu/
obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
endif
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_BINFMT_ELF) += elfcore.o
obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_PERF_EVENTS) += events/
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
$(obj)/configs.o: $(obj)/config_data.h
# config_data.h contains the same information as ikconfig.h but gzipped.
# Info from config_data can be extracted from /proc/config*
targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call filechk,ikconfiggz)
###############################################################################
#
# Roll all the X.509 certificates that we can find together and pull them into
# the kernel so that they get loaded into the system trusted keyring during
# boot.
#
# We look in the source root and the build root for all files whose name ends
# in ".x509". Unfortunately, this will generate duplicate filenames, so we
# have make canonicalise the pathnames and then sort them to discard the
# duplicates.
#
###############################################################################
ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
$(or $(realpath $(CERT)),$(CERT))))
X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
ifeq ($(X509_CERTIFICATES),)
$(warning *** No X.509 certificates found ***)
endif
ifneq ($(wildcard $(obj)/.x509.list),)
ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
$(info X.509 certificate list changed)
$(shell rm $(obj)/.x509.list)
endif
endif
kernel/system_certificates.o: $(obj)/x509_certificate_list
quiet_cmd_x509certs = CERTS $@
cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
targets += $(obj)/x509_certificate_list
$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
$(call if_changed,x509certs)
targets += $(obj)/.x509.list
$(obj)/.x509.list:
@echo $(X509_CERTIFICATES) >$@
endif
clean-files := x509_certificate_list .x509.list
ifeq ($(CONFIG_MODULE_SIG),y)
###############################################################################
#
# If module signing is requested, say by allyesconfig, but a key has not been
# supplied, then one will need to be generated to make sure the build does not
# fail and that the kernel may be used afterwards.
#
###############################################################################
ifndef CONFIG_MODULE_SIG_HASH
$(error Could not determine digest type to use from kernel config)
endif
signing_key.priv signing_key.x509: x509.genkey
@echo "###"
@echo "### Now generating an X.509 key pair to be used for signing modules."
@echo "###"
@echo "### If this takes a long time, you might wish to run rngd in the"
@echo "### background to keep the supply of entropy topped up. It"
@echo "### needs to be run as root, and uses a hardware random"
@echo "### number generator if one is available."
@echo "###"
openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
-batch -x509 -config x509.genkey \
-outform DER -out signing_key.x509 \
-keyout signing_key.priv 2>&1
@echo "###"
@echo "### Key pair generated."
@echo "###"
x509.genkey:
@echo Generating X.509 key generation config
@echo >x509.genkey "[ req ]"
@echo >>x509.genkey "default_bits = 4096"
@echo >>x509.genkey "distinguished_name = req_distinguished_name"
@echo >>x509.genkey "prompt = no"
@echo >>x509.genkey "string_mask = utf8only"
@echo >>x509.genkey "x509_extensions = myexts"
@echo >>x509.genkey
@echo >>x509.genkey "[ req_distinguished_name ]"
@echo >>x509.genkey "O = Magrathea"
@echo >>x509.genkey "CN = Glacier signing key"
@echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
@echo >>x509.genkey
@echo >>x509.genkey "[ myexts ]"
@echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
@echo >>x509.genkey "keyUsage=digitalSignature"
@echo >>x509.genkey "subjectKeyIdentifier=hash"
@echo >>x509.genkey "authorityKeyIdentifier=keyid"
endif

604
kernel/acct.c Normal file

@@ -0,0 +1,604 @@
/*
* linux/kernel/acct.c
*
* BSD Process Accounting for Linux
*
* Author: Marco van Wieringen <mvw@planets.elm.net>
*
* Some code based on ideas and code from:
* Thomas K. Dyas <tdyas@eden.rutgers.edu>
*
* This file implements BSD-style process accounting. Whenever any
* process exits, an accounting record of type "struct acct" is
* written to the file specified with the acct() system call. It is
* up to user-level programs to do useful things with the accounting
* log. The kernel just provides the raw accounting information.
*
* (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
*
* Plugged two leaks. 1) It didn't return acct_file into the free_filps if
* the file happened to be read-only. 2) If the accounting was suspended
* due to the lack of space it happily allowed to reopen it and completely
* lost the old acct_file. 3/10/98, Al Viro.
*
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
*
* Fixed a nasty interaction with sys_umount(). If the accounting
* was suspended we failed to stop it on umount(). Messy.
* Another one: remount to readonly didn't stop accounting.
* Question: what should we do if we have CAP_SYS_ADMIN but not
* CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
* unless we are messing with the root. In that case we are getting a
* real mess with do_remount_sb(). 9/11/98, AV.
*
* Fixed a bunch of races (and pair of leaks). Probably not the best way,
* but this one obviously doesn't introduce deadlocks. Later. BTW, found
* one race (and leak) in BSD implementation.
* OK, that's better. ANOTHER race and leak in BSD variant. There always
* is one more bug... 10/11/98, AV.
*
* Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
* ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/acct.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/tty.h>
#include <linux/security.h>
#include <linux/vfs.h>
#include <linux/jiffies.h>
#include <linux/times.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
#include <linux/fs_pin.h>
/*
* These constants control the free-space thresholds that suspend and
* resume the process accounting system, and the time delay between
* each check.
* Turned into sysctl-controllable parameters. AV, 12/11/98
*/
int acct_parm[3] = {4, 2, 30};
#define RESUME (acct_parm[0]) /* >foo% free space - resume */
#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
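/*
 * With the defaults above (acct_parm = {4, 2, 30}), accounting is suspended
 * once free space falls below 2% of the filesystem, resumed once it climbs
 * back above 4%, and the check is repeated at most once every 30 seconds.
 */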
/*
* External references and all of the globals.
*/
static void do_acct_process(struct bsd_acct_struct *acct);
struct bsd_acct_struct {
struct fs_pin pin;
struct mutex lock;
int active;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
struct work_struct work;
struct completion done;
};
/*
* Check the amount of free space and suspend/resume accordingly.
*/
static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
if (time_is_before_jiffies(acct->needcheck))
goto out;
/* May block */
if (vfs_statfs(&acct->file->f_path, &sbuf))
goto out;
if (acct->active) {
u64 suspend = sbuf.f_blocks * SUSPEND;
do_div(suspend, 100);
if (sbuf.f_bavail <= suspend) {
acct->active = 0;
pr_info("Process accounting paused\n");
}
} else {
u64 resume = sbuf.f_blocks * RESUME;
do_div(resume, 100);
if (sbuf.f_bavail >= resume) {
acct->active = 1;
pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
out:
return acct->active;
}
static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
{
struct bsd_acct_struct *res;
again:
smp_rmb();
rcu_read_lock();
res = ACCESS_ONCE(ns->bacct);
if (!res) {
rcu_read_unlock();
return NULL;
}
if (!atomic_long_inc_not_zero(&res->pin.count)) {
rcu_read_unlock();
cpu_relax();
goto again;
}
rcu_read_unlock();
mutex_lock(&res->lock);
if (!res->ns) {
mutex_unlock(&res->lock);
pin_put(&res->pin);
goto again;
}
return res;
}
static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
struct file *file = acct->file;
if (file->f_op->flush)
file->f_op->flush(file, NULL);
__fput_sync(file);
complete(&acct->done);
}
static void acct_kill(struct bsd_acct_struct *acct,
struct bsd_acct_struct *new)
{
if (acct) {
struct pid_namespace *ns = acct->ns;
do_acct_process(acct);
INIT_WORK(&acct->work, close_work);
init_completion(&acct->done);
schedule_work(&acct->work);
wait_for_completion(&acct->done);
pin_remove(&acct->pin);
ns->bacct = new;
acct->ns = NULL;
atomic_long_dec(&acct->pin.count);
mutex_unlock(&acct->lock);
pin_put(&acct->pin);
}
}
static void acct_pin_kill(struct fs_pin *pin)
{
struct bsd_acct_struct *acct;
acct = container_of(pin, struct bsd_acct_struct, pin);
mutex_lock(&acct->lock);
if (!acct->ns) {
mutex_unlock(&acct->lock);
pin_put(pin);
acct = NULL;
}
acct_kill(acct, NULL);
}
static int acct_on(struct filename *pathname)
{
struct file *file;
struct vfsmount *mnt, *internal;
struct pid_namespace *ns = task_active_pid_ns(current);
struct bsd_acct_struct *acct, *old;
int err;
acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
if (!acct)
return -ENOMEM;
/* Difference from BSD - they don't do O_APPEND */
file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file)) {
kfree(acct);
return PTR_ERR(file);
}
if (!S_ISREG(file_inode(file)->i_mode)) {
kfree(acct);
filp_close(file, NULL);
return -EACCES;
}
if (!file->f_op->write) {
kfree(acct);
filp_close(file, NULL);
return -EIO;
}
internal = mnt_clone_internal(&file->f_path);
if (IS_ERR(internal)) {
kfree(acct);
filp_close(file, NULL);
return PTR_ERR(internal);
}
err = mnt_want_write(internal);
if (err) {
mntput(internal);
kfree(acct);
filp_close(file, NULL);
return err;
}
mnt = file->f_path.mnt;
file->f_path.mnt = internal;
atomic_long_set(&acct->pin.count, 1);
acct->pin.kill = acct_pin_kill;
acct->file = file;
acct->needcheck = jiffies;
acct->ns = ns;
mutex_init(&acct->lock);
mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
pin_insert(&acct->pin, mnt);
old = acct_get(ns);
if (old)
acct_kill(old, acct);
else
ns->bacct = acct;
mutex_unlock(&acct->lock);
mnt_drop_write(mnt);
mntput(mnt);
return 0;
}
static DEFINE_MUTEX(acct_on_mutex);
/**
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
*
* Returns 0 for success or negative errno values for failure.
*
* sys_acct() is the only system call needed to implement process
* accounting. It takes the name of the file where accounting records
* should be written. If the filename is NULL, accounting will be
* shutdown.
*/
SYSCALL_DEFINE1(acct, const char __user *, name)
{
int error = 0;
if (!capable(CAP_SYS_PACCT))
return -EPERM;
if (name) {
struct filename *tmp = getname(name);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
mutex_lock(&acct_on_mutex);
error = acct_on(tmp);
mutex_unlock(&acct_on_mutex);
putname(tmp);
} else {
acct_kill(acct_get(task_active_pid_ns(current)), NULL);
}
return error;
}
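/*
 * Illustrative sketch only (not part of this file): roughly how a userspace
 * tool such as accton(8) drives this syscall.  The pathname below is just an
 * example.
 */
#if 0 /* example, never compiled */
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	if (acct("/var/log/account/pacct"))	/* start writing records here */
		perror("acct");
	/* ... and acct(NULL) later turns accounting back off */
	return 0;
}
#endif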
void acct_exit_ns(struct pid_namespace *ns)
{
acct_kill(acct_get(ns), NULL);
}
/*
* encode an unsigned long into a comp_t
*
* This routine has been adapted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
* is a 13-bit fraction with a 3-bit (base 8) exponent.
*/
#define MANTSIZE 13 /* 13 bit mantissa. */
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
static comp_t encode_comp_t(unsigned long value)
{
int exp, rnd;
exp = rnd = 0;
while (value > MAXFRACT) {
rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
exp++;
}
/*
* If we need to round up, do it (and handle overflow correctly).
*/
if (rnd && (++value > MAXFRACT)) {
value >>= EXPSIZE;
exp++;
}
/*
* Clean it up and polish it off.
*/
exp <<= MANTSIZE; /* Shift the exponent into place */
exp += value; /* and add on the mantissa. */
return exp;
}
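/*
 * Illustrative sketch only (not part of this file): the inverse transform a
 * consumer of the accounting log would apply.  The low MANTSIZE bits hold the
 * mantissa and the top EXPSIZE bits a base-8 exponent, so decoding is a left
 * shift of 3 bits per exponent step.
 */
#if 0 /* example, never compiled */
static unsigned long decode_comp_t(comp_t c)
{
	unsigned long value = c & MAXFRACT;		/* 13-bit mantissa */
	int exp = (c >> MANTSIZE) & ((1 << EXPSIZE) - 1);

	while (exp--)
		value <<= EXPSIZE;			/* multiply by 8 */
	return value;
}
#endif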
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/*
* encode a u64 into a comp2_t (24 bits)
*
* Format: 5 bit base 2 exponent, 20 bits mantissa.
* The leading bit of the mantissa is not stored, but implied for
* non-zero exponents.
* Largest encodable value is 50 bits.
*/
#define MANTSIZE2 20 /* 20 bit mantissa. */
#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
static comp2_t encode_comp2_t(u64 value)
{
int exp, rnd;
exp = (value > (MAXFRACT2>>1));
rnd = 0;
while (value > MAXFRACT2) {
rnd = value & 1;
value >>= 1;
exp++;
}
/*
* If we need to round up, do it (and handle overflow correctly).
*/
if (rnd && (++value > MAXFRACT2)) {
value >>= 1;
exp++;
}
if (exp > MAXEXP2) {
/* Overflow. Return largest representable number instead. */
return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
} else {
return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
}
}
#endif
#if ACCT_VERSION == 3
/*
* encode an u64 into a 32 bit IEEE float
*/
static u32 encode_float(u64 value)
{
unsigned exp = 190;
unsigned u;
if (value == 0)
return 0;
while ((s64)value > 0) {
value <<= 1;
exp--;
}
u = (u32)(value >> 40) & 0x7fffffu;
return u | (exp << 23);
}
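/*
 * Worked example: value == 1 is shifted left 63 times before the sign bit
 * terminates the loop, giving exp = 190 - 63 = 127 and a zero stored
 * mantissa, i.e. 0x3f800000, which is 1.0f in IEEE-754 single precision.
 */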
#endif
/*
* Write an accounting entry for an exiting process
*
* The acct_process() call is the workhorse of the process
* accounting system. The struct acct is built here and then written
* into the accounting file. This function should only be called from
* do_exit() or when switching to a different output file.
*/
static void fill_ac(acct_t *ac)
{
struct pacct_struct *pacct = &current->signal->pacct;
u64 elapsed, run_time;
struct tty_struct *tty;
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
memset(ac, 0, sizeof(acct_t));
ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
run_time = ktime_get_ns();
run_time -= current->group_leader->start_time;
/* convert nsec -> AHZ */
elapsed = nsec_to_AHZ(run_time);
#if ACCT_VERSION == 3
ac->ac_etime = encode_float(elapsed);
#else
ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
(unsigned long) elapsed : (unsigned long) -1l);
#endif
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
{
/* new enlarged etime field */
comp2_t etime = encode_comp2_t(elapsed);
ac->ac_etime_hi = etime >> 16;
ac->ac_etime_lo = (u16) etime;
}
#endif
do_div(elapsed, AHZ);
ac->ac_btime = get_seconds() - elapsed;
#if ACCT_VERSION==2
ac->ac_ahz = AHZ;
#endif
spin_lock_irq(&current->sighand->siglock);
tty = current->signal->tty; /* Safe as we hold the siglock */
ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
ac->ac_flag = pacct->ac_flag;
ac->ac_mem = encode_comp_t(pacct->ac_mem);
ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
ac->ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
}
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
static void do_acct_process(struct bsd_acct_struct *acct)
{
acct_t ac;
unsigned long flim;
const struct cred *orig_cred;
struct file *file = acct->file;
/*
* Accounting records are not subject to resource limits.
*/
flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
/*
* First check to see if there is enough free space to continue
* the process accounting system.
*/
if (!check_free_space(acct))
goto out;
fill_ac(&ac);
/* we really need to bite the bullet and change layout */
ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
ac.ac_gid16 = ac.ac_gid;
#endif
#if ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
ac.ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
ns);
rcu_read_unlock();
}
#endif
/*
* Get freeze protection. If the fs is frozen, just skip the write
* as we could deadlock the system otherwise.
*/
if (file_start_write_trylock(file)) {
/* it's been opened O_APPEND, so position is irrelevant */
loff_t pos = 0;
__kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
file_end_write(file);
}
out:
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
revert_creds(orig_cred);
}
/**
* acct_collect - collect accounting information into pacct_struct
* @exitcode: task exit code
* @group_dead: non-zero if this thread is the last one in the process.
*/
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
cputime_t utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
struct vm_area_struct *vma;
down_read(&current->mm->mmap_sem);
vma = current->mm->mmap;
while (vma) {
vsize += vma->vm_end - vma->vm_start;
vma = vma->vm_next;
}
up_read(&current->mm->mmap_sem);
}
spin_lock_irq(&current->sighand->siglock);
if (group_dead)
pacct->ac_mem = vsize / 1024;
if (thread_group_leader(current)) {
pacct->ac_exitcode = exitcode;
if (current->flags & PF_FORKNOEXEC)
pacct->ac_flag |= AFORK;
}
if (current->flags & PF_SUPERPRIV)
pacct->ac_flag |= ASU;
if (current->flags & PF_DUMPCORE)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
task_cputime(current, &utime, &stime);
pacct->ac_utime += utime;
pacct->ac_stime += stime;
pacct->ac_minflt += current->min_flt;
pacct->ac_majflt += current->maj_flt;
spin_unlock_irq(&current->sighand->siglock);
}
static void slow_acct_process(struct pid_namespace *ns)
{
for ( ; ns; ns = ns->parent) {
struct bsd_acct_struct *acct = acct_get(ns);
if (acct) {
do_acct_process(acct);
mutex_unlock(&acct->lock);
pin_put(&acct->pin);
}
}
}
/**
* acct_process
*
* handles process accounting for an exiting task
*/
void acct_process(void)
{
struct pid_namespace *ns;
/*
* This loop is safe lockless, since current is still
* alive and holds its namespace, which in turn holds
* its parent.
*/
for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
if (ns->bacct)
break;
}
if (unlikely(ns))
slow_acct_process(ns);
}

328
kernel/async.c Normal file

@@ -0,0 +1,328 @@
/*
* async.c: Asynchronous function calls for boot performance
*
* (C) Copyright 2009 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
/*
Goals and Theory of Operation
The primary goal of this feature is to reduce the kernel boot time,
by doing various independent hardware delays and discovery operations
decoupled and not strictly serialized.
More specifically, the asynchronous function call concept allows
certain operations (primarily during system boot) to happen
asynchronously, out of order, while these operations still
have their externally visible parts happen sequentially and in-order.
(not unlike how out-of-order CPUs retire their instructions in order)
Key to the asynchronous function call implementation is the concept of
a "sequence cookie" (which, although it has an abstracted type, can be
thought of as a monotonically incrementing number).
The async core will assign each scheduled event such a sequence cookie and
pass this to the called functions.
The asynchronously called function should, before doing a globally visible
operation such as registering device numbers, call the
async_synchronize_cookie() function and pass in its own cookie. The
async_synchronize_cookie() function will make sure that all asynchronous
operations that were scheduled prior to the operation corresponding with the
cookie have completed.
Subsystem/driver initialization code that schedules asynchronous probe
functions, but shares global resources with other drivers/subsystems
that do not use the asynchronous call feature, needs to do a full
synchronization with the async_synchronize_full() function, before returning
from their init function. This is to maintain strict ordering between the
asynchronous and synchronous parts of the kernel.
*/
#include <linux/async.h>
#include <linux/atomic.h>
#include <linux/ktime.h>
#include <linux/export.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "workqueue_internal.h"
static async_cookie_t next_cookie = 1;
#define MAX_WORK 32768
#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
static LIST_HEAD(async_global_pending); /* pending from all registered doms */
static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
struct async_entry {
struct list_head domain_list;
struct list_head global_list;
struct work_struct work;
async_cookie_t cookie;
async_func_t func;
void *data;
struct async_domain *domain;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
struct list_head *pending;
async_cookie_t ret = ASYNC_COOKIE_MAX;
unsigned long flags;
spin_lock_irqsave(&async_lock, flags);
if (domain)
pending = &domain->pending;
else
pending = &async_global_pending;
if (!list_empty(pending))
ret = list_first_entry(pending, struct async_entry,
domain_list)->cookie;
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
/*
* pick the first pending entry and run it
*/
static void async_run_entry_fn(struct work_struct *work)
{
struct async_entry *entry =
container_of(work, struct async_entry, work);
unsigned long flags;
ktime_t uninitialized_var(calltime), delta, rettime;
/* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
pr_debug("calling %lli_%pF @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
}
entry->func(entry->data, entry->cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
}
/* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
list_del_init(&entry->domain_list);
list_del_init(&entry->global_list);
/* 3) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
/* 4) wake up any waiters */
wake_up(&async_done);
}
static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
async_cookie_t newcookie;
/* allow irq-off callers */
entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
/*
* If we're out of memory or if there's too much work
* pending already, we execute synchronously.
*/
if (!entry || atomic_read(&entry_count) > MAX_WORK) {
kfree(entry);
spin_lock_irqsave(&async_lock, flags);
newcookie = next_cookie++;
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
func(data, newcookie);
return newcookie;
}
INIT_LIST_HEAD(&entry->domain_list);
INIT_LIST_HEAD(&entry->global_list);
INIT_WORK(&entry->work, async_run_entry_fn);
entry->func = func;
entry->data = data;
entry->domain = domain;
spin_lock_irqsave(&async_lock, flags);
/* allocate cookie and queue */
newcookie = entry->cookie = next_cookie++;
list_add_tail(&entry->domain_list, &domain->pending);
if (domain->registered)
list_add_tail(&entry->global_list, &async_global_pending);
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
/* mark that this task has queued an async job, used by module init */
current->flags |= PF_USED_ASYNC;
/* schedule for execution */
queue_work(system_unbound_wq, &entry->work);
return newcookie;
}
/**
* async_schedule - schedule a function for asynchronous execution
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
*/
async_cookie_t async_schedule(async_func_t func, void *data)
{
return __async_schedule(func, data, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule);
/**
* async_schedule_domain - schedule a function for asynchronous execution within a certain domain
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
* @domain: the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
* @domain may be used in the async_synchronize_*_domain() functions to
* wait within a certain synchronization domain rather than globally. A
* synchronization domain is specified via @domain. Note: This function
* may be called from atomic or non-atomic contexts.
*/
async_cookie_t async_schedule_domain(async_func_t func, void *data,
struct async_domain *domain)
{
return __async_schedule(func, data, domain);
}
EXPORT_SYMBOL_GPL(async_schedule_domain);
/**
* async_synchronize_full - synchronize all asynchronous function calls
*
* This function waits until all asynchronous function calls have been done.
*/
void async_synchronize_full(void)
{
async_synchronize_full_domain(NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
* async_unregister_domain - ensure no more anonymous waiters on this domain
* @domain: idle domain to flush out of any async_synchronize_full instances
*
* async_synchronize_{cookie|full}_domain() are not flushed since callers
* of these routines should know the lifetime of @domain
*
* Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
*/
void async_unregister_domain(struct async_domain *domain)
{
spin_lock_irq(&async_lock);
WARN_ON(!domain->registered || !list_empty(&domain->pending));
domain->registered = 0;
spin_unlock_irq(&async_lock);
}
EXPORT_SYMBOL_GPL(async_unregister_domain);
/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
* @domain: the domain to synchronize
*
* This function waits until all asynchronous function calls for the
* synchronization domain specified by @domain have been done.
*/
void async_synchronize_full_domain(struct async_domain *domain)
{
async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
* @domain: the domain to synchronize (%NULL for all registered domains)
*
* This function waits until all asynchronous function calls for the
* synchronization domain specified by @domain submitted prior to @cookie
* have been done.
*/
void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
ktime_t uninitialized_var(starttime), delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
wait_event(async_done, lowest_in_progress(domain) >= cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
pr_debug("async_continuing @ %i after %lli usec\n",
task_pid_nr(current),
(long long)ktime_to_ns(delta) >> 10);
}
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
/**
* async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
*
* This function waits until all asynchronous function calls prior to @cookie
* have been done.
*/
void async_synchronize_cookie(async_cookie_t cookie)
{
async_synchronize_cookie_domain(cookie, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
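/*
 * Illustrative sketch only (not part of this file): the usage pattern from
 * the header comment.  do_slow_hardware_init(), register_my_device() and
 * my_device are hypothetical stand-ins for real driver code.
 */
#if 0 /* example, never compiled */
static void my_probe(void *data, async_cookie_t cookie)
{
	do_slow_hardware_init(data);		/* may run out of order */
	/* wait for everything scheduled before us ... */
	async_synchronize_cookie(cookie);
	/* ... so the externally visible registration stays in order */
	register_my_device(data);
}

static int __init my_init(void)
{
	async_schedule(my_probe, &my_device);
	return 0;
}
#endif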
/**
* current_is_async - is %current an async worker task?
*
* Returns %true if %current is an async worker task.
*/
bool current_is_async(void)
{
struct worker *worker = current_wq_worker();
return worker && worker->current_func == async_run_entry_fn;
}

2055
kernel/audit.c Normal file

File diff suppressed because it is too large

338
kernel/audit.h Normal file

@@ -0,0 +1,338 @@
/* audit -- definition of audit_context structure and supporting types
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
/* 0 = no checking
1 = put_count checking
2 = verbose put_count checking
*/
#define AUDIT_DEBUG 0
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
* a name dynamically and also add those to the list anchored by names_list. */
#define AUDIT_NAMES 5
/* At task start time, the audit_state is set in the audit_context using
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
enum audit_state {
AUDIT_DISABLED, /* Do not create per-task audit_context.
* No syscall-specific audit records can
* be generated. */
AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
* and fill it in at syscall
* entry time. This makes a full
* syscall record available if some
* other part of the kernel decides it
* should be recorded. */
AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
* always fill it in at syscall entry
* time, and always write out the audit
* record at syscall exit time. */
};
/* Rule lists */
struct audit_watch;
struct audit_tree;
struct audit_chunk;
struct audit_entry {
struct list_head list;
struct rcu_head rcu;
struct audit_krule rule;
};
struct audit_cap_data {
kernel_cap_t permitted;
kernel_cap_t inheritable;
union {
unsigned int fE; /* effective bit of file cap */
kernel_cap_t effective; /* effective set of process */
};
};
/* When fs/namei.c:getname() is called, we store the pointer in name and
* we don't let putname() free it (instead we free all of the saved
* pointers at syscall exit time).
*
* Further, in fs/namei.c:path_lookup() we store the inode and device.
*/
struct audit_names {
struct list_head list; /* audit_context->names_list */
struct filename *name;
int name_len; /* number of chars to log */
bool hidden; /* don't log this record */
bool name_put; /* call __putname()? */
unsigned long ino;
dev_t dev;
umode_t mode;
kuid_t uid;
kgid_t gid;
dev_t rdev;
u32 osid;
struct audit_cap_data fcap;
unsigned int fcap_ver;
unsigned char type; /* record type */
/*
* This was an allocated audit_names and not from the array of
* names allocated in the task audit context. Thus this name
* should be freed on syscall exit.
*/
bool should_free;
};
struct audit_proctitle {
int len; /* length of the cmdline field. */
char *value; /* the cmdline field */
};
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
int in_syscall; /* 1 if task is in a syscall */
enum audit_state state, current_state;
unsigned int serial; /* serial number for record */
int major; /* syscall number */
struct timespec ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
u64 prio;
int return_valid; /* return code is valid */
/*
* The names_list is the list of all audit_names collected during this
* syscall. The first AUDIT_NAMES entries in the names_list will
* actually be from the preallocated_names array for performance
* reasons. Except during allocation they should never be referenced
* through the preallocated_names array and should only be found/used
* by running the names_list.
*/
struct audit_names preallocated_names[AUDIT_NAMES];
int name_count; /* total records in names_list */
struct list_head names_list; /* struct audit_names->list anchor */
char *filterkey; /* key for rule that triggered record */
struct path pwd;
struct audit_aux_data *aux;
struct audit_aux_data *aux_pids;
struct sockaddr_storage *sockaddr;
size_t sockaddr_len;
/* Save things to print about task_struct */
pid_t pid, ppid;
kuid_t uid, euid, suid, fsuid;
kgid_t gid, egid, sgid, fsgid;
unsigned long personality;
int arch;
pid_t target_pid;
kuid_t target_auid;
kuid_t target_uid;
unsigned int target_sessionid;
u32 target_sid;
char target_comm[TASK_COMM_LEN];
struct audit_tree_refs *trees, *first_trees;
struct list_head killed_trees;
int tree_count;
int type;
union {
struct {
int nargs;
long args[6];
} socketcall;
struct {
kuid_t uid;
kgid_t gid;
umode_t mode;
u32 osid;
int has_perm;
uid_t perm_uid;
gid_t perm_gid;
umode_t perm_mode;
unsigned long qbytes;
} ipc;
struct {
mqd_t mqdes;
struct mq_attr mqstat;
} mq_getsetattr;
struct {
mqd_t mqdes;
int sigev_signo;
} mq_notify;
struct {
mqd_t mqdes;
size_t msg_len;
unsigned int msg_prio;
struct timespec abs_timeout;
} mq_sendrecv;
struct {
int oflag;
umode_t mode;
struct mq_attr attr;
} mq_open;
struct {
pid_t pid;
struct audit_cap_data cap;
} capset;
struct {
int fd;
int flags;
} mmap;
struct {
int argc;
} execve;
};
int fds[2];
struct audit_proctitle proctitle;
#if AUDIT_DEBUG
int put_count;
int ino_count;
#endif
};
extern u32 audit_ever_enabled;
extern void audit_copy_inode(struct audit_names *name,
const struct dentry *dentry,
const struct inode *inode);
extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap);
extern void audit_log_name(struct audit_context *context,
struct audit_names *n, struct path *path,
int record_num, int *call_panic);
extern int audit_pid;
#define AUDIT_INODE_BUCKETS 32
extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
static inline int audit_hash_ino(u32 ino)
{
return (ino & (AUDIT_INODE_BUCKETS-1));
}
/* Indicates that audit should log the full pathname. */
#define AUDIT_NAME_FULL -1
extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
extern int parent_len(const char *path);
extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
int done, int multi,
const void *payload, int size);
extern void audit_panic(const char *message);
struct audit_netlink_list {
__u32 portid;
struct net *net;
struct sk_buff_head q;
};
int audit_send_list(void *);
struct audit_net {
struct sock *nlsk;
};
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
extern void audit_free_rule_rcu(struct rcu_head *);
extern struct list_head audit_filter_list[];
extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
/* audit watch functions */
#ifdef CONFIG_AUDIT_WATCH
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
#else
#define audit_put_watch(w) {}
#define audit_get_watch(w) {}
#define audit_to_watch(k, p, l, o) (-EINVAL)
#define audit_add_watch(k, l) (-EINVAL)
#define audit_remove_watch_rule(k) BUG()
#define audit_watch_path(w) ""
#define audit_watch_compare(w, i, d) 0
#endif /* CONFIG_AUDIT_WATCH */
#ifdef CONFIG_AUDIT_TREE
extern struct audit_chunk *audit_tree_lookup(const struct inode *);
extern void audit_put_chunk(struct audit_chunk *);
extern int audit_tree_match(struct audit_chunk *, struct audit_tree *);
extern int audit_make_tree(struct audit_krule *, char *, u32);
extern int audit_add_tree_rule(struct audit_krule *);
extern int audit_remove_tree_rule(struct audit_krule *);
extern void audit_trim_trees(void);
extern int audit_tag_tree(char *old, char *new);
extern const char *audit_tree_path(struct audit_tree *);
extern void audit_put_tree(struct audit_tree *);
extern void audit_kill_trees(struct list_head *);
#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
#define audit_make_tree(rule, str, op) -EINVAL
#define audit_trim_trees() (void)0
#define audit_put_tree(tree) (void)0
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
#define audit_kill_trees(list) BUG()
#endif
extern char *audit_unpack_string(void **, size_t *, size_t);
extern pid_t audit_sig_pid;
extern kuid_t audit_sig_uid;
extern u32 audit_sig_sid;
#ifdef CONFIG_AUDITSYSCALL
extern int __audit_signal_info(int sig, struct task_struct *t);
static inline int audit_signal_info(int sig, struct task_struct *t)
{
if (unlikely((audit_pid && t->tgid == audit_pid) ||
(audit_signals && !audit_dummy_context())))
return __audit_signal_info(sig, t);
return 0;
}
extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
extern struct list_head *audit_killed_trees(void);
#else
#define audit_signal_info(s,t) AUDIT_DISABLED
#define audit_filter_inodes(t,c) AUDIT_DISABLED
#endif
extern struct mutex audit_cmd_mutex;

956
kernel/audit_tree.c Normal file

@@ -0,0 +1,956 @@
#include "audit.h"
#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/kthread.h>
#include <linux/slab.h>
struct audit_tree;
struct audit_chunk;
struct audit_tree {
atomic_t count;
int goner;
struct audit_chunk *root;
struct list_head chunks;
struct list_head rules;
struct list_head list;
struct list_head same_root;
struct rcu_head head;
char pathname[];
};
struct audit_chunk {
struct list_head hash;
struct fsnotify_mark mark;
struct list_head trees; /* with root here */
int dead;
int count;
atomic_long_t refs;
struct rcu_head head;
struct node {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
} owners[];
};
static LIST_HEAD(tree_list);
static LIST_HEAD(prune_list);
/*
* One struct chunk is attached to each inode of interest.
* We replace struct chunk on tagging/untagging.
* Rules have pointer to struct audit_tree.
* Rules have struct list_head rlist forming a list of rules over
* the same tree.
* References to struct chunk are collected at audit_inode{,_child}()
* time and used in AUDIT_TREE rule matching.
* These references are dropped at the same time we are calling
* audit_free_names(), etc.
*
* Cyclic lists galore:
* tree.chunks anchors chunk.owners[].list hash_lock
* tree.rules anchors rule.rlist audit_filter_mutex
* chunk.trees anchors tree.same_root hash_lock
* chunk.hash is a hash with middle bits of watch.inode as
* a hash function. RCU, hash_lock
*
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
* chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
* of watch contributes 1 to .refs).
*
* node.index allows us to get from node.list to the containing chunk.
* MSB of that sucker is stolen to mark taggings that we might have to
* revert - several operations have very unpleasant cleanup logic and
* that makes a difference. Some.
*/
static struct fsnotify_group *audit_tree_group;
static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
if (tree) {
atomic_set(&tree->count, 1);
tree->goner = 0;
INIT_LIST_HEAD(&tree->chunks);
INIT_LIST_HEAD(&tree->rules);
INIT_LIST_HEAD(&tree->list);
INIT_LIST_HEAD(&tree->same_root);
tree->root = NULL;
strcpy(tree->pathname, s);
}
return tree;
}
static inline void get_tree(struct audit_tree *tree)
{
atomic_inc(&tree->count);
}
static inline void put_tree(struct audit_tree *tree)
{
if (atomic_dec_and_test(&tree->count))
kfree_rcu(tree, head);
}
/* to avoid bringing the entire thing in audit.h */
const char *audit_tree_path(struct audit_tree *tree)
{
return tree->pathname;
}
static void free_chunk(struct audit_chunk *chunk)
{
int i;
for (i = 0; i < chunk->count; i++) {
if (chunk->owners[i].owner)
put_tree(chunk->owners[i].owner);
}
kfree(chunk);
}
void audit_put_chunk(struct audit_chunk *chunk)
{
if (atomic_long_dec_and_test(&chunk->refs))
free_chunk(chunk);
}
static void __put_chunk(struct rcu_head *rcu)
{
struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
audit_put_chunk(chunk);
}
static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
{
struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
call_rcu(&chunk->head, __put_chunk);
}
static struct audit_chunk *alloc_chunk(int count)
{
struct audit_chunk *chunk;
size_t size;
int i;
size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
chunk = kzalloc(size, GFP_KERNEL);
if (!chunk)
return NULL;
INIT_LIST_HEAD(&chunk->hash);
INIT_LIST_HEAD(&chunk->trees);
chunk->count = count;
atomic_long_set(&chunk->refs, 1);
for (i = 0; i < count; i++) {
INIT_LIST_HEAD(&chunk->owners[i].list);
chunk->owners[i].index = i;
}
fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
chunk->mark.mask = FS_IN_IGNORED;
return chunk;
}
enum {HASH_SIZE = 128};
static struct list_head chunk_hash_heads[HASH_SIZE];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
static inline struct list_head *chunk_hash(const struct inode *inode)
{
unsigned long n = (unsigned long)inode / L1_CACHE_BYTES;
return chunk_hash_heads + n % HASH_SIZE;
}
/* hash_lock & entry->lock is held by caller */
static void insert_hash(struct audit_chunk *chunk)
{
struct fsnotify_mark *entry = &chunk->mark;
struct list_head *list;
if (!entry->i.inode)
return;
list = chunk_hash(entry->i.inode);
list_add_rcu(&chunk->hash, list);
}
/* called under rcu_read_lock */
struct audit_chunk *audit_tree_lookup(const struct inode *inode)
{
struct list_head *list = chunk_hash(inode);
struct audit_chunk *p;
list_for_each_entry_rcu(p, list, hash) {
/* mark.inode may have gone NULL, but who cares? */
if (p->mark.i.inode == inode) {
atomic_long_inc(&p->refs);
return p;
}
}
return NULL;
}
int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
{
int n;
for (n = 0; n < chunk->count; n++)
if (chunk->owners[n].owner == tree)
return 1;
return 0;
}
/* tagging and untagging inodes with trees */
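/*
 * Layout note for find_chunk() below (see the node.index comment above):
 * for p == &chunk->owners[i] the low bits of p->index hold i (only the MSB
 * is the "will prune" flag), so p - i points at owners[0] and container_of()
 * recovers the enclosing audit_chunk.
 */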
static struct audit_chunk *find_chunk(struct node *p)
{
int index = p->index & ~(1U<<31);
p -= index;
return container_of(p, struct audit_chunk, owners[0]);
}
static void untag_chunk(struct node *p)
{
struct audit_chunk *chunk = find_chunk(p);
struct fsnotify_mark *entry = &chunk->mark;
struct audit_chunk *new = NULL;
struct audit_tree *owner;
int size = chunk->count - 1;
int i, j;
fsnotify_get_mark(entry);
spin_unlock(&hash_lock);
if (size)
new = alloc_chunk(size);
spin_lock(&entry->lock);
if (chunk->dead || !entry->i.inode) {
spin_unlock(&entry->lock);
if (new)
free_chunk(new);
goto out;
}
owner = p->owner;
if (!size) {
chunk->dead = 1;
spin_lock(&hash_lock);
list_del_init(&chunk->trees);
if (owner->root == chunk)
owner->root = NULL;
list_del_init(&p->list);
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
fsnotify_destroy_mark(entry, audit_tree_group);
goto out;
}
if (!new)
goto Fallback;
fsnotify_duplicate_mark(&new->mark, entry);
if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
fsnotify_put_mark(&new->mark);
goto Fallback;
}
chunk->dead = 1;
spin_lock(&hash_lock);
list_replace_init(&chunk->trees, &new->trees);
if (owner->root == chunk) {
list_del_init(&owner->same_root);
owner->root = NULL;
}
for (i = j = 0; j <= size; i++, j++) {
struct audit_tree *s;
if (&chunk->owners[j] == p) {
list_del_init(&p->list);
i--;
continue;
}
s = chunk->owners[j].owner;
new->owners[i].owner = s;
new->owners[i].index = chunk->owners[j].index - j + i;
if (!s) /* result of earlier fallback */
continue;
get_tree(s);
list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
}
list_replace_rcu(&chunk->hash, &new->hash);
list_for_each_entry(owner, &new->trees, same_root)
owner->root = new;
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
fsnotify_destroy_mark(entry, audit_tree_group);
fsnotify_put_mark(&new->mark); /* drop initial reference */
goto out;
Fallback:
// do the best we can
spin_lock(&hash_lock);
if (owner->root == chunk) {
list_del_init(&owner->same_root);
owner->root = NULL;
}
list_del_init(&p->list);
p->owner = NULL;
put_tree(owner);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
out:
fsnotify_put_mark(entry);
spin_lock(&hash_lock);
}
static int create_chunk(struct inode *inode, struct audit_tree *tree)
{
struct fsnotify_mark *entry;
struct audit_chunk *chunk = alloc_chunk(1);
if (!chunk)
return -ENOMEM;
entry = &chunk->mark;
if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
fsnotify_put_mark(entry);
return -ENOSPC;
}
spin_lock(&entry->lock);
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
spin_unlock(&entry->lock);
fsnotify_destroy_mark(entry, audit_tree_group);
fsnotify_put_mark(entry);
return 0;
}
chunk->owners[0].index = (1U << 31);
chunk->owners[0].owner = tree;
get_tree(tree);
list_add(&chunk->owners[0].list, &tree->chunks);
if (!tree->root) {
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
insert_hash(chunk);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
fsnotify_put_mark(entry); /* drop initial reference */
return 0;
}
/* the first tagged inode becomes root of tree */
static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
struct fsnotify_mark *old_entry, *chunk_entry;
struct audit_tree *owner;
struct audit_chunk *chunk, *old;
struct node *p;
int n;
old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
if (!old_entry)
return create_chunk(inode, tree);
old = container_of(old_entry, struct audit_chunk, mark);
/* are we already there? */
spin_lock(&hash_lock);
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
fsnotify_put_mark(old_entry);
return 0;
}
}
spin_unlock(&hash_lock);
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
fsnotify_put_mark(old_entry);
return -ENOMEM;
}
chunk_entry = &chunk->mark;
spin_lock(&old_entry->lock);
if (!old_entry->i.inode) {
/* old_entry is being shot, let's just lie */
spin_unlock(&old_entry->lock);
fsnotify_put_mark(old_entry);
free_chunk(chunk);
return -ENOENT;
}
fsnotify_duplicate_mark(chunk_entry, old_entry);
if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
return -ENOSPC;
}
/* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
spin_lock(&chunk_entry->lock);
spin_lock(&hash_lock);
/* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
fsnotify_destroy_mark(chunk_entry, audit_tree_group);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
return 0;
}
list_replace_init(&old->trees, &chunk->trees);
for (n = 0, p = chunk->owners; n < old->count; n++, p++) {
struct audit_tree *s = old->owners[n].owner;
p->owner = s;
p->index = old->owners[n].index;
if (!s) /* result of fallback in untag */
continue;
get_tree(s);
list_replace_init(&old->owners[n].list, &p->list);
}
p->index = (chunk->count - 1) | (1U<<31);
p->owner = tree;
get_tree(tree);
list_add(&p->list, &tree->chunks);
list_replace_rcu(&old->hash, &chunk->hash);
list_for_each_entry(owner, &chunk->trees, same_root)
owner->root = chunk;
old->dead = 1;
if (!tree->root) {
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
spin_unlock(&hash_lock);
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
fsnotify_destroy_mark(old_entry, audit_tree_group);
fsnotify_put_mark(chunk_entry); /* drop initial reference */
fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
return 0;
}
static void audit_tree_log_remove_rule(struct audit_krule *rule)
{
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
audit_log_format(ab, "op=");
audit_log_string(ab, "remove_rule");
audit_log_format(ab, " dir=");
audit_log_untrustedstring(ab, rule->tree->pathname);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=1", rule->listnr);
audit_log_end(ab);
}
static void kill_rules(struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
entry = container_of(rule, struct audit_entry, rule);
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
audit_tree_log_remove_rule(rule);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
call_rcu(&entry->rcu, audit_free_rule_rcu);
}
}
}
/*
* finish killing struct audit_tree
*/
static void prune_one(struct audit_tree *victim)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
struct node *p;
p = list_entry(victim->chunks.next, struct node, list);
untag_chunk(p);
}
spin_unlock(&hash_lock);
put_tree(victim);
}
/* trim the uncommitted chunks from tree */
static void trim_marked(struct audit_tree *tree)
{
struct list_head *p, *q;
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
return;
}
/* reorder */
for (p = tree->chunks.next; p != &tree->chunks; p = q) {
struct node *node = list_entry(p, struct node, list);
q = p->next;
if (node->index & (1U<<31)) {
list_del_init(p);
list_add(p, &tree->chunks);
}
}
while (!list_empty(&tree->chunks)) {
struct node *node;
node = list_entry(tree->chunks.next, struct node, list);
/* have we run out of marked? */
if (!(node->index & (1U<<31)))
break;
untag_chunk(node);
}
if (!tree->root && !tree->goner) {
tree->goner = 1;
spin_unlock(&hash_lock);
mutex_lock(&audit_filter_mutex);
kill_rules(tree);
list_del_init(&tree->list);
mutex_unlock(&audit_filter_mutex);
prune_one(tree);
} else {
spin_unlock(&hash_lock);
}
}
static void audit_schedule_prune(void);
/* called with audit_filter_mutex */
int audit_remove_tree_rule(struct audit_krule *rule)
{
struct audit_tree *tree;
tree = rule->tree;
if (tree) {
spin_lock(&hash_lock);
list_del_init(&rule->rlist);
if (list_empty(&tree->rules) && !tree->goner) {
tree->root = NULL;
list_del_init(&tree->same_root);
tree->goner = 1;
list_move(&tree->list, &prune_list);
rule->tree = NULL;
spin_unlock(&hash_lock);
audit_schedule_prune();
return 1;
}
rule->tree = NULL;
spin_unlock(&hash_lock);
return 1;
}
return 0;
}
static int compare_root(struct vfsmount *mnt, void *arg)
{
return mnt->mnt_root->d_inode == arg;
}
void audit_trim_trees(void)
{
struct list_head cursor;
mutex_lock(&audit_filter_mutex);
list_add(&cursor, &tree_list);
while (cursor.next != &tree_list) {
struct audit_tree *tree;
struct path path;
struct vfsmount *root_mnt;
struct node *node;
int err;
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
list_del(&cursor);
list_add(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path);
if (err)
goto skip_it;
root_mnt = collect_mounts(&path);
path_put(&path);
if (IS_ERR(root_mnt))
goto skip_it;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying elsewhere... */
struct inode *inode = chunk->mark.i.inode;
node->index |= 1U<<31;
if (iterate_mounts(compare_root, inode, root_mnt))
node->index &= ~(1U<<31);
}
spin_unlock(&hash_lock);
trim_marked(tree);
drop_collected_mounts(root_mnt);
skip_it:
put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
}
int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{
if (pathname[0] != '/' ||
rule->listnr != AUDIT_FILTER_EXIT ||
op != Audit_equal ||
rule->inode_f || rule->watch || rule->tree)
return -EINVAL;
rule->tree = alloc_tree(pathname);
if (!rule->tree)
return -ENOMEM;
return 0;
}
void audit_put_tree(struct audit_tree *tree)
{
put_tree(tree);
}
static int tag_mount(struct vfsmount *mnt, void *arg)
{
return tag_chunk(mnt->mnt_root->d_inode, arg);
}
/* called with audit_filter_mutex */
int audit_add_tree_rule(struct audit_krule *rule)
{
struct audit_tree *seed = rule->tree, *tree;
struct path path;
struct vfsmount *mnt;
int err;
rule->tree = NULL;
list_for_each_entry(tree, &tree_list, list) {
if (!strcmp(seed->pathname, tree->pathname)) {
put_tree(seed);
rule->tree = tree;
list_add(&rule->rlist, &tree->rules);
return 0;
}
}
tree = seed;
list_add(&tree->list, &tree_list);
list_add(&rule->rlist, &tree->rules);
/* do not set rule->tree yet */
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
mnt = collect_mounts(&path);
path_put(&path);
if (IS_ERR(mnt)) {
err = PTR_ERR(mnt);
goto Err;
}
get_tree(tree);
err = iterate_mounts(tag_mount, tree, mnt);
drop_collected_mounts(mnt);
if (!err) {
struct node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
spin_unlock(&hash_lock);
} else {
trim_marked(tree);
goto Err;
}
mutex_lock(&audit_filter_mutex);
if (list_empty(&rule->rlist)) {
put_tree(tree);
return -ENOENT;
}
rule->tree = tree;
put_tree(tree);
return 0;
Err:
mutex_lock(&audit_filter_mutex);
list_del_init(&tree->list);
list_del_init(&tree->rules);
put_tree(tree);
return err;
}
int audit_tag_tree(char *old, char *new)
{
struct list_head cursor, barrier;
int failed = 0;
struct path path1, path2;
struct vfsmount *tagged;
int err;
err = kern_path(new, 0, &path2);
if (err)
return err;
tagged = collect_mounts(&path2);
path_put(&path2);
if (IS_ERR(tagged))
return PTR_ERR(tagged);
err = kern_path(old, 0, &path1);
if (err) {
drop_collected_mounts(tagged);
return err;
}
mutex_lock(&audit_filter_mutex);
list_add(&barrier, &tree_list);
list_add(&cursor, &barrier);
while (cursor.next != &tree_list) {
struct audit_tree *tree;
int good_one = 0;
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
list_del(&cursor);
list_add(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path2);
if (!err) {
good_one = path_is_under(&path1, &path2);
path_put(&path2);
}
if (!good_one) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
continue;
}
failed = iterate_mounts(tag_mount, tree, tagged);
if (failed) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
break;
}
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
if (!tree->goner) {
list_del(&tree->list);
list_add(&tree->list, &tree_list);
}
spin_unlock(&hash_lock);
put_tree(tree);
}
while (barrier.prev != &tree_list) {
struct audit_tree *tree;
tree = container_of(barrier.prev, struct audit_tree, list);
get_tree(tree);
list_del(&tree->list);
list_add(&tree->list, &barrier);
mutex_unlock(&audit_filter_mutex);
if (!failed) {
struct node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
spin_unlock(&hash_lock);
} else {
trim_marked(tree);
}
put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&barrier);
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
path_put(&path1);
drop_collected_mounts(tagged);
return failed;
}
/*
* That gets run when evict_chunk() ends up needing to kill audit_tree.
* Runs from a separate thread.
*/
static int prune_tree_thread(void *unused)
{
mutex_lock(&audit_cmd_mutex);
mutex_lock(&audit_filter_mutex);
while (!list_empty(&prune_list)) {
struct audit_tree *victim;
victim = list_entry(prune_list.next, struct audit_tree, list);
list_del_init(&victim->list);
mutex_unlock(&audit_filter_mutex);
prune_one(victim);
mutex_lock(&audit_filter_mutex);
}
mutex_unlock(&audit_filter_mutex);
mutex_unlock(&audit_cmd_mutex);
return 0;
}
static void audit_schedule_prune(void)
{
kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
}
/*
* ... and that one is done if evict_chunk() decides to delay until the end
* of syscall. Runs synchronously.
*/
void audit_kill_trees(struct list_head *list)
{
mutex_lock(&audit_cmd_mutex);
mutex_lock(&audit_filter_mutex);
while (!list_empty(list)) {
struct audit_tree *victim;
victim = list_entry(list->next, struct audit_tree, list);
kill_rules(victim);
list_del_init(&victim->list);
mutex_unlock(&audit_filter_mutex);
prune_one(victim);
mutex_lock(&audit_filter_mutex);
}
mutex_unlock(&audit_filter_mutex);
mutex_unlock(&audit_cmd_mutex);
}
/*
* Here comes the stuff asynchronous to auditctl operations
*/
static void evict_chunk(struct audit_chunk *chunk)
{
struct audit_tree *owner;
struct list_head *postponed = audit_killed_trees();
int need_prune = 0;
int n;
if (chunk->dead)
return;
chunk->dead = 1;
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
while (!list_empty(&chunk->trees)) {
owner = list_entry(chunk->trees.next,
struct audit_tree, same_root);
owner->goner = 1;
owner->root = NULL;
list_del_init(&owner->same_root);
spin_unlock(&hash_lock);
if (!postponed) {
kill_rules(owner);
list_move(&owner->list, &prune_list);
need_prune = 1;
} else {
list_move(&owner->list, postponed);
}
spin_lock(&hash_lock);
}
list_del_rcu(&chunk->hash);
for (n = 0; n < chunk->count; n++)
list_del_init(&chunk->owners[n].list);
spin_unlock(&hash_lock);
if (need_prune)
audit_schedule_prune();
mutex_unlock(&audit_filter_mutex);
}
static int audit_tree_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, void *data, int data_type,
const unsigned char *file_name, u32 cookie)
{
return 0;
}
static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
{
struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
evict_chunk(chunk);
/*
* We are guaranteed to have at least one reference to the mark from
* either the inode or the caller of fsnotify_destroy_mark().
*/
BUG_ON(atomic_read(&entry->refcnt) < 1);
}
static const struct fsnotify_ops audit_tree_ops = {
.handle_event = audit_tree_handle_event,
.freeing_mark = audit_tree_freeing_mark,
};
static int __init audit_tree_init(void)
{
int i;
audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
if (IS_ERR(audit_tree_group))
audit_panic("cannot initialize fsnotify group for rectree watches");
for (i = 0; i < HASH_SIZE; i++)
INIT_LIST_HEAD(&chunk_hash_heads[i]);
return 0;
}
__initcall(audit_tree_init);

519
kernel/audit_watch.c Normal file
View file

@ -0,0 +1,519 @@
/* audit_watch.c -- watching inodes
*
* Copyright 2003-2009 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
#include "audit.h"
/*
* Reference counting:
*
* audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
* event. Each audit_watch holds a reference to its associated parent.
*
* audit_watch: if added to lists, lifetime is from audit_init_watch() to
* audit_remove_watch(). Additionally, an audit_watch may exist
* temporarily to assist in searching existing filter data. Each
* audit_krule holds a reference to its associated watch.
*/
struct audit_watch {
atomic_t count; /* reference count */
dev_t dev; /* associated superblock device */
char *path; /* insertion path */
unsigned long ino; /* associated inode number */
struct audit_parent *parent; /* associated parent */
struct list_head wlist; /* entry in parent->watches list */
struct list_head rules; /* anchor for krule->rlist */
};
struct audit_parent {
struct list_head watches; /* anchor for audit_watch->wlist */
struct fsnotify_mark mark; /* fsnotify mark on the inode */
};
/* fsnotify handle. */
static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
FS_MOVE_SELF | FS_EVENT_ON_CHILD)
static void audit_free_parent(struct audit_parent *parent)
{
WARN_ON(!list_empty(&parent->watches));
kfree(parent);
}
static void audit_watch_free_mark(struct fsnotify_mark *entry)
{
struct audit_parent *parent;
parent = container_of(entry, struct audit_parent, mark);
audit_free_parent(parent);
}
static void audit_get_parent(struct audit_parent *parent)
{
if (likely(parent))
fsnotify_get_mark(&parent->mark);
}
static void audit_put_parent(struct audit_parent *parent)
{
if (likely(parent))
fsnotify_put_mark(&parent->mark);
}
/*
* Find and return the audit_parent on the given inode. If found, a reference
* is taken on this parent.
*/
static inline struct audit_parent *audit_find_parent(struct inode *inode)
{
struct audit_parent *parent = NULL;
struct fsnotify_mark *entry;
entry = fsnotify_find_inode_mark(audit_watch_group, inode);
if (entry)
parent = container_of(entry, struct audit_parent, mark);
return parent;
}
void audit_get_watch(struct audit_watch *watch)
{
atomic_inc(&watch->count);
}
void audit_put_watch(struct audit_watch *watch)
{
if (atomic_dec_and_test(&watch->count)) {
WARN_ON(watch->parent);
WARN_ON(!list_empty(&watch->rules));
kfree(watch->path);
kfree(watch);
}
}
static void audit_remove_watch(struct audit_watch *watch)
{
list_del(&watch->wlist);
audit_put_parent(watch->parent);
watch->parent = NULL;
audit_put_watch(watch); /* match initial get */
}
char *audit_watch_path(struct audit_watch *watch)
{
return watch->path;
}
int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
{
return (watch->ino != (unsigned long)-1) &&
(watch->ino == ino) &&
(watch->dev == dev);
}
/* Initialize a parent watch entry. */
static struct audit_parent *audit_init_parent(struct path *path)
{
struct inode *inode = path->dentry->d_inode;
struct audit_parent *parent;
int ret;
parent = kzalloc(sizeof(*parent), GFP_KERNEL);
if (unlikely(!parent))
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&parent->watches);
fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
parent->mark.mask = AUDIT_FS_WATCH;
ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
if (ret < 0) {
audit_free_parent(parent);
return ERR_PTR(ret);
}
return parent;
}
/* Initialize a watch entry. */
static struct audit_watch *audit_init_watch(char *path)
{
struct audit_watch *watch;
watch = kzalloc(sizeof(*watch), GFP_KERNEL);
if (unlikely(!watch))
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&watch->rules);
atomic_set(&watch->count, 1);
watch->path = path;
watch->dev = (dev_t)-1;
watch->ino = (unsigned long)-1;
return watch;
}
/* Translate a watch string to kernel representation. */
int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
{
struct audit_watch *watch;
if (!audit_watch_group)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
krule->listnr != AUDIT_FILTER_EXIT ||
op != Audit_equal ||
krule->inode_f || krule->watch || krule->tree)
return -EINVAL;
watch = audit_init_watch(path);
if (IS_ERR(watch))
return PTR_ERR(watch);
audit_get_watch(watch);
krule->watch = watch;
return 0;
}
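/*
 * Example (illustrative; the userspace side is an assumption): a rule added
 * with "auditctl -w /etc/passwd -p wa -k passwd_watch" reaches this function
 * as an absolute path on the exit filter list with an Audit_equal comparison,
 * which is exactly the combination the checks above accept.
 */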
/* Duplicate the given audit watch. The new watch's rules list is initialized
* to an empty list and wlist is undefined. */
static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
{
char *path;
struct audit_watch *new;
path = kstrdup(old->path, GFP_KERNEL);
if (unlikely(!path))
return ERR_PTR(-ENOMEM);
new = audit_init_watch(path);
if (IS_ERR(new)) {
kfree(path);
goto out;
}
new->dev = old->dev;
new->ino = old->ino;
audit_get_parent(old->parent);
new->parent = old->parent;
out:
return new;
}
static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
{
if (audit_enabled) {
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
audit_log_format(ab, "auid=%u ses=%u op=",
from_kuid(&init_user_ns, audit_get_loginuid(current)),
audit_get_sessionid(current));
audit_log_string(ab, op);
audit_log_format(ab, " path=");
audit_log_untrustedstring(ab, w->path);
audit_log_key(ab, r->filterkey);
audit_log_format(ab, " list=%d res=1", r->listnr);
audit_log_end(ab);
}
}
/* Update inode info in audit rules based on filesystem event. */
static void audit_update_watch(struct audit_parent *parent,
const char *dname, dev_t dev,
unsigned long ino, unsigned invalidating)
{
struct audit_watch *owatch, *nwatch, *nextw;
struct audit_krule *r, *nextr;
struct audit_entry *oentry, *nentry;
mutex_lock(&audit_filter_mutex);
/* Run all of the watches on this parent looking for the one that
* matches the given dname */
list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
if (audit_compare_dname_path(dname, owatch->path,
AUDIT_NAME_FULL))
continue;
/* If the update involves invalidating rules, do the inode-based
* filtering now, so we don't omit records. */
if (invalidating && !audit_dummy_context())
audit_filter_inodes(current, current->audit_context);
/* updating ino will likely change which audit_hash_list we
* are on so we need a new watch for the new list */
nwatch = audit_dupe_watch(owatch);
if (IS_ERR(nwatch)) {
mutex_unlock(&audit_filter_mutex);
audit_panic("error updating watch, skipping");
return;
}
nwatch->dev = dev;
nwatch->ino = ino;
list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
oentry = container_of(r, struct audit_entry, rule);
list_del(&oentry->rule.rlist);
list_del_rcu(&oentry->list);
nentry = audit_dupe_rule(&oentry->rule);
if (IS_ERR(nentry)) {
list_del(&oentry->rule.list);
audit_panic("error updating watch, removing");
} else {
int h = audit_hash_ino((u32)ino);
/*
* nentry->rule.watch == oentry->rule.watch so
* we must drop that reference and set it to our
* new watch.
*/
audit_put_watch(nentry->rule.watch);
audit_get_watch(nwatch);
nentry->rule.watch = nwatch;
list_add(&nentry->rule.rlist, &nwatch->rules);
list_add_rcu(&nentry->list, &audit_inode_hash[h]);
list_replace(&oentry->rule.list,
&nentry->rule.list);
}
audit_watch_log_rule_change(r, owatch, "updated_rules");
call_rcu(&oentry->rcu, audit_free_rule_rcu);
}
audit_remove_watch(owatch);
goto add_watch_to_parent; /* event applies to a single watch */
}
mutex_unlock(&audit_filter_mutex);
return;
add_watch_to_parent:
list_add(&nwatch->wlist, &parent->watches);
mutex_unlock(&audit_filter_mutex);
return;
}
/* Remove all watches & rules associated with a parent that is going away. */
static void audit_remove_parent_watches(struct audit_parent *parent)
{
struct audit_watch *w, *nextw;
struct audit_krule *r, *nextr;
struct audit_entry *e;
mutex_lock(&audit_filter_mutex);
list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
audit_watch_log_rule_change(r, w, "remove_rule");
list_del(&r->rlist);
list_del(&r->list);
list_del_rcu(&e->list);
call_rcu(&e->rcu, audit_free_rule_rcu);
}
audit_remove_watch(w);
}
mutex_unlock(&audit_filter_mutex);
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
}
/* Get path information necessary for adding watches. */
static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
struct dentry *d = kern_path_locked(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
mutex_unlock(&parent->dentry->d_inode->i_mutex);
if (d->d_inode) {
/* update watch filter fields */
watch->dev = d->d_inode->i_sb->s_dev;
watch->ino = d->d_inode->i_ino;
}
dput(d);
return 0;
}
/* Associate the given rule with an existing parent.
* Caller must hold audit_filter_mutex. */
static void audit_add_to_parent(struct audit_krule *krule,
struct audit_parent *parent)
{
struct audit_watch *w, *watch = krule->watch;
int watch_found = 0;
BUG_ON(!mutex_is_locked(&audit_filter_mutex));
list_for_each_entry(w, &parent->watches, wlist) {
if (strcmp(watch->path, w->path))
continue;
watch_found = 1;
/* put krule's and initial refs to temporary watch */
audit_put_watch(watch);
audit_put_watch(watch);
audit_get_watch(w);
krule->watch = watch = w;
break;
}
if (!watch_found) {
audit_get_parent(parent);
watch->parent = parent;
list_add(&watch->wlist, &parent->watches);
}
list_add(&krule->rlist, &watch->rules);
}
/* Find a matching watch entry, or add this one.
* Caller must hold audit_filter_mutex. */
int audit_add_watch(struct audit_krule *krule, struct list_head **list)
{
struct audit_watch *watch = krule->watch;
struct audit_parent *parent;
struct path parent_path;
int h, ret = 0;
mutex_unlock(&audit_filter_mutex);
/* Avoid calling path_lookup under audit_filter_mutex. */
ret = audit_get_nd(watch, &parent_path);
/* caller expects mutex locked */
mutex_lock(&audit_filter_mutex);
if (ret)
return ret;
/* either find an old parent or attach a new one */
parent = audit_find_parent(parent_path.dentry->d_inode);
if (!parent) {
parent = audit_init_parent(&parent_path);
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
goto error;
}
}
audit_add_to_parent(krule, parent);
/* match get in audit_find_parent or audit_init_parent */
audit_put_parent(parent);
h = audit_hash_ino((u32)watch->ino);
*list = &audit_inode_hash[h];
error:
path_put(&parent_path);
return ret;
}
void audit_remove_watch_rule(struct audit_krule *krule)
{
struct audit_watch *watch = krule->watch;
struct audit_parent *parent = watch->parent;
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
audit_remove_watch(watch);
if (list_empty(&parent->watches)) {
audit_get_parent(parent);
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
audit_put_parent(parent);
}
}
}
/* Update watch data in audit rules based on fsnotify events. */
static int audit_watch_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, void *data, int data_type,
const unsigned char *dname, u32 cookie)
{
struct inode *inode;
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
BUG_ON(group != audit_watch_group);
switch (data_type) {
case (FSNOTIFY_EVENT_PATH):
inode = ((struct path *)data)->dentry->d_inode;
break;
case (FSNOTIFY_EVENT_INODE):
inode = (struct inode *)data;
break;
default:
BUG();
inode = NULL;
break;
}
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
else if (mask & (FS_DELETE|FS_MOVED_FROM))
audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
audit_remove_parent_watches(parent);
return 0;
}
static const struct fsnotify_ops audit_watch_fsnotify_ops = {
.handle_event = audit_watch_handle_event,
};
static int __init audit_watch_init(void)
{
audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
if (IS_ERR(audit_watch_group)) {
audit_watch_group = NULL;
audit_panic("cannot create audit fsnotify group");
}
return 0;
}
device_initcall(audit_watch_init);

1418
kernel/auditfilter.c Normal file

File diff suppressed because it is too large

2500
kernel/auditsc.c Normal file

File diff suppressed because it is too large

91
kernel/backtracetest.c Normal file
View file

@ -0,0 +1,91 @@
/*
* Simple stack backtrace regression test module
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>
static void backtrace_test_normal(void)
{
pr_info("Testing a backtrace from process context.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
dump_stack();
}
static DECLARE_COMPLETION(backtrace_work);
static void backtrace_test_irq_callback(unsigned long data)
{
dump_stack();
complete(&backtrace_work);
}
static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
static void backtrace_test_irq(void)
{
pr_info("Testing a backtrace from irq context.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
init_completion(&backtrace_work);
tasklet_schedule(&backtrace_tasklet);
wait_for_completion(&backtrace_work);
}
#ifdef CONFIG_STACKTRACE
static void backtrace_test_saved(void)
{
struct stack_trace trace;
unsigned long entries[8];
pr_info("Testing a saved backtrace.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
trace.nr_entries = 0;
trace.max_entries = ARRAY_SIZE(entries);
trace.entries = entries;
trace.skip = 0;
save_stack_trace(&trace);
print_stack_trace(&trace, 0);
}
#else
static void backtrace_test_saved(void)
{
pr_info("Saved backtrace test skipped.\n");
}
#endif
static int backtrace_regression_test(void)
{
pr_info("====[ backtrace testing ]===========\n");
backtrace_test_normal();
backtrace_test_irq();
backtrace_test_saved();
pr_info("====[ end of backtrace testing ]====\n");
return 0;
}
static void exitf(void)
{
}
module_init(backtrace_regression_test);
module_exit(exitf);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");

25
kernel/bounds.c Normal file
View file

@ -0,0 +1,25 @@
/*
* Generate definitions needed by the preprocessor.
* This code generates raw asm output which is post-processed
* to extract and format the required data.
*/
#define __GENERATING_BOUNDS_H
/* Include headers that define the enum constants of interest */
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
#include <linux/log2.h>
#include <linux/spinlock_types.h>
void foo(void)
{
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
#ifdef CONFIG_SMP
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
/* End of constants */
}

5
kernel/bpf/Makefile Normal file
View file

@ -0,0 +1,5 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
ifdef CONFIG_TEST_BPF
obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
endif

666
kernel/bpf/core.c Normal file
View file

@ -0,0 +1,666 @@
/*
* Linux Socket Filter - Kernel level socket filtering
*
* Based on the design of the Berkeley Packet Filter. The new
* internal format has been designed by PLUMgrid:
*
* Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
*
* Authors:
*
* Jay Schulist <jschlst@samba.org>
* Alexei Starovoitov <ast@plumgrid.com>
* Daniel Borkmann <dborkman@redhat.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Andi Kleen - Fix a few bad bugs and races.
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/moduleloader.h>
#include <asm/unaligned.h>
#include <linux/bpf.h>
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
#define BPF_R1 regs[BPF_REG_1]
#define BPF_R2 regs[BPF_REG_2]
#define BPF_R3 regs[BPF_REG_3]
#define BPF_R4 regs[BPF_REG_4]
#define BPF_R5 regs[BPF_REG_5]
#define BPF_R6 regs[BPF_REG_6]
#define BPF_R7 regs[BPF_REG_7]
#define BPF_R8 regs[BPF_REG_8]
#define BPF_R9 regs[BPF_REG_9]
#define BPF_R10 regs[BPF_REG_10]
/* Named registers */
#define DST regs[insn->dst_reg]
#define SRC regs[insn->src_reg]
#define FP regs[BPF_REG_FP]
#define ARG1 regs[BPF_REG_ARG1]
#define CTX regs[BPF_REG_CTX]
#define IMM insn->imm
/* No hurry in this branch
*
* Exported for the bpf jit load helper.
*/
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
{
u8 *ptr = NULL;
if (k >= SKF_NET_OFF)
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
else if (k >= SKF_LL_OFF)
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
return NULL;
}
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
gfp_extra_flags;
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
size = round_up(size, PAGE_SIZE);
fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
if (fp == NULL)
return NULL;
aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
if (aux == NULL) {
vfree(fp);
return NULL;
}
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
gfp_extra_flags;
struct bpf_prog *fp;
BUG_ON(fp_old == NULL);
size = round_up(size, PAGE_SIZE);
if (size <= fp_old->pages * PAGE_SIZE)
return fp_old;
fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
if (fp != NULL) {
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = size / PAGE_SIZE;
/* We keep fp->aux from fp_old around in the new
* reallocated structure.
*/
fp_old->aux = NULL;
__bpf_prog_free(fp_old);
}
return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_realloc);
void __bpf_prog_free(struct bpf_prog *fp)
{
kfree(fp->aux);
vfree(fp);
}
EXPORT_SYMBOL_GPL(__bpf_prog_free);
#ifdef CONFIG_BPF_JIT
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
unsigned int size, hole, start;
/* Most BPF filters are really small, but if some of them
* fill a page, allow at least 128 extra bytes to insert a
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
hdr = module_alloc(size);
if (hdr == NULL)
return NULL;
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
hdr->pages = size / PAGE_SIZE;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (prandom_u32() % hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
return hdr;
}
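/*
 * Worked example (illustrative, assuming PAGE_SIZE == 4096): for
 * proglen == 600,
 *   size  = round_up(600 + sizeof(*hdr) + 128, 4096) = 4096
 *   hole  = min(4096 - (600 + sizeof(*hdr)), 4096 - sizeof(*hdr))
 *         = 4096 - 600 - sizeof(*hdr)
 *   start = (prandom_u32() % hole) & ~(alignment - 1)
 * so the JIT image begins at a random, alignment-rounded offset inside the
 * slack and the remaining bytes stay filled with illegal instructions.
 */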
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
module_free(NULL, hdr);
}
#endif /* CONFIG_BPF_JIT */
/* Base function for offset calculation. Needs to go into .text section,
* therefore keeping it non-static as well; will also be used by JITs
* anyway later on, so do not let the compiler omit it.
*/
noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
return 0;
}
/**
* __bpf_prog_run - run eBPF program on a given context
* @ctx: is the data we are operating on
* @insn: is the array of eBPF instructions
*
* Decode and execute eBPF instructions.
*/
static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
{
u64 stack[MAX_BPF_STACK / sizeof(u64)];
u64 regs[MAX_BPF_REG], tmp;
static const void *jumptable[256] = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
/* 32 bit ALU operations */
[BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
[BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
[BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
[BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
[BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
[BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
[BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
[BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
[BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
[BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
[BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
[BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
[BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
[BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
[BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
[BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
[BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
[BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
[BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
[BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
[BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
[BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
[BPF_ALU | BPF_NEG] = &&ALU_NEG,
[BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
[BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
/* 64 bit ALU operations */
[BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
[BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
[BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
[BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
[BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
[BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
[BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
[BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
[BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
[BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
[BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
[BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
[BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
[BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
[BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
[BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
[BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
[BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
[BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
[BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
[BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
[BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
[BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
[BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
/* Call instruction */
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
/* Jumps */
[BPF_JMP | BPF_JA] = &&JMP_JA,
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
[BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
[BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
/* Program return */
[BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
/* Store instructions */
[BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
[BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
[BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
[BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
[BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
[BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
[BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
[BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
[BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
[BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
/* Load instructions */
[BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
[BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
[BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
[BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
[BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
[BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
[BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
[BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
[BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
};
void *ptr;
int off;
#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
ARG1 = (u64) (unsigned long) ctx;
/* Registers used in classic BPF programs need to be reset first. */
regs[BPF_REG_A] = 0;
regs[BPF_REG_X] = 0;
select_insn:
goto *jumptable[insn->code];
/* ALU */
#define ALU(OPCODE, OP) \
ALU64_##OPCODE##_X: \
DST = DST OP SRC; \
CONT; \
ALU_##OPCODE##_X: \
DST = (u32) DST OP (u32) SRC; \
CONT; \
ALU64_##OPCODE##_K: \
DST = DST OP IMM; \
CONT; \
ALU_##OPCODE##_K: \
DST = (u32) DST OP (u32) IMM; \
CONT;
ALU(ADD, +)
ALU(SUB, -)
ALU(AND, &)
ALU(OR, |)
ALU(LSH, <<)
ALU(RSH, >>)
ALU(XOR, ^)
ALU(MUL, *)
#undef ALU
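/*
 * Illustrative expansion (not in the original source): ALU(ADD, +) above
 * emits the four interpreter targets
 *   ALU64_ADD_X: DST = DST + SRC;              CONT;
 *   ALU_ADD_X:   DST = (u32) DST + (u32) SRC;  CONT;
 *   ALU64_ADD_K: DST = DST + IMM;              CONT;
 *   ALU_ADD_K:   DST = (u32) DST + (u32) IMM;  CONT;
 * i.e. the 32-bit variants work on the low 32 bits of the operands while
 * the 64-bit variants use the full register width.
 */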
ALU_NEG:
DST = (u32) -DST;
CONT;
ALU64_NEG:
DST = -DST;
CONT;
ALU_MOV_X:
DST = (u32) SRC;
CONT;
ALU_MOV_K:
DST = (u32) IMM;
CONT;
ALU64_MOV_X:
DST = SRC;
CONT;
ALU64_MOV_K:
DST = IMM;
CONT;
LD_IMM_DW:
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
CONT;
ALU64_ARSH_K:
(*(s64 *) &DST) >>= IMM;
CONT;
ALU64_MOD_X:
if (unlikely(SRC == 0))
return 0;
div64_u64_rem(DST, SRC, &tmp);
DST = tmp;
CONT;
ALU_MOD_X:
if (unlikely(SRC == 0))
return 0;
tmp = (u32) DST;
DST = do_div(tmp, (u32) SRC);
CONT;
ALU64_MOD_K:
div64_u64_rem(DST, IMM, &tmp);
DST = tmp;
CONT;
ALU_MOD_K:
tmp = (u32) DST;
DST = do_div(tmp, (u32) IMM);
CONT;
ALU64_DIV_X:
if (unlikely(SRC == 0))
return 0;
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
if (unlikely(SRC == 0))
return 0;
tmp = (u32) DST;
do_div(tmp, (u32) SRC);
DST = (u32) tmp;
CONT;
ALU64_DIV_K:
DST = div64_u64(DST, IMM);
CONT;
ALU_DIV_K:
tmp = (u32) DST;
do_div(tmp, (u32) IMM);
DST = (u32) tmp;
CONT;
ALU_END_TO_BE:
switch (IMM) {
case 16:
DST = (__force u16) cpu_to_be16(DST);
break;
case 32:
DST = (__force u32) cpu_to_be32(DST);
break;
case 64:
DST = (__force u64) cpu_to_be64(DST);
break;
}
CONT;
ALU_END_TO_LE:
switch (IMM) {
case 16:
DST = (__force u16) cpu_to_le16(DST);
break;
case 32:
DST = (__force u32) cpu_to_le32(DST);
break;
case 64:
DST = (__force u64) cpu_to_le64(DST);
break;
}
CONT;
/* CALL */
JMP_CALL:
/* Function call scratches BPF_R1-BPF_R5 registers,
* preserves BPF_R6-BPF_R9, and stores return value
* into BPF_R0.
*/
BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
BPF_R4, BPF_R5);
CONT;
/* JMP */
JMP_JA:
insn += insn->off;
CONT;
JMP_JEQ_X:
if (DST == SRC) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JEQ_K:
if (DST == IMM) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JNE_X:
if (DST != SRC) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JNE_K:
if (DST != IMM) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JGT_X:
if (DST > SRC) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JGT_K:
if (DST > IMM) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JGE_X:
if (DST >= SRC) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JGE_K:
if (DST >= IMM) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JSGT_X:
if (((s64) DST) > ((s64) SRC)) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JSGT_K:
if (((s64) DST) > ((s64) IMM)) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JSGE_X:
if (((s64) DST) >= ((s64) SRC)) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JSGE_K:
if (((s64) DST) >= ((s64) IMM)) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JSET_X:
if (DST & SRC) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JSET_K:
if (DST & IMM) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_EXIT:
return BPF_R0;
/* STX and ST and LDX */
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
CONT; \
ST_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
CONT; \
LDX_MEM_##SIZEOP: \
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
CONT;
LDST(B, u8)
LDST(H, u16)
LDST(W, u32)
LDST(DW, u64)
#undef LDST
STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
atomic_add((u32) SRC, (atomic_t *)(unsigned long)
(DST + insn->off));
CONT;
STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
(DST + insn->off));
CONT;
LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
off = IMM;
load_word:
/* BPF_LD + BPF_ABS and BPF_LD + BPF_IND insns are
* only appearing in the programs where ctx ==
* skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
* == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
* internal BPF verifier will check that BPF_R6 ==
* ctx.
*
* BPF_ABS and BPF_IND are wrappers of function calls,
* so they scratch BPF_R1-BPF_R5 registers, preserve
* BPF_R6-BPF_R9, and store return value into BPF_R0.
*
* Implicit input:
* ctx == skb == BPF_R6 == CTX
*
* Explicit input:
* SRC == any register
* IMM == 32-bit immediate
*
* Output:
* BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
*/
ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
if (likely(ptr != NULL)) {
BPF_R0 = get_unaligned_be32(ptr);
CONT;
}
return 0;
LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
off = IMM;
load_half:
ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
if (likely(ptr != NULL)) {
BPF_R0 = get_unaligned_be16(ptr);
CONT;
}
return 0;
LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
off = IMM;
load_byte:
ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
if (likely(ptr != NULL)) {
BPF_R0 = *(u8 *)ptr;
CONT;
}
return 0;
LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
off = IMM + SRC;
goto load_word;
LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
off = IMM + SRC;
goto load_half;
LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
off = IMM + SRC;
goto load_byte;
default_label:
/* If we ever reach this, we have a bug somewhere. */
WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
return 0;
}
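/*
 * Illustrative only (not part of the original file): a two-instruction
 * program the interpreter above would execute as "R0 = 42; return R0;":
 *   { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 42 },
 *   { .code = BPF_JMP | BPF_EXIT },
 * The first opcode dispatches through the jumptable to ALU64_MOV_K, the
 * second to JMP_EXIT, which returns BPF_R0 to the caller.
 */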
void __weak bpf_int_jit_compile(struct bpf_prog *prog)
{
}
/**
* bpf_prog_select_runtime - select execution runtime for BPF program
* @fp: bpf_prog populated with internal BPF program
*
* try to JIT internal BPF program, if JIT is not available select interpreter
* BPF program will be executed via BPF_PROG_RUN() macro
*/
void bpf_prog_select_runtime(struct bpf_prog *fp)
{
fp->bpf_func = (void *) __bpf_prog_run;
/* Probe if internal BPF can be JITed */
bpf_int_jit_compile(fp);
/* Lock whole bpf_prog as read-only */
bpf_prog_lock_ro(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
aux = container_of(work, struct bpf_prog_aux, work);
bpf_jit_free(aux->prog);
}
/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
INIT_WORK(&aux->work, bpf_prog_free_deferred);
aux->prog = fp;
schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
int len)
{
return -EFAULT;
}

606
kernel/bpf/syscall.c Normal file
View file

@ -0,0 +1,606 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
static LIST_HEAD(bpf_map_types);
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
struct bpf_map_type_list *tl;
struct bpf_map *map;
list_for_each_entry(tl, &bpf_map_types, list_node) {
if (tl->type == attr->map_type) {
map = tl->ops->map_alloc(attr);
if (IS_ERR(map))
return map;
map->ops = tl->ops;
map->map_type = attr->map_type;
return map;
}
}
return ERR_PTR(-EINVAL);
}
/* boot time registration of different map implementations */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
list_add(&tl->list_node, &bpf_map_types);
}
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
/* implementation dependent freeing */
map->ops->map_free(map);
}
/* decrement map refcnt and schedule it for freeing via workqueue
* (the underlying map implementation's ops->map_free() might sleep)
*/
void bpf_map_put(struct bpf_map *map)
{
if (atomic_dec_and_test(&map->refcnt)) {
INIT_WORK(&map->work, bpf_map_free_deferred);
schedule_work(&map->work);
}
}
static int bpf_map_release(struct inode *inode, struct file *filp)
{
struct bpf_map *map = filp->private_data;
bpf_map_put(map);
return 0;
}
static const struct file_operations bpf_map_fops = {
.release = bpf_map_release,
};
/* helper macro to check that unused fields of 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
sizeof(attr->CMD##_LAST_FIELD), 0, \
sizeof(*attr) - \
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
#define BPF_MAP_CREATE_LAST_FIELD max_entries
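/*
 * Illustrative expansion (derived from the macro above, not in the original
 * source): with BPF_MAP_CREATE_LAST_FIELD == max_entries,
 * CHECK_ATTR(BPF_MAP_CREATE) becomes roughly
 *   memchr_inv((void *) &attr->max_entries + sizeof(attr->max_entries), 0,
 *              sizeof(*attr) - offsetof(union bpf_attr, max_entries) -
 *              sizeof(attr->max_entries)) != NULL
 * i.e. it is true (an error) whenever any byte after the last field this
 * command understands is non-zero.
 */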
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
struct bpf_map *map;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map(attr);
if (IS_ERR(map))
return PTR_ERR(map);
atomic_set(&map->refcnt, 1);
err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
if (err < 0)
/* failed to allocate fd */
goto free_map;
return err;
free_map:
map->ops->map_free(map);
return err;
}
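/*
 * Minimal user-space sketch (an illustration, with assumptions: a map type
 * registered via bpf_register_map_type() and the bpf syscall number wired up
 * for the architecture) of driving map_create():
 *   union bpf_attr attr = {
 *           .map_type    = 1,   // some registered map type
 *           .key_size    = 4,
 *           .value_size  = 8,
 *           .max_entries = 16,
 *   };
 *   int fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 * A negative return is an errno-style failure; otherwise fd is an anon-inode
 * file descriptor backed by bpf_map_fops.
 */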
/* if error is returned, fd is released.
* On success caller should complete fd access with matching fdput()
*/
struct bpf_map *bpf_map_get(struct fd f)
{
struct bpf_map *map;
if (!f.file)
return ERR_PTR(-EBADF);
if (f.file->f_op != &bpf_map_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
map = f.file->private_data;
return map;
}
/* helper to convert user pointers passed inside __aligned_u64 fields */
static void __user *u64_to_ptr(__u64 val)
{
return (void __user *) (unsigned long) val;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
static int map_lookup_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_ptr(attr->key);
void __user *uvalue = u64_to_ptr(attr->value);
int ufd = attr->map_fd;
struct fd f = fdget(ufd);
struct bpf_map *map;
void *key, *value;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
map = bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
if (!key)
goto err_put;
err = -EFAULT;
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
err = -ESRCH;
rcu_read_lock();
value = map->ops->map_lookup_elem(map, key);
if (!value)
goto err_unlock;
err = -EFAULT;
if (copy_to_user(uvalue, value, map->value_size) != 0)
goto err_unlock;
err = 0;
err_unlock:
rcu_read_unlock();
free_key:
kfree(key);
err_put:
fdput(f);
return err;
}
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
static int map_update_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_ptr(attr->key);
void __user *uvalue = u64_to_ptr(attr->value);
int ufd = attr->map_fd;
struct fd f = fdget(ufd);
struct bpf_map *map;
void *key, *value;
int err;
if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
return -EINVAL;
map = bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
if (!key)
goto err_put;
err = -EFAULT;
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
err = -ENOMEM;
value = kmalloc(map->value_size, GFP_USER);
if (!value)
goto free_key;
err = -EFAULT;
if (copy_from_user(value, uvalue, map->value_size) != 0)
goto free_value;
/* eBPF programs that use maps run under rcu_read_lock(),
* therefore all map accessors rely on this fact, so do the same here
*/
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value);
rcu_read_unlock();
free_value:
kfree(value);
free_key:
kfree(key);
err_put:
fdput(f);
return err;
}
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
static int map_delete_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_ptr(attr->key);
int ufd = attr->map_fd;
struct fd f = fdget(ufd);
struct bpf_map *map;
void *key;
int err;
if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
return -EINVAL;
map = bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
if (!key)
goto err_put;
err = -EFAULT;
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
rcu_read_lock();
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
free_key:
kfree(key);
err_put:
fdput(f);
return err;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
static int map_get_next_key(union bpf_attr *attr)
{
void __user *ukey = u64_to_ptr(attr->key);
void __user *unext_key = u64_to_ptr(attr->next_key);
int ufd = attr->map_fd;
struct fd f = fdget(ufd);
struct bpf_map *map;
void *key, *next_key;
int err;
if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
return -EINVAL;
map = bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
if (!key)
goto err_put;
err = -EFAULT;
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
err = -ENOMEM;
next_key = kmalloc(map->key_size, GFP_USER);
if (!next_key)
goto free_key;
rcu_read_lock();
err = map->ops->map_get_next_key(map, key, next_key);
rcu_read_unlock();
if (err)
goto free_next_key;
err = -EFAULT;
if (copy_to_user(unext_key, next_key, map->key_size) != 0)
goto free_next_key;
err = 0;
free_next_key:
kfree(next_key);
free_key:
kfree(key);
err_put:
fdput(f);
return err;
}
static LIST_HEAD(bpf_prog_types);
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
struct bpf_prog_type_list *tl;
list_for_each_entry(tl, &bpf_prog_types, list_node) {
if (tl->type == type) {
prog->aux->ops = tl->ops;
prog->aux->prog_type = type;
return 0;
}
}
return -EINVAL;
}
void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
list_add(&tl->list_node, &bpf_prog_types);
}
/* fixup insn->imm field of bpf_call instructions:
* if (insn->imm == BPF_FUNC_map_lookup_elem)
* insn->imm = bpf_map_lookup_elem - __bpf_call_base;
* else if (insn->imm == BPF_FUNC_map_update_elem)
* insn->imm = bpf_map_update_elem - __bpf_call_base;
* else ...
*
* this function is called after eBPF program passed verification
*/
static void fixup_bpf_calls(struct bpf_prog *prog)
{
const struct bpf_func_proto *fn;
int i;
for (i = 0; i < prog->len; i++) {
struct bpf_insn *insn = &prog->insnsi[i];
if (insn->code == (BPF_JMP | BPF_CALL)) {
/* we reach here when the program has bpf_call instructions
* and it passed bpf_check(), which means that
* ops->get_func_proto must have been supplied, so check it
*/
BUG_ON(!prog->aux->ops->get_func_proto);
fn = prog->aux->ops->get_func_proto(insn->imm);
/* all functions that have a prototype and that the verifier allowed
* programs to call must be real in-kernel functions
*/
BUG_ON(!fn->func);
insn->imm = fn->func - __bpf_call_base;
}
}
}
/* drop refcnt on maps used by eBPF program and free auxiliary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
int i;
for (i = 0; i < aux->used_map_cnt; i++)
bpf_map_put(aux->used_maps[i]);
kfree(aux->used_maps);
}
void bpf_prog_put(struct bpf_prog *prog)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
free_used_maps(prog->aux);
bpf_prog_free(prog);
}
}
static int bpf_prog_release(struct inode *inode, struct file *filp)
{
struct bpf_prog *prog = filp->private_data;
bpf_prog_put(prog);
return 0;
}
static const struct file_operations bpf_prog_fops = {
.release = bpf_prog_release,
};
static struct bpf_prog *get_prog(struct fd f)
{
struct bpf_prog *prog;
if (!f.file)
return ERR_PTR(-EBADF);
if (f.file->f_op != &bpf_prog_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
prog = f.file->private_data;
return prog;
}
/* called by sockets/tracing/seccomp before attaching program to an event
* pairs with bpf_prog_put()
*/
struct bpf_prog *bpf_prog_get(u32 ufd)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
prog = get_prog(f);
if (IS_ERR(prog))
return prog;
atomic_inc(&prog->aux->refcnt);
fdput(f);
return prog;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD log_buf
static int bpf_prog_load(union bpf_attr *attr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog;
int err;
char license[128];
bool is_gpl;
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
/* copy eBPF program license from user space */
if (strncpy_from_user(license, u64_to_ptr(attr->license),
sizeof(license) - 1) < 0)
return -EFAULT;
license[sizeof(license) - 1] = 0;
/* eBPF programs must be GPL compatible to use GPL-ed functions */
is_gpl = license_is_gpl_compatible(license);
if (attr->insn_cnt >= BPF_MAXINSNS)
return -EINVAL;
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
return -ENOMEM;
prog->len = attr->insn_cnt;
err = -EFAULT;
if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
prog->len * sizeof(struct bpf_insn)) != 0)
goto free_prog;
prog->orig_prog = NULL;
prog->jited = false;
atomic_set(&prog->aux->refcnt, 1);
prog->aux->is_gpl_compatible = is_gpl;
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;
/* run eBPF verifier */
err = bpf_check(prog, attr);
if (err < 0)
goto free_used_maps;
/* fixup BPF_CALL->imm field */
fixup_bpf_calls(prog);
/* eBPF program is ready to be JITed */
bpf_prog_select_runtime(prog);
err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
if (err < 0)
/* failed to allocate fd */
goto free_used_maps;
return err;
free_used_maps:
free_used_maps(prog->aux);
free_prog:
bpf_prog_free(prog);
return err;
}
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
int err;
/* the syscall is limited to root temporarily. This restriction will be
* lifted when the security audit is clean. Note that eBPF+tracing must have
* this restriction, since it may pass kernel data to user space
*/
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!access_ok(VERIFY_READ, uattr, 1))
return -EFAULT;
if (size > PAGE_SIZE) /* silly large */
return -E2BIG;
/* If we're handed a bigger struct than we know of,
* ensure all the unknown bits are 0 - i.e. new
* user-space does not rely on any kernel feature
* extensions we don't know about yet.
*/
if (size > sizeof(attr)) {
unsigned char __user *addr;
unsigned char __user *end;
unsigned char val;
addr = (void __user *)uattr + sizeof(attr);
end = (void __user *)uattr + size;
for (; addr < end; addr++) {
err = get_user(val, addr);
if (err)
return err;
if (val)
return -E2BIG;
}
size = sizeof(attr);
}
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
break;
case BPF_MAP_UPDATE_ELEM:
err = map_update_elem(&attr);
break;
case BPF_MAP_DELETE_ELEM:
err = map_delete_elem(&attr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
break;
case BPF_PROG_LOAD:
err = bpf_prog_load(&attr);
break;
default:
err = -EINVAL;
break;
}
return err;
}
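As a rough illustration of how user space reaches the command switch above, the sketch below issues a BPF_MAP_CREATE request through the raw bpf(2) syscall. It is a minimal sketch, not part of this commit: it assumes __NR_bpf is wired up for the target architecture, that the uapi <linux/bpf.h> header is installed, and that a map type is registered on the running kernel (in this tree only the BPF_MAP_TYPE_UNSPEC test stub is); the sys_bpf() wrapper name and the key/value sizes are illustrative.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* thin wrapper around the multiplexed bpf(2) syscall dispatched above */
static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
        return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
        union bpf_attr attr;
        int map_fd;

        /* zero the whole union: CHECK_ATTR() rejects stray non-zero bytes */
        memset(&attr, 0, sizeof(attr));
        attr.map_type = BPF_MAP_TYPE_UNSPEC;    /* test stub map type */
        attr.key_size = 4;                      /* illustrative sizes */
        attr.value_size = 8;
        attr.max_entries = 16;

        map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
        if (map_fd < 0)
                perror("BPF_MAP_CREATE");
        else
                printf("map created, fd=%d\n", map_fd);
        return 0;
}

Passing sizeof(attr) keeps the size check in the dispatcher trivially satisfied; a shorter attribute struct from older user space is also accepted, since the kernel only insists that bytes it does not know about are zero.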

116
kernel/bpf/test_stub.c Normal file

@ -0,0 +1,116 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/bpf.h>
/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
* to be used by the user-space verifier testsuite
*/
struct bpf_context {
u64 arg1;
u64 arg2;
};
static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
return 0;
}
static struct bpf_func_proto test_funcs[] = {
[BPF_FUNC_unspec] = {
.func = test_func,
.gpl_only = true,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MAP_KEY,
},
};
static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
{
if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
return NULL;
return &test_funcs[func_id];
}
static const struct bpf_context_access {
int size;
enum bpf_access_type type;
} test_ctx_access[] = {
[offsetof(struct bpf_context, arg1)] = {
FIELD_SIZEOF(struct bpf_context, arg1),
BPF_READ
},
[offsetof(struct bpf_context, arg2)] = {
FIELD_SIZEOF(struct bpf_context, arg2),
BPF_READ
},
};
static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
{
const struct bpf_context_access *access;
if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
return false;
access = &test_ctx_access[off];
if (access->size == size && (access->type & type))
return true;
return false;
}
static struct bpf_verifier_ops test_ops = {
.get_func_proto = test_func_proto,
.is_valid_access = test_is_valid_access,
};
static struct bpf_prog_type_list tl_prog = {
.ops = &test_ops,
.type = BPF_PROG_TYPE_UNSPEC,
};
static struct bpf_map *test_map_alloc(union bpf_attr *attr)
{
struct bpf_map *map;
map = kzalloc(sizeof(*map), GFP_USER);
if (!map)
return ERR_PTR(-ENOMEM);
map->key_size = attr->key_size;
map->value_size = attr->value_size;
map->max_entries = attr->max_entries;
return map;
}
static void test_map_free(struct bpf_map *map)
{
kfree(map);
}
static struct bpf_map_ops test_map_ops = {
.map_alloc = test_map_alloc,
.map_free = test_map_free,
};
static struct bpf_map_type_list tl_map = {
.ops = &test_map_ops,
.type = BPF_MAP_TYPE_UNSPEC,
};
static int __init register_test_ops(void)
{
bpf_register_map_type(&tl_map);
bpf_register_prog_type(&tl_prog);
return 0;
}
late_initcall(register_test_ops);
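Because the stub registers BPF_PROG_TYPE_UNSPEC, a user-space verifier test can feed the BPF_PROG_LOAD path a trivial program of that type. The following is a hedged sketch rather than part of the testsuite: it assumes __NR_bpf is available and that <linux/bpf.h> provides struct bpf_insn and the eBPF opcode macros; the two hand-encoded instructions are simply "r0 = 0; exit".

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

int main(void)
{
        struct bpf_insn insns[2];
        union bpf_attr attr;
        int prog_fd;

        memset(insns, 0, sizeof(insns));
        insns[0].code = BPF_ALU64 | BPF_MOV | BPF_K;    /* r0 = 0 */
        insns[0].dst_reg = BPF_REG_0;
        insns[1].code = BPF_JMP | BPF_EXIT;             /* return r0 */

        memset(&attr, 0, sizeof(attr));
        attr.prog_type = BPF_PROG_TYPE_UNSPEC;          /* handled by tl_prog above */
        attr.insns = (__u64)(unsigned long)insns;
        attr.insn_cnt = 2;
        attr.license = (__u64)(unsigned long)"GPL";     /* sets is_gpl_compatible */

        prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
        if (prog_fd < 0)
                perror("BPF_PROG_LOAD");
        else
                printf("program loaded, fd=%d\n", prog_fd);
        return 0;
}

The returned descriptor is the anon-inode fd created by bpf_prog_load(); closing it drops the reference through bpf_prog_release() and bpf_prog_put().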

1927
kernel/bpf/verifier.c Normal file

File diff suppressed because it is too large

446
kernel/capability.c Normal file

@ -0,0 +1,446 @@
/*
* linux/kernel/capability.c
*
* Copyright (C) 1997 Andrew Main <zefram@fysh.org>
*
* Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org>
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <asm/uaccess.h>
/*
* Leveraged for setting/resetting capabilities
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
EXPORT_SYMBOL(__cap_empty_set);
int file_caps_enabled = 1;
static int __init file_caps_disable(char *str)
{
file_caps_enabled = 0;
return 1;
}
__setup("no_file_caps", file_caps_disable);
/*
* More recent versions of libcap are available from:
*
* http://www.kernel.org/pub/linux/libs/security/linux-privs/
*/
static void warn_legacy_capability_use(void)
{
char name[sizeof(current->comm)];
pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
get_task_comm(name, current));
}
/*
* Version 2 capabilities worked fine, but the linux/capability.h file
* that accompanied their introduction encouraged their use without
* the necessary user-space source code changes. As such, we have
* created a version 3 with equivalent functionality to version 2, but
* with a header change to protect legacy source code from using
* version 2 when it wanted to use version 1. If your system has code
* that trips the following warning, it is using version 2 specific
* capabilities and may be doing so insecurely.
*
* The remedy is to either upgrade your version of libcap (to 2.10+,
* if the application is linked against it), or recompile your
* application with modern kernel headers and this warning will go
* away.
*/
static void warn_deprecated_v2(void)
{
char name[sizeof(current->comm)];
pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
get_task_comm(name, current));
}
/*
* Version check. Return the number of u32s in each capability flag
* array, or a negative value on error.
*/
static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
{
__u32 version;
if (get_user(version, &header->version))
return -EFAULT;
switch (version) {
case _LINUX_CAPABILITY_VERSION_1:
warn_legacy_capability_use();
*tocopy = _LINUX_CAPABILITY_U32S_1;
break;
case _LINUX_CAPABILITY_VERSION_2:
warn_deprecated_v2();
/*
* fall through - v3 is otherwise equivalent to v2.
*/
case _LINUX_CAPABILITY_VERSION_3:
*tocopy = _LINUX_CAPABILITY_U32S_3;
break;
default:
if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
return -EFAULT;
return -EINVAL;
}
return 0;
}
/*
* The only thing that can change the capabilities of the current
* process is the current process. As such, we can't be in this code
* at the same time as we are in the process of setting capabilities
* in this process. The net result is that we can limit our use of
* locks to when we are reading the caps of another process.
*/
static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
kernel_cap_t *pIp, kernel_cap_t *pPp)
{
int ret;
if (pid && (pid != task_pid_vnr(current))) {
struct task_struct *target;
rcu_read_lock();
target = find_task_by_vpid(pid);
if (!target)
ret = -ESRCH;
else
ret = security_capget(target, pEp, pIp, pPp);
rcu_read_unlock();
} else
ret = security_capget(current, pEp, pIp, pPp);
return ret;
}
/**
* sys_capget - get the capabilities of a given process.
* @header: pointer to struct that contains capability version and
* target pid data
* @dataptr: pointer to struct that contains the effective, permitted,
* and inheritable capabilities that are returned
*
* Returns 0 on success and < 0 on error.
*/
SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
{
int ret = 0;
pid_t pid;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
ret = cap_validate_magic(header, &tocopy);
if ((dataptr == NULL) || (ret != 0))
return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
if (get_user(pid, &header->pid))
return -EFAULT;
if (pid < 0)
return -EINVAL;
ret = cap_get_target_pid(pid, &pE, &pI, &pP);
if (!ret) {
struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i;
for (i = 0; i < tocopy; i++) {
kdata[i].effective = pE.cap[i];
kdata[i].permitted = pP.cap[i];
kdata[i].inheritable = pI.cap[i];
}
/*
* Note that, in the case tocopy < _KERNEL_CAPABILITY_U32S,
* we silently drop the upper capabilities here. This
* has the effect of making older libcap
* implementations implicitly drop upper capability
* bits when they perform a: capget/modify/capset
* sequence.
*
* This behavior is considered fail-safe
* behavior. Upgrading the application to a newer
* version of libcap will enable access to the newer
* capabilities.
*
* An alternative would be to return an error here
* (-ERANGE), but that causes legacy applications to
* unexpectedly fail; the capget/modify/capset aborts
* before modification is attempted and the application
* fails.
*/
if (copy_to_user(dataptr, kdata, tocopy
* sizeof(struct __user_cap_data_struct))) {
return -EFAULT;
}
}
return ret;
}
/**
* sys_capset - set capabilities for a process or (*) a group of processes
* @header: pointer to struct that contains capability version and
* target pid data
* @data: pointer to struct that contains the effective, permitted,
* and inheritable capabilities
*
* Set capabilities for the current process only. The ability to set the
* capabilities of any other process(es) has been deprecated and removed.
*
* The restrictions on setting capabilities are specified as:
*
* I: any raised capabilities must be a subset of the old permitted
* P: any raised capabilities must be a subset of the old permitted
* E: must be set to a subset of new permitted
*
* Returns 0 on success and < 0 on error.
*/
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i, tocopy, copybytes;
kernel_cap_t inheritable, permitted, effective;
struct cred *new;
int ret;
pid_t pid;
ret = cap_validate_magic(header, &tocopy);
if (ret != 0)
return ret;
if (get_user(pid, &header->pid))
return -EFAULT;
/* may only affect current now */
if (pid != 0 && pid != task_pid_vnr(current))
return -EPERM;
copybytes = tocopy * sizeof(struct __user_cap_data_struct);
if (copybytes > sizeof(kdata))
return -EFAULT;
if (copy_from_user(&kdata, data, copybytes))
return -EFAULT;
for (i = 0; i < tocopy; i++) {
effective.cap[i] = kdata[i].effective;
permitted.cap[i] = kdata[i].permitted;
inheritable.cap[i] = kdata[i].inheritable;
}
while (i < _KERNEL_CAPABILITY_U32S) {
effective.cap[i] = 0;
permitted.cap[i] = 0;
inheritable.cap[i] = 0;
i++;
}
effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
new = prepare_creds();
if (!new)
return -ENOMEM;
ret = security_capset(new, current_cred(),
&effective, &inheritable, &permitted);
if (ret < 0)
goto error;
audit_log_capset(new, current_cred());
return commit_creds(new);
error:
abort_creds(new);
return ret;
}
/**
* has_ns_capability - Does a task have a capability in a specific user ns
* @t: The task in question
* @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to the specified user namespace, false if not.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_ns_capability(struct task_struct *t,
struct user_namespace *ns, int cap)
{
int ret;
rcu_read_lock();
ret = security_capable(__task_cred(t), ns, cap);
rcu_read_unlock();
return (ret == 0);
}
/**
* has_capability - Does a task have a capability in init_user_ns
* @t: The task in question
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to the initial user namespace, false if not.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_capability(struct task_struct *t, int cap)
{
return has_ns_capability(t, &init_user_ns, cap);
}
/**
* has_ns_capability_noaudit - Does a task have a capability (unaudited)
* in a specific user ns.
* @t: The task in question
* @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to the specified user namespace, false if not.
* Do not write an audit message for the check.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_ns_capability_noaudit(struct task_struct *t,
struct user_namespace *ns, int cap)
{
int ret;
rcu_read_lock();
ret = security_capable_noaudit(__task_cred(t), ns, cap);
rcu_read_unlock();
return (ret == 0);
}
/**
* has_capability_noaudit - Does a task have a capability (unaudited) in the
* initial user ns
* @t: The task in question
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to init_user_ns, false if not. Don't write an
* audit message for the check.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_capability_noaudit(struct task_struct *t, int cap)
{
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
/**
* ns_capable - Determine if the current task has a superior capability in effect
* @ns: The user namespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
* available for use, false if not.
*
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
if (unlikely(!cap_valid(cap))) {
pr_crit("capable() called with invalid cap=%u\n", cap);
BUG();
}
if (security_capable(current_cred(), ns, cap) == 0) {
current->flags |= PF_SUPERPRIV;
return true;
}
return false;
}
EXPORT_SYMBOL(ns_capable);
/**
* file_ns_capable - Determine if the file's opener had a capability in effect
* @file: The file we want to check
* @ns: The user namespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if task that opened the file had a capability in effect
* when the file was opened.
*
* This does not set PF_SUPERPRIV because the caller may not
* actually be privileged.
*/
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
int cap)
{
if (WARN_ON_ONCE(!cap_valid(cap)))
return false;
if (security_capable(file->f_cred, ns, cap) == 0)
return true;
return false;
}
EXPORT_SYMBOL(file_ns_capable);
/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
* available for use, false if not.
*
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
bool capable(int cap)
{
return ns_capable(&init_user_ns, cap);
}
EXPORT_SYMBOL(capable);
/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
*
* Return true if the current task has the given capability targeted at
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
kgid_has_mapping(ns, inode->i_gid);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
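For reference, a user-space caller of the capget path above might look like the minimal sketch below. It is an illustration, not part of this file: it uses the raw capget syscall with a _LINUX_CAPABILITY_VERSION_3 header (so none of the legacy warnings fire) and reads the calling task's own capability sets (pid 0 means "current").

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/capability.h>

int main(void)
{
        struct __user_cap_header_struct header;
        struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];

        memset(&header, 0, sizeof(header));
        memset(data, 0, sizeof(data));
        header.version = _LINUX_CAPABILITY_VERSION_3;   /* avoids the v1/v2 warnings */
        header.pid = 0;                                 /* 0 queries the current task */

        if (syscall(SYS_capget, &header, data) != 0) {
                perror("capget");
                return 1;
        }
        printf("effective[0]=%#x permitted[0]=%#x inheritable[0]=%#x\n",
               data[0].effective, data[0].permitted, data[0].inheritable);
        return 0;
}

Had the header carried _LINUX_CAPABILITY_VERSION_1, the kernel would copy only _LINUX_CAPABILITY_U32S_1 words and silently drop the upper capability bits, exactly as the comment in sys_capget() describes.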

5583
kernel/cgroup.c Normal file

File diff suppressed because it is too large

484
kernel/cgroup_freezer.c Normal file

@ -0,0 +1,484 @@
/*
* cgroup_freezer.c - control group freezer subsystem
*
* Copyright IBM Corporation, 2007
*
* Author : Cedric Le Goater <clg@fr.ibm.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*/
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
* set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
* for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
* for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
* its ancestors has FREEZING_SELF set.
*/
enum freezer_state_flags {
CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
/* mask for all FREEZING flags */
CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
};
struct freezer {
struct cgroup_subsys_state css;
unsigned int state;
};
static DEFINE_MUTEX(freezer_mutex);
static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct freezer, css) : NULL;
}
static inline struct freezer *task_freezer(struct task_struct *task)
{
return css_freezer(task_css(task, freezer_cgrp_id));
}
static struct freezer *parent_freezer(struct freezer *freezer)
{
return css_freezer(freezer->css.parent);
}
bool cgroup_freezing(struct task_struct *task)
{
bool ret;
rcu_read_lock();
ret = task_freezer(task)->state & CGROUP_FREEZING;
rcu_read_unlock();
return ret;
}
static const char *freezer_state_strs(unsigned int state)
{
if (state & CGROUP_FROZEN)
return "FROZEN";
if (state & CGROUP_FREEZING)
return "FREEZING";
return "THAWED";
};
static struct cgroup_subsys_state *
freezer_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct freezer *freezer;
freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
if (!freezer)
return ERR_PTR(-ENOMEM);
return &freezer->css;
}
/**
* freezer_css_online - commit creation of a freezer css
* @css: css being created
*
* We're committing to creation of @css. Mark it online and inherit
* parent's freezing state while holding freezer_mutex.
*/
static int freezer_css_online(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
struct freezer *parent = parent_freezer(freezer);
mutex_lock(&freezer_mutex);
freezer->state |= CGROUP_FREEZER_ONLINE;
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
atomic_inc(&system_freezing_cnt);
}
mutex_unlock(&freezer_mutex);
return 0;
}
/**
* freezer_css_offline - initiate destruction of a freezer css
* @css: css being destroyed
*
* @css is going away. Mark it dead and decrement system_freezing_count if
* it was holding one.
*/
static void freezer_css_offline(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
atomic_dec(&system_freezing_cnt);
freezer->state = 0;
mutex_unlock(&freezer_mutex);
}
static void freezer_css_free(struct cgroup_subsys_state *css)
{
kfree(css_freezer(css));
}
/*
* Tasks can be migrated into a different freezer anytime regardless of its
* current state. freezer_attach() is responsible for making new tasks
* conform to the current state.
*
* Freezer state changes and task migration are synchronized via
* freezer_mutex. freezer_attach() makes the new tasks conform to the
* current state and all following state changes can see the new tasks.
*/
static void freezer_attach(struct cgroup_subsys_state *new_css,
struct cgroup_taskset *tset)
{
struct freezer *freezer = css_freezer(new_css);
struct task_struct *task;
bool clear_frozen = false;
mutex_lock(&freezer_mutex);
/*
* Make the new tasks conform to the current state of @new_css.
* For simplicity, when migrating any task to a FROZEN cgroup, we
* revert it to FREEZING and let update_if_frozen() determine the
* correct state later.
*
* Tasks in @tset are on @new_css but may not conform to its
* current state before executing the following - !frozen tasks may
* be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
*/
cgroup_taskset_for_each(task, tset) {
if (!(freezer->state & CGROUP_FREEZING)) {
__thaw_task(task);
} else {
freeze_task(task);
freezer->state &= ~CGROUP_FROZEN;
clear_frozen = true;
}
}
/* propagate FROZEN clearing upwards */
while (clear_frozen && (freezer = parent_freezer(freezer))) {
freezer->state &= ~CGROUP_FROZEN;
clear_frozen = freezer->state & CGROUP_FREEZING;
}
mutex_unlock(&freezer_mutex);
}
/**
* freezer_fork - cgroup post fork callback
* @task: a task which has just been forked
*
* @task has just been created and should conform to the current state of
* the cgroup_freezer it belongs to. This function may race against
* freezer_attach(). Losing to freezer_attach() means that we don't have
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
/*
* The root cgroup is non-freezable, so we can skip locking the
* freezer. This is safe regardless of race with task migration.
* If we didn't race or won, skipping is obviously the right thing
* to do. If we lost and root is the new cgroup, noop is still the
* right thing to do.
*/
if (task_css_is_root(task, freezer_cgrp_id))
return;
mutex_lock(&freezer_mutex);
rcu_read_lock();
freezer = task_freezer(task);
if (freezer->state & CGROUP_FREEZING)
freeze_task(task);
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
}
/**
* update_if_frozen - update whether a cgroup finished freezing
* @css: css of interest
*
* Once FREEZING is initiated, transition to FROZEN is lazily updated by
* calling this function. If the current state is FREEZING but not FROZEN,
* this function checks whether all tasks of this cgroup and the descendant
* cgroups finished freezing and, if so, sets FROZEN.
*
* The caller is responsible for grabbing RCU read lock and calling
* update_if_frozen() on all descendants prior to invoking this function.
*
* Task states and freezer state might disagree while tasks are being
* migrated into or out of @css, so we can't verify task states against
* @freezer state here. See freezer_attach() for details.
*/
static void update_if_frozen(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
struct cgroup_subsys_state *pos;
struct css_task_iter it;
struct task_struct *task;
lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZING) ||
(freezer->state & CGROUP_FROZEN))
return;
/* are all (live) children frozen? */
rcu_read_lock();
css_for_each_child(pos, css) {
struct freezer *child = css_freezer(pos);
if ((child->state & CGROUP_FREEZER_ONLINE) &&
!(child->state & CGROUP_FROZEN)) {
rcu_read_unlock();
return;
}
}
rcu_read_unlock();
/* are all tasks frozen? */
css_task_iter_start(css, &it);
while ((task = css_task_iter_next(&it))) {
if (freezing(task)) {
/*
* freezer_should_skip() indicates that the task
* should be skipped when determining freezing
* completion. Consider it frozen in addition to
* the usual frozen condition.
*/
if (!frozen(task) && !freezer_should_skip(task))
goto out_iter_end;
}
}
freezer->state |= CGROUP_FROZEN;
out_iter_end:
css_task_iter_end(&it);
}
static int freezer_read(struct seq_file *m, void *v)
{
struct cgroup_subsys_state *css = seq_css(m), *pos;
mutex_lock(&freezer_mutex);
rcu_read_lock();
/* update states bottom-up */
css_for_each_descendant_post(pos, css) {
if (!css_tryget_online(pos))
continue;
rcu_read_unlock();
update_if_frozen(pos);
rcu_read_lock();
css_put(pos);
}
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
seq_puts(m, freezer_state_strs(css_freezer(css)->state));
seq_putc(m, '\n');
return 0;
}
static void freeze_cgroup(struct freezer *freezer)
{
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&freezer->css, &it);
while ((task = css_task_iter_next(&it)))
freeze_task(task);
css_task_iter_end(&it);
}
static void unfreeze_cgroup(struct freezer *freezer)
{
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&freezer->css, &it);
while ((task = css_task_iter_next(&it)))
__thaw_task(task);
css_task_iter_end(&it);
}
/**
* freezer_apply_state - apply state change to a single cgroup_freezer
* @freezer: freezer to apply state change to
* @freeze: whether to freeze or unfreeze
* @state: CGROUP_FREEZING_* flag to set or clear
*
* Set or clear @state on @cgroup according to @freeze, and perform
* freezing or thawing as necessary.
*/
static void freezer_apply_state(struct freezer *freezer, bool freeze,
unsigned int state)
{
/* also synchronizes against task migration, see freezer_attach() */
lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZER_ONLINE))
return;
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
atomic_inc(&system_freezing_cnt);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
bool was_freezing = freezer->state & CGROUP_FREEZING;
freezer->state &= ~state;
if (!(freezer->state & CGROUP_FREEZING)) {
if (was_freezing)
atomic_dec(&system_freezing_cnt);
freezer->state &= ~CGROUP_FROZEN;
unfreeze_cgroup(freezer);
}
}
}
/**
* freezer_change_state - change the freezing state of a cgroup_freezer
* @freezer: freezer of interest
* @freeze: whether to freeze or thaw
*
* Freeze or thaw @freezer according to @freeze. The operations are
* recursive - all descendants of @freezer will be affected.
*/
static void freezer_change_state(struct freezer *freezer, bool freeze)
{
struct cgroup_subsys_state *pos;
/*
* Update all its descendants in pre-order traversal. Each
* descendant will try to inherit its parent's FREEZING state as
* CGROUP_FREEZING_PARENT.
*/
mutex_lock(&freezer_mutex);
rcu_read_lock();
css_for_each_descendant_pre(pos, &freezer->css) {
struct freezer *pos_f = css_freezer(pos);
struct freezer *parent = parent_freezer(pos_f);
if (!css_tryget_online(pos))
continue;
rcu_read_unlock();
if (pos_f == freezer)
freezer_apply_state(pos_f, freeze,
CGROUP_FREEZING_SELF);
else
freezer_apply_state(pos_f,
parent->state & CGROUP_FREEZING,
CGROUP_FREEZING_PARENT);
rcu_read_lock();
css_put(pos);
}
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
}
static ssize_t freezer_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
bool freeze;
buf = strstrip(buf);
if (strcmp(buf, freezer_state_strs(0)) == 0)
freeze = false;
else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
freeze = true;
else
return -EINVAL;
freezer_change_state(css_freezer(of_css(of)), freeze);
return nbytes;
}
static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct freezer *freezer = css_freezer(css);
return (bool)(freezer->state & CGROUP_FREEZING_SELF);
}
static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct freezer *freezer = css_freezer(css);
return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
}
static struct cftype files[] = {
{
.name = "state",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = freezer_read,
.write = freezer_write,
},
{
.name = "self_freezing",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = freezer_self_freezing_read,
},
{
.name = "parent_freezing",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = freezer_parent_freezing_read,
},
{ } /* terminate */
};
struct cgroup_subsys freezer_cgrp_subsys = {
.css_alloc = freezer_css_alloc,
.css_online = freezer_css_online,
.css_offline = freezer_css_offline,
.css_free = freezer_css_free,
.attach = freezer_attach,
.fork = freezer_fork,
.legacy_cftypes = files,
};
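As a usage sketch (the cgroup path is an assumption that depends on where the freezer hierarchy is mounted, commonly /sys/fs/cgroup/freezer, and the "demo" group must have been created beforehand), freezing a cgroup from user space is just a write of "FROZEN" to its freezer.state file, which lands in freezer_write() above:

#include <stdio.h>
#include <string.h>
#include <errno.h>

/* hypothetical cgroup created with mkdir under the freezer mount point */
#define FREEZER_STATE "/sys/fs/cgroup/freezer/demo/freezer.state"

static int set_freezer_state(const char *state)
{
        FILE *f = fopen(FREEZER_STATE, "w");
        int ret = 0;

        if (!f) {
                fprintf(stderr, "open %s: %s\n", FREEZER_STATE, strerror(errno));
                return -1;
        }
        /* freezer_write() accepts exactly "FROZEN" or "THAWED" */
        if (fputs(state, f) < 0)
                ret = -1;
        if (fclose(f) != 0)
                ret = -1;
        return ret;
}

int main(void)
{
        return set_freezer_state("FROZEN") ? 1 : 0;
}

Reading the same file goes through freezer_read(), which reports FREEZING until every task in the cgroup and its descendants has actually frozen.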

1175
kernel/compat.c Normal file

File diff suppressed because it is too large

99
kernel/configs.c Normal file

@ -0,0 +1,99 @@
/*
* kernel/configs.c
* Echo the kernel .config file used to build the kernel
*
* Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
* Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
* Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
* Copyright (C) 2002 Hewlett-Packard Company
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <asm/uaccess.h>
/**************************************************/
/* the actual current config file */
/*
* Define kernel_config_data and kernel_config_data_size, which contain the
* wrapped and compressed configuration file. The file is first compressed
* with gzip and then bounded by two eight byte magic numbers to allow
* extraction from a binary kernel image:
*
* IKCFG_ST
* <image>
* IKCFG_ED
*/
#define MAGIC_START "IKCFG_ST"
#define MAGIC_END "IKCFG_ED"
#include "config_data.h"
#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
#define kernel_config_data_size \
(sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
#ifdef CONFIG_IKCONFIG_PROC
static ssize_t
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
return simple_read_from_buffer(buf, len, offset,
kernel_config_data + MAGIC_SIZE,
kernel_config_data_size);
}
static const struct file_operations ikconfig_file_ops = {
.owner = THIS_MODULE,
.read = ikconfig_read_current,
.llseek = default_llseek,
};
static int __init ikconfig_init(void)
{
struct proc_dir_entry *entry;
/* create the current config file */
entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
&ikconfig_file_ops);
if (!entry)
return -ENOMEM;
proc_set_size(entry, kernel_config_data_size);
return 0;
}
static void __exit ikconfig_cleanup(void)
{
remove_proc_entry("config.gz", NULL);
}
module_init(ikconfig_init);
module_exit(ikconfig_cleanup);
#endif /* CONFIG_IKCONFIG_PROC */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Randy Dunlap");
MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
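When CONFIG_IKCONFIG_PROC is enabled, the config.gz entry created above can be read back and decompressed from user space. A minimal sketch using zlib (link with -lz; the buffer size is arbitrary):

#include <stdio.h>
#include <zlib.h>

int main(void)
{
        char buf[4096];
        int n;
        gzFile gz = gzopen("/proc/config.gz", "rb");    /* created by ikconfig_init() */

        if (!gz) {
                fprintf(stderr, "failed to open /proc/config.gz\n");
                return 1;
        }
        /* gzread() transparently inflates the gzip stream exposed by the kernel */
        while ((n = gzread(gz, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);
        gzclose(gz);
        return 0;
}

The MAGIC_SIZE offset handling is done kernel-side in ikconfig_read_current(), so what user space sees is an ordinary gzip stream of the build-time .config.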


@ -0,0 +1,4 @@
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_KERNEL_XZ=y
CONFIG_OPTIMIZE_INLINING=y
CONFIG_SLOB=y

176
kernel/context_tracking.c Normal file

@ -0,0 +1,176 @@
/*
* Context tracking: Probe on high level context boundaries such as kernel
* and userspace. This includes syscalls and exceptions entry/exit.
*
* This is used by RCU to remove its dependency on the timer tick while a CPU
* runs in userspace.
*
* Started by Frederic Weisbecker:
*
* Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
*
* Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
* Steven Rostedt, Peter Zijlstra for suggestions and improvements.
*
*/
#include <linux/context_tracking.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(context_tracking_enabled);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
void context_tracking_cpu_set(int cpu)
{
if (!per_cpu(context_tracking.active, cpu)) {
per_cpu(context_tracking.active, cpu) = true;
static_key_slow_inc(&context_tracking_enabled);
}
}
/**
* context_tracking_user_enter - Inform the context tracking that the CPU is going to
* enter userspace mode.
*
* This function must be called right before we switch from the kernel
* to userspace, when it's guaranteed the remaining kernel instructions
* to execute won't use any RCU read side critical section because this
* function sets RCU in extended quiescent state.
*/
void context_tracking_user_enter(void)
{
unsigned long flags;
/*
* Repeat the user_enter() check here because some archs may be calling
* this from asm and if no CPU needs context tracking, they shouldn't
* go further. Repeat the check here until they support the inline static
* key check.
*/
if (!context_tracking_is_enabled())
return;
/*
* Some contexts may involve an exception occurring in an irq,
* leading to that nesting:
* rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
* This would mess up the dyntick_nesting count though. And rcu_irq_*()
* helpers are enough to protect RCU uses inside the exception. So
* just return immediately if we detect we are in an IRQ.
*/
if (in_interrupt())
return;
/* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
local_irq_save(flags);
if (__this_cpu_read(context_tracking.state) != IN_USER) {
if (__this_cpu_read(context_tracking.active)) {
trace_user_enter(0);
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
* any RCU read-side critical section until the next call to
* user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
vtime_user_enter(current);
rcu_user_enter();
}
/*
* Even if context tracking is disabled on this CPU, because it's outside
* the full dynticks mask for example, we still have to keep track of the
* context transitions and states to prevent inconsistency on those of
* other CPUs.
* If a task triggers an exception in userspace, sleep on the exception
* handler and then migrate to another CPU, that new CPU must know where
* the exception returns by the time we call exception_exit().
* This information can only be provided by the previous CPU when it called
* exception_enter().
* OTOH we can spare the calls to vtime and RCU when context_tracking.active
* is false because we know that CPU is not tickless.
*/
__this_cpu_write(context_tracking.state, IN_USER);
}
local_irq_restore(flags);
}
NOKPROBE_SYMBOL(context_tracking_user_enter);
/**
* context_tracking_user_exit - Inform the context tracking that the CPU is
* exiting userspace mode and entering the kernel.
*
* This function must be called after we entered the kernel from userspace
* before any use of an RCU read side critical section. This potentially includes
* any high level kernel code like syscalls, exceptions, signal handling, etc...
*
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
void context_tracking_user_exit(void)
{
unsigned long flags;
if (!context_tracking_is_enabled())
return;
if (in_interrupt())
return;
local_irq_save(flags);
if (__this_cpu_read(context_tracking.state) == IN_USER) {
if (__this_cpu_read(context_tracking.active)) {
/*
* We are going to run code that may use RCU. Inform
* RCU core about that (ie: we may need the tick again).
*/
rcu_user_exit();
vtime_user_exit(current);
trace_user_exit(0);
}
__this_cpu_write(context_tracking.state, IN_KERNEL);
}
local_irq_restore(flags);
}
NOKPROBE_SYMBOL(context_tracking_user_exit);
/**
* __context_tracking_task_switch - context switch the syscall callbacks
* @prev: the task that is being switched out
* @next: the task that is being switched in
*
* The context tracking uses the syscall slow path to implement its user-kernel
* boundaries probes on syscalls. This way it doesn't impact the syscall fast
* path on CPUs that don't do context tracking.
*
* But we need to clear the flag on the previous task because it may later
* migrate to some CPU that doesn't do the context tracking. As such the TIF
* flag may not be desired there.
*/
void __context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next)
{
clear_tsk_thread_flag(prev, TIF_NOHZ);
set_tsk_thread_flag(next, TIF_NOHZ);
}
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
void __init context_tracking_init(void)
{
int cpu;
for_each_possible_cpu(cpu)
context_tracking_cpu_set(cpu);
}
#endif

789
kernel/cpu.c Normal file

@ -0,0 +1,789 @@
/* CPU control.
* (C) 2001, 2002, 2003, 2004 Rusty Russell
*
* This code is licenced under the GPL.
*/
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
#include <trace/events/power.h>
#include <trace/events/sched.h>
#include "smpboot.h"
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
/*
* The following two APIs (cpu_maps_update_begin/done) must be used when
* attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
* The APIs cpu_notifier_register_begin/done() must be used to protect CPU
* hotplug callback (un)registration performed using __register_cpu_notifier()
* or __unregister_cpu_notifier().
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
EXPORT_SYMBOL(cpu_notifier_register_begin);
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
EXPORT_SYMBOL(cpu_notifier_register_done);
static RAW_NOTIFIER_HEAD(cpu_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
#ifdef CONFIG_HOTPLUG_CPU
static struct {
struct task_struct *active_writer;
struct mutex lock; /* Synchronizes accesses to refcount, */
/*
* Also blocks the new readers during
* an ongoing cpu hotplug operation.
*/
int refcount;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
} cpu_hotplug = {
.active_writer = NULL,
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
.refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
.dep_map = {.name = "cpu_hotplug.lock" },
#endif
};
/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
void get_online_cpus(void)
{
might_sleep();
if (cpu_hotplug.active_writer == current)
return;
cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
}
EXPORT_SYMBOL_GPL(get_online_cpus);
void put_online_cpus(void)
{
if (cpu_hotplug.active_writer == current)
return;
mutex_lock(&cpu_hotplug.lock);
if (WARN_ON(!cpu_hotplug.refcount))
cpu_hotplug.refcount++; /* try to fix things up */
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);
cpuhp_lock_release();
}
EXPORT_SYMBOL_GPL(put_online_cpus);
/*
* This ensures that the hotplug operation can begin only when the
* refcount goes to zero.
*
* Note that during a cpu-hotplug operation, the new readers, if any,
* will be blocked by the cpu_hotplug.lock
*
* Since cpu_hotplug_begin() is always called after invoking
* cpu_maps_update_begin(), we can be sure that only one writer is active.
*
* Note that theoretically, there is a possibility of a livelock:
* - Refcount goes to zero, last reader wakes up the sleeping
* writer.
* - Last reader unlocks the cpu_hotplug.lock.
* - A new reader arrives at this moment, bumps up the refcount.
* - The writer acquires the cpu_hotplug.lock finds the refcount
* non zero and goes to sleep again.
*
* However, this is very difficult to achieve in practice since
* get_online_cpus() is not an API which is called all that often.
*
*/
void cpu_hotplug_begin(void)
{
cpu_hotplug.active_writer = current;
cpuhp_lock_acquire();
for (;;) {
mutex_lock(&cpu_hotplug.lock);
if (likely(!cpu_hotplug.refcount))
break;
__set_current_state(TASK_UNINTERRUPTIBLE);
mutex_unlock(&cpu_hotplug.lock);
schedule();
}
}
void cpu_hotplug_done(void)
{
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
cpuhp_lock_release();
}
/*
* Wait for currently running CPU hotplug operations to complete (if any) and
* disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
* the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
* hotplug path before performing hotplug operations. So acquiring that lock
* guarantees mutual exclusion from any currently running hotplug operations.
*/
void cpu_hotplug_disable(void)
{
cpu_maps_update_begin();
cpu_hotplug_disabled = 1;
cpu_maps_update_done();
}
void cpu_hotplug_enable(void)
{
cpu_maps_update_begin();
cpu_hotplug_disabled = 0;
cpu_maps_update_done();
}
#endif /* CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
{
int ret;
cpu_maps_update_begin();
ret = raw_notifier_chain_register(&cpu_chain, nb);
cpu_maps_update_done();
return ret;
}
int __ref __register_cpu_notifier(struct notifier_block *nb)
{
return raw_notifier_chain_register(&cpu_chain, nb);
}
static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
int *nr_calls)
{
int ret;
ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
nr_calls);
return notifier_to_errno(ret);
}
static int cpu_notify(unsigned long val, void *v)
{
return __cpu_notify(val, v, -1, NULL);
}
#ifdef CONFIG_HOTPLUG_CPU
static void cpu_notify_nofail(unsigned long val, void *v)
{
BUG_ON(cpu_notify(val, v));
}
EXPORT_SYMBOL(register_cpu_notifier);
EXPORT_SYMBOL(__register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
{
cpu_maps_update_begin();
raw_notifier_chain_unregister(&cpu_chain, nb);
cpu_maps_update_done();
}
EXPORT_SYMBOL(unregister_cpu_notifier);
void __ref __unregister_cpu_notifier(struct notifier_block *nb)
{
raw_notifier_chain_unregister(&cpu_chain, nb);
}
EXPORT_SYMBOL(__unregister_cpu_notifier);
/**
* clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
* @cpu: a CPU id
*
* This function walks all processes, finds a valid mm struct for each one and
* then clears a corresponding bit in mm's cpumask. While this all sounds
* trivial, there are various non-obvious corner cases, which this function
* tries to solve in a safe manner.
*
* Also note that the function uses a somewhat relaxed locking scheme, so it may
* be called only for an already offlined CPU.
*/
void clear_tasks_mm_cpumask(int cpu)
{
struct task_struct *p;
/*
* This function is called after the cpu is taken down and marked
* offline, so it's not like new tasks will ever get this cpu set in
* their mm mask. -- Peter Zijlstra
* Thus, we may use rcu_read_lock() here, instead of grabbing
* full-fledged tasklist_lock.
*/
WARN_ON(cpu_online(cpu));
rcu_read_lock();
for_each_process(p) {
struct task_struct *t;
/*
* Main thread might exit, but other threads may still have
* a valid mm. Find one.
*/
t = find_lock_task_mm(p);
if (!t)
continue;
cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
task_unlock(t);
}
rcu_read_unlock();
}
static inline void check_for_tasks(int dead_cpu)
{
struct task_struct *g, *p;
read_lock_irq(&tasklist_lock);
do_each_thread(g, p) {
if (!p->on_rq)
continue;
/*
* We do the check with unlocked task_rq(p)->lock.
* Order the reading so that we do not warn about a task
* which was running on this cpu in the past and
* has just been woken on another cpu.
*/
rmb();
if (task_cpu(p) != dead_cpu)
continue;
pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
} while_each_thread(g, p);
read_unlock_irq(&tasklist_lock);
}
struct take_cpu_down_param {
unsigned long mod;
void *hcpu;
};
/* Take this CPU down. */
static int __ref take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
int err;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
/* Park the stopper thread */
kthread_park(current);
return 0;
}
/* Requires cpu_add_remove_lock to be held */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
{
int err, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
.mod = mod,
.hcpu = hcpu,
};
if (num_online_cpus() == 1)
return -EBUSY;
if (!cpu_online(cpu))
return -EINVAL;
cpu_hotplug_begin();
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
if (err) {
nr_calls--;
__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
pr_warn("%s: attempt to take down CPU %u failed\n",
__func__, cpu);
goto out_release;
}
smpboot_park_threads(cpu);
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
smpboot_unpark_threads(cpu);
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
goto out_release;
}
BUG_ON(cpu_online(cpu));
/*
* The migration_call() CPU_DYING callback will have removed all
* runnable tasks from the cpu, there's only the idle task left now
* that the migration thread is done doing the stop_machine thing.
*
* Wait for the stop thread to go away.
*/
while (!idle_cpu(cpu))
cpu_relax();
/* This actually kills the CPU. */
__cpu_die(cpu);
/* CPU is completely dead: tell everyone. Too late to complain. */
cpu_notify_nofail(CPU_DEAD | mod, hcpu);
check_for_tasks(cpu);
out_release:
cpu_hotplug_done();
trace_sched_cpu_hotplug(cpu, err, 0);
if (!err)
cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
return err;
}
int __ref cpu_down(unsigned int cpu)
{
int err;
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
err = -EBUSY;
goto out;
}
err = _cpu_down(cpu, 0);
out:
cpu_maps_update_done();
return err;
}
EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
/* Requires cpu_add_remove_lock to be held */
static int _cpu_up(unsigned int cpu, int tasks_frozen)
{
int ret, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct task_struct *idle;
cpu_hotplug_begin();
if (cpu_online(cpu) || !cpu_present(cpu)) {
ret = -EINVAL;
goto out;
}
idle = idle_thread_get(cpu);
if (IS_ERR(idle)) {
ret = PTR_ERR(idle);
goto out;
}
ret = smpboot_create_threads(cpu);
if (ret)
goto out;
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
pr_warn("%s: attempt to bring up CPU %u failed\n",
__func__, cpu);
goto out_notify;
}
/* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
/* Wake the per cpu threads */
smpboot_unpark_threads(cpu);
/* Now call notifier in preparation. */
cpu_notify(CPU_ONLINE | mod, hcpu);
out_notify:
if (ret != 0)
__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
out:
cpu_hotplug_done();
trace_sched_cpu_hotplug(cpu, ret, 1);
return ret;
}
int cpu_up(unsigned int cpu)
{
int err = 0;
if (!cpu_possible(cpu)) {
pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
cpu);
#if defined(CONFIG_IA64)
pr_err("please check additional_cpus= boot parameter\n");
#endif
return -EINVAL;
}
err = try_online_node(cpu_to_node(cpu));
if (err)
return err;
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
err = -EBUSY;
goto out;
}
err = _cpu_up(cpu, 0);
out:
cpu_maps_update_done();
return err;
}
EXPORT_SYMBOL_GPL(cpu_up);
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error = 0;
#ifdef CONFIG_ARM_EXYNOS_MP_CPUFREQ
int nonboot_cluster_first_cpu = 4;
#endif
cpu_maps_update_begin();
first_cpu = cpumask_first(cpu_online_mask);
/*
* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
*/
cpumask_clear(frozen_cpus);
pr_info("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
#if defined(CONFIG_ARM_EXYNOS_MP_CPUFREQ)
if (cpu == first_cpu || cpu == nonboot_cluster_first_cpu)
#else
if (cpu == first_cpu)
#endif
continue;
trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1);
trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
else {
pr_err("Error taking CPU%d down: %d\n", cpu, error);
break;
}
}
#if defined(CONFIG_ARM_EXYNOS_MP_CPUFREQ)
if (num_online_cpus() > 1) {
error = _cpu_down(nonboot_cluster_first_cpu, 1);
if (!error)
cpumask_set_cpu(nonboot_cluster_first_cpu, frozen_cpus);
else
printk(KERN_ERR "Error taking CPU%d down: %d\n",
nonboot_cluster_first_cpu, error);
}
#endif
if (!error) {
BUG_ON(num_online_cpus() > 1);
/* Make sure the CPUs won't be enabled by someone else */
cpu_hotplug_disabled = 1;
} else {
pr_err("Non-boot CPUs are not disabled\n");
}
cpu_maps_update_done();
return error;
}
void __weak arch_enable_nonboot_cpus_begin(void)
{
}
void __weak arch_enable_nonboot_cpus_end(void)
{
}
void __ref enable_nonboot_cpus(void)
{
int cpu, error;
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
cpu_hotplug_disabled = 0;
if (cpumask_empty(frozen_cpus))
goto out;
pr_info("Enabling non-boot CPUs ...\n");
arch_enable_nonboot_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
trace_suspend_resume(TPS("CPU_ON"), cpu, true);
error = _cpu_up(cpu, 1);
trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
pr_info("CPU%d is up\n", cpu);
continue;
}
pr_warn("Error taking CPU%d up: %d\n", cpu, error);
}
arch_enable_nonboot_cpus_end();
cpumask_clear(frozen_cpus);
out:
cpu_maps_update_done();
}
static int __init alloc_frozen_cpus(void)
{
if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
return -ENOMEM;
return 0;
}
core_initcall(alloc_frozen_cpus);
/*
* When callbacks for CPU hotplug notifications are being executed, we must
* ensure that the state of the system with respect to the tasks being frozen
* or not, as reported by the notification, remains unchanged *throughout the
* duration* of the execution of the callbacks.
* Hence we need to prevent the freezer from racing with regular CPU hotplug.
*
* This synchronization is implemented by mutually excluding regular CPU
* hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
* Hibernate notifications.
*/
static int
cpu_hotplug_pm_callback(struct notifier_block *nb,
unsigned long action, void *ptr)
{
switch (action) {
case PM_SUSPEND_PREPARE:
case PM_HIBERNATION_PREPARE:
cpu_hotplug_disable();
break;
case PM_POST_SUSPEND:
case PM_POST_HIBERNATION:
cpu_hotplug_enable();
break;
default:
return NOTIFY_DONE;
}
return NOTIFY_OK;
}
static int __init cpu_hotplug_pm_sync_init(void)
{
/*
* cpu_hotplug_pm_callback has higher priority than x86
* bsp_pm_callback which depends on cpu_hotplug_pm_callback
* to disable cpu hotplug to avoid cpu hotplug race.
*/
pm_notifier(cpu_hotplug_pm_callback, 0);
return 0;
}
core_initcall(cpu_hotplug_pm_sync_init);
#endif /* CONFIG_PM_SLEEP_SMP */
/**
* notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
* @cpu: cpu that just started
*
* This function calls the cpu_chain notifiers with CPU_STARTING.
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
void notify_cpu_starting(unsigned int cpu)
{
unsigned long val = CPU_STARTING;
#ifdef CONFIG_PM_SLEEP_SMP
if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
val = CPU_STARTING_FROZEN;
#endif /* CONFIG_PM_SLEEP_SMP */
cpu_notify(val, (void *)(long)cpu);
}
#endif /* CONFIG_SMP */
/*
* cpu_bit_bitmap[] is a special, "compressed" data structure that
* represents all NR_CPUS bits binary values of 1<<nr.
*
* It is used by cpumask_of() to get a constant address to a CPU
* mask value that has a single bit set only.
*/
/* cpu_bit_bitmap[0] is empty - so we can back into it */
#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
MASK_DECLARE_8(0), MASK_DECLARE_8(8),
MASK_DECLARE_8(16), MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
MASK_DECLARE_8(32), MASK_DECLARE_8(40),
MASK_DECLARE_8(48), MASK_DECLARE_8(56),
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
= CPU_BITS_ALL;
#else
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
#endif
const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
EXPORT_SYMBOL(cpu_possible_mask);
static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
EXPORT_SYMBOL(cpu_online_mask);
static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
EXPORT_SYMBOL(cpu_present_mask);
static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
EXPORT_SYMBOL(cpu_active_mask);
void set_cpu_possible(unsigned int cpu, bool possible)
{
if (possible)
cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
else
cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
}
void set_cpu_present(unsigned int cpu, bool present)
{
if (present)
cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
else
cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
}
void set_cpu_online(unsigned int cpu, bool online)
{
if (online) {
cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
} else {
cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
}
}
void set_cpu_active(unsigned int cpu, bool active)
{
if (active)
cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
else
cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
}
void init_cpu_present(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_present_bits), src);
}
void init_cpu_possible(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_possible_bits), src);
}
void init_cpu_online(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_online_bits), src);
}
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);
void idle_notifier_unregister(struct notifier_block *n)
{
atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
void idle_notifier_call_chain(unsigned long val)
{
atomic_notifier_call_chain(&idle_notifier, val, NULL);
}
EXPORT_SYMBOL_GPL(idle_notifier_call_chain);
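/*
 * Usage sketch (illustrative only): a driver that wants to track the
 * architecture's idle transitions can hook this chain.  The callback and
 * function names below are hypothetical; the meaning of 'val' is whatever
 * the arch code passes to idle_notifier_call_chain().
 */
#if 0
static int my_idle_notify(struct notifier_block *nb,
			  unsigned long val, void *data)
{
	/* react to the idle transition encoded in 'val' */
	return NOTIFY_OK;
}

static struct notifier_block my_idle_nb = {
	.notifier_call = my_idle_notify,
};

static int __init my_idle_notify_init(void)
{
	idle_notifier_register(&my_idle_nb);
	return 0;
}
late_initcall(my_idle_notify_init);
#endif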

233
kernel/cpu_pm.c Normal file
View file

@ -0,0 +1,233 @@
/*
* Copyright (C) 2011 Google, Inc.
*
* Author:
* Colin Cross <ccross@android.com>
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#include <linux/kernel.h>
#include <linux/cpu_pm.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
static DEFINE_RWLOCK(cpu_pm_notifier_lock);
static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
{
int ret;
ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
nr_to_call, nr_calls);
return notifier_to_errno(ret);
}
/**
* cpu_pm_register_notifier - register a driver with cpu_pm
* @nb: notifier block to register
*
* Add a driver to a list of drivers that are notified about
* CPU and CPU cluster low power entry and exit.
*
* This function may sleep, and has the same return conditions as
* raw_notifier_chain_register.
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
unsigned long flags;
int ret;
write_lock_irqsave(&cpu_pm_notifier_lock, flags);
ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
/**
* cpu_pm_unregister_notifier - unregister a driver with cpu_pm
* @nb: notifier block to be unregistered
*
* Remove a driver from the CPU PM notifier list.
*
* This function may sleep, and has the same return conditions as
* raw_notifier_chain_unregister.
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
unsigned long flags;
int ret;
write_lock_irqsave(&cpu_pm_notifier_lock, flags);
ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
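/*
 * Usage sketch (illustrative only): a driver with per-CPU hardware state
 * that must be saved before the CPU's power domain is cut and restored on
 * the way back.  The names are hypothetical; real users include the VFP
 * and interrupt-controller code mentioned in the comments below.
 */
#if 0
static int my_cpu_pm_notify(struct notifier_block *nb,
			    unsigned long action, void *data)
{
	switch (action) {
	case CPU_PM_ENTER:
		/* save per-CPU hardware context */
		break;
	case CPU_PM_ENTER_FAILED:
	case CPU_PM_EXIT:
		/* restore per-CPU hardware context */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_pm_nb = {
	.notifier_call = my_cpu_pm_notify,
};

static int __init my_cpu_pm_init(void)
{
	return cpu_pm_register_notifier(&my_cpu_pm_nb);
}
#endif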
/**
* cpu_pm_enter - CPU low power entry notifier
*
* Notifies listeners that a single CPU is entering a low power state that may
* cause some blocks in the same power domain as the cpu to reset.
*
* Must be called on the affected CPU with interrupts disabled. The platform is
* responsible for ensuring that cpu_pm_enter is not called twice on the same
* CPU before cpu_pm_exit is called. Notified drivers can include VFP
* co-processor, interrupt controller and its PM extensions, local CPU
* timers context save/restore which shouldn't be interrupted. Hence it
* must be called with interrupts disabled.
*
* Return conditions are same as __raw_notifier_call_chain.
*/
int cpu_pm_enter(void)
{
int nr_calls;
int ret = 0;
read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
if (ret)
/*
* Inform the listeners (nr_calls - 1) that were already notified of
* CPU PM entry that the entry has failed.
*/
cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
read_unlock(&cpu_pm_notifier_lock);
return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_enter);
/**
* cpu_pm_exit - CPU low power exit notifier
*
* Notifies listeners that a single CPU is exiting a low power state that may
* have caused some blocks in the same power domain as the cpu to reset.
*
* Notified drivers can include VFP co-processor, interrupt controller
* and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Return conditions are same as __raw_notifier_call_chain.
*/
int cpu_pm_exit(void)
{
int ret;
read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
read_unlock(&cpu_pm_notifier_lock);
return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_exit);
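/*
 * Call-pattern sketch (illustrative only): how a platform's cpuidle or
 * suspend path is expected to bracket the low power transition, with
 * interrupts already disabled.  enter_low_power_state() stands in for the
 * platform-specific code.
 */
#if 0
static void platform_cpu_low_power(void)
{
	if (cpu_pm_enter())
		return;			/* a notifier vetoed the transition */

	enter_low_power_state();	/* platform specific */

	cpu_pm_exit();
}
#endif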
/**
* cpu_cluster_pm_enter - CPU cluster low power entry notifier
*
* Notifies listeners that all cpus in a power domain are entering a low power
* state that may cause some blocks in the same power domain to reset.
*
* Must be called after cpu_pm_enter has been called on all cpus in the power
* domain, and before cpu_pm_exit has been called on any cpu in the power
* domain. Notified drivers can include VFP co-processor, interrupt controller
* and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Must be called with interrupts disabled.
*
* Return conditions are same as __raw_notifier_call_chain.
*/
int cpu_cluster_pm_enter(void)
{
int nr_calls;
int ret = 0;
read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
if (ret)
/*
* Inform the listeners (nr_calls - 1) that were already notified of
* CPU cluster PM entry that the entry has failed.
*/
cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
read_unlock(&cpu_pm_notifier_lock);
return ret;
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
/**
* cpu_cluster_pm_exit - CPU cluster low power exit notifier
*
* Notifies listeners that all cpus in a power domain are exiting from a
* low power state that may have caused some blocks in the same power domain
* to reset.
*
* Must be called after cpu_cluster_pm_enter has been called for the power
* domain, and before cpu_pm_exit has been called on any cpu in the power
* domain. Notified drivers can include VFP co-processor, interrupt controller
* and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Return conditions are same as __raw_notifier_call_chain.
*/
int cpu_cluster_pm_exit(void)
{
int ret;
read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
read_unlock(&cpu_pm_notifier_lock);
return ret;
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
#ifdef CONFIG_PM
static int cpu_pm_suspend(void)
{
int ret;
ret = cpu_pm_enter();
if (ret)
return ret;
ret = cpu_cluster_pm_enter();
return ret;
}
static void cpu_pm_resume(void)
{
cpu_cluster_pm_exit();
cpu_pm_exit();
}
static struct syscore_ops cpu_pm_syscore_ops = {
.suspend = cpu_pm_suspend,
.resume = cpu_pm_resume,
};
static int cpu_pm_init(void)
{
register_syscore_ops(&cpu_pm_syscore_ops);
return 0;
}
core_initcall(cpu_pm_init);
#endif

2770
kernel/cpuset.c Normal file

File diff suppressed because it is too large Load diff

46
kernel/crash_dump.c Normal file
View file

@ -0,0 +1,46 @@
#include <linux/kernel.h>
#include <linux/crash_dump.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/export.h>
/*
* If we have booted due to a crash, max_pfn will be a very low value. We need
* to know the amount of memory that the previous kernel used.
*/
unsigned long saved_max_pfn;
/*
* stores the physical address of elf header of crash image
*
* Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
* is_kdump_kernel() to determine if we are booting after a panic. Hence put
* it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
*/
unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
EXPORT_SYMBOL_GPL(elfcorehdr_addr);
/*
* stores the size of elf header of crash image
*/
unsigned long long elfcorehdr_size;
/*
* elfcorehdr= specifies the location of elf core header stored by the crashed
* kernel. This option will be passed by kexec loader to the capture kernel.
*
* Syntax: elfcorehdr=[size[KMG]@]offset[KMG]
*/
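/*
 * For example (illustrative values):
 *   elfcorehdr=0x2000000      ->  elfcorehdr_addr = 0x2000000
 *   elfcorehdr=48K@0x2000000  ->  elfcorehdr_size = 48 * 1024,
 *                                 elfcorehdr_addr = 0x2000000
 */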
static int __init setup_elfcorehdr(char *arg)
{
char *end;
if (!arg)
return -EINVAL;
elfcorehdr_addr = memparse(arg, &end);
if (*end == '@') {
elfcorehdr_size = elfcorehdr_addr;
elfcorehdr_addr = memparse(end + 1, &end);
}
return end > arg ? 0 : -EINVAL;
}
early_param("elfcorehdr", setup_elfcorehdr);

807
kernel/cred.c Normal file
View file

@ -0,0 +1,807 @@
/* Task credentials management - see Documentation/security/credentials.txt
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public Licence
* as published by the Free Software Foundation; either version
* 2 of the Licence, or (at your option) any later version.
*/
#include <linux/export.h>
#include <linux/cred.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
#include <linux/binfmts.h>
#include <linux/cn_proc.h>
#if 0
#define kdebug(FMT, ...) \
printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
#else
#define kdebug(FMT, ...) \
no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
#endif
static struct kmem_cache *cred_jar;
/*
* The initial credentials for the initial task
*/
struct cred init_cred = {
.usage = ATOMIC_INIT(4),
#ifdef CONFIG_DEBUG_CREDENTIALS
.subscribers = ATOMIC_INIT(2),
.magic = CRED_MAGIC,
#endif
.uid = GLOBAL_ROOT_UID,
.gid = GLOBAL_ROOT_GID,
.suid = GLOBAL_ROOT_UID,
.sgid = GLOBAL_ROOT_GID,
.euid = GLOBAL_ROOT_UID,
.egid = GLOBAL_ROOT_GID,
.fsuid = GLOBAL_ROOT_UID,
.fsgid = GLOBAL_ROOT_GID,
.securebits = SECUREBITS_DEFAULT,
.cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
.cap_effective = CAP_FULL_SET,
.cap_bset = CAP_FULL_SET,
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
};
static inline void set_cred_subscribers(struct cred *cred, int n)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
atomic_set(&cred->subscribers, n);
#endif
}
static inline int read_cred_subscribers(const struct cred *cred)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
return atomic_read(&cred->subscribers);
#else
return 0;
#endif
}
static inline void alter_cred_subscribers(const struct cred *_cred, int n)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
struct cred *cred = (struct cred *) _cred;
atomic_add(n, &cred->subscribers);
#endif
}
/*
* The RCU callback to actually dispose of a set of credentials
*/
static void put_cred_rcu(struct rcu_head *rcu)
{
struct cred *cred = container_of(rcu, struct cred, rcu);
kdebug("put_cred_rcu(%p)", cred);
#ifdef CONFIG_DEBUG_CREDENTIALS
if (cred->magic != CRED_MAGIC_DEAD ||
atomic_read(&cred->usage) != 0 ||
read_cred_subscribers(cred) != 0)
panic("CRED: put_cred_rcu() sees %p with"
" mag %x, put %p, usage %d, subscr %d\n",
cred, cred->magic, cred->put_addr,
atomic_read(&cred->usage),
read_cred_subscribers(cred));
#else
if (atomic_read(&cred->usage) != 0)
panic("CRED: put_cred_rcu() sees %p with usage %d\n",
cred, atomic_read(&cred->usage));
#endif
security_cred_free(cred);
key_put(cred->session_keyring);
key_put(cred->process_keyring);
key_put(cred->thread_keyring);
key_put(cred->request_key_auth);
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
/**
* __put_cred - Destroy a set of credentials
* @cred: The record to release
*
* Destroy a set of credentials on which no references remain.
*/
void __put_cred(struct cred *cred)
{
kdebug("__put_cred(%p{%d,%d})", cred,
atomic_read(&cred->usage),
read_cred_subscribers(cred));
BUG_ON(atomic_read(&cred->usage) != 0);
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(cred) != 0);
cred->magic = CRED_MAGIC_DEAD;
cred->put_addr = __builtin_return_address(0);
#endif
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);
call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);
/*
* Clean up a task's credentials when it exits
*/
void exit_creds(struct task_struct *tsk)
{
struct cred *cred;
kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
atomic_read(&tsk->cred->usage),
read_cred_subscribers(tsk->cred));
cred = (struct cred *) tsk->real_cred;
tsk->real_cred = NULL;
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
cred = (struct cred *) tsk->cred;
tsk->cred = NULL;
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
}
/**
* get_task_cred - Get another task's objective credentials
* @task: The task to query
*
* Get the objective credentials of a task, pinning them so that they can't go
* away. Accessing a task's credentials directly is not permitted.
*
* The caller must also make sure task doesn't get deleted, either by holding a
* ref on task or by holding tasklist_lock to prevent it from being unlinked.
*/
const struct cred *get_task_cred(struct task_struct *task)
{
const struct cred *cred;
rcu_read_lock();
do {
cred = __task_cred((task));
BUG_ON(!cred);
} while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
rcu_read_unlock();
return cred;
}
/*
* Allocate blank credentials, such that the credentials can be filled in at a
* later date without risk of ENOMEM.
*/
struct cred *cred_alloc_blank(void)
{
struct cred *new;
new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
atomic_set(&new->usage, 1);
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
#endif
if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
goto error;
return new;
error:
abort_creds(new);
return NULL;
}
/**
* prepare_creds - Prepare a new set of credentials for modification
*
* Prepare a new set of task credentials for modification. A task's creds
* shouldn't generally be modified directly, therefore this function is used to
* prepare a new copy, which the caller then modifies and then commits by
* calling commit_creds().
*
* Preparation involves making a copy of the objective creds for modification.
*
* Returns a pointer to the new creds-to-be if successful, NULL otherwise.
*
* Call commit_creds() or abort_creds() to clean up.
*/
struct cred *prepare_creds(void)
{
struct task_struct *task = current;
const struct cred *old;
struct cred *new;
validate_process_creds();
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_creds() alloc %p", new);
old = task->cred;
memcpy(new, old, sizeof(struct cred));
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
get_user_ns(new->user_ns);
#ifdef CONFIG_KEYS
key_get(new->session_keyring);
key_get(new->process_keyring);
key_get(new->thread_keyring);
key_get(new->request_key_auth);
#endif
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
validate_creds(new);
return new;
error:
abort_creds(new);
return NULL;
}
EXPORT_SYMBOL(prepare_creds);
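/*
 * Usage sketch (illustrative only): the prepare / modify / commit sequence
 * as a set*id()-style system call would use it.  change_fsuid() and its
 * argument are hypothetical.
 */
#if 0
static int change_fsuid(kuid_t new_fsuid)
{
	struct cred *new;

	new = prepare_creds();
	if (!new)
		return -ENOMEM;

	new->fsuid = new_fsuid;
	return commit_creds(new);	/* always 0; consumes our reference */
}
#endif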
/*
* Prepare credentials for current to perform an execve()
* - The caller must hold ->cred_guard_mutex
*/
struct cred *prepare_exec_creds(void)
{
struct cred *new;
new = prepare_creds();
if (!new)
return new;
#ifdef CONFIG_KEYS
/* newly exec'd tasks don't get a thread keyring */
key_put(new->thread_keyring);
new->thread_keyring = NULL;
/* inherit the session keyring; new process keyring */
key_put(new->process_keyring);
new->process_keyring = NULL;
#endif
return new;
}
/*
* Copy credentials for the new process created by fork()
*
* We share if we can, but under some circumstances we have to generate a new
* set.
*
* The new process gets the current process's subjective credentials as its
* objective and subjective credentials.
*/
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
struct cred *new;
int ret;
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
#endif
clone_flags & CLONE_THREAD
) {
p->real_cred = get_cred(p->cred);
get_cred(p->cred);
alter_cred_subscribers(p->cred, 2);
kdebug("share_creds(%p{%d,%d})",
p->cred, atomic_read(&p->cred->usage),
read_cred_subscribers(p->cred));
atomic_inc(&p->cred->user->processes);
return 0;
}
new = prepare_creds();
if (!new)
return -ENOMEM;
if (clone_flags & CLONE_NEWUSER) {
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
}
#ifdef CONFIG_KEYS
/* new threads get their own thread keyrings if their parent already
* had one */
if (new->thread_keyring) {
key_put(new->thread_keyring);
new->thread_keyring = NULL;
if (clone_flags & CLONE_THREAD)
install_thread_keyring_to_cred(new);
}
/* The process keyring is only shared between the threads in a process;
* anything outside of those threads doesn't inherit.
*/
if (!(clone_flags & CLONE_THREAD)) {
key_put(new->process_keyring);
new->process_keyring = NULL;
}
#endif
atomic_inc(&new->user->processes);
p->cred = p->real_cred = get_cred(new);
alter_cred_subscribers(new, 2);
validate_creds(new);
return 0;
error_put:
put_cred(new);
return ret;
}
static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
{
const struct user_namespace *set_ns = set->user_ns;
const struct user_namespace *subset_ns = subset->user_ns;
/* If the two credentials are in the same user namespace see if
* the capabilities of subset are a subset of set.
*/
if (set_ns == subset_ns)
return cap_issubset(subset->cap_permitted, set->cap_permitted);
/* The credentials are in different user namespaces,
* therefore one is a subset of the other only if set is an
* ancestor of subset and set->euid is the owner of subset or one
* of subset's ancestors.
*/
for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
if ((set_ns == subset_ns->parent) &&
uid_eq(subset_ns->owner, set->euid))
return true;
}
return false;
}
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
*
* Install a new set of credentials to the current task, using RCU to replace
* the old set. Both the objective and the subjective credentials pointers are
* updated. This function may not be called if the subjective credentials are
* in an overridden state.
*
* This function eats the caller's reference to the new credentials.
*
* Always returns 0 thus allowing this function to be tail-called at the end
* of, say, sys_setgid().
*/
int commit_creds(struct cred *new)
{
struct task_struct *task = current;
const struct cred *old = task->real_cred;
kdebug("commit_creds(%p{%d,%d})", new,
atomic_read(&new->usage),
read_cred_subscribers(new));
BUG_ON(task->cred != old);
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(old) < 2);
validate_creds(old);
validate_creds(new);
#endif
BUG_ON(atomic_read(&new->usage) < 1);
get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */
if (!uid_eq(old->euid, new->euid) ||
!gid_eq(old->egid, new->egid) ||
!uid_eq(old->fsuid, new->fsuid) ||
!gid_eq(old->fsgid, new->fsgid) ||
!cred_cap_issubset(old, new)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
smp_wmb();
}
/* alter the thread keyring */
if (!uid_eq(new->fsuid, old->fsuid))
key_fsuid_changed(task);
if (!gid_eq(new->fsgid, old->fsgid))
key_fsgid_changed(task);
/* do it
* RLIMIT_NPROC limits on user->processes have already been checked
* in set_user().
*/
alter_cred_subscribers(new, 2);
if (new->user != old->user)
atomic_inc(&new->user->processes);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user)
atomic_dec(&old->user->processes);
alter_cred_subscribers(old, -2);
/* send notifications */
if (!uid_eq(new->uid, old->uid) ||
!uid_eq(new->euid, old->euid) ||
!uid_eq(new->suid, old->suid) ||
!uid_eq(new->fsuid, old->fsuid))
proc_id_connector(task, PROC_EVENT_UID);
if (!gid_eq(new->gid, old->gid) ||
!gid_eq(new->egid, old->egid) ||
!gid_eq(new->sgid, old->sgid) ||
!gid_eq(new->fsgid, old->fsgid))
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
put_cred(old);
put_cred(old);
return 0;
}
EXPORT_SYMBOL(commit_creds);
/**
* abort_creds - Discard a set of credentials and unlock the current task
* @new: The credentials that were going to be applied
*
* Discard a set of credentials that were under construction and unlock the
* current task.
*/
void abort_creds(struct cred *new)
{
kdebug("abort_creds(%p{%d,%d})", new,
atomic_read(&new->usage),
read_cred_subscribers(new));
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(new) != 0);
#endif
BUG_ON(atomic_read(&new->usage) < 1);
put_cred(new);
}
EXPORT_SYMBOL(abort_creds);
/**
* override_creds - Override the current process's subjective credentials
* @new: The credentials to be assigned
*
* Install a set of temporary override subjective credentials on the current
* process, returning the old set for later reversion.
*/
const struct cred *override_creds(const struct cred *new)
{
const struct cred *old = current->cred;
kdebug("override_creds(%p{%d,%d})", new,
atomic_read(&new->usage),
read_cred_subscribers(new));
validate_creds(old);
validate_creds(new);
get_cred(new);
alter_cred_subscribers(new, 1);
rcu_assign_pointer(current->cred, new);
alter_cred_subscribers(old, -1);
kdebug("override_creds() = %p{%d,%d}", old,
atomic_read(&old->usage),
read_cred_subscribers(old));
return old;
}
EXPORT_SYMBOL(override_creds);
/**
* revert_creds - Revert a temporary subjective credentials override
* @old: The credentials to be restored
*
* Revert a temporary set of override subjective credentials to an old set,
* discarding the override set.
*/
void revert_creds(const struct cred *old)
{
const struct cred *override = current->cred;
kdebug("revert_creds(%p{%d,%d})", old,
atomic_read(&old->usage),
read_cred_subscribers(old));
validate_creds(old);
validate_creds(override);
alter_cred_subscribers(old, 1);
rcu_assign_pointer(current->cred, old);
alter_cred_subscribers(override, -1);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
/*
* initialise the credentials stuff
*/
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
}
/**
* prepare_kernel_cred - Prepare a set of credentials for a kernel service
* @daemon: A userspace daemon to be used as a reference
*
* Prepare a set of credentials for a kernel service. This can then be used to
* override a task's own credentials so that work can be done on behalf of that
* task that requires a different subjective context.
*
* @daemon is used to provide a base for the security record, but can be NULL.
* If @daemon is supplied, then the security data will be derived from that;
* otherwise the UIDs/GIDs will be set to 0, with no groups, full capabilities and no keys.
*
* The caller may change these controls afterwards if desired.
*
* Returns the new credentials or NULL if out of memory.
*
* Does not take, and does not return holding current->cred_replace_mutex.
*/
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
const struct cred *old;
struct cred *new;
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_kernel_cred() alloc %p", new);
if (daemon)
old = get_task_cred(daemon);
else
old = get_cred(&init_cred);
validate_creds(old);
*new = *old;
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_uid(new->user);
get_user_ns(new->user_ns);
get_group_info(new->group_info);
#ifdef CONFIG_KEYS
new->session_keyring = NULL;
new->process_keyring = NULL;
new->thread_keyring = NULL;
new->request_key_auth = NULL;
new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
#endif
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
put_cred(old);
validate_creds(new);
return new;
error:
put_cred(new);
put_cred(old);
return NULL;
}
EXPORT_SYMBOL(prepare_kernel_cred);
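/*
 * Usage sketch (illustrative only): a kernel service doing work with init
 * credentials on behalf of a task, then reverting.  do_privileged_work()
 * and the work done under the override are hypothetical.
 */
#if 0
static int do_privileged_work(void)
{
	const struct cred *old;
	struct cred *kcred;
	int ret = 0;

	kcred = prepare_kernel_cred(NULL);
	if (!kcred)
		return -ENOMEM;

	old = override_creds(kcred);
	/* ... perform the privileged work here ... */
	revert_creds(old);

	put_cred(kcred);	/* drop the ref from prepare_kernel_cred() */
	return ret;
}
#endif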
/**
* set_security_override - Set the security ID in a set of credentials
* @new: The credentials to alter
* @secid: The LSM security ID to set
*
* Set the LSM security ID in a set of credentials so that the subjective
* security is overridden when an alternative set of credentials is used.
*/
int set_security_override(struct cred *new, u32 secid)
{
return security_kernel_act_as(new, secid);
}
EXPORT_SYMBOL(set_security_override);
/**
* set_security_override_from_ctx - Set the security ID in a set of credentials
* @new: The credentials to alter
* @secctx: The LSM security context to generate the security ID from.
*
* Set the LSM security ID in a set of credentials so that the subjective
* security is overridden when an alternative set of credentials is used. The
* security ID is specified in string form as a security context to be
* interpreted by the LSM.
*/
int set_security_override_from_ctx(struct cred *new, const char *secctx)
{
u32 secid;
int ret;
ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);
if (ret < 0)
return ret;
return set_security_override(new, secid);
}
EXPORT_SYMBOL(set_security_override_from_ctx);
/**
* set_create_files_as - Set the LSM file create context in a set of credentials
* @new: The credentials to alter
* @inode: The inode to take the context from
*
* Change the LSM file creation context in a set of credentials to be the same
* as the object context of the specified inode, so that the new inodes have
* the same MAC context as that inode.
*/
int set_create_files_as(struct cred *new, struct inode *inode)
{
new->fsuid = inode->i_uid;
new->fsgid = inode->i_gid;
return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
#ifdef CONFIG_DEBUG_CREDENTIALS
bool creds_are_invalid(const struct cred *cred)
{
if (cred->magic != CRED_MAGIC)
return true;
#ifdef CONFIG_SECURITY_SELINUX
/*
* cred->security == NULL if security_cred_alloc_blank() or
* security_prepare_creds() returned an error.
*/
if (selinux_is_enabled() && cred->security) {
if ((unsigned long) cred->security < PAGE_SIZE)
return true;
if ((*(u32 *)cred->security & 0xffffff00) ==
(POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
return true;
}
#endif
return false;
}
EXPORT_SYMBOL(creds_are_invalid);
/*
* dump invalid credentials
*/
static void dump_invalid_creds(const struct cred *cred, const char *label,
const struct task_struct *tsk)
{
printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
label, cred,
cred == &init_cred ? "[init]" : "",
cred == tsk->real_cred ? "[real]" : "",
cred == tsk->cred ? "[eff]" : "");
printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
cred->magic, cred->put_addr);
printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
atomic_read(&cred->usage),
read_cred_subscribers(cred));
printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
from_kuid_munged(&init_user_ns, cred->uid),
from_kuid_munged(&init_user_ns, cred->euid),
from_kuid_munged(&init_user_ns, cred->suid),
from_kuid_munged(&init_user_ns, cred->fsuid));
printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
from_kgid_munged(&init_user_ns, cred->gid),
from_kgid_munged(&init_user_ns, cred->egid),
from_kgid_munged(&init_user_ns, cred->sgid),
from_kgid_munged(&init_user_ns, cred->fsgid));
#ifdef CONFIG_SECURITY
printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
if ((unsigned long) cred->security >= PAGE_SIZE &&
(((unsigned long) cred->security & 0xffffff00) !=
(POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
printk(KERN_ERR "CRED: ->security {%x, %x}\n",
((u32*)cred->security)[0],
((u32*)cred->security)[1]);
#endif
}
/*
* report use of invalid credentials
*/
void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
{
printk(KERN_ERR "CRED: Invalid credentials\n");
printk(KERN_ERR "CRED: At %s:%u\n", file, line);
dump_invalid_creds(cred, "Specified", current);
BUG();
}
EXPORT_SYMBOL(__invalid_creds);
/*
* check the credentials on a process
*/
void __validate_process_creds(struct task_struct *tsk,
const char *file, unsigned line)
{
if (tsk->cred == tsk->real_cred) {
if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
creds_are_invalid(tsk->cred)))
goto invalid_creds;
} else {
if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
read_cred_subscribers(tsk->cred) < 1 ||
creds_are_invalid(tsk->real_cred) ||
creds_are_invalid(tsk->cred)))
goto invalid_creds;
}
return;
invalid_creds:
printk(KERN_ERR "CRED: Invalid process credentials\n");
printk(KERN_ERR "CRED: At %s:%u\n", file, line);
dump_invalid_creds(tsk->real_cred, "Real", tsk);
if (tsk->cred != tsk->real_cred)
dump_invalid_creds(tsk->cred, "Effective", tsk);
else
printk(KERN_ERR "CRED: Effective creds == Real creds\n");
BUG();
}
EXPORT_SYMBOL(__validate_process_creds);
/*
* check creds for do_exit()
*/
void validate_creds_for_do_exit(struct task_struct *tsk)
{
kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
tsk->real_cred, tsk->cred,
atomic_read(&tsk->cred->usage),
read_cred_subscribers(tsk->cred));
__validate_process_creds(tsk, __FILE__, __LINE__);
}
#endif /* CONFIG_DEBUG_CREDENTIALS */

6
kernel/debug/Makefile Normal file
View file

@ -0,0 +1,6 @@
#
# Makefile for the linux kernel debugger
#
obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o
obj-$(CONFIG_KGDB_KDB) += kdb/

1079
kernel/debug/debug_core.c Normal file

File diff suppressed because it is too large Load diff

85
kernel/debug/debug_core.h Normal file
View file

@ -0,0 +1,85 @@
/*
* Created by: Jason Wessel <jason.wessel@windriver.com>
*
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*
* This file is licensed under the terms of the GNU General Public
* License version 2. This program is licensed "as is" without any
* warranty of any kind, whether express or implied.
*/
#ifndef _DEBUG_CORE_H_
#define _DEBUG_CORE_H_
/*
* These are the private implementation headers between the kernel
* debugger core and the debugger front end code.
*/
/* kernel debug core data structures */
struct kgdb_state {
int ex_vector;
int signo;
int err_code;
int cpu;
int pass_exception;
unsigned long thr_query;
unsigned long threadid;
long kgdb_usethreadid;
struct pt_regs *linux_regs;
atomic_t *send_ready;
};
/* Exception state values */
#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
#define DCPU_SSTEP 0x8 /* CPU is single stepping */
struct debuggerinfo_struct {
void *debuggerinfo;
struct task_struct *task;
int exception_state;
int ret_state;
int irq_depth;
int enter_kgdb;
};
extern struct debuggerinfo_struct kgdb_info[];
/* kernel debug core break point routines */
extern int dbg_remove_all_break(void);
extern int dbg_set_sw_break(unsigned long addr);
extern int dbg_remove_sw_break(unsigned long addr);
extern int dbg_activate_sw_breakpoints(void);
extern int dbg_deactivate_sw_breakpoints(void);
/* polled character access to i/o module */
extern int dbg_io_get_char(void);
/* stub return value for switching between the gdbstub and kdb */
#define DBG_PASS_EVENT -12345
/* Switch from one cpu to another */
#define DBG_SWITCH_CPU_EVENT -123456
extern int dbg_switch_cpu;
/* gdbstub interface functions */
extern int gdb_serial_stub(struct kgdb_state *ks);
extern void gdbstub_msg_write(const char *s, int len);
/* gdbstub functions used for kdb <-> gdbstub transition */
extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
extern int dbg_kdb_mode;
#ifdef CONFIG_KGDB_KDB
extern int kdb_stub(struct kgdb_state *ks);
extern int kdb_parse(const char *cmdstr);
extern int kdb_common_init_state(struct kgdb_state *ks);
extern int kdb_common_deinit_state(void);
#else /* ! CONFIG_KGDB_KDB */
static inline int kdb_stub(struct kgdb_state *ks)
{
return DBG_PASS_EVENT;
}
#endif /* CONFIG_KGDB_KDB */
#endif /* _DEBUG_CORE_H_ */

1145
kernel/debug/gdbstub.c Normal file

File diff suppressed because it is too large Load diff

25
kernel/debug/kdb/Makefile Normal file
View file

@ -0,0 +1,25 @@
# This file is subject to the terms and conditions of the GNU General Public
# License. See the file "COPYING" in the main directory of this archive
# for more details.
#
# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
#
CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
clean-files := gen-kdb_cmds.c
quiet_cmd_gen-kdb = GENKDB $@
cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \
/^\#/{next} \
/^[ \t]*$$/{next} \
{gsub(/"/, "\\\"", $$0); \
print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \
END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \
$(filter-out %/Makefile,$^) > $@#
$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile
$(call cmd,gen-kdb)

553
kernel/debug/kdb/kdb_bp.c Normal file
View file

@ -0,0 +1,553 @@
/*
* Kernel Debugger Architecture Independent Breakpoint Handler
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
* Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*/
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/kdb.h>
#include <linux/kgdb.h>
#include <linux/smp.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include "kdb_private.h"
/*
* Table of kdb_breakpoints
*/
kdb_bp_t kdb_breakpoints[KDB_MAXBPT];
static void kdb_setsinglestep(struct pt_regs *regs)
{
KDB_STATE_SET(DOING_SS);
}
static char *kdb_rwtypes[] = {
"Instruction(i)",
"Instruction(Register)",
"Data Write",
"I/O",
"Data Access"
};
static char *kdb_bptype(kdb_bp_t *bp)
{
if (bp->bp_type < 0 || bp->bp_type > 4)
return "";
return kdb_rwtypes[bp->bp_type];
}
static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
{
int nextarg = *nextargp;
int diag;
bp->bph_length = 1;
if ((argc + 1) != nextarg) {
if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0)
bp->bp_type = BP_ACCESS_WATCHPOINT;
else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
bp->bp_type = BP_WRITE_WATCHPOINT;
else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0)
bp->bp_type = BP_HARDWARE_BREAKPOINT;
else
return KDB_ARGCOUNT;
bp->bph_length = 1;
nextarg++;
if ((argc + 1) != nextarg) {
unsigned long len;
diag = kdbgetularg((char *)argv[nextarg],
&len);
if (diag)
return diag;
if (len > 8)
return KDB_BADLENGTH;
bp->bph_length = len;
nextarg++;
}
if ((argc + 1) != nextarg)
return KDB_ARGCOUNT;
}
*nextargp = nextarg;
return 0;
}
static int _kdb_bp_remove(kdb_bp_t *bp)
{
int ret = 1;
if (!bp->bp_installed)
return ret;
if (!bp->bp_type)
ret = dbg_remove_sw_break(bp->bp_addr);
else
ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr,
bp->bph_length,
bp->bp_type);
if (ret == 0)
bp->bp_installed = 0;
return ret;
}
static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp)
{
if (KDB_DEBUG(BP))
kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs));
/*
* Setup single step
*/
kdb_setsinglestep(regs);
/*
* Reset delay attribute
*/
bp->bp_delay = 0;
bp->bp_delayed = 1;
}
static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
{
int ret;
/*
* Install the breakpoint, if it is not already installed.
*/
if (KDB_DEBUG(BP))
kdb_printf("%s: bp_installed %d\n",
__func__, bp->bp_installed);
if (!KDB_STATE(SSBPT))
bp->bp_delay = 0;
if (bp->bp_installed)
return 1;
if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) {
if (KDB_DEBUG(BP))
kdb_printf("%s: delayed bp\n", __func__);
kdb_handle_bp(regs, bp);
return 0;
}
if (!bp->bp_type)
ret = dbg_set_sw_break(bp->bp_addr);
else
ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr,
bp->bph_length,
bp->bp_type);
if (ret == 0) {
bp->bp_installed = 1;
} else {
kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
__func__, bp->bp_addr);
#ifdef CONFIG_DEBUG_RODATA
if (!bp->bp_type) {
kdb_printf("Software breakpoints are unavailable.\n"
" Change the kernel CONFIG_DEBUG_RODATA=n\n"
" OR use hw breaks: help bph\n");
}
#endif
return 1;
}
return 0;
}
/*
* kdb_bp_install
*
* Install kdb_breakpoints prior to returning from the
* kernel debugger. This allows the kdb_breakpoints to be set
* upon functions that are used internally by kdb, such as
* printk(). This function is only called once per kdb session.
*/
void kdb_bp_install(struct pt_regs *regs)
{
int i;
for (i = 0; i < KDB_MAXBPT; i++) {
kdb_bp_t *bp = &kdb_breakpoints[i];
if (KDB_DEBUG(BP)) {
kdb_printf("%s: bp %d bp_enabled %d\n",
__func__, i, bp->bp_enabled);
}
if (bp->bp_enabled)
_kdb_bp_install(regs, bp);
}
}
/*
* kdb_bp_remove
*
* Remove kdb_breakpoints upon entry to the kernel debugger.
*
* Parameters:
* None.
* Outputs:
* None.
* Returns:
* None.
* Locking:
* None.
* Remarks:
*/
void kdb_bp_remove(void)
{
int i;
for (i = KDB_MAXBPT - 1; i >= 0; i--) {
kdb_bp_t *bp = &kdb_breakpoints[i];
if (KDB_DEBUG(BP)) {
kdb_printf("%s: bp %d bp_enabled %d\n",
__func__, i, bp->bp_enabled);
}
if (bp->bp_enabled)
_kdb_bp_remove(bp);
}
}
/*
* kdb_printbp
*
* Internal function to format and print a breakpoint entry.
*
* Parameters:
* None.
* Outputs:
* None.
* Returns:
* None.
* Locking:
* None.
* Remarks:
*/
static void kdb_printbp(kdb_bp_t *bp, int i)
{
kdb_printf("%s ", kdb_bptype(bp));
kdb_printf("BP #%d at ", i);
kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
if (bp->bp_enabled)
kdb_printf("\n is enabled");
else
kdb_printf("\n is disabled");
kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
bp->bp_addr, bp->bp_type, bp->bp_installed);
kdb_printf("\n");
}
/*
* kdb_bp
*
* Handle the bp commands.
*
* [bp|bph] <addr-expression> [DATAR|DATAW]
*
* Parameters:
* argc Count of arguments in argv
* argv Space delimited command line arguments
* Outputs:
* None.
* Returns:
* Zero for success, a kdb diagnostic if failure.
* Locking:
* None.
* Remarks:
*
* bp Set breakpoint on all cpus. Only use hardware assist if needed.
* bph Set breakpoint on all cpus. Force use of a hardware register.
*/
static int kdb_bp(int argc, const char **argv)
{
int i, bpno;
kdb_bp_t *bp, *bp_check;
int diag;
char *symname = NULL;
long offset = 0ul;
int nextarg;
kdb_bp_t template = {0};
if (argc == 0) {
/*
* Display breakpoint table
*/
for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT;
bpno++, bp++) {
if (bp->bp_free)
continue;
kdb_printbp(bp, bpno);
}
return 0;
}
nextarg = 1;
diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr,
&offset, &symname);
if (diag)
return diag;
if (!template.bp_addr)
return KDB_BADINT;
/*
* Find an empty bp structure to allocate
*/
for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
if (bp->bp_free)
break;
}
if (bpno == KDB_MAXBPT)
return KDB_TOOMANYBPT;
if (strcmp(argv[0], "bph") == 0) {
template.bp_type = BP_HARDWARE_BREAKPOINT;
diag = kdb_parsebp(argc, argv, &nextarg, &template);
if (diag)
return diag;
} else {
template.bp_type = BP_BREAKPOINT;
}
/*
* Check for clashing breakpoints.
*
* Note, in this design we can't have hardware breakpoints
* enabled for both read and write on the same address.
*/
for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT;
i++, bp_check++) {
if (!bp_check->bp_free &&
bp_check->bp_addr == template.bp_addr) {
kdb_printf("You already have a breakpoint at "
kdb_bfd_vma_fmt0 "\n", template.bp_addr);
return KDB_DUPBPT;
}
}
template.bp_enabled = 1;
/*
* Actually allocate the breakpoint found earlier
*/
*bp = template;
bp->bp_free = 0;
kdb_printbp(bp, bpno);
return 0;
}
/*
* kdb_bc
*
* Handles the 'bc', 'be', and 'bd' commands
*
* [bd|bc|be] <breakpoint-number>
* [bd|bc|be] *
*
* Parameters:
* argc Count of arguments in argv
* argv Space delimited command line arguments
* Outputs:
* None.
* Returns:
* Zero for success, a kdb diagnostic for failure
* Locking:
* None.
* Remarks:
*/
static int kdb_bc(int argc, const char **argv)
{
unsigned long addr;
kdb_bp_t *bp = NULL;
int lowbp = KDB_MAXBPT;
int highbp = 0;
int done = 0;
int i;
int diag = 0;
int cmd; /* KDBCMD_B? */
#define KDBCMD_BC 0
#define KDBCMD_BE 1
#define KDBCMD_BD 2
if (strcmp(argv[0], "be") == 0)
cmd = KDBCMD_BE;
else if (strcmp(argv[0], "bd") == 0)
cmd = KDBCMD_BD;
else
cmd = KDBCMD_BC;
if (argc != 1)
return KDB_ARGCOUNT;
if (strcmp(argv[1], "*") == 0) {
lowbp = 0;
highbp = KDB_MAXBPT;
} else {
diag = kdbgetularg(argv[1], &addr);
if (diag)
return diag;
/*
* For addresses less than the maximum breakpoint number,
* assume that the breakpoint number is desired.
*/
if (addr < KDB_MAXBPT) {
bp = &kdb_breakpoints[addr];
lowbp = highbp = addr;
highbp++;
} else {
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT;
i++, bp++) {
if (bp->bp_addr == addr) {
lowbp = highbp = i;
highbp++;
break;
}
}
}
}
/*
* Now operate on the set of breakpoints matching the input
* criteria (either '*' for all, or an individual breakpoint).
*/
for (bp = &kdb_breakpoints[lowbp], i = lowbp;
i < highbp;
i++, bp++) {
if (bp->bp_free)
continue;
done++;
switch (cmd) {
case KDBCMD_BC:
bp->bp_enabled = 0;
kdb_printf("Breakpoint %d at "
kdb_bfd_vma_fmt " cleared\n",
i, bp->bp_addr);
bp->bp_addr = 0;
bp->bp_free = 1;
break;
case KDBCMD_BE:
bp->bp_enabled = 1;
kdb_printf("Breakpoint %d at "
kdb_bfd_vma_fmt " enabled",
i, bp->bp_addr);
kdb_printf("\n");
break;
case KDBCMD_BD:
if (!bp->bp_enabled)
break;
bp->bp_enabled = 0;
kdb_printf("Breakpoint %d at "
kdb_bfd_vma_fmt " disabled\n",
i, bp->bp_addr);
break;
}
if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) {
bp->bp_delay = 0;
KDB_STATE_CLEAR(SSBPT);
}
}
return (!done) ? KDB_BPTNOTFOUND : 0;
}
/*
* kdb_ss
*
* Process the 'ss' (Single Step) command.
*
* ss
*
* Parameters:
* argc Argument count
* argv Argument vector
* Outputs:
* None.
* Returns:
* KDB_CMD_SS for success, a kdb error if failure.
* Locking:
* None.
* Remarks:
*
* Set the arch specific option to trigger a debug trap after the next
* instruction.
*/
static int kdb_ss(int argc, const char **argv)
{
if (argc != 0)
return KDB_ARGCOUNT;
/*
* Set trace flag and go.
*/
KDB_STATE_SET(DOING_SS);
return KDB_CMD_SS;
}
/* Initialize the breakpoint table and register breakpoint commands. */
void __init kdb_initbptab(void)
{
int i;
kdb_bp_t *bp;
/*
* First time initialization.
*/
memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints));
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
bp->bp_free = 1;
kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
"Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
"Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
"[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
kdb_register_repeat("bc", kdb_bc, "<bpnum>",
"Clear Breakpoint", 0, KDB_REPEAT_NONE);
kdb_register_repeat("be", kdb_bc, "<bpnum>",
"Enable Breakpoint", 0, KDB_REPEAT_NONE);
kdb_register_repeat("bd", kdb_bc, "<bpnum>",
"Disable Breakpoint", 0, KDB_REPEAT_NONE);
kdb_register_repeat("ss", kdb_ss, "",
"Single Step", 1, KDB_REPEAT_NO_ARGS);
/*
* Architecture dependent initialization.
*/
}
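/*
 * Registration sketch (illustrative only): an extra kdb command would be
 * added with the same kdb_register_repeat() pattern used above.  The
 * "hello" command and its handler are hypothetical.
 */
#if 0
static int kdb_hello(int argc, const char **argv)
{
	if (argc != 0)
		return KDB_ARGCOUNT;
	kdb_printf("hello from kdb\n");
	return 0;
}

static void __init kdb_register_hello(void)
{
	kdb_register_repeat("hello", kdb_hello, "",
			    "Print a greeting", 0, KDB_REPEAT_NONE);
}
#endif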

210
kernel/debug/kdb/kdb_bt.c Normal file
View file

@ -0,0 +1,210 @@
/*
* Kernel Debugger Architecture Independent Stack Traceback
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
* Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*/
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/kdb.h>
#include <linux/nmi.h>
#include "kdb_private.h"
static void kdb_show_stack(struct task_struct *p, void *addr)
{
int old_lvl = console_loglevel;
console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
kdb_set_current_task(p);
if (addr) {
show_stack((struct task_struct *)p, addr);
} else if (kdb_current_regs) {
#ifdef CONFIG_X86
show_stack(p, &kdb_current_regs->sp);
#else
show_stack(p, NULL);
#endif
} else {
show_stack(p, NULL);
}
console_loglevel = old_lvl;
kdb_trap_printk--;
}
/*
* kdb_bt
*
* This function implements the 'bt' command. Print a stack
* traceback.
*
* bt [<address-expression>] (addr-exp is for alternate stacks)
* btp <pid> Kernel stack for <pid>
* btt <address-expression> Kernel stack for task structure at
* <address-expression>
* bta [DRSTCZEUIMA] All useful processes, optionally
* filtered by state
* btc [<cpu>] The current process on one cpu,
* default is all cpus
*
* bt <address-expression> refers to an address on the stack; that location
* is assumed to contain a return address.
*
* btt <address-expression> refers to the address of a struct task.
*
* Inputs:
* argc argument count
* argv argument vector
* Outputs:
* None.
* Returns:
* zero for success, a kdb diagnostic if error
* Locking:
* none.
* Remarks:
* Backtracing works best when the code uses frame pointers. But even
* without frame pointers we should get a reasonable trace.
*
* mds comes in handy when examining the stack to do a manual traceback or
* to get a starting point for bt <address-expression>.
*/
static int
kdb_bt1(struct task_struct *p, unsigned long mask,
int argcount, int btaprompt)
{
char buffer[2];
if (kdb_getarea(buffer[0], (unsigned long)p) ||
kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
return KDB_BADADDR;
if (!kdb_task_state(p, mask))
return 0;
kdb_printf("Stack traceback for pid %d\n", p->pid);
kdb_ps1(p);
kdb_show_stack(p, NULL);
if (btaprompt) {
kdb_getstr(buffer, sizeof(buffer),
"Enter <q> to end, <cr> to continue:");
if (buffer[0] == 'q') {
kdb_printf("\n");
return 1;
}
}
touch_nmi_watchdog();
return 0;
}
int
kdb_bt(int argc, const char **argv)
{
int diag;
int argcount = 5;
int btaprompt = 1;
int nextarg;
unsigned long addr;
long offset;
/* Prompt after each proc in bta */
kdbgetintenv("BTAPROMPT", &btaprompt);
if (strcmp(argv[0], "bta") == 0) {
struct task_struct *g, *p;
unsigned long cpu;
unsigned long mask = kdb_task_state_string(argc ? argv[1] :
NULL);
if (argc == 0)
kdb_ps_suppressed();
/* Run the active tasks first */
for_each_online_cpu(cpu) {
p = kdb_curr_task(cpu);
if (kdb_bt1(p, mask, argcount, btaprompt))
return 0;
}
/* Now the inactive tasks */
kdb_do_each_thread(g, p) {
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
if (task_curr(p))
continue;
if (kdb_bt1(p, mask, argcount, btaprompt))
return 0;
} kdb_while_each_thread(g, p);
} else if (strcmp(argv[0], "btp") == 0) {
struct task_struct *p;
unsigned long pid;
if (argc != 1)
return KDB_ARGCOUNT;
diag = kdbgetularg((char *)argv[1], &pid);
if (diag)
return diag;
p = find_task_by_pid_ns(pid, &init_pid_ns);
if (p) {
kdb_set_current_task(p);
return kdb_bt1(p, ~0UL, argcount, 0);
}
kdb_printf("No process with pid == %ld found\n", pid);
return 0;
} else if (strcmp(argv[0], "btt") == 0) {
if (argc != 1)
return KDB_ARGCOUNT;
diag = kdbgetularg((char *)argv[1], &addr);
if (diag)
return diag;
kdb_set_current_task((struct task_struct *)addr);
return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
} else if (strcmp(argv[0], "btc") == 0) {
unsigned long cpu = ~0;
struct task_struct *save_current_task = kdb_current_task;
char buf[80];
if (argc > 1)
return KDB_ARGCOUNT;
if (argc == 1) {
diag = kdbgetularg((char *)argv[1], &cpu);
if (diag)
return diag;
}
/* Recursive use of kdb_parse, do not use argv after
* this point */
argv = NULL;
if (cpu != ~0) {
if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
kdb_printf("no process for cpu %ld\n", cpu);
return 0;
}
sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
kdb_parse(buf);
return 0;
}
kdb_printf("btc: cpu status: ");
kdb_parse("cpu\n");
for_each_online_cpu(cpu) {
sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
kdb_parse(buf);
touch_nmi_watchdog();
}
kdb_set_current_task(save_current_task);
return 0;
} else {
if (argc) {
nextarg = 1;
diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
&offset, NULL);
if (diag)
return diag;
kdb_show_stack(kdb_current_task, (void *)addr);
return 0;
} else {
return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
}
}
/* NOTREACHED */
return 0;
}

31
kernel/debug/kdb/kdb_cmds Normal file
View file

@ -0,0 +1,31 @@
# Initial commands for kdb, alter to suit your needs.
# These commands are executed in kdb_init() context, no SMP, no
# processes. Commands that require process data (including stack or
# registers) are not reliable this early. set and bp commands should
# be safe. Global breakpoint commands affect each cpu as it is booted.
# Standard debugging information for first level support, just type archkdb
# or archkdbcpu or archkdbshort at the kdb prompt.
defcmd dumpcommon "" "Common kdb debugging"
set BTAPROMPT 0
set LINES 10000
-summary
-cpu
-ps
-dmesg 600
-bt
endefcmd
defcmd dumpall "" "First line debugging"
pid R
-dumpcommon
-bta
endefcmd
defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
pid R
-dumpcommon
-btc
endefcmd
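# Example of a site-local macro in the same style (illustrative only,
# left commented out):
# defcmd dumpregs "" "Registers plus a short backtrace"
#   rd
#   -bt
# endefcmd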

182
kernel/debug/kdb/kdb_debugger.c Normal file
View file

@ -0,0 +1,182 @@
/*
* Created by: Jason Wessel <jason.wessel@windriver.com>
*
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*
* This file is licensed under the terms of the GNU General Public
* License version 2. This program is licensed "as is" without any
* warranty of any kind, whether express or implied.
*/
#include <linux/kgdb.h>
#include <linux/kdb.h>
#include <linux/kdebug.h>
#include <linux/export.h>
#include <linux/hardirq.h>
#include "kdb_private.h"
#include "../debug_core.h"
/*
* KDB interface to KGDB internals
*/
get_char_func kdb_poll_funcs[] = {
dbg_io_get_char,
NULL,
NULL,
NULL,
NULL,
NULL,
};
EXPORT_SYMBOL_GPL(kdb_poll_funcs);
int kdb_poll_idx = 1;
EXPORT_SYMBOL_GPL(kdb_poll_idx);
static struct kgdb_state *kdb_ks;
int kdb_common_init_state(struct kgdb_state *ks)
{
kdb_initial_cpu = atomic_read(&kgdb_active);
kdb_current_task = kgdb_info[ks->cpu].task;
kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
return 0;
}
int kdb_common_deinit_state(void)
{
kdb_initial_cpu = -1;
kdb_current_task = NULL;
kdb_current_regs = NULL;
return 0;
}
int kdb_stub(struct kgdb_state *ks)
{
int error = 0;
kdb_bp_t *bp;
unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
kdb_reason_t reason = KDB_REASON_OOPS;
kdb_dbtrap_t db_result = KDB_DB_NOBPT;
int i;
kdb_ks = ks;
if (KDB_STATE(REENTRY)) {
reason = KDB_REASON_SWITCH;
KDB_STATE_CLEAR(REENTRY);
addr = instruction_pointer(ks->linux_regs);
}
ks->pass_exception = 0;
if (atomic_read(&kgdb_setting_breakpoint))
reason = KDB_REASON_KEYBOARD;
if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
reason = KDB_REASON_SYSTEM_NMI;
else if (in_nmi())
reason = KDB_REASON_NMI;
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
reason = KDB_REASON_BREAK;
db_result = KDB_DB_BPT;
if (addr != instruction_pointer(ks->linux_regs))
kgdb_arch_set_pc(ks->linux_regs, addr);
break;
}
}
if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) {
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
if (bp->bp_free)
continue;
if (bp->bp_addr == addr) {
bp->bp_delay = 1;
bp->bp_delayed = 1;
/*
* SSBPT is set when the kernel debugger must single step a
* task in order to re-establish an instruction breakpoint
* which uses the instruction replacement mechanism. It is
* cleared by any action that removes the need to single-step
* the breakpoint.
*/
reason = KDB_REASON_BREAK;
db_result = KDB_DB_BPT;
KDB_STATE_SET(SSBPT);
break;
}
}
}
if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
ks->signo == SIGTRAP) {
reason = KDB_REASON_SSTEP;
db_result = KDB_DB_BPT;
}
/* Set initial kdb state variables */
KDB_STATE_CLEAR(KGDB_TRANS);
kdb_common_init_state(ks);
/* Remove any breakpoints as needed by kdb and clear single step */
kdb_bp_remove();
KDB_STATE_CLEAR(DOING_SS);
KDB_STATE_SET(PAGER);
/* zero out any offline cpu data */
for_each_present_cpu(i) {
if (!cpu_online(i)) {
kgdb_info[i].debuggerinfo = NULL;
kgdb_info[i].task = NULL;
}
}
if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
ks->pass_exception = 1;
KDB_FLAG_SET(CATASTROPHIC);
}
if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
KDB_STATE_CLEAR(SSBPT);
KDB_STATE_CLEAR(DOING_SS);
} else {
/* Start kdb main loop */
error = kdb_main_loop(KDB_REASON_ENTER, reason,
ks->err_code, db_result, ks->linux_regs);
}
/*
* Upon exit from the kdb main loop setup break points and restart
* the system based on the requested continue state
*/
kdb_common_deinit_state();
KDB_STATE_CLEAR(PAGER);
kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
if (KDB_STATE(DOING_KGDB))
KDB_STATE_CLEAR(DOING_KGDB);
return DBG_PASS_EVENT;
}
kdb_bp_install(ks->linux_regs);
dbg_activate_sw_breakpoints();
/* Set the exit state to a single step or a continue */
if (KDB_STATE(DOING_SS))
gdbstub_state(ks, "s");
else
gdbstub_state(ks, "c");
KDB_FLAG_CLEAR(CATASTROPHIC);
/* Invoke arch specific exception handling prior to system resume */
kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
if (ks->pass_exception)
kgdb_info[ks->cpu].ret_state = 1;
if (error == KDB_CMD_CPU) {
KDB_STATE_SET(REENTRY);
/*
* Force clear the single step bit because kdb emulates this
* differently vs the gdbstub
*/
kgdb_single_step = 0;
dbg_deactivate_sw_breakpoints();
return DBG_SWITCH_CPU_EVENT;
}
return kgdb_info[ks->cpu].ret_state;
}
void kdb_gdb_state_pass(char *buf)
{
gdbstub_state(kdb_ks, buf);
}

860
kernel/debug/kdb/kdb_io.c Normal file
View file

@ -0,0 +1,860 @@
/*
* Kernel Debugger Architecture Independent Console I/O handler
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
* Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/kdev_t.h>
#include <linux/console.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <linux/delay.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
#include <linux/kallsyms.h>
#include "kdb_private.h"
#define CMD_BUFLEN 256
char kdb_prompt_str[CMD_BUFLEN];
int kdb_trap_printk;
static int kgdb_transition_check(char *buffer)
{
if (buffer[0] != '+' && buffer[0] != '$') {
KDB_STATE_SET(KGDB_TRANS);
kdb_printf("%s", buffer);
} else {
int slen = strlen(buffer);
if (slen > 3 && buffer[slen - 3] == '#') {
kdb_gdb_state_pass(buffer);
strcpy(buffer, "kgdb");
KDB_STATE_SET(DOING_KGDB);
return 1;
}
}
return 0;
}
static int kdb_read_get_key(char *buffer, size_t bufsize)
{
#define ESCAPE_UDELAY 1000
#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
char *ped = escape_data;
int escape_delay = 0;
get_char_func *f, *f_escape = NULL;
int key;
for (f = &kdb_poll_funcs[0]; ; ++f) {
if (*f == NULL) {
/* Reset NMI watchdog once per poll loop */
touch_nmi_watchdog();
f = &kdb_poll_funcs[0];
}
if (escape_delay == 2) {
*ped = '\0';
ped = escape_data;
--escape_delay;
}
if (escape_delay == 1) {
key = *ped++;
if (!*ped)
--escape_delay;
break;
}
key = (*f)();
if (key == -1) {
if (escape_delay) {
udelay(ESCAPE_UDELAY);
--escape_delay;
}
continue;
}
if (bufsize <= 2) {
if (key == '\r')
key = '\n';
*buffer++ = key;
*buffer = '\0';
return -1;
}
if (escape_delay == 0 && key == '\e') {
escape_delay = ESCAPE_DELAY;
ped = escape_data;
f_escape = f;
}
if (escape_delay) {
*ped++ = key;
if (f_escape != f) {
escape_delay = 2;
continue;
}
if (ped - escape_data == 1) {
/* \e */
continue;
} else if (ped - escape_data == 2) {
/* \e<something> */
if (key != '[')
escape_delay = 2;
continue;
} else if (ped - escape_data == 3) {
/* \e[<something> */
int mapkey = 0;
switch (key) {
case 'A': /* \e[A, up arrow */
mapkey = 16;
break;
case 'B': /* \e[B, down arrow */
mapkey = 14;
break;
case 'C': /* \e[C, right arrow */
mapkey = 6;
break;
case 'D': /* \e[D, left arrow */
mapkey = 2;
break;
case '1': /* dropthrough */
case '3': /* dropthrough */
/* \e[<1,3,4>], may be home, del, end */
case '4':
mapkey = -1;
break;
}
if (mapkey != -1) {
if (mapkey > 0) {
escape_data[0] = mapkey;
escape_data[1] = '\0';
}
escape_delay = 2;
}
continue;
} else if (ped - escape_data == 4) {
/* \e[<1,3,4><something> */
int mapkey = 0;
if (key == '~') {
switch (escape_data[2]) {
case '1': /* \e[1~, home */
mapkey = 1;
break;
case '3': /* \e[3~, del */
mapkey = 4;
break;
case '4': /* \e[4~, end */
mapkey = 5;
break;
}
}
if (mapkey > 0) {
escape_data[0] = mapkey;
escape_data[1] = '\0';
}
escape_delay = 2;
continue;
}
}
break; /* A key to process */
}
return key;
}
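/*
 * Worked example of the escape state machine above (illustrative):
 * pressing the up-arrow key delivers the bytes '\e', '[', 'A'.  They
 * are collected in escape_data[] and collapsed to the single internal
 * code that kdb_read() expects, exactly as if one byte had been read:
 *
 *	"\e[A"  -> 16 (up)	"\e[B"  -> 14 (down)
 *	"\e[C"  ->  6 (right)	"\e[D"  ->  2 (left)
 *	"\e[1~" ->  1 (home)	"\e[3~" ->  4 (del)	"\e[4~" -> 5 (end)
 */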
/*
* kdb_read
*
* This function reads a string of characters, terminated by
* a newline, or by reaching the end of the supplied buffer,
* from the current kernel debugger console device.
* Parameters:
* buffer - Address of character buffer to receive input characters.
* bufsize - size, in bytes, of the character buffer
* Returns:
* Returns a pointer to the buffer containing the received
* character string. This string will be terminated by a
* newline character.
* Locking:
* No locks are required to be held upon entry to this
 * function.  It is not reentrant; it relies on the fact that
 * kdb runs on only one "master debug" cpu at a time.
* Remarks:
*
* The buffer size must be >= 2. A buffer size of 2 means that the caller only
* wants a single key.
*
* An escape key could be the start of a vt100 control sequence such as \e[D
* (left arrow) or it could be a character in its own right. The standard
* method for detecting the difference is to wait for 2 seconds to see if there
* are any other characters. kdb is complicated by the lack of a timer service
* (interrupts are off), by multiple input sources and by the need to sometimes
* return after just one key. Escape sequence processing has to be done as
* states in the polling loop.
*/
static char *kdb_read(char *buffer, size_t bufsize)
{
char *cp = buffer;
char *bufend = buffer+bufsize-2; /* Reserve space for newline
* and null byte */
char *lastchar;
char *p_tmp;
char tmp;
static char tmpbuffer[CMD_BUFLEN];
int len = strlen(buffer);
int len_tmp;
int tab = 0;
int count;
int i;
int diag, dtab_count;
int key;
static int last_crlf;
diag = kdbgetintenv("DTABCOUNT", &dtab_count);
if (diag)
dtab_count = 30;
if (len > 0) {
cp += len;
if (*(buffer+len-1) == '\n')
cp--;
}
lastchar = cp;
*cp = '\0';
kdb_printf("%s", buffer);
poll_again:
key = kdb_read_get_key(buffer, bufsize);
if (key == -1)
return buffer;
if (key != 9)
tab = 0;
if (key != 10 && key != 13)
last_crlf = 0;
switch (key) {
case 8: /* backspace */
if (cp > buffer) {
if (cp < lastchar) {
memcpy(tmpbuffer, cp, lastchar - cp);
memcpy(cp-1, tmpbuffer, lastchar - cp);
}
*(--lastchar) = '\0';
--cp;
kdb_printf("\b%s \r", cp);
tmp = *cp;
*cp = '\0';
kdb_printf(kdb_prompt_str);
kdb_printf("%s", buffer);
*cp = tmp;
}
break;
case 10: /* new line */
case 13: /* carriage return */
/* handle \n after \r */
if (last_crlf && last_crlf != key)
break;
last_crlf = key;
*lastchar++ = '\n';
*lastchar++ = '\0';
if (!KDB_STATE(KGDB_TRANS)) {
KDB_STATE_SET(KGDB_TRANS);
kdb_printf("%s", buffer);
}
kdb_printf("\n");
return buffer;
case 4: /* Del */
if (cp < lastchar) {
memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
memcpy(cp, tmpbuffer, lastchar - cp - 1);
*(--lastchar) = '\0';
kdb_printf("%s \r", cp);
tmp = *cp;
*cp = '\0';
kdb_printf(kdb_prompt_str);
kdb_printf("%s", buffer);
*cp = tmp;
}
break;
case 1: /* Home */
if (cp > buffer) {
kdb_printf("\r");
kdb_printf(kdb_prompt_str);
cp = buffer;
}
break;
case 5: /* End */
if (cp < lastchar) {
kdb_printf("%s", cp);
cp = lastchar;
}
break;
case 2: /* Left */
if (cp > buffer) {
kdb_printf("\b");
--cp;
}
break;
case 14: /* Down */
memset(tmpbuffer, ' ',
strlen(kdb_prompt_str) + (lastchar-buffer));
*(tmpbuffer+strlen(kdb_prompt_str) +
(lastchar-buffer)) = '\0';
kdb_printf("\r%s\r", tmpbuffer);
*lastchar = (char)key;
*(lastchar+1) = '\0';
return lastchar;
case 6: /* Right */
if (cp < lastchar) {
kdb_printf("%c", *cp);
++cp;
}
break;
case 16: /* Up */
memset(tmpbuffer, ' ',
strlen(kdb_prompt_str) + (lastchar-buffer));
*(tmpbuffer+strlen(kdb_prompt_str) +
(lastchar-buffer)) = '\0';
kdb_printf("\r%s\r", tmpbuffer);
*lastchar = (char)key;
*(lastchar+1) = '\0';
return lastchar;
case 9: /* Tab */
if (tab < 2)
++tab;
p_tmp = buffer;
while (*p_tmp == ' ')
p_tmp++;
if (p_tmp > cp)
break;
memcpy(tmpbuffer, p_tmp, cp-p_tmp);
*(tmpbuffer + (cp-p_tmp)) = '\0';
p_tmp = strrchr(tmpbuffer, ' ');
if (p_tmp)
++p_tmp;
else
p_tmp = tmpbuffer;
len = strlen(p_tmp);
count = kallsyms_symbol_complete(p_tmp,
sizeof(tmpbuffer) -
(p_tmp - tmpbuffer));
if (tab == 2 && count > 0) {
kdb_printf("\n%d symbols are found.", count);
if (count > dtab_count) {
count = dtab_count;
kdb_printf(" But only first %d symbols will"
" be printed.\nYou can change the"
" environment variable DTABCOUNT.",
count);
}
kdb_printf("\n");
for (i = 0; i < count; i++) {
if (kallsyms_symbol_next(p_tmp, i) < 0)
break;
kdb_printf("%s ", p_tmp);
*(p_tmp + len) = '\0';
}
if (i >= dtab_count)
kdb_printf("...");
kdb_printf("\n");
kdb_printf(kdb_prompt_str);
kdb_printf("%s", buffer);
} else if (tab != 2 && count > 0) {
len_tmp = strlen(p_tmp);
strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
len_tmp = strlen(p_tmp);
strncpy(cp, p_tmp+len, len_tmp-len + 1);
len = len_tmp - len;
kdb_printf("%s", cp);
cp += len;
lastchar += len;
}
kdb_nextline = 1; /* reset output line number */
break;
default:
if (key >= 32 && lastchar < bufend) {
if (cp < lastchar) {
memcpy(tmpbuffer, cp, lastchar - cp);
memcpy(cp+1, tmpbuffer, lastchar - cp);
*++lastchar = '\0';
*cp = key;
kdb_printf("%s\r", cp);
++cp;
tmp = *cp;
*cp = '\0';
kdb_printf(kdb_prompt_str);
kdb_printf("%s", buffer);
*cp = tmp;
} else {
*++lastchar = '\0';
*cp++ = key;
/* The kgdb transition check will hide
* printed characters if we think that
* kgdb is connecting, until the check
* fails */
if (!KDB_STATE(KGDB_TRANS)) {
if (kgdb_transition_check(buffer))
return buffer;
} else {
kdb_printf("%c", key);
}
}
/* Special escape to kgdb */
if (lastchar - buffer >= 5 &&
strcmp(lastchar - 5, "$?#3f") == 0) {
kdb_gdb_state_pass(lastchar - 5);
strcpy(buffer, "kgdb");
KDB_STATE_SET(DOING_KGDB);
return buffer;
}
if (lastchar - buffer >= 11 &&
strcmp(lastchar - 11, "$qSupported") == 0) {
kdb_gdb_state_pass(lastchar - 11);
strcpy(buffer, "kgdb");
KDB_STATE_SET(DOING_KGDB);
return buffer;
}
}
break;
}
goto poll_again;
}
/*
* kdb_getstr
*
* Print the prompt string and read a command from the
* input device.
*
* Parameters:
* buffer Address of buffer to receive command
* bufsize Size of buffer in bytes
* prompt Pointer to string to use as prompt string
* Returns:
* Pointer to command buffer.
* Locking:
* None.
* Remarks:
* For SMP kernels, the processor number will be
* substituted for %d, %x or %o in the prompt.
*/
char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
{
if (prompt && kdb_prompt_str != prompt)
strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
kdb_printf(kdb_prompt_str);
kdb_nextline = 1; /* Prompt and input resets line number */
return kdb_read(buffer, bufsize);
}
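/*
 * Minimal usage sketch (hypothetical caller, not from the original
 * source): read one command line with an explicit prompt.
 *
 *	static char cmdbuf[CMD_BUFLEN];
 *
 *	cmdbuf[0] = '\0';
 *	kdb_getstr(cmdbuf, sizeof(cmdbuf), "kdb> ");
 *	// cmdbuf now holds the typed command, terminated by '\n'.
 */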
/*
* kdb_input_flush
*
* Get rid of any buffered console input.
*
* Parameters:
* none
* Returns:
* nothing
* Locking:
* none
* Remarks:
* Call this function whenever you want to flush input. If there is any
* outstanding input, it ignores all characters until there has been no
* data for approximately 1ms.
*/
static void kdb_input_flush(void)
{
get_char_func *f;
int res;
int flush_delay = 1;
while (flush_delay) {
flush_delay--;
empty:
touch_nmi_watchdog();
for (f = &kdb_poll_funcs[0]; *f; ++f) {
res = (*f)();
if (res != -1) {
flush_delay = 1;
goto empty;
}
}
if (flush_delay)
mdelay(1);
}
}
/*
* kdb_printf
*
* Print a string to the output device(s).
*
* Parameters:
* printf-like format and optional args.
* Returns:
* 0
* Locking:
* None.
* Remarks:
* use 'kdbcons->write()' to avoid polluting 'log_buf' with
* kdb output.
*
 * If the user is running a command of the form 'cmd args | grep srch',
 * then kdb_grepping_flag is set.
* In that case we need to accumulate full lines (ending in \n) before
* searching for the pattern.
*/
static char kdb_buffer[256]; /* A bit too big to go on stack */
static char *next_avail = kdb_buffer;
static int size_avail;
static int suspend_grep;
/*
* search arg1 to see if it contains arg2
* (kdmain.c provides flags for ^pat and pat$)
*
* return 1 for found, 0 for not found
*/
static int kdb_search_string(char *searched, char *searchfor)
{
char firstchar, *cp;
int len1, len2;
/* not counting the newline at the end of "searched" */
len1 = strlen(searched)-1;
len2 = strlen(searchfor);
if (len1 < len2)
return 0;
if (kdb_grep_leading && kdb_grep_trailing && len1 != len2)
return 0;
if (kdb_grep_leading) {
if (!strncmp(searched, searchfor, len2))
return 1;
} else if (kdb_grep_trailing) {
if (!strncmp(searched+len1-len2, searchfor, len2))
return 1;
} else {
firstchar = *searchfor;
cp = searched;
while ((cp = strchr(cp, firstchar))) {
if (!strncmp(cp, searchfor, len2))
return 1;
cp++;
}
}
return 0;
}
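/*
 * Behaviour sketch with hypothetical arguments, assuming neither
 * kdb_grep_leading nor kdb_grep_trailing is set (i.e. a plain
 * "cmd | grep main" pipe):
 *
 *	kdb_search_string("kdb_main_loop\n", "main")	returns 1
 *	kdb_search_string("schedule\n", "main")		returns 0
 *
 * With "^main" only lines starting with "main" match; with "main$"
 * only lines ending in "main" (just before the newline) match.
 */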
int vkdb_printf(const char *fmt, va_list ap)
{
int diag;
int linecount;
int colcount;
int logging, saved_loglevel = 0;
int saved_trap_printk;
int got_printf_lock = 0;
int retlen = 0;
int fnd, len;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> ";
struct console *c = console_drivers;
static DEFINE_SPINLOCK(kdb_printf_lock);
unsigned long uninitialized_var(flags);
preempt_disable();
saved_trap_printk = kdb_trap_printk;
kdb_trap_printk = 0;
/* Serialize kdb_printf if multiple cpus try to write at once.
* But if any cpu goes recursive in kdb, just print the output,
* even if it is interleaved with any other text.
*/
if (!KDB_STATE(PRINTF_LOCK)) {
KDB_STATE_SET(PRINTF_LOCK);
spin_lock_irqsave(&kdb_printf_lock, flags);
got_printf_lock = 1;
atomic_inc(&kdb_event);
} else {
__acquire(kdb_printf_lock);
}
diag = kdbgetintenv("LINES", &linecount);
if (diag || linecount <= 1)
linecount = 24;
diag = kdbgetintenv("COLUMNS", &colcount);
if (diag || colcount <= 1)
colcount = 80;
diag = kdbgetintenv("LOGGING", &logging);
if (diag)
logging = 0;
if (!kdb_grepping_flag || suspend_grep) {
/* normally, every vsnprintf starts a new buffer */
next_avail = kdb_buffer;
size_avail = sizeof(kdb_buffer);
}
vsnprintf(next_avail, size_avail, fmt, ap);
/*
* If kdb_parse() found that the command was cmd xxx | grep yyy
* then kdb_grepping_flag is set, and kdb_grep_string contains yyy
*
* Accumulate the print data up to a newline before searching it.
* (vsnprintf does null-terminate the string that it generates)
*/
/* skip the search if prints are temporarily unconditional */
if (!suspend_grep && kdb_grepping_flag) {
cp = strchr(kdb_buffer, '\n');
if (!cp) {
/*
* Special cases that don't end with newlines
* but should be written without one:
* The "[nn]kdb> " prompt should
* appear at the front of the buffer.
*
 * The "[nn]more " prompt (MOREPROMPT -> moreprompt)
 * should also be written, but we print that
 * ourselves and set the suspend_grep flag to make
 * it unconditional.
*
*/
if (next_avail == kdb_buffer) {
/*
* these should occur after a newline,
* so they will be at the front of the
* buffer
*/
cp2 = kdb_buffer;
len = strlen(kdb_prompt_str);
if (!strncmp(cp2, kdb_prompt_str, len)) {
/*
* We're about to start a new
* command, so we can go back
* to normal mode.
*/
kdb_grepping_flag = 0;
goto kdb_printit;
}
}
/* no newline; don't search/write the buffer
until one is there */
len = strlen(kdb_buffer);
next_avail = kdb_buffer + len;
size_avail = sizeof(kdb_buffer) - len;
goto kdb_print_out;
}
/*
* The newline is present; print through it or discard
* it, depending on the results of the search.
*/
cp++; /* to byte after the newline */
replaced_byte = *cp; /* remember what/where it was */
cphold = cp;
*cp = '\0'; /* end the string for our search */
/*
* We now have a newline at the end of the string
* Only continue with this output if it contains the
* search string.
*/
fnd = kdb_search_string(kdb_buffer, kdb_grep_string);
if (!fnd) {
/*
* At this point the complete line at the start
* of kdb_buffer can be discarded, as it does
* not contain what the user is looking for.
* Shift the buffer left.
*/
*cphold = replaced_byte;
strcpy(kdb_buffer, cphold);
len = strlen(kdb_buffer);
next_avail = kdb_buffer + len;
size_avail = sizeof(kdb_buffer) - len;
goto kdb_print_out;
}
/*
* at this point the string is a full line and
* should be printed, up to the null.
*/
}
kdb_printit:
/*
* Write to all consoles.
*/
retlen = strlen(kdb_buffer);
if (!dbg_kdb_mode && kgdb_connected) {
gdbstub_msg_write(kdb_buffer, retlen);
} else {
if (dbg_io_ops && !dbg_io_ops->is_console) {
len = retlen;
cp = kdb_buffer;
while (len--) {
dbg_io_ops->write_char(*cp);
cp++;
}
}
while (c) {
c->write(c, kdb_buffer, retlen);
touch_nmi_watchdog();
c = c->next;
}
}
if (logging) {
saved_loglevel = console_loglevel;
console_loglevel = CONSOLE_LOGLEVEL_SILENT;
printk(KERN_INFO "%s", kdb_buffer);
}
if (KDB_STATE(PAGER)) {
/*
* Check printed string to decide how to bump the
* kdb_nextline to control when the more prompt should
* show up.
*/
int got = 0;
len = retlen;
while (len--) {
if (kdb_buffer[len] == '\n') {
kdb_nextline++;
got = 0;
} else if (kdb_buffer[len] == '\r') {
got = 0;
} else {
got++;
}
}
kdb_nextline += got / (colcount + 1);
}
/* check for having reached the LINES number of printed lines */
if (kdb_nextline >= linecount) {
char buf1[16] = "";
/* Watch out for recursion here. Any routine that calls
* kdb_printf will come back through here. And kdb_read
* uses kdb_printf to echo on serial consoles ...
*/
kdb_nextline = 1; /* In case of recursion */
/*
* Pause until cr.
*/
moreprompt = kdbgetenv("MOREPROMPT");
if (moreprompt == NULL)
moreprompt = "more> ";
kdb_input_flush();
c = console_drivers;
if (dbg_io_ops && !dbg_io_ops->is_console) {
len = strlen(moreprompt);
cp = moreprompt;
while (len--) {
dbg_io_ops->write_char(*cp);
cp++;
}
}
while (c) {
c->write(c, moreprompt, strlen(moreprompt));
touch_nmi_watchdog();
c = c->next;
}
if (logging)
printk("%s", moreprompt);
kdb_read(buf1, 2); /* '2' indicates to return
* immediately after getting one key. */
kdb_nextline = 1; /* Really set output line 1 */
/* empty and reset the buffer: */
kdb_buffer[0] = '\0';
next_avail = kdb_buffer;
size_avail = sizeof(kdb_buffer);
if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
/* user hit q or Q */
KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
KDB_STATE_CLEAR(PAGER);
/* end of command output; back to normal mode */
kdb_grepping_flag = 0;
kdb_printf("\n");
} else if (buf1[0] == ' ') {
kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
} else if (buf1[0] == '\n') {
kdb_nextline = linecount - 1;
kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
} else if (buf1[0] && buf1[0] != '\n') {
/* user hit something other than enter */
suspend_grep = 1; /* for this recursion */
kdb_printf("\nOnly 'q' or 'Q' are processed at more "
"prompt, input ignored\n");
} else if (kdb_grepping_flag) {
/* user hit enter */
suspend_grep = 1; /* for this recursion */
kdb_printf("\n");
}
kdb_input_flush();
}
/*
* For grep searches, shift the printed string left.
* replaced_byte contains the character that was overwritten with
* the terminating null, and cphold points to the null.
* Then adjust the notion of available space in the buffer.
*/
if (kdb_grepping_flag && !suspend_grep) {
*cphold = replaced_byte;
strcpy(kdb_buffer, cphold);
len = strlen(kdb_buffer);
next_avail = kdb_buffer + len;
size_avail = sizeof(kdb_buffer) - len;
}
kdb_print_out:
suspend_grep = 0; /* end of what may have been a recursive call */
if (logging)
console_loglevel = saved_loglevel;
if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
got_printf_lock = 0;
spin_unlock_irqrestore(&kdb_printf_lock, flags);
KDB_STATE_CLEAR(PRINTF_LOCK);
atomic_dec(&kdb_event);
} else {
__release(kdb_printf_lock);
}
kdb_trap_printk = saved_trap_printk;
preempt_enable();
return retlen;
}
int kdb_printf(const char *fmt, ...)
{
va_list ap;
int r;
va_start(ap, fmt);
r = vkdb_printf(fmt, ap);
va_end(ap);
return r;
}
EXPORT_SYMBOL_GPL(kdb_printf);
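/*
 * Usage sketch (hypothetical values): kdb_printf() takes the usual
 * printk-style format strings and honours the LINES, COLUMNS, LOGGING
 * and MOREPROMPT environment variables while paging, e.g.
 *
 *	kdb_printf("%-15s " kdb_machreg_fmt0 "\n", "stack base", sp);
 *
 * where 'sp' is an unsigned long chosen only for illustration.
 */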

263
kernel/debug/kdb/kdb_keyboard.c Normal file
View file

@ -0,0 +1,263 @@
/*
* Kernel Debugger Architecture Dependent Console I/O handler
*
* This file is subject to the terms and conditions of the GNU General Public
* License.
*
* Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*/
#include <linux/kdb.h>
#include <linux/keyboard.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/io.h>
/* Keyboard Controller Registers on normal PCs. */
#define KBD_STATUS_REG 0x64 /* Status register (R) */
#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */
/* Status Register Bits */
#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
static int kbd_exists;
static int kbd_last_ret;
/*
* Check if the keyboard controller has a keypress for us.
 * Some parts (Enter Release, LED change) are still polled here in a blocking way,
* but hopefully they are all short.
*/
int kdb_get_kbd_char(void)
{
int scancode, scanstatus;
static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */
static int shift_key; /* Shift next keypress */
static int ctrl_key;
u_short keychar;
if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
(inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
kbd_exists = 0;
return -1;
}
kbd_exists = 1;
if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
return -1;
/*
* Fetch the scancode
*/
scancode = inb(KBD_DATA_REG);
scanstatus = inb(KBD_STATUS_REG);
/*
* Ignore mouse events.
*/
if (scanstatus & KBD_STAT_MOUSE_OBF)
return -1;
/*
* Ignore release, trigger on make
* (except for shift keys, where we want to
* keep the shift state so long as the key is
* held down).
*/
if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
/*
* Next key may use shift table
*/
if ((scancode & 0x80) == 0)
shift_key = 1;
else
shift_key = 0;
return -1;
}
if ((scancode&0x7f) == 0x1d) {
/*
* Left ctrl key
*/
if ((scancode & 0x80) == 0)
ctrl_key = 1;
else
ctrl_key = 0;
return -1;
}
if ((scancode & 0x80) != 0) {
if (scancode == 0x9c)
kbd_last_ret = 0;
return -1;
}
scancode &= 0x7f;
/*
* Translate scancode
*/
if (scancode == 0x3a) {
/*
* Toggle caps lock
*/
shift_lock ^= 1;
#ifdef KDB_BLINK_LED
kdb_toggleled(0x4);
#endif
return -1;
}
if (scancode == 0x0e) {
/*
* Backspace
*/
return 8;
}
/* Special Key */
switch (scancode) {
case 0xF: /* Tab */
return 9;
case 0x53: /* Del */
return 4;
case 0x47: /* Home */
return 1;
case 0x4F: /* End */
return 5;
case 0x4B: /* Left */
return 2;
case 0x48: /* Up */
return 16;
case 0x50: /* Down */
return 14;
case 0x4D: /* Right */
return 6;
}
if (scancode == 0xe0)
return -1;
/*
* For Japanese 86/106 keyboards
* See comment in drivers/char/pc_keyb.c.
* - Masahiro Adegawa
*/
if (scancode == 0x73)
scancode = 0x59;
else if (scancode == 0x7d)
scancode = 0x7c;
if (!shift_lock && !shift_key && !ctrl_key) {
keychar = plain_map[scancode];
} else if ((shift_lock || shift_key) && key_maps[1]) {
keychar = key_maps[1][scancode];
} else if (ctrl_key && key_maps[4]) {
keychar = key_maps[4][scancode];
} else {
keychar = 0x0020;
kdb_printf("Unknown state/scancode (%d)\n", scancode);
}
keychar &= 0x0fff;
if (keychar == '\t')
keychar = ' ';
switch (KTYP(keychar)) {
case KT_LETTER:
case KT_LATIN:
if (isprint(keychar))
break; /* printable characters */
/* drop through */
case KT_SPEC:
if (keychar == K_ENTER)
break;
/* drop through */
default:
return -1; /* ignore unprintables */
}
if (scancode == 0x1c) {
kbd_last_ret = 1;
return 13;
}
return keychar & 0xff;
}
EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
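/*
 * Sketch of the translation above (illustrative): raw i8042 make codes
 * are folded onto the same single-byte codes that kdb_read() expects
 * from serial consoles:
 *
 *	0x48 (Up)    -> 16	0x50 (Down)  -> 14
 *	0x4b (Left)  ->  2	0x4d (Right) ->  6
 *	0x0e (BkSp)  ->  8	0x1c (Enter) -> 13
 */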
/*
* Best effort cleanup of ENTER break codes on leaving KDB. Called on
* exiting KDB, when we know we processed an ENTER or KP ENTER scan
* code.
*/
void kdb_kbd_cleanup_state(void)
{
int scancode, scanstatus;
/*
* Nothing to clean up, since either
 * ENTER was never pressed or it has
 * already been cleaned up.
*/
if (!kbd_last_ret)
return;
kbd_last_ret = 0;
/*
* Enter key. Need to absorb the break code here, lest it gets
* leaked out if we exit KDB as the result of processing 'g'.
*
* This has several interesting implications:
* + Need to handle KP ENTER, which has break code 0xe0 0x9c.
* + Need to handle repeat ENTER and repeat KP ENTER. Repeats
* only get a break code at the end of the repeated
* sequence. This means we can't propagate the repeated key
* press, and must swallow it away.
* + Need to handle possible PS/2 mouse input.
* + Need to handle mashed keys.
*/
while (1) {
while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
cpu_relax();
/*
* Fetch the scancode.
*/
scancode = inb(KBD_DATA_REG);
scanstatus = inb(KBD_STATUS_REG);
/*
* Skip mouse input.
*/
if (scanstatus & KBD_STAT_MOUSE_OBF)
continue;
/*
* If we see 0xe0, this is either a break code for KP
* ENTER, or a repeat make for KP ENTER. Either way,
* since the second byte is equivalent to an ENTER,
* skip the 0xe0 and try again.
*
* If we see 0x1c, this must be a repeat ENTER or KP
* ENTER (and we swallowed 0xe0 before). Try again.
*
* We can also see make and break codes for other keys
* mashed before or after pressing ENTER. Thus, if we
* see anything other than 0x9c, we have to try again.
*
* Note, if you held some key as ENTER was depressed,
* that break code would get leaked out.
*/
if (scancode != 0x9c)
continue;
return;
}
}

2879
kernel/debug/kdb/kdb_main.c Normal file

File diff suppressed because it is too large

260
kernel/debug/kdb/kdb_private.h Normal file
View file

@ -0,0 +1,260 @@
#ifndef _KDBPRIVATE_H
#define _KDBPRIVATE_H
/*
* Kernel Debugger Architecture Independent Private Headers
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
* Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*/
#include <linux/kgdb.h>
#include "../debug_core.h"
/* Kernel Debugger Command codes. Must not overlap with error codes. */
#define KDB_CMD_GO (-1001)
#define KDB_CMD_CPU (-1002)
#define KDB_CMD_SS (-1003)
#define KDB_CMD_KGDB (-1005)
/* Internal debug flags */
#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
#define KDB_DEBUG(flag) (kdb_flags & \
(KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
kdb_print_state(text, value)
#if BITS_PER_LONG == 32
#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
#define kdb_machreg_fmt "0x%lx"
#define kdb_machreg_fmt0 "0x%08lx"
#define kdb_bfd_vma_fmt "0x%lx"
#define kdb_bfd_vma_fmt0 "0x%08lx"
#define kdb_elfw_addr_fmt "0x%x"
#define kdb_elfw_addr_fmt0 "0x%08x"
#define kdb_f_count_fmt "%d"
#elif BITS_PER_LONG == 64
#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
#define kdb_machreg_fmt "0x%lx"
#define kdb_machreg_fmt0 "0x%016lx"
#define kdb_bfd_vma_fmt "0x%lx"
#define kdb_bfd_vma_fmt0 "0x%016lx"
#define kdb_elfw_addr_fmt "0x%x"
#define kdb_elfw_addr_fmt0 "0x%016x"
#define kdb_f_count_fmt "%ld"
#endif
/*
* KDB_MAXBPT describes the total number of breakpoints
 * supported by this architecture.
*/
#define KDB_MAXBPT 16
/* Symbol table format returned by kallsyms. */
typedef struct __ksymtab {
unsigned long value; /* Address of symbol */
const char *mod_name; /* Module containing symbol or
* "kernel" */
unsigned long mod_start;
unsigned long mod_end;
const char *sec_name; /* Section containing symbol */
unsigned long sec_start;
unsigned long sec_end;
const char *sym_name; /* Full symbol name, including
* any version */
unsigned long sym_start;
unsigned long sym_end;
} kdb_symtab_t;
extern int kallsyms_symbol_next(char *prefix_name, int flag);
extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
/* Exported Symbols for kernel loadable modules to use. */
extern int kdb_getarea_size(void *, unsigned long, size_t);
extern int kdb_putarea_size(unsigned long, void *, size_t);
/*
* Like get_user and put_user, kdb_getarea and kdb_putarea take variable
* names, not pointers. The underlying *_size functions take pointers.
*/
#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x)))
#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x)))
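/*
 * Example (illustrative; 'addr' is a hypothetical kernel address):
 * kdb_getarea() takes the destination variable itself, while the
 * *_size helpers take a pointer plus an explicit length.
 *
 *	unsigned long val;
 *	if (kdb_getarea(val, addr) == 0)
 *		kdb_printf("0x%lx\n", val);
 */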
extern int kdb_getphysword(unsigned long *word,
unsigned long addr, size_t size);
extern int kdb_getword(unsigned long *, unsigned long, size_t);
extern int kdb_putword(unsigned long, unsigned long, size_t);
extern int kdbgetularg(const char *, unsigned long *);
extern int kdbgetu64arg(const char *, u64 *);
extern char *kdbgetenv(const char *);
extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
long *, char **);
extern int kdbgetsymval(const char *, kdb_symtab_t *);
extern int kdbnearsym(unsigned long, kdb_symtab_t *);
extern void kdbnearsym_cleanup(void);
extern char *kdb_strdup(const char *str, gfp_t type);
extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
/* Routine for debugging the debugger state. */
extern void kdb_print_state(const char *, int);
extern int kdb_state;
#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */
#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */
#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */
#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under
* kdb control */
#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
* after one ss, independent of
* DOING_SS */
#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */
#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */
#define KDB_STATE_PAGER 0x00000400 /* pager is available */
#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
* back to initial cpu */
#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
* adjusted */
#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */
#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via
* keyboard on this cpu */
#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
* specific use */
#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag)
#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag))
#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag))
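/*
 * Usage sketch (illustrative): the state flags are tested and changed
 * through the helpers above rather than by touching kdb_state
 * directly, e.g.
 *
 *	if (KDB_STATE(DOING_SS))
 *		KDB_STATE_CLEAR(DOING_SS);
 *	KDB_STATE_SET(PAGER);
 */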
extern int kdb_nextline; /* Current number of lines displayed */
typedef struct _kdb_bp {
unsigned long bp_addr; /* Address breakpoint is present at */
unsigned int bp_free:1; /* This entry is available */
unsigned int bp_enabled:1; /* Breakpoint is active in register */
unsigned int bp_type:4; /* Uses hardware register */
unsigned int bp_installed:1; /* Breakpoint is installed */
unsigned int bp_delay:1; /* Do delayed bp handling */
unsigned int bp_delayed:1; /* Delayed breakpoint */
unsigned int bph_length; /* HW break length */
} kdb_bp_t;
#ifdef CONFIG_KGDB_KDB
extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
/* The KDB shell command table */
typedef struct _kdbtab {
char *cmd_name; /* Command name */
kdb_func_t cmd_func; /* Function to execute command */
char *cmd_usage; /* Usage String for this command */
char *cmd_help; /* Help message for this command */
short cmd_flags; /* Parsing flags */
short cmd_minlen; /* Minimum legal # command
* chars required */
kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
} kdbtab_t;
extern int kdb_bt(int, const char **); /* KDB display back trace */
/* KDB breakpoint management functions */
extern void kdb_initbptab(void);
extern void kdb_bp_install(struct pt_regs *);
extern void kdb_bp_remove(void);
typedef enum {
KDB_DB_BPT, /* Breakpoint */
KDB_DB_SS, /* Single-step trap */
KDB_DB_SSBPT, /* Single step over breakpoint */
KDB_DB_NOBPT /* Spurious breakpoint */
} kdb_dbtrap_t;
extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
int, kdb_dbtrap_t, struct pt_regs *);
/* Miscellaneous functions and data areas */
extern int kdb_grepping_flag;
extern char kdb_grep_string[];
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
extern unsigned long kdb_task_state_string(const char *);
extern char kdb_task_state_char (const struct task_struct *);
extern unsigned long kdb_task_state(const struct task_struct *p,
unsigned long mask);
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
extern void kdb_print_nameval(const char *name, unsigned long val);
extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
extern void kdb_meminfo_proc_show(void);
extern char *kdb_getstr(char *, size_t, char *);
extern void kdb_gdb_state_pass(char *buf);
/* Defines for kdb_symbol_print */
#define KDB_SP_SPACEB 0x0001 /* Space before string */
#define KDB_SP_SPACEA 0x0002 /* Space after string */
#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */
#define KDB_SP_VALUE 0x0008 /* Print the value of the address */
#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */
#define KDB_SP_NEWLINE 0x0020 /* Newline after string */
#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN)
#define KDB_TSK(cpu) kgdb_info[cpu].task
#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
extern struct task_struct *kdb_curr_task(int);
#define kdb_task_has_cpu(p) (task_curr(p))
/* Simplify coexistence with NPTL */
#define kdb_do_each_thread(g, p) do_each_thread(g, p)
#define kdb_while_each_thread(g, p) while_each_thread(g, p)
#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
extern void *debug_kmalloc(size_t size, gfp_t flags);
extern void debug_kfree(void *);
extern void debug_kusage(void);
extern void kdb_set_current_task(struct task_struct *);
extern struct task_struct *kdb_current_task;
#ifdef CONFIG_KDB_KEYBOARD
extern void kdb_kbd_cleanup_state(void);
#else /* ! CONFIG_KDB_KEYBOARD */
#define kdb_kbd_cleanup_state()
#endif /* ! CONFIG_KDB_KEYBOARD */
#ifdef CONFIG_MODULES
extern struct list_head *kdb_modules;
#endif /* CONFIG_MODULES */
extern char kdb_prompt_str[];
#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
#endif /* CONFIG_KGDB_KDB */
#endif /* !_KDBPRIVATE_H */

927
kernel/debug/kdb/kdb_support.c Normal file
View file

@ -0,0 +1,927 @@
/*
* Kernel Debugger Architecture Independent Support Functions
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
* Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
* 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
*/
#include <stdarg.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/kallsyms.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/ptrace.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/kdb.h>
#include <linux/slab.h>
#include "kdb_private.h"
/*
* kdbgetsymval - Return the address of the given symbol.
*
* Parameters:
* symname Character string containing symbol name
* symtab Structure to receive results
* Returns:
* 0 Symbol not found, symtab zero filled
* 1 Symbol mapped to module/symbol/section, data in symtab
*/
int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
{
if (KDB_DEBUG(AR))
kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
symtab);
memset(symtab, 0, sizeof(*symtab));
symtab->sym_start = kallsyms_lookup_name(symname);
if (symtab->sym_start) {
if (KDB_DEBUG(AR))
kdb_printf("kdbgetsymval: returns 1, "
"symtab->sym_start=0x%lx\n",
symtab->sym_start);
return 1;
}
if (KDB_DEBUG(AR))
kdb_printf("kdbgetsymval: returns 0\n");
return 0;
}
EXPORT_SYMBOL(kdbgetsymval);
static char *kdb_name_table[100]; /* arbitrary size */
/*
* kdbnearsym - Return the name of the symbol with the nearest address
* less than 'addr'.
*
* Parameters:
* addr Address to check for symbol near
* symtab Structure to receive results
* Returns:
* 0 No sections contain this address, symtab zero filled
* 1 Address mapped to module/symbol/section, data in symtab
* Remarks:
* 2.6 kallsyms has a "feature" where it unpacks the name into a
* string. If that string is reused before the caller expects it
* then the caller sees its string change without warning. To
* avoid cluttering up the main kdb code with lots of kdb_strdup,
* tests and kfree calls, kdbnearsym maintains an LRU list of the
* last few unique strings. The list is sized large enough to
 * hold the active strings; no kdb caller of kdbnearsym makes more
 * than ~20 further calls before reusing a saved value.
*/
int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
{
int ret = 0;
unsigned long symbolsize = 0;
unsigned long offset = 0;
#define knt1_size 128 /* must be >= kallsyms table size */
char *knt1 = NULL;
if (KDB_DEBUG(AR))
kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
memset(symtab, 0, sizeof(*symtab));
if (addr < 4096)
goto out;
knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
if (!knt1) {
kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
addr);
goto out;
}
symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
(char **)(&symtab->mod_name), knt1);
if (offset > 8*1024*1024) {
symtab->sym_name = NULL;
addr = offset = symbolsize = 0;
}
symtab->sym_start = addr - offset;
symtab->sym_end = symtab->sym_start + symbolsize;
ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
if (ret) {
int i;
/* Another 2.6 kallsyms "feature". Sometimes the sym_name is
* set but the buffer passed into kallsyms_lookup is not used,
* so it contains garbage. The caller has to work out which
* buffer needs to be saved.
*
* What was Rusty smoking when he wrote that code?
*/
if (symtab->sym_name != knt1) {
strncpy(knt1, symtab->sym_name, knt1_size);
knt1[knt1_size-1] = '\0';
}
for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
if (kdb_name_table[i] &&
strcmp(kdb_name_table[i], knt1) == 0)
break;
}
if (i >= ARRAY_SIZE(kdb_name_table)) {
debug_kfree(kdb_name_table[0]);
memcpy(kdb_name_table, kdb_name_table+1,
sizeof(kdb_name_table[0]) *
(ARRAY_SIZE(kdb_name_table)-1));
} else {
debug_kfree(knt1);
knt1 = kdb_name_table[i];
memcpy(kdb_name_table+i, kdb_name_table+i+1,
sizeof(kdb_name_table[0]) *
(ARRAY_SIZE(kdb_name_table)-i-1));
}
i = ARRAY_SIZE(kdb_name_table) - 1;
kdb_name_table[i] = knt1;
symtab->sym_name = kdb_name_table[i];
knt1 = NULL;
}
if (symtab->mod_name == NULL)
symtab->mod_name = "kernel";
if (KDB_DEBUG(AR))
kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
"symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
symtab->sym_start, symtab->mod_name, symtab->sym_name,
symtab->sym_name);
out:
debug_kfree(knt1);
return ret;
}
void kdbnearsym_cleanup(void)
{
int i;
for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
if (kdb_name_table[i]) {
debug_kfree(kdb_name_table[i]);
kdb_name_table[i] = NULL;
}
}
}
static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
/*
* kallsyms_symbol_complete
*
* Parameters:
* prefix_name prefix of a symbol name to lookup
* max_len maximum length that can be returned
* Returns:
* Number of symbols which match the given prefix.
* Notes:
 * prefix_name is changed to contain the longest common prefix of the
 * symbols that start with this prefix (tab completion).
*/
int kallsyms_symbol_complete(char *prefix_name, int max_len)
{
loff_t pos = 0;
int prefix_len = strlen(prefix_name), prev_len = 0;
int i, number = 0;
const char *name;
while ((name = kdb_walk_kallsyms(&pos))) {
if (strncmp(name, prefix_name, prefix_len) == 0) {
strcpy(ks_namebuf, name);
/* Work out the longest name that matches the prefix */
if (++number == 1) {
prev_len = min_t(int, max_len-1,
strlen(ks_namebuf));
memcpy(ks_namebuf_prev, ks_namebuf, prev_len);
ks_namebuf_prev[prev_len] = '\0';
continue;
}
for (i = 0; i < prev_len; i++) {
if (ks_namebuf[i] != ks_namebuf_prev[i]) {
prev_len = i;
ks_namebuf_prev[i] = '\0';
break;
}
}
}
}
if (prev_len > prefix_len)
memcpy(prefix_name, ks_namebuf_prev, prev_len+1);
return number;
}
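/*
 * Completion sketch with hypothetical symbols: if the table contained
 * only "kdb_printf", "kdb_print_state" and "kdb_print_nameval", then
 *
 *	char buf[64] = "kdb_pr";
 *	int n = kallsyms_symbol_complete(buf, sizeof(buf));
 *	// n == 3 and buf now holds the longest common prefix "kdb_print"
 */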
/*
* kallsyms_symbol_next
*
* Parameters:
* prefix_name prefix of a symbol name to lookup
* flag 0 means search from the head, 1 means continue search.
* Returns:
* 1 if a symbol matches the given prefix.
* 0 if no string found
*/
int kallsyms_symbol_next(char *prefix_name, int flag)
{
int prefix_len = strlen(prefix_name);
static loff_t pos;
const char *name;
if (!flag)
pos = 0;
while ((name = kdb_walk_kallsyms(&pos))) {
if (strncmp(name, prefix_name, prefix_len) == 0) {
strncpy(prefix_name, name, strlen(name)+1);
return 1;
}
}
return 0;
}
/*
* kdb_symbol_print - Standard method for printing a symbol name and offset.
* Inputs:
* addr Address to be printed.
* symtab Address of symbol data, if NULL this routine does its
* own lookup.
* punc Punctuation for string, bit field.
* Remarks:
 * The string and its punctuation are only printed if the address
* is inside the kernel, except that the value is always printed
* when requested.
*/
void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
unsigned int punc)
{
kdb_symtab_t symtab, *symtab_p2;
if (symtab_p) {
symtab_p2 = (kdb_symtab_t *)symtab_p;
} else {
symtab_p2 = &symtab;
kdbnearsym(addr, symtab_p2);
}
if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE)))
return;
if (punc & KDB_SP_SPACEB)
kdb_printf(" ");
if (punc & KDB_SP_VALUE)
kdb_printf(kdb_machreg_fmt0, addr);
if (symtab_p2->sym_name) {
if (punc & KDB_SP_VALUE)
kdb_printf(" ");
if (punc & KDB_SP_PAREN)
kdb_printf("(");
if (strcmp(symtab_p2->mod_name, "kernel"))
kdb_printf("[%s]", symtab_p2->mod_name);
kdb_printf("%s", symtab_p2->sym_name);
if (addr != symtab_p2->sym_start)
kdb_printf("+0x%lx", addr - symtab_p2->sym_start);
if (punc & KDB_SP_SYMSIZE)
kdb_printf("/0x%lx",
symtab_p2->sym_end - symtab_p2->sym_start);
if (punc & KDB_SP_PAREN)
kdb_printf(")");
}
if (punc & KDB_SP_SPACEA)
kdb_printf(" ");
if (punc & KDB_SP_NEWLINE)
kdb_printf("\n");
}
/*
* kdb_strdup - kdb equivalent of strdup, for disasm code.
* Inputs:
* str The string to duplicate.
* type Flags to kmalloc for the new string.
* Returns:
* Address of the new string, NULL if storage could not be allocated.
* Remarks:
* This is not in lib/string.c because it uses kmalloc which is not
* available when string.o is used in boot loaders.
*/
char *kdb_strdup(const char *str, gfp_t type)
{
int n = strlen(str)+1;
char *s = kmalloc(n, type);
if (!s)
return NULL;
return strcpy(s, str);
}
/*
* kdb_getarea_size - Read an area of data. The kdb equivalent of
* copy_from_user, with kdb messages for invalid addresses.
* Inputs:
* res Pointer to the area to receive the result.
* addr Address of the area to copy.
* size Size of the area.
* Returns:
* 0 for success, < 0 for error.
*/
int kdb_getarea_size(void *res, unsigned long addr, size_t size)
{
int ret = probe_kernel_read((char *)res, (char *)addr, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
KDB_STATE_SET(SUPPRESS);
}
ret = KDB_BADADDR;
} else {
KDB_STATE_CLEAR(SUPPRESS);
}
return ret;
}
/*
* kdb_putarea_size - Write an area of data. The kdb equivalent of
* copy_to_user, with kdb messages for invalid addresses.
* Inputs:
* addr Address of the area to write to.
* res Pointer to the area holding the data.
* size Size of the area.
* Returns:
* 0 for success, < 0 for error.
*/
int kdb_putarea_size(unsigned long addr, void *res, size_t size)
{
int ret = probe_kernel_read((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
KDB_STATE_SET(SUPPRESS);
}
ret = KDB_BADADDR;
} else {
KDB_STATE_CLEAR(SUPPRESS);
}
return ret;
}
/*
* kdb_getphys - Read data from a physical address. Validate the
* address is in range, use kmap_atomic() to get data
* similar to kdb_getarea() - but for phys addresses
* Inputs:
* res Pointer to the word to receive the result
* addr Physical address of the area to copy
* size Size of the area
* Returns:
* 0 for success, < 0 for error.
*/
static int kdb_getphys(void *res, unsigned long addr, size_t size)
{
unsigned long pfn;
void *vaddr;
struct page *page;
pfn = (addr >> PAGE_SHIFT);
if (!pfn_valid(pfn))
return 1;
page = pfn_to_page(pfn);
vaddr = kmap_atomic(page);
memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
kunmap_atomic(vaddr);
return 0;
}
/*
* kdb_getphysword
* Inputs:
* word Pointer to the word to receive the result.
* addr Address of the area to copy.
* size Size of the area.
* Returns:
* 0 for success, < 0 for error.
*/
int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
{
int diag;
__u8 w1;
__u16 w2;
__u32 w4;
__u64 w8;
*word = 0; /* Default value if addr or size is invalid */
switch (size) {
case 1:
diag = kdb_getphys(&w1, addr, sizeof(w1));
if (!diag)
*word = w1;
break;
case 2:
diag = kdb_getphys(&w2, addr, sizeof(w2));
if (!diag)
*word = w2;
break;
case 4:
diag = kdb_getphys(&w4, addr, sizeof(w4));
if (!diag)
*word = w4;
break;
case 8:
if (size <= sizeof(*word)) {
diag = kdb_getphys(&w8, addr, sizeof(w8));
if (!diag)
*word = w8;
break;
}
/* drop through */
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
}
return diag;
}
/*
* kdb_getword - Read a binary value. Unlike kdb_getarea, this treats
* data as numbers.
* Inputs:
* word Pointer to the word to receive the result.
* addr Address of the area to copy.
* size Size of the area.
* Returns:
* 0 for success, < 0 for error.
*/
int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
{
int diag;
__u8 w1;
__u16 w2;
__u32 w4;
__u64 w8;
*word = 0; /* Default value if addr or size is invalid */
switch (size) {
case 1:
diag = kdb_getarea(w1, addr);
if (!diag)
*word = w1;
break;
case 2:
diag = kdb_getarea(w2, addr);
if (!diag)
*word = w2;
break;
case 4:
diag = kdb_getarea(w4, addr);
if (!diag)
*word = w4;
break;
case 8:
if (size <= sizeof(*word)) {
diag = kdb_getarea(w8, addr);
if (!diag)
*word = w8;
break;
}
/* drop through */
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_getword: bad width %ld\n", (long) size);
}
return diag;
}
/*
* kdb_putword - Write a binary value. Unlike kdb_putarea, this
* treats data as numbers.
* Inputs:
 * addr Address of the area to write to.
* word The value to set.
* size Size of the area.
* Returns:
* 0 for success, < 0 for error.
*/
int kdb_putword(unsigned long addr, unsigned long word, size_t size)
{
int diag;
__u8 w1;
__u16 w2;
__u32 w4;
__u64 w8;
switch (size) {
case 1:
w1 = word;
diag = kdb_putarea(addr, w1);
break;
case 2:
w2 = word;
diag = kdb_putarea(addr, w2);
break;
case 4:
w4 = word;
diag = kdb_putarea(addr, w4);
break;
case 8:
if (size <= sizeof(word)) {
w8 = word;
diag = kdb_putarea(addr, w8);
break;
}
/* drop through */
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_putword: bad width %ld\n", (long) size);
}
return diag;
}
/*
* kdb_task_state_string - Convert a string containing any of the
* letters DRSTCZEUIMA to a mask for the process state field and
* return the value. If no argument is supplied, return the mask
* that corresponds to environment variable PS, DRSTCZEU by
* default.
* Inputs:
* s String to convert
* Returns:
* Mask for process state.
* Notes:
* The mask folds data from several sources into a single long value, so
* be careful not to overlap the bits. TASK_* bits are in the LSB,
* special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
* is no overlap between TASK_* and EXIT_* but that may not always be
* true, so EXIT_* bits are shifted left 16 bits before being stored in
* the mask.
*/
/* unrunnable is < 0 */
#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
unsigned long kdb_task_state_string(const char *s)
{
long res = 0;
if (!s) {
s = kdbgetenv("PS");
if (!s)
s = "DRSTCZEU"; /* default value for ps */
}
while (*s) {
switch (*s) {
case 'D':
res |= TASK_UNINTERRUPTIBLE;
break;
case 'R':
res |= RUNNING;
break;
case 'S':
res |= TASK_INTERRUPTIBLE;
break;
case 'T':
res |= TASK_STOPPED;
break;
case 'C':
res |= TASK_TRACED;
break;
case 'Z':
res |= EXIT_ZOMBIE << 16;
break;
case 'E':
res |= EXIT_DEAD << 16;
break;
case 'U':
res |= UNRUNNABLE;
break;
case 'I':
res |= IDLE;
break;
case 'M':
res |= DAEMON;
break;
case 'A':
res = ~0UL;
break;
default:
kdb_printf("%s: unknown flag '%c' ignored\n",
__func__, *s);
break;
}
++s;
}
return res;
}
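/*
 * Example (illustrative):
 *
 *	unsigned long mask = kdb_task_state_string("RS");
 *	// mask == (RUNNING | TASK_INTERRUPTIBLE), i.e. the mask used by
 *	// "ps RS" to list only running and sleeping tasks.
 *
 * "A" selects every task (~0UL).
 */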
/*
* kdb_task_state_char - Return the character that represents the task state.
* Inputs:
* p struct task for the process
* Returns:
* One character to represent the task state.
*/
char kdb_task_state_char (const struct task_struct *p)
{
int cpu;
char state;
unsigned long tmp;
if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
return 'E';
cpu = kdb_process_cpu(p);
state = (p->state == 0) ? 'R' :
(p->state < 0) ? 'U' :
(p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
(p->state & TASK_STOPPED) ? 'T' :
(p->state & TASK_TRACED) ? 'C' :
(p->exit_state & EXIT_ZOMBIE) ? 'Z' :
(p->exit_state & EXIT_DEAD) ? 'E' :
(p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
if (is_idle_task(p)) {
/* Idle task. Is it really idle, apart from the kdb
* interrupt? */
if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
if (cpu != kdb_initial_cpu)
state = 'I'; /* idle task */
}
} else if (!p->mm && state == 'S') {
state = 'M'; /* sleeping system daemon */
}
return state;
}
/*
* kdb_task_state - Return true if a process has the desired state
* given by the mask.
* Inputs:
* p struct task for the process
* mask mask from kdb_task_state_string to select processes
* Returns:
* True if the process matches at least one criteria defined by the mask.
*/
unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
{
char state[] = { kdb_task_state_char(p), '\0' };
return (mask & kdb_task_state_string(state)) != 0;
}
/*
* kdb_print_nameval - Print a name and its value, converting the
* value to a symbol lookup if possible.
* Inputs:
* name field name to print
* val value of field
*/
void kdb_print_nameval(const char *name, unsigned long val)
{
kdb_symtab_t symtab;
kdb_printf(" %-11.11s ", name);
if (kdbnearsym(val, &symtab))
kdb_symbol_print(val, &symtab,
KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
else
kdb_printf("0x%lx\n", val);
}
/* Last ditch allocator for debugging, so we can still debug even when
* the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
 * for space usage, not for speed.  There is one smallish memory pool;
 * the free chain is kept in ascending address order to allow
 * coalescing, and allocations are done by brute-force best fit.
*/
struct debug_alloc_header {
u32 next; /* offset of next header from start of pool */
u32 size;
void *caller;
};
/* The memory returned by this allocator must be aligned, which means
 * the header size must be aligned as well.  Do not assume that sizeof(struct
* debug_alloc_header) is a multiple of the alignment, explicitly
* calculate the overhead of this header, including the alignment.
* The rest of this code must not use sizeof() on any header or
* pointer to a header.
*/
#define dah_align 8
#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
/* Locking is awkward. The debug code is called from all contexts,
* including non maskable interrupts. A normal spinlock is not safe
 * in NMI context.  Try to get the debug allocator lock; if it cannot
 * be obtained within a second, give up.  If the lock could not
 * previously be obtained on this cpu, only try once.
*
* sparse has no annotation for "this function _sometimes_ acquires a
* lock", so fudge the acquire/release notation.
*/
static DEFINE_SPINLOCK(dap_lock);
static int get_dap_lock(void)
__acquires(dap_lock)
{
static int dap_locked = -1;
int count;
if (dap_locked == smp_processor_id())
count = 1;
else
count = 1000;
while (1) {
if (spin_trylock(&dap_lock)) {
dap_locked = -1;
return 1;
}
if (!count--)
break;
udelay(1000);
}
dap_locked = smp_processor_id();
__acquire(dap_lock);
return 0;
}
void *debug_kmalloc(size_t size, gfp_t flags)
{
unsigned int rem, h_offset;
struct debug_alloc_header *best, *bestprev, *prev, *h;
void *p = NULL;
if (!get_dap_lock()) {
__release(dap_lock); /* we never actually got it */
return NULL;
}
h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
if (dah_first_call) {
h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
dah_first_call = 0;
}
size = ALIGN(size, dah_align);
prev = best = bestprev = NULL;
while (1) {
if (h->size >= size && (!best || h->size < best->size)) {
best = h;
bestprev = prev;
if (h->size == size)
break;
}
if (!h->next)
break;
prev = h;
h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
}
if (!best)
goto out;
rem = best->size - size;
/* The pool must always contain at least one header */
if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
goto out;
if (rem >= dah_overhead) {
best->size = size;
h_offset = ((char *)best - debug_alloc_pool) +
dah_overhead + best->size;
h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
h->size = rem - dah_overhead;
h->next = best->next;
} else
h_offset = best->next;
best->caller = __builtin_return_address(0);
dah_used += best->size;
dah_used_max = max(dah_used, dah_used_max);
if (bestprev)
bestprev->next = h_offset;
else
dah_first = h_offset;
p = (char *)best + dah_overhead;
memset(p, POISON_INUSE, best->size - 1);
*((char *)p + best->size - 1) = POISON_END;
out:
spin_unlock(&dap_lock);
return p;
}
void debug_kfree(void *p)
{
struct debug_alloc_header *h;
unsigned int h_offset;
if (!p)
return;
if ((char *)p < debug_alloc_pool ||
(char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
kfree(p);
return;
}
if (!get_dap_lock()) {
__release(dap_lock); /* we never actually got it */
return; /* memory leak, cannot be helped */
}
h = (struct debug_alloc_header *)((char *)p - dah_overhead);
memset(p, POISON_FREE, h->size - 1);
*((char *)p + h->size - 1) = POISON_END;
h->caller = NULL;
dah_used -= h->size;
h_offset = (char *)h - debug_alloc_pool;
if (h_offset < dah_first) {
h->next = dah_first;
dah_first = h_offset;
} else {
struct debug_alloc_header *prev;
unsigned int prev_offset;
prev = (struct debug_alloc_header *)(debug_alloc_pool +
dah_first);
while (1) {
if (!prev->next || prev->next > h_offset)
break;
prev = (struct debug_alloc_header *)
(debug_alloc_pool + prev->next);
}
prev_offset = (char *)prev - debug_alloc_pool;
if (prev_offset + dah_overhead + prev->size == h_offset) {
prev->size += dah_overhead + h->size;
memset(h, POISON_FREE, dah_overhead - 1);
*((char *)h + dah_overhead - 1) = POISON_END;
h = prev;
h_offset = prev_offset;
} else {
h->next = prev->next;
prev->next = h_offset;
}
}
if (h_offset + dah_overhead + h->size == h->next) {
struct debug_alloc_header *next;
next = (struct debug_alloc_header *)
(debug_alloc_pool + h->next);
h->size += dah_overhead + next->size;
h->next = next->next;
memset(next, POISON_FREE, dah_overhead - 1);
*((char *)next + dah_overhead - 1) = POISON_END;
}
spin_unlock(&dap_lock);
}
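/*
 * Usage sketch (hypothetical size): allocations are carved out of the
 * static 256K pool above, so they are usable from any context kdb runs
 * in; NULL is returned if the pool lock cannot be taken or nothing fits.
 *
 *	char *p = debug_kmalloc(64, GFP_KDB);
 *	if (p) {
 *		...
 *		debug_kfree(p);
 *	}
 */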
void debug_kusage(void)
{
struct debug_alloc_header *h_free, *h_used;
#ifdef CONFIG_IA64
/* FIXME: using dah for ia64 unwind always results in a memory leak.
* Fix that memory leak first, then set debug_kusage_one_time = 1 for
* all architectures.
*/
static int debug_kusage_one_time;
#else
static int debug_kusage_one_time = 1;
#endif
if (!get_dap_lock()) {
__release(dap_lock); /* we never actually got it */
return;
}
h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
if (dah_first == 0 &&
(h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
dah_first_call))
goto out;
if (!debug_kusage_one_time)
goto out;
debug_kusage_one_time = 0;
kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
__func__, dah_first);
if (dah_first) {
h_used = (struct debug_alloc_header *)debug_alloc_pool;
kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
h_used->size);
}
do {
h_used = (struct debug_alloc_header *)
((char *)h_free + dah_overhead + h_free->size);
kdb_printf("%s: h_used %p size %d caller %p\n",
__func__, h_used, h_used->size, h_used->caller);
h_free = (struct debug_alloc_header *)
(debug_alloc_pool + h_free->next);
} while (h_free->next);
h_used = (struct debug_alloc_header *)
((char *)h_free + dah_overhead + h_free->size);
if ((char *)h_used - debug_alloc_pool !=
sizeof(debug_alloc_pool_aligned))
kdb_printf("%s: h_used %p size %d caller %p\n",
__func__, h_used, h_used->size, h_used->caller);
out:
spin_unlock(&dap_lock);
}
/* Maintain a small stack of kdb_flags to allow recursion without disturbing
* the global kdb state.
*/
static int kdb_flags_stack[4], kdb_flags_index;
void kdb_save_flags(void)
{
BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
kdb_flags_stack[kdb_flags_index++] = kdb_flags;
}
void kdb_restore_flags(void)
{
BUG_ON(kdb_flags_index <= 0);
kdb_flags = kdb_flags_stack[--kdb_flags_index];
}

158
kernel/delayacct.c Normal file
View file

@ -0,0 +1,158 @@
/* delayacct.c - per-task delay accounting
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
* the GNU General Public License for more details.
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/taskstats.h>
#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
#include <linux/module.h>
int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
EXPORT_SYMBOL_GPL(delayacct_on);
struct kmem_cache *delayacct_cache;
static int __init delayacct_setup_disable(char *str)
{
delayacct_on = 0;
return 1;
}
__setup("nodelayacct", delayacct_setup_disable);
void delayacct_init(void)
{
delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
delayacct_tsk_init(&init_task);
}
void __delayacct_tsk_init(struct task_struct *tsk)
{
tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
if (tsk->delays)
spin_lock_init(&tsk->delays->lock);
}
/*
* Finish delay accounting for a statistic using its timestamps (@start),
 * accumulator (@total) and @count
*/
static void delayacct_end(u64 *start, u64 *total, u32 *count)
{
s64 ns = ktime_get_ns() - *start;
unsigned long flags;
if (ns > 0) {
spin_lock_irqsave(&current->delays->lock, flags);
*total += ns;
(*count)++;
spin_unlock_irqrestore(&current->delays->lock, flags);
}
}
void __delayacct_blkio_start(void)
{
current->delays->blkio_start = ktime_get_ns();
}
void __delayacct_blkio_end(void)
{
if (current->delays->flags & DELAYACCT_PF_SWAPIN)
/* Swapin block I/O */
delayacct_end(&current->delays->blkio_start,
&current->delays->swapin_delay,
&current->delays->swapin_count);
else /* Other block I/O */
delayacct_end(&current->delays->blkio_start,
&current->delays->blkio_delay,
&current->delays->blkio_count);
}
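/*
 * Sketch of how the pair above is used (simplified; the real call
 * sites are the delayacct_blkio_start/end wrappers around the block
 * I/O wait in io_schedule()):
 *
 *	delayacct_blkio_start();
 *	io_schedule();
 *	delayacct_blkio_end();
 *
 * The elapsed ktime_get_ns() delta is added to blkio_delay, or to
 * swapin_delay when DELAYACCT_PF_SWAPIN is set, and the matching
 * count is incremented.
 */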
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
cputime_t utime, stime, stimescaled, utimescaled;
unsigned long long t2, t3;
unsigned long flags, t1;
s64 tmp;
task_cputime(tsk, &utime, &stime);
tmp = (s64)d->cpu_run_real_total;
tmp += cputime_to_nsecs(utime + stime);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
task_cputime_scaled(tsk, &utimescaled, &stimescaled);
tmp = (s64)d->cpu_scaled_run_real_total;
tmp += cputime_to_nsecs(utimescaled + stimescaled);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
/*
 * No locking is available for sched_info (and it would be too
 * expensive to add); mitigate by taking a snapshot of the values.
*/
t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
t3 = tsk->se.sum_exec_runtime;
d->cpu_count += t1;
tmp = (s64)d->cpu_delay_total + t2;
d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
tmp = (s64)d->cpu_run_virtual_total + t3;
d->cpu_run_virtual_total =
(tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
spin_lock_irqsave(&tsk->delays->lock, flags);
tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
d->freepages_count += tsk->delays->freepages_count;
spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
}
__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
{
__u64 ret;
unsigned long flags;
spin_lock_irqsave(&tsk->delays->lock, flags);
ret = nsec_to_clock_t(tsk->delays->blkio_delay +
tsk->delays->swapin_delay);
spin_unlock_irqrestore(&tsk->delays->lock, flags);
return ret;
}
void __delayacct_freepages_start(void)
{
current->delays->freepages_start = ktime_get_ns();
}
void __delayacct_freepages_end(void)
{
delayacct_end(&current->delays->freepages_start,
&current->delays->freepages_delay,
&current->delays->freepages_count);
}
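For context, a hedged sketch of the caller side: the delayacct_blkio_start()/delayacct_blkio_end() wrappers in <linux/delayacct.h> check delayacct_on and current->delays before calling the __delayacct_*() helpers above, so a blocking block-I/O wait is typically instrumented like this (the function name is a placeholder).
#include <linux/delayacct.h>
#include <linux/sched.h>

static void example_wait_for_block_io(void)
{
	delayacct_blkio_start();	/* timestamp the start of the wait */
	io_schedule();			/* sleep until the request completes */
	delayacct_blkio_end();		/* charge the elapsed time to current */
}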

160
kernel/dma.c Normal file
View file

@ -0,0 +1,160 @@
/*
* linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
*
* Written by Hennus Bergman, 1992.
*
* 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma.
* In the previous version the reported device could end up being wrong,
* if a device requested a DMA channel that was already in use.
* [It also happened to remove the sizeof(char *) == sizeof(int)
* assumption introduced because of those /proc/dma patches. -- Hennus]
*/
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <asm/dma.h>
/* A note on resource allocation:
*
* All drivers needing DMA channels should allocate and release them
* through the public routines `request_dma()' and `free_dma()'.
*
* In order to avoid problems, all processes should allocate resources in
* the same sequence and release them in the reverse order.
*
* So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA.
* When releasing them, first release the DMA, then release the IRQ.
* If you don't, you may cause allocation requests to fail unnecessarily.
* This doesn't really matter now, but it will once we get real semaphores
* in the kernel.
*/
DEFINE_SPINLOCK(dma_spin_lock);
/*
* If our port doesn't define this it has no PC-like DMA
*/
#ifdef MAX_DMA_CHANNELS
/* Channel n is busy iff dma_chan_busy[n].lock != 0.
* DMA0 used to be reserved for DRAM refresh, but apparently not any more...
* DMA4 is reserved for cascading.
*/
struct dma_chan {
int lock;
const char *device_id;
};
static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
[4] = { 1, "cascade" },
};
/**
* request_dma - request and reserve a system DMA channel
* @dmanr: DMA channel number
* @device_id: reserving device ID string, used in /proc/dma
*/
int request_dma(unsigned int dmanr, const char * device_id)
{
if (dmanr >= MAX_DMA_CHANNELS)
return -EINVAL;
if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0)
return -EBUSY;
dma_chan_busy[dmanr].device_id = device_id;
/* old flag was 0, now contains 1 to indicate busy */
return 0;
} /* request_dma */
/**
* free_dma - free a reserved system DMA channel
* @dmanr: DMA channel number
*/
void free_dma(unsigned int dmanr)
{
if (dmanr >= MAX_DMA_CHANNELS) {
printk(KERN_WARNING "Trying to free DMA%d\n", dmanr);
return;
}
if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr);
return;
}
} /* free_dma */
#else
int request_dma(unsigned int dmanr, const char *device_id)
{
return -EINVAL;
}
void free_dma(unsigned int dmanr)
{
}
#endif
#ifdef CONFIG_PROC_FS
#ifdef MAX_DMA_CHANNELS
static int proc_dma_show(struct seq_file *m, void *v)
{
int i;
for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
if (dma_chan_busy[i].lock) {
seq_printf(m, "%2d: %s\n", i,
dma_chan_busy[i].device_id);
}
}
return 0;
}
#else
static int proc_dma_show(struct seq_file *m, void *v)
{
seq_puts(m, "No DMA\n");
return 0;
}
#endif /* MAX_DMA_CHANNELS */
static int proc_dma_open(struct inode *inode, struct file *file)
{
return single_open(file, proc_dma_show, NULL);
}
static const struct file_operations proc_dma_operations = {
.open = proc_dma_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int __init proc_dma_init(void)
{
proc_create("dma", 0, NULL, &proc_dma_operations);
return 0;
}
__initcall(proc_dma_init);
#endif
EXPORT_SYMBOL(request_dma);
EXPORT_SYMBOL(free_dma);
EXPORT_SYMBOL(dma_spin_lock);
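Illustrative driver-side usage, following the ordering recommended in the comment above (claim the channel, program the controller, release it on teardown); EXAMPLE_DMA_CHANNEL is a hypothetical constant and the surrounding driver is omitted.
#define EXAMPLE_DMA_CHANNEL 3	/* hypothetical ISA-style channel */

static int example_claim_dma(void)
{
	int err = request_dma(EXAMPLE_DMA_CHANNEL, "example-driver");

	if (err)		/* -EINVAL (bad channel) or -EBUSY (already taken) */
		return err;
	/* program the controller through the helpers in <asm/dma.h> ... */
	return 0;
}

static void example_release_dma(void)
{
	free_dma(EXAMPLE_DMA_CHANNEL);
}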

24
kernel/elfcore.c Normal file
View file

@ -0,0 +1,24 @@
#include <linux/elf.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/binfmts.h>
Elf_Half __weak elf_core_extra_phdrs(void)
{
return 0;
}
int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
{
return 1;
}
int __weak elf_core_write_extra_data(struct coredump_params *cprm)
{
return 1;
}
size_t __weak elf_core_extra_data_size(void)
{
return 0;
}
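The stubs above are __weak so an architecture can override them; a hypothetical arch-side override might report one extra program header for an arch-specific mapping, for example:
/* hypothetical arch override of the __weak default above */
Elf_Half elf_core_extra_phdrs(void)
{
	return 1;	/* one extra header, e.g. for an arch-specific mapping */
}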

9
kernel/events/Makefile Normal file
View file

@ -0,0 +1,9 @@
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = -pg
endif
obj-y := core.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_UPROBES) += uprobes.o

209
kernel/events/callchain.c Normal file
View file

@ -0,0 +1,209 @@
/*
* Performance events callchain code, extracted from core.c:
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
* For licensing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
#include <linux/slab.h>
#include "internal.h"
struct callchain_cpus_entries {
struct rcu_head rcu_head;
struct perf_callchain_entry *cpu_entries[0];
};
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
static struct callchain_cpus_entries *callchain_cpus_entries;
__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
}
__weak void perf_callchain_user(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
}
static void release_callchain_buffers_rcu(struct rcu_head *head)
{
struct callchain_cpus_entries *entries;
int cpu;
entries = container_of(head, struct callchain_cpus_entries, rcu_head);
for_each_possible_cpu(cpu)
kfree(entries->cpu_entries[cpu]);
kfree(entries);
}
static void release_callchain_buffers(void)
{
struct callchain_cpus_entries *entries;
entries = callchain_cpus_entries;
RCU_INIT_POINTER(callchain_cpus_entries, NULL);
call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
}
static int alloc_callchain_buffers(void)
{
int cpu;
int size;
struct callchain_cpus_entries *entries;
/*
* We can't use the percpu allocation API for data that can be
* accessed from NMI. Use a temporary manual per cpu allocation
* until that gets sorted out.
*/
size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
entries = kzalloc(size, GFP_KERNEL);
if (!entries)
return -ENOMEM;
size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
cpu_to_node(cpu));
if (!entries->cpu_entries[cpu])
goto fail;
}
rcu_assign_pointer(callchain_cpus_entries, entries);
return 0;
fail:
for_each_possible_cpu(cpu)
kfree(entries->cpu_entries[cpu]);
kfree(entries);
return -ENOMEM;
}
int get_callchain_buffers(void)
{
int err = 0;
int count;
mutex_lock(&callchain_mutex);
count = atomic_inc_return(&nr_callchain_events);
if (WARN_ON_ONCE(count < 1)) {
err = -EINVAL;
goto exit;
}
if (count > 1) {
/* If the allocation failed, give up */
if (!callchain_cpus_entries)
err = -ENOMEM;
goto exit;
}
err = alloc_callchain_buffers();
exit:
if (err)
atomic_dec(&nr_callchain_events);
mutex_unlock(&callchain_mutex);
return err;
}
void put_callchain_buffers(void)
{
if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
release_callchain_buffers();
mutex_unlock(&callchain_mutex);
}
}
static struct perf_callchain_entry *get_callchain_entry(int *rctx)
{
int cpu;
struct callchain_cpus_entries *entries;
*rctx = get_recursion_context(this_cpu_ptr(callchain_recursion));
if (*rctx == -1)
return NULL;
entries = rcu_dereference(callchain_cpus_entries);
if (!entries)
return NULL;
cpu = smp_processor_id();
return &entries->cpu_entries[cpu][*rctx];
}
static void
put_callchain_entry(int rctx)
{
put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
}
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
int rctx;
struct perf_callchain_entry *entry;
int kernel = !event->attr.exclude_callchain_kernel;
int user = !event->attr.exclude_callchain_user;
if (!kernel && !user)
return NULL;
entry = get_callchain_entry(&rctx);
if (rctx == -1)
return NULL;
if (!entry)
goto exit_put;
entry->nr = 0;
if (kernel && !user_mode(regs)) {
perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
perf_callchain_kernel(entry, regs);
}
if (user) {
if (!user_mode(regs)) {
if (current->mm)
regs = task_pt_regs(current);
else
regs = NULL;
}
if (regs) {
/*
* Disallow cross-task user callchains.
*/
if (event->ctx->task && event->ctx->task != current)
goto exit_put;
perf_callchain_store(entry, PERF_CONTEXT_USER);
perf_callchain_user(entry, regs);
}
}
exit_put:
put_callchain_entry(rctx);
return entry;
}
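The two __weak hooks at the top of this file are filled in per architecture. A simplified, hypothetical arch implementation records the trapping instruction pointer and would then walk the stack, storing each return address via perf_callchain_store():
void perf_callchain_kernel(struct perf_callchain_entry *entry,
			   struct pt_regs *regs)
{
	perf_callchain_store(entry, instruction_pointer(regs));
	/*
	 * A real implementation walks the kernel stack (frame pointers,
	 * DWARF, or an arch unwinder) and calls perf_callchain_store()
	 * for every caller it finds.
	 */
}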

8339
kernel/events/core.c Normal file

File diff suppressed because it is too large

655
kernel/events/hw_breakpoint.c Normal file
View file

@ -0,0 +1,655 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) 2007 Alan Stern
* Copyright (C) IBM Corporation, 2009
* Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
*
* Thanks to Ingo Molnar for his many suggestions.
*
* Authors: Alan Stern <stern@rowland.harvard.edu>
* K.Prasad <prasad@linux.vnet.ibm.com>
* Frederic Weisbecker <fweisbec@gmail.com>
*/
/*
* HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
* using the CPU's debug registers.
* This file contains the arch-independent routines.
*/
#include <linux/irqflags.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/hw_breakpoint.h>
/*
* Constraints data
*/
struct bp_cpuinfo {
/* Number of pinned cpu breakpoints in a cpu */
unsigned int cpu_pinned;
/* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
unsigned int *tsk_pinned;
/* Number of non-pinned cpu/task breakpoints in a cpu */
unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
};
static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
static int nr_slots[TYPE_MAX];
static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
{
return per_cpu_ptr(bp_cpuinfo + type, cpu);
}
/* Keep track of the breakpoints attached to tasks */
static LIST_HEAD(bp_task_head);
static int constraints_initialized;
/* Gather the number of total pinned and un-pinned bp in a cpuset */
struct bp_busy_slots {
unsigned int pinned;
unsigned int flexible;
};
/* Serialize accesses to the above constraints */
static DEFINE_MUTEX(nr_bp_mutex);
__weak int hw_breakpoint_weight(struct perf_event *bp)
{
return 1;
}
static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
{
if (bp->attr.bp_type & HW_BREAKPOINT_RW)
return TYPE_DATA;
return TYPE_INST;
}
/*
* Report the maximum number of pinned breakpoints a task
* can have on this cpu
*/
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
{
unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
int i;
for (i = nr_slots[type] - 1; i >= 0; i--) {
if (tsk_pinned[i] > 0)
return i + 1;
}
return 0;
}
/*
* Count the number of breakpoints of the same type and same task.
* The given event must not be on the list.
*/
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
struct task_struct *tsk = bp->hw.bp_target;
struct perf_event *iter;
int count = 0;
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
if (iter->hw.bp_target == tsk &&
find_slot_idx(iter) == type &&
(iter->cpu < 0 || cpu == iter->cpu))
count += hw_breakpoint_weight(iter);
}
return count;
}
static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
{
if (bp->cpu >= 0)
return cpumask_of(bp->cpu);
return cpu_possible_mask;
}
/*
* Report the number of pinned/un-pinned breakpoints we have in
* a given cpu (cpu > -1) or in all of them (cpu = -1).
*/
static void
fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
enum bp_type_idx type)
{
const struct cpumask *cpumask = cpumask_of_bp(bp);
int cpu;
for_each_cpu(cpu, cpumask) {
struct bp_cpuinfo *info = get_bp_info(cpu, type);
int nr;
nr = info->cpu_pinned;
if (!bp->hw.bp_target)
nr += max_task_bp_pinned(cpu, type);
else
nr += task_bp_pinned(cpu, bp, type);
if (nr > slots->pinned)
slots->pinned = nr;
nr = info->flexible;
if (nr > slots->flexible)
slots->flexible = nr;
}
}
/*
* For now, continue to consider flexible as pinned, until we can
* ensure no flexible event can ever be scheduled before a pinned event
* in a same cpu.
*/
static void
fetch_this_slot(struct bp_busy_slots *slots, int weight)
{
slots->pinned += weight;
}
/*
* Add a pinned breakpoint for the given task in our constraint table
*/
static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
enum bp_type_idx type, int weight)
{
unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
int old_idx, new_idx;
old_idx = task_bp_pinned(cpu, bp, type) - 1;
new_idx = old_idx + weight;
if (old_idx >= 0)
tsk_pinned[old_idx]--;
if (new_idx >= 0)
tsk_pinned[new_idx]++;
}
/*
* Add/remove the given breakpoint in our constraint table
*/
static void
toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
int weight)
{
const struct cpumask *cpumask = cpumask_of_bp(bp);
int cpu;
if (!enable)
weight = -weight;
/* Pinned counter cpu profiling */
if (!bp->hw.bp_target) {
get_bp_info(bp->cpu, type)->cpu_pinned += weight;
return;
}
/* Pinned counter task profiling */
for_each_cpu(cpu, cpumask)
toggle_bp_task_slot(bp, cpu, type, weight);
if (enable)
list_add_tail(&bp->hw.bp_list, &bp_task_head);
else
list_del(&bp->hw.bp_list);
}
/*
* Function to perform processor-specific cleanup during unregistration
*/
__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
{
/*
* A weak stub function here for those archs that don't define
* it inside arch/.../kernel/hw_breakpoint.c
*/
}
/*
* Constraints to check before allowing this new breakpoint counter:
*
* == Non-pinned counter == (Considered as pinned for now)
*
* - If attached to a single cpu, check:
*
* (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
* + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
*
* -> If there are already non-pinned counters in this cpu, it means
* there is already a free slot for them.
* Otherwise, we check that the maximum number of per task
* breakpoints (for this cpu) plus the number of per cpu breakpoints
* (for this cpu) doesn't cover every register.
*
* - If attached to every cpu, check:
*
* (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
* + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
*
* -> This is roughly the same, except we check the number of per cpu
* bp for every cpu and we keep the max one. Same for the per-task
* breakpoints.
*
*
* == Pinned counter ==
*
* - If attached to a single cpu, check:
*
* ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
* + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
*
* -> Same checks as before. But now the info->flexible, if any, must keep
* one register at least (or they will never be fed).
*
* - If attached to every cpu, check:
*
* ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
* + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
*/
static int __reserve_bp_slot(struct perf_event *bp)
{
struct bp_busy_slots slots = {0};
enum bp_type_idx type;
int weight;
/* We couldn't initialize breakpoint constraints on boot */
if (!constraints_initialized)
return -ENOMEM;
/* Basic checks */
if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
bp->attr.bp_type == HW_BREAKPOINT_INVALID)
return -EINVAL;
type = find_slot_idx(bp);
weight = hw_breakpoint_weight(bp);
fetch_bp_busy_slots(&slots, bp, type);
/*
* Simulate the addition of this breakpoint to the constraints
* and see the result.
*/
fetch_this_slot(&slots, weight);
/* Flexible counters need to keep at least one slot */
if (slots.pinned + (!!slots.flexible) > nr_slots[type])
return -ENOSPC;
toggle_bp_slot(bp, true, type, weight);
return 0;
}
int reserve_bp_slot(struct perf_event *bp)
{
int ret;
mutex_lock(&nr_bp_mutex);
ret = __reserve_bp_slot(bp);
mutex_unlock(&nr_bp_mutex);
return ret;
}
static void __release_bp_slot(struct perf_event *bp)
{
enum bp_type_idx type;
int weight;
type = find_slot_idx(bp);
weight = hw_breakpoint_weight(bp);
toggle_bp_slot(bp, false, type, weight);
}
void release_bp_slot(struct perf_event *bp)
{
mutex_lock(&nr_bp_mutex);
arch_unregister_hw_breakpoint(bp);
__release_bp_slot(bp);
mutex_unlock(&nr_bp_mutex);
}
/*
* Allow the kernel debugger to reserve breakpoint slots without
* taking a lock, using the dbg_* variants of the reserve and
* release functions.
*/
int dbg_reserve_bp_slot(struct perf_event *bp)
{
if (mutex_is_locked(&nr_bp_mutex))
return -1;
return __reserve_bp_slot(bp);
}
int dbg_release_bp_slot(struct perf_event *bp)
{
if (mutex_is_locked(&nr_bp_mutex))
return -1;
__release_bp_slot(bp);
return 0;
}
static int validate_hw_breakpoint(struct perf_event *bp)
{
int ret;
ret = arch_validate_hwbkpt_settings(bp);
if (ret)
return ret;
if (arch_check_bp_in_kernelspace(bp)) {
if (bp->attr.exclude_kernel)
return -EINVAL;
/*
* Don't let unprivileged users set a breakpoint in the trap
* path to avoid trap recursion attacks.
*/
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
}
return 0;
}
int register_perf_hw_breakpoint(struct perf_event *bp)
{
int ret;
ret = reserve_bp_slot(bp);
if (ret)
return ret;
ret = validate_hw_breakpoint(bp);
/* if arch_validate_hwbkpt_settings() fails then release bp slot */
if (ret)
release_bp_slot(bp);
return ret;
}
/**
* register_user_hw_breakpoint - register a hardware breakpoint for user space
* @attr: breakpoint attributes
* @triggered: callback to trigger when we hit the breakpoint
* @context: context data could be used in the trigger callback
* @tsk: pointer to 'task_struct' of the process to which the address belongs
*/
struct perf_event *
register_user_hw_breakpoint(struct perf_event_attr *attr,
perf_overflow_handler_t triggered,
void *context,
struct task_struct *tsk)
{
return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
context);
}
EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
/**
* modify_user_hw_breakpoint - modify a user-space hardware breakpoint
* @bp: the breakpoint structure to modify
* @attr: new breakpoint attributes
*/
int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
{
u64 old_addr = bp->attr.bp_addr;
u64 old_len = bp->attr.bp_len;
int old_type = bp->attr.bp_type;
int err = 0;
/*
* modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
* will not be possible to raise IPIs that invoke __perf_event_disable.
* So call the function directly after making sure we are targeting the
* current task.
*/
if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
__perf_event_disable(bp);
else
perf_event_disable(bp);
bp->attr.bp_addr = attr->bp_addr;
bp->attr.bp_type = attr->bp_type;
bp->attr.bp_len = attr->bp_len;
if (attr->disabled)
goto end;
err = validate_hw_breakpoint(bp);
if (!err)
perf_event_enable(bp);
if (err) {
bp->attr.bp_addr = old_addr;
bp->attr.bp_type = old_type;
bp->attr.bp_len = old_len;
if (!bp->attr.disabled)
perf_event_enable(bp);
return err;
}
end:
bp->attr.disabled = attr->disabled;
return 0;
}
EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
/**
* unregister_hw_breakpoint - unregister a user-space hardware breakpoint
* @bp: the breakpoint structure to unregister
*/
void unregister_hw_breakpoint(struct perf_event *bp)
{
if (!bp)
return;
perf_event_release_kernel(bp);
}
EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
/**
* register_wide_hw_breakpoint - register a wide breakpoint in the kernel
* @attr: breakpoint attributes
* @triggered: callback to trigger when we hit the breakpoint
* @context: context data could be used in the trigger callback
*
* @return a set of per_cpu pointers to perf events
*/
struct perf_event * __percpu *
register_wide_hw_breakpoint(struct perf_event_attr *attr,
perf_overflow_handler_t triggered,
void *context)
{
struct perf_event * __percpu *cpu_events, *bp;
long err = 0;
int cpu;
cpu_events = alloc_percpu(typeof(*cpu_events));
if (!cpu_events)
return (void __percpu __force *)ERR_PTR(-ENOMEM);
get_online_cpus();
for_each_online_cpu(cpu) {
bp = perf_event_create_kernel_counter(attr, cpu, NULL,
triggered, context);
if (IS_ERR(bp)) {
err = PTR_ERR(bp);
break;
}
per_cpu(*cpu_events, cpu) = bp;
}
put_online_cpus();
if (likely(!err))
return cpu_events;
unregister_wide_hw_breakpoint(cpu_events);
return (void __percpu __force *)ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
/**
* unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
* @cpu_events: the per cpu set of events to unregister
*/
void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
{
int cpu;
for_each_possible_cpu(cpu)
unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
free_percpu(cpu_events);
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
static struct notifier_block hw_breakpoint_exceptions_nb = {
.notifier_call = hw_breakpoint_exceptions_notify,
/* we need to be notified first */
.priority = 0x7fffffff
};
static void bp_perf_event_destroy(struct perf_event *event)
{
release_bp_slot(event);
}
static int hw_breakpoint_event_init(struct perf_event *bp)
{
int err;
if (bp->attr.type != PERF_TYPE_BREAKPOINT)
return -ENOENT;
/*
* no branch sampling for breakpoint events
*/
if (has_branch_stack(bp))
return -EOPNOTSUPP;
err = register_perf_hw_breakpoint(bp);
if (err)
return err;
bp->destroy = bp_perf_event_destroy;
return 0;
}
static int hw_breakpoint_add(struct perf_event *bp, int flags)
{
if (!(flags & PERF_EF_START))
bp->hw.state = PERF_HES_STOPPED;
if (is_sampling_event(bp)) {
bp->hw.last_period = bp->hw.sample_period;
perf_swevent_set_period(bp);
}
return arch_install_hw_breakpoint(bp);
}
static void hw_breakpoint_del(struct perf_event *bp, int flags)
{
arch_uninstall_hw_breakpoint(bp);
}
static void hw_breakpoint_start(struct perf_event *bp, int flags)
{
bp->hw.state = 0;
}
static void hw_breakpoint_stop(struct perf_event *bp, int flags)
{
bp->hw.state = PERF_HES_STOPPED;
}
static struct pmu perf_breakpoint = {
.task_ctx_nr = perf_sw_context, /* could eventually get its own */
.event_init = hw_breakpoint_event_init,
.add = hw_breakpoint_add,
.del = hw_breakpoint_del,
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
};
int __init init_hw_breakpoint(void)
{
int cpu, err_cpu;
int i;
for (i = 0; i < TYPE_MAX; i++)
nr_slots[i] = hw_breakpoint_slots(i);
for_each_possible_cpu(cpu) {
for (i = 0; i < TYPE_MAX; i++) {
struct bp_cpuinfo *info = get_bp_info(cpu, i);
info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
GFP_KERNEL);
if (!info->tsk_pinned)
goto err_alloc;
}
}
constraints_initialized = 1;
perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
return register_die_notifier(&hw_breakpoint_exceptions_nb);
err_alloc:
for_each_possible_cpu(err_cpu) {
for (i = 0; i < TYPE_MAX; i++)
kfree(get_bp_info(err_cpu, i)->tsk_pinned);
if (err_cpu == cpu)
break;
}
return -ENOMEM;
}
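A condensed usage sketch in the spirit of samples/hw_breakpoint/data_breakpoint.c: register a system-wide write watchpoint on a kernel symbol. The symbol name is a placeholder and error handling is trimmed to the essentials.
#include <linux/hw_breakpoint.h>
#include <linux/kallsyms.h>
#include <linux/perf_event.h>
#include <linux/module.h>

static struct perf_event * __percpu *example_wp;

static void example_wp_handler(struct perf_event *bp,
			       struct perf_sample_data *data,
			       struct pt_regs *regs)
{
	pr_info("watched symbol was written to\n");
}

static int __init example_wp_init(void)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = kallsyms_lookup_name("example_symbol");
	attr.bp_len  = HW_BREAKPOINT_LEN_4;
	attr.bp_type = HW_BREAKPOINT_W;

	example_wp = register_wide_hw_breakpoint(&attr, example_wp_handler, NULL);
	if (IS_ERR((void __force *)example_wp))
		return PTR_ERR((void __force *)example_wp);
	return 0;
}

static void __exit example_wp_exit(void)
{
	unregister_wide_hw_breakpoint(example_wp);
}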

198
kernel/events/internal.h Normal file
View file

@ -0,0 +1,198 @@
#ifndef _KERNEL_EVENTS_INTERNAL_H
#define _KERNEL_EVENTS_INTERNAL_H
#include <linux/hardirq.h>
#include <linux/uaccess.h>
/* Buffer handling */
#define RING_BUFFER_WRITABLE 0x01
struct ring_buffer {
atomic_t refcount;
struct rcu_head rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
int overwrite; /* can overwrite itself */
atomic_t poll; /* POLL_ for wakeups */
local_t head; /* write position */
local_t nest; /* nested writers */
local_t events; /* event limit */
local_t wakeup; /* wakeup stamp */
local_t lost; /* nr records lost */
long watermark; /* wakeup watermark */
/* poll crap */
spinlock_t event_lock;
struct list_head event_list;
atomic_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
struct perf_event_mmap_page *user_page;
void *data_pages[0];
};
extern void rb_free(struct ring_buffer *rb);
extern struct ring_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
extern void
perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event);
extern void
perf_event__output_id_sample(struct perf_event *event,
struct perf_output_handle *handle,
struct perf_sample_data *sample);
extern struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
#ifdef CONFIG_PERF_USE_VMALLOC
/*
* Back perf_mmap() with vmalloc memory.
*
* Required for architectures that have d-cache aliasing issues.
*/
static inline int page_order(struct ring_buffer *rb)
{
return rb->page_order;
}
#else
static inline int page_order(struct ring_buffer *rb)
{
return 0;
}
#endif
static inline unsigned long perf_data_size(struct ring_buffer *rb)
{
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
static inline unsigned long \
func_name(struct perf_output_handle *handle, \
const void *buf, unsigned long len) \
{ \
unsigned long size, written; \
\
do { \
size = min(handle->size, len); \
written = memcpy_func(handle->addr, buf, size); \
written = size - written; \
\
len -= written; \
handle->addr += written; \
buf += written; \
handle->size -= written; \
if (!handle->size) { \
struct ring_buffer *rb = handle->rb; \
\
handle->page++; \
handle->page &= rb->nr_pages - 1; \
handle->addr = rb->data_pages[handle->page]; \
handle->size = PAGE_SIZE << page_order(rb); \
} \
} while (len && written == size); \
\
return len; \
}
static inline unsigned long
memcpy_common(void *dst, const void *src, unsigned long n)
{
memcpy(dst, src, n);
return 0;
}
DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
static inline unsigned long
memcpy_skip(void *dst, const void *src, unsigned long n)
{
return 0;
}
DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
#ifndef arch_perf_out_copy_user
#define arch_perf_out_copy_user arch_perf_out_copy_user
static inline unsigned long
arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
{
unsigned long ret;
pagefault_disable();
ret = __copy_from_user_inatomic(dst, src, n);
pagefault_enable();
return ret;
}
#endif
DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
/* Callchain handling */
extern struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs);
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);
static inline int get_recursion_context(int *recursion)
{
int rctx;
if (in_nmi())
rctx = 3;
else if (in_irq())
rctx = 2;
else if (in_softirq())
rctx = 1;
else
rctx = 0;
if (recursion[rctx])
return -1;
recursion[rctx]++;
barrier();
return rctx;
}
static inline void put_recursion_context(int *recursion, int rctx)
{
barrier();
recursion[rctx]--;
}
#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
static inline bool arch_perf_have_user_stack_dump(void)
{
return true;
}
#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
#else
static inline bool arch_perf_have_user_stack_dump(void)
{
return false;
}
#define perf_user_stack_pointer(regs) 0
#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
#endif /* _KERNEL_EVENTS_INTERNAL_H */
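For clarity, a hypothetical caller of the recursion guard above: each event-emission path claims its context level (task, softirq, hardirq, NMI) on entry and releases it on exit, which is what keeps an NMI from re-entering the same per-cpu callchain or ring-buffer slot.
static inline int example_emit_in_context(int *recursion)
{
	int rctx = get_recursion_context(recursion);

	if (rctx < 0)
		return -1;	/* already active at this context level */
	/* ... produce the event into the per-cpu slot for rctx ... */
	put_recursion_context(recursion, rctx);
	return 0;
}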

417
kernel/events/ring_buffer.c Normal file
View file

@ -0,0 +1,417 @@
/*
* Performance events ring-buffer code:
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
* For licensing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/circ_buf.h>
#include "internal.h"
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, POLL_IN);
handle->event->pending_wakeup = 1;
irq_work_queue(&handle->event->pending);
}
/*
* We need to ensure a later event_id doesn't publish a head when a former
* event isn't done writing. However since we need to deal with NMIs we
* cannot fully serialize things.
*
* We only publish the head (and generate a wakeup) when the outer-most
* event completes.
*/
static void perf_output_get_handle(struct perf_output_handle *handle)
{
struct ring_buffer *rb = handle->rb;
preempt_disable();
local_inc(&rb->nest);
handle->wakeup = local_read(&rb->wakeup);
}
static void perf_output_put_handle(struct perf_output_handle *handle)
{
struct ring_buffer *rb = handle->rb;
unsigned long head;
again:
head = local_read(&rb->head);
/*
* IRQ/NMI can happen here, which means we can miss a head update.
*/
if (!local_dec_and_test(&rb->nest))
goto out;
/*
* Since the mmap() consumer (userspace) can run on a different CPU:
*
* kernel user
*
* if (LOAD ->data_tail) { LOAD ->data_head
* (A) smp_rmb() (C)
* STORE $data LOAD $data
* smp_wmb() (B) smp_mb() (D)
* STORE ->data_head STORE ->data_tail
* }
*
* Where A pairs with D, and B pairs with C.
*
* In our case (A) is a control dependency that separates the load of
* the ->data_tail and the stores of $data. In case ->data_tail
* indicates there is no room in the buffer to store $data, we do not store it.
*
* D needs to be a full barrier since it separates the data READ
* from the tail WRITE.
*
* For B a WMB is sufficient since it separates two WRITEs, and for C
* an RMB is sufficient since it separates two READs.
*
* See perf_output_begin().
*/
smp_wmb(); /* B, matches C */
rb->user_page->data_head = head;
/*
* Now check if we missed an update -- rely on previous implied
* compiler barriers to force a re-read.
*/
if (unlikely(head != local_read(&rb->head))) {
local_inc(&rb->nest);
goto again;
}
if (handle->wakeup != local_read(&rb->wakeup))
perf_output_wakeup(handle);
out:
preempt_enable();
}
int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size)
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
int have_lost, page_shift;
struct {
struct perf_event_header header;
u64 id;
u64 lost;
} lost_event;
rcu_read_lock();
/*
* For inherited events we send all the output towards the parent.
*/
if (event->parent)
event = event->parent;
rb = rcu_dereference(event->rb);
if (unlikely(!rb))
goto out;
if (unlikely(!rb->nr_pages))
goto out;
handle->rb = rb;
handle->event = event;
have_lost = local_read(&rb->lost);
if (unlikely(have_lost)) {
size += sizeof(lost_event);
if (event->attr.sample_id_all)
size += event->id_header_size;
}
perf_output_get_handle(handle);
do {
tail = ACCESS_ONCE(rb->user_page->data_tail);
offset = head = local_read(&rb->head);
if (!rb->overwrite &&
unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
goto fail;
/*
* The above forms a control dependency barrier separating the
* @tail load above from the data stores below. Since the @tail
* load is required to compute the branch to fail below.
*
* A, matches D; the full memory barrier userspace SHOULD issue
* after reading the data and before storing the new tail
* position.
*
* See perf_output_put_handle().
*/
head += size;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
/*
* We rely on the implied barrier() by local_cmpxchg() to ensure
* none of the data stores below can be lifted up by the compiler.
*/
if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
page_shift = PAGE_SHIFT + page_order(rb);
handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
offset &= (1UL << page_shift) - 1;
handle->addr = rb->data_pages[handle->page] + offset;
handle->size = (1UL << page_shift) - offset;
if (unlikely(have_lost)) {
struct perf_sample_data sample_data;
lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
perf_event_header__init_id(&lost_event.header,
&sample_data, event);
perf_output_put(handle, lost_event);
perf_event__output_id_sample(event, handle, &sample_data);
}
return 0;
fail:
local_inc(&rb->lost);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
return -ENOSPC;
}
unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
return __output_copy(handle, buf, len);
}
unsigned int perf_output_skip(struct perf_output_handle *handle,
unsigned int len)
{
return __output_skip(handle, NULL, len);
}
void perf_output_end(struct perf_output_handle *handle)
{
perf_output_put_handle(handle);
rcu_read_unlock();
}
static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
long max_size = perf_data_size(rb);
if (watermark)
rb->watermark = min(max_size, watermark);
if (!rb->watermark)
rb->watermark = max_size / 2;
if (flags & RING_BUFFER_WRITABLE)
rb->overwrite = 0;
else
rb->overwrite = 1;
atomic_set(&rb->refcount, 1);
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
}
#ifndef CONFIG_PERF_USE_VMALLOC
/*
* Back perf_mmap() with regular GFP_KERNEL-0 pages.
*/
struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
if (pgoff > rb->nr_pages)
return NULL;
if (pgoff == 0)
return virt_to_page(rb->user_page);
return virt_to_page(rb->data_pages[pgoff - 1]);
}
static void *perf_mmap_alloc_page(int cpu)
{
struct page *page;
int node;
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
return NULL;
return page_address(page);
}
struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
struct ring_buffer *rb;
unsigned long size;
int i;
size = sizeof(struct ring_buffer);
size += nr_pages * sizeof(void *);
rb = kzalloc(size, GFP_KERNEL);
if (!rb)
goto fail;
rb->user_page = perf_mmap_alloc_page(cpu);
if (!rb->user_page)
goto fail_user_page;
for (i = 0; i < nr_pages; i++) {
rb->data_pages[i] = perf_mmap_alloc_page(cpu);
if (!rb->data_pages[i])
goto fail_data_pages;
}
rb->nr_pages = nr_pages;
ring_buffer_init(rb, watermark, flags);
return rb;
fail_data_pages:
for (i--; i >= 0; i--)
free_page((unsigned long)rb->data_pages[i]);
free_page((unsigned long)rb->user_page);
fail_user_page:
kfree(rb);
fail:
return NULL;
}
static void perf_mmap_free_page(unsigned long addr)
{
struct page *page = virt_to_page((void *)addr);
page->mapping = NULL;
__free_page(page);
}
void rb_free(struct ring_buffer *rb)
{
int i;
perf_mmap_free_page((unsigned long)rb->user_page);
for (i = 0; i < rb->nr_pages; i++)
perf_mmap_free_page((unsigned long)rb->data_pages[i]);
kfree(rb);
}
#else
static int data_page_nr(struct ring_buffer *rb)
{
return rb->nr_pages << page_order(rb);
}
struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
/* The '>' counts in the user page. */
if (pgoff > data_page_nr(rb))
return NULL;
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
}
static void perf_mmap_unmark_page(void *addr)
{
struct page *page = vmalloc_to_page(addr);
page->mapping = NULL;
}
static void rb_free_work(struct work_struct *work)
{
struct ring_buffer *rb;
void *base;
int i, nr;
rb = container_of(work, struct ring_buffer, work);
nr = data_page_nr(rb);
base = rb->user_page;
/* The '<=' counts in the user page. */
for (i = 0; i <= nr; i++)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
kfree(rb);
}
void rb_free(struct ring_buffer *rb)
{
schedule_work(&rb->work);
}
struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
struct ring_buffer *rb;
unsigned long size;
void *all_buf;
size = sizeof(struct ring_buffer);
size += sizeof(void *);
rb = kzalloc(size, GFP_KERNEL);
if (!rb)
goto fail;
INIT_WORK(&rb->work, rb_free_work);
all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
if (!all_buf)
goto fail_all_buf;
rb->user_page = all_buf;
rb->data_pages[0] = all_buf + PAGE_SIZE;
rb->page_order = ilog2(nr_pages);
rb->nr_pages = !!nr_pages;
ring_buffer_init(rb, watermark, flags);
return rb;
fail_all_buf:
kfree(rb);
fail:
return NULL;
}
#endif
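To show how the pieces above fit together, a hedged producer-side sketch (mirroring what kernel/events/core.c does): reserve space with perf_output_begin(), copy the payload, then let perf_output_end() publish data_head and wake any readers. The function name is illustrative.
static void example_emit_raw(struct perf_event *event,
			     const void *payload, unsigned int len)
{
	struct perf_output_handle handle;

	if (perf_output_begin(&handle, event, len))
		return;		/* no buffer attached or no space: record lost */
	perf_output_copy(&handle, payload, len);
	perf_output_end(&handle);
}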

1993
kernel/events/uprobes.c Normal file

File diff suppressed because it is too large

200
kernel/exec_domain.c Normal file
View file

@ -0,0 +1,200 @@
/*
* Handling of different ABIs (personalities).
*
* We group personalities into execution domains which have their
* own handlers for kernel entry points, signal mapping, etc...
*
* 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
*/
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/fs_struct.h>
static void default_handler(int, struct pt_regs *);
static struct exec_domain *exec_domains = &default_exec_domain;
static DEFINE_RWLOCK(exec_domains_lock);
static unsigned long ident_map[32] = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31
};
struct exec_domain default_exec_domain = {
.name = "Linux", /* name */
.handler = default_handler, /* lcall7 causes a seg fault. */
.pers_low = 0, /* PER_LINUX personality. */
.pers_high = 0, /* PER_LINUX personality. */
.signal_map = ident_map, /* Identity map signals. */
.signal_invmap = ident_map, /* - both ways. */
};
static void
default_handler(int segment, struct pt_regs *regp)
{
set_personality(0);
if (current_thread_info()->exec_domain->handler != default_handler)
current_thread_info()->exec_domain->handler(segment, regp);
else
send_sig(SIGSEGV, current, 1);
}
static struct exec_domain *
lookup_exec_domain(unsigned int personality)
{
unsigned int pers = personality(personality);
struct exec_domain *ep;
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
if (pers >= ep->pers_low && pers <= ep->pers_high)
if (try_module_get(ep->module))
goto out;
}
/*
* Disable the request_module here to avoid trying to
* load the personality-8 module, which doesn't exist,
* and results in selinux audit noise.
* Disabling this here avoids folks adding module_request
* to their sepolicy, which is maybe too generous
*/
#if 0
read_unlock(&exec_domains_lock);
request_module("personality-%d", pers);
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
if (pers >= ep->pers_low && pers <= ep->pers_high)
if (try_module_get(ep->module))
goto out;
}
#endif
ep = &default_exec_domain;
out:
read_unlock(&exec_domains_lock);
return ep;
}
int
register_exec_domain(struct exec_domain *ep)
{
struct exec_domain *tmp;
int err = -EBUSY;
if (ep == NULL)
return -EINVAL;
if (ep->next != NULL)
return -EBUSY;
write_lock(&exec_domains_lock);
for (tmp = exec_domains; tmp; tmp = tmp->next) {
if (tmp == ep)
goto out;
}
ep->next = exec_domains;
exec_domains = ep;
err = 0;
out:
write_unlock(&exec_domains_lock);
return err;
}
EXPORT_SYMBOL(register_exec_domain);
int
unregister_exec_domain(struct exec_domain *ep)
{
struct exec_domain **epp;
epp = &exec_domains;
write_lock(&exec_domains_lock);
for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
if (ep == *epp)
goto unregister;
}
write_unlock(&exec_domains_lock);
return -EINVAL;
unregister:
*epp = ep->next;
ep->next = NULL;
write_unlock(&exec_domains_lock);
return 0;
}
EXPORT_SYMBOL(unregister_exec_domain);
int __set_personality(unsigned int personality)
{
struct exec_domain *oep = current_thread_info()->exec_domain;
current_thread_info()->exec_domain = lookup_exec_domain(personality);
current->personality = personality;
module_put(oep->module);
return 0;
}
EXPORT_SYMBOL(__set_personality);
#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
{
struct exec_domain *ep;
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next)
seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
ep->pers_low, ep->pers_high, ep->name,
module_name(ep->module));
read_unlock(&exec_domains_lock);
return 0;
}
static int execdomains_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, execdomains_proc_show, NULL);
}
static const struct file_operations execdomains_proc_fops = {
.open = execdomains_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int __init proc_execdomains_init(void)
{
proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
return 0;
}
module_init(proc_execdomains_init);
#endif
SYSCALL_DEFINE1(personality, unsigned int, personality)
{
unsigned int old = current->personality;
if (personality != 0xffffffff)
set_personality(personality);
return old;
}
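A hypothetical personality module would register an exec_domain along these lines (the handler and signal map are placeholders, and PER_SVR4 is only an example personality range):
static void example_lcall7_handler(int segment, struct pt_regs *regs)
{
	send_sig(SIGSEGV, current, 1);	/* placeholder: a real ABI would emulate */
}

static unsigned long example_signal_map[32];	/* identity-filled at init */

static struct exec_domain example_exec_domain = {
	.name		= "example-abi",
	.handler	= example_lcall7_handler,
	.pers_low	= PER_SVR4,
	.pers_high	= PER_SVR4,
	.signal_map	= example_signal_map,
	.signal_invmap	= example_signal_map,
	.module		= THIS_MODULE,
};

static int __init example_abi_init(void)
{
	int i;

	for (i = 0; i < 32; i++)
		example_signal_map[i] = i;
	return register_exec_domain(&example_exec_domain);
}

static void __exit example_abi_exit(void)
{
	unregister_exec_domain(&example_exec_domain);
}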

1648
kernel/exit.c Normal file

File diff suppressed because it is too large

139
kernel/extable.c Normal file
View file

@ -0,0 +1,139 @@
/* Rewritten by Rusty Russell, on the backs of many others...
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <asm/sections.h>
#include <asm/uaccess.h>
/*
* mutex protecting text section modification (dynamic code patching).
* some users need to sleep (allocating memory...) while they hold this lock.
*
* NOT exported to modules - patching kernel text is a really delicate matter.
*/
DEFINE_MUTEX(text_mutex);
extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];
/* Cleared by build time tools if the table is already sorted. */
u32 __initdata __visible main_extable_sort_needed = 1;
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
pr_notice("Sorting __ex_table...\n");
sort_extable(__start___ex_table, __stop___ex_table);
}
}
/* Given an address, look for it in the exception tables. */
const struct exception_table_entry *search_exception_tables(unsigned long addr)
{
const struct exception_table_entry *e;
e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
if (!e)
e = search_module_extables(addr);
return e;
}
static inline int init_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext &&
addr < (unsigned long)_einittext)
return 1;
return 0;
}
int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr < (unsigned long)_etext)
return 1;
if (system_state == SYSTEM_BOOTING &&
init_kernel_text(addr))
return 1;
return 0;
}
/**
* core_kernel_data - tell if addr points to kernel data
* @addr: address to test
*
* Returns true if @addr passed in is from the core kernel data
* section.
*
* Note: On some archs it may return true for core RODATA, and false
* for others. But will always be true for core RW data.
*/
int core_kernel_data(unsigned long addr)
{
if (addr >= (unsigned long)_sdata &&
addr < (unsigned long)_edata)
return 1;
return 0;
}
int __kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
if (is_module_text_address(addr))
return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
* backtraces (such as lockdep traces).
*
* Since we are after the module-symbols check, there's
* no danger of address overlap:
*/
if (init_kernel_text(addr))
return 1;
return 0;
}
int kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
return is_module_text_address(addr);
}
/*
* On some architectures (PPC64, IA64) function pointers
* are actually only tokens to some data that then holds the
* real function address. As a result, to find if a function
* pointer is part of the kernel text, we need to do some
* special dereferencing first.
*/
int func_ptr_is_kernel_text(void *ptr)
{
unsigned long addr;
addr = (unsigned long) dereference_function_descriptor(ptr);
if (core_kernel_text(addr))
return 1;
return is_module_text_address(addr);
}
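Consumer-side sketch (simplified from a typical arch fault handler): if the faulting kernel instruction is covered by an exception table entry, the arch code rewrites the program counter from that entry instead of oopsing. The entry layout is arch-specific, so the fixup step is only hinted at here.
static bool example_try_fixup(struct pt_regs *regs)
{
	const struct exception_table_entry *e;

	e = search_exception_tables(instruction_pointer(regs));
	if (!e)
		return false;	/* genuine bug: let the oops path run */
	/* arch code would now redirect execution using e's fixup address */
	return true;
}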

2015
kernel/fork.c Normal file

File diff suppressed because it is too large

179
kernel/freezer.c Normal file
View file

@ -0,0 +1,179 @@
/*
* kernel/freezer.c - Function to freeze a process
*
* Originally from kernel/power/process.c
*/
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
/* total number of freezing conditions in effect */
atomic_t system_freezing_cnt = ATOMIC_INIT(0);
EXPORT_SYMBOL(system_freezing_cnt);
/* indicate whether PM freezing is in effect, protected by pm_mutex */
bool pm_freezing;
bool pm_nosig_freezing;
/*
* Temporary export for the deadlock workaround in ata_scsi_hotplug().
* Remove once the hack becomes unnecessary.
*/
EXPORT_SYMBOL_GPL(pm_freezing);
/* protects freezing and frozen transitions */
static DEFINE_SPINLOCK(freezer_lock);
/**
* freezing_slow_path - slow path for testing whether a task needs to be frozen
* @p: task to be tested
*
* This function is called by freezing() if system_freezing_cnt isn't zero
* and tests whether @p needs to enter and stay in frozen state. Can be
* called in any context. The freezers are responsible for ensuring the
* target tasks see the updated state.
*/
bool freezing_slow_path(struct task_struct *p)
{
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
if (test_thread_flag(TIF_MEMDIE))
return false;
if (pm_nosig_freezing || cgroup_freezing(p))
return true;
if (pm_freezing && !(p->flags & PF_KTHREAD))
return true;
return false;
}
EXPORT_SYMBOL(freezing_slow_path);
/* The refrigerator is the place where frozen processes are stored :-). */
bool __refrigerator(bool check_kthr_stop)
{
/* Hmm, should we be allowed to suspend when there are realtime
processes around? */
bool was_frozen = false;
long save = current->state;
pr_debug("%s entered refrigerator\n", current->comm);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
spin_lock_irq(&freezer_lock);
current->flags |= PF_FROZEN;
if (!freezing(current) ||
(check_kthr_stop && kthread_should_stop()))
current->flags &= ~PF_FROZEN;
spin_unlock_irq(&freezer_lock);
if (!(current->flags & PF_FROZEN))
break;
was_frozen = true;
schedule();
}
pr_debug("%s left refrigerator\n", current->comm);
/*
* Restore saved task state before returning. The mb'd version
* needs to be used; otherwise, it might silently break
* synchronization which depends on ordered task state change.
*/
set_current_state(save);
return was_frozen;
}
EXPORT_SYMBOL(__refrigerator);
static void fake_signal_wake_up(struct task_struct *p)
{
unsigned long flags;
if (lock_task_sighand(p, &flags)) {
signal_wake_up(p, 0);
unlock_task_sighand(p, &flags);
}
}
/**
* freeze_task - send a freeze request to given task
* @p: task to send the request to
*
* If @p is freezing, the freeze request is sent either by sending a fake
* signal (if it's not a kernel thread) or waking it up (if it's a kernel
* thread).
*
* RETURNS:
* %false, if @p is not freezing or already frozen; %true, otherwise
*/
bool freeze_task(struct task_struct *p)
{
unsigned long flags;
/*
* This check can race with freezer_do_not_count, but worst case that
* will result in an extra wakeup being sent to the task. It does not
* race with freezer_count(), the barriers in freezer_count() and
* freezer_should_skip() ensure that either freezer_count() sees
* freezing == true in try_to_freeze() and freezes, or
* freezer_should_skip() sees !PF_FREEZER_SKIP and freezes the task
* normally.
*/
if (freezer_should_skip(p))
return false;
spin_lock_irqsave(&freezer_lock, flags);
if (!freezing(p) || frozen(p)) {
spin_unlock_irqrestore(&freezer_lock, flags);
return false;
}
if (!(p->flags & PF_KTHREAD))
fake_signal_wake_up(p);
else
wake_up_state(p, TASK_INTERRUPTIBLE);
spin_unlock_irqrestore(&freezer_lock, flags);
return true;
}
void __thaw_task(struct task_struct *p)
{
unsigned long flags;
spin_lock_irqsave(&freezer_lock, flags);
if (frozen(p))
wake_up_process(p);
spin_unlock_irqrestore(&freezer_lock, flags);
}
/**
* set_freezable - make %current freezable
*
* Mark %current freezable and enter refrigerator if necessary.
*/
bool set_freezable(void)
{
might_sleep();
/*
* Modify flags while holding freezer_lock. This ensures the
* freezer notices that we aren't frozen yet or the freezing
* condition is visible to try_to_freeze() below.
*/
spin_lock_irq(&freezer_lock);
current->flags &= ~PF_NOFREEZE;
spin_unlock_irq(&freezer_lock);
return try_to_freeze();
}
EXPORT_SYMBOL(set_freezable);
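The canonical consumer of set_freezable() is a kernel thread that polls try_to_freeze() in its main loop, so it parks itself in __refrigerator() while the system suspends; a minimal sketch (thread body hypothetical):
#include <linux/freezer.h>
#include <linux/kthread.h>

static int example_freezable_thread(void *unused)
{
	set_freezable();
	while (!kthread_should_stop()) {
		try_to_freeze();	/* enters __refrigerator() if needed */
		/* ... do one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}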

3053
kernel/futex.c Normal file

File diff suppressed because it is too large

201
kernel/futex_compat.c Normal file
View file

@ -0,0 +1,201 @@
/*
* linux/kernel/futex_compat.c
*
* Futex compatibility routines.
*
* Copyright 2006, Red Hat, Inc., Ingo Molnar
*/
#include <linux/linkage.h>
#include <linux/compat.h>
#include <linux/nsproxy.h>
#include <linux/futex.h>
#include <linux/ptrace.h>
#include <linux/syscalls.h>
#include <asm/uaccess.h>
/*
* Fetch a robust-list pointer. Bit 0 signals PI futexes:
*/
static inline int
fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
compat_uptr_t __user *head, unsigned int *pi)
{
if (get_user(*uentry, head))
return -EFAULT;
*entry = compat_ptr((*uentry) & ~1);
*pi = (unsigned int)(*uentry) & 1;
return 0;
}
static void __user *futex_uaddr(struct robust_list __user *entry,
compat_long_t futex_offset)
{
compat_uptr_t base = ptr_to_compat(entry);
void __user *uaddr = compat_ptr(base + futex_offset);
return uaddr;
}
/*
* Walk curr->robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
*
* We silently return on any sign of list-walking problem.
*/
void compat_exit_robust_list(struct task_struct *curr)
{
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
unsigned int uninitialized_var(next_pi);
compat_uptr_t uentry, next_uentry, upending;
compat_long_t futex_offset;
int rc;
if (!futex_cmpxchg_enabled)
return;
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
return;
/*
* Fetch the relative futex offset:
*/
if (get_user(futex_offset, &head->futex_offset))
return;
/*
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
if (fetch_robust_entry(&upending, &pending,
&head->list_op_pending, &pip))
return;
next_entry = NULL; /* avoid warning with gcc */
while (entry != (struct robust_list __user *) &head->list) {
/*
* Fetch the next entry in the list before calling
* handle_futex_death:
*/
rc = fetch_robust_entry(&next_uentry, &next_entry,
(compat_uptr_t __user *)&entry->next, &next_pi);
/*
* A pending lock might already be on the list, so
* don't process it twice:
*/
if (entry != pending) {
void __user *uaddr = futex_uaddr(entry, futex_offset);
if (handle_futex_death(uaddr, curr, pi))
return;
}
if (rc)
return;
uentry = next_uentry;
entry = next_entry;
pi = next_pi;
/*
* Avoid excessively long or circular lists:
*/
if (!--limit)
break;
cond_resched();
}
if (pending) {
void __user *uaddr = futex_uaddr(pending, futex_offset);
handle_futex_death(uaddr, curr, pip);
}
}
COMPAT_SYSCALL_DEFINE2(set_robust_list,
struct compat_robust_list_head __user *, head,
compat_size_t, len)
{
if (!futex_cmpxchg_enabled)
return -ENOSYS;
if (unlikely(len != sizeof(*head)))
return -EINVAL;
current->compat_robust_list = head;
return 0;
}
COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
compat_uptr_t __user *, head_ptr,
compat_size_t __user *, len_ptr)
{
struct compat_robust_list_head __user *head;
unsigned long ret;
struct task_struct *p;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
rcu_read_lock();
ret = -ESRCH;
if (!pid)
p = current;
else {
p = find_task_by_vpid(pid);
if (!p)
goto err_unlock;
}
ret = -EPERM;
if (!ptrace_may_access(p, PTRACE_MODE_READ))
goto err_unlock;
head = p->compat_robust_list;
rcu_read_unlock();
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(ptr_to_compat(head), head_ptr);
err_unlock:
rcu_read_unlock();
return ret;
}
COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
struct compat_timespec __user *, utime, u32 __user *, uaddr2,
u32, val3)
{
struct timespec ts;
ktime_t t, *tp = NULL;
int val2 = 0;
int cmd = op & FUTEX_CMD_MASK;
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) {
if (compat_get_timespec(&ts, utime))
return -EFAULT;
if (!timespec_valid(&ts))
return -EINVAL;
t = timespec_to_ktime(ts);
if (cmd == FUTEX_WAIT)
t = ktime_add_safe(ktime_get(), t);
tp = &t;
}
if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
val2 = (int) (unsigned long) utime;
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}

79
kernel/gcov/Kconfig Normal file
View file

@ -0,0 +1,79 @@
menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
select CONSTRUCTORS if !UML
default n
---help---
This option enables gcov-based code profiling (e.g. for code coverage
measurements).
If unsure, say N.
Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
for the entire kernel. To enable profiling for specific files or
directories, add a line similar to the following to the respective
Makefile:
For a single file (e.g. main.o):
GCOV_PROFILE_main.o := y
For all files in one directory:
GCOV_PROFILE := y
To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
is specified, use:
GCOV_PROFILE_main.o := n
and:
GCOV_PROFILE := n
Note that the debugfs filesystem has to be mounted to access
profiling data.
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64
default n
---help---
This option activates profiling for the entire kernel.
If unsure, say N.
Note that a kernel compiled with profiling flags will be significantly
larger and run slower. Also be sure to exclude files from profiling
which are not linked to the kernel image to prevent linker errors.
choice
prompt "Specify GCOV format"
depends on GCOV_KERNEL
default GCOV_FORMAT_AUTODETECT
---help---
The gcov format is usually determined by the GCC version, but there are
exceptions where format changes are integrated in lower-version GCCs.
In such a case use this option to adjust the format used in the kernel
accordingly.
If unsure, choose "Autodetect".
config GCOV_FORMAT_AUTODETECT
bool "Autodetect"
---help---
Select this option to use the format that corresponds to your GCC
version.
config GCOV_FORMAT_3_4
bool "GCC 3.4 format"
---help---
Select this option to use the format defined by GCC 3.4.
config GCOV_FORMAT_4_7
bool "GCC 4.7 format"
---help---
Select this option to use the format defined by GCC 4.7.
endchoice
endmenu

33
kernel/gcov/Makefile Normal file
View file

@ -0,0 +1,33 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
# if-lt
# Usage VAR := $(call if-lt, $(a), $(b))
# Returns 1 if (a < b)
if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
cc-ver := 0304
else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
cc-ver := 0407
else
# Use cc-version if available, otherwise set 0
#
# scripts/Kbuild.include, which contains the cc-version function, is not
# included during "make -f scripts/Makefile.clean obj=kernel/gcov" (make clean).
# In that case cc-ver is empty, causing the if-lt test to fail with the
# "/bin/sh: line 0: [: -lt: unary operator expected" error message.
# This has no effect on the clean phase, but the error message could be
# confusing/annoying. This dummy workaround therefore sets cc-ver to zero when
# cc-version is not available. Alternatives would be to move if-lt into
# Kbuild.include (so it is also undefined during clean) or to include
# Kbuild.include from scripts/Makefile.clean, but this workaround seems least
# invasive.
cc-ver := $(if $(call cc-version),$(call cc-version),0)
endif
obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
ifeq ($(call if-lt, $(cc-ver), 0407),1)
obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
else
obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
endif

158
kernel/gcov/base.c Normal file
View file

@ -0,0 +1,158 @@
/*
* This code maintains a list of active profiling data structures.
*
* Copyright IBM Corp. 2009
* Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
*
* Uses gcc-internal data definitions.
* Based on the gcov-kernel patch by:
* Hubertus Franke <frankeh@us.ibm.com>
* Nigel Hinds <nhinds@us.ibm.com>
* Rajan Ravindran <rajancr@us.ibm.com>
* Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
* Paul Larson
*/
#define pr_fmt(fmt) "gcov: " fmt
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include "gcov.h"
static int gcov_events_enabled;
static DEFINE_MUTEX(gcov_lock);
/*
* __gcov_init is called by gcc-generated constructor code for each object
* file compiled with -fprofile-arcs.
*/
void __gcov_init(struct gcov_info *info)
{
static unsigned int gcov_version;
mutex_lock(&gcov_lock);
if (gcov_version == 0) {
gcov_version = gcov_info_version(info);
/*
* Printing gcc's version magic may prove useful for debugging
* incompatibility reports.
*/
pr_info("version magic: 0x%x\n", gcov_version);
}
/*
* Add new profiling data structure to list and inform event
* listener.
*/
gcov_info_link(info);
if (gcov_events_enabled)
gcov_event(GCOV_ADD, info);
mutex_unlock(&gcov_lock);
}
EXPORT_SYMBOL(__gcov_init);
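To illustrate the registration path described in the comment above, here is a hedged stand-alone sketch: gcc emits one constructor per object built with -fprofile-arcs, and that constructor hands a compiler-generated gcov_info object to __gcov_init(). The struct layout and the stubbed __gcov_init() below are purely illustrative placeholders; the real definitions live in the gcc-specific files and in this file.

#include <stdio.h>

/* Illustrative stand-in only; the real layout is gcc-internal (see
 * gcc_3_4.c / gcc_4_7.c) and the real __gcov_init() is the one above. */
struct gcov_info { const char *filename; };

static void __gcov_init(struct gcov_info *info)
{
	printf("registering profiling data for %s\n", info->filename);
}

/* In reality this object is emitted by gcc for each instrumented file. */
static struct gcov_info this_object = { .filename = "example.gcda" };

/* The constructor is run from the .ctors/.init_array section, which is
 * presumably why GCOV_KERNEL selects CONSTRUCTORS in the Kconfig above. */
static void __attribute__((constructor)) gcov_ctor(void)
{
	__gcov_init(&this_object);
}

int main(void)
{
	return 0;
}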
/*
* These functions may be referenced by gcc-generated profiling code but serve
* no function for kernel profiling.
*/
void __gcov_flush(void)
{
/* Unused. */
}
EXPORT_SYMBOL(__gcov_flush);
void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
{
/* Unused. */
}
EXPORT_SYMBOL(__gcov_merge_add);
void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
{
/* Unused. */
}
EXPORT_SYMBOL(__gcov_merge_single);
void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
{
/* Unused. */
}
EXPORT_SYMBOL(__gcov_merge_delta);
void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
{
/* Unused. */
}
EXPORT_SYMBOL(__gcov_merge_ior);
void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
{
/* Unused. */
}
EXPORT_SYMBOL(__gcov_merge_time_profile);
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
* Turn on reporting of profiling data load/unload-events through the
* gcov_event() callback. Also replay all previous events once. This function
* is needed because some events are potentially generated too early for the
* callback implementation to handle them initially.
*/
void gcov_enable_events(void)
{
struct gcov_info *info = NULL;
mutex_lock(&gcov_lock);
gcov_events_enabled = 1;
/* Perform event callback for previously registered entries. */
while ((info = gcov_info_next(info)))
gcov_event(GCOV_ADD, info);
mutex_unlock(&gcov_lock);
}
#ifdef CONFIG_MODULES
static inline int within(void *addr, void *start, unsigned long size)
{
return ((addr >= start) && (addr < start + size));
}
/* Update list and generate events when modules are unloaded. */
static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
{
struct module *mod = data;
struct gcov_info *info = NULL;
struct gcov_info *prev = NULL;
if (event != MODULE_STATE_GOING)
return NOTIFY_OK;
mutex_lock(&gcov_lock);
/* Remove entries located in module from linked list. */
while ((info = gcov_info_next(info))) {
if (within(info, mod->module_core, mod->core_size)) {
gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
} else
prev = info;
}
mutex_unlock(&gcov_lock);
return NOTIFY_OK;
}
static struct notifier_block gcov_nb = {
.notifier_call = gcov_module_notifier,
};
static int __init gcov_init(void)
{
return register_module_notifier(&gcov_nb);
}
device_initcall(gcov_init);
#endif /* CONFIG_MODULES */

791
kernel/gcov/fs.c Normal file
View file

@ -0,0 +1,791 @@
/*
* This code exports profiling data as debugfs files to userspace.
*
* Copyright IBM Corp. 2009
* Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
*
* Uses gcc-internal data definitions.
* Based on the gcov-kernel patch by:
* Hubertus Franke <frankeh@us.ibm.com>
* Nigel Hinds <nhinds@us.ibm.com>
* Rajan Ravindran <rajancr@us.ibm.com>
* Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
* Paul Larson
* Yi CDL Yang
*/
#define pr_fmt(fmt) "gcov: " fmt
#include <linux/init.h>
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
#include "gcov.h"
/**
* struct gcov_node - represents a debugfs entry
* @list: list head for child node list
* @children: child nodes
* @all: list head for list of all nodes
* @parent: parent node
* @loaded_info: array of pointers to profiling data sets for loaded object
* files.
* @num_loaded: number of profiling data sets for loaded object files.
* @unloaded_info: accumulated copy of profiling data sets for unloaded
* object files. Used only when gcov_persist=1.
* @dentry: main debugfs entry, either a directory or data file
* @links: associated symbolic links
* @name: data file basename
*
* struct gcov_node represents an entity within the gcov/ subdirectory
* of debugfs. There are directory and data file nodes. The latter represent
* the actual synthesized data file plus any associated symbolic links which
* are needed by the gcov tool to work correctly.
*/
struct gcov_node {
struct list_head list;
struct list_head children;
struct list_head all;
struct gcov_node *parent;
struct gcov_info **loaded_info;
struct gcov_info *unloaded_info;
struct dentry *dentry;
struct dentry **links;
int num_loaded;
char name[0];
};
static const char objtree[] = OBJTREE;
static const char srctree[] = SRCTREE;
static struct gcov_node root_node;
static struct dentry *reset_dentry;
static LIST_HEAD(all_head);
static DEFINE_MUTEX(node_lock);
/* If non-zero, keep copies of profiling data for unloaded modules. */
static int gcov_persist = 1;
static int __init gcov_persist_setup(char *str)
{
unsigned long val;
if (kstrtoul(str, 0, &val)) {
pr_warn("invalid gcov_persist parameter '%s'\n", str);
return 0;
}
gcov_persist = val;
pr_info("setting gcov_persist to %d\n", gcov_persist);
return 1;
}
__setup("gcov_persist=", gcov_persist_setup);
/*
* seq_file.start() implementation for gcov data files. Note that the
* gcov_iterator interface is designed to be more restrictive than seq_file
* (no start from arbitrary position, etc.), to simplify the iterator
* implementation.
*/
static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
{
loff_t i;
gcov_iter_start(seq->private);
for (i = 0; i < *pos; i++) {
if (gcov_iter_next(seq->private))
return NULL;
}
return seq->private;
}
/* seq_file.next() implementation for gcov data files. */
static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
{
struct gcov_iterator *iter = data;
if (gcov_iter_next(iter))
return NULL;
(*pos)++;
return iter;
}
/* seq_file.show() implementation for gcov data files. */
static int gcov_seq_show(struct seq_file *seq, void *data)
{
struct gcov_iterator *iter = data;
if (gcov_iter_write(iter, seq))
return -EINVAL;
return 0;
}
static void gcov_seq_stop(struct seq_file *seq, void *data)
{
/* Unused. */
}
static const struct seq_operations gcov_seq_ops = {
.start = gcov_seq_start,
.next = gcov_seq_next,
.show = gcov_seq_show,
.stop = gcov_seq_stop,
};
/*
* Return a profiling data set associated with the given node. This is
* either a data set for a loaded object file or a data set copy in case
* all associated object files have been unloaded.
*/
static struct gcov_info *get_node_info(struct gcov_node *node)
{
if (node->num_loaded > 0)
return node->loaded_info[0];
return node->unloaded_info;
}
/*
* Return a newly allocated profiling data set which contains the sum of
* all profiling data associated with the given node.
*/
static struct gcov_info *get_accumulated_info(struct gcov_node *node)
{
struct gcov_info *info;
int i = 0;
if (node->unloaded_info)
info = gcov_info_dup(node->unloaded_info);
else
info = gcov_info_dup(node->loaded_info[i++]);
if (!info)
return NULL;
for (; i < node->num_loaded; i++)
gcov_info_add(info, node->loaded_info[i]);
return info;
}
/*
* open() implementation for gcov data files. Create a copy of the profiling
* data set and initialize the iterator and seq_file interface.
*/
static int gcov_seq_open(struct inode *inode, struct file *file)
{
struct gcov_node *node = inode->i_private;
struct gcov_iterator *iter;
struct seq_file *seq;
struct gcov_info *info;
int rc = -ENOMEM;
mutex_lock(&node_lock);
/*
* Read from a profiling data copy to minimize reference tracking
* complexity and concurrent access and to keep accumulating multiple
* profiling data sets associated with one node simple.
*/
info = get_accumulated_info(node);
if (!info)
goto out_unlock;
iter = gcov_iter_new(info);
if (!iter)
goto err_free_info;
rc = seq_open(file, &gcov_seq_ops);
if (rc)
goto err_free_iter_info;
seq = file->private_data;
seq->private = iter;
out_unlock:
mutex_unlock(&node_lock);
return rc;
err_free_iter_info:
gcov_iter_free(iter);
err_free_info:
gcov_info_free(info);
goto out_unlock;
}
/*
* release() implementation for gcov data files. Release resources allocated
* by open().
*/
static int gcov_seq_release(struct inode *inode, struct file *file)
{
struct gcov_iterator *iter;
struct gcov_info *info;
struct seq_file *seq;
seq = file->private_data;
iter = seq->private;
info = gcov_iter_get_info(iter);
gcov_iter_free(iter);
gcov_info_free(info);
seq_release(inode, file);
return 0;
}
/*
* Find a node by the associated data file name. Needs to be called with
* node_lock held.
*/
static struct gcov_node *get_node_by_name(const char *name)
{
struct gcov_node *node;
struct gcov_info *info;
list_for_each_entry(node, &all_head, all) {
info = get_node_info(node);
if (info && (strcmp(gcov_info_filename(info), name) == 0))
return node;
}
return NULL;
}
/*
* Reset all profiling data associated with the specified node.
*/
static void reset_node(struct gcov_node *node)
{
int i;
if (node->unloaded_info)
gcov_info_reset(node->unloaded_info);
for (i = 0; i < node->num_loaded; i++)
gcov_info_reset(node->loaded_info[i]);
}
static void remove_node(struct gcov_node *node);
/*
* write() implementation for gcov data files. Reset profiling data for the
* corresponding file. If all associated object files have been unloaded,
* remove the debug fs node as well.
*/
static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
size_t len, loff_t *pos)
{
struct seq_file *seq;
struct gcov_info *info;
struct gcov_node *node;
seq = file->private_data;
info = gcov_iter_get_info(seq->private);
mutex_lock(&node_lock);
node = get_node_by_name(gcov_info_filename(info));
if (node) {
/* Reset counts or remove node for unloaded modules. */
if (node->num_loaded == 0)
remove_node(node);
else
reset_node(node);
}
/* Reset counts for open file. */
gcov_info_reset(info);
mutex_unlock(&node_lock);
return len;
}
/*
* Given a string <path> representing a file path of format:
* path/to/file.gcda
* construct and return a new string:
* <dir/>path/to/file.<ext>
*/
static char *link_target(const char *dir, const char *path, const char *ext)
{
char *target;
char *old_ext;
char *copy;
copy = kstrdup(path, GFP_KERNEL);
if (!copy)
return NULL;
old_ext = strrchr(copy, '.');
if (old_ext)
*old_ext = '\0';
if (dir)
target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
else
target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
kfree(copy);
return target;
}
/*
* Construct a string representing the symbolic link target for the given
* gcov data file name and link type. Depending on the link type and the
* location of the data file, the link target can either point to a
* subdirectory of srctree, objtree or in an external location.
*/
static char *get_link_target(const char *filename, const struct gcov_link *ext)
{
const char *rel;
char *result;
if (strncmp(filename, objtree, strlen(objtree)) == 0) {
rel = filename + strlen(objtree) + 1;
if (ext->dir == SRC_TREE)
result = link_target(srctree, rel, ext->ext);
else
result = link_target(objtree, rel, ext->ext);
} else {
/* External compilation. */
result = link_target(NULL, filename, ext->ext);
}
return result;
}
#define SKEW_PREFIX ".tmp_"
/*
* For a filename .tmp_filename.ext return filename.ext. Needed to compensate
* for filename skewing caused by the mod-versioning mechanism.
*/
static const char *deskew(const char *basename)
{
if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
return basename + sizeof(SKEW_PREFIX) - 1;
return basename;
}
/*
* Create links to additional files (usually .c and .gcno files) which the
* gcov tool expects to find in the same directory as the gcov data file.
*/
static void add_links(struct gcov_node *node, struct dentry *parent)
{
const char *basename;
char *target;
int num;
int i;
for (num = 0; gcov_link[num].ext; num++)
/* Nothing. */;
node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
if (!node->links)
return;
for (i = 0; i < num; i++) {
target = get_link_target(
gcov_info_filename(get_node_info(node)),
&gcov_link[i]);
if (!target)
goto out_err;
basename = kbasename(target);
if (basename == target)
goto out_err;
node->links[i] = debugfs_create_symlink(deskew(basename),
parent, target);
if (!node->links[i])
goto out_err;
kfree(target);
}
return;
out_err:
kfree(target);
while (i-- > 0)
debugfs_remove(node->links[i]);
kfree(node->links);
node->links = NULL;
}
static const struct file_operations gcov_data_fops = {
.open = gcov_seq_open,
.release = gcov_seq_release,
.read = seq_read,
.llseek = seq_lseek,
.write = gcov_seq_write,
};
/* Basic initialization of a new node. */
static void init_node(struct gcov_node *node, struct gcov_info *info,
const char *name, struct gcov_node *parent)
{
INIT_LIST_HEAD(&node->list);
INIT_LIST_HEAD(&node->children);
INIT_LIST_HEAD(&node->all);
if (node->loaded_info) {
node->loaded_info[0] = info;
node->num_loaded = 1;
}
node->parent = parent;
if (name)
strcpy(node->name, name);
}
/*
* Create a new node and associated debugfs entry. Needs to be called with
* node_lock held.
*/
static struct gcov_node *new_node(struct gcov_node *parent,
struct gcov_info *info, const char *name)
{
struct gcov_node *node;
node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
if (!node)
goto err_nomem;
if (info) {
node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
GFP_KERNEL);
if (!node->loaded_info)
goto err_nomem;
}
init_node(node, info, name, parent);
/* Differentiate between gcov data file nodes and directory nodes. */
if (info) {
node->dentry = debugfs_create_file(deskew(node->name), 0600,
parent->dentry, node, &gcov_data_fops);
} else
node->dentry = debugfs_create_dir(node->name, parent->dentry);
if (!node->dentry) {
pr_warn("could not create file\n");
kfree(node);
return NULL;
}
if (info)
add_links(node, parent->dentry);
list_add(&node->list, &parent->children);
list_add(&node->all, &all_head);
return node;
err_nomem:
kfree(node);
pr_warn("out of memory\n");
return NULL;
}
/* Remove symbolic links associated with node. */
static void remove_links(struct gcov_node *node)
{
int i;
if (!node->links)
return;
for (i = 0; gcov_link[i].ext; i++)
debugfs_remove(node->links[i]);
kfree(node->links);
node->links = NULL;
}
/*
* Remove node from all lists and debugfs and release associated resources.
* Needs to be called with node_lock held.
*/
static void release_node(struct gcov_node *node)
{
list_del(&node->list);
list_del(&node->all);
debugfs_remove(node->dentry);
remove_links(node);
kfree(node->loaded_info);
if (node->unloaded_info)
gcov_info_free(node->unloaded_info);
kfree(node);
}
/* Release node and empty parents. Needs to be called with node_lock held. */
static void remove_node(struct gcov_node *node)
{
struct gcov_node *parent;
while ((node != &root_node) && list_empty(&node->children)) {
parent = node->parent;
release_node(node);
node = parent;
}
}
/*
* Find child node with given basename. Needs to be called with node_lock
* held.
*/
static struct gcov_node *get_child_by_name(struct gcov_node *parent,
const char *name)
{
struct gcov_node *node;
list_for_each_entry(node, &parent->children, list) {
if (strcmp(node->name, name) == 0)
return node;
}
return NULL;
}
/*
* write() implementation for reset file. Reset all profiling data to zero
* and remove nodes for which all associated object files are unloaded.
*/
static ssize_t reset_write(struct file *file, const char __user *addr,
size_t len, loff_t *pos)
{
struct gcov_node *node;
mutex_lock(&node_lock);
restart:
list_for_each_entry(node, &all_head, all) {
if (node->num_loaded > 0)
reset_node(node);
else if (list_empty(&node->children)) {
remove_node(node);
/* Several nodes may have gone - restart loop. */
goto restart;
}
}
mutex_unlock(&node_lock);
return len;
}
/* read() implementation for reset file. Unused. */
static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
loff_t *pos)
{
/* Allow read operation so that a recursive copy won't fail. */
return 0;
}
static const struct file_operations gcov_reset_fops = {
.write = reset_write,
.read = reset_read,
.llseek = noop_llseek,
};
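The reset control file created in gcov_fs_init() below is meant to be written from user space. A minimal sketch, assuming debugfs is mounted at /sys/kernel/debug (the conventional mount point, not something this file enforces):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Path assumes debugfs is mounted at /sys/kernel/debug. */
	int fd = open("/sys/kernel/debug/gcov/reset", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Any write clears all counts; the data written is ignored
	 * (see reset_write() above). */
	if (write(fd, "1", 1) < 0)
		perror("write");
	close(fd);
	return 0;
}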
/*
* Create a node for a given profiling data set and add it to all lists and
* debugfs. Needs to be called with node_lock held.
*/
static void add_node(struct gcov_info *info)
{
char *filename;
char *curr;
char *next;
struct gcov_node *parent;
struct gcov_node *node;
filename = kstrdup(gcov_info_filename(info), GFP_KERNEL);
if (!filename)
return;
parent = &root_node;
/* Create directory nodes along the path. */
for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
if (curr == next)
continue;
*next = 0;
if (strcmp(curr, ".") == 0)
continue;
if (strcmp(curr, "..") == 0) {
if (!parent->parent)
goto err_remove;
parent = parent->parent;
continue;
}
node = get_child_by_name(parent, curr);
if (!node) {
node = new_node(parent, NULL, curr);
if (!node)
goto err_remove;
}
parent = node;
}
/* Create file node. */
node = new_node(parent, info, curr);
if (!node)
goto err_remove;
out:
kfree(filename);
return;
err_remove:
remove_node(parent);
goto out;
}
/*
* Associate a profiling data set with an existing node. Needs to be called
* with node_lock held.
*/
static void add_info(struct gcov_node *node, struct gcov_info *info)
{
struct gcov_info **loaded_info;
int num = node->num_loaded;
/*
* Prepare new array. This is done first to simplify cleanup in
* case the new data set is incompatible, the node only contains
* unloaded data sets and there's not enough memory for the array.
*/
loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
if (!loaded_info) {
pr_warn("could not add '%s' (out of memory)\n",
gcov_info_filename(info));
return;
}
memcpy(loaded_info, node->loaded_info,
num * sizeof(struct gcov_info *));
loaded_info[num] = info;
/* Check if the new data set is compatible. */
if (num == 0) {
/*
* A module was unloaded, modified and reloaded. The new
* data set replaces the copy of the last one.
*/
if (!gcov_info_is_compatible(node->unloaded_info, info)) {
pr_warn("discarding saved data for %s "
"(incompatible version)\n",
gcov_info_filename(info));
gcov_info_free(node->unloaded_info);
node->unloaded_info = NULL;
}
} else {
/*
* Two different versions of the same object file are loaded.
* The initial one takes precedence.
*/
if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
pr_warn("could not add '%s' (incompatible "
"version)\n", gcov_info_filename(info));
kfree(loaded_info);
return;
}
}
/* Overwrite previous array. */
kfree(node->loaded_info);
node->loaded_info = loaded_info;
node->num_loaded = num + 1;
}
/*
* Return the index of a profiling data set associated with a node.
*/
static int get_info_index(struct gcov_node *node, struct gcov_info *info)
{
int i;
for (i = 0; i < node->num_loaded; i++) {
if (node->loaded_info[i] == info)
return i;
}
return -ENOENT;
}
/*
* Save the data of a profiling data set which is being unloaded.
*/
static void save_info(struct gcov_node *node, struct gcov_info *info)
{
if (node->unloaded_info)
gcov_info_add(node->unloaded_info, info);
else {
node->unloaded_info = gcov_info_dup(info);
if (!node->unloaded_info) {
pr_warn("could not save data for '%s' "
"(out of memory)\n",
gcov_info_filename(info));
}
}
}
/*
* Disassociate a profiling data set from a node. Needs to be called with
* node_lock held.
*/
static void remove_info(struct gcov_node *node, struct gcov_info *info)
{
int i;
i = get_info_index(node, info);
if (i < 0) {
pr_warn("could not remove '%s' (not found)\n",
gcov_info_filename(info));
return;
}
if (gcov_persist)
save_info(node, info);
/* Shrink array. */
node->loaded_info[i] = node->loaded_info[node->num_loaded - 1];
node->num_loaded--;
if (node->num_loaded > 0)
return;
/* Last loaded data set was removed. */
kfree(node->loaded_info);
node->loaded_info = NULL;
node->num_loaded = 0;
if (!node->unloaded_info)
remove_node(node);
}
/*
* Callback to create/remove profiling files when code compiled with
* -fprofile-arcs is loaded/unloaded.
*/
void gcov_event(enum gcov_action action, struct gcov_info *info)
{
struct gcov_node *node;
mutex_lock(&node_lock);
node = get_node_by_name(gcov_info_filename(info));
switch (action) {
case GCOV_ADD:
if (node)
add_info(node, info);
else
add_node(info);
break;
case GCOV_REMOVE:
if (node)
remove_info(node, info);
else {
pr_warn("could not remove '%s' (not found)\n",
gcov_info_filename(info));
}
break;
}
mutex_unlock(&node_lock);
}
/* Create debugfs entries. */
static __init int gcov_fs_init(void)
{
int rc = -EIO;
init_node(&root_node, NULL, NULL, NULL);
/*
* /sys/kernel/debug/gcov will be parent for the reset control file
* and all profiling files.
*/
root_node.dentry = debugfs_create_dir("gcov", NULL);
if (!root_node.dentry)
goto err_remove;
/*
* Create reset file which resets all profiling counts when written
* to.
*/
reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
NULL, &gcov_reset_fops);
if (!reset_dentry)
goto err_remove;
/* Replay previous events to get our fs hierarchy up-to-date. */
gcov_enable_events();
return 0;
err_remove:
pr_err("init failed\n");
debugfs_remove(root_node.dentry);
return rc;
}
device_initcall(gcov_fs_init);

562
kernel/gcov/gcc_3_4.c Normal file
View file

@ -0,0 +1,562 @@
/*
* This code provides functions to handle gcc's profiling data format
* introduced with gcc 3.4. Future versions of gcc may change the gcov
* format (as happened before), so all format-specific information needs
* to be kept modular and easily exchangeable.
*
* This file is based on gcc-internal definitions. Functions and data
* structures are defined to be compatible with gcc counterparts.
* For a better understanding, refer to gcc source: gcc/gcov-io.h.
*
* Copyright IBM Corp. 2009
* Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
*
* Uses gcc-internal data definitions.
*/
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include "gcov.h"
#define GCOV_COUNTERS 5
static struct gcov_info *gcov_info_head;
/**
* struct gcov_fn_info - profiling meta data per function
* @ident: object file-unique function identifier
* @checksum: function checksum
* @n_ctrs: number of values per counter type belonging to this function
*
* This data is generated by gcc during compilation and doesn't change
* at run-time.
*/
struct gcov_fn_info {
unsigned int ident;
unsigned int checksum;
unsigned int n_ctrs[0];
};
/**
* struct gcov_ctr_info - profiling data per counter type
* @num: number of counter values for this type
* @values: array of counter values for this type
* @merge: merge function for counter values of this type (unused)
*
* This data is generated by gcc during compilation and doesn't change
* at run-time with the exception of the values array.
*/
struct gcov_ctr_info {
unsigned int num;
gcov_type *values;
void (*merge)(gcov_type *, unsigned int);
};
/**
* struct gcov_info - profiling data per object file
* @version: gcov version magic indicating the gcc version used for compilation
* @next: list head for a singly-linked list
* @stamp: time stamp
* @filename: name of the associated gcov data file
* @n_functions: number of instrumented functions
* @functions: function data
* @ctr_mask: mask specifying which counter types are active
* @counts: counter data per counter type
*
* This data is generated by gcc during compilation and doesn't change
* at run-time with the exception of the next pointer.
*/
struct gcov_info {
unsigned int version;
struct gcov_info *next;
unsigned int stamp;
const char *filename;
unsigned int n_functions;
const struct gcov_fn_info *functions;
unsigned int ctr_mask;
struct gcov_ctr_info counts[0];
};
/**
* gcov_info_filename - return info filename
* @info: profiling data set
*/
const char *gcov_info_filename(struct gcov_info *info)
{
return info->filename;
}
/**
* gcov_info_version - return info version
* @info: profiling data set
*/
unsigned int gcov_info_version(struct gcov_info *info)
{
return info->version;
}
/**
* gcov_info_next - return next profiling data set
* @info: profiling data set
*
* Returns next gcov_info following @info or first gcov_info in the chain if
* @info is %NULL.
*/
struct gcov_info *gcov_info_next(struct gcov_info *info)
{
if (!info)
return gcov_info_head;
return info->next;
}
/**
* gcov_info_link - link/add profiling data set to the list
* @info: profiling data set
*/
void gcov_info_link(struct gcov_info *info)
{
info->next = gcov_info_head;
gcov_info_head = info;
}
/**
* gcov_info_unlink - unlink/remove profiling data set from the list
* @prev: previous profiling data set
* @info: profiling data set
*/
void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
{
if (prev)
prev->next = info->next;
else
gcov_info_head = info->next;
}
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
{ 0, NULL},
};
/*
* Determine whether a counter is active. Based on gcc magic. Doesn't change
* at run-time.
*/
static int counter_active(struct gcov_info *info, unsigned int type)
{
return (1 << type) & info->ctr_mask;
}
/* Determine number of active counters. Based on gcc magic. */
static unsigned int num_counter_active(struct gcov_info *info)
{
unsigned int i;
unsigned int result = 0;
for (i = 0; i < GCOV_COUNTERS; i++) {
if (counter_active(info, i))
result++;
}
return result;
}
/**
* gcov_info_reset - reset profiling data to zero
* @info: profiling data set
*/
void gcov_info_reset(struct gcov_info *info)
{
unsigned int active = num_counter_active(info);
unsigned int i;
for (i = 0; i < active; i++) {
memset(info->counts[i].values, 0,
info->counts[i].num * sizeof(gcov_type));
}
}
/**
* gcov_info_is_compatible - check if profiling data can be added
* @info1: first profiling data set
* @info2: second profiling data set
*
* Returns non-zero if profiling data can be added, zero otherwise.
*/
int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
{
return (info1->stamp == info2->stamp);
}
/**
* gcov_info_add - add up profiling data
* @dest: profiling data set to which data is added
* @source: profiling data set which is added
*
* Adds profiling counts of @source to @dest.
*/
void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
{
unsigned int i;
unsigned int j;
for (i = 0; i < num_counter_active(dest); i++) {
for (j = 0; j < dest->counts[i].num; j++) {
dest->counts[i].values[j] +=
source->counts[i].values[j];
}
}
}
/* Get size of function info entry. Based on gcc magic. */
static size_t get_fn_size(struct gcov_info *info)
{
size_t size;
size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
sizeof(unsigned int);
if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
size = ALIGN(size, __alignof__(struct gcov_fn_info));
return size;
}
/* Get address of function info entry. Based on gcc magic. */
static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
{
return (struct gcov_fn_info *)
((char *) info->functions + fn * get_fn_size(info));
}
/**
* gcov_info_dup - duplicate profiling data set
* @info: profiling data set to duplicate
*
* Return newly allocated duplicate on success, %NULL on error.
*/
struct gcov_info *gcov_info_dup(struct gcov_info *info)
{
struct gcov_info *dup;
unsigned int i;
unsigned int active;
/* Duplicate gcov_info. */
active = num_counter_active(info);
dup = kzalloc(sizeof(struct gcov_info) +
sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
if (!dup)
return NULL;
dup->version = info->version;
dup->stamp = info->stamp;
dup->n_functions = info->n_functions;
dup->ctr_mask = info->ctr_mask;
/* Duplicate filename. */
dup->filename = kstrdup(info->filename, GFP_KERNEL);
if (!dup->filename)
goto err_free;
/* Duplicate table of functions. */
dup->functions = kmemdup(info->functions, info->n_functions *
get_fn_size(info), GFP_KERNEL);
if (!dup->functions)
goto err_free;
/* Duplicate counter arrays. */
for (i = 0; i < active ; i++) {
struct gcov_ctr_info *ctr = &info->counts[i];
size_t size = ctr->num * sizeof(gcov_type);
dup->counts[i].num = ctr->num;
dup->counts[i].merge = ctr->merge;
dup->counts[i].values = vmalloc(size);
if (!dup->counts[i].values)
goto err_free;
memcpy(dup->counts[i].values, ctr->values, size);
}
return dup;
err_free:
gcov_info_free(dup);
return NULL;
}
/**
* gcov_info_free - release memory for profiling data set duplicate
* @info: profiling data set duplicate to free
*/
void gcov_info_free(struct gcov_info *info)
{
unsigned int active = num_counter_active(info);
unsigned int i;
for (i = 0; i < active ; i++)
vfree(info->counts[i].values);
kfree(info->functions);
kfree(info->filename);
kfree(info);
}
/**
* struct type_info - iterator helper array
* @ctr_type: counter type
* @offset: index of the first value of the current function for this type
*
* This array is needed to convert the in-memory data format into the in-file
* data format:
*
* In-memory:
* for each counter type
* for each function
* values
*
* In-file:
* for each function
* for each counter type
* values
*
* See gcc source gcc/gcov-io.h for more information on data organization.
*/
struct type_info {
int ctr_type;
unsigned int offset;
};
/**
* struct gcov_iterator - specifies current file position in logical records
* @info: associated profiling data
* @record: record type
* @function: function number
* @type: counter type
* @count: index into values array
* @num_types: number of counter types
* @type_info: helper array to get values-array offset for current function
*/
struct gcov_iterator {
struct gcov_info *info;
int record;
unsigned int function;
unsigned int type;
unsigned int count;
int num_types;
struct type_info type_info[0];
};
static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
{
return get_fn_info(iter->info, iter->function);
}
static struct type_info *get_type(struct gcov_iterator *iter)
{
return &iter->type_info[iter->type];
}
/**
* gcov_iter_new - allocate and initialize profiling data iterator
* @info: profiling data set to be iterated
*
* Return file iterator on success, %NULL otherwise.
*/
struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
{
struct gcov_iterator *iter;
iter = kzalloc(sizeof(struct gcov_iterator) +
num_counter_active(info) * sizeof(struct type_info),
GFP_KERNEL);
if (iter)
iter->info = info;
return iter;
}
/**
* gcov_iter_free - release memory for iterator
* @iter: file iterator to free
*/
void gcov_iter_free(struct gcov_iterator *iter)
{
kfree(iter);
}
/**
* gcov_iter_get_info - return profiling data set for given file iterator
* @iter: file iterator
*/
struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
{
return iter->info;
}
/**
* gcov_iter_start - reset file iterator to starting position
* @iter: file iterator
*/
void gcov_iter_start(struct gcov_iterator *iter)
{
int i;
iter->record = 0;
iter->function = 0;
iter->type = 0;
iter->count = 0;
iter->num_types = 0;
for (i = 0; i < GCOV_COUNTERS; i++) {
if (counter_active(iter->info, i)) {
iter->type_info[iter->num_types].ctr_type = i;
iter->type_info[iter->num_types++].offset = 0;
}
}
}
/* Mapping of logical record number to actual file content. */
#define RECORD_FILE_MAGIC 0
#define RECORD_GCOV_VERSION 1
#define RECORD_TIME_STAMP 2
#define RECORD_FUNCTION_TAG 3
#define RECORD_FUNCTON_TAG_LEN 4
#define RECORD_FUNCTION_IDENT 5
#define RECORD_FUNCTION_CHECK 6
#define RECORD_COUNT_TAG 7
#define RECORD_COUNT_LEN 8
#define RECORD_COUNT 9
/**
* gcov_iter_next - advance file iterator to next logical record
* @iter: file iterator
*
* Return zero if new position is valid, non-zero if iterator has reached end.
*/
int gcov_iter_next(struct gcov_iterator *iter)
{
switch (iter->record) {
case RECORD_FILE_MAGIC:
case RECORD_GCOV_VERSION:
case RECORD_FUNCTION_TAG:
case RECORD_FUNCTON_TAG_LEN:
case RECORD_FUNCTION_IDENT:
case RECORD_COUNT_TAG:
/* Advance to next record */
iter->record++;
break;
case RECORD_COUNT:
/* Advance to next count */
iter->count++;
/* fall through */
case RECORD_COUNT_LEN:
if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
iter->record = 9; /* RECORD_COUNT */
break;
}
/* Advance to next counter type */
get_type(iter)->offset += iter->count;
iter->count = 0;
iter->type++;
/* fall through */
case RECORD_FUNCTION_CHECK:
if (iter->type < iter->num_types) {
iter->record = 7; /* RECORD_COUNT_TAG */
break;
}
/* Advance to next function */
iter->type = 0;
iter->function++;
/* fall through */
case RECORD_TIME_STAMP:
if (iter->function < iter->info->n_functions)
iter->record = 3; /* RECORD_FUNCTION_TAG */
else
iter->record = -1; /* EOF */
break;
}
/* Check for EOF. */
if (iter->record == -1)
return -EINVAL;
else
return 0;
}
/**
* seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
* @seq: seq_file handle
* @v: value to be stored
*
* Number format defined by gcc: numbers are recorded in the 32 bit
* unsigned binary form of the endianness of the machine generating the
* file.
*/
static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
{
return seq_write(seq, &v, sizeof(v));
}
/**
* seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
* @seq: seq_file handle
* @v: value to be stored
*
* Number format defined by gcc: numbers are recorded in the 32 bit
* unsigned binary form of the endianness of the machine generating the
* file. 64 bit numbers are stored as two 32 bit numbers, the low part
* first.
*/
static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
{
u32 data[2];
data[0] = (v & 0xffffffffUL);
data[1] = (v >> 32);
return seq_write(seq, data, sizeof(data));
}
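As a worked illustration of the doc comment above, a stand-alone sketch (not part of the kernel sources) of the low-word-first split performed by seq_write_gcov_u64():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t v = 0x1122334455667788ULL;
	/* Same split as seq_write_gcov_u64(): low 32 bits first, then high. */
	uint32_t data[2] = { (uint32_t)(v & 0xffffffffUL), (uint32_t)(v >> 32) };

	printf("low 0x%x high 0x%x\n", (unsigned)data[0], (unsigned)data[1]);
	return 0;
}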
/**
* gcov_iter_write - write data for current pos to seq_file
* @iter: file iterator
* @seq: seq_file handle
*
* Return zero on success, non-zero otherwise.
*/
int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
{
int rc = -EINVAL;
switch (iter->record) {
case RECORD_FILE_MAGIC:
rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
break;
case RECORD_GCOV_VERSION:
rc = seq_write_gcov_u32(seq, iter->info->version);
break;
case RECORD_TIME_STAMP:
rc = seq_write_gcov_u32(seq, iter->info->stamp);
break;
case RECORD_FUNCTION_TAG:
rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
break;
case RECORD_FUNCTON_TAG_LEN:
rc = seq_write_gcov_u32(seq, 2);
break;
case RECORD_FUNCTION_IDENT:
rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
break;
case RECORD_FUNCTION_CHECK:
rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
break;
case RECORD_COUNT_TAG:
rc = seq_write_gcov_u32(seq,
GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
break;
case RECORD_COUNT_LEN:
rc = seq_write_gcov_u32(seq,
get_func(iter)->n_ctrs[iter->type] * 2);
break;
case RECORD_COUNT:
rc = seq_write_gcov_u64(seq,
iter->info->counts[iter->type].
values[iter->count + get_type(iter)->offset]);
break;
}
return rc;
}

565
kernel/gcov/gcc_4_7.c Normal file
View file

@ -0,0 +1,565 @@
/*
* This code provides functions to handle gcc's profiling data format
* introduced with gcc 4.7.
*
* This file is based heavily on gcc_3_4.c file.
*
* For a better understanding, refer to gcc source:
* gcc/gcov-io.h
* libgcc/libgcov.c
*
* Uses gcc-internal data definitions.
*/
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include "gcov.h"
#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
#else
#define GCOV_COUNTERS 8
#endif
#define GCOV_TAG_FUNCTION_LENGTH 3
static struct gcov_info *gcov_info_head;
/**
* struct gcov_ctr_info - information about counters for a single function
* @num: number of counter values for this type
* @values: array of counter values for this type
*
* This data is generated by gcc during compilation and doesn't change
* at run-time with the exception of the values array.
*/
struct gcov_ctr_info {
unsigned int num;
gcov_type *values;
};
/**
* struct gcov_fn_info - profiling meta data per function
* @key: comdat key
* @ident: unique ident of function
* @lineno_checksum: function lineno checksum
* @cfg_checksum: function cfg checksum
* @ctrs: instrumented counters
*
* This data is generated by gcc during compilation and doesn't change
* at run-time.
*
* Information about a single function. This uses the trailing array
* idiom. The number of counters is determined from the merge pointer
* array in gcov_info. The key is used to detect which of a set of
* comdat functions was selected -- it points to the gcov_info object
* of the object file containing the selected comdat function.
*/
struct gcov_fn_info {
const struct gcov_info *key;
unsigned int ident;
unsigned int lineno_checksum;
unsigned int cfg_checksum;
struct gcov_ctr_info ctrs[0];
};
/**
* struct gcov_info - profiling data per object file
* @version: gcov version magic indicating the gcc version used for compilation
* @next: list head for a singly-linked list
* @stamp: uniquifying time stamp
* @filename: name of the associated gcov data file
* @merge: merge functions (null for unused counter type)
* @n_functions: number of instrumented functions
* @functions: pointer to pointers to function information
*
* This data is generated by gcc during compilation and doesn't change
* at run-time with the exception of the next pointer.
*/
struct gcov_info {
unsigned int version;
struct gcov_info *next;
unsigned int stamp;
const char *filename;
void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
unsigned int n_functions;
struct gcov_fn_info **functions;
};
/**
* gcov_info_filename - return info filename
* @info: profiling data set
*/
const char *gcov_info_filename(struct gcov_info *info)
{
return info->filename;
}
/**
* gcov_info_version - return info version
* @info: profiling data set
*/
unsigned int gcov_info_version(struct gcov_info *info)
{
return info->version;
}
/**
* gcov_info_next - return next profiling data set
* @info: profiling data set
*
* Returns next gcov_info following @info or first gcov_info in the chain if
* @info is %NULL.
*/
struct gcov_info *gcov_info_next(struct gcov_info *info)
{
if (!info)
return gcov_info_head;
return info->next;
}
/**
* gcov_info_link - link/add profiling data set to the list
* @info: profiling data set
*/
void gcov_info_link(struct gcov_info *info)
{
info->next = gcov_info_head;
gcov_info_head = info;
}
/**
* gcov_info_unlink - unlink/remove profiling data set from the list
* @prev: previous profiling data set
* @info: profiling data set
*/
void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
{
if (prev)
prev->next = info->next;
else
gcov_info_head = info->next;
}
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
{ 0, NULL},
};
/*
* Determine whether a counter is active. Doesn't change at run-time.
*/
static int counter_active(struct gcov_info *info, unsigned int type)
{
return info->merge[type] ? 1 : 0;
}
/* Determine number of active counters. Based on gcc magic. */
static unsigned int num_counter_active(struct gcov_info *info)
{
unsigned int i;
unsigned int result = 0;
for (i = 0; i < GCOV_COUNTERS; i++) {
if (counter_active(info, i))
result++;
}
return result;
}
/**
* gcov_info_reset - reset profiling data to zero
* @info: profiling data set
*/
void gcov_info_reset(struct gcov_info *info)
{
struct gcov_ctr_info *ci_ptr;
unsigned int fi_idx;
unsigned int ct_idx;
for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
ci_ptr = info->functions[fi_idx]->ctrs;
for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
if (!counter_active(info, ct_idx))
continue;
memset(ci_ptr->values, 0,
sizeof(gcov_type) * ci_ptr->num);
ci_ptr++;
}
}
}
/**
* gcov_info_is_compatible - check if profiling data can be added
* @info1: first profiling data set
* @info2: second profiling data set
*
* Returns non-zero if profiling data can be added, zero otherwise.
*/
int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
{
return (info1->stamp == info2->stamp);
}
/**
* gcov_info_add - add up profiling data
* @dest: profiling data set to which data is added
* @source: profiling data set which is added
*
* Adds profiling counts of @source to @dest.
*/
void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
{
struct gcov_ctr_info *dci_ptr;
struct gcov_ctr_info *sci_ptr;
unsigned int fi_idx;
unsigned int ct_idx;
unsigned int val_idx;
for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) {
dci_ptr = dst->functions[fi_idx]->ctrs;
sci_ptr = src->functions[fi_idx]->ctrs;
for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
if (!counter_active(src, ct_idx))
continue;
for (val_idx = 0; val_idx < sci_ptr->num; val_idx++)
dci_ptr->values[val_idx] +=
sci_ptr->values[val_idx];
dci_ptr++;
sci_ptr++;
}
}
}
/**
* gcov_info_dup - duplicate profiling data set
* @info: profiling data set to duplicate
*
* Return newly allocated duplicate on success, %NULL on error.
*/
struct gcov_info *gcov_info_dup(struct gcov_info *info)
{
struct gcov_info *dup;
struct gcov_ctr_info *dci_ptr; /* dst counter info */
struct gcov_ctr_info *sci_ptr; /* src counter info */
unsigned int active;
unsigned int fi_idx; /* function info idx */
unsigned int ct_idx; /* counter type idx */
size_t fi_size; /* function info size */
size_t cv_size; /* counter values size */
dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
if (!dup)
return NULL;
dup->next = NULL;
dup->filename = NULL;
dup->functions = NULL;
dup->filename = kstrdup(info->filename, GFP_KERNEL);
if (!dup->filename)
goto err_free;
dup->functions = kcalloc(info->n_functions,
sizeof(struct gcov_fn_info *), GFP_KERNEL);
if (!dup->functions)
goto err_free;
active = num_counter_active(info);
fi_size = sizeof(struct gcov_fn_info);
fi_size += sizeof(struct gcov_ctr_info) * active;
for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL);
if (!dup->functions[fi_idx])
goto err_free;
*(dup->functions[fi_idx]) = *(info->functions[fi_idx]);
sci_ptr = info->functions[fi_idx]->ctrs;
dci_ptr = dup->functions[fi_idx]->ctrs;
for (ct_idx = 0; ct_idx < active; ct_idx++) {
cv_size = sizeof(gcov_type) * sci_ptr->num;
dci_ptr->values = vmalloc(cv_size);
if (!dci_ptr->values)
goto err_free;
dci_ptr->num = sci_ptr->num;
memcpy(dci_ptr->values, sci_ptr->values, cv_size);
sci_ptr++;
dci_ptr++;
}
}
return dup;
err_free:
gcov_info_free(dup);
return NULL;
}
/**
* gcov_info_free - release memory for profiling data set duplicate
* @info: profiling data set duplicate to free
*/
void gcov_info_free(struct gcov_info *info)
{
unsigned int active;
unsigned int fi_idx;
unsigned int ct_idx;
struct gcov_ctr_info *ci_ptr;
if (!info->functions)
goto free_info;
active = num_counter_active(info);
for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
if (!info->functions[fi_idx])
continue;
ci_ptr = info->functions[fi_idx]->ctrs;
for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
vfree(ci_ptr->values);
kfree(info->functions[fi_idx]);
}
free_info:
kfree(info->functions);
kfree(info->filename);
kfree(info);
}
#define ITER_STRIDE PAGE_SIZE
/**
* struct gcov_iterator - specifies current file position in logical records
* @info: associated profiling data
* @buffer: buffer containing file data
* @size: size of buffer
* @pos: current position in file
*/
struct gcov_iterator {
struct gcov_info *info;
void *buffer;
size_t size;
loff_t pos;
};
/**
* store_gcov_u32 - store 32 bit number in gcov format to buffer
* @buffer: target buffer or NULL
* @off: offset into the buffer
* @v: value to be stored
*
* Number format defined by gcc: numbers are recorded in the 32 bit
* unsigned binary form of the endianness of the machine generating the
* file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
* store anything.
*/
static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
{
u32 *data;
if (buffer) {
data = buffer + off;
*data = v;
}
return sizeof(*data);
}
/**
* store_gcov_u64 - store 64 bit number in gcov format to buffer
* @buffer: target buffer or NULL
* @off: offset into the buffer
* @v: value to be stored
*
* Number format defined by gcc: numbers are recorded in the 32 bit
* unsigned binary form of the endianness of the machine generating the
* file. 64 bit numbers are stored as two 32 bit numbers, the low part
* first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
* anything.
*/
static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
{
u32 *data;
if (buffer) {
data = buffer + off;
data[0] = (v & 0xffffffffUL);
data[1] = (v >> 32);
}
return sizeof(*data) * 2;
}
/**
* convert_to_gcda - convert profiling data set to gcda file format
* @buffer: the buffer to store file data or %NULL if no data should be stored
* @info: profiling data set to be converted
*
* Returns the number of bytes that were/would have been stored into the buffer.
*/
static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
{
struct gcov_fn_info *fi_ptr;
struct gcov_ctr_info *ci_ptr;
unsigned int fi_idx;
unsigned int ct_idx;
unsigned int cv_idx;
size_t pos = 0;
/* File header. */
pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
pos += store_gcov_u32(buffer, pos, info->version);
pos += store_gcov_u32(buffer, pos, info->stamp);
for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
fi_ptr = info->functions[fi_idx];
/* Function record. */
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
ci_ptr = fi_ptr->ctrs;
for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
if (!counter_active(info, ct_idx))
continue;
/* Counter record. */
pos += store_gcov_u32(buffer, pos,
GCOV_TAG_FOR_COUNTER(ct_idx));
pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
pos += store_gcov_u64(buffer, pos,
ci_ptr->values[cv_idx]);
}
ci_ptr++;
}
}
return pos;
}
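Since convert_to_gcda() emits a plain three-word header (magic, version, stamp) followed by the records, the synthesized files under debugfs can be inspected with ordinary reads. A minimal user-space sketch, assuming debugfs is mounted at /sys/kernel/debug and a .gcda path is passed as the first argument:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	uint32_t hdr[3];	/* magic, version, stamp - see convert_to_gcda() */
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <path to .gcda under debugfs>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || read(fd, hdr, sizeof(hdr)) != (ssize_t)sizeof(hdr)) {
		perror(argv[1]);
		return 1;
	}
	printf("magic 0x%x version 0x%x stamp 0x%x\n",
	       (unsigned)hdr[0], (unsigned)hdr[1], (unsigned)hdr[2]);
	close(fd);
	return 0;
}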
/**
* gcov_iter_new - allocate and initialize profiling data iterator
* @info: profiling data set to be iterated
*
* Return file iterator on success, %NULL otherwise.
*/
struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
{
struct gcov_iterator *iter;
iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
if (!iter)
goto err_free;
iter->info = info;
/* Dry-run to get the actual buffer size. */
iter->size = convert_to_gcda(NULL, info);
iter->buffer = vmalloc(iter->size);
if (!iter->buffer)
goto err_free;
convert_to_gcda(iter->buffer, info);
return iter;
err_free:
kfree(iter);
return NULL;
}
/**
* gcov_iter_free - release memory for iterator
* @iter: file iterator
*/
void gcov_iter_free(struct gcov_iterator *iter)
{
vfree(iter->buffer);
kfree(iter);
}
/**
* gcov_iter_get_info - return profiling data set for given file iterator
* @iter: file iterator
*/
struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
{
return iter->info;
}
/**
* gcov_iter_start - reset file iterator to starting position
* @iter: file iterator
*/
void gcov_iter_start(struct gcov_iterator *iter)
{
iter->pos = 0;
}
/**
* gcov_iter_next - advance file iterator to next logical record
* @iter: file iterator
*
* Return zero if new position is valid, non-zero if iterator has reached end.
*/
int gcov_iter_next(struct gcov_iterator *iter)
{
if (iter->pos < iter->size)
iter->pos += ITER_STRIDE;
if (iter->pos >= iter->size)
return -EINVAL;
return 0;
}
/**
* gcov_iter_write - write data for current pos to seq_file
* @iter: file iterator
* @seq: seq_file handle
*
* Return zero on success, non-zero otherwise.
*/
int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
{
size_t len;
if (iter->pos >= iter->size)
return -EINVAL;
len = ITER_STRIDE;
if (iter->pos + len > iter->size)
len = iter->size - iter->pos;
seq_write(seq, iter->buffer + iter->pos, len);
return 0;
}

85
kernel/gcov/gcov.h Normal file
View file

@ -0,0 +1,85 @@
/*
* Profiling infrastructure declarations.
*
* This file is based on gcc-internal definitions. Data structures are
* defined to be compatible with gcc counterparts. For a better
* understanding, refer to gcc source: gcc/gcov-io.h.
*
* Copyright IBM Corp. 2009
* Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
*
* Uses gcc-internal data definitions.
*/
#ifndef GCOV_H
#define GCOV_H GCOV_H
#include <linux/types.h>
/*
* Profiling data types used for gcc 3.4 and above - these are defined by
* gcc and need to be kept as close to the original definition as possible to
* remain compatible.
*/
#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
#define GCOV_TAG_FOR_COUNTER(count) \
(GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
#if BITS_PER_LONG >= 64
typedef long gcov_type;
#else
typedef long long gcov_type;
#endif
/* Opaque gcov_info. The gcov structures can change between gcc versions (as
 * they did in gcc 4.7), so the full definition cannot live here; it belongs in
 * the gcc-specific implementation of gcov. This also means that generic code
 * must not access members directly and has to use the interface below. */
struct gcov_info;
/* Interface to access gcov_info data */
const char *gcov_info_filename(struct gcov_info *info);
unsigned int gcov_info_version(struct gcov_info *info);
struct gcov_info *gcov_info_next(struct gcov_info *info);
void gcov_info_link(struct gcov_info *info);
void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
/* Base interface. */
enum gcov_action {
GCOV_ADD,
GCOV_REMOVE,
};
void gcov_event(enum gcov_action action, struct gcov_info *info);
void gcov_enable_events(void);
/* Iterator control. */
struct seq_file;
struct gcov_iterator;
struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
void gcov_iter_free(struct gcov_iterator *iter);
void gcov_iter_start(struct gcov_iterator *iter);
int gcov_iter_next(struct gcov_iterator *iter);
int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
/* gcov_info control. */
void gcov_info_reset(struct gcov_info *info);
int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
struct gcov_info *gcov_info_dup(struct gcov_info *info);
void gcov_info_free(struct gcov_info *info);
struct gcov_link {
enum {
OBJ_TREE,
SRC_TREE,
} dir;
const char *ext;
};
extern const struct gcov_link gcov_link[];
#endif /* GCOV_H */

280
kernel/groups.c Normal file
View file

@ -0,0 +1,280 @@
/*
* Supplementary group IDs
*/
#include <linux/cred.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/user_namespace.h>
#include <asm/uaccess.h>
/* init to 2 - one for init_task, one to ensure it is never freed */
struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
struct group_info *groups_alloc(int gidsetsize)
{
struct group_info *group_info;
int nblocks;
int i;
nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
/* Make sure we always allocate at least one indirect block pointer */
nblocks = nblocks ? : 1;
group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
if (!group_info)
return NULL;
group_info->ngroups = gidsetsize;
group_info->nblocks = nblocks;
atomic_set(&group_info->usage, 1);
if (gidsetsize <= NGROUPS_SMALL)
group_info->blocks[0] = group_info->small_block;
else {
for (i = 0; i < nblocks; i++) {
kgid_t *b;
b = (void *)__get_free_page(GFP_USER);
if (!b)
goto out_undo_partial_alloc;
group_info->blocks[i] = b;
}
}
return group_info;
out_undo_partial_alloc:
while (--i >= 0) {
free_page((unsigned long)group_info->blocks[i]);
}
kfree(group_info);
return NULL;
}
EXPORT_SYMBOL(groups_alloc);
void groups_free(struct group_info *group_info)
{
if (group_info->blocks[0] != group_info->small_block) {
int i;
for (i = 0; i < group_info->nblocks; i++)
free_page((unsigned long)group_info->blocks[i]);
}
kfree(group_info);
}
EXPORT_SYMBOL(groups_free);
/* export the group_info to a user-space array */
static int groups_to_user(gid_t __user *grouplist,
const struct group_info *group_info)
{
struct user_namespace *user_ns = current_user_ns();
int i;
unsigned int count = group_info->ngroups;
for (i = 0; i < count; i++) {
gid_t gid;
gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i));
if (put_user(gid, grouplist+i))
return -EFAULT;
}
return 0;
}
/* fill a group_info from a user-space array - it must be allocated already */
static int groups_from_user(struct group_info *group_info,
gid_t __user *grouplist)
{
struct user_namespace *user_ns = current_user_ns();
int i;
unsigned int count = group_info->ngroups;
for (i = 0; i < count; i++) {
gid_t gid;
kgid_t kgid;
if (get_user(gid, grouplist+i))
return -EFAULT;
kgid = make_kgid(user_ns, gid);
if (!gid_valid(kgid))
return -EINVAL;
GROUP_AT(group_info, i) = kgid;
}
return 0;
}
/* a simple Shell sort */
static void groups_sort(struct group_info *group_info)
{
int base, max, stride;
int gidsetsize = group_info->ngroups;
for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
; /* nothing */
stride /= 3;
while (stride) {
max = gidsetsize - stride;
for (base = 0; base < max; base++) {
int left = base;
int right = left + stride;
kgid_t tmp = GROUP_AT(group_info, right);
while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) {
GROUP_AT(group_info, right) =
GROUP_AT(group_info, left);
right = left;
left -= stride;
}
GROUP_AT(group_info, right) = tmp;
}
stride /= 3;
}
}
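For clarity, a stand-alone sketch (not part of the kernel sources) of the same 3*stride+1 gap sequence used by groups_sort() above, applied to a plain int array:

#include <stdio.h>

static void shell_sort(int *a, int n)
{
	int base, max, stride;

	for (stride = 1; stride < n; stride = 3 * stride + 1)
		; /* find the largest gap below n */
	stride /= 3;
	while (stride) {
		max = n - stride;
		for (base = 0; base < max; base++) {
			int left = base;
			int right = left + stride;
			int tmp = a[right];

			/* gapped insertion, exactly as in groups_sort() */
			while (left >= 0 && a[left] > tmp) {
				a[right] = a[left];
				right = left;
				left -= stride;
			}
			a[right] = tmp;
		}
		stride /= 3;
	}
}

int main(void)
{
	int v[] = { 5, 2, 9, 1, 7 };
	int i;

	shell_sort(v, 5);
	for (i = 0; i < 5; i++)
		printf("%d ", v[i]);
	printf("\n");
	return 0;
}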
/* a simple bsearch */
int groups_search(const struct group_info *group_info, kgid_t grp)
{
unsigned int left, right;
if (!group_info)
return 0;
left = 0;
right = group_info->ngroups;
while (left < right) {
unsigned int mid = (left+right)/2;
if (gid_gt(grp, GROUP_AT(group_info, mid)))
left = mid + 1;
else if (gid_lt(grp, GROUP_AT(group_info, mid)))
right = mid;
else
return 1;
}
return 0;
}
/**
* set_groups - Change a group subscription in a set of credentials
* @new: The newly prepared set of credentials to alter
* @group_info: The group list to install
*/
void set_groups(struct cred *new, struct group_info *group_info)
{
put_group_info(new->group_info);
groups_sort(group_info);
get_group_info(group_info);
new->group_info = group_info;
}
EXPORT_SYMBOL(set_groups);
/**
* set_current_groups - Change current's group subscription
* @group_info: The group list to impose
*
* Validate a group subscription and, if valid, impose it upon current's task
* security record.
*/
int set_current_groups(struct group_info *group_info)
{
struct cred *new;
new = prepare_creds();
if (!new)
return -ENOMEM;
set_groups(new, group_info);
return commit_creds(new);
}
EXPORT_SYMBOL(set_current_groups);
SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
{
const struct cred *cred = current_cred();
int i;
if (gidsetsize < 0)
return -EINVAL;
/* no need to grab task_lock here; it cannot change */
i = cred->group_info->ngroups;
if (gidsetsize) {
if (i > gidsetsize) {
i = -EINVAL;
goto out;
}
if (groups_to_user(grouplist, cred->group_info)) {
i = -EFAULT;
goto out;
}
}
out:
return i;
}
bool may_setgroups(void)
{
struct user_namespace *user_ns = current_user_ns();
return ns_capable(user_ns, CAP_SETGID) &&
userns_may_setgroups(user_ns);
}
/*
* SMP: Our groups are copy-on-write. We can set them safely
* without another task interfering.
*/
SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
{
struct group_info *group_info;
int retval;
if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
group_info = groups_alloc(gidsetsize);
if (!group_info)
return -ENOMEM;
retval = groups_from_user(group_info, grouplist);
if (retval) {
put_group_info(group_info);
return retval;
}
retval = set_current_groups(group_info);
put_group_info(group_info);
return retval;
}
/*
* Check whether we're fsgid/egid or in the supplemental group.
*/
int in_group_p(kgid_t grp)
{
const struct cred *cred = current_cred();
int retval = 1;
if (!gid_eq(grp, cred->fsgid))
retval = groups_search(cred->group_info, grp);
return retval;
}
EXPORT_SYMBOL(in_group_p);
int in_egroup_p(kgid_t grp)
{
const struct cred *cred = current_cred();
int retval = 1;
if (!gid_eq(grp, cred->egid))
retval = groups_search(cred->group_info, grp);
return retval;
}
EXPORT_SYMBOL(in_egroup_p);

251
kernel/hung_task.c Normal file
View file

@ -0,0 +1,251 @@
/*
* Detect Hung Task
*
* kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
*
*/
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/export.h>
#include <linux/sysctl.h>
#include <linux/utsname.h>
#include <trace/events/sched.h>
/*
* The number of tasks checked:
*/
int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Limit number of tasks checked in a batch.
*
* This value controls the preemptibility of khungtaskd since preemption
* is disabled during the critical section. It also controls the size of
* the RCU grace period, so it needs an upper bound.
*/
#define HUNG_TASK_BATCHING 1024
/*
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
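/* Tunable at runtime via /proc/sys/kernel/hung_task_timeout_secs (seconds). */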
int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
static struct task_struct *watchdog_task;
/*
* Should we panic (and reboot, if panic_timeout= is set) when a
* hung task is detected:
*/
unsigned int __read_mostly sysctl_hung_task_panic =
CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
static int __init hung_task_panic_setup(char *str)
{
int rc = kstrtouint(str, 0, &sysctl_hung_task_panic);
if (rc)
return rc;
return 1;
}
__setup("hung_task_panic=", hung_task_panic_setup);
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
did_panic = 1;
return NOTIFY_DONE;
}
static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
/*
* Ensure the task is not frozen.
* Also, skip vfork and any other user process that freezer should skip.
*/
if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
return;
/*
* When a freshly created task is scheduled once and changes its state to
* TASK_UNINTERRUPTIBLE without ever having been switched out, it
* mustn't be checked.
*/
if (unlikely(!switch_count))
return;
if (switch_count != t->last_switch_count) {
t->last_switch_count = switch_count;
return;
}
trace_sched_process_hang(t);
if (!sysctl_hung_task_warnings)
return;
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
/*
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
t->comm, t->pid, timeout);
pr_err(" %s %s %.*s\n",
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
debug_show_held_locks(t);
touch_nmi_watchdog();
if (sysctl_hung_task_panic) {
trigger_all_cpu_backtrace();
panic("hung_task: blocked tasks");
}
}
/*
* To avoid extending the RCU grace period for an unbounded amount of time,
* periodically exit the critical section and enter a new one.
*
* For preemptible RCU it is sufficient to call rcu_read_unlock in order
* to exit the grace period. For classic RCU, a reschedule is required.
*/
static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
bool can_cont;
get_task_struct(g);
get_task_struct(t);
rcu_read_unlock();
cond_resched();
rcu_read_lock();
can_cont = pid_alive(g) && pid_alive(t);
put_task_struct(t);
put_task_struct(g);
return can_cont;
}
/*
* Check whether a TASK_UNINTERRUPTIBLE task does not get woken up for
* a really long time (120 seconds). If that happens, print out
* a warning.
*/
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
int max_count = sysctl_hung_task_check_count;
int batch_count = HUNG_TASK_BATCHING;
struct task_struct *g, *t;
/*
* If the system crashed already then all bets are off,
* do not report extra hung tasks:
*/
if (test_taint(TAINT_DIE) || did_panic)
return;
rcu_read_lock();
do_each_thread(g, t) {
if (!max_count--)
goto unlock;
if (!--batch_count) {
batch_count = HUNG_TASK_BATCHING;
if (!rcu_lock_break(g, t))
goto unlock;
}
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
} while_each_thread(g, t);
unlock:
rcu_read_unlock();
}
static unsigned long timeout_jiffies(unsigned long timeout)
{
/* timeout of 0 will disable the watchdog */
return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
}
/*
* Process updating of timeout sysctl
*/
int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto out;
wake_up_process(watchdog_task);
out:
return ret;
}
static atomic_t reset_hung_task = ATOMIC_INIT(0);
void reset_hung_task_detector(void)
{
atomic_set(&reset_hung_task, 1);
}
EXPORT_SYMBOL_GPL(reset_hung_task_detector);
/*
* kthread which checks for tasks stuck in D state
*/
static int watchdog(void *dummy)
{
set_user_nice(current, 0);
for ( ; ; ) {
unsigned long timeout = sysctl_hung_task_timeout_secs;
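/*
* Sleep for the full timeout; an early wakeup (e.g. a sysctl write)
* restarts the sleep with the possibly updated value.
*/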
while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
timeout = sysctl_hung_task_timeout_secs;
if (atomic_xchg(&reset_hung_task, 0))
continue;
check_hung_uninterruptible_tasks(timeout);
}
return 0;
}
static int __init hung_task_init(void)
{
atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
return 0;
}
subsys_initcall(hung_task_init);

88
kernel/irq/Kconfig Normal file
View file

@ -0,0 +1,88 @@
menu "IRQ subsystem"
# Options selectable by the architecture code
# Make sparse irq Kconfig switch below available
config MAY_HAVE_SPARSE_IRQ
bool
# Legacy support, required for itanic
config GENERIC_IRQ_LEGACY
bool
# Enable the generic irq autoprobe mechanism
config GENERIC_IRQ_PROBE
bool
# Use the generic /proc/interrupts implementation
config GENERIC_IRQ_SHOW
bool
# Print level/edge extra information
config GENERIC_IRQ_SHOW_LEVEL
bool
# Facility to allocate a hardware interrupt. This is legacy support
# and should not be used in new code. Use irq domains instead.
config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
bool
# Support for delayed migration from interrupt context
config GENERIC_PENDING_IRQ
bool
# Alpha specific irq affinity mechanism
config AUTO_IRQ_AFFINITY
bool
# Tasklet based software resend for pending interrupts on enable_irq()
config HARDIRQS_SW_RESEND
bool
# Preflow handler support for fasteoi (sparc64)
config IRQ_PREFLOW_FASTEOI
bool
# Edge style eoi based handler (cell)
config IRQ_EDGE_EOI_HANDLER
bool
# Generic configurable interrupt chip implementation
config GENERIC_IRQ_CHIP
bool
select IRQ_DOMAIN
# Generic irq_domain hw <--> linux irq number translation
config IRQ_DOMAIN
bool
config HANDLE_DOMAIN_IRQ
bool
config IRQ_DOMAIN_DEBUG
bool "Expose hardware/virtual IRQ mapping via debugfs"
depends on IRQ_DOMAIN && DEBUG_FS
help
This option will show the mapping relationship between hardware irq
numbers and Linux irq numbers. The mapping is exposed via debugfs
in the file "irq_domain_mapping".
If you don't know what this means you don't need it.
# Support forced irq threading
config IRQ_FORCED_THREADING
bool
config SPARSE_IRQ
bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
---help---
Sparse irq numbering is useful for distro kernels that want
to define a high CONFIG_NR_CPUS value but still want to have
low kernel memory footprint on smaller machines.
( Sparse irqs can also be beneficial on NUMA boxes, as they spread
out the interrupt descriptors in a more NUMA-friendly way. )
If you don't know what to do here, say N.
endmenu

8
kernel/irq/Makefile Normal file
View file

@ -0,0 +1,8 @@
obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_PM_SLEEP) += pm.o

185
kernel/irq/autoprobe.c Normal file
View file

@ -0,0 +1,185 @@
/*
* linux/kernel/irq/autoprobe.c
*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the interrupt probing code and driver APIs.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/async.h>
#include "internals.h"
/*
* Autodetection depends on the fact that any interrupt that
* comes in onto an unassigned handler will get stuck with
* "IRQS_WAITING" cleared and the interrupt disabled.
*/
static DEFINE_MUTEX(probing_active);
/**
* probe_irq_on - begin an interrupt autodetect
*
* Commence probing for an interrupt. The interrupts are scanned
* and a mask of potential interrupt lines is returned.
*
*/
unsigned long probe_irq_on(void)
{
struct irq_desc *desc;
unsigned long mask = 0;
int i;
/*
* quiesce the kernel, or at least the asynchronous portion
*/
async_synchronize_full();
mutex_lock(&probing_active);
/*
* something may have generated an irq long ago and we want to
* flush such a longstanding irq before considering it as spurious.
*/
for_each_irq_desc_reverse(i, desc) {
raw_spin_lock_irq(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
/*
* Some chips need to know about probing in
* progress:
*/
if (desc->irq_data.chip->irq_set_type)
desc->irq_data.chip->irq_set_type(&desc->irq_data,
IRQ_TYPE_PROBE);
irq_startup(desc, false);
}
raw_spin_unlock_irq(&desc->lock);
}
/* Wait for longstanding interrupts to trigger. */
msleep(20);
/*
* enable any unassigned irqs
* (we must startup again here because if a longstanding irq
* happened in the previous stage, it may have masked itself)
*/
for_each_irq_desc_reverse(i, desc) {
raw_spin_lock_irq(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
if (irq_startup(desc, false))
desc->istate |= IRQS_PENDING;
}
raw_spin_unlock_irq(&desc->lock);
}
/*
* Wait for spurious interrupts to trigger
*/
msleep(100);
/*
* Now filter out any obviously spurious interrupts
*/
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
/* It triggered already - consider it spurious. */
if (!(desc->istate & IRQS_WAITING)) {
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown(desc);
} else
if (i < 32)
mask |= 1 << i;
}
raw_spin_unlock_irq(&desc->lock);
}
return mask;
}
EXPORT_SYMBOL(probe_irq_on);
/**
* probe_irq_mask - scan a bitmap of interrupt lines
* @val: mask of interrupts to consider
*
* Scan the interrupt lines and return a bitmap of active
* autodetect interrupts. The interrupt probe logic state
* is then returned to its previous value.
*
* Note: we need to scan all the irqs even though we will
* only return autodetect irq numbers - just so that we reset
* them all to a known state.
*/
unsigned int probe_irq_mask(unsigned long val)
{
unsigned int mask = 0;
struct irq_desc *desc;
int i;
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (i < 16 && !(desc->istate & IRQS_WAITING))
mask |= 1 << i;
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
return mask & val;
}
EXPORT_SYMBOL(probe_irq_mask);
/**
* probe_irq_off - end an interrupt autodetect
* @val: mask of potential interrupts (unused)
*
* Scans the unused interrupt lines and returns the line which
* appears to have triggered the interrupt. If no interrupt was
* found then zero is returned. If more than one interrupt is
* found then minus the first candidate is returned to indicate
* there is doubt.
*
* The interrupt probe logic state is returned to its previous
* value.
*
* BUGS: When used in a module (which arguably shouldn't happen)
* nothing prevents two IRQ probe callers from overlapping. The
* results of this are non-optimal.
*/
int probe_irq_off(unsigned long val)
{
int i, irq_found = 0, nr_of_irqs = 0;
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (!(desc->istate & IRQS_WAITING)) {
if (!nr_of_irqs)
irq_found = i;
nr_of_irqs++;
}
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
if (nr_of_irqs > 1)
irq_found = -irq_found;
return irq_found;
}
EXPORT_SYMBOL(probe_irq_off);
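/*
* Illustrative usage sketch of the probe API; the "trigger" step is a
* placeholder for device-specific code that makes the hardware raise
* its interrupt:
*
*	unsigned long mask = probe_irq_on();
*	... trigger the device interrupt ...
*	irq = probe_irq_off(mask);
*
* A positive result is the probed irq, zero means nothing fired, and a
* negative value means more than one candidate line triggered.
*/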

849
kernel/irq/chip.c Normal file
View file

@ -0,0 +1,849 @@
/*
* linux/kernel/irq/chip.c
*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
* This file contains the core interrupt handling code, for irq-chip
* based architectures.
*
* Detailed information is available in Documentation/DocBook/genericirq
*/
#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <trace/events/irq.h>
#include "internals.h"
/**
* irq_set_chip - set the irq chip for an irq
* @irq: irq number
* @chip: pointer to irq chip description structure
*/
int irq_set_chip(unsigned int irq, struct irq_chip *chip)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
return -EINVAL;
if (!chip)
chip = &no_irq_chip;
desc->irq_data.chip = chip;
irq_put_desc_unlock(desc, flags);
/*
* For !CONFIG_SPARSE_IRQ make the irq show up in
* allocated_irqs.
*/
irq_mark_irq(irq);
return 0;
}
EXPORT_SYMBOL(irq_set_chip);
/**
* irq_set_irq_type - set the irq trigger type for an irq
* @irq: irq number
* @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
*/
int irq_set_irq_type(unsigned int irq, unsigned int type)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
int ret = 0;
if (!desc)
return -EINVAL;
type &= IRQ_TYPE_SENSE_MASK;
ret = __irq_set_trigger(desc, irq, type);
irq_put_desc_busunlock(desc, flags);
return ret;
}
EXPORT_SYMBOL(irq_set_irq_type);
/**
* irq_set_handler_data - set irq handler data for an irq
* @irq: Interrupt number
* @data: Pointer to interrupt specific data
*
* Set the hardware irq controller data for an irq
*/
int irq_set_handler_data(unsigned int irq, void *data)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
return -EINVAL;
desc->irq_data.handler_data = data;
irq_put_desc_unlock(desc, flags);
return 0;
}
EXPORT_SYMBOL(irq_set_handler_data);
/**
* irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
* @irq_base: Interrupt number base
* @irq_offset: Interrupt number offset
* @entry: Pointer to MSI descriptor data
*
* Set the MSI descriptor entry for an irq at offset
*/
int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
struct msi_desc *entry)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
if (!desc)
return -EINVAL;
desc->irq_data.msi_desc = entry;
if (entry && !irq_offset)
entry->irq = irq_base;
irq_put_desc_unlock(desc, flags);
return 0;
}
/**
* irq_set_msi_desc - set MSI descriptor data for an irq
* @irq: Interrupt number
* @entry: Pointer to MSI descriptor data
*
* Set the MSI descriptor entry for an irq
*/
int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
{
return irq_set_msi_desc_off(irq, 0, entry);
}
/**
* irq_set_chip_data - set irq chip data for an irq
* @irq: Interrupt number
* @data: Pointer to chip specific data
*
* Set the hardware irq chip data for an irq
*/
int irq_set_chip_data(unsigned int irq, void *data)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
return -EINVAL;
desc->irq_data.chip_data = data;
irq_put_desc_unlock(desc, flags);
return 0;
}
EXPORT_SYMBOL(irq_set_chip_data);
struct irq_data *irq_get_irq_data(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
return desc ? &desc->irq_data : NULL;
}
EXPORT_SYMBOL_GPL(irq_get_irq_data);
static void irq_state_clr_disabled(struct irq_desc *desc)
{
irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
}
static void irq_state_set_disabled(struct irq_desc *desc)
{
irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
}
static void irq_state_clr_masked(struct irq_desc *desc)
{
irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
}
static void irq_state_set_masked(struct irq_desc *desc)
{
irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
}
int irq_startup(struct irq_desc *desc, bool resend)
{
int ret = 0;
irq_state_clr_disabled(desc);
desc->depth = 0;
if (desc->irq_data.chip->irq_startup) {
ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
irq_state_clr_masked(desc);
} else {
irq_enable(desc);
}
if (resend)
check_irq_resend(desc, desc->irq_data.irq);
return ret;
}
void irq_shutdown(struct irq_desc *desc)
{
irq_state_set_disabled(desc);
desc->depth = 1;
if (desc->irq_data.chip->irq_shutdown)
desc->irq_data.chip->irq_shutdown(&desc->irq_data);
else if (desc->irq_data.chip->irq_disable)
desc->irq_data.chip->irq_disable(&desc->irq_data);
else
desc->irq_data.chip->irq_mask(&desc->irq_data);
irq_state_set_masked(desc);
}
void irq_enable(struct irq_desc *desc)
{
irq_state_clr_disabled(desc);
if (desc->irq_data.chip->irq_enable)
desc->irq_data.chip->irq_enable(&desc->irq_data);
else
desc->irq_data.chip->irq_unmask(&desc->irq_data);
irq_state_clr_masked(desc);
}
/**
* irq_disable - Mark interrupt disabled
* @desc: irq descriptor which should be disabled
*
* If the chip does not implement the irq_disable callback, we
* use a lazy disable approach. That means we mark the interrupt
* disabled, but leave the hardware unmasked. That's an
* optimization because we avoid the hardware access for the
* common case where no interrupt happens after we marked it
* disabled. If an interrupt happens, then the interrupt flow
* handler masks the line at the hardware level and marks it
* pending.
*/
void irq_disable(struct irq_desc *desc)
{
irq_state_set_disabled(desc);
if (desc->irq_data.chip->irq_disable) {
desc->irq_data.chip->irq_disable(&desc->irq_data);
irq_state_set_masked(desc);
}
}
void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
{
if (desc->irq_data.chip->irq_enable)
desc->irq_data.chip->irq_enable(&desc->irq_data);
else
desc->irq_data.chip->irq_unmask(&desc->irq_data);
cpumask_set_cpu(cpu, desc->percpu_enabled);
}
void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
{
if (desc->irq_data.chip->irq_disable)
desc->irq_data.chip->irq_disable(&desc->irq_data);
else
desc->irq_data.chip->irq_mask(&desc->irq_data);
cpumask_clear_cpu(cpu, desc->percpu_enabled);
}
static inline void mask_ack_irq(struct irq_desc *desc)
{
if (desc->irq_data.chip->irq_mask_ack)
desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
else {
desc->irq_data.chip->irq_mask(&desc->irq_data);
if (desc->irq_data.chip->irq_ack)
desc->irq_data.chip->irq_ack(&desc->irq_data);
}
irq_state_set_masked(desc);
}
void mask_irq(struct irq_desc *desc)
{
if (desc->irq_data.chip->irq_mask) {
desc->irq_data.chip->irq_mask(&desc->irq_data);
irq_state_set_masked(desc);
}
}
void unmask_irq(struct irq_desc *desc)
{
if (desc->irq_data.chip->irq_unmask) {
desc->irq_data.chip->irq_unmask(&desc->irq_data);
irq_state_clr_masked(desc);
}
}
void unmask_threaded_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
if (chip->flags & IRQCHIP_EOI_THREADED)
chip->irq_eoi(&desc->irq_data);
if (chip->irq_unmask) {
chip->irq_unmask(&desc->irq_data);
irq_state_clr_masked(desc);
}
}
/*
* handle_nested_irq - Handle a nested irq from an irq thread
* @irq: the interrupt number
*
* Handle interrupts which are nested into a threaded interrupt
* handler. The handler function is called inside the calling
* thread's context.
*/
void handle_nested_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
irqreturn_t action_ret;
might_sleep();
raw_spin_lock_irq(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
action = desc->action;
if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
goto out_unlock;
}
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock_irq(&desc->lock);
action_ret = action->thread_fn(action->irq, action->dev_id);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
raw_spin_lock_irq(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
out_unlock:
raw_spin_unlock_irq(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_nested_irq);
static bool irq_check_poll(struct irq_desc *desc)
{
if (!(desc->istate & IRQS_POLL_INPROGRESS))
return false;
return irq_wait_for_poll(desc);
}
static bool irq_may_run(struct irq_desc *desc)
{
unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
/*
* If the interrupt is not in progress and is not an armed
* wakeup interrupt, proceed.
*/
if (!irqd_has_set(&desc->irq_data, mask))
return true;
/*
* If the interrupt is an armed wakeup source, mark it pending
* and suspended, disable it and notify the pm core about the
* event.
*/
if (irq_pm_check_wakeup(desc))
return false;
/*
* Handle a potential concurrent poll on a different core.
*/
return irq_check_poll(desc);
}
/**
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Simple interrupts are either sent from a demultiplexing interrupt
* handler or come from hardware, where no interrupt hardware control
* is necessary.
*
* Note: The caller is expected to handle the ack, clear, mask and
* unmask issues if necessary.
*/
void
handle_simple_irq(unsigned int irq, struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
if (!irq_may_run(desc))
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
goto out_unlock;
}
handle_irq_event(desc);
out_unlock:
raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_simple_irq);
/*
* Called unconditionally from handle_level_irq() and only for oneshot
* interrupts from handle_fasteoi_irq()
*/
static void cond_unmask_irq(struct irq_desc *desc)
{
/*
* We need to unmask in the following cases:
* - Standard level irq (IRQF_ONESHOT is not set)
* - Oneshot irq which did not wake the thread (caused by a
* spurious interrupt or a primary handler handling it
* completely).
*/
if (!irqd_irq_disabled(&desc->irq_data) &&
irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot)
unmask_irq(desc);
}
/**
* handle_level_irq - Level type irq handler
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Level type interrupts are active as long as the hardware line has
* the active level. This may require masking the interrupt and unmasking
* it after the associated handler has acknowledged the device, so that the
* interrupt line is back to inactive.
*/
void
handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
mask_ack_irq(desc);
if (!irq_may_run(desc))
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
/*
* If it's disabled or no action is available,
* keep it masked and get out of here
*/
if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
goto out_unlock;
}
handle_irq_event(desc);
cond_unmask_irq(desc);
out_unlock:
raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_level_irq);
#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
static inline void preflow_handler(struct irq_desc *desc)
{
if (desc->preflow_handler)
desc->preflow_handler(&desc->irq_data);
}
#else
static inline void preflow_handler(struct irq_desc *desc) { }
#endif
static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
{
if (!(desc->istate & IRQS_ONESHOT)) {
chip->irq_eoi(&desc->irq_data);
return;
}
/*
* We need to unmask in the following cases:
* - Oneshot irq which did not wake the thread (caused by a
* spurious interrupt or a primary handler handling it
* completely).
*/
if (!irqd_irq_disabled(&desc->irq_data) &&
irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) {
chip->irq_eoi(&desc->irq_data);
unmask_irq(desc);
} else if (!(chip->flags & IRQCHIP_EOI_THREADED)) {
chip->irq_eoi(&desc->irq_data);
}
}
/**
* handle_fasteoi_irq - irq handler for transparent controllers
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Only a single callback will be issued to the chip: an ->eoi()
* call when the interrupt has been serviced. This enables support
* for modern forms of interrupt handlers, which handle the flow
* details in hardware, transparently.
*/
void
handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
raw_spin_lock(&desc->lock);
if (!irq_may_run(desc))
goto out;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
/*
* If it's disabled or no action is available
* then mask it and get out of here:
*/
if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
mask_irq(desc);
goto out;
}
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
preflow_handler(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
raw_spin_unlock(&desc->lock);
return;
out:
if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
chip->irq_eoi(&desc->irq_data);
raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
/**
* handle_edge_irq - edge type IRQ handler
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Interrupt occurs on the falling and/or rising edge of a hardware
* signal. The occurrence is latched into the irq controller hardware
* and must be acked in order to be re-enabled. After the ack another
* interrupt can happen on the same source even before the first one
* is handled by the associated event handler. If this happens it
* might be necessary to disable (mask) the interrupt depending on the
* controller hardware. This requires re-enabling the interrupt inside
* the loop which handles the interrupts which have arrived while
* the handler was running. If all pending interrupts are handled, the
* loop is left.
*/
void
handle_edge_irq(unsigned int irq, struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
if (!irq_may_run(desc)) {
desc->istate |= IRQS_PENDING;
mask_ack_irq(desc);
goto out_unlock;
}
/*
* If it's disabled or no action is available then mask it and get
* out of here.
*/
if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
desc->istate |= IRQS_PENDING;
mask_ack_irq(desc);
goto out_unlock;
}
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
do {
if (unlikely(!desc->action)) {
mask_irq(desc);
goto out_unlock;
}
/*
* When another irq arrived while we were handling
* one, we could have masked the irq.
* Re-enable it, if it was not disabled in the meantime.
*/
if (unlikely(desc->istate & IRQS_PENDING)) {
if (!irqd_irq_disabled(&desc->irq_data) &&
irqd_irq_masked(&desc->irq_data))
unmask_irq(desc);
}
handle_irq_event(desc);
} while ((desc->istate & IRQS_PENDING) &&
!irqd_irq_disabled(&desc->irq_data));
out_unlock:
raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL(handle_edge_irq);
#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
/**
* handle_edge_eoi_irq - edge eoi type IRQ handler
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Similar to handle_edge_irq above, but using eoi and without the
* mask/unmask logic.
*/
void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
raw_spin_lock(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
if (!irq_may_run(desc)) {
desc->istate |= IRQS_PENDING;
goto out_eoi;
}
/*
* If it's disabled or no action is available then mask it and get
* out of here.
*/
if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
desc->istate |= IRQS_PENDING;
goto out_eoi;
}
kstat_incr_irqs_this_cpu(irq, desc);
do {
if (unlikely(!desc->action))
goto out_eoi;
handle_irq_event(desc);
} while ((desc->istate & IRQS_PENDING) &&
!irqd_irq_disabled(&desc->irq_data));
out_eoi:
chip->irq_eoi(&desc->irq_data);
raw_spin_unlock(&desc->lock);
}
#endif
/**
* handle_percpu_irq - Per CPU local irq handler
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Per CPU interrupts on SMP machines without locking requirements
*/
void
handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
kstat_incr_irqs_this_cpu(irq, desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
handle_irq_event_percpu(desc, desc->action);
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
}
/**
* handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Per CPU interrupts on SMP machines without locking requirements. Same as
* handle_percpu_irq() above but with the following extras:
*
* action->percpu_dev_id is a pointer to percpu variables which
* contain the real device id for the cpu on which this handler is
* called
*/
void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
struct irqaction *action = desc->action;
void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
irqreturn_t res;
kstat_incr_irqs_this_cpu(irq, desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
trace_irq_handler_entry(irq, action);
res = action->handler(irq, dev_id);
trace_irq_handler_exit(irq, action, res);
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
}
void
__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
const char *name)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
if (!desc)
return;
if (!handle) {
handle = handle_bad_irq;
} else {
if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
goto out;
}
/* Uninstall? */
if (handle == handle_bad_irq) {
if (desc->irq_data.chip != &no_irq_chip)
mask_ack_irq(desc);
irq_state_set_disabled(desc);
desc->depth = 1;
}
desc->handle_irq = handle;
desc->name = name;
if (handle != handle_bad_irq && is_chained) {
irq_settings_set_noprobe(desc);
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
irq_startup(desc, true);
}
out:
irq_put_desc_busunlock(desc, flags);
}
EXPORT_SYMBOL_GPL(__irq_set_handler);
void
irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
irq_set_chip(irq, chip);
__irq_set_handler(irq, handle, 0, name);
}
EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
return;
irq_settings_clr_and_set(desc, clr, set);
irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
if (irq_settings_has_no_balance_set(desc))
irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
if (irq_settings_is_per_cpu(desc))
irqd_set(&desc->irq_data, IRQD_PER_CPU);
if (irq_settings_can_move_pcntxt(desc))
irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
if (irq_settings_is_level(desc))
irqd_set(&desc->irq_data, IRQD_LEVEL);
irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
irq_put_desc_unlock(desc, flags);
}
EXPORT_SYMBOL_GPL(irq_modify_status);
/**
* irq_cpu_online - Invoke all irq_cpu_online functions.
*
* Iterate through all irqs and invoke the chip.irq_cpu_online()
* for each.
*/
void irq_cpu_online(void)
{
struct irq_desc *desc;
struct irq_chip *chip;
unsigned long flags;
unsigned int irq;
for_each_active_irq(irq) {
desc = irq_to_desc(irq);
if (!desc)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
chip = irq_data_get_irq_chip(&desc->irq_data);
if (chip && chip->irq_cpu_online &&
(!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
!irqd_irq_disabled(&desc->irq_data)))
chip->irq_cpu_online(&desc->irq_data);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
/**
* irq_cpu_offline - Invoke all irq_cpu_offline functions.
*
* Iterate through all irqs and invoke the chip.irq_cpu_offline()
* for each.
*/
void irq_cpu_offline(void)
{
struct irq_desc *desc;
struct irq_chip *chip;
unsigned long flags;
unsigned int irq;
for_each_active_irq(irq) {
desc = irq_to_desc(irq);
if (!desc)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
chip = irq_data_get_irq_chip(&desc->irq_data);
if (chip && chip->irq_cpu_offline &&
(!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
!irqd_irq_disabled(&desc->irq_data)))
chip->irq_cpu_offline(&desc->irq_data);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}

45
kernel/irq/debug.h Normal file
View file

@ -0,0 +1,45 @@
/*
* Debugging printout:
*/
#include <linux/kallsyms.h>
#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f)
/* FIXME */
#define ___PD(f) do { } while (0)
static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
{
printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
printk("->handle_irq(): %p, ", desc->handle_irq);
print_symbol("%s\n", (unsigned long)desc->handle_irq);
printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
printk("->action(): %p\n", desc->action);
if (desc->action) {
printk("->action->handler(): %p, ", desc->action->handler);
print_symbol("%s\n", (unsigned long)desc->action->handler);
}
___P(IRQ_LEVEL);
___P(IRQ_PER_CPU);
___P(IRQ_NOPROBE);
___P(IRQ_NOREQUEST);
___P(IRQ_NOTHREAD);
___P(IRQ_NOAUTOEN);
___PS(IRQS_AUTODETECT);
___PS(IRQS_REPLAY);
___PS(IRQS_WAITING);
___PS(IRQS_PENDING);
___PD(IRQS_INPROGRESS);
___PD(IRQS_DISABLED);
___PD(IRQS_MASKED);
}
#undef ___P
#undef ___PS
#undef ___PD

139
kernel/irq/devres.c Normal file
View file

@ -0,0 +1,139 @@
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/device.h>
#include <linux/gfp.h>
/*
* Device resource management aware IRQ request/free implementation.
*/
struct irq_devres {
unsigned int irq;
void *dev_id;
};
static void devm_irq_release(struct device *dev, void *res)
{
struct irq_devres *this = res;
free_irq(this->irq, this->dev_id);
}
static int devm_irq_match(struct device *dev, void *res, void *data)
{
struct irq_devres *this = res, *match = data;
return this->irq == match->irq && this->dev_id == match->dev_id;
}
/**
* devm_request_threaded_irq - allocate an interrupt line for a managed device
* @dev: device to request interrupt for
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs
* @thread_fn: function to be called in a threaded interrupt context. NULL
* for devices which handle everything in @handler
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device
* @dev_id: A cookie passed back to the handler function
*
* Except for the extra @dev argument, this function takes the
* same arguments and performs the same function as
* request_threaded_irq(). IRQs requested with this function will be
* automatically freed on driver detach.
*
* If an IRQ allocated with this function needs to be freed
* separately, devm_free_irq() must be used.
*/
int devm_request_threaded_irq(struct device *dev, unsigned int irq,
irq_handler_t handler, irq_handler_t thread_fn,
unsigned long irqflags, const char *devname,
void *dev_id)
{
struct irq_devres *dr;
int rc;
dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres),
GFP_KERNEL);
if (!dr)
return -ENOMEM;
rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
dev_id);
if (rc) {
devres_free(dr);
return rc;
}
dr->irq = irq;
dr->dev_id = dev_id;
devres_add(dev, dr);
return 0;
}
EXPORT_SYMBOL(devm_request_threaded_irq);
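/*
* Illustrative usage sketch; my_hardirq, my_thread_fn, "my-dev" and priv
* are placeholder names for a hypothetical platform driver:
*
*	ret = devm_request_threaded_irq(&pdev->dev, irq, my_hardirq,
*					my_thread_fn, IRQF_ONESHOT,
*					"my-dev", priv);
*	if (ret)
*		return ret;
*/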
/**
* devm_request_any_context_irq - allocate an interrupt line for a managed device
* @dev: device to request interrupt for
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device
* @dev_id: A cookie passed back to the handler function
*
* Except for the extra @dev argument, this function takes the
* same arguments and performs the same function as
* request_any_context_irq(). IRQs requested with this function will be
* automatically freed on driver detach.
*
* If an IRQ allocated with this function needs to be freed
* separately, devm_free_irq() must be used.
*/
int devm_request_any_context_irq(struct device *dev, unsigned int irq,
irq_handler_t handler, unsigned long irqflags,
const char *devname, void *dev_id)
{
struct irq_devres *dr;
int rc;
dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres),
GFP_KERNEL);
if (!dr)
return -ENOMEM;
rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id);
if (rc) {
devres_free(dr);
return rc;
}
dr->irq = irq;
dr->dev_id = dev_id;
devres_add(dev, dr);
return 0;
}
EXPORT_SYMBOL(devm_request_any_context_irq);
/**
* devm_free_irq - free an interrupt
* @dev: device to free interrupt for
* @irq: Interrupt line to free
* @dev_id: Device identity to free
*
* Except for the extra @dev argument, this function takes the
* same arguments and performs the same function as free_irq().
* This function, instead of free_irq(), should be used to manually
* free IRQs allocated with devm_request_irq().
*/
void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
{
struct irq_devres match_data = { irq, dev_id };
WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
&match_data));
free_irq(irq, dev_id);
}
EXPORT_SYMBOL(devm_free_irq);

61
kernel/irq/dummychip.c Normal file
View file

@ -0,0 +1,61 @@
/*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
* This file contains the dummy interrupt chip implementation
*/
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/export.h>
#include "internals.h"
/*
* What should we do if we get a hw irq event on an illegal vector?
* Each architecture has to answer this itself.
*/
static void ack_bad(struct irq_data *data)
{
struct irq_desc *desc = irq_data_to_desc(data);
print_irq_desc(data->irq, desc);
ack_bad_irq(data->irq);
}
/*
* NOP functions
*/
static void noop(struct irq_data *data) { }
static unsigned int noop_ret(struct irq_data *data)
{
return 0;
}
/*
* Generic no controller implementation
*/
struct irq_chip no_irq_chip = {
.name = "none",
.irq_startup = noop_ret,
.irq_shutdown = noop,
.irq_enable = noop,
.irq_disable = noop,
.irq_ack = ack_bad,
};
/*
* Generic dummy implementation which can be used for
* real dumb interrupt sources
*/
struct irq_chip dummy_irq_chip = {
.name = "dummy",
.irq_startup = noop_ret,
.irq_shutdown = noop,
.irq_enable = noop,
.irq_disable = noop,
.irq_ack = noop,
.irq_mask = noop,
.irq_unmask = noop,
};
EXPORT_SYMBOL_GPL(dummy_irq_chip);

592
kernel/irq/generic-chip.c Normal file
View file

@ -0,0 +1,592 @@
/*
* Library implementing the most common irq chip callback functions
*
* Copyright (C) 2011, Thomas Gleixner
*/
#include <linux/io.h>
#include <linux/irq.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/irqdomain.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/syscore_ops.h>
#include "internals.h"
static LIST_HEAD(gc_list);
static DEFINE_RAW_SPINLOCK(gc_lock);
/**
* irq_gc_noop - NOOP function
* @d: irq_data
*/
void irq_gc_noop(struct irq_data *d)
{
}
/**
* irq_gc_mask_disable_reg - Mask chip via disable register
* @d: irq_data
*
* Chip has separate enable/disable registers instead of a single mask
* register.
*/
void irq_gc_mask_disable_reg(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
irq_gc_lock(gc);
irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
*ct->mask_cache &= ~mask;
irq_gc_unlock(gc);
}
/**
* irq_gc_mask_set_bit - Mask chip via setting bit in mask register
* @d: irq_data
*
* Chip has a single mask register. Values of this register are cached
* and protected by gc->lock
*/
void irq_gc_mask_set_bit(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
irq_gc_lock(gc);
*ct->mask_cache |= mask;
irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
/**
* irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
* @d: irq_data
*
* Chip has a single mask register. Values of this register are cached
* and protected by gc->lock
*/
void irq_gc_mask_clr_bit(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
irq_gc_lock(gc);
*ct->mask_cache &= ~mask;
irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
/**
* irq_gc_unmask_enable_reg - Unmask chip via enable register
* @d: irq_data
*
* Chip has separate enable/disable registers instead of a single mask
* register.
*/
void irq_gc_unmask_enable_reg(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
irq_gc_lock(gc);
irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
*ct->mask_cache |= mask;
irq_gc_unlock(gc);
}
/**
* irq_gc_ack_set_bit - Ack pending interrupt via setting bit
* @d: irq_data
*/
void irq_gc_ack_set_bit(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
irq_gc_lock(gc);
irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
/**
* irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
* @d: irq_data
*/
void irq_gc_ack_clr_bit(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = ~d->mask;
irq_gc_lock(gc);
irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
irq_gc_unlock(gc);
}
/**
* irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
* @d: irq_data
*/
void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
irq_gc_lock(gc);
irq_reg_writel(mask, gc->reg_base + ct->regs.mask);
irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
irq_gc_unlock(gc);
}
/**
* irq_gc_eoi - EOI interrupt
* @d: irq_data
*/
void irq_gc_eoi(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
irq_gc_lock(gc);
irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);
irq_gc_unlock(gc);
}
/**
* irq_gc_set_wake - Set/clr wake bit for an interrupt
* @d: irq_data
* @on: Indicates whether the wake bit should be set or cleared
*
* For chips where the wake from suspend functionality is not
* configured in a separate register and the wakeup active state is
* just stored in a bitmask.
*/
int irq_gc_set_wake(struct irq_data *d, unsigned int on)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
u32 mask = d->mask;
if (!(mask & gc->wake_enabled))
return -EINVAL;
irq_gc_lock(gc);
if (on)
gc->wake_active |= mask;
else
gc->wake_active &= ~mask;
irq_gc_unlock(gc);
return 0;
}
static void
irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
int num_ct, unsigned int irq_base,
void __iomem *reg_base, irq_flow_handler_t handler)
{
raw_spin_lock_init(&gc->lock);
gc->num_ct = num_ct;
gc->irq_base = irq_base;
gc->reg_base = reg_base;
gc->chip_types->chip.name = name;
gc->chip_types->handler = handler;
}
/**
* irq_alloc_generic_chip - Allocate a generic chip and initialize it
* @name: Name of the irq chip
* @num_ct: Number of irq_chip_type instances associated with this
* @irq_base: Interrupt base nr for this chip
* @reg_base: Register base address (virtual)
* @handler: Default flow handler associated with this chip
*
* Returns an initialized irq_chip_generic structure. The chip defaults
* to the primary (index 0) irq_chip_type and @handler
*/
struct irq_chip_generic *
irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
void __iomem *reg_base, irq_flow_handler_t handler)
{
struct irq_chip_generic *gc;
unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
gc = kzalloc(sz, GFP_KERNEL);
if (gc) {
irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base,
handler);
}
return gc;
}
EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
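/*
* Illustrative setup sketch; "my-intc", MY_MASK_REG, irq_base and reg_base
* are placeholders for a hypothetical interrupt controller:
*
*	gc = irq_alloc_generic_chip("my-intc", 1, irq_base, reg_base,
*				    handle_level_irq);
*	if (!gc)
*		return -ENOMEM;
*	gc->chip_types[0].regs.mask = MY_MASK_REG;
*	gc->chip_types[0].chip.irq_mask = irq_gc_mask_set_bit;
*	gc->chip_types[0].chip.irq_unmask = irq_gc_mask_clr_bit;
*	irq_setup_generic_chip(gc, IRQ_MSK(32), IRQ_GC_INIT_MASK_CACHE,
*			       IRQ_NOREQUEST, 0);
*/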
static void
irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
{
struct irq_chip_type *ct = gc->chip_types;
u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask;
int i;
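/*
* By default every irq_chip_type shares gc->mask_cache; with
* IRQ_GC_MASK_CACHE_PER_TYPE each type gets its own private cache.
*/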
for (i = 0; i < gc->num_ct; i++) {
if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) {
mskptr = &ct[i].mask_cache_priv;
mskreg = ct[i].regs.mask;
}
ct[i].mask_cache = mskptr;
if (flags & IRQ_GC_INIT_MASK_CACHE)
*mskptr = irq_reg_readl(gc->reg_base + mskreg);
}
}
/**
* irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
* @d: irq domain for which to allocate chips
* @irqs_per_chip: Number of interrupts each chip handles
* @num_ct: Number of irq_chip_type instances associated with this
* @name: Name of the irq chip
* @handler: Default flow handler associated with these chips
* @clr: IRQ_* bits to clear in the mapping function
* @set: IRQ_* bits to set in the mapping function
* @gcflags: Generic chip specific setup flags
*/
int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
int num_ct, const char *name,
irq_flow_handler_t handler,
unsigned int clr, unsigned int set,
enum irq_gc_flags gcflags)
{
struct irq_domain_chip_generic *dgc;
struct irq_chip_generic *gc;
int numchips, sz, i;
unsigned long flags;
void *tmp;
if (d->gc)
return -EBUSY;
numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip);
if (!numchips)
return -EINVAL;
/* Allocate a pointer, generic chip and chiptypes for each chip */
sz = sizeof(*dgc) + numchips * sizeof(gc);
sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
tmp = dgc = kzalloc(sz, GFP_KERNEL);
if (!dgc)
return -ENOMEM;
dgc->irqs_per_chip = irqs_per_chip;
dgc->num_chips = numchips;
dgc->irq_flags_to_set = set;
dgc->irq_flags_to_clear = clr;
dgc->gc_flags = gcflags;
d->gc = dgc;
/* Calc pointer to the first generic chip */
tmp += sizeof(*dgc) + numchips * sizeof(gc);
for (i = 0; i < numchips; i++) {
/* Store the pointer to the generic chip */
dgc->gc[i] = gc = tmp;
irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
NULL, handler);
gc->domain = d;
raw_spin_lock_irqsave(&gc_lock, flags);
list_add_tail(&gc->list, &gc_list);
raw_spin_unlock_irqrestore(&gc_lock, flags);
/* Calc pointer to the next generic chip */
tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
}
d->name = name;
return 0;
}
EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
/**
* irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
* @d: irq domain pointer
* @hw_irq: Hardware interrupt number
*/
struct irq_chip_generic *
irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
{
struct irq_domain_chip_generic *dgc = d->gc;
int idx;
if (!dgc)
return NULL;
idx = hw_irq / dgc->irqs_per_chip;
if (idx >= dgc->num_chips)
return NULL;
return dgc->gc[idx];
}
EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
/*
* Separate lockdep class for interrupt chip which can nest irq_desc
* lock.
*/
static struct lock_class_key irq_nested_lock_class;
/*
* irq_map_generic_chip - Map a generic chip for an irq domain
*/
int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
irq_hw_number_t hw_irq)
{
struct irq_data *data = irq_get_irq_data(virq);
struct irq_domain_chip_generic *dgc = d->gc;
struct irq_chip_generic *gc;
struct irq_chip_type *ct;
struct irq_chip *chip;
unsigned long flags;
int idx;
if (!d->gc)
return -ENODEV;
idx = hw_irq / dgc->irqs_per_chip;
if (idx >= dgc->num_chips)
return -EINVAL;
gc = dgc->gc[idx];
idx = hw_irq % dgc->irqs_per_chip;
if (test_bit(idx, &gc->unused))
return -ENOTSUPP;
if (test_bit(idx, &gc->installed))
return -EBUSY;
ct = gc->chip_types;
chip = &ct->chip;
/* We only init the cache for the first mapping of a generic chip */
if (!gc->installed) {
raw_spin_lock_irqsave(&gc->lock, flags);
irq_gc_init_mask_cache(gc, dgc->gc_flags);
raw_spin_unlock_irqrestore(&gc->lock, flags);
}
/* Mark the interrupt as installed */
set_bit(idx, &gc->installed);
if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
irq_set_lockdep_class(virq, &irq_nested_lock_class);
if (chip->irq_calc_mask)
chip->irq_calc_mask(data);
else
data->mask = 1 << idx;
irq_set_chip_and_handler(virq, chip, ct->handler);
irq_set_chip_data(virq, gc);
irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
return 0;
}
EXPORT_SYMBOL_GPL(irq_map_generic_chip);
struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
/**
* irq_setup_generic_chip - Setup a range of interrupts with a generic chip
* @gc: Generic irq chip holding all data
* @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
* @flags: Flags for initialization
* @clr: IRQ_* bits to clear
* @set: IRQ_* bits to set
*
* Set up max. 32 interrupts starting from gc->irq_base. Note, this
* initializes all interrupts to the primary irq_chip_type and its
* associated handler.
*/
void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
enum irq_gc_flags flags, unsigned int clr,
unsigned int set)
{
struct irq_chip_type *ct = gc->chip_types;
struct irq_chip *chip = &ct->chip;
unsigned int i;
raw_spin_lock(&gc_lock);
list_add_tail(&gc->list, &gc_list);
raw_spin_unlock(&gc_lock);
irq_gc_init_mask_cache(gc, flags);
for (i = gc->irq_base; msk; msk >>= 1, i++) {
if (!(msk & 0x01))
continue;
if (flags & IRQ_GC_INIT_NESTED_LOCK)
irq_set_lockdep_class(i, &irq_nested_lock_class);
if (!(flags & IRQ_GC_NO_MASK)) {
struct irq_data *d = irq_get_irq_data(i);
if (chip->irq_calc_mask)
chip->irq_calc_mask(d);
else
d->mask = 1 << (i - gc->irq_base);
}
irq_set_chip_and_handler(i, chip, ct->handler);
irq_set_chip_data(i, gc);
irq_modify_status(i, clr, set);
}
gc->irq_cnt = i - gc->irq_base;
}
EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
/**
* irq_setup_alt_chip - Switch to alternative chip
* @d: irq_data for this interrupt
* @type: Flow type to be initialized
*
* Only to be called from chip->irq_set_type() callbacks.
*/
int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = gc->chip_types;
unsigned int i;
for (i = 0; i < gc->num_ct; i++, ct++) {
if (ct->type & type) {
d->chip = &ct->chip;
irq_data_to_desc(d)->handle_irq = ct->handler;
return 0;
}
}
return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
/**
* irq_remove_generic_chip - Remove a chip
* @gc: Generic irq chip holding all data
* @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
* @clr: IRQ_* bits to clear
* @set: IRQ_* bits to set
*
* Remove up to 32 interrupts starting from gc->irq_base.
*/
void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
unsigned int clr, unsigned int set)
{
unsigned int i = gc->irq_base;
raw_spin_lock(&gc_lock);
list_del(&gc->list);
raw_spin_unlock(&gc_lock);
for (; msk; msk >>= 1, i++) {
if (!(msk & 0x01))
continue;
/* Remove handler first. That will mask the irq line */
irq_set_handler(i, NULL);
irq_set_chip(i, &no_irq_chip);
irq_set_chip_data(i, NULL);
irq_modify_status(i, clr, set);
}
}
EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
{
unsigned int virq;
if (!gc->domain)
return irq_get_irq_data(gc->irq_base);
/*
* We don't know which of the irqs has actually been
* installed. Use the first one.
*/
if (!gc->installed)
return NULL;
virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed));
return virq ? irq_get_irq_data(virq) : NULL;
}
#ifdef CONFIG_PM
static int irq_gc_suspend(void)
{
struct irq_chip_generic *gc;
list_for_each_entry(gc, &gc_list, list) {
struct irq_chip_type *ct = gc->chip_types;
if (ct->chip.irq_suspend) {
struct irq_data *data = irq_gc_get_irq_data(gc);
if (data)
ct->chip.irq_suspend(data);
}
}
return 0;
}
static void irq_gc_resume(void)
{
struct irq_chip_generic *gc;
list_for_each_entry(gc, &gc_list, list) {
struct irq_chip_type *ct = gc->chip_types;
if (ct->chip.irq_resume) {
struct irq_data *data = irq_gc_get_irq_data(gc);
if (data)
ct->chip.irq_resume(data);
}
}
}
#else
#define irq_gc_suspend NULL
#define irq_gc_resume NULL
#endif
static void irq_gc_shutdown(void)
{
struct irq_chip_generic *gc;
list_for_each_entry(gc, &gc_list, list) {
struct irq_chip_type *ct = gc->chip_types;
if (ct->chip.irq_pm_shutdown) {
struct irq_data *data = irq_gc_get_irq_data(gc);
if (data)
ct->chip.irq_pm_shutdown(data);
}
}
}
static struct syscore_ops irq_gc_syscore_ops = {
.suspend = irq_gc_suspend,
.resume = irq_gc_resume,
.shutdown = irq_gc_shutdown,
};
static int __init irq_gc_init_ops(void)
{
register_syscore_ops(&irq_gc_syscore_ops);
return 0;
}
device_initcall(irq_gc_init_ops);

199
kernel/irq/handle.c Normal file
View file

@ -0,0 +1,199 @@
/*
* linux/kernel/irq/handle.c
*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
* This file contains the core interrupt handling code.
*
* Detailed information is available in Documentation/DocBook/genericirq
*
*/
#include <linux/irq.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <trace/events/irq.h>
#include "internals.h"
/**
* handle_bad_irq - handle spurious and unhandled irqs
* @irq: the interrupt number
* @desc: description of the interrupt
*
* Handles spurious and unhandled IRQs. It also prints a debug message.
*/
void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
{
print_irq_desc(irq, desc);
kstat_incr_irqs_this_cpu(irq, desc);
ack_bad_irq(irq);
}
/*
* Special, empty irq handler:
*/
irqreturn_t no_action(int cpl, void *dev_id)
{
return IRQ_NONE;
}
EXPORT_SYMBOL_GPL(no_action);
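One common use of no_action, shown as a sketch: claiming a line (for example a cascade input) that needs no real handler. The irq number, name and IRQF_NO_THREAD choice are assumptions.

static int example_claim_cascade(unsigned int cascade_irq)
{
	/* no_action always returns IRQ_NONE, so keep the request unthreaded */
	return request_irq(cascade_irq, no_action, IRQF_NO_THREAD,
			   "example-cascade", NULL);
}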
static void warn_no_thread(unsigned int irq, struct irqaction *action)
{
if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
return;
printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD "
"but no thread function available.", irq, action->name);
}
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
{
/*
* In case the thread crashed and was killed we just pretend that
* we handled the interrupt. The hardirq handler has disabled the
* device interrupt, so no irq storm is lurking.
*/
if (action->thread->flags & PF_EXITING)
return;
/*
* Wake up the handler thread for this action. If the
* RUNTHREAD bit is already set, nothing to do.
*/
if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
return;
/*
* It's safe to OR the mask lockless here. We have only two
* places which write to threads_oneshot: This code and the
* irq thread.
*
* This code is the hard irq context and can never run on two
* cpus in parallel. If it ever does we have more serious
* problems than this bitmask.
*
* The irq threads of this irq which clear their "running" bit
* in threads_oneshot are serialized via desc->lock against
* each other and they are serialized against this code by
* IRQS_INPROGRESS.
*
* Hard irq handler:
*
* spin_lock(desc->lock);
* desc->state |= IRQS_INPROGRESS;
* spin_unlock(desc->lock);
* set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
* desc->threads_oneshot |= mask;
* spin_lock(desc->lock);
* desc->state &= ~IRQS_INPROGRESS;
* spin_unlock(desc->lock);
*
* irq thread:
*
* again:
* spin_lock(desc->lock);
* if (desc->state & IRQS_INPROGRESS) {
* spin_unlock(desc->lock);
* while(desc->state & IRQS_INPROGRESS)
* cpu_relax();
* goto again;
* }
* if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
* desc->threads_oneshot &= ~mask;
* spin_unlock(desc->lock);
*
* So either the thread waits for us to clear IRQS_INPROGRESS
* or we are waiting in the flow handler for desc->lock to be
* released before we reach this point. The thread also checks
* IRQTF_RUNTHREAD under desc->lock. If set it leaves
* threads_oneshot untouched and runs the thread another time.
*/
desc->threads_oneshot |= action->thread_mask;
/*
* We increment the threads_active counter in case we wake up
* the irq thread. The irq thread decrements the counter when
* it returns from the handler or in the exit path and wakes
* up waiters which are stuck in synchronize_irq() when the
* active count becomes zero. synchronize_irq() is serialized
* against this code (hard irq handler) via IRQS_INPROGRESS
* like the finalize_oneshot() code. See comment above.
*/
atomic_inc(&desc->threads_active);
wake_up_process(action->thread);
}
irqreturn_t
handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
{
irqreturn_t retval = IRQ_NONE;
unsigned int flags = 0, irq = desc->irq_data.irq;
do {
irqreturn_t res;
trace_irq_handler_entry(irq, action);
exynos_ss_irq(irq, (void *)action->handler, (int)irqs_disabled(), ESS_FLAG_IN);
res = action->handler(irq, action->dev_id);
exynos_ss_irq(irq, (void *)action->handler, (int)irqs_disabled(), ESS_FLAG_OUT);
trace_irq_handler_exit(irq, action, res);
if (WARN_ONCE(!irqs_disabled(), "irq %u handler %pF enabled interrupts\n",
irq, action->handler))
local_irq_disable();
switch (res) {
case IRQ_WAKE_THREAD:
/*
* Catch drivers which return WAKE_THREAD but
* did not set up a thread function
*/
if (unlikely(!action->thread_fn)) {
warn_no_thread(irq, action);
break;
}
__irq_wake_thread(desc, action);
/* Fall through to add to randomness */
case IRQ_HANDLED:
flags |= action->flags;
break;
default:
break;
}
retval |= res;
action = action->next;
} while (action);
add_interrupt_randomness(irq, flags);
if (!noirqdebug)
note_interrupt(irq, desc, retval);
return retval;
}
irqreturn_t handle_irq_event(struct irq_desc *desc)
{
struct irqaction *action = desc->action;
irqreturn_t ret;
desc->istate &= ~IRQS_PENDING;
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock(&desc->lock);
ret = handle_irq_event_percpu(desc, action);
raw_spin_lock(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
return ret;
}

212
kernel/irq/internals.h Normal file
View file

@ -0,0 +1,212 @@
/*
* IRQ subsystem internal functions and variables:
*
* Do not ever include this file from anything other than
* kernel/irq/. Do not even think about using any information outside
* of this file for your non-core code.
*/
#include <linux/irqdesc.h>
#include <linux/kernel_stat.h>
#ifdef CONFIG_SPARSE_IRQ
# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
#else
# define IRQ_BITMAP_BITS NR_IRQS
#endif
#define istate core_internal_state__do_not_mess_with_it
extern bool noirqdebug;
/*
* Bits used by threaded handlers:
* IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
* IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
* IRQTF_AFFINITY - irq thread is requested to adjust affinity
* IRQTF_FORCED_THREAD - irq action is force threaded
*/
enum {
IRQTF_RUNTHREAD,
IRQTF_WARNED,
IRQTF_AFFINITY,
IRQTF_FORCED_THREAD,
};
/*
* Bit masks for desc->core_internal_state__do_not_mess_with_it
*
* IRQS_AUTODETECT - autodetection in progress
* IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
* detection
* IRQS_POLL_INPROGRESS - polling in progress
* IRQS_ONESHOT - irq is not unmasked in primary handler
* IRQS_REPLAY - irq is replayed
* IRQS_WAITING - irq is waiting
* IRQS_PENDING - irq is pending and replayed later
* IRQS_SUSPENDED - irq is suspended
*/
enum {
IRQS_AUTODETECT = 0x00000001,
IRQS_SPURIOUS_DISABLED = 0x00000002,
IRQS_POLL_INPROGRESS = 0x00000008,
IRQS_ONESHOT = 0x00000020,
IRQS_REPLAY = 0x00000040,
IRQS_WAITING = 0x00000080,
IRQS_PENDING = 0x00000200,
IRQS_SUSPENDED = 0x00000800,
};
#include "debug.h"
#include "settings.h"
#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
extern int irq_startup(struct irq_desc *desc, bool resend);
extern void irq_shutdown(struct irq_desc *desc);
extern void irq_enable(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
extern void mask_irq(struct irq_desc *desc);
extern void unmask_irq(struct irq_desc *desc);
extern void unmask_threaded_irq(struct irq_desc *desc);
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
extern void irq_lock_sparse(void);
extern void irq_unlock_sparse(void);
#else
extern void irq_mark_irq(unsigned int irq);
static inline void irq_lock_sparse(void) { }
static inline void irq_unlock_sparse(void) { }
#endif
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts: */
void check_irq_resend(struct irq_desc *desc, unsigned int irq);
bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void register_handler_proc(unsigned int irq, struct irqaction *action);
extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
#else
static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
static inline void register_handler_proc(unsigned int irq,
struct irqaction *action) { }
static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
extern void irq_set_thread_affinity(struct irq_desc *desc);
extern int irq_do_set_affinity(struct irq_data *data,
const struct cpumask *dest, bool force);
/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(struct irq_desc *desc)
{
if (unlikely(desc->irq_data.chip->irq_bus_lock))
desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
}
static inline void chip_bus_sync_unlock(struct irq_desc *desc)
{
if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
}
#define _IRQ_DESC_CHECK (1 << 0)
#define _IRQ_DESC_PERCPU (1 << 1)
#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
struct irq_desc *
__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
unsigned int check);
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
static inline struct irq_desc *
irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
{
return __irq_get_desc_lock(irq, flags, true, check);
}
static inline void
irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
{
__irq_put_desc_unlock(desc, flags, true);
}
static inline struct irq_desc *
irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
{
return __irq_get_desc_lock(irq, flags, false, check);
}
static inline void
irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
{
__irq_put_desc_unlock(desc, flags, false);
}
/*
* Manipulation functions for irq_data.state
*/
static inline void irqd_set_move_pending(struct irq_data *d)
{
d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
}
static inline void irqd_clr_move_pending(struct irq_data *d)
{
d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
}
static inline void irqd_clear(struct irq_data *d, unsigned int mask)
{
d->state_use_accessors &= ~mask;
}
static inline void irqd_set(struct irq_data *d, unsigned int mask)
{
d->state_use_accessors |= mask;
}
static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
{
return d->state_use_accessors & mask;
}
static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
{
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
}
#ifdef CONFIG_PM_SLEEP
bool irq_pm_check_wakeup(struct irq_desc *desc);
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
#else
static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
static inline void
irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
static inline void
irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
#endif

659
kernel/irq/irqdesc.c Normal file
View file

@ -0,0 +1,659 @@
/*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
* This file contains the interrupt descriptor management code
*
* Detailed information is available in Documentation/DocBook/genericirq
*
*/
#include <linux/irq.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/radix-tree.h>
#include <linux/bitmap.h>
#include <linux/irqdomain.h>
#include <linux/exynos-ss.h>
#include "internals.h"
/*
* lockdep: we want to handle all irq_desc locks as a single lock-class:
*/
static struct lock_class_key irq_desc_lock_class;
#ifdef CONFIG_SCHED_HMP
extern struct cpumask hmp_slow_cpu_mask;
#endif
#if defined(CONFIG_SMP)
static void __init init_irq_default_affinity(void)
{
alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
#ifdef CONFIG_SCHED_HMP
cpumask_copy(irq_default_affinity, &hmp_slow_cpu_mask);
#else
cpumask_setall(irq_default_affinity);
#endif
}
#else
static void __init init_irq_default_affinity(void)
{
}
#endif
#ifdef CONFIG_SMP
static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
{
if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
return -ENOMEM;
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
free_cpumask_var(desc->irq_data.affinity);
return -ENOMEM;
}
#endif
return 0;
}
static void desc_smp_init(struct irq_desc *desc, int node)
{
desc->irq_data.node = node;
cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_clear(desc->pending_mask);
#endif
}
static inline int desc_node(struct irq_desc *desc)
{
return desc->irq_data.node;
}
#else
static inline int
alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
static inline void desc_smp_init(struct irq_desc *desc, int node) { }
static inline int desc_node(struct irq_desc *desc) { return 0; }
#endif
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
struct module *owner)
{
int cpu;
desc->irq_data.irq = irq;
desc->irq_data.chip = &no_irq_chip;
desc->irq_data.chip_data = NULL;
desc->irq_data.handler_data = NULL;
desc->irq_data.msi_desc = NULL;
irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
desc->handle_irq = handle_bad_irq;
desc->depth = 1;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
desc->name = NULL;
desc->owner = owner;
for_each_possible_cpu(cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
desc_smp_init(desc, node);
}
int nr_irqs = NR_IRQS;
EXPORT_SYMBOL_GPL(nr_irqs);
static DEFINE_MUTEX(sparse_irq_lock);
static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
#ifdef CONFIG_SPARSE_IRQ
static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
{
radix_tree_insert(&irq_desc_tree, irq, desc);
}
struct irq_desc *irq_to_desc(unsigned int irq)
{
return radix_tree_lookup(&irq_desc_tree, irq);
}
EXPORT_SYMBOL(irq_to_desc);
static void delete_irq_desc(unsigned int irq)
{
radix_tree_delete(&irq_desc_tree, irq);
}
#ifdef CONFIG_SMP
static void free_masks(struct irq_desc *desc)
{
#ifdef CONFIG_GENERIC_PENDING_IRQ
free_cpumask_var(desc->pending_mask);
#endif
free_cpumask_var(desc->irq_data.affinity);
}
#else
static inline void free_masks(struct irq_desc *desc) { }
#endif
void irq_lock_sparse(void)
{
mutex_lock(&sparse_irq_lock);
}
void irq_unlock_sparse(void)
{
mutex_unlock(&sparse_irq_lock);
}
static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
{
struct irq_desc *desc;
gfp_t gfp = GFP_KERNEL;
desc = kzalloc_node(sizeof(*desc), gfp, node);
if (!desc)
return NULL;
/* allocate based on nr_cpu_ids */
desc->kstat_irqs = alloc_percpu(unsigned int);
if (!desc->kstat_irqs)
goto err_desc;
if (alloc_masks(desc, gfp, node))
goto err_kstat;
raw_spin_lock_init(&desc->lock);
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
desc_set_defaults(irq, desc, node, owner);
return desc;
err_kstat:
free_percpu(desc->kstat_irqs);
err_desc:
kfree(desc);
return NULL;
}
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
unregister_irq_proc(irq, desc);
/*
* sparse_irq_lock also protects show_interrupts() and
* kstat_irqs_usr(). Once we have deleted the descriptor from the
* sparse tree we can free it; accesses from proc will then fail to
* look up the descriptor.
*/
mutex_lock(&sparse_irq_lock);
delete_irq_desc(irq);
mutex_unlock(&sparse_irq_lock);
free_masks(desc);
free_percpu(desc->kstat_irqs);
kfree(desc);
}
static int alloc_descs(unsigned int start, unsigned int cnt, int node,
struct module *owner)
{
struct irq_desc *desc;
int i;
for (i = 0; i < cnt; i++) {
desc = alloc_desc(start + i, node, owner);
if (!desc)
goto err;
mutex_lock(&sparse_irq_lock);
irq_insert_desc(start + i, desc);
mutex_unlock(&sparse_irq_lock);
}
return start;
err:
for (i--; i >= 0; i--)
free_desc(start + i);
mutex_lock(&sparse_irq_lock);
bitmap_clear(allocated_irqs, start, cnt);
mutex_unlock(&sparse_irq_lock);
return -ENOMEM;
}
static int irq_expand_nr_irqs(unsigned int nr)
{
if (nr > IRQ_BITMAP_BITS)
return -ENOMEM;
nr_irqs = nr;
return 0;
}
int __init early_irq_init(void)
{
int i, initcnt, node = first_online_node;
struct irq_desc *desc;
init_irq_default_affinity();
/* Let arch update nr_irqs and return the nr of preallocated irqs */
initcnt = arch_probe_nr_irqs();
printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
nr_irqs = IRQ_BITMAP_BITS;
if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
initcnt = IRQ_BITMAP_BITS;
if (initcnt > nr_irqs)
nr_irqs = initcnt;
for (i = 0; i < initcnt; i++) {
desc = alloc_desc(i, node, NULL);
set_bit(i, allocated_irqs);
irq_insert_desc(i, desc);
}
return arch_early_irq_init();
}
#else /* !CONFIG_SPARSE_IRQ */
struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS-1] = {
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
}
};
int __init early_irq_init(void)
{
int count, i, node = first_online_node;
struct irq_desc *desc;
init_irq_default_affinity();
printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
desc = irq_desc;
count = ARRAY_SIZE(irq_desc);
for (i = 0; i < count; i++) {
desc[i].kstat_irqs = alloc_percpu(unsigned int);
alloc_masks(&desc[i], GFP_KERNEL, node);
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
desc_set_defaults(i, &desc[i], node, NULL);
}
return arch_early_irq_init();
}
struct irq_desc *irq_to_desc(unsigned int irq)
{
return (irq < NR_IRQS) ? irq_desc + irq : NULL;
}
EXPORT_SYMBOL(irq_to_desc);
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
raw_spin_lock_irqsave(&desc->lock, flags);
desc_set_defaults(irq, desc, desc_node(desc), NULL);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
struct module *owner)
{
u32 i;
for (i = 0; i < cnt; i++) {
struct irq_desc *desc = irq_to_desc(start + i);
desc->owner = owner;
}
return start;
}
static int irq_expand_nr_irqs(unsigned int nr)
{
return -ENOMEM;
}
void irq_mark_irq(unsigned int irq)
{
mutex_lock(&sparse_irq_lock);
bitmap_set(allocated_irqs, irq, 1);
mutex_unlock(&sparse_irq_lock);
}
#ifdef CONFIG_GENERIC_IRQ_LEGACY
void irq_init_desc(unsigned int irq)
{
free_desc(irq);
}
#endif
#endif /* !CONFIG_SPARSE_IRQ */
/**
* generic_handle_irq - Invoke the handler for a particular irq
* @irq: The irq number to handle
*
*/
int generic_handle_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (!desc)
return -EINVAL;
generic_handle_irq_desc(irq, desc);
return 0;
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
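An illustrative sketch of the usual caller of generic_handle_irq(): a chained handler that reads a controller status register and demultiplexes each pending bit. The example_priv layout, the 0x10 status offset and the 32-bit width are assumptions; the two-argument flow-handler signature matches this tree, and a real driver would normally also bracket the loop with chained_irq_enter()/chained_irq_exit().

struct example_priv {
	void __iomem *base;
	struct irq_domain *domain;
};

static void example_demux_handler(unsigned int irq, struct irq_desc *desc)
{
	struct example_priv *priv = irq_desc_get_handler_data(desc);
	unsigned long pending;
	int bit;

	/* 0x10 is an assumed status register offset */
	pending = readl(priv->base + 0x10);

	for_each_set_bit(bit, &pending, 32)
		generic_handle_irq(irq_find_mapping(priv->domain, bit));
}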
#ifdef CONFIG_HANDLE_DOMAIN_IRQ
/**
* __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
* @lookup: Whether to perform the domain lookup or not
* @regs: Register file coming from the low-level handling code
*
* Returns: 0 on success, or -EINVAL if conversion has failed
*/
int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
bool lookup, struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
unsigned long long start_time;
unsigned int irq = hwirq;
int ret = 0;
exynos_ss_irq_exit_var(start_time);
irq_enter();
#ifdef CONFIG_IRQ_DOMAIN
if (lookup)
irq = irq_find_mapping(domain, hwirq);
#endif
/*
* Some hardware gives randomly wrong interrupts. Rather
* than crashing, do something sensible.
*/
if (unlikely(!irq || irq >= nr_irqs)) {
ack_bad_irq(irq);
ret = -EINVAL;
} else {
generic_handle_irq(irq);
}
irq_exit();
exynos_ss_irq_exit(irq, start_time);
set_irq_regs(old_regs);
return ret;
}
#endif
/* Dynamic interrupt handling */
/**
* irq_free_descs - free irq descriptors
* @from: Start of descriptor range
* @cnt: Number of consecutive irqs to free
*/
void irq_free_descs(unsigned int from, unsigned int cnt)
{
int i;
if (from >= nr_irqs || (from + cnt) > nr_irqs)
return;
for (i = 0; i < cnt; i++)
free_desc(from + i);
mutex_lock(&sparse_irq_lock);
bitmap_clear(allocated_irqs, from, cnt);
mutex_unlock(&sparse_irq_lock);
}
EXPORT_SYMBOL_GPL(irq_free_descs);
/**
* irq_alloc_descs - allocate and initialize a range of irq descriptors
* @irq: Allocate for specific irq number if irq >= 0
* @from: Start the search from this irq number
* @cnt: Number of consecutive irqs to allocate.
* @node: Preferred node on which the irq descriptor should be allocated
* @owner: Owning module (can be NULL)
*
* Returns the first irq number or error code
*/
int __ref
__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
struct module *owner)
{
int start, ret;
if (!cnt)
return -EINVAL;
if (irq >= 0) {
if (from > irq)
return -EINVAL;
from = irq;
} else {
/*
* For interrupts which are freely allocated the
* architecture can force a lower bound to the @from
* argument. x86 uses this to exclude the GSI space.
*/
from = arch_dynirq_lower_bound(from);
}
mutex_lock(&sparse_irq_lock);
start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
from, cnt, 0);
ret = -EEXIST;
if (irq >= 0 && start != irq)
goto err;
if (start + cnt > nr_irqs) {
ret = irq_expand_nr_irqs(start + cnt);
if (ret)
goto err;
}
bitmap_set(allocated_irqs, start, cnt);
mutex_unlock(&sparse_irq_lock);
return alloc_descs(start, cnt, node, owner);
err:
mutex_unlock(&sparse_irq_lock);
return ret;
}
EXPORT_SYMBOL_GPL(__irq_alloc_descs);
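A small usage sketch of the irq_alloc_descs()/irq_free_descs() wrappers from <linux/irq.h>; the block size of 16 and the node hint are arbitrary assumptions.

static int example_alloc_irq_block(void)
{
	int base;

	/* 16 freely numbered descriptors, preferably on the local node */
	base = irq_alloc_descs(-1, 0, 16, numa_node_id());
	if (base < 0)
		return base;

	/* ... install chips and handlers for base .. base + 15 here ... */

	irq_free_descs(base, 16);
	return 0;
}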
#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
/**
* irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware
* @cnt: number of interrupts to allocate
* @node: node on which to allocate
*
* Returns an interrupt number > 0, or 0 if the allocation fails.
*/
unsigned int irq_alloc_hwirqs(int cnt, int node)
{
int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
if (irq < 0)
return 0;
for (i = irq; cnt > 0; i++, cnt--) {
if (arch_setup_hwirq(i, node))
goto err;
irq_clear_status_flags(i, _IRQ_NOREQUEST);
}
return irq;
err:
for (i--; i >= irq; i--) {
irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
arch_teardown_hwirq(i);
}
irq_free_descs(irq, cnt);
return 0;
}
EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
/**
* irq_free_hwirqs - Free irq descriptor and cleanup the hardware
* @from: Free from irq number
* @cnt: number of interrupts to free
*
*/
void irq_free_hwirqs(unsigned int from, int cnt)
{
int i, j;
for (i = from, j = cnt; j > 0; i++, j--) {
irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
arch_teardown_hwirq(i);
}
irq_free_descs(from, cnt);
}
EXPORT_SYMBOL_GPL(irq_free_hwirqs);
#endif
/**
* irq_get_next_irq - get next allocated irq number
* @offset: where to start the search
*
* Returns next irq number after offset or nr_irqs if none is found.
*/
unsigned int irq_get_next_irq(unsigned int offset)
{
return find_next_bit(allocated_irqs, nr_irqs, offset);
}
struct irq_desc *
__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
unsigned int check)
{
struct irq_desc *desc = irq_to_desc(irq);
if (desc) {
if (check & _IRQ_DESC_CHECK) {
if ((check & _IRQ_DESC_PERCPU) &&
!irq_settings_is_per_cpu_devid(desc))
return NULL;
if (!(check & _IRQ_DESC_PERCPU) &&
irq_settings_is_per_cpu_devid(desc))
return NULL;
}
if (bus)
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, *flags);
}
return desc;
}
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
{
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (bus)
chip_bus_sync_unlock(desc);
}
int irq_set_percpu_devid(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (!desc)
return -EINVAL;
if (desc->percpu_enabled)
return -EINVAL;
desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
if (!desc->percpu_enabled)
return -ENOMEM;
irq_set_percpu_devid_flags(irq);
return 0;
}
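For illustration: a line marked with irq_set_percpu_devid() is then requested with request_percpu_irq() rather than request_irq(). The handler, the device name and the __percpu pointer are assumptions.

static irqreturn_t example_percpu_handler(int irq, void *dev_id)
{
	/* dev_id is the per-cpu slice belonging to the CPU taking the irq */
	return IRQ_HANDLED;
}

static int example_setup_percpu_irq(unsigned int virq,
				    void __percpu *percpu_dev)
{
	int ret;

	ret = irq_set_percpu_devid(virq);
	if (ret)
		return ret;

	return request_percpu_irq(virq, example_percpu_handler,
				  "example-percpu", percpu_dev);
}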
void kstat_incr_irq_this_cpu(unsigned int irq)
{
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
}
/**
* kstat_irqs_cpu - Get the statistics for an interrupt on a cpu
* @irq: The interrupt number
* @cpu: The cpu number
*
* Returns the sum of interrupt counts on @cpu since boot for
* @irq. The caller must ensure that the interrupt is not removed
* concurrently.
*/
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
return desc && desc->kstat_irqs ?
*per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}
/**
* kstat_irqs - Get the statistics for an interrupt
* @irq: The interrupt number
*
* Returns the sum of interrupt counts on all cpus since boot for
* @irq. The caller must ensure that the interrupt is not removed
* concurrently.
*/
unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
int cpu;
int sum = 0;
if (!desc || !desc->kstat_irqs)
return 0;
for_each_possible_cpu(cpu)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
}
/**
* kstat_irqs_usr - Get the statistics for an interrupt
* @irq: The interrupt number
*
* Returns the sum of interrupt counts on all cpus since boot for
* @irq. Contrary to kstat_irqs() this can be called from any
* preemptible context. It's protected against concurrent removal of
* an interrupt descriptor when sparse irqs are enabled.
*/
unsigned int kstat_irqs_usr(unsigned int irq)
{
int sum;
irq_lock_sparse();
sum = kstat_irqs(irq);
irq_unlock_sparse();
return sum;
}
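A tiny usage sketch of the statistics helpers above; the fixed CPU 0 and the pr_info() reporting are assumptions.

static void example_report_irq_stats(unsigned int irq)
{
	unsigned int on_cpu0 = kstat_irqs_cpu(irq, 0);
	unsigned int total = kstat_irqs_usr(irq);

	pr_info("irq %u: %u on CPU0, %u in total\n", irq, on_cpu0, total);
}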

711
kernel/irq/irqdomain.c Normal file
View file

@ -0,0 +1,711 @@
#define pr_fmt(fmt) "irq: " fmt
#include <linux/debugfs.h>
#include <linux/hardirq.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
#include <linux/topology.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/fs.h>
static LIST_HEAD(irq_domain_list);
static DEFINE_MUTEX(irq_domain_mutex);
static DEFINE_MUTEX(revmap_trees_mutex);
static struct irq_domain *irq_default_domain;
/**
* __irq_domain_add() - Allocate a new irq_domain data structure
* @of_node: optional device-tree node of the interrupt controller
* @size: Size of linear map; 0 for radix mapping only
* @hwirq_max: Maximum number of interrupts supported by controller
* @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
* direct mapping
* @ops: map/unmap domain callbacks
* @host_data: Controller private data pointer
*
* Allocates and initializes an irq_domain structure.
* Returns pointer to IRQ domain, or NULL on failure.
*/
struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
irq_hw_number_t hwirq_max, int direct_max,
const struct irq_domain_ops *ops,
void *host_data)
{
struct irq_domain *domain;
domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
GFP_KERNEL, of_node_to_nid(of_node));
if (WARN_ON(!domain))
return NULL;
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
domain->ops = ops;
domain->host_data = host_data;
domain->of_node = of_node_get(of_node);
domain->hwirq_max = hwirq_max;
domain->revmap_size = size;
domain->revmap_direct_max_irq = direct_max;
mutex_lock(&irq_domain_mutex);
list_add(&domain->link, &irq_domain_list);
mutex_unlock(&irq_domain_mutex);
pr_debug("Added domain %s\n", domain->name);
return domain;
}
EXPORT_SYMBOL_GPL(__irq_domain_add);
/**
* irq_domain_remove() - Remove an irq domain.
* @domain: domain to remove
*
* This routine is used to remove an irq domain. The caller must ensure
* that all mappings within the domain have been disposed of prior to
* calling this function, as appropriate for the revmap type.
*/
void irq_domain_remove(struct irq_domain *domain)
{
mutex_lock(&irq_domain_mutex);
/*
* radix_tree_delete() takes care of destroying the root
* node when all entries are removed. Shout if there are
* any mappings left.
*/
WARN_ON(domain->revmap_tree.height);
list_del(&domain->link);
/*
* If the going away domain is the default one, reset it.
*/
if (unlikely(irq_default_domain == domain))
irq_set_default_host(NULL);
mutex_unlock(&irq_domain_mutex);
pr_debug("Removed domain %s\n", domain->name);
of_node_put(domain->of_node);
kfree(domain);
}
EXPORT_SYMBOL_GPL(irq_domain_remove);
/**
* irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
* @of_node: pointer to interrupt controller's device tree node.
* @size: total number of irqs in mapping
* @first_irq: first number of irq block assigned to the domain,
* pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
* pre-map all of the irqs in the domain to virqs starting at first_irq.
* @ops: map/unmap domain callbacks
* @host_data: Controller private data pointer
*
* Allocates an irq_domain and, if first_irq is positive, also
* allocates irq_descs and maps all of the hwirqs to virqs starting at first_irq.
*
* This is intended to implement the expected behaviour for most
* interrupt controllers. If device tree is used, then first_irq will be 0 and
* irqs get mapped dynamically on the fly. However, if the controller requires
* static virq assignments (non-DT boot) then it will set that up correctly.
*/
struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
unsigned int size,
unsigned int first_irq,
const struct irq_domain_ops *ops,
void *host_data)
{
struct irq_domain *domain;
domain = __irq_domain_add(of_node, size, size, 0, ops, host_data);
if (!domain)
return NULL;
if (first_irq > 0) {
if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
/* attempt to allocate irq_descs */
int rc = irq_alloc_descs(first_irq, first_irq, size,
of_node_to_nid(of_node));
if (rc < 0)
pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
first_irq);
}
irq_domain_associate_many(domain, first_irq, 0, size);
}
return domain;
}
EXPORT_SYMBOL_GPL(irq_domain_add_simple);
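A hedged sketch of a typical caller: an interrupt-controller driver registering a simple domain whose ->map() callback wires each virq to the controller's irq_chip. example_chip, the one-cell xlate and the fixed size of 32 are assumptions.

static struct irq_chip example_chip;	/* assumed, set up elsewhere */

static int example_map(struct irq_domain *d, unsigned int virq,
		       irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &example_chip, handle_level_irq);
	irq_set_chip_data(virq, d->host_data);
	return 0;
}

static const struct irq_domain_ops example_domain_ops = {
	.map	= example_map,
	.xlate	= irq_domain_xlate_onecell,
};

static struct irq_domain *example_domain_init(struct device_node *np,
					      void *priv)
{
	/* 32 hwirqs, virqs assigned on the fly (first_irq == 0) */
	return irq_domain_add_simple(np, 32, 0, &example_domain_ops, priv);
}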
/**
* irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
* @of_node: pointer to interrupt controller's device tree node.
* @size: total number of irqs in legacy mapping
* @first_irq: first number of irq block assigned to the domain
* @first_hwirq: first hwirq number to use for the translation. Should normally
* be '0', but a positive integer can be used if the effective
* hwirq numbering does not begin at zero.
* @ops: map/unmap domain callbacks
* @host_data: Controller private data pointer
*
* Note: the map() callback will be called before this function returns
* for all legacy interrupts except 0 (which is always the invalid irq for
* a legacy controller).
*/
struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
unsigned int size,
unsigned int first_irq,
irq_hw_number_t first_hwirq,
const struct irq_domain_ops *ops,
void *host_data)
{
struct irq_domain *domain;
domain = __irq_domain_add(of_node, first_hwirq + size,
first_hwirq + size, 0, ops, host_data);
if (!domain)
return NULL;
irq_domain_associate_many(domain, first_irq, first_hwirq, size);
return domain;
}
EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
/**
* irq_find_host() - Locates a domain for a given device node
* @node: device-tree node of the interrupt controller
*/
struct irq_domain *irq_find_host(struct device_node *node)
{
struct irq_domain *h, *found = NULL;
int rc;
/* We might want to match the legacy controller last since
* it might potentially be set to match all interrupts in
* the absence of a device node. This isn't a problem so far,
* though...
*/
mutex_lock(&irq_domain_mutex);
list_for_each_entry(h, &irq_domain_list, link) {
if (h->ops->match)
rc = h->ops->match(h, node);
else
rc = (h->of_node != NULL) && (h->of_node == node);
if (rc) {
found = h;
break;
}
}
mutex_unlock(&irq_domain_mutex);
return found;
}
EXPORT_SYMBOL_GPL(irq_find_host);
/**
* irq_set_default_host() - Set a "default" irq domain
* @domain: default domain pointer
*
* For convenience, it's possible to set a "default" domain that will be used
* whenever NULL is passed to irq_create_mapping(). It makes life easier for
* platforms that want to manipulate a few hard coded interrupt numbers that
* aren't properly represented in the device-tree.
*/
void irq_set_default_host(struct irq_domain *domain)
{
pr_debug("Default domain set to @0x%p\n", domain);
irq_default_domain = domain;
}
EXPORT_SYMBOL_GPL(irq_set_default_host);
void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
{
struct irq_data *irq_data = irq_get_irq_data(irq);
irq_hw_number_t hwirq;
if (WARN(!irq_data || irq_data->domain != domain,
"virq%i doesn't exist; cannot disassociate\n", irq))
return;
hwirq = irq_data->hwirq;
irq_set_status_flags(irq, IRQ_NOREQUEST);
/* remove chip and handler */
irq_set_chip_and_handler(irq, NULL, NULL);
/* Make sure it's completed */
synchronize_irq(irq);
/* Tell the PIC about it */
if (domain->ops->unmap)
domain->ops->unmap(domain, irq);
smp_mb();
irq_data->domain = NULL;
irq_data->hwirq = 0;
/* Clear reverse map for this hwirq */
if (hwirq < domain->revmap_size) {
domain->linear_revmap[hwirq] = 0;
} else {
mutex_lock(&revmap_trees_mutex);
radix_tree_delete(&domain->revmap_tree, hwirq);
mutex_unlock(&revmap_trees_mutex);
}
}
int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
irq_hw_number_t hwirq)
{
struct irq_data *irq_data = irq_get_irq_data(virq);
int ret;
if (WARN(hwirq >= domain->hwirq_max,
"error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name))
return -EINVAL;
if (WARN(!irq_data, "error: virq%i is not allocated", virq))
return -EINVAL;
if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
return -EINVAL;
mutex_lock(&irq_domain_mutex);
irq_data->hwirq = hwirq;
irq_data->domain = domain;
if (domain->ops->map) {
ret = domain->ops->map(domain, virq, hwirq);
if (ret != 0) {
/*
* If map() returns -EPERM, this interrupt is protected
* by the firmware or some other service and shall not
* be mapped. Don't bother telling the user about it.
*/
if (ret != -EPERM) {
pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n",
domain->name, hwirq, virq, ret);
}
irq_data->domain = NULL;
irq_data->hwirq = 0;
mutex_unlock(&irq_domain_mutex);
return ret;
}
/* If not already assigned, give the domain the chip's name */
if (!domain->name && irq_data->chip)
domain->name = irq_data->chip->name;
}
if (hwirq < domain->revmap_size) {
domain->linear_revmap[hwirq] = virq;
} else {
mutex_lock(&revmap_trees_mutex);
radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
mutex_unlock(&revmap_trees_mutex);
}
mutex_unlock(&irq_domain_mutex);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
return 0;
}
EXPORT_SYMBOL_GPL(irq_domain_associate);
void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
irq_hw_number_t hwirq_base, int count)
{
int i;
pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
for (i = 0; i < count; i++) {
irq_domain_associate(domain, irq_base + i, hwirq_base + i);
}
}
EXPORT_SYMBOL_GPL(irq_domain_associate_many);
/**
* irq_create_direct_mapping() - Allocate an irq for direct mapping
* @domain: domain to allocate the irq for or NULL for default domain
*
* This routine is used for irq controllers which can choose the hardware
* interrupt numbers they generate. In such a case it's simplest to use
* the linux irq as the hardware interrupt number. It still uses the linear
* or radix tree to store the mapping, but the irq controller can optimize
* the revmap path by using the hwirq directly.
*/
unsigned int irq_create_direct_mapping(struct irq_domain *domain)
{
unsigned int virq;
if (domain == NULL)
domain = irq_default_domain;
virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
if (!virq) {
pr_debug("create_direct virq allocation failed\n");
return 0;
}
if (virq >= domain->revmap_direct_max_irq) {
pr_err("ERROR: no free irqs available below %i maximum\n",
domain->revmap_direct_max_irq);
irq_free_desc(virq);
return 0;
}
pr_debug("create_direct obtained virq %d\n", virq);
if (irq_domain_associate(domain, virq, virq)) {
irq_free_desc(virq);
return 0;
}
return virq;
}
EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
/**
* irq_create_mapping() - Map a hardware interrupt into linux irq space
* @domain: domain owning this hardware interrupt or NULL for default domain
* @hwirq: hardware irq number in that domain space
*
* Only one mapping per hardware interrupt is permitted. Returns a linux
* irq number.
* If the sense/trigger is to be specified, set_irq_type() should be called
* on the number returned from that call.
*/
unsigned int irq_create_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
unsigned int hint;
int virq;
pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
/* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
if (domain == NULL) {
WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
return 0;
}
pr_debug("-> using domain @%p\n", domain);
/* Check if mapping already exists */
virq = irq_find_mapping(domain, hwirq);
if (virq) {
pr_debug("-> existing mapping on virq %d\n", virq);
return virq;
}
/* Allocate a virtual interrupt number */
hint = hwirq % nr_irqs;
if (hint == 0)
hint++;
virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
if (virq <= 0)
virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
if (virq <= 0) {
pr_debug("-> virq allocation failed\n");
return 0;
}
if (irq_domain_associate(domain, virq, hwirq)) {
irq_free_desc(virq);
return 0;
}
pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
hwirq, of_node_full_name(domain->of_node), virq);
return virq;
}
EXPORT_SYMBOL_GPL(irq_create_mapping);
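For illustration, a non-DT user mapping a known hardware line and requesting the resulting linux irq; the hwirq number 7, the handler and the name are assumptions.

static irqreturn_t example_isr(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int example_request_hwirq(struct irq_domain *domain, void *dev)
{
	unsigned int virq = irq_create_mapping(domain, 7);

	if (!virq)
		return -ENXIO;

	return request_irq(virq, example_isr, 0, "example", dev);
}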
/**
* irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
* @domain: domain owning the interrupt range
* @irq_base: beginning of linux IRQ range
* @hwirq_base: beginning of hardware IRQ range
* @count: Number of interrupts to map
*
* This routine is used for allocating and mapping a range of hardware
* irqs to linux irqs where the linux irq numbers are at pre-defined
* locations. For use by controllers that already have static mappings
* to insert in to the domain.
*
* Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
* domain insertion.
*
* 0 is returned upon success, while any failure to establish a static
* mapping is treated as an error.
*/
int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
irq_hw_number_t hwirq_base, int count)
{
int ret;
ret = irq_alloc_descs(irq_base, irq_base, count,
of_node_to_nid(domain->of_node));
if (unlikely(ret < 0))
return ret;
irq_domain_associate_many(domain, irq_base, hwirq_base, count);
return 0;
}
EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
{
struct irq_domain *domain;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
unsigned int virq;
domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
if (!domain) {
pr_warn("no irq domain found for %s !\n",
of_node_full_name(irq_data->np));
return 0;
}
/* If domain has no translation, then we assume interrupt line */
if (domain->ops->xlate == NULL)
hwirq = irq_data->args[0];
else {
if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
irq_data->args_count, &hwirq, &type))
return 0;
}
/* Create mapping */
virq = irq_create_mapping(domain, hwirq);
if (!virq)
return virq;
/* Set type if specified and different than the current one */
if (type != IRQ_TYPE_NONE &&
type != irq_get_trigger_type(virq))
irq_set_irq_type(virq, type);
return virq;
}
EXPORT_SYMBOL_GPL(irq_create_of_mapping);
/**
* irq_dispose_mapping() - Unmap an interrupt
* @virq: linux irq number of the interrupt to unmap
*/
void irq_dispose_mapping(unsigned int virq)
{
struct irq_data *irq_data = irq_get_irq_data(virq);
struct irq_domain *domain;
if (!virq || !irq_data)
return;
domain = irq_data->domain;
if (WARN_ON(domain == NULL))
return;
irq_domain_disassociate(domain, virq);
irq_free_desc(virq);
}
EXPORT_SYMBOL_GPL(irq_dispose_mapping);
/**
* irq_find_mapping() - Find a linux irq from an hw irq number.
* @domain: domain owning this hardware interrupt
* @hwirq: hardware irq number in that domain space
*/
unsigned int irq_find_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
struct irq_data *data;
/* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
if (domain == NULL)
return 0;
if (hwirq < domain->revmap_direct_max_irq) {
data = irq_get_irq_data(hwirq);
if (data && (data->domain == domain) && (data->hwirq == hwirq))
return hwirq;
}
/* Check if the hwirq is in the linear revmap. */
if (hwirq < domain->revmap_size)
return domain->linear_revmap[hwirq];
rcu_read_lock();
data = radix_tree_lookup(&domain->revmap_tree, hwirq);
rcu_read_unlock();
return data ? data->irq : 0;
}
EXPORT_SYMBOL_GPL(irq_find_mapping);
#ifdef CONFIG_IRQ_DOMAIN_DEBUG
static int virq_debug_show(struct seq_file *m, void *private)
{
unsigned long flags;
struct irq_desc *desc;
struct irq_domain *domain;
struct radix_tree_iter iter;
void *data, **slot;
int i;
seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
"name", "mapped", "linear-max", "direct-max", "devtree-node");
mutex_lock(&irq_domain_mutex);
list_for_each_entry(domain, &irq_domain_list, link) {
int count = 0;
radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
count++;
seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
domain == irq_default_domain ? '*' : ' ', domain->name,
domain->revmap_size + count, domain->revmap_size,
domain->revmap_direct_max_irq,
domain->of_node ? of_node_full_name(domain->of_node) : "");
}
mutex_unlock(&irq_domain_mutex);
seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
"chip name", (int)(2 * sizeof(void *) + 2), "chip data",
"active", "type", "domain");
for (i = 1; i < nr_irqs; i++) {
desc = irq_to_desc(i);
if (!desc)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
domain = desc->irq_data.domain;
if (domain) {
struct irq_chip *chip;
int hwirq = desc->irq_data.hwirq;
bool direct;
seq_printf(m, "%5d ", i);
seq_printf(m, "0x%05x ", hwirq);
chip = irq_desc_get_chip(desc);
seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
data = irq_desc_get_chip_data(desc);
seq_printf(m, data ? "0x%p " : " %p ", data);
seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
seq_printf(m, "%6s%-8s ",
(hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
direct ? "(DIRECT)" : "");
seq_printf(m, "%s\n", desc->irq_data.domain->name);
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
return 0;
}
static int virq_debug_open(struct inode *inode, struct file *file)
{
return single_open(file, virq_debug_show, inode->i_private);
}
static const struct file_operations virq_debug_fops = {
.open = virq_debug_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int __init irq_debugfs_init(void)
{
if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
NULL, &virq_debug_fops) == NULL)
return -ENOMEM;
return 0;
}
__initcall(irq_debugfs_init);
#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
/**
* irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
*
* Device Tree IRQ specifier translation function which works with one cell
* bindings where the cell value maps directly to the hwirq number.
*/
int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr,
const u32 *intspec, unsigned int intsize,
unsigned long *out_hwirq, unsigned int *out_type)
{
if (WARN_ON(intsize < 1))
return -EINVAL;
*out_hwirq = intspec[0];
*out_type = IRQ_TYPE_NONE;
return 0;
}
EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
/**
* irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
*
* Device Tree IRQ specifier translation function which works with two cell
* bindings where the cell values map directly to the hwirq number
* and linux irq flags.
*/
int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
const u32 *intspec, unsigned int intsize,
irq_hw_number_t *out_hwirq, unsigned int *out_type)
{
if (WARN_ON(intsize < 2))
return -EINVAL;
*out_hwirq = intspec[0];
*out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
return 0;
}
EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
/**
* irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
*
* Device Tree IRQ specifier translation function which works with either one
* or two cell bindings where the cell values map directly to the hwirq number
* and linux irq flags.
*
* Note: don't use this function unless your interrupt controller explicitly
* supports both one and two cell bindings. For the majority of controllers
* the _onecell() or _twocell() variants above should be used.
*/
int irq_domain_xlate_onetwocell(struct irq_domain *d,
struct device_node *ctrlr,
const u32 *intspec, unsigned int intsize,
unsigned long *out_hwirq, unsigned int *out_type)
{
if (WARN_ON(intsize < 1))
return -EINVAL;
*out_hwirq = intspec[0];
*out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
return 0;
}
EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
const struct irq_domain_ops irq_domain_simple_ops = {
.xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);

1761
kernel/irq/manage.c Normal file

File diff suppressed because it is too large

72
kernel/irq/migration.c Normal file
View file

@ -0,0 +1,72 @@
#include <linux/irq.h>
#include <linux/interrupt.h>
#include "internals.h"
void irq_move_masked_irq(struct irq_data *idata)
{
struct irq_desc *desc = irq_data_to_desc(idata);
struct irq_chip *chip = idata->chip;
if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
return;
/*
* Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
*/
if (!irqd_can_balance(&desc->irq_data)) {
WARN_ON(1);
return;
}
irqd_clr_move_pending(&desc->irq_data);
if (unlikely(cpumask_empty(desc->pending_mask)))
return;
if (!chip->irq_set_affinity)
return;
assert_raw_spin_locked(&desc->lock);
/*
* If there was a valid mask to work with, please
* do the disable, re-program, enable sequence.
* This is *not* particularly important for level-triggered
* interrupts, but in an edge-triggered case we might be setting
* the RTE when an active trigger is coming in. This could
* cause some ioapics to malfunction.
* Being paranoid, I guess!
*
* For correct operation this depends on the caller
* masking the irqs.
*/
if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
cpumask_clear(desc->pending_mask);
}
void irq_move_irq(struct irq_data *idata)
{
bool masked;
if (likely(!irqd_is_setaffinity_pending(idata)))
return;
if (unlikely(irqd_irq_disabled(idata)))
return;
/*
* Be careful vs. already masked interrupts. If this is a
* threaded interrupt with ONESHOT set, we can end up with an
* interrupt storm.
*/
masked = irqd_irq_masked(idata);
if (!masked)
idata->chip->irq_mask(idata);
irq_move_masked_irq(idata);
if (!masked)
idata->chip->irq_unmask(idata);
}

201
kernel/irq/pm.c Normal file
View file

@ -0,0 +1,201 @@
/*
* linux/kernel/irq/pm.c
*
* Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
*
* This file contains power management functions related to interrupts.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include "internals.h"
bool irq_pm_check_wakeup(struct irq_desc *desc)
{
if (irqd_is_wakeup_armed(&desc->irq_data)) {
irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
desc->depth++;
irq_disable(desc);
pm_system_wakeup();
return true;
}
return false;
}
/*
* Called from __setup_irq() with desc->lock held after @action has
* been installed in the action chain.
*/
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
{
desc->nr_actions++;
if (action->flags & IRQF_FORCE_RESUME)
desc->force_resume_depth++;
WARN_ON_ONCE(desc->force_resume_depth &&
desc->force_resume_depth != desc->nr_actions);
if (action->flags & IRQF_NO_SUSPEND)
desc->no_suspend_depth++;
WARN_ON_ONCE(desc->no_suspend_depth &&
desc->no_suspend_depth != desc->nr_actions);
}
/*
* Called from __free_irq() with desc->lock held after @action has
* been removed from the action chain.
*/
void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
{
desc->nr_actions--;
if (action->flags & IRQF_FORCE_RESUME)
desc->force_resume_depth--;
if (action->flags & IRQF_NO_SUSPEND)
desc->no_suspend_depth--;
}
static bool suspend_device_irq(struct irq_desc *desc, int irq)
{
if (!desc->action || desc->no_suspend_depth)
return false;
if (irqd_is_wakeup_set(&desc->irq_data)) {
irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
/*
* We return true here to force the caller to issue
* synchronize_irq(). We need to make sure that the
* IRQD_WAKEUP_ARMED is visible before we return from
* suspend_device_irqs().
*/
return true;
}
desc->istate |= IRQS_SUSPENDED;
__disable_irq(desc, irq);
/*
* Hardware which has no wakeup source configuration facility
* requires that the non wakeup interrupts are masked at the
* chip level. The chip implementation indicates that with
* IRQCHIP_MASK_ON_SUSPEND.
*/
if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
mask_irq(desc);
return true;
}
/**
* suspend_device_irqs - disable all currently enabled interrupt lines
*
* During system-wide suspend or hibernation device drivers need to be
* prevented from receiving interrupts and this function is provided
* for this purpose.
*
* So we disable all interrupts and mark them IRQS_SUSPENDED except
* for those which are unused, those which are marked as not
* suspendable via an interrupt request with the flag IRQF_NO_SUSPEND
* set and those which are marked as active wakeup sources.
*
* The active wakeup sources are handled by the flow handler entry
* code which checks for the IRQD_WAKEUP_ARMED flag, suspends the
* interrupt and notifies the pm core about the wakeup.
*/
void suspend_device_irqs(void)
{
struct irq_desc *desc;
int irq;
for_each_irq_desc(irq, desc) {
unsigned long flags;
bool sync;
raw_spin_lock_irqsave(&desc->lock, flags);
sync = suspend_device_irq(desc, irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (sync)
synchronize_irq(irq);
}
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
static void resume_irq(struct irq_desc *desc, int irq)
{
irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
if (desc->istate & IRQS_SUSPENDED)
goto resume;
/* Force resume the interrupt? */
if (!desc->force_resume_depth)
return;
/* Pretend that it got disabled! */
desc->depth++;
resume:
desc->istate &= ~IRQS_SUSPENDED;
__enable_irq(desc, irq);
}
static void resume_irqs(bool want_early)
{
struct irq_desc *desc;
int irq;
for_each_irq_desc(irq, desc) {
unsigned long flags;
bool is_early = desc->action &&
desc->action->flags & IRQF_EARLY_RESUME;
if (!is_early && want_early)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
resume_irq(desc, irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
/**
* irq_pm_syscore_ops - enable interrupt lines early
*
* Enable all interrupt lines with %IRQF_EARLY_RESUME set.
*/
static void irq_pm_syscore_resume(void)
{
resume_irqs(true);
}
static struct syscore_ops irq_pm_syscore_ops = {
.resume = irq_pm_syscore_resume,
};
static int __init irq_pm_init_ops(void)
{
register_syscore_ops(&irq_pm_syscore_ops);
return 0;
}
device_initcall(irq_pm_init_ops);
/**
* resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
*
* Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
* disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
* set as well as those with %IRQF_FORCE_RESUME.
*/
void resume_device_irqs(void)
{
resume_irqs(false);
}
EXPORT_SYMBOL_GPL(resume_device_irqs);
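A driver-side illustration of the wakeup handling above: a line armed with enable_irq_wake() is left enabled by suspend_device_irqs() and flagged via irq_pm_check_wakeup() when it fires; alternatively, IRQF_NO_SUSPEND keeps a line out of the suspend path entirely. The handler and names are assumptions.

static irqreturn_t example_wake_isr(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int example_setup_wake_irq(unsigned int irq, void *dev)
{
	int ret;

	ret = request_irq(irq, example_wake_isr, 0, "example-wake", dev);
	if (ret)
		return ret;

	/* arm the line as a wakeup source for system sleep */
	return enable_irq_wake(irq);
}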

504
kernel/irq/proc.c Normal file
View file

@ -0,0 +1,504 @@
/*
* linux/kernel/irq/proc.c
*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the /proc/irq/ handling code.
*/
#include <linux/irq.h>
#include <linux/gfp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include "internals.h"
/*
* Access rules:
*
* procfs protects read/write of /proc/irq/N/ files against a
* concurrent free of the interrupt descriptor. remove_proc_entry()
* immediately prevents new read/writes to happen and waits for
* already running read/write functions to complete.
*
* We remove the proc entries first and then delete the interrupt
* descriptor from the radix tree and free it. So it is guaranteed
* that irq_to_desc(N) is valid as long as the read/writes are
* permitted by procfs.
*
* The read from /proc/interrupts is a different problem because there
* is no protection. So the lookup and the access to irqdesc
* information must be protected by sparse_irq_lock.
*/
static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
static int show_irq_affinity(int type, struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
const struct cpumask *mask = desc->irq_data.affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (irqd_is_setaffinity_pending(&desc->irq_data))
mask = desc->pending_mask;
#endif
if (type)
seq_cpumask_list(m, mask);
else
seq_cpumask(m, mask);
seq_putc(m, '\n');
return 0;
}
static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
unsigned long flags;
cpumask_var_t mask;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
raw_spin_lock_irqsave(&desc->lock, flags);
if (desc->affinity_hint)
cpumask_copy(mask, desc->affinity_hint);
raw_spin_unlock_irqrestore(&desc->lock, flags);
seq_cpumask(m, mask);
seq_putc(m, '\n');
free_cpumask_var(mask);
return 0;
}
#ifndef is_affinity_mask_valid
#define is_affinity_mask_valid(val) 1
#endif
int no_irq_affinity;
static int irq_affinity_proc_show(struct seq_file *m, void *v)
{
return show_irq_affinity(0, m, v);
}
static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
{
return show_irq_affinity(1, m, v);
}
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
cpumask_var_t new_value;
int err;
if (!irq_can_set_affinity(irq) || no_irq_affinity)
return -EIO;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
if (type)
err = cpumask_parselist_user(buffer, count, new_value);
else
err = cpumask_parse_user(buffer, count, new_value);
if (err)
goto free_cpumask;
if (!is_affinity_mask_valid(new_value)) {
err = -EINVAL;
goto free_cpumask;
}
/*
* Do not allow disabling IRQs completely - it's too easy a
* way to make the system unusable accidentally :-) At least
* one online CPU still has to be targeted.
*/
if (!cpumask_intersects(new_value, cpu_online_mask)) {
/* Special case for empty set - allow the architecture
code to set default SMP affinity. */
err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
} else {
irq_set_affinity(irq, new_value);
err = count;
}
free_cpumask:
free_cpumask_var(new_value);
return err;
}
static ssize_t irq_affinity_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
return write_irq_affinity(0, file, buffer, count, pos);
}
static ssize_t irq_affinity_list_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
return write_irq_affinity(1, file, buffer, count, pos);
}
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
}
static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
}
static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_affinity_proc_fops = {
.open = irq_affinity_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
.write = irq_affinity_proc_write,
};
static const struct file_operations irq_affinity_hint_proc_fops = {
.open = irq_affinity_hint_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static const struct file_operations irq_affinity_list_proc_fops = {
.open = irq_affinity_list_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
.write = irq_affinity_list_proc_write,
};
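
The three file_operations above back /proc/irq/<N>/smp_affinity, affinity_hint and smp_affinity_list. A minimal userspace sketch of driving the writable smp_affinity entry follows; it is illustrative only (the IRQ number 30 and the mask are placeholders, and the write needs root), but it exercises the checks in write_irq_affinity(): a mask leaving no online CPU targeted is refused, and an IRQ whose affinity cannot be changed fails with EIO.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* IRQ number and mask value are placeholders; adjust for the local system */
        int fd = open("/proc/irq/30/smp_affinity", O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* "3" = CPU0 | CPU1; a mask with no online CPU is rejected (EINVAL,
         * unless the arch supplies a default) and EIO means the IRQ's
         * affinity cannot be changed at all */
        if (write(fd, "3", 1) < 0)
                perror("write");
        close(fd);
        return 0;
}
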
static int default_affinity_show(struct seq_file *m, void *v)
{
seq_cpumask(m, irq_default_affinity);
seq_putc(m, '\n');
return 0;
}
static ssize_t default_affinity_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
cpumask_var_t new_value;
int err;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
if (err)
goto out;
if (!is_affinity_mask_valid(new_value)) {
err = -EINVAL;
goto out;
}
/*
* Do not allow disabling IRQs completely - it is too easy a
* way to make the system unusable by accident :-) At least
* one online CPU still has to be targeted.
*/
if (!cpumask_intersects(new_value, cpu_online_mask)) {
err = -EINVAL;
goto out;
}
cpumask_copy(irq_default_affinity, new_value);
err = count;
out:
free_cpumask_var(new_value);
return err;
}
static int default_affinity_open(struct inode *inode, struct file *file)
{
return single_open(file, default_affinity_show, PDE_DATA(inode));
}
static const struct file_operations default_affinity_proc_fops = {
.open = default_affinity_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
.write = default_affinity_write,
};
static int irq_node_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long) m->private);
seq_printf(m, "%d\n", desc->irq_data.node);
return 0;
}
static int irq_node_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_node_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_node_proc_fops = {
.open = irq_node_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#endif
static int irq_spurious_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long) m->private);
seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n",
desc->irq_count, desc->irqs_unhandled,
jiffies_to_msecs(desc->last_unhandled));
return 0;
}
static int irq_spurious_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_spurious_proc_fops = {
.open = irq_spurious_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#define MAX_NAMELEN 128
static int name_unique(unsigned int irq, struct irqaction *new_action)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
unsigned long flags;
int ret = 1;
raw_spin_lock_irqsave(&desc->lock, flags);
for (action = desc->action ; action; action = action->next) {
if ((action != new_action) && action->name &&
!strcmp(new_action->name, action->name)) {
ret = 0;
break;
}
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
void register_handler_proc(unsigned int irq, struct irqaction *action)
{
char name [MAX_NAMELEN];
struct irq_desc *desc = irq_to_desc(irq);
if (!desc->dir || action->dir || !action->name ||
!name_unique(irq, action))
return;
memset(name, 0, MAX_NAMELEN);
snprintf(name, MAX_NAMELEN, "%s", action->name);
/* create /proc/irq/1234/handler/ */
action->dir = proc_mkdir(name, desc->dir);
}
#undef MAX_NAMELEN
#define MAX_NAMELEN 10
void register_irq_proc(unsigned int irq, struct irq_desc *desc)
{
char name [MAX_NAMELEN];
if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
return;
memset(name, 0, MAX_NAMELEN);
sprintf(name, "%d", irq);
/* create /proc/irq/1234 */
desc->dir = proc_mkdir(name, root_irq_dir);
if (!desc->dir)
return;
#ifdef CONFIG_SMP
/* create /proc/irq/<irq>/smp_affinity */
proc_create_data("smp_affinity", 0644, desc->dir,
&irq_affinity_proc_fops, (void *)(long)irq);
/* create /proc/irq/<irq>/affinity_hint */
proc_create_data("affinity_hint", 0444, desc->dir,
&irq_affinity_hint_proc_fops, (void *)(long)irq);
/* create /proc/irq/<irq>/smp_affinity_list */
proc_create_data("smp_affinity_list", 0644, desc->dir,
&irq_affinity_list_proc_fops, (void *)(long)irq);
proc_create_data("node", 0444, desc->dir,
&irq_node_proc_fops, (void *)(long)irq);
#endif
proc_create_data("spurious", 0444, desc->dir,
&irq_spurious_proc_fops, (void *)(long)irq);
}
void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
{
char name [MAX_NAMELEN];
if (!root_irq_dir || !desc->dir)
return;
#ifdef CONFIG_SMP
remove_proc_entry("smp_affinity", desc->dir);
remove_proc_entry("affinity_hint", desc->dir);
remove_proc_entry("smp_affinity_list", desc->dir);
remove_proc_entry("node", desc->dir);
#endif
remove_proc_entry("spurious", desc->dir);
memset(name, 0, MAX_NAMELEN);
sprintf(name, "%u", irq);
remove_proc_entry(name, root_irq_dir);
}
#undef MAX_NAMELEN
void unregister_handler_proc(unsigned int irq, struct irqaction *action)
{
proc_remove(action->dir);
}
static void register_default_affinity_proc(void)
{
#ifdef CONFIG_SMP
proc_create("irq/default_smp_affinity", 0644, NULL,
&default_affinity_proc_fops);
#endif
}
void init_irq_proc(void)
{
unsigned int irq;
struct irq_desc *desc;
/* create /proc/irq */
root_irq_dir = proc_mkdir("irq", NULL);
if (!root_irq_dir)
return;
register_default_affinity_proc();
/*
* Create entries for all existing IRQs.
*/
for_each_irq_desc(irq, desc) {
if (!desc)
continue;
register_irq_proc(irq, desc);
}
}
#ifdef CONFIG_GENERIC_IRQ_SHOW
int __weak arch_show_interrupts(struct seq_file *p, int prec)
{
return 0;
}
#ifndef ACTUAL_NR_IRQS
# define ACTUAL_NR_IRQS nr_irqs
#endif
int show_interrupts(struct seq_file *p, void *v)
{
static int prec;
unsigned long flags, any_count = 0;
int i = *(loff_t *) v, j;
struct irqaction *action;
struct irq_desc *desc;
if (i > ACTUAL_NR_IRQS)
return 0;
if (i == ACTUAL_NR_IRQS)
return arch_show_interrupts(p, prec);
/* print header and calculate the width of the first column */
if (i == 0) {
for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
j *= 10;
seq_printf(p, "%*s", prec + 8, "");
for_each_online_cpu(j)
seq_printf(p, "CPU%-8d", j);
seq_putc(p, '\n');
}
irq_lock_sparse();
desc = irq_to_desc(i);
if (!desc)
goto outsparse;
raw_spin_lock_irqsave(&desc->lock, flags);
for_each_online_cpu(j)
any_count |= kstat_irqs_cpu(i, j);
action = desc->action;
if (!action && !any_count)
goto out;
seq_printf(p, "%*d: ", prec, i);
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
if (desc->irq_data.chip) {
if (desc->irq_data.chip->irq_print_chip)
desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
else if (desc->irq_data.chip->name)
seq_printf(p, " %8s", desc->irq_data.chip->name);
else
seq_printf(p, " %8s", "-");
} else {
seq_printf(p, " %8s", "None");
}
if (desc->irq_data.domain)
seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
#endif
if (desc->name)
seq_printf(p, "-%-8s", desc->name);
if (action) {
seq_printf(p, " %s", action->name);
while ((action = action->next) != NULL)
seq_printf(p, ", %s", action->name);
}
seq_putc(p, '\n');
out:
raw_spin_unlock_irqrestore(&desc->lock, flags);
outsparse:
irq_unlock_sparse();
return 0;
}
#endif
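
show_interrupts() above is what backs /proc/interrupts: a header row of CPU columns, then one row per IRQ carrying a per-CPU count, the chip name, an optional Level/Edge marker and the action names. The userspace sketch below (illustrative only, minimal error handling) sums the per-CPU columns of each row, assuming the counts directly follow the colon.

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        char line[4096];
        FILE *f = fopen("/proc/interrupts", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (!fgets(line, sizeof(line), f)) {    /* skip the CPUn header row */
                fclose(f);
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                unsigned long long total = 0;
                char *colon = strchr(line, ':');
                char *p;

                if (!colon)
                        continue;
                *colon = '\0';
                /* after the colon: one count per online CPU, then chip/action text */
                for (p = colon + 1; *p == ' ' || isdigit((unsigned char)*p); ) {
                        if (isdigit((unsigned char)*p))
                                total += strtoull(p, &p, 10);
                        else
                                p++;
                }
                printf("%s: %llu\n", line, total);
        }
        fclose(f);
        return 0;
}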

91
kernel/irq/resend.c Normal file
View file

@ -0,0 +1,91 @@
/*
* linux/kernel/irq/resend.c
*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner
*
* This file contains the IRQ-resend code
*
* If the interrupt is waiting to be processed, we try to re-run it.
* We can't directly run it from here since the caller might be in an
* interrupt-protected region. Not all irq controller chips can
* retrigger interrupts at the hardware level, so in those cases
* we allow the resending of IRQs via a tasklet.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include "internals.h"
#ifdef CONFIG_HARDIRQS_SW_RESEND
/* Bitmap to handle software resend of interrupts: */
static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
/*
* Run software resends of IRQ's
*/
static void resend_irqs(unsigned long arg)
{
struct irq_desc *desc;
int irq;
while (!bitmap_empty(irqs_resend, nr_irqs)) {
irq = find_first_bit(irqs_resend, nr_irqs);
clear_bit(irq, irqs_resend);
desc = irq_to_desc(irq);
local_irq_disable();
desc->handle_irq(irq, desc);
local_irq_enable();
}
}
/* Tasklet to handle resend: */
static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
#endif
/*
* IRQ resend
*
* Is called with interrupts disabled and desc->lock held.
*/
void check_irq_resend(struct irq_desc *desc, unsigned int irq)
{
/*
* We do not resend level type interrupts. Level type
* interrupts are resent by hardware when they are still
* active. Clear the pending bit so suspend/resume does not
* get confused.
*/
if (irq_settings_is_level(desc)) {
desc->istate &= ~IRQS_PENDING;
return;
}
if (desc->istate & IRQS_REPLAY)
return;
if (desc->istate & IRQS_PENDING) {
desc->istate &= ~IRQS_PENDING;
desc->istate |= IRQS_REPLAY;
if (!desc->irq_data.chip->irq_retrigger ||
!desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
/*
* If the interrupt has a parent irq and runs
* in the thread context of the parent irq,
* retrigger the parent.
*/
if (desc->parent_irq &&
irq_settings_is_nested_thread(desc))
irq = desc->parent_irq;
/* Set it pending and activate the softirq: */
set_bit(irq, irqs_resend);
tasklet_schedule(&resend_tasklet);
#endif
}
}
}
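
check_irq_resend() above falls back to a software replay when the chip has no usable irq_retrigger(): the interrupt is marked in the irqs_resend bitmap and a tasklet later walks the bitmap and re-runs the handlers. The standalone sketch below models only that bitmap walk; the names are invented and __builtin_ctzl stands in for find_first_bit.

#include <stdio.h>

static unsigned long pending;   /* one bit per "line", like irqs_resend */

static void handle_line(int line)
{
        printf("replaying line %d\n", line);
}

static void mark_pending(int line)
{
        pending |= 1UL << line;
}

/* walk and clear the set bits, lowest first - the shape of resend_irqs() */
static void resend_all(void)
{
        while (pending) {
                int line = __builtin_ctzl(pending);    /* find_first_bit analogue */

                pending &= ~(1UL << line);
                handle_line(line);
        }
}

int main(void)
{
        mark_pending(3);
        mark_pending(17);
        resend_all();
        return 0;
}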

156
kernel/irq/settings.h Normal file
View file

@ -0,0 +1,156 @@
/*
* Internal header to deal with irq_desc->status which will be renamed
* to irq_desc->settings.
*/
enum {
_IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
_IRQ_PER_CPU = IRQ_PER_CPU,
_IRQ_LEVEL = IRQ_LEVEL,
_IRQ_NOPROBE = IRQ_NOPROBE,
_IRQ_NOREQUEST = IRQ_NOREQUEST,
_IRQ_NOTHREAD = IRQ_NOTHREAD,
_IRQ_NOAUTOEN = IRQ_NOAUTOEN,
_IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
_IRQ_NO_BALANCING = IRQ_NO_BALANCING,
_IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
_IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
_IRQ_IS_POLLED = IRQ_IS_POLLED,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
#define IRQ_PER_CPU GOT_YOU_MORON
#define IRQ_NO_BALANCING GOT_YOU_MORON
#define IRQ_LEVEL GOT_YOU_MORON
#define IRQ_NOPROBE GOT_YOU_MORON
#define IRQ_NOREQUEST GOT_YOU_MORON
#define IRQ_NOTHREAD GOT_YOU_MORON
#define IRQ_NOAUTOEN GOT_YOU_MORON
#define IRQ_NESTED_THREAD GOT_YOU_MORON
#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
#define IRQ_IS_POLLED GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
static inline void
irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
{
desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK);
desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
}
static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_PER_CPU;
}
static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_PER_CPU_DEVID;
}
static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
{
desc->status_use_accessors |= _IRQ_PER_CPU;
}
static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
{
desc->status_use_accessors |= _IRQ_NO_BALANCING;
}
static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_NO_BALANCING;
}
static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
{
return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;
}
static inline void
irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
{
desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK;
desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK;
}
static inline bool irq_settings_is_level(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_LEVEL;
}
static inline void irq_settings_clr_level(struct irq_desc *desc)
{
desc->status_use_accessors &= ~_IRQ_LEVEL;
}
static inline void irq_settings_set_level(struct irq_desc *desc)
{
desc->status_use_accessors |= _IRQ_LEVEL;
}
static inline bool irq_settings_can_request(struct irq_desc *desc)
{
return !(desc->status_use_accessors & _IRQ_NOREQUEST);
}
static inline void irq_settings_clr_norequest(struct irq_desc *desc)
{
desc->status_use_accessors &= ~_IRQ_NOREQUEST;
}
static inline void irq_settings_set_norequest(struct irq_desc *desc)
{
desc->status_use_accessors |= _IRQ_NOREQUEST;
}
static inline bool irq_settings_can_thread(struct irq_desc *desc)
{
return !(desc->status_use_accessors & _IRQ_NOTHREAD);
}
static inline void irq_settings_clr_nothread(struct irq_desc *desc)
{
desc->status_use_accessors &= ~_IRQ_NOTHREAD;
}
static inline void irq_settings_set_nothread(struct irq_desc *desc)
{
desc->status_use_accessors |= _IRQ_NOTHREAD;
}
static inline bool irq_settings_can_probe(struct irq_desc *desc)
{
return !(desc->status_use_accessors & _IRQ_NOPROBE);
}
static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
{
desc->status_use_accessors &= ~_IRQ_NOPROBE;
}
static inline void irq_settings_set_noprobe(struct irq_desc *desc)
{
desc->status_use_accessors |= _IRQ_NOPROBE;
}
static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
}
static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
{
return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
}
static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_NESTED_THREAD;
}
static inline bool irq_settings_is_polled(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_IS_POLLED;
}
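
The pattern used throughout this header is worth spelling out: the public flag values are mirrored into a private _IRQ_* namespace, every access goes through a small inline helper operating on status_use_accessors, and the public names are then poisoned so any direct use breaks the build. A toy standalone rendering of the same idea (all names invented) looks like this:

#include <stdbool.h>
#include <stdio.h>

/* private mirror of the "public" flag values, as the _IRQ_* enum does above */
enum {
        _DEMO_LEVEL   = 1 << 0,
        _DEMO_NOPROBE = 1 << 1,
};

struct demo_desc {
        unsigned int status_use_accessors;
};

/* poison the public names: direct use fails to compile, like GOT_YOU_MORON */
#define DEMO_LEVEL   DO_NOT_USE_RAW_FLAGS
#define DEMO_NOPROBE DO_NOT_USE_RAW_FLAGS

static inline void demo_set_level(struct demo_desc *d)
{
        d->status_use_accessors |= _DEMO_LEVEL;
}

static inline bool demo_is_level(const struct demo_desc *d)
{
        return d->status_use_accessors & _DEMO_LEVEL;
}

int main(void)
{
        struct demo_desc d = { 0 };

        demo_set_level(&d);
        printf("level? %d\n", demo_is_level(&d));
        return 0;
}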

467
kernel/irq/spurious.c Normal file
View file

@ -0,0 +1,467 @@
/*
* linux/kernel/irq/spurious.c
*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains spurious interrupt handling.
*/
#include <linux/jiffies.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/moduleparam.h>
#include <linux/timer.h>
#include "internals.h"
static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
static void poll_spurious_irqs(unsigned long dummy);
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
static int irq_poll_cpu;
static atomic_t irq_poll_active;
/*
* We wait here for a poller to finish.
*
* If the poll runs on this CPU, then we yell loudly and return
* false. That will leave the interrupt line disabled in the worst
* case, but it should never happen.
*
* We wait until the poller is done and then recheck disabled and
* action (about to be disabled). Only if it's still active, we return
* true and let the handler run.
*/
bool irq_wait_for_poll(struct irq_desc *desc)
{
if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
"irq poll in progress on cpu %d for irq %d\n",
smp_processor_id(), desc->irq_data.irq))
return false;
#ifdef CONFIG_SMP
do {
raw_spin_unlock(&desc->lock);
while (irqd_irq_inprogress(&desc->irq_data))
cpu_relax();
raw_spin_lock(&desc->lock);
} while (irqd_irq_inprogress(&desc->irq_data));
/* Might have been disabled in meantime */
return !irqd_irq_disabled(&desc->irq_data) && desc->action;
#else
return false;
#endif
}
/*
* Recovery handler for misrouted interrupts.
*/
static int try_one_irq(int irq, struct irq_desc *desc, bool force)
{
irqreturn_t ret = IRQ_NONE;
struct irqaction *action;
raw_spin_lock(&desc->lock);
/*
* PER_CPU, nested thread interrupts and interrupts explicitly
* marked polled are excluded from polling.
*/
if (irq_settings_is_per_cpu(desc) ||
irq_settings_is_nested_thread(desc) ||
irq_settings_is_polled(desc))
goto out;
/*
* Do not poll disabled interrupts unless the spurious
* disabled poller asks explicitly.
*/
if (irqd_irq_disabled(&desc->irq_data) && !force)
goto out;
/*
* All handlers must agree on IRQF_SHARED, so we test just the
* first.
*/
action = desc->action;
if (!action || !(action->flags & IRQF_SHARED) ||
(action->flags & __IRQF_TIMER))
goto out;
/* Already running on another processor */
if (irqd_irq_inprogress(&desc->irq_data)) {
/*
* Already running: If it is shared get the other
* CPU to go looking for our mystery interrupt too
*/
desc->istate |= IRQS_PENDING;
goto out;
}
/* Mark it poll in progress */
desc->istate |= IRQS_POLL_INPROGRESS;
do {
if (handle_irq_event(desc) == IRQ_HANDLED)
ret = IRQ_HANDLED;
/* Make sure that there is still a valid action */
action = desc->action;
} while ((desc->istate & IRQS_PENDING) && action);
desc->istate &= ~IRQS_POLL_INPROGRESS;
out:
raw_spin_unlock(&desc->lock);
return ret == IRQ_HANDLED;
}
static int misrouted_irq(int irq)
{
struct irq_desc *desc;
int i, ok = 0;
if (atomic_inc_return(&irq_poll_active) != 1)
goto out;
irq_poll_cpu = smp_processor_id();
for_each_irq_desc(i, desc) {
if (!i)
continue;
if (i == irq) /* Already tried */
continue;
if (try_one_irq(i, desc, false))
ok = 1;
}
out:
atomic_dec(&irq_poll_active);
/* So the caller can adjust the irq error counts */
return ok;
}
static void poll_spurious_irqs(unsigned long dummy)
{
struct irq_desc *desc;
int i;
if (atomic_inc_return(&irq_poll_active) != 1)
goto out;
irq_poll_cpu = smp_processor_id();
for_each_irq_desc(i, desc) {
unsigned int state;
if (!i)
continue;
/* Racy but it doesn't matter */
state = desc->istate;
barrier();
if (!(state & IRQS_SPURIOUS_DISABLED))
continue;
local_irq_disable();
try_one_irq(i, desc, true);
local_irq_enable();
}
out:
atomic_dec(&irq_poll_active);
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
static inline int bad_action_ret(irqreturn_t action_ret)
{
if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
return 0;
return 1;
}
/*
* If 99,900 of the previous 100,000 interrupts have not been handled
* then assume that the IRQ is stuck in some manner. Drop a diagnostic
* and try to turn the IRQ off.
*
* (The other 100-of-100,000 interrupts may have come from a correctly
* functioning device sharing an IRQ with the failing one.)
*/
static void
__report_bad_irq(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
struct irqaction *action;
unsigned long flags;
if (bad_action_ret(action_ret)) {
printk(KERN_ERR "irq event %d: bogus return value %x\n",
irq, action_ret);
} else {
printk(KERN_ERR "irq %d: nobody cared (try booting with "
"the \"irqpoll\" option)\n", irq);
}
dump_stack();
printk(KERN_ERR "handlers:\n");
/*
* We need to take desc->lock here. note_interrupt() is called
* w/o desc->lock held, but IRQ_PROGRESS set. We might race
* with something else removing an action. It's ok to take
* desc->lock here. See synchronize_irq().
*/
raw_spin_lock_irqsave(&desc->lock, flags);
action = desc->action;
while (action) {
printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
if (action->thread_fn)
printk(KERN_CONT " threaded [<%p>] %pf",
action->thread_fn, action->thread_fn);
printk(KERN_CONT "\n");
action = action->next;
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static void
report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
{
static int count = 100;
if (count > 0) {
count--;
__report_bad_irq(irq, desc, action_ret);
}
}
static inline int
try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
struct irqaction *action;
if (!irqfixup)
return 0;
/* We didn't actually handle the IRQ - see if it was misrouted? */
if (action_ret == IRQ_NONE)
return 1;
/*
* But for 'irqfixup == 2' we also do it for handled interrupts if
* they are marked as IRQF_IRQPOLL (or for irq zero, which is the
* traditional PC timer interrupt - a legacy case).
*/
if (irqfixup < 2)
return 0;
if (!irq)
return 1;
/*
* Since we don't get the descriptor lock, "action" can
* change under us. We don't really care, but we don't
* want to follow a NULL pointer. So tell the compiler to
* just load it once by using a barrier.
*/
action = desc->action;
barrier();
return action && (action->flags & IRQF_IRQPOLL);
}
#define SPURIOUS_DEFERRED 0x80000000
void note_interrupt(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
if (desc->istate & IRQS_POLL_INPROGRESS ||
irq_settings_is_polled(desc))
return;
if (bad_action_ret(action_ret)) {
report_bad_irq(irq, desc, action_ret);
return;
}
/*
* We cannot call note_interrupt from the threaded handler
* because we need to look at the compound of all handlers
* (primary and threaded). Aside of that in the threaded
* shared case we have no serialization against an incoming
* hardware interrupt while we are dealing with a threaded
* result.
*
* So in case a thread is woken, we just note the fact and
* defer the analysis to the next hardware interrupt.
*
* The threaded handlers store whether they successfully
* handled an interrupt and we check whether that number
* changed versus the last invocation.
*
* We could handle all interrupts with the delayed by one
* mechanism, but for the non forced threaded case we'd just
* add pointless overhead to the straight hardirq interrupts
* for the sake of a few lines less code.
*/
if (action_ret & IRQ_WAKE_THREAD) {
/*
* There is a thread woken. Check whether one of the
* shared primary handlers returned IRQ_HANDLED. If
* not we defer the spurious detection to the next
* interrupt.
*/
if (action_ret == IRQ_WAKE_THREAD) {
int handled;
/*
* We use bit 31 of thread_handled_last to
* denote the deferred spurious detection
* active. No locking necessary as
* thread_handled_last is only accessed here
* and we have the guarantee that hard
* interrupts are not reentrant.
*/
if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
desc->threads_handled_last |= SPURIOUS_DEFERRED;
return;
}
/*
* Check whether one of the threaded handlers
* returned IRQ_HANDLED since the last
* interrupt happened.
*
* For simplicity we just set bit 31, as it is
* set in threads_handled_last as well. So we
* avoid extra masking. And we really do not
* care about the high bits of the handled
* count. We just care about the count being
* different than the one we saw before.
*/
handled = atomic_read(&desc->threads_handled);
handled |= SPURIOUS_DEFERRED;
if (handled != desc->threads_handled_last) {
action_ret = IRQ_HANDLED;
/*
* Note: We keep the SPURIOUS_DEFERRED
* bit set. We are handling the
* previous invocation right now.
* Keep it for the current one, so the
* next hardware interrupt will
* account for it.
*/
desc->threads_handled_last = handled;
} else {
/*
* None of the threaded handlers felt
* responsible for the last interrupt
*
* We keep the SPURIOUS_DEFERRED bit
* set in threads_handled_last as we
* need to account for the current
* interrupt as well.
*/
action_ret = IRQ_NONE;
}
} else {
/*
* One of the primary handlers returned
* IRQ_HANDLED. So we don't care about the
* threaded handlers on the same line. Clear
* the deferred detection bit.
*
* In theory we could/should check whether the
* deferred bit is set and take the result of
* the previous run into account here as
* well. But it's really not worth the
* trouble. If every other interrupt is
* handled we never trigger the spurious
* detector. And if this is just the one out
* of 100k unhandled ones which is handled
* then we merely delay the spurious detection
* by one hard interrupt. Not a real problem.
*/
desc->threads_handled_last &= ~SPURIOUS_DEFERRED;
}
}
if (unlikely(action_ret == IRQ_NONE)) {
/*
* If we are seeing only the odd spurious IRQ caused by
* bus asynchronicity then don't eventually trigger an error,
* otherwise the counter becomes a doomsday timer for otherwise
* working systems
*/
if (time_after(jiffies, desc->last_unhandled + HZ/10))
desc->irqs_unhandled = 1;
else
desc->irqs_unhandled++;
desc->last_unhandled = jiffies;
}
if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
int ok = misrouted_irq(irq);
if (action_ret == IRQ_NONE)
desc->irqs_unhandled -= ok;
}
desc->irq_count++;
if (likely(desc->irq_count < 100000))
return;
desc->irq_count = 0;
if (unlikely(desc->irqs_unhandled > 99900)) {
/*
* The interrupt is stuck
*/
__report_bad_irq(irq, desc, action_ret);
/*
* Now kill the IRQ
*/
printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
desc->istate |= IRQS_SPURIOUS_DISABLED;
desc->depth++;
irq_disable(desc);
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
desc->irqs_unhandled = 0;
}
bool noirqdebug __read_mostly;
int noirqdebug_setup(char *str)
{
noirqdebug = 1;
printk(KERN_INFO "IRQ lockup detection disabled\n");
return 1;
}
__setup("noirqdebug", noirqdebug_setup);
module_param(noirqdebug, bool, 0644);
MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
return 1;
}
__setup("irqfixup", irqfixup_setup);
module_param(irqfixup, int, 0644);
static int __init irqpoll_setup(char *str)
{
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");
printk(KERN_WARNING "This may significantly impact system "
"performance\n");
return 1;
}
__setup("irqpoll", irqpoll_setup);
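
The core of the detector in note_interrupt() is a windowed threshold: interrupts are counted in batches of 100,000 and the line is shut down when more than 99,900 of a batch went unhandled. The standalone sketch below reproduces only that arithmetic (the HZ/10 time-based reset of irqs_unhandled and the deferred threaded-handler accounting are deliberately left out); all names are invented.

#include <stdbool.h>
#include <stdio.h>

struct line_stats {
        unsigned int count;
        unsigned int unhandled;
        bool disabled;
};

/* feed one interrupt result into the detector - mirrors the tail of note_interrupt() */
static void account(struct line_stats *s, bool handled)
{
        if (!handled)
                s->unhandled++;
        if (++s->count < 100000)
                return;
        s->count = 0;
        if (s->unhandled > 99900) {
                printf("storm detected: disabling line\n");
                s->disabled = true;
        }
        s->unhandled = 0;
}

int main(void)
{
        struct line_stats s = { 0 };
        unsigned int i;

        /* handled only once in 2000 -> 99,950 unhandled in the first window */
        for (i = 0; i < 100000 && !s.disabled; i++)
                account(&s, (i % 2000) == 0);
        printf("disabled=%d after %u events\n", s.disabled, i);
        return 0;
}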

196
kernel/irq_work.c Normal file
View file

@ -0,0 +1,196 @@
/*
* Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*
* Provides a framework for enqueueing and running callbacks from hardirq
* context. The enqueueing is NMI-safe.
*/
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/tick.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <asm/processor.h>
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
/*
* Claim the entry so that no one else will poke at it.
*/
static bool irq_work_claim(struct irq_work *work)
{
unsigned long flags, oflags, nflags;
/*
* Start with our best guess as a premise, but only trust a
* flags value once the cmpxchg() result confirms it.
*/
flags = work->flags & ~IRQ_WORK_PENDING;
for (;;) {
nflags = flags | IRQ_WORK_FLAGS;
oflags = cmpxchg(&work->flags, flags, nflags);
if (oflags == flags)
break;
if (oflags & IRQ_WORK_PENDING)
return false;
flags = oflags;
cpu_relax();
}
return true;
}
void __weak arch_irq_work_raise(void)
{
/*
* Lame architectures will get the timer tick callback
*/
}
#ifdef CONFIG_SMP
/*
* Enqueue the irq_work @work on @cpu unless it's already pending
* somewhere.
*
* Can be re-enqueued while the callback is still in progress.
*/
bool irq_work_queue_on(struct irq_work *work, int cpu)
{
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(cpu));
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
/* Only queue if not already pending */
if (!irq_work_claim(work))
return false;
if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
arch_send_call_function_single_ipi(cpu);
return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue_on);
#endif
/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
{
/* Only queue if not already pending */
if (!irq_work_claim(work))
return false;
/* Queue the entry and raise the IPI if needed. */
preempt_disable();
/* If the work is "lazy", handle it from next tick if any */
if (work->flags & IRQ_WORK_LAZY) {
if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
tick_nohz_tick_stopped())
arch_irq_work_raise();
} else {
if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
arch_irq_work_raise();
}
preempt_enable();
return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);
bool irq_work_needs_cpu(void)
{
struct llist_head *raised, *lazy;
raised = this_cpu_ptr(&raised_list);
lazy = this_cpu_ptr(&lazy_list);
if (llist_empty(raised) || arch_irq_work_has_interrupt())
if (llist_empty(lazy))
return false;
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
return true;
}
static void irq_work_run_list(struct llist_head *list)
{
unsigned long flags;
struct irq_work *work;
struct llist_node *llnode;
BUG_ON(!irqs_disabled());
if (llist_empty(list))
return;
llnode = llist_del_all(list);
while (llnode != NULL) {
work = llist_entry(llnode, struct irq_work, llnode);
llnode = llist_next(llnode);
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
* Make it immediately visible so that other CPUs trying
* to claim that work don't rely on us to handle their data
* while we are in the middle of the func.
*/
flags = work->flags & ~IRQ_WORK_PENDING;
xchg(&work->flags, flags);
work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
(void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
}
}
/*
* hotplug calls this through:
* hotplug_cfd() -> flush_smp_call_function_queue()
*/
void irq_work_run(void)
{
irq_work_run_list(this_cpu_ptr(&raised_list));
irq_work_run_list(this_cpu_ptr(&lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);
void irq_work_tick(void)
{
struct llist_head *raised = &__get_cpu_var(raised_list);
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
irq_work_run_list(raised);
irq_work_run_list(&__get_cpu_var(lazy_list));
}
/*
* Synchronize against the irq_work @entry, ensures the entry is not
* currently in use.
*/
void irq_work_sync(struct irq_work *work)
{
WARN_ON_ONCE(irqs_disabled());
while (work->flags & IRQ_WORK_BUSY)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
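
The claim step above is what makes irq_work safe to queue from NMI context: a cmpxchg() loop sets PENDING|BUSY, and only one caller can win while PENDING is clear. A userspace rendering of that loop with C11 atomics (flag values and names invented) is sketched below.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define WORK_PENDING    0x1u
#define WORK_BUSY       0x2u
#define WORK_CLAIMED    (WORK_PENDING | WORK_BUSY)      /* IRQ_WORK_FLAGS analogue */

struct demo_work {
        atomic_uint flags;
};

/* succeed only if PENDING was not already set - the shape of irq_work_claim() */
static bool demo_claim(struct demo_work *w)
{
        unsigned int flags = atomic_load(&w->flags) & ~WORK_PENDING;

        for (;;) {
                unsigned int nflags = flags | WORK_CLAIMED;

                if (atomic_compare_exchange_weak(&w->flags, &flags, nflags))
                        return true;
                /* 'flags' now holds the value we actually observed */
                if (flags & WORK_PENDING)
                        return false;   /* someone else already queued it */
        }
}

int main(void)
{
        struct demo_work w;

        atomic_init(&w.flags, 0);
        printf("first claim : %d\n", demo_claim(&w));   /* 1 */
        printf("second claim: %d\n", demo_claim(&w));   /* 0 - still pending */
        return 0;
}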

460
kernel/jump_label.c Normal file
View file

@ -0,0 +1,460 @@
/*
* jump label support
*
* Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
* Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
*
*/
#include <linux/memory.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/err.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
#ifdef HAVE_JUMP_LABEL
/* mutex to protect coming/going of the jump_label table */
static DEFINE_MUTEX(jump_label_mutex);
void jump_label_lock(void)
{
mutex_lock(&jump_label_mutex);
}
void jump_label_unlock(void)
{
mutex_unlock(&jump_label_mutex);
}
static int jump_label_cmp(const void *a, const void *b)
{
const struct jump_entry *jea = a;
const struct jump_entry *jeb = b;
if (jea->key < jeb->key)
return -1;
if (jea->key > jeb->key)
return 1;
return 0;
}
static void
jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
{
unsigned long size;
size = (((unsigned long)stop - (unsigned long)start)
/ sizeof(struct jump_entry));
sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
}
static void jump_label_update(struct static_key *key, int enable);
void static_key_slow_inc(struct static_key *key)
{
STATIC_KEY_CHECK_USE();
if (atomic_inc_not_zero(&key->enabled))
return;
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
if (!jump_label_get_branch_default(key))
jump_label_update(key, JUMP_LABEL_ENABLE);
else
jump_label_update(key, JUMP_LABEL_DISABLE);
}
atomic_inc(&key->enabled);
jump_label_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
static void __static_key_slow_dec(struct static_key *key,
unsigned long rate_limit, struct delayed_work *work)
{
if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
WARN(atomic_read(&key->enabled) < 0,
"jump label: negative count!\n");
return;
}
if (rate_limit) {
atomic_inc(&key->enabled);
schedule_delayed_work(work, rate_limit);
} else {
if (!jump_label_get_branch_default(key))
jump_label_update(key, JUMP_LABEL_DISABLE);
else
jump_label_update(key, JUMP_LABEL_ENABLE);
}
jump_label_unlock();
}
static void jump_label_update_timeout(struct work_struct *work)
{
struct static_key_deferred *key =
container_of(work, struct static_key_deferred, work.work);
__static_key_slow_dec(&key->key, 0, NULL);
}
void static_key_slow_dec(struct static_key *key)
{
STATIC_KEY_CHECK_USE();
__static_key_slow_dec(key, 0, NULL);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
STATIC_KEY_CHECK_USE();
__static_key_slow_dec(&key->key, key->timeout, &key->work);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
STATIC_KEY_CHECK_USE();
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
EXPORT_SYMBOL_GPL(jump_label_rate_limit);
static int addr_conflict(struct jump_entry *entry, void *start, void *end)
{
if (entry->code <= (unsigned long)end &&
entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
return 1;
return 0;
}
static int __jump_label_text_reserved(struct jump_entry *iter_start,
struct jump_entry *iter_stop, void *start, void *end)
{
struct jump_entry *iter;
iter = iter_start;
while (iter < iter_stop) {
if (addr_conflict(iter, start, end))
return 1;
iter++;
}
return 0;
}
/*
* Update code which is definitely not currently executing.
* Architectures which need heavyweight synchronization to modify
* running code can override this to make the non-live update case
* cheaper.
*/
void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
arch_jump_label_transform(entry, type);
}
static void __jump_label_update(struct static_key *key,
struct jump_entry *entry,
struct jump_entry *stop, int enable)
{
for (; (entry < stop) &&
(entry->key == (jump_label_t)(unsigned long)key);
entry++) {
/*
* entry->code set to 0 invalidates module init text sections.
* kernel_text_address() verifies we are not in core kernel
* init code; see jump_label_invalidate_module_init().
*/
if (entry->code && kernel_text_address(entry->code))
arch_jump_label_transform(entry, enable);
}
}
static enum jump_label_type jump_label_type(struct static_key *key)
{
bool true_branch = jump_label_get_branch_default(key);
bool state = static_key_enabled(key);
if ((!true_branch && state) || (true_branch && !state))
return JUMP_LABEL_ENABLE;
return JUMP_LABEL_DISABLE;
}
void __init jump_label_init(void)
{
struct jump_entry *iter_start = __start___jump_table;
struct jump_entry *iter_stop = __stop___jump_table;
struct static_key *key = NULL;
struct jump_entry *iter;
jump_label_lock();
jump_label_sort_entries(iter_start, iter_stop);
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
iterk = (struct static_key *)(unsigned long)iter->key;
arch_jump_label_transform_static(iter, jump_label_type(iterk));
if (iterk == key)
continue;
key = iterk;
/*
* Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
*/
*((unsigned long *)&key->entries) += (unsigned long)iter;
#ifdef CONFIG_MODULES
key->next = NULL;
#endif
}
static_key_initialized = true;
jump_label_unlock();
}
#ifdef CONFIG_MODULES
struct static_key_mod {
struct static_key_mod *next;
struct jump_entry *entries;
struct module *mod;
};
static int __jump_label_mod_text_reserved(void *start, void *end)
{
struct module *mod;
mod = __module_text_address((unsigned long)start);
if (!mod)
return 0;
WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
return __jump_label_text_reserved(mod->jump_entries,
mod->jump_entries + mod->num_jump_entries,
start, end);
}
static void __jump_label_mod_update(struct static_key *key, int enable)
{
struct static_key_mod *mod = key->next;
while (mod) {
struct module *m = mod->mod;
__jump_label_update(key, mod->entries,
m->jump_entries + m->num_jump_entries,
enable);
mod = mod->next;
}
}
/***
* jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop()
* @mod: module to patch
*
* Allow for run-time selection of the optimal nops. Before the module
* loads patch these with arch_get_jump_label_nop(), which is specified by
* the arch specific jump label code.
*/
void jump_label_apply_nops(struct module *mod)
{
struct jump_entry *iter_start = mod->jump_entries;
struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
struct jump_entry *iter;
/* if the module doesn't have jump label entries, just return */
if (iter_start == iter_stop)
return;
for (iter = iter_start; iter < iter_stop; iter++) {
arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
}
}
static int jump_label_add_module(struct module *mod)
{
struct jump_entry *iter_start = mod->jump_entries;
struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
struct jump_entry *iter;
struct static_key *key = NULL;
struct static_key_mod *jlm;
/* if the module doesn't have jump label entries, just return */
if (iter_start == iter_stop)
return 0;
jump_label_sort_entries(iter_start, iter_stop);
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
iterk = (struct static_key *)(unsigned long)iter->key;
if (iterk == key)
continue;
key = iterk;
if (__module_address(iter->key) == mod) {
/*
* Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
*/
*((unsigned long *)&key->entries) += (unsigned long)iter;
key->next = NULL;
continue;
}
jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
if (!jlm)
return -ENOMEM;
jlm->mod = mod;
jlm->entries = iter;
jlm->next = key->next;
key->next = jlm;
if (jump_label_type(key) == JUMP_LABEL_ENABLE)
__jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
}
return 0;
}
static void jump_label_del_module(struct module *mod)
{
struct jump_entry *iter_start = mod->jump_entries;
struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
struct jump_entry *iter;
struct static_key *key = NULL;
struct static_key_mod *jlm, **prev;
for (iter = iter_start; iter < iter_stop; iter++) {
if (iter->key == (jump_label_t)(unsigned long)key)
continue;
key = (struct static_key *)(unsigned long)iter->key;
if (__module_address(iter->key) == mod)
continue;
prev = &key->next;
jlm = key->next;
while (jlm && jlm->mod != mod) {
prev = &jlm->next;
jlm = jlm->next;
}
if (jlm) {
*prev = jlm->next;
kfree(jlm);
}
}
}
static void jump_label_invalidate_module_init(struct module *mod)
{
struct jump_entry *iter_start = mod->jump_entries;
struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
struct jump_entry *iter;
for (iter = iter_start; iter < iter_stop; iter++) {
if (within_module_init(iter->code, mod))
iter->code = 0;
}
}
static int
jump_label_module_notify(struct notifier_block *self, unsigned long val,
void *data)
{
struct module *mod = data;
int ret = 0;
switch (val) {
case MODULE_STATE_COMING:
jump_label_lock();
ret = jump_label_add_module(mod);
if (ret)
jump_label_del_module(mod);
jump_label_unlock();
break;
case MODULE_STATE_GOING:
jump_label_lock();
jump_label_del_module(mod);
jump_label_unlock();
break;
case MODULE_STATE_LIVE:
jump_label_lock();
jump_label_invalidate_module_init(mod);
jump_label_unlock();
break;
}
return notifier_from_errno(ret);
}
struct notifier_block jump_label_module_nb = {
.notifier_call = jump_label_module_notify,
.priority = 1, /* higher than tracepoints */
};
static __init int jump_label_init_module(void)
{
return register_module_notifier(&jump_label_module_nb);
}
early_initcall(jump_label_init_module);
#endif /* CONFIG_MODULES */
/***
* jump_label_text_reserved - check if addr range is reserved
* @start: start text addr
* @end: end text addr
*
* checks if the text addr located between @start and @end
* overlaps with any of the jump label patch addresses. Code
* that wants to modify kernel text should first verify that
* it does not overlap with any of the jump label addresses.
* Caller must hold jump_label_mutex.
*
* returns 1 if there is an overlap, 0 otherwise
*/
int jump_label_text_reserved(void *start, void *end)
{
int ret = __jump_label_text_reserved(__start___jump_table,
__stop___jump_table, start, end);
if (ret)
return ret;
#ifdef CONFIG_MODULES
ret = __jump_label_mod_text_reserved(start, end);
#endif
return ret;
}
static void jump_label_update(struct static_key *key, int enable)
{
struct jump_entry *stop = __stop___jump_table;
struct jump_entry *entry = jump_label_get_entries(key);
#ifdef CONFIG_MODULES
struct module *mod = __module_address((unsigned long)key);
__jump_label_mod_update(key, enable);
if (mod)
stop = mod->jump_entries + mod->num_jump_entries;
#endif
/* if there are no users, entry can be NULL */
if (entry)
__jump_label_update(key, entry, stop, enable);
}
#endif
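
For context, a typical consumer of this machinery looks roughly like the module-style sketch below (the key name, messages and module boilerplate are invented, not part of this file): the branch under static_key_false() compiles to a NOP and is only patched into a jump once static_key_slow_inc() flips the key.

#include <linux/kernel.h>
#include <linux/jump_label.h>
#include <linux/module.h>

static struct static_key demo_key = STATIC_KEY_INIT_FALSE;

/* hot path: straight-line code until the key is enabled */
static void demo_hot_path(void)
{
        if (static_key_false(&demo_key))
                pr_info("rarely-enabled slow path\n");
        /* common fast path continues here */
}

static int __init demo_init(void)
{
        demo_hot_path();                /* branch not taken */
        static_key_slow_inc(&demo_key); /* patch the branch in */
        demo_hot_path();                /* branch taken */
        static_key_slow_dec(&demo_key);
        return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");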

608
kernel/kallsyms.c Normal file
View file

@ -0,0 +1,608 @@
/*
* kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
*
* Rewritten and vastly simplified by Rusty Russell for in-kernel
* module loader:
* Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
*
* ChangeLog:
*
* (25/Aug/2004) Paulo Marques <pmarques@grupopie.com>
* Changed the compression method from stem compression to "table lookup"
* compression (see scripts/kallsyms.c for a more complete description)
*/
#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/kdb.h>
#include <linux/err.h>
#include <linux/proc_fs.h>
#include <linux/sched.h> /* for cond_resched */
#include <linux/mm.h>
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/compiler.h>
#include <asm/sections.h>
#ifdef CONFIG_KALLSYMS_ALL
#define all_var 1
#else
#define all_var 0
#endif
/*
* These will be re-linked against their real values
* during the second link stage.
*/
extern const unsigned long kallsyms_addresses[] __weak;
extern const u8 kallsyms_names[] __weak;
/*
* Tell the compiler that the count isn't in the small data section if the arch
* has one (eg: FRV).
*/
extern const unsigned long kallsyms_num_syms
__attribute__((weak, section(".rodata")));
extern const u8 kallsyms_token_table[] __weak;
extern const u16 kallsyms_token_index[] __weak;
extern const unsigned long kallsyms_markers[] __weak;
static inline int is_kernel_inittext(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext
&& addr <= (unsigned long)_einittext)
return 1;
return 0;
}
static inline int is_kernel_text(unsigned long addr)
{
if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
arch_is_kernel_text(addr))
return 1;
return in_gate_area_no_mm(addr);
}
static inline int is_kernel(unsigned long addr)
{
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
return 1;
return in_gate_area_no_mm(addr);
}
static int is_ksym_addr(unsigned long addr)
{
if (all_var)
return is_kernel(addr);
return is_kernel_text(addr) || is_kernel_inittext(addr);
}
/*
* Expand compressed symbol data into the resulting uncompressed string,
* given the offset to where the symbol is in the compressed stream.
* If the uncompressed string is too long (>= maxlen), it will be truncated.
*/
static unsigned int kallsyms_expand_symbol(unsigned int off,
char *result, size_t maxlen)
{
int len, skipped_first = 0;
const u8 *tptr, *data;
/* Get the compressed symbol length from the first symbol byte. */
data = &kallsyms_names[off];
len = *data;
data++;
/*
* Update the offset to return the offset for the next symbol on
* the compressed stream.
*/
off += len + 1;
/*
* For every byte on the compressed symbol data, copy the table
* entry for that byte.
*/
while (len) {
tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
data++;
len--;
while (*tptr) {
if (skipped_first) {
if (maxlen <= 1)
goto tail;
*result = *tptr;
result++;
maxlen--;
} else
skipped_first = 1;
tptr++;
}
}
tail:
if (maxlen)
*result = '\0';
/* Return to offset to the next symbol. */
return off;
}
/*
* Get symbol type information. This is encoded as a single char at the
* beginning of the symbol name.
*/
static char kallsyms_get_symbol_type(unsigned int off)
{
/*
* Get just the first code, look it up in the token table,
* and return the first char from this token.
*/
return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
}
/*
* Find the offset on the compressed stream given an index in the
* kallsyms array.
*/
static unsigned int get_symbol_offset(unsigned long pos)
{
const u8 *name;
int i;
/*
* Use the closest marker we have. We have markers every 256 positions,
* so that should be close enough.
*/
name = &kallsyms_names[kallsyms_markers[pos >> 8]];
/*
* Sequentially scan all the symbols up to the point we're searching
* for. Every symbol is stored in a [<len>][<len> bytes of data] format,
* so we just need to add the len to the current pointer for every
* symbol we wish to skip.
*/
for (i = 0; i < (pos & 0xFF); i++)
name = name + (*name) + 1;
return name - kallsyms_names;
}
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
char namebuf[KSYM_NAME_LEN];
unsigned long i;
unsigned int off;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
if (strcmp(namebuf, name) == 0)
return kallsyms_addresses[i];
}
return module_kallsyms_lookup_name(name);
}
EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
unsigned long),
void *data)
{
char namebuf[KSYM_NAME_LEN];
unsigned long i;
unsigned int off;
int ret;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
if (ret != 0)
return ret;
}
return module_kallsyms_on_each_symbol(fn, data);
}
EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset)
{
unsigned long symbol_start = 0, symbol_end = 0;
unsigned long i, low, high, mid;
/* This kernel should never have been booted. */
BUG_ON(!kallsyms_addresses);
/* Do a binary search on the sorted kallsyms_addresses array. */
low = 0;
high = kallsyms_num_syms;
while (high - low > 1) {
mid = low + (high - low) / 2;
if (kallsyms_addresses[mid] <= addr)
low = mid;
else
high = mid;
}
/*
* Search for the first aliased symbol. Aliased
* symbols are symbols with the same address.
*/
while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
--low;
symbol_start = kallsyms_addresses[low];
/* Search for next non-aliased symbol. */
for (i = low + 1; i < kallsyms_num_syms; i++) {
if (kallsyms_addresses[i] > symbol_start) {
symbol_end = kallsyms_addresses[i];
break;
}
}
/* If we found no next symbol, we use the end of the section. */
if (!symbol_end) {
if (is_kernel_inittext(addr))
symbol_end = (unsigned long)_einittext;
else if (all_var)
symbol_end = (unsigned long)_end;
else
symbol_end = (unsigned long)_etext;
}
if (symbolsize)
*symbolsize = symbol_end - symbol_start;
if (offset)
*offset = addr - symbol_start;
return low;
}
/*
* Lookup an address but don't bother to find any names.
*/
int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
unsigned long *offset)
{
char namebuf[KSYM_NAME_LEN];
if (is_ksym_addr(addr))
return !!get_symbol_pos(addr, symbolsize, offset);
return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
}
/*
* Lookup an address
* - modname is set to NULL if it's in the kernel.
* - We guarantee that the returned name is valid until we reschedule even if
* it resides in a module.
* - We also guarantee that modname will be valid until rescheduled.
*/
const char *kallsyms_lookup(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset,
char **modname, char *namebuf)
{
namebuf[KSYM_NAME_LEN - 1] = 0;
namebuf[0] = 0;
if (is_ksym_addr(addr)) {
unsigned long pos;
pos = get_symbol_pos(addr, symbolsize, offset);
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
namebuf, KSYM_NAME_LEN);
if (modname)
*modname = NULL;
return namebuf;
}
/* See if it's in a module. */
return module_address_lookup(addr, symbolsize, offset, modname,
namebuf);
}
int lookup_symbol_name(unsigned long addr, char *symname)
{
symname[0] = '\0';
symname[KSYM_NAME_LEN - 1] = '\0';
if (is_ksym_addr(addr)) {
unsigned long pos;
pos = get_symbol_pos(addr, NULL, NULL);
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
symname, KSYM_NAME_LEN);
return 0;
}
/* See if it's in a module. */
return lookup_module_symbol_name(addr, symname);
}
int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
unsigned long *offset, char *modname, char *name)
{
name[0] = '\0';
name[KSYM_NAME_LEN - 1] = '\0';
if (is_ksym_addr(addr)) {
unsigned long pos;
pos = get_symbol_pos(addr, size, offset);
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
name, KSYM_NAME_LEN);
modname[0] = '\0';
return 0;
}
/* See if it's in a module. */
return lookup_module_symbol_attrs(addr, size, offset, modname, name);
}
/* Look up a kernel symbol and return it in a text buffer. */
static int __sprint_symbol(char *buffer, unsigned long address,
int symbol_offset, int add_offset)
{
char *modname;
const char *name;
unsigned long offset, size;
int len;
address += symbol_offset;
name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
if (!name)
return sprintf(buffer, "0x%lx", address - symbol_offset);
if (name != buffer)
strcpy(buffer, name);
len = strlen(buffer);
offset -= symbol_offset;
if (add_offset)
len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
if (modname)
len += sprintf(buffer + len, " [%s]", modname);
return len;
}
/**
* sprint_symbol - Look up a kernel symbol and return it in a text buffer
* @buffer: buffer to be stored
* @address: address to lookup
*
* This function looks up a kernel symbol with @address and stores its name,
* offset, size and module name to @buffer if possible. If no symbol was found,
* just saves its @address as is.
*
* This function returns the number of bytes stored in @buffer.
*/
int sprint_symbol(char *buffer, unsigned long address)
{
return __sprint_symbol(buffer, address, 0, 1);
}
EXPORT_SYMBOL_GPL(sprint_symbol);
/**
* sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
* @buffer: buffer to be stored
* @address: address to lookup
*
* This function looks up a kernel symbol with @address and stores its name
* and module name to @buffer if possible. If no symbol was found, just saves
* its @address as is.
*
* This function returns the number of bytes stored in @buffer.
*/
int sprint_symbol_no_offset(char *buffer, unsigned long address)
{
return __sprint_symbol(buffer, address, 0, 0);
}
EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
/**
* sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
* @buffer: buffer to be stored
* @address: address to lookup
*
* This function is for stack backtrace and does the same thing as
* sprint_symbol() but with modified/decreased @address. If there is a
* tail-call to the function marked "noreturn", gcc optimized out code after
* the call so that the stack-saved return address could point outside of the
* caller. This function ensures that kallsyms will find the original caller
* by decreasing @address.
*
* This function returns the number of bytes stored in @buffer.
*/
int sprint_backtrace(char *buffer, unsigned long address)
{
return __sprint_symbol(buffer, address, -1, 1);
}
/* Look up a kernel symbol and print it to the kernel messages. */
void __print_symbol(const char *fmt, unsigned long address)
{
char buffer[KSYM_SYMBOL_LEN];
sprint_symbol(buffer, address);
printk(fmt, buffer);
}
EXPORT_SYMBOL(__print_symbol);
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
struct kallsym_iter {
loff_t pos;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
char name[KSYM_NAME_LEN];
char module_name[MODULE_NAME_LEN];
int exported;
};
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
&iter->type, iter->name, iter->module_name,
&iter->exported) < 0)
return 0;
return 1;
}
/* Returns space to next name. */
static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
{
unsigned off = iter->nameoff;
iter->module_name[0] = '\0';
iter->value = kallsyms_addresses[iter->pos];
iter->type = kallsyms_get_symbol_type(off);
off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name));
return off - iter->nameoff;
}
static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
{
iter->name[0] = '\0';
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
}
/* Returns false if pos at or past end of file. */
static int update_iter(struct kallsym_iter *iter, loff_t pos)
{
/* Module symbols can be accessed randomly. */
if (pos >= kallsyms_num_syms) {
iter->pos = pos;
return get_ksymbol_mod(iter);
}
/* If we're not on the desired position, reset to new position. */
if (pos != iter->pos)
reset_iter(iter, pos);
iter->nameoff += get_ksymbol_core(iter);
iter->pos++;
return 1;
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
(*pos)++;
if (!update_iter(m->private, *pos))
return NULL;
return p;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
if (!update_iter(m->private, *pos))
return NULL;
return m->private;
}
static void s_stop(struct seq_file *m, void *p)
{
}
static int s_show(struct seq_file *m, void *p)
{
struct kallsym_iter *iter = m->private;
/* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
if (iter->module_name[0]) {
char type;
/*
* Label it "global" if it is exported,
* "local" if not exported.
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
type, iter->name, iter->module_name);
} else
seq_printf(m, "%pK %c %s\n", (void *)iter->value,
iter->type, iter->name);
return 0;
}
static const struct seq_operations kallsyms_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
.show = s_show
};
static int kallsyms_open(struct inode *inode, struct file *file)
{
/*
* We keep iterator in m->private, since normal case is to
* s_start from where we left off, so we avoid
* using get_symbol_offset for every symbol.
*/
struct kallsym_iter *iter;
iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter));
if (!iter)
return -ENOMEM;
reset_iter(iter, 0);
return 0;
}
#ifdef CONFIG_KGDB_KDB
const char *kdb_walk_kallsyms(loff_t *pos)
{
static struct kallsym_iter kdb_walk_kallsyms_iter;
if (*pos == 0) {
memset(&kdb_walk_kallsyms_iter, 0,
sizeof(kdb_walk_kallsyms_iter));
reset_iter(&kdb_walk_kallsyms_iter, 0);
}
while (1) {
if (!update_iter(&kdb_walk_kallsyms_iter, *pos))
return NULL;
++*pos;
/* Some debugging symbols have no name. Ignore them. */
if (kdb_walk_kallsyms_iter.name[0])
return kdb_walk_kallsyms_iter.name;
}
}
#endif /* CONFIG_KGDB_KDB */
static const struct file_operations kallsyms_operations = {
.open = kallsyms_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
};
static int __init kallsyms_init(void)
{
proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
return 0;
}
device_initcall(kallsyms_init);
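
A short module-style usage sketch of the lookups exported above (illustrative only: "schedule" is just a convenient text symbol, the offset is arbitrary, and the module boilerplate is invented):

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/kallsyms.h>
#include <linux/module.h>

static int __init kallsyms_demo_init(void)
{
        char buf[KSYM_SYMBOL_LEN];
        unsigned long addr = kallsyms_lookup_name("schedule");

        if (!addr)
                return -ENOENT;
        sprint_symbol(buf, addr + 0x10);        /* e.g. "schedule+0x10/0x..." */
        pr_info("resolved: %s\n", buf);
        return 0;
}

static void __exit kallsyms_demo_exit(void)
{
}

module_init(kallsyms_demo_init);
module_exit(kallsyms_demo_exit);
MODULE_LICENSE("GPL");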

198
kernel/kcmp.c Normal file
View file

@ -0,0 +1,198 @@
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fdtable.h>
#include <linux/string.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/cache.h>
#include <linux/bug.h>
#include <linux/err.h>
#include <linux/kcmp.h>
#include <asm/unistd.h>
/*
* We don't expose the real in-memory order of objects for security reasons.
* But still the comparison results should be suitable for sorting. So we
* obfuscate kernel pointer values and compare the products instead.
*
* The obfuscation is done in two steps. First we xor the kernel pointer with
* a random value, which puts the pointer into a new position in a reordered space.
* Secondly we multiply the xor product with a large odd random number to
* permute its bits even more (the odd multiplier guarantees that the product
* is unique even after the high bits are truncated, since any odd number is
* relatively prime to 2^n).
*
* Note also that the obfuscation itself is invisible to userspace and if needed
* it can be changed to an alternate scheme.
*/
static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
static long kptr_obfuscate(long v, int type)
{
return (v ^ cookies[type][0]) * cookies[type][1];
}
/*
* 0 - equal, i.e. v1 = v2
* 1 - less than, i.e. v1 < v2
* 2 - greater than, i.e. v1 > v2
* 3 - not equal but ordering unavailable (reserved for future)
*/
static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
{
long t1, t2;
t1 = kptr_obfuscate((long)v1, type);
t2 = kptr_obfuscate((long)v2, type);
return (t1 < t2) | ((t1 > t2) << 1);
}
/* The caller must have pinned the task */
static struct file *
get_file_raw_ptr(struct task_struct *task, unsigned int idx)
{
struct file *file = NULL;
task_lock(task);
rcu_read_lock();
if (task->files)
file = fcheck_files(task->files, idx);
rcu_read_unlock();
task_unlock(task);
return file;
}
static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
{
if (likely(m2 != m1))
mutex_unlock(m2);
mutex_unlock(m1);
}
static int kcmp_lock(struct mutex *m1, struct mutex *m2)
{
int err;
if (m2 > m1)
swap(m1, m2);
err = mutex_lock_killable(m1);
if (!err && likely(m1 != m2)) {
err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
if (err)
mutex_unlock(m1);
}
return err;
}
SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
unsigned long, idx1, unsigned long, idx2)
{
struct task_struct *task1, *task2;
int ret;
rcu_read_lock();
/*
* Tasks are looked up in caller's PID namespace only.
*/
task1 = find_task_by_vpid(pid1);
task2 = find_task_by_vpid(pid2);
if (!task1 || !task2)
goto err_no_task;
get_task_struct(task1);
get_task_struct(task2);
rcu_read_unlock();
/*
* One should have enough rights to inspect task details.
*/
ret = kcmp_lock(&task1->signal->cred_guard_mutex,
&task2->signal->cred_guard_mutex);
if (ret)
goto err;
if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
!ptrace_may_access(task2, PTRACE_MODE_READ)) {
ret = -EPERM;
goto err_unlock;
}
switch (type) {
case KCMP_FILE: {
struct file *filp1, *filp2;
filp1 = get_file_raw_ptr(task1, idx1);
filp2 = get_file_raw_ptr(task2, idx2);
if (filp1 && filp2)
ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
else
ret = -EBADF;
break;
}
case KCMP_VM:
ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
break;
case KCMP_FILES:
ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
break;
case KCMP_FS:
ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
break;
case KCMP_SIGHAND:
ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
break;
case KCMP_IO:
ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
break;
case KCMP_SYSVSEM:
#ifdef CONFIG_SYSVIPC
ret = kcmp_ptr(task1->sysvsem.undo_list,
task2->sysvsem.undo_list,
KCMP_SYSVSEM);
#else
ret = -EOPNOTSUPP;
#endif
break;
default:
ret = -EINVAL;
break;
}
err_unlock:
kcmp_unlock(&task1->signal->cred_guard_mutex,
&task2->signal->cred_guard_mutex);
err:
put_task_struct(task1);
put_task_struct(task2);
return ret;
err_no_task:
rcu_read_unlock();
return -ESRCH;
}
static __init int kcmp_cookies_init(void)
{
int i;
get_random_bytes(cookies, sizeof(cookies));
for (i = 0; i < KCMP_TYPES; i++)
cookies[i][1] |= (~(~0UL >> 1) | 1);
return 0;
}
arch_initcall(kcmp_cookies_init);
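/*
 * Illustrative userspace sketch (not part of this file): use the kcmp
 * syscall defined above to test whether two descriptors in the current
 * process refer to the same struct file.  Assumes SYS_kcmp and
 * <linux/kcmp.h> are available on the running system.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/kcmp.h>

int main(void)
{
	pid_t pid = getpid();
	int fd1 = open("/dev/null", O_RDONLY);
	int fd2 = dup(fd1);                     /* same struct file as fd1 */
	int fd3 = open("/dev/null", O_RDONLY);  /* a different struct file */

	/* 0 means "same object"; 1 or 2 encode the obfuscated ordering. */
	printf("fd1 vs dup(fd1): %ld\n",
	       syscall(SYS_kcmp, pid, pid, KCMP_FILE, fd1, fd2));
	printf("fd1 vs fd3:      %ld\n",
	       syscall(SYS_kcmp, pid, pid, KCMP_FILE, fd1, fd3));
	return 0;
}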

2776
kernel/kexec.c Normal file

File diff suppressed because it is too large

727
kernel/kmod.c Normal file
View file

@@ -0,0 +1,727 @@
/*
kmod, the new module loader (replaces kerneld)
Kirk Petersen
Reorganized not to be a daemon by Adam Richter, with guidance
from Greg Zornetzer.
Modified to avoid chroot and file sharing problems.
Mikael Pettersson
Limit the concurrent number of kmod modprobes to catch loops from
"modprobe needs a service that is in a module".
Keith Owens <kaos@ocs.com.au> December 1999
Unblock all signals when we exec a usermode process.
Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
call_usermodehelper wait flag, and remove exec_usermodehelper.
Rusty Russell <rusty@rustcorp.com.au> Jan 2003
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/resource.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/rwsem.h>
#include <linux/ptrace.h>
#include <linux/async.h>
#include <asm/uaccess.h>
#include <trace/events/module.h>
extern int max_threads;
static struct workqueue_struct *khelper_wq;
/*
* kmod_thread_locker is used for deadlock avoidance. There is no explicit
* locking to protect this global - it is private to the singleton khelper
* thread and should only ever be modified by that thread.
*/
static const struct task_struct *kmod_thread_locker;
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
static DEFINE_SPINLOCK(umh_sysctl_lock);
static DECLARE_RWSEM(umhelper_sem);
#ifdef CONFIG_MODULES
/*
modprobe_path is set via /proc/sys.
*/
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
static void free_modprobe_argv(struct subprocess_info *info)
{
kfree(info->argv[3]); /* check call_modprobe() */
kfree(info->argv);
}
static int call_modprobe(char *module_name, int wait)
{
struct subprocess_info *info;
static char *envp[] = {
"HOME=/",
"TERM=linux",
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL
};
char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
if (!argv)
goto out;
module_name = kstrdup(module_name, GFP_KERNEL);
if (!module_name)
goto free_argv;
argv[0] = modprobe_path;
argv[1] = "-q";
argv[2] = "--";
argv[3] = module_name; /* check free_modprobe_argv() */
argv[4] = NULL;
info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
NULL, free_modprobe_argv, NULL);
if (!info)
goto free_module_name;
return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
free_module_name:
kfree(module_name);
free_argv:
kfree(argv);
out:
return -ENOMEM;
}
/**
* __request_module - try to load a kernel module
* @wait: wait (or not) for the operation to complete
* @fmt: printf style format string for the name of the module
* @...: arguments as specified in the format string
*
* Load a module using the user mode module loader. The function returns
* zero on success or a negative errno code on failure. Note that a
* successful module load does not mean the module did not then unload
* and exit on an error of its own. Callers must check that the service
 * they requested is now available, not blindly invoke it.
*
* If module auto-loading support is disabled then this function
* becomes a no-operation.
*/
int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
unsigned int max_modprobes;
int ret;
static atomic_t kmod_concurrent = ATOMIC_INIT(0);
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
/*
* We don't allow synchronous module loading from async. Module
* init may invoke async_synchronize_full() which will end up
* waiting for this task which already is waiting for the module
* loading to complete, leading to a deadlock.
*/
WARN_ON_ONCE(wait && current_is_async());
if (!modprobe_path[0])
return 0;
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
if (ret >= MODULE_NAME_LEN)
return -ENAMETOOLONG;
ret = security_kernel_module_request(module_name);
if (ret)
return ret;
/* If modprobe needs a service that is in a module, we get a recursive
* loop. Limit the number of running kmod threads to max_threads/2 or
* MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
* would be to run the parents of this process, counting how many times
* kmod was invoked. That would mean accessing the internals of the
* process tables to get the command line, proc_pid_cmdline is static
* and it is not worth changing the proc code just to handle this case.
* KAO.
*
* "trace the ppid" is simple, but will fail if someone's
* parent exits. I think this is as good as it gets. --RR
*/
max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
atomic_inc(&kmod_concurrent);
if (atomic_read(&kmod_concurrent) > max_modprobes) {
/* We may be blaming an innocent here, but unlikely */
if (kmod_loop_msg < 5) {
printk(KERN_ERR
"request_module: runaway loop modprobe %s\n",
module_name);
kmod_loop_msg++;
}
atomic_dec(&kmod_concurrent);
return -ENOMEM;
}
trace_module_request(module_name, wait, _RET_IP_);
ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
atomic_dec(&kmod_concurrent);
return ret;
}
EXPORT_SYMBOL(__request_module);
#endif /* CONFIG_MODULES */
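/*
 * Illustrative in-kernel sketch (not part of this file): a caller that
 * asks for a module by alias and then re-checks that the facility it
 * needs actually registered, as the comment above __request_module()
 * requires.  demo_proto_lookup() is hypothetical; request_module() is
 * the wait==true wrapper around __request_module() from <linux/kmod.h>.
 */
static int demo_attach_protocol(const char *name)
{
	int err = request_module("demo-proto-%s", name);	/* may sleep */

	if (err)
		return err;
	/* A successful modprobe does not guarantee registration. */
	return demo_proto_lookup(name) ? 0 : -ENOENT;		/* hypothetical */
}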
static void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
if (info->cleanup)
(*info->cleanup)(info);
kfree(info);
}
static void umh_complete(struct subprocess_info *sub_info)
{
struct completion *comp = xchg(&sub_info->complete, NULL);
/*
* See call_usermodehelper_exec(). If xchg() returns NULL
* we own sub_info, the UMH_KILLABLE caller has gone away
* or the caller used UMH_NO_WAIT.
*/
if (comp)
complete(comp);
else
call_usermodehelper_freeinfo(sub_info);
}
/*
* This is the task which runs the usermode application
*/
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
int wait = sub_info->wait & ~UMH_KILLABLE;
struct cred *new;
int retval;
spin_lock_irq(&current->sighand->siglock);
flush_signal_handlers(current, 1);
spin_unlock_irq(&current->sighand->siglock);
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed_ptr(current, cpu_all_mask);
/*
* Our parent is keventd, which runs with elevated scheduling priority.
* Avoid propagating that into the userspace child.
*/
set_user_nice(current, 0);
retval = -ENOMEM;
new = prepare_kernel_cred(current);
if (!new)
goto out;
spin_lock(&umh_sysctl_lock);
new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
new->cap_inheritable);
spin_unlock(&umh_sysctl_lock);
if (sub_info->init) {
retval = sub_info->init(sub_info, new);
if (retval) {
abort_creds(new);
goto out;
}
}
commit_creds(new);
retval = do_execve(getname_kernel(sub_info->path),
(const char __user *const __user *)sub_info->argv,
(const char __user *const __user *)sub_info->envp);
out:
sub_info->retval = retval;
	/* wait_for_helper() will call umh_complete if UMH_WAIT_PROC. */
if (wait != UMH_WAIT_PROC)
umh_complete(sub_info);
if (!retval)
return 0;
do_exit(0);
}
static int call_helper(void *data)
{
/* Worker thread started blocking khelper thread. */
kmod_thread_locker = current;
return ____call_usermodehelper(data);
}
/* Keventd can't block, but this (a child) can. */
static int wait_for_helper(void *data)
{
struct subprocess_info *sub_info = data;
pid_t pid;
/* If SIGCLD is ignored sys_wait4 won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
} else {
int ret = -ECHILD;
/*
* Normally it is bogus to call wait4() from in-kernel because
* wait4() wants to write the exit code to a userspace address.
* But wait_for_helper() always runs as keventd, and put_user()
* to a kernel address works OK for kernel threads, due to their
* having an mm_segment_t which spans the entire address space.
*
* Thus the __user pointer cast is valid here.
*/
sys_wait4(pid, (int __user *)&ret, 0, NULL);
/*
* If ret is 0, either ____call_usermodehelper failed and the
* real error code is already in sub_info->retval or
* sub_info->retval is 0 anyway, so don't mess with it then.
*/
if (ret)
sub_info->retval = ret;
}
umh_complete(sub_info);
do_exit(0);
}
/* This is run by khelper thread */
static void __call_usermodehelper(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
int wait = sub_info->wait & ~UMH_KILLABLE;
pid_t pid;
/* CLONE_VFORK: wait until the usermode helper has execve'd
	 * successfully. We need the data structures to stay around
* until that is done. */
if (wait == UMH_WAIT_PROC)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
else {
pid = kernel_thread(call_helper, sub_info,
CLONE_VFORK | SIGCHLD);
/* Worker thread stopped blocking khelper thread. */
kmod_thread_locker = NULL;
}
if (pid < 0) {
sub_info->retval = pid;
umh_complete(sub_info);
}
}
/*
* If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
* (used for preventing user land processes from being created after the user
* land has been frozen during a system-wide hibernation or suspend operation).
* Should always be manipulated under umhelper_sem acquired for write.
*/
static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
/* Number of helpers running */
static atomic_t running_helpers = ATOMIC_INIT(0);
/*
* Wait queue head used by usermodehelper_disable() to wait for all running
* helpers to finish.
*/
static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
/*
* Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
* to become 'false'.
*/
static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
/*
* Time to wait for running_helpers to become zero before the setting of
* usermodehelper_disabled in usermodehelper_disable() fails
*/
#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
int usermodehelper_read_trylock(void)
{
DEFINE_WAIT(wait);
int ret = 0;
down_read(&umhelper_sem);
for (;;) {
prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
TASK_INTERRUPTIBLE);
if (!usermodehelper_disabled)
break;
if (usermodehelper_disabled == UMH_DISABLED)
ret = -EAGAIN;
up_read(&umhelper_sem);
if (ret)
break;
schedule();
try_to_freeze();
down_read(&umhelper_sem);
}
finish_wait(&usermodehelper_disabled_waitq, &wait);
return ret;
}
EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
long usermodehelper_read_lock_wait(long timeout)
{
DEFINE_WAIT(wait);
if (timeout < 0)
return -EINVAL;
down_read(&umhelper_sem);
for (;;) {
prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
TASK_UNINTERRUPTIBLE);
if (!usermodehelper_disabled)
break;
up_read(&umhelper_sem);
timeout = schedule_timeout(timeout);
if (!timeout)
break;
down_read(&umhelper_sem);
}
finish_wait(&usermodehelper_disabled_waitq, &wait);
return timeout;
}
EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
void usermodehelper_read_unlock(void)
{
up_read(&umhelper_sem);
}
EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
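/*
 * Illustrative sketch (not part of this file): a caller (for example a
 * firmware loader) bracketing its helper invocation with the read lock
 * above so it cannot race with helpers being disabled during suspend.
 * demo_exec_helper() is a hypothetical wrapper around
 * call_usermodehelper_exec().
 */
static int demo_guarded_exec(struct subprocess_info *info)
{
	int ret = usermodehelper_read_trylock();

	if (ret)			/* -EAGAIN: helpers are disabled */
		return ret;
	ret = demo_exec_helper(info);	/* hypothetical */
	usermodehelper_read_unlock();
	return ret;
}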
/**
* __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
* @depth: New value to assign to usermodehelper_disabled.
*
* Change the value of usermodehelper_disabled (under umhelper_sem locked for
* writing) and wakeup tasks waiting for it to change.
*/
void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
{
down_write(&umhelper_sem);
usermodehelper_disabled = depth;
wake_up(&usermodehelper_disabled_waitq);
up_write(&umhelper_sem);
}
/**
* __usermodehelper_disable - Prevent new helpers from being started.
* @depth: New value to assign to usermodehelper_disabled.
*
* Set usermodehelper_disabled to @depth and wait for running helpers to exit.
*/
int __usermodehelper_disable(enum umh_disable_depth depth)
{
long retval;
if (!depth)
return -EINVAL;
down_write(&umhelper_sem);
usermodehelper_disabled = depth;
up_write(&umhelper_sem);
/*
* From now on call_usermodehelper_exec() won't start any new
* helpers, so it is sufficient if running_helpers turns out to
* be zero at one point (it may be increased later, but that
* doesn't matter).
*/
retval = wait_event_timeout(running_helpers_waitq,
atomic_read(&running_helpers) == 0,
RUNNING_HELPERS_TIMEOUT);
if (retval)
return 0;
__usermodehelper_set_disable_depth(UMH_ENABLED);
return -EAGAIN;
}
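/*
 * Illustrative sketch (not part of this file): how suspend/hibernation
 * code brackets a critical section so that no new usermode helpers are
 * started while userspace is frozen.  The usermodehelper_disable() and
 * usermodehelper_enable() wrappers are the static inlines around the
 * functions above in <linux/kmod.h>; the demo_* steps are hypothetical.
 */
static int demo_enter_sleep(void)
{
	int err = usermodehelper_disable();	/* waits for running helpers */

	if (err)
		return err;
	/* ... freeze processes, take the snapshot, etc. ... */
	usermodehelper_enable();
	return 0;
}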
static void helper_lock(void)
{
atomic_inc(&running_helpers);
smp_mb__after_atomic();
}
static void helper_unlock(void)
{
if (atomic_dec_and_test(&running_helpers))
wake_up(&running_helpers_waitq);
}
/**
* call_usermodehelper_setup - prepare to call a usermode helper
* @path: path to usermode executable
* @argv: arg vector for process
* @envp: environment for process
* @gfp_mask: gfp mask for memory allocation
* @cleanup: a cleanup function
* @init: an init function
* @data: arbitrary context sensitive data
*
* Returns either %NULL on allocation failure, or a subprocess_info
* structure. This should be passed to call_usermodehelper_exec to
* exec the process and free the structure.
*
* The init function is used to customize the helper process prior to
* exec. A non-zero return code causes the process to error out, exit,
* and return the failure to the calling process
*
 * The cleanup function is called just before the subprocess_info is about to
 * be freed. This can be used for freeing the argv and envp. The
 * function must be runnable in either a process context or the
* context in which call_usermodehelper_exec is called.
*/
struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
char **envp, gfp_t gfp_mask,
int (*init)(struct subprocess_info *info, struct cred *new),
void (*cleanup)(struct subprocess_info *info),
void *data)
{
struct subprocess_info *sub_info;
sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
if (!sub_info)
goto out;
INIT_WORK(&sub_info->work, __call_usermodehelper);
sub_info->path = path;
sub_info->argv = argv;
sub_info->envp = envp;
sub_info->cleanup = cleanup;
sub_info->init = init;
sub_info->data = data;
out:
return sub_info;
}
EXPORT_SYMBOL(call_usermodehelper_setup);
/**
* call_usermodehelper_exec - start a usermode application
 * @sub_info: information about the subprocess
* @wait: wait for the application to finish and return status.
* when UMH_NO_WAIT don't wait at all, but you get no useful error back
* when the program couldn't be exec'ed. This makes it safe to call
* from interrupt context.
*
* Runs a user-space application. The application is started
* asynchronously if wait is not set, and runs as a child of keventd.
* (ie. it runs with full root capabilities).
*/
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
if (!sub_info->path) {
call_usermodehelper_freeinfo(sub_info);
return -EINVAL;
}
helper_lock();
if (!khelper_wq || usermodehelper_disabled) {
retval = -EBUSY;
goto out;
}
/*
* Worker thread must not wait for khelper thread at below
* wait_for_completion() if the thread was created with CLONE_VFORK
* flag, for khelper thread is already waiting for the thread at
* wait_for_completion() in do_fork().
*/
if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
retval = -EBUSY;
goto out;
}
/*
* Set the completion pointer only if there is a waiter.
* This makes it possible to use umh_complete to free
* the data structure in case of UMH_NO_WAIT.
*/
sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
sub_info->wait = wait;
queue_work(khelper_wq, &sub_info->work);
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
if (wait & UMH_KILLABLE) {
retval = wait_for_completion_killable(&done);
if (!retval)
goto wait_done;
/* umh_complete() will see NULL and free sub_info */
if (xchg(&sub_info->complete, NULL))
goto unlock;
/* fallthrough, umh_complete() was already called */
}
wait_for_completion(&done);
wait_done:
retval = sub_info->retval;
out:
call_usermodehelper_freeinfo(sub_info);
unlock:
helper_unlock();
return retval;
}
EXPORT_SYMBOL(call_usermodehelper_exec);
/**
* call_usermodehelper() - prepare and start a usermode application
* @path: path to usermode executable
* @argv: arg vector for process
* @envp: environment for process
* @wait: wait for the application to finish and return status.
* when UMH_NO_WAIT don't wait at all, but you get no useful error back
* when the program couldn't be exec'ed. This makes it safe to call
* from interrupt context.
*
 * This function is equivalent to calling call_usermodehelper_setup() and
* call_usermodehelper_exec().
*/
int call_usermodehelper(char *path, char **argv, char **envp, int wait)
{
struct subprocess_info *info;
gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
NULL, NULL, NULL);
if (info == NULL)
return -ENOMEM;
return call_usermodehelper_exec(info, wait);
}
EXPORT_SYMBOL(call_usermodehelper);
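/*
 * Illustrative sketch (not part of this file): run a helper binary with
 * the simple wrapper above and wait for its exit status.  The path and
 * arguments are hypothetical.
 */
static int run_demo_helper(void)
{
	char *argv[] = { "/sbin/demo-helper", "add", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	/* UMH_WAIT_PROC: block until the helper process has exited. */
	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
}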
static int proc_cap_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
kernel_cap_t new_cap;
int err, i;
if (write && (!capable(CAP_SETPCAP) ||
!capable(CAP_SYS_MODULE)))
return -EPERM;
/*
* convert from the global kernel_cap_t to the ulong array to print to
* userspace if this is a read.
*/
spin_lock(&umh_sysctl_lock);
for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
if (table->data == CAP_BSET)
cap_array[i] = usermodehelper_bset.cap[i];
else if (table->data == CAP_PI)
cap_array[i] = usermodehelper_inheritable.cap[i];
else
BUG();
}
spin_unlock(&umh_sysctl_lock);
t = *table;
t.data = &cap_array;
/*
	 * Actually read or write an array of ulongs from userspace. Remember
	 * these are least-significant 32 bits first.
*/
err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
/*
* convert from the sysctl array of ulongs to the kernel_cap_t
* internal representation
*/
for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
new_cap.cap[i] = cap_array[i];
/*
* Drop everything not in the new_cap (but don't add things)
*/
spin_lock(&umh_sysctl_lock);
if (write) {
if (table->data == CAP_BSET)
usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
if (table->data == CAP_PI)
usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
}
spin_unlock(&umh_sysctl_lock);
return 0;
}
struct ctl_table usermodehelper_table[] = {
{
.procname = "bset",
.data = CAP_BSET,
.maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
.mode = 0600,
.proc_handler = proc_cap_handler,
},
{
.procname = "inheritable",
.data = CAP_PI,
.maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
.mode = 0600,
.proc_handler = proc_cap_handler,
},
{ }
};
void __init usermodehelper_init(void)
{
khelper_wq = create_singlethread_workqueue("khelper");
BUG_ON(!khelper_wq);
}

2456
kernel/kprobes.c Normal file

File diff suppressed because it is too large

243
kernel/ksysfs.c Normal file
View file

@@ -0,0 +1,243 @@
/*
* kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
* are not related to any other subsystem
*
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
*
 * This file is released under the GPLv2
*
*/
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kexec.h>
#include <linux/profile.h>
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/compiler.h>
#include <linux/rcupdate.h> /* rcu_expedited */
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KERNEL_ATTR_RW(_name) \
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
}
KERNEL_ATTR_RO(uevent_seqnum);
#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%s\n", uevent_helper);
}
static ssize_t uevent_helper_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (count+1 > UEVENT_HELPER_PATH_LEN)
return -ENOENT;
memcpy(uevent_helper, buf, count);
uevent_helper[count] = '\0';
if (count && uevent_helper[count-1] == '\n')
uevent_helper[count-1] = '\0';
return count;
}
KERNEL_ATTR_RW(uevent_helper);
#endif
#ifdef CONFIG_PROFILING
static ssize_t profiling_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", prof_on);
}
static ssize_t profiling_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int ret;
if (prof_on)
return -EEXIST;
/*
* This eventually calls into get_option() which
* has a ton of callers and is not const. It is
* easiest to cast it away here.
*/
profile_setup((char *)buf);
ret = profile_init();
if (ret)
return ret;
ret = create_proc_profile();
if (ret)
return ret;
return count;
}
KERNEL_ATTR_RW(profiling);
#endif
#ifdef CONFIG_KEXEC
static ssize_t kexec_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", !!kexec_image);
}
KERNEL_ATTR_RO(kexec_loaded);
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", !!kexec_crash_image);
}
KERNEL_ATTR_RO(kexec_crash_loaded);
static ssize_t kexec_crash_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%zu\n", crash_get_memory_size());
}
static ssize_t kexec_crash_size_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
unsigned long cnt;
int ret;
if (kstrtoul(buf, 0, &cnt))
return -EINVAL;
ret = crash_shrink_memory(cnt);
return ret < 0 ? ret : count;
}
KERNEL_ATTR_RW(kexec_crash_size);
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%lx %x\n",
paddr_vmcoreinfo_note(),
(unsigned int)sizeof(vmcoreinfo_note));
}
KERNEL_ATTR_RO(vmcoreinfo);
#endif /* CONFIG_KEXEC */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", file_caps_enabled);
}
KERNEL_ATTR_RO(fscaps);
int rcu_expedited;
static ssize_t rcu_expedited_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", rcu_expedited);
}
static ssize_t rcu_expedited_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (kstrtoint(buf, 0, &rcu_expedited))
return -EINVAL;
return count;
}
KERNEL_ATTR_RW(rcu_expedited);
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
extern const void __start_notes __weak;
extern const void __stop_notes __weak;
#define notes_size (&__stop_notes - &__start_notes)
static ssize_t notes_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
memcpy(buf, &__start_notes + off, count);
return count;
}
static struct bin_attribute notes_attr = {
.attr = {
.name = "notes",
.mode = S_IRUGO,
},
.read = &notes_read,
};
struct kobject *kernel_kobj;
EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
#ifdef CONFIG_KEXEC
&kexec_loaded_attr.attr,
&kexec_crash_loaded_attr.attr,
&kexec_crash_size_attr.attr,
&vmcoreinfo_attr.attr,
#endif
&rcu_expedited_attr.attr,
NULL
};
static struct attribute_group kernel_attr_group = {
.attrs = kernel_attrs,
};
static int __init ksysfs_init(void)
{
int error;
kernel_kobj = kobject_create_and_add("kernel", NULL);
if (!kernel_kobj) {
error = -ENOMEM;
goto exit;
}
error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
if (error)
goto kset_exit;
if (notes_size > 0) {
notes_attr.size = notes_size;
error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
if (error)
goto group_exit;
}
return 0;
group_exit:
sysfs_remove_group(kernel_kobj, &kernel_attr_group);
kset_exit:
kobject_put(kernel_kobj);
exit:
return error;
}
core_initcall(ksysfs_init);
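/*
 * Illustrative module sketch (not part of this file): hang a private
 * read/write attribute off the exported kernel_kobj above, giving
 * /sys/kernel/demo.  All "demo" names are hypothetical.
 */
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/module.h>

static int demo_value;

static ssize_t demo_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%d\n", demo_value);
}

static ssize_t demo_store(struct kobject *kobj, struct kobj_attribute *attr,
			  const char *buf, size_t count)
{
	if (kstrtoint(buf, 0, &demo_value))
		return -EINVAL;
	return count;
}

static struct kobj_attribute demo_attr = __ATTR(demo, 0644, demo_show, demo_store);

static int __init demo_ksysfs_init(void)
{
	/* Creates /sys/kernel/demo */
	return sysfs_create_file(kernel_kobj, &demo_attr.attr);
}

static void __exit demo_ksysfs_exit(void)
{
	sysfs_remove_file(kernel_kobj, &demo_attr.attr);
}

module_init(demo_ksysfs_init);
module_exit(demo_ksysfs_exit);
MODULE_LICENSE("GPL");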

692
kernel/kthread.c Normal file
View file

@@ -0,0 +1,692 @@
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
*
* Creation is done via kthreadd, so that we get a clean environment
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
int node;
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
struct completion *done;
struct list_head list;
};
struct kthread {
unsigned long flags;
unsigned int cpu;
void *data;
struct completion parked;
struct completion exited;
};
enum KTHREAD_BITS {
KTHREAD_IS_PER_CPU = 0,
KTHREAD_SHOULD_STOP,
KTHREAD_SHOULD_PARK,
KTHREAD_IS_PARKED,
};
#define __to_kthread(vfork) \
container_of(vfork, struct kthread, exited)
static inline struct kthread *to_kthread(struct task_struct *k)
{
return __to_kthread(k->vfork_done);
}
static struct kthread *to_live_kthread(struct task_struct *k)
{
struct completion *vfork = ACCESS_ONCE(k->vfork_done);
if (likely(vfork))
return __to_kthread(vfork);
return NULL;
}
/**
* kthread_should_stop - should this kthread return now?
*
* When someone calls kthread_stop() on your kthread, it will be woken
* and this will return true. You should then return, and your return
* value will be passed through to kthread_stop().
*/
bool kthread_should_stop(void)
{
return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);
/**
* kthread_should_park - should this kthread park now?
*
* When someone calls kthread_park() on your kthread, it will be woken
* and this will return true. You should then do the necessary
* cleanup and call kthread_parkme()
*
* Similar to kthread_should_stop(), but this keeps the thread alive
* and in a park position. kthread_unpark() "restarts" the thread and
* calls the thread function again.
*/
bool kthread_should_park(void)
{
return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
}
/**
* kthread_freezable_should_stop - should this freezable kthread return now?
* @was_frozen: optional out parameter, indicates whether %current was frozen
*
* kthread_should_stop() for freezable kthreads, which will enter
* refrigerator if necessary. This function is safe from kthread_stop() /
* freezer deadlock and freezable kthreads should use this function instead
* of calling try_to_freeze() directly.
*/
bool kthread_freezable_should_stop(bool *was_frozen)
{
bool frozen = false;
might_sleep();
if (unlikely(freezing(current)))
frozen = __refrigerator(true);
if (was_frozen)
*was_frozen = frozen;
return kthread_should_stop();
}
EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
/**
* kthread_data - return data value specified on kthread creation
* @task: kthread task in question
*
* Return the data value specified when kthread @task was created.
* The caller is responsible for ensuring the validity of @task when
* calling this function.
*/
void *kthread_data(struct task_struct *task)
{
return to_kthread(task)->data;
}
/**
* probe_kthread_data - speculative version of kthread_data()
* @task: possible kthread task in question
*
* @task could be a kthread task. Return the data value specified when it
* was created if accessible. If @task isn't a kthread task or its data is
* inaccessible for any reason, %NULL is returned. This function requires
* that @task itself is safe to dereference.
*/
void *probe_kthread_data(struct task_struct *task)
{
struct kthread *kthread = to_kthread(task);
void *data = NULL;
probe_kernel_read(&data, &kthread->data, sizeof(data));
return data;
}
static void __kthread_parkme(struct kthread *self)
{
__set_current_state(TASK_PARKED);
while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
complete(&self->parked);
schedule();
__set_current_state(TASK_PARKED);
}
clear_bit(KTHREAD_IS_PARKED, &self->flags);
__set_current_state(TASK_RUNNING);
}
void kthread_parkme(void)
{
__kthread_parkme(to_kthread(current));
}
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
void *data = create->data;
struct completion *done;
struct kthread self;
int ret;
self.flags = 0;
self.data = data;
init_completion(&self.exited);
init_completion(&self.parked);
current->vfork_done = &self.exited;
/* If user was SIGKILLed, I release the structure. */
done = xchg(&create->done, NULL);
if (!done) {
kfree(create);
do_exit(-EINTR);
}
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
complete(done);
schedule();
ret = -EINTR;
if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
__kthread_parkme(&self);
ret = threadfn(data);
}
/* we can't just return, we must preserve "self" on stack */
do_exit(ret);
}
/* called from do_fork() to get node information for about to be created task */
int tsk_fork_get_node(struct task_struct *tsk)
{
#ifdef CONFIG_NUMA
if (tsk == kthreadd_task)
return tsk->pref_node_fork;
#endif
return NUMA_NO_NODE;
}
static void create_kthread(struct kthread_create_info *create)
{
int pid;
#ifdef CONFIG_NUMA
current->pref_node_fork = create->node;
#endif
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
/* If user was SIGKILLed, I release the structure. */
struct completion *done = xchg(&create->done, NULL);
if (!done) {
kfree(create);
return;
}
create->result = ERR_PTR(pid);
complete(done);
}
}
/**
* kthread_create_on_node - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
* @node: memory node number.
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
* it. See also kthread_run().
*
* If thread is going to be bound on a particular cpu, give its node
* in @node, to get NUMA affinity for kthread stack, or else give -1.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
* standalone thread for which no one will call kthread_stop(), or
* return when 'kthread_should_stop()' is true (which means
* kthread_stop() has been called). The return value should be zero
* or a negative error number; it will be passed to kthread_stop().
*
* Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
*/
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
void *data, int node,
const char namefmt[],
...)
{
DECLARE_COMPLETION_ONSTACK(done);
struct task_struct *task;
struct kthread_create_info *create = kmalloc(sizeof(*create),
GFP_KERNEL);
if (!create)
return ERR_PTR(-ENOMEM);
create->threadfn = threadfn;
create->data = data;
create->node = node;
create->done = &done;
spin_lock(&kthread_create_lock);
list_add_tail(&create->list, &kthread_create_list);
spin_unlock(&kthread_create_lock);
wake_up_process(kthreadd_task);
/*
* Wait for completion in killable state, for I might be chosen by
* the OOM killer while kthreadd is trying to allocate memory for
* new kernel thread.
*/
if (unlikely(wait_for_completion_killable(&done))) {
/*
* If I was SIGKILLed before kthreadd (or new kernel thread)
* calls complete(), leave the cleanup of this structure to
* that thread.
*/
if (xchg(&create->done, NULL))
return ERR_PTR(-EINTR);
/*
* kthreadd (or new kernel thread) will call complete()
* shortly.
*/
wait_for_completion(&done);
}
task = create->result;
if (!IS_ERR(task)) {
static const struct sched_param param = { .sched_priority = 0 };
va_list args;
va_start(args, namefmt);
vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
va_end(args);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
*/
sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
set_cpus_allowed_ptr(task, cpu_all_mask);
}
kfree(create);
return task;
}
EXPORT_SYMBOL(kthread_create_on_node);
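/*
 * Illustrative sketch (not part of this file): the canonical
 * create/stop pattern documented above.  kthread_run() is the
 * create-and-wake helper from <linux/kthread.h>; the "demo" names are
 * hypothetical.
 */
static struct task_struct *demo_task;

static int demo_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* ... do one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;	/* passed back through kthread_stop() */
}

static int demo_start(void)
{
	demo_task = kthread_run(demo_thread_fn, NULL, "demo/%d", 0);
	return PTR_ERR_OR_ZERO(demo_task);
}

static void demo_stop(void)
{
	if (!IS_ERR_OR_NULL(demo_task))
		kthread_stop(demo_task);
}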
static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
{
/* Must have done schedule() in kthread() before we set_task_cpu */
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
/* It's safe because the task is inactive. */
do_set_cpus_allowed(p, cpumask_of(cpu));
p->flags |= PF_NO_SETAFFINITY;
}
/**
* kthread_bind - bind a just-created kthread to a cpu.
* @p: thread created by kthread_create().
* @cpu: cpu (might not be online, must be possible) for @k to run on.
*
* Description: This function is equivalent to set_cpus_allowed(),
* except that @cpu doesn't need to be online, and the thread must be
* stopped (i.e., just returned from kthread_create()).
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(kthread_bind);
/**
* kthread_create_on_cpu - Create a cpu bound kthread
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
* @cpu: The cpu on which the thread should be bound,
* @namefmt: printf-style name for the thread. Format is restricted
* to "name.*%u". Code fills in cpu number.
*
 * Description: This helper function creates and names a kernel thread.
* The thread will be woken and put into park mode.
*/
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
void *data, unsigned int cpu,
const char *namefmt)
{
struct task_struct *p;
p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
cpu);
if (IS_ERR(p))
return p;
set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
to_kthread(p)->cpu = cpu;
/* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
kthread_park(p);
return p;
}
static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
{
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
/*
* We clear the IS_PARKED bit here as we don't wait
* until the task has left the park code. So if we'd
* park before that happens we'd see the IS_PARKED bit
* which might be about to be cleared.
*/
if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
__kthread_bind(k, kthread->cpu, TASK_PARKED);
wake_up_state(k, TASK_PARKED);
}
}
/**
* kthread_unpark - unpark a thread created by kthread_create().
* @k: thread created by kthread_create().
*
* Sets kthread_should_park() for @k to return false, wakes it, and
 * waits for it to return. If the thread is marked percpu then it is
 * bound to the cpu again.
*/
void kthread_unpark(struct task_struct *k)
{
struct kthread *kthread = to_live_kthread(k);
if (kthread)
__kthread_unpark(k, kthread);
}
/**
* kthread_park - park a thread created by kthread_create().
* @k: thread created by kthread_create().
*
* Sets kthread_should_park() for @k to return true, wakes it, and
* waits for it to return. This can also be called after kthread_create()
* instead of calling wake_up_process(): the thread will park without
* calling threadfn().
*
* Returns 0 if the thread is parked, -ENOSYS if the thread exited.
* If called by the kthread itself just the park bit is set.
*/
int kthread_park(struct task_struct *k)
{
struct kthread *kthread = to_live_kthread(k);
int ret = -ENOSYS;
if (kthread) {
if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
if (k != current) {
wake_up_process(k);
wait_for_completion(&kthread->parked);
}
}
ret = 0;
}
return ret;
}
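/*
 * Illustrative sketch (not part of this file): how a thread function
 * cooperates with kthread_park()/kthread_unpark() above.  The "demo"
 * name is hypothetical.
 */
static int demo_parkable_fn(void *data)
{
	while (!kthread_should_stop()) {
		if (kthread_should_park())
			kthread_parkme();	/* sleeps in TASK_PARKED until unparked */
		/* ... do one unit of work while unparked ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}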
/**
* kthread_stop - stop a thread created by kthread_create().
* @k: thread created by kthread_create().
*
* Sets kthread_should_stop() for @k to return true, wakes it, and
* waits for it to exit. This can also be called after kthread_create()
* instead of calling wake_up_process(): the thread will exit without
* calling threadfn().
*
* If threadfn() may call do_exit() itself, the caller must ensure
* task_struct can't go away.
*
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
* was never called.
*/
int kthread_stop(struct task_struct *k)
{
struct kthread *kthread;
int ret;
trace_sched_kthread_stop(k);
get_task_struct(k);
kthread = to_live_kthread(k);
if (kthread) {
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
__kthread_unpark(k, kthread);
wake_up_process(k);
wait_for_completion(&kthread->exited);
}
ret = k->exit_code;
put_task_struct(k);
trace_sched_kthread_stop_ret(ret);
return ret;
}
EXPORT_SYMBOL(kthread_stop);
int kthreadd(void *unused)
{
struct task_struct *tsk = current;
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
set_cpus_allowed_ptr(tsk, cpu_all_mask);
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
if (list_empty(&kthread_create_list))
schedule();
__set_current_state(TASK_RUNNING);
spin_lock(&kthread_create_lock);
while (!list_empty(&kthread_create_list)) {
struct kthread_create_info *create;
create = list_entry(kthread_create_list.next,
struct kthread_create_info, list);
list_del_init(&create->list);
spin_unlock(&kthread_create_lock);
create_kthread(create);
spin_lock(&kthread_create_lock);
}
spin_unlock(&kthread_create_lock);
}
return 0;
}
void __init_kthread_worker(struct kthread_worker *worker,
const char *name,
struct lock_class_key *key)
{
spin_lock_init(&worker->lock);
lockdep_set_class_and_name(&worker->lock, key, name);
INIT_LIST_HEAD(&worker->work_list);
worker->task = NULL;
}
EXPORT_SYMBOL_GPL(__init_kthread_worker);
/**
* kthread_worker_fn - kthread function to process kthread_worker
* @worker_ptr: pointer to initialized kthread_worker
*
* This function can be used as @threadfn to kthread_create() or
* kthread_run() with @worker_ptr argument pointing to an initialized
* kthread_worker. The started kthread will process work_list until
 * it is stopped with kthread_stop(). A kthread can also call
* this function directly after extra initialization.
*
* Different kthreads can be used for the same kthread_worker as long
* as there's only one kthread attached to it at any given time. A
* kthread_worker without an attached kthread simply collects queued
* kthread_works.
*/
int kthread_worker_fn(void *worker_ptr)
{
struct kthread_worker *worker = worker_ptr;
struct kthread_work *work;
WARN_ON(worker->task);
worker->task = current;
repeat:
set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
spin_lock_irq(&worker->lock);
worker->task = NULL;
spin_unlock_irq(&worker->lock);
return 0;
}
work = NULL;
spin_lock_irq(&worker->lock);
if (!list_empty(&worker->work_list)) {
work = list_first_entry(&worker->work_list,
struct kthread_work, node);
list_del_init(&work->node);
}
worker->current_work = work;
spin_unlock_irq(&worker->lock);
if (work) {
__set_current_state(TASK_RUNNING);
work->func(work);
} else if (!freezing(current))
schedule();
try_to_freeze();
goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
/* insert @work before @pos in @worker */
static void insert_kthread_work(struct kthread_worker *worker,
struct kthread_work *work,
struct list_head *pos)
{
lockdep_assert_held(&worker->lock);
list_add_tail(&work->node, pos);
work->worker = worker;
if (!worker->current_work && likely(worker->task))
wake_up_process(worker->task);
}
/**
* queue_kthread_work - queue a kthread_work
* @worker: target kthread_worker
* @work: kthread_work to queue
*
 * Queue @work to work processor @worker for async execution. The
 * @worker must be serviced by a kthread running kthread_worker_fn(). Returns %true
* if @work was successfully queued, %false if it was already pending.
*/
bool queue_kthread_work(struct kthread_worker *worker,
struct kthread_work *work)
{
bool ret = false;
unsigned long flags;
spin_lock_irqsave(&worker->lock, flags);
if (list_empty(&work->node)) {
insert_kthread_work(worker, work, &worker->work_list);
ret = true;
}
spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(queue_kthread_work);
struct kthread_flush_work {
struct kthread_work work;
struct completion done;
};
static void kthread_flush_work_fn(struct kthread_work *work)
{
struct kthread_flush_work *fwork =
container_of(work, struct kthread_flush_work, work);
complete(&fwork->done);
}
/**
* flush_kthread_work - flush a kthread_work
* @work: work to flush
*
* If @work is queued or executing, wait for it to finish execution.
*/
void flush_kthread_work(struct kthread_work *work)
{
struct kthread_flush_work fwork = {
KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
COMPLETION_INITIALIZER_ONSTACK(fwork.done),
};
struct kthread_worker *worker;
bool noop = false;
retry:
worker = work->worker;
if (!worker)
return;
spin_lock_irq(&worker->lock);
if (work->worker != worker) {
spin_unlock_irq(&worker->lock);
goto retry;
}
if (!list_empty(&work->node))
insert_kthread_work(worker, &fwork.work, work->node.next);
else if (worker->current_work == work)
insert_kthread_work(worker, &fwork.work, worker->work_list.next);
else
noop = true;
spin_unlock_irq(&worker->lock);
if (!noop)
wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(flush_kthread_work);
/**
* flush_kthread_worker - flush all current works on a kthread_worker
* @worker: worker to flush
*
* Wait until all currently executing or pending works on @worker are
* finished.
*/
void flush_kthread_worker(struct kthread_worker *worker)
{
struct kthread_flush_work fwork = {
KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
COMPLETION_INITIALIZER_ONSTACK(fwork.done),
};
queue_kthread_work(worker, &fwork.work);
wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(flush_kthread_worker);
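/*
 * Illustrative sketch (not part of this file): drive a kthread_worker
 * with the queue/flush primitives above.  init_kthread_worker() and
 * init_kthread_work() are assumed to be the initializers provided by
 * <linux/kthread.h> for this kernel; the "demo" names are hypothetical.
 */
static struct kthread_worker demo_worker;
static struct kthread_work demo_work;
static struct task_struct *demo_worker_task;

static void demo_work_fn(struct kthread_work *work)
{
	/* ... runs in demo_worker_task context ... */
}

static int demo_worker_start(void)
{
	init_kthread_worker(&demo_worker);
	demo_worker_task = kthread_run(kthread_worker_fn, &demo_worker, "demo_worker");
	if (IS_ERR(demo_worker_task))
		return PTR_ERR(demo_worker_task);

	init_kthread_work(&demo_work, demo_work_fn);
	queue_kthread_work(&demo_worker, &demo_work);
	flush_kthread_work(&demo_work);		/* wait for this one item */
	flush_kthread_worker(&demo_worker);	/* or drain everything queued */
	kthread_stop(demo_worker_task);
	return 0;
}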

Some files were not shown because too many files have changed in this diff