mirror of
https://github.com/AetherDroid/android_kernel_samsung_on5xelte.git
synced 2025-10-28 23:08:52 +01:00
Fixed MTP to work with TWRP
This commit is contained in:
commit
f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions
21
kernel/sched/Makefile
Normal file
21
kernel/sched/Makefile
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
# Keep function-tracer instrumentation (-pg) out of clock.o.
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_clock.o = -pg
endif

ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only.  Why this used to be enabled for all architectures is beyond
# me.  I suspect most platforms don't need this, but until we know that for sure
# I turn this off for IA-64 only.  Andreas Schwab says it's also needed on m68k
# to get a correct value for the wait-channel (WCHAN in ps). --davidm
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif

# Objects that are always part of the scheduler.
obj-y += core.o proc.o clock.o cputime.o \
	 idle_task.o fair.o rt.o deadline.o stop_task.o \
	 wait.o completion.o idle.o

# Objects selected by configuration options.
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
|
||||
253
kernel/sched/auto_group.c
Normal file
253
kernel/sched/auto_group.c
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
#ifdef CONFIG_SCHED_AUTOGROUP
|
||||
|
||||
#include "sched.h"
|
||||
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/export.h>
|
||||
|
||||
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
|
||||
static struct autogroup autogroup_default;
|
||||
static atomic_t autogroup_seq_nr;
|
||||
|
||||
/*
 * Initialise the default autogroup (backed by root_task_group) and make
 * it the autogroup of @init_task's signal struct.
 */
void __init autogroup_init(struct task_struct *init_task)
{
	struct autogroup *def = &autogroup_default;

	def->tg = &root_task_group;
	kref_init(&def->kref);
	init_rwsem(&def->lock);

	init_task->signal->autogroup = def;
}
|
||||
|
||||
void autogroup_free(struct task_group *tg)
|
||||
{
|
||||
kfree(tg->autogroup);
|
||||
}
|
||||
|
||||
static inline void autogroup_destroy(struct kref *kref)
|
||||
{
|
||||
struct autogroup *ag = container_of(kref, struct autogroup, kref);
|
||||
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
/* We've redirected RT tasks to the root task group... */
|
||||
ag->tg->rt_se = NULL;
|
||||
ag->tg->rt_rq = NULL;
|
||||
#endif
|
||||
sched_offline_group(ag->tg);
|
||||
sched_destroy_group(ag->tg);
|
||||
}
|
||||
|
||||
static inline void autogroup_kref_put(struct autogroup *ag)
|
||||
{
|
||||
kref_put(&ag->kref, autogroup_destroy);
|
||||
}
|
||||
|
||||
static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
|
||||
{
|
||||
kref_get(&ag->kref);
|
||||
return ag;
|
||||
}
|
||||
|
||||
static inline struct autogroup *autogroup_task_get(struct task_struct *p)
|
||||
{
|
||||
struct autogroup *ag;
|
||||
unsigned long flags;
|
||||
|
||||
if (!lock_task_sighand(p, &flags))
|
||||
return autogroup_kref_get(&autogroup_default);
|
||||
|
||||
ag = autogroup_kref_get(p->signal->autogroup);
|
||||
unlock_task_sighand(p, &flags);
|
||||
|
||||
return ag;
|
||||
}
|
||||
|
||||
static inline struct autogroup *autogroup_create(void)
|
||||
{
|
||||
struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
|
||||
struct task_group *tg;
|
||||
|
||||
if (!ag)
|
||||
goto out_fail;
|
||||
|
||||
tg = sched_create_group(&root_task_group);
|
||||
|
||||
if (IS_ERR(tg))
|
||||
goto out_free;
|
||||
|
||||
kref_init(&ag->kref);
|
||||
init_rwsem(&ag->lock);
|
||||
ag->id = atomic_inc_return(&autogroup_seq_nr);
|
||||
ag->tg = tg;
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
/*
|
||||
* Autogroup RT tasks are redirected to the root task group
|
||||
* so we don't have to move tasks around upon policy change,
|
||||
* or flail around trying to allocate bandwidth on the fly.
|
||||
* A bandwidth exception in __sched_setscheduler() allows
|
||||
* the policy change to proceed.
|
||||
*/
|
||||
free_rt_sched_group(tg);
|
||||
tg->rt_se = root_task_group.rt_se;
|
||||
tg->rt_rq = root_task_group.rt_rq;
|
||||
#endif
|
||||
tg->autogroup = ag;
|
||||
|
||||
sched_online_group(tg, &root_task_group);
|
||||
return ag;
|
||||
|
||||
out_free:
|
||||
kfree(ag);
|
||||
out_fail:
|
||||
if (printk_ratelimit()) {
|
||||
printk(KERN_WARNING "autogroup_create: %s failure.\n",
|
||||
ag ? "sched_create_group()" : "kmalloc()");
|
||||
}
|
||||
|
||||
return autogroup_kref_get(&autogroup_default);
|
||||
}
|
||||
|
||||
bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
|
||||
{
|
||||
if (tg != &root_task_group)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* We can only assume the task group can't go away on us if
|
||||
* autogroup_move_group() can see us on ->thread_group list.
|
||||
*/
|
||||
if (p->flags & PF_EXITING)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
autogroup_move_group(struct task_struct *p, struct autogroup *ag)
|
||||
{
|
||||
struct autogroup *prev;
|
||||
struct task_struct *t;
|
||||
unsigned long flags;
|
||||
|
||||
BUG_ON(!lock_task_sighand(p, &flags));
|
||||
|
||||
prev = p->signal->autogroup;
|
||||
if (prev == ag) {
|
||||
unlock_task_sighand(p, &flags);
|
||||
return;
|
||||
}
|
||||
|
||||
p->signal->autogroup = autogroup_kref_get(ag);
|
||||
|
||||
if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
|
||||
goto out;
|
||||
|
||||
for_each_thread(p, t)
|
||||
sched_move_task(t);
|
||||
out:
|
||||
unlock_task_sighand(p, &flags);
|
||||
autogroup_kref_put(prev);
|
||||
}
|
||||
|
||||
/* Allocates GFP_KERNEL, cannot be called under any spinlock */
|
||||
void sched_autogroup_create_attach(struct task_struct *p)
{
	struct autogroup *fresh;

	fresh = autogroup_create();
	autogroup_move_group(p, fresh);

	/* Release the reference that autogroup_create() returned to us. */
	autogroup_kref_put(fresh);
}
|
||||
EXPORT_SYMBOL(sched_autogroup_create_attach);
|
||||
|
||||
/* Cannot be called under siglock. Currently has no users */
|
||||
void sched_autogroup_detach(struct task_struct *p)
|
||||
{
|
||||
autogroup_move_group(p, &autogroup_default);
|
||||
}
|
||||
EXPORT_SYMBOL(sched_autogroup_detach);
|
||||
|
||||
void sched_autogroup_fork(struct signal_struct *sig)
|
||||
{
|
||||
sig->autogroup = autogroup_task_get(current);
|
||||
}
|
||||
|
||||
void sched_autogroup_exit(struct signal_struct *sig)
|
||||
{
|
||||
autogroup_kref_put(sig->autogroup);
|
||||
}
|
||||
|
||||
static int __init setup_autogroup(char *str)
|
||||
{
|
||||
sysctl_sched_autogroup_enabled = 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
__setup("noautogroup", setup_autogroup);
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
|
||||
int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
|
||||
{
|
||||
static unsigned long next = INITIAL_JIFFIES;
|
||||
struct autogroup *ag;
|
||||
int err;
|
||||
|
||||
if (nice < MIN_NICE || nice > MAX_NICE)
|
||||
return -EINVAL;
|
||||
|
||||
err = security_task_setnice(current, nice);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (nice < 0 && !can_nice(current, nice))
|
||||
return -EPERM;
|
||||
|
||||
/* this is a heavy operation taking global locks.. */
|
||||
if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
|
||||
return -EAGAIN;
|
||||
|
||||
next = HZ / 10 + jiffies;
|
||||
ag = autogroup_task_get(p);
|
||||
|
||||
down_write(&ag->lock);
|
||||
err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
|
||||
if (!err)
|
||||
ag->nice = nice;
|
||||
up_write(&ag->lock);
|
||||
|
||||
autogroup_kref_put(ag);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
|
||||
{
|
||||
struct autogroup *ag = autogroup_task_get(p);
|
||||
|
||||
if (!task_group_is_autogroup(ag->tg))
|
||||
goto out;
|
||||
|
||||
down_read(&ag->lock);
|
||||
seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
|
||||
up_read(&ag->lock);
|
||||
|
||||
out:
|
||||
autogroup_kref_put(ag);
|
||||
}
|
||||
#endif /* CONFIG_PROC_FS */
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
int autogroup_path(struct task_group *tg, char *buf, int buflen)
|
||||
{
|
||||
if (!task_group_is_autogroup(tg))
|
||||
return 0;
|
||||
|
||||
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
|
||||
}
|
||||
#endif /* CONFIG_SCHED_DEBUG */
|
||||
|
||||
#endif /* CONFIG_SCHED_AUTOGROUP */
|
||||
64
kernel/sched/auto_group.h
Normal file
64
kernel/sched/auto_group.h
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
#ifdef CONFIG_SCHED_AUTOGROUP
|
||||
|
||||
#include <linux/kref.h>
|
||||
#include <linux/rwsem.h>
|
||||
|
||||
struct autogroup {
|
||||
/*
|
||||
* reference doesn't mean how many thread attach to this
|
||||
* autogroup now. It just stands for the number of task
|
||||
* could use this autogroup.
|
||||
*/
|
||||
struct kref kref;
|
||||
struct task_group *tg;
|
||||
struct rw_semaphore lock;
|
||||
unsigned long id;
|
||||
int nice;
|
||||
};
|
||||
|
||||
extern void autogroup_init(struct task_struct *init_task);
|
||||
extern void autogroup_free(struct task_group *tg);
|
||||
|
||||
static inline bool task_group_is_autogroup(struct task_group *tg)
|
||||
{
|
||||
return !!tg->autogroup;
|
||||
}
|
||||
|
||||
extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
|
||||
|
||||
static inline struct task_group *
|
||||
autogroup_task_group(struct task_struct *p, struct task_group *tg)
|
||||
{
|
||||
int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
|
||||
|
||||
if (enabled && task_wants_autogroup(p, tg))
|
||||
return p->signal->autogroup->tg;
|
||||
|
||||
return tg;
|
||||
}
|
||||
|
||||
extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
|
||||
|
||||
#else /* !CONFIG_SCHED_AUTOGROUP */
|
||||
|
||||
/* Autogrouping is compiled out: there is nothing to initialise. */
static inline void autogroup_init(struct task_struct *init_task)
{
}

/* Autogrouping is compiled out: there is nothing to free. */
static inline void autogroup_free(struct task_group *tg)
{
}
|
||||
static inline bool task_group_is_autogroup(struct task_group *tg)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Autogrouping is compiled out: the task keeps the group it was given. */
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
	return tg;
}
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
/* Autogrouping is compiled out: there is no path, @buf is left untouched. */
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
	return 0;
}
|
||||
#endif
|
||||
|
||||
#endif /* CONFIG_SCHED_AUTOGROUP */
|
||||
422
kernel/sched/clock.c
Normal file
422
kernel/sched/clock.c
Normal file
|
|
@ -0,0 +1,422 @@
|
|||
/*
|
||||
* sched_clock for unstable cpu clocks
|
||||
*
|
||||
* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
|
||||
*
|
||||
* Updates and enhancements:
|
||||
* Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
|
||||
*
|
||||
* Based on code by:
|
||||
* Ingo Molnar <mingo@redhat.com>
|
||||
* Guillaume Chazarain <guichaz@gmail.com>
|
||||
*
|
||||
*
|
||||
* What:
|
||||
*
|
||||
* cpu_clock(i) provides a fast (execution time) high resolution
|
||||
* clock with bounded drift between CPUs. The value of cpu_clock(i)
|
||||
* is monotonic for constant i. The timestamp returned is in nanoseconds.
|
||||
*
|
||||
* ######################### BIG FAT WARNING ##########################
|
||||
* # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
|
||||
* # go backwards !! #
|
||||
* ####################################################################
|
||||
*
|
||||
* There is no strict promise about the base, although it tends to start
|
||||
* at 0 on boot (but people really shouldn't rely on that).
|
||||
*
|
||||
* cpu_clock(i) -- can be used from any context, including NMI.
|
||||
* local_clock() -- is cpu_clock() on the current cpu.
|
||||
*
|
||||
* sched_clock_cpu(i)
|
||||
*
|
||||
* How:
|
||||
*
|
||||
* The implementation either uses sched_clock() when
|
||||
* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
|
||||
* sched_clock() is assumed to provide these properties (mostly it means
|
||||
* the architecture provides a globally synchronized highres time source).
|
||||
*
|
||||
* Otherwise it tries to create a semi stable clock from a mixture of other
|
||||
* clocks, including:
|
||||
*
|
||||
* - GTOD (clock monotonic)
|
||||
* - sched_clock()
|
||||
* - explicit idle events
|
||||
*
|
||||
* We use GTOD as base and use sched_clock() deltas to improve resolution. The
|
||||
* deltas are filtered to provide monotonicity and keeping it within an
|
||||
* expected window.
|
||||
*
|
||||
* Furthermore, explicit sleep and wakeup hooks allow us to account for time
|
||||
* that is otherwise invisible (TSC gets stopped).
|
||||
*
|
||||
*/
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/hardirq.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/ktime.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/static_key.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/compiler.h>
|
||||
|
||||
/*
|
||||
* Scheduler clock - returns current time in nanosec units.
|
||||
* This is default implementation.
|
||||
* Architectures and sub-architectures can override this.
|
||||
*/
|
||||
unsigned long long __weak sched_clock(void)
{
	/* Jiffies elapsed since the initial jiffies value. */
	unsigned long long ticks =
		(unsigned long long)(jiffies - INITIAL_JIFFIES);

	/* One jiffy lasts NSEC_PER_SEC / HZ nanoseconds. */
	return ticks * (NSEC_PER_SEC / HZ);
}
|
||||
EXPORT_SYMBOL_GPL(sched_clock);
|
||||
|
||||
/*
 * Set to 1 by sched_clock_init().  While it is still 0, sched_clock_cpu()
 * reports 0 and {set,clear}_sched_clock_stable() only record the request.
 */
__read_mostly int sched_clock_running;
|
||||
|
||||
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
|
||||
/* Static key behind sched_clock_stable(); changed only via the helpers below. */
static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
|
||||
/*
 * Latest stability request (1 = stable, 0 = unstable).  sched_clock_init()
 * reads it to apply a request that arrived before the clock was running.
 */
static int __sched_clock_stable_early;
|
||||
|
||||
int sched_clock_stable(void)
|
||||
{
|
||||
return static_key_false(&__sched_clock_stable);
|
||||
}
|
||||
|
||||
static void __set_sched_clock_stable(void)
|
||||
{
|
||||
if (!sched_clock_stable())
|
||||
static_key_slow_inc(&__sched_clock_stable);
|
||||
}
|
||||
|
||||
void set_sched_clock_stable(void)
|
||||
{
|
||||
__sched_clock_stable_early = 1;
|
||||
|
||||
smp_mb(); /* matches sched_clock_init() */
|
||||
|
||||
if (!sched_clock_running)
|
||||
return;
|
||||
|
||||
__set_sched_clock_stable();
|
||||
}
|
||||
|
||||
static void __clear_sched_clock_stable(struct work_struct *work)
|
||||
{
|
||||
/* XXX worry about clock continuity */
|
||||
if (sched_clock_stable())
|
||||
static_key_slow_dec(&__sched_clock_stable);
|
||||
}
|
||||
|
||||
static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
|
||||
|
||||
void clear_sched_clock_stable(void)
|
||||
{
|
||||
__sched_clock_stable_early = 0;
|
||||
|
||||
smp_mb(); /* matches sched_clock_init() */
|
||||
|
||||
if (!sched_clock_running)
|
||||
return;
|
||||
|
||||
schedule_work(&sched_clock_work);
|
||||
}
|
||||
|
||||
struct sched_clock_data {
|
||||
u64 tick_raw;
|
||||
u64 tick_gtod;
|
||||
u64 clock;
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
|
||||
|
||||
static inline struct sched_clock_data *this_scd(void)
|
||||
{
|
||||
return this_cpu_ptr(&sched_clock_data);
|
||||
}
|
||||
|
||||
static inline struct sched_clock_data *cpu_sdc(int cpu)
|
||||
{
|
||||
return &per_cpu(sched_clock_data, cpu);
|
||||
}
|
||||
|
||||
void sched_clock_init(void)
|
||||
{
|
||||
u64 ktime_now = ktime_to_ns(ktime_get());
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct sched_clock_data *scd = cpu_sdc(cpu);
|
||||
|
||||
scd->tick_raw = 0;
|
||||
scd->tick_gtod = ktime_now;
|
||||
scd->clock = ktime_now;
|
||||
}
|
||||
|
||||
sched_clock_running = 1;
|
||||
|
||||
/*
|
||||
* Ensure that it is impossible to not do a static_key update.
|
||||
*
|
||||
* Either {set,clear}_sched_clock_stable() must see sched_clock_running
|
||||
* and do the update, or we must see their __sched_clock_stable_early
|
||||
* and do the update, or both.
|
||||
*/
|
||||
smp_mb(); /* matches {set,clear}_sched_clock_stable() */
|
||||
|
||||
if (__sched_clock_stable_early)
|
||||
__set_sched_clock_stable();
|
||||
else
|
||||
__clear_sched_clock_stable(NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* min, max except they take wrapping into account
|
||||
*/
|
||||
|
||||
/* Wrap-aware min: @x wins only when the signed difference x - y is negative. */
static inline u64 wrap_min(u64 x, u64 y)
{
	s64 diff = (s64)(x - y);

	if (diff < 0)
		return x;

	return y;
}
|
||||
|
||||
/* Wrap-aware max: @x wins only when the signed difference x - y is positive. */
static inline u64 wrap_max(u64 x, u64 y)
{
	s64 diff = (s64)(x - y);

	if (diff > 0)
		return x;

	return y;
}
|
||||
|
||||
/*
|
||||
* update the percpu scd from the raw @now value
|
||||
*
|
||||
* - filter out backward motion
|
||||
* - use the GTOD tick value to create a window to filter crazy TSC values
|
||||
*/
|
||||
static u64 sched_clock_local(struct sched_clock_data *scd)
|
||||
{
|
||||
u64 now, clock, old_clock, min_clock, max_clock;
|
||||
s64 delta;
|
||||
|
||||
again:
|
||||
now = sched_clock();
|
||||
delta = now - scd->tick_raw;
|
||||
if (unlikely(delta < 0))
|
||||
delta = 0;
|
||||
|
||||
old_clock = scd->clock;
|
||||
|
||||
/*
|
||||
* scd->clock = clamp(scd->tick_gtod + delta,
|
||||
* max(scd->tick_gtod, scd->clock),
|
||||
* scd->tick_gtod + TICK_NSEC);
|
||||
*/
|
||||
|
||||
clock = scd->tick_gtod + delta;
|
||||
min_clock = wrap_max(scd->tick_gtod, old_clock);
|
||||
max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
|
||||
|
||||
clock = wrap_max(clock, min_clock);
|
||||
clock = wrap_min(clock, max_clock);
|
||||
|
||||
if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
|
||||
goto again;
|
||||
|
||||
return clock;
|
||||
}
|
||||
|
||||
static u64 sched_clock_remote(struct sched_clock_data *scd)
|
||||
{
|
||||
struct sched_clock_data *my_scd = this_scd();
|
||||
u64 this_clock, remote_clock;
|
||||
u64 *ptr, old_val, val;
|
||||
|
||||
#if BITS_PER_LONG != 64
|
||||
again:
|
||||
/*
|
||||
* Careful here: The local and the remote clock values need to
|
||||
* be read out atomic as we need to compare the values and
|
||||
* then update either the local or the remote side. So the
|
||||
* cmpxchg64 below only protects one readout.
|
||||
*
|
||||
* We must reread via sched_clock_local() in the retry case on
|
||||
* 32bit as an NMI could use sched_clock_local() via the
|
||||
* tracer and hit between the readout of
|
||||
* the low32bit and the high 32bit portion.
|
||||
*/
|
||||
this_clock = sched_clock_local(my_scd);
|
||||
/*
|
||||
* We must enforce atomic readout on 32bit, otherwise the
|
||||
* update on the remote cpu can hit inbetween the readout of
|
||||
* the low32bit and the high 32bit portion.
|
||||
*/
|
||||
remote_clock = cmpxchg64(&scd->clock, 0, 0);
|
||||
#else
|
||||
/*
|
||||
* On 64bit the read of [my]scd->clock is atomic versus the
|
||||
* update, so we can avoid the above 32bit dance.
|
||||
*/
|
||||
sched_clock_local(my_scd);
|
||||
again:
|
||||
this_clock = my_scd->clock;
|
||||
remote_clock = scd->clock;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Use the opportunity that we have both locks
|
||||
* taken to couple the two clocks: we take the
|
||||
* larger time as the latest time for both
|
||||
* runqueues. (this creates monotonic movement)
|
||||
*/
|
||||
if (likely((s64)(remote_clock - this_clock) < 0)) {
|
||||
ptr = &scd->clock;
|
||||
old_val = remote_clock;
|
||||
val = this_clock;
|
||||
} else {
|
||||
/*
|
||||
* Should be rare, but possible:
|
||||
*/
|
||||
ptr = &my_scd->clock;
|
||||
old_val = this_clock;
|
||||
val = remote_clock;
|
||||
}
|
||||
|
||||
if (cmpxchg64(ptr, old_val, val) != old_val)
|
||||
goto again;
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
/*
|
||||
* Similar to cpu_clock(), but requires local IRQs to be disabled.
|
||||
*
|
||||
* See cpu_clock().
|
||||
*/
|
||||
u64 sched_clock_cpu(int cpu)
|
||||
{
|
||||
struct sched_clock_data *scd;
|
||||
u64 clock;
|
||||
|
||||
if (sched_clock_stable())
|
||||
return sched_clock();
|
||||
|
||||
if (unlikely(!sched_clock_running))
|
||||
return 0ull;
|
||||
|
||||
preempt_disable_notrace();
|
||||
scd = cpu_sdc(cpu);
|
||||
|
||||
if (cpu != smp_processor_id())
|
||||
clock = sched_clock_remote(scd);
|
||||
else
|
||||
clock = sched_clock_local(scd);
|
||||
preempt_enable_notrace();
|
||||
|
||||
return clock;
|
||||
}
|
||||
|
||||
void sched_clock_tick(void)
|
||||
{
|
||||
struct sched_clock_data *scd;
|
||||
u64 now, now_gtod;
|
||||
|
||||
if (sched_clock_stable())
|
||||
return;
|
||||
|
||||
if (unlikely(!sched_clock_running))
|
||||
return;
|
||||
|
||||
WARN_ON_ONCE(!irqs_disabled());
|
||||
|
||||
scd = this_scd();
|
||||
now_gtod = ktime_to_ns(ktime_get());
|
||||
now = sched_clock();
|
||||
|
||||
scd->tick_raw = now;
|
||||
scd->tick_gtod = now_gtod;
|
||||
sched_clock_local(scd);
|
||||
}
|
||||
|
||||
/*
|
||||
* We are going deep-idle (irqs are disabled):
|
||||
*/
|
||||
void sched_clock_idle_sleep_event(void)
{
	int this_cpu = smp_processor_id();

	/* Update this cpu's clock one last time; the value is not needed. */
	sched_clock_cpu(this_cpu);
}
|
||||
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
|
||||
|
||||
/*
|
||||
* We just idled delta nanoseconds (called with irqs disabled):
|
||||
*/
|
||||
void sched_clock_idle_wakeup_event(u64 delta_ns)
{
	/* @delta_ns is not used; the clock is resynced from a fresh tick. */
	if (!timekeeping_suspended) {
		sched_clock_tick();
		touch_softlockup_watchdog();
	}
}
|
||||
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
|
||||
|
||||
/*
|
||||
* As outlined at the top, provides a fast, high resolution, nanosecond
|
||||
* time source that is monotonic per cpu argument and has bounded drift
|
||||
* between cpus.
|
||||
*
|
||||
* ######################### BIG FAT WARNING ##########################
|
||||
* # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
|
||||
* # go backwards !! #
|
||||
* ####################################################################
|
||||
*/
|
||||
u64 cpu_clock(int cpu)
{
	/* A stable clock is read directly, otherwise use the filtered one. */
	if (sched_clock_stable())
		return sched_clock();

	return sched_clock_cpu(cpu);
}
|
||||
|
||||
/*
|
||||
* Similar to cpu_clock() for the current cpu. Time will only be observed
|
||||
* to be monotonic if care is taken to only compare timestamps taken on the
|
||||
* same CPU.
|
||||
*
|
||||
* See cpu_clock().
|
||||
*/
|
||||
u64 local_clock(void)
{
	/* A stable clock is read directly, otherwise use the filtered one. */
	if (sched_clock_stable())
		return sched_clock();

	return sched_clock_cpu(raw_smp_processor_id());
}
|
||||
|
||||
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
|
||||
|
||||
void sched_clock_init(void)
|
||||
{
|
||||
sched_clock_running = 1;
|
||||
}
|
||||
|
||||
u64 sched_clock_cpu(int cpu)
|
||||
{
|
||||
if (unlikely(!sched_clock_running))
|
||||
return 0;
|
||||
|
||||
return sched_clock();
|
||||
}
|
||||
|
||||
/* In this configuration every cpu reads sched_clock(); @cpu is ignored. */
u64 cpu_clock(int cpu)
{
	u64 now = sched_clock();

	return now;
}
|
||||
|
||||
/* In this configuration the local clock is sched_clock() itself. */
u64 local_clock(void)
{
	u64 now = sched_clock();

	return now;
}
|
||||
|
||||
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
|
||||
|
||||
EXPORT_SYMBOL_GPL(cpu_clock);
|
||||
EXPORT_SYMBOL_GPL(local_clock);
|
||||
299
kernel/sched/completion.c
Normal file
299
kernel/sched/completion.c
Normal file
|
|
@ -0,0 +1,299 @@
|
|||
/*
|
||||
* Generic wait-for-completion handler;
|
||||
*
|
||||
* It differs from semaphores in that their default case is the opposite,
|
||||
* wait_for_completion default blocks whereas semaphore default non-block. The
|
||||
* interface also makes it easy to 'complete' multiple waiting threads,
|
||||
* something which isn't entirely natural for semaphores.
|
||||
*
|
||||
* But more importantly, the primitive documents the usage. Semaphores would
|
||||
* typically be used for exclusion which gives rise to priority inversion.
|
||||
* Waiting for completion is a typically sync point, but not an exclusion point.
|
||||
*/
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include <linux/completion.h>
|
||||
|
||||
/**
|
||||
* complete: - signals a single thread waiting on this completion
|
||||
* @x: holds the state of this particular completion
|
||||
*
|
||||
* This will wake up a single thread waiting on this completion. Threads will be
|
||||
* awakened in the same order in which they were queued.
|
||||
*
|
||||
* See also complete_all(), wait_for_completion() and related routines.
|
||||
*
|
||||
* It may be assumed that this function implies a write memory barrier before
|
||||
* changing the task state if and only if any tasks are woken up.
|
||||
*/
|
||||
void complete(struct completion *x)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&x->wait.lock, flags);
|
||||
x->done++;
|
||||
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
|
||||
spin_unlock_irqrestore(&x->wait.lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(complete);
|
||||
|
||||
/**
|
||||
* complete_all: - signals all threads waiting on this completion
|
||||
* @x: holds the state of this particular completion
|
||||
*
|
||||
* This will wake up all threads waiting on this particular completion event.
|
||||
*
|
||||
* It may be assumed that this function implies a write memory barrier before
|
||||
* changing the task state if and only if any tasks are woken up.
|
||||
*/
|
||||
void complete_all(struct completion *x)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&x->wait.lock, flags);
|
||||
x->done += UINT_MAX/2;
|
||||
__wake_up_locked(&x->wait, TASK_NORMAL, 0);
|
||||
spin_unlock_irqrestore(&x->wait.lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(complete_all);
|
||||
|
||||
static inline long __sched
|
||||
do_wait_for_common(struct completion *x,
|
||||
long (*action)(long), long timeout, int state)
|
||||
{
|
||||
if (!x->done) {
|
||||
DECLARE_WAITQUEUE(wait, current);
|
||||
|
||||
__add_wait_queue_tail_exclusive(&x->wait, &wait);
|
||||
do {
|
||||
if (signal_pending_state(state, current)) {
|
||||
timeout = -ERESTARTSYS;
|
||||
break;
|
||||
}
|
||||
__set_current_state(state);
|
||||
spin_unlock_irq(&x->wait.lock);
|
||||
timeout = action(timeout);
|
||||
spin_lock_irq(&x->wait.lock);
|
||||
} while (!x->done && timeout);
|
||||
__remove_wait_queue(&x->wait, &wait);
|
||||
if (!x->done)
|
||||
return timeout;
|
||||
}
|
||||
x->done--;
|
||||
return timeout ?: 1;
|
||||
}
|
||||
|
||||
static inline long __sched
|
||||
__wait_for_common(struct completion *x,
|
||||
long (*action)(long), long timeout, int state)
|
||||
{
|
||||
might_sleep();
|
||||
|
||||
spin_lock_irq(&x->wait.lock);
|
||||
timeout = do_wait_for_common(x, action, timeout, state);
|
||||
spin_unlock_irq(&x->wait.lock);
|
||||
return timeout;
|
||||
}
|
||||
|
||||
static long __sched
|
||||
wait_for_common(struct completion *x, long timeout, int state)
|
||||
{
|
||||
return __wait_for_common(x, schedule_timeout, timeout, state);
|
||||
}
|
||||
|
||||
static long __sched
|
||||
wait_for_common_io(struct completion *x, long timeout, int state)
|
||||
{
|
||||
return __wait_for_common(x, io_schedule_timeout, timeout, state);
|
||||
}
|
||||
|
||||
/**
|
||||
* wait_for_completion: - waits for completion of a task
|
||||
* @x: holds the state of this particular completion
|
||||
*
|
||||
* This waits to be signaled for completion of a specific task. It is NOT
|
||||
* interruptible and there is no timeout.
|
||||
*
|
||||
* See also similar routines (i.e. wait_for_completion_timeout()) with timeout
|
||||
* and interrupt capability. Also see complete().
|
||||
*/
|
||||
void __sched wait_for_completion(struct completion *x)
{
	/* No timeout and no signals; the result carries no information. */
	(void)wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
|
||||
EXPORT_SYMBOL(wait_for_completion);
|
||||
|
||||
/**
|
||||
* wait_for_completion_timeout: - waits for completion of a task (w/timeout)
|
||||
* @x: holds the state of this particular completion
|
||||
* @timeout: timeout value in jiffies
|
||||
*
|
||||
* This waits for either a completion of a specific task to be signaled or for a
|
||||
* specified timeout to expire. The timeout is in jiffies. It is not
|
||||
* interruptible.
|
||||
*
|
||||
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left
|
||||
* till timeout) if completed.
|
||||
*/
|
||||
unsigned long __sched
|
||||
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
|
||||
{
|
||||
return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
|
||||
}
|
||||
EXPORT_SYMBOL(wait_for_completion_timeout);
|
||||
|
||||
/**
|
||||
* wait_for_completion_io: - waits for completion of a task
|
||||
* @x: holds the state of this particular completion
|
||||
*
|
||||
* This waits to be signaled for completion of a specific task. It is NOT
|
||||
* interruptible and there is no timeout. The caller is accounted as waiting
|
||||
* for IO.
|
||||
*/
|
||||
void __sched wait_for_completion_io(struct completion *x)
{
	/* No timeout and no signals; the result carries no information. */
	(void)wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
|
||||
EXPORT_SYMBOL(wait_for_completion_io);
|
||||
|
||||
/**
|
||||
* wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
|
||||
* @x: holds the state of this particular completion
|
||||
* @timeout: timeout value in jiffies
|
||||
*
|
||||
* This waits for either a completion of a specific task to be signaled or for a
|
||||
* specified timeout to expire. The timeout is in jiffies. It is not
|
||||
* interruptible. The caller is accounted as waiting for IO.
|
||||
*
|
||||
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left
|
||||
* till timeout) if completed.
|
||||
*/
|
||||
unsigned long __sched
|
||||
wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
|
||||
{
|
||||
return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
|
||||
}
|
||||
EXPORT_SYMBOL(wait_for_completion_io_timeout);
|
||||
|
||||
/**
|
||||
* wait_for_completion_interruptible: - waits for completion of a task (w/intr)
|
||||
* @x: holds the state of this particular completion
|
||||
*
|
||||
* This waits for completion of a specific task to be signaled. It is
|
||||
* interruptible.
|
||||
*
|
||||
* Return: -ERESTARTSYS if interrupted, 0 if completed.
|
||||
*/
|
||||
int __sched wait_for_completion_interruptible(struct completion *x)
{
	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);

	/* Anything other than an interruption counts as completed. */
	return (t == -ERESTARTSYS) ? t : 0;
}
|
||||
EXPORT_SYMBOL(wait_for_completion_interruptible);
|
||||
|
||||
/**
|
||||
* wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
|
||||
* @x: holds the state of this particular completion
|
||||
* @timeout: timeout value in jiffies
|
||||
*
|
||||
* This waits for either a completion of a specific task to be signaled or for a
|
||||
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
|
||||
*
|
||||
* Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
|
||||
* or number of jiffies left till timeout) if completed.
|
||||
*/
|
||||
long __sched
|
||||
wait_for_completion_interruptible_timeout(struct completion *x,
|
||||
unsigned long timeout)
|
||||
{
|
||||
return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
|
||||
}
|
||||
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
|
||||
|
||||
/**
|
||||
* wait_for_completion_killable: - waits for completion of a task (killable)
|
||||
* @x: holds the state of this particular completion
|
||||
*
|
||||
* This waits to be signaled for completion of a specific task. It can be
|
||||
* interrupted by a kill signal.
|
||||
*
|
||||
* Return: -ERESTARTSYS if interrupted, 0 if completed.
|
||||
*/
|
||||
int __sched wait_for_completion_killable(struct completion *x)
|
||||
{
|
||||
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
|
||||
if (t == -ERESTARTSYS)
|
||||
return t;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(wait_for_completion_killable);
|
||||
|
||||
/**
|
||||
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
|
||||
* @x: holds the state of this particular completion
|
||||
* @timeout: timeout value in jiffies
|
||||
*
|
||||
* This waits for either a completion of a specific task to be
|
||||
* signaled or for a specified timeout to expire. It can be
|
||||
* interrupted by a kill signal. The timeout is in jiffies.
|
||||
*
|
||||
* Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
|
||||
* or number of jiffies left till timeout) if completed.
|
||||
*/
|
||||
long __sched
|
||||
wait_for_completion_killable_timeout(struct completion *x,
|
||||
unsigned long timeout)
|
||||
{
|
||||
return wait_for_common(x, timeout, TASK_KILLABLE);
|
||||
}
|
||||
EXPORT_SYMBOL(wait_for_completion_killable_timeout);
|
||||
|
||||
/**
|
||||
* try_wait_for_completion - try to decrement a completion without blocking
|
||||
* @x: completion structure
|
||||
*
|
||||
* Return: 0 if a decrement cannot be done without blocking
|
||||
* 1 if a decrement succeeded.
|
||||
*
|
||||
* If a completion is being used as a counting completion,
|
||||
* attempt to decrement the counter without blocking. This
|
||||
* enables us to avoid waiting if the resource the completion
|
||||
* is protecting is not available.
|
||||
*/
|
||||
bool try_wait_for_completion(struct completion *x)
|
||||
{
|
||||
unsigned long flags;
|
||||
int ret = 1;
|
||||
|
||||
spin_lock_irqsave(&x->wait.lock, flags);
|
||||
if (!x->done)
|
||||
ret = 0;
|
||||
else
|
||||
x->done--;
|
||||
spin_unlock_irqrestore(&x->wait.lock, flags);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(try_wait_for_completion);
|
||||
|
||||
/**
|
||||
* completion_done - Test to see if a completion has any waiters
|
||||
* @x: completion structure
|
||||
*
|
||||
* Return: 0 if there are waiters (wait_for_completion() in progress)
|
||||
* 1 if there are no waiters.
|
||||
*
|
||||
*/
|
||||
bool completion_done(struct completion *x)
|
||||
{
|
||||
unsigned long flags;
|
||||
int ret = 1;
|
||||
|
||||
spin_lock_irqsave(&x->wait.lock, flags);
|
||||
if (!x->done)
|
||||
ret = 0;
|
||||
spin_unlock_irqrestore(&x->wait.lock, flags);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(completion_done);
|
||||
8311
kernel/sched/core.c
Normal file
8311
kernel/sched/core.c
Normal file
File diff suppressed because it is too large
Load diff
283
kernel/sched/cpuacct.c
Normal file
283
kernel/sched/cpuacct.c
Normal file
|
|
@ -0,0 +1,283 @@
|
|||
#include <linux/cgroup.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/cpumask.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/kernel_stat.h>
|
||||
#include <linux/err.h>
|
||||
|
||||
#include "sched.h"
|
||||
|
||||
/*
|
||||
* CPU accounting code for task groups.
|
||||
*
|
||||
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
|
||||
* (balbir@in.ibm.com).
|
||||
*/
|
||||
|
||||
/*
 * Index of each statistic reported for a cpu accounting group: time spent
 * by the group's tasks executing in the given mode.
 */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER = 0,	/* user mode */
	CPUACCT_STAT_SYSTEM,	/* kernel mode */

	/* Number of statistics above; keep last. */
	CPUACCT_STAT_NSTATS,
};
|
||||
|
||||
/* track cpu usage of a group of tasks and its child groups */
|
||||
struct cpuacct {
|
||||
struct cgroup_subsys_state css;
|
||||
/* cpuusage holds pointer to a u64-type object on every cpu */
|
||||
u64 __percpu *cpuusage;
|
||||
struct kernel_cpustat __percpu *cpustat;
|
||||
};
|
||||
|
||||
static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
|
||||
{
|
||||
return css ? container_of(css, struct cpuacct, css) : NULL;
|
||||
}
|
||||
|
||||
/* return cpu accounting group to which this task belongs */
|
||||
static inline struct cpuacct *task_ca(struct task_struct *tsk)
|
||||
{
|
||||
return css_ca(task_css(tsk, cpuacct_cgrp_id));
|
||||
}
|
||||
|
||||
static inline struct cpuacct *parent_ca(struct cpuacct *ca)
|
||||
{
|
||||
return css_ca(ca->css.parent);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
|
||||
static struct cpuacct root_cpuacct = {
|
||||
.cpustat = &kernel_cpustat,
|
||||
.cpuusage = &root_cpuacct_cpuusage,
|
||||
};
|
||||
|
||||
/* create a new cpu accounting group */
|
||||
static struct cgroup_subsys_state *
|
||||
cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
{
|
||||
struct cpuacct *ca;
|
||||
|
||||
if (!parent_css)
|
||||
return &root_cpuacct.css;
|
||||
|
||||
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
|
||||
if (!ca)
|
||||
goto out;
|
||||
|
||||
ca->cpuusage = alloc_percpu(u64);
|
||||
if (!ca->cpuusage)
|
||||
goto out_free_ca;
|
||||
|
||||
ca->cpustat = alloc_percpu(struct kernel_cpustat);
|
||||
if (!ca->cpustat)
|
||||
goto out_free_cpuusage;
|
||||
|
||||
return &ca->css;
|
||||
|
||||
out_free_cpuusage:
|
||||
free_percpu(ca->cpuusage);
|
||||
out_free_ca:
|
||||
kfree(ca);
|
||||
out:
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
/* destroy an existing cpu accounting group */
|
||||
static void cpuacct_css_free(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct cpuacct *ca = css_ca(css);
|
||||
|
||||
free_percpu(ca->cpustat);
|
||||
free_percpu(ca->cpuusage);
|
||||
kfree(ca);
|
||||
}
|
||||
|
||||
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
|
||||
{
|
||||
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
|
||||
u64 data;
|
||||
|
||||
#ifndef CONFIG_64BIT
|
||||
/*
|
||||
* Take rq->lock to make 64-bit read safe on 32-bit platforms.
|
||||
*/
|
||||
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
|
||||
data = *cpuusage;
|
||||
raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
|
||||
#else
|
||||
data = *cpuusage;
|
||||
#endif
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
/* Overwrite the usage counter of group @ca for one @cpu with @val. */
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
	u64 *counter = per_cpu_ptr(ca->cpuusage, cpu);

#ifdef CONFIG_64BIT
	*counter = val;
#else
	/*
	 * Take rq->lock to make the 64-bit write safe on 32-bit platforms.
	 */
	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
	*counter = val;
	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#endif
}
|
||||
|
||||
/* return total cpu usage (in nanoseconds) of a group */
|
||||
static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
|
||||
{
|
||||
struct cpuacct *ca = css_ca(css);
|
||||
u64 totalcpuusage = 0;
|
||||
int i;
|
||||
|
||||
for_each_present_cpu(i)
|
||||
totalcpuusage += cpuacct_cpuusage_read(ca, i);
|
||||
|
||||
return totalcpuusage;
|
||||
}
|
||||
|
||||
static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
|
||||
u64 reset)
|
||||
{
|
||||
struct cpuacct *ca = css_ca(css);
|
||||
int err = 0;
|
||||
int i;
|
||||
|
||||
if (reset) {
|
||||
err = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for_each_present_cpu(i)
|
||||
cpuacct_cpuusage_write(ca, i, 0);
|
||||
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
|
||||
{
|
||||
struct cpuacct *ca = css_ca(seq_css(m));
|
||||
u64 percpu;
|
||||
int i;
|
||||
|
||||
for_each_present_cpu(i) {
|
||||
percpu = cpuacct_cpuusage_read(ca, i);
|
||||
seq_printf(m, "%llu ", (unsigned long long) percpu);
|
||||
}
|
||||
seq_printf(m, "\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const char * const cpuacct_stat_desc[] = {
|
||||
[CPUACCT_STAT_USER] = "user",
|
||||
[CPUACCT_STAT_SYSTEM] = "system",
|
||||
};
|
||||
|
||||
static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct cpuacct *ca = css_ca(seq_css(sf));
|
||||
int cpu;
|
||||
s64 val = 0;
|
||||
|
||||
for_each_online_cpu(cpu) {
|
||||
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
|
||||
val += kcpustat->cpustat[CPUTIME_USER];
|
||||
val += kcpustat->cpustat[CPUTIME_NICE];
|
||||
}
|
||||
val = cputime64_to_clock_t(val);
|
||||
seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
|
||||
|
||||
val = 0;
|
||||
for_each_online_cpu(cpu) {
|
||||
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
|
||||
val += kcpustat->cpustat[CPUTIME_SYSTEM];
|
||||
val += kcpustat->cpustat[CPUTIME_IRQ];
|
||||
val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
|
||||
}
|
||||
|
||||
val = cputime64_to_clock_t(val);
|
||||
seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct cftype files[] = {
|
||||
{
|
||||
.name = "usage",
|
||||
.read_u64 = cpuusage_read,
|
||||
.write_u64 = cpuusage_write,
|
||||
},
|
||||
{
|
||||
.name = "usage_percpu",
|
||||
.seq_show = cpuacct_percpu_seq_show,
|
||||
},
|
||||
{
|
||||
.name = "stat",
|
||||
.seq_show = cpuacct_stats_show,
|
||||
},
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
/*
|
||||
* charge this task's execution time to its accounting group.
|
||||
*
|
||||
* called with rq->lock held.
|
||||
*/
|
||||
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
|
||||
{
|
||||
struct cpuacct *ca;
|
||||
int cpu;
|
||||
|
||||
cpu = task_cpu(tsk);
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
ca = task_ca(tsk);
|
||||
|
||||
while (true) {
|
||||
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
|
||||
*cpuusage += cputime;
|
||||
|
||||
ca = parent_ca(ca);
|
||||
if (!ca)
|
||||
break;
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
|
||||
* Add user/system time to cpuacct.
|
||||
*
|
||||
* Note: it's the caller that updates the account of the root cgroup.
|
||||
*/
|
||||
void cpuacct_account_field(struct task_struct *p, int index, u64 val)
|
||||
{
|
||||
struct kernel_cpustat *kcpustat;
|
||||
struct cpuacct *ca;
|
||||
|
||||
rcu_read_lock();
|
||||
ca = task_ca(p);
|
||||
while (ca != &root_cpuacct) {
|
||||
kcpustat = this_cpu_ptr(ca->cpustat);
|
||||
kcpustat->cpustat[index] += val;
|
||||
ca = parent_ca(ca);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
struct cgroup_subsys cpuacct_cgrp_subsys = {
|
||||
.css_alloc = cpuacct_css_alloc,
|
||||
.css_free = cpuacct_css_free,
|
||||
.legacy_cftypes = files,
|
||||
.early_init = 1,
|
||||
};
|
||||
17
kernel/sched/cpuacct.h
Normal file
17
kernel/sched/cpuacct.h
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
#ifdef CONFIG_CGROUP_CPUACCT
|
||||
|
||||
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
|
||||
extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
|
||||
|
||||
#else
|
||||
|
||||
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
cpuacct_account_field(struct task_struct *p, int index, u64 val)
|
||||
{
|
||||
}
|
||||
|
||||
#endif
|
||||
227
kernel/sched/cpudeadline.c
Normal file
227
kernel/sched/cpudeadline.c
Normal file
|
|
@ -0,0 +1,227 @@
|
|||
/*
|
||||
* kernel/sched/cpudeadline.c
|
||||
*
|
||||
* Global CPU deadline management
|
||||
*
|
||||
* Author: Juri Lelli <j.lelli@sssup.it>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; version 2
|
||||
* of the License.
|
||||
*/
|
||||
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/slab.h>
|
||||
#include "cpudeadline.h"
|
||||
|
||||
/* Heap index of the parent of the node stored at index @i. */
static inline int parent(int i)
{
	int above = i - 1;

	return above >> 1;
}
|
||||
|
||||
/* Heap index of the left child of the node stored at index @i. */
static inline int left_child(int i)
{
	int doubled = i << 1;

	return doubled + 1;
}
|
||||
|
||||
/* Heap index of the right child of the node stored at index @i. */
static inline int right_child(int i)
{
	int doubled = i << 1;

	return doubled + 2;
}
|
||||
|
||||
/*
 * Deadline comparison: non-zero when @a is before @b, judged by the sign of
 * their difference interpreted as a signed 64-bit value.
 */
static inline int dl_time_before(u64 a, u64 b)
{
	s64 delta = (s64)(a - b);

	return delta < 0;
}
|
||||
|
||||
static void cpudl_exchange(struct cpudl *cp, int a, int b)
|
||||
{
|
||||
int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
|
||||
|
||||
swap(cp->elements[a].cpu, cp->elements[b].cpu);
|
||||
swap(cp->elements[a].dl , cp->elements[b].dl );
|
||||
|
||||
swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
|
||||
}
|
||||
|
||||
static void cpudl_heapify(struct cpudl *cp, int idx)
|
||||
{
|
||||
int l, r, largest;
|
||||
|
||||
/* adapted from lib/prio_heap.c */
|
||||
while(1) {
|
||||
l = left_child(idx);
|
||||
r = right_child(idx);
|
||||
largest = idx;
|
||||
|
||||
if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
|
||||
cp->elements[l].dl))
|
||||
largest = l;
|
||||
if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
|
||||
cp->elements[r].dl))
|
||||
largest = r;
|
||||
if (largest == idx)
|
||||
break;
|
||||
|
||||
/* Push idx down the heap one level and bump one up */
|
||||
cpudl_exchange(cp, largest, idx);
|
||||
idx = largest;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * cpudl_change_key - set a new deadline on a heap entry and restore order
 * @cp: the cpudl max-heap context
 * @idx: heap slot (NOT a CPU number) of the entry to update
 * @new_dl: the new deadline for that entry
 *
 * If the deadline moved earlier the entry is sifted down, otherwise it is
 * bubbled up towards the root.
 *
 * The sanity checks validate @idx as a heap slot and then test the CPU
 * recorded in that slot; the previous code passed the slot index itself to
 * cpu_present(), mixing up the heap-slot and CPU-number index spaces. An
 * out-of-range slot is rejected instead of being dereferenced.
 */
static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
{
	bool moved_earlier;

	/* IDX_INVALID is negative, so it is covered by the idx < 0 test. */
	if (WARN_ON(idx < 0 || idx >= cp->size))
		return;
	WARN_ON(!cpu_present(cp->elements[idx].cpu));

	/* Compare against the old key before overwriting it. */
	moved_earlier = dl_time_before(new_dl, cp->elements[idx].dl);
	cp->elements[idx].dl = new_dl;

	if (moved_earlier) {
		cpudl_heapify(cp, idx);
		return;
	}

	while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
					 cp->elements[idx].dl)) {
		cpudl_exchange(cp, idx, parent(idx));
		idx = parent(idx);
	}
}
|
||||
|
||||
static inline int cpudl_maximum(struct cpudl *cp)
|
||||
{
|
||||
return cp->elements[0].cpu;
|
||||
}
|
||||
|
||||
/*
|
||||
* cpudl_find - find the best (later-dl) CPU in the system
|
||||
* @cp: the cpudl max-heap context
|
||||
* @p: the task
|
||||
* @later_mask: a mask to fill in with the selected CPUs (or NULL)
|
||||
*
|
||||
* Returns: int - best CPU (heap maximum if suitable)
|
||||
*/
|
||||
int cpudl_find(struct cpudl *cp, struct task_struct *p,
|
||||
struct cpumask *later_mask)
|
||||
{
|
||||
int best_cpu = -1;
|
||||
const struct sched_dl_entity *dl_se = &p->dl;
|
||||
|
||||
if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
|
||||
best_cpu = cpumask_any(later_mask);
|
||||
goto out;
|
||||
} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
|
||||
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
|
||||
best_cpu = cpudl_maximum(cp);
|
||||
if (later_mask)
|
||||
cpumask_set_cpu(best_cpu, later_mask);
|
||||
}
|
||||
|
||||
out:
|
||||
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
|
||||
|
||||
return best_cpu;
|
||||
}
|
||||
|
||||
/*
|
||||
* cpudl_set - update the cpudl max-heap
|
||||
* @cp: the cpudl max-heap context
|
||||
* @cpu: the target cpu
|
||||
* @dl: the new earliest deadline for this cpu
|
||||
*
|
||||
* Notes: assumes cpu_rq(cpu)->lock is locked
|
||||
*
|
||||
* Returns: (void)
|
||||
*/
|
||||
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
|
||||
{
|
||||
int old_idx, new_cpu;
|
||||
unsigned long flags;
|
||||
|
||||
WARN_ON(!cpu_present(cpu));
|
||||
|
||||
raw_spin_lock_irqsave(&cp->lock, flags);
|
||||
old_idx = cp->elements[cpu].idx;
|
||||
if (!is_valid) {
|
||||
/* remove item */
|
||||
if (old_idx == IDX_INVALID) {
|
||||
/*
|
||||
* Nothing to remove if old_idx was invalid.
|
||||
* This could happen if a rq_offline_dl is
|
||||
* called for a CPU without -dl tasks running.
|
||||
*/
|
||||
goto out;
|
||||
}
|
||||
new_cpu = cp->elements[cp->size - 1].cpu;
|
||||
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
|
||||
cp->elements[old_idx].cpu = new_cpu;
|
||||
cp->size--;
|
||||
cp->elements[new_cpu].idx = old_idx;
|
||||
cp->elements[cpu].idx = IDX_INVALID;
|
||||
while (old_idx > 0 && dl_time_before(
|
||||
cp->elements[parent(old_idx)].dl,
|
||||
cp->elements[old_idx].dl)) {
|
||||
cpudl_exchange(cp, old_idx, parent(old_idx));
|
||||
old_idx = parent(old_idx);
|
||||
}
|
||||
cpumask_set_cpu(cpu, cp->free_cpus);
|
||||
cpudl_heapify(cp, old_idx);
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (old_idx == IDX_INVALID) {
|
||||
cp->size++;
|
||||
cp->elements[cp->size - 1].dl = 0;
|
||||
cp->elements[cp->size - 1].cpu = cpu;
|
||||
cp->elements[cpu].idx = cp->size - 1;
|
||||
cpudl_change_key(cp, cp->size - 1, dl);
|
||||
cpumask_clear_cpu(cpu, cp->free_cpus);
|
||||
} else {
|
||||
cpudl_change_key(cp, old_idx, dl);
|
||||
}
|
||||
|
||||
out:
|
||||
raw_spin_unlock_irqrestore(&cp->lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* cpudl_init - initialize the cpudl structure
|
||||
* @cp: the cpudl max-heap context
|
||||
*/
|
||||
int cpudl_init(struct cpudl *cp)
|
||||
{
|
||||
int i;
|
||||
|
||||
memset(cp, 0, sizeof(*cp));
|
||||
raw_spin_lock_init(&cp->lock);
|
||||
cp->size = 0;
|
||||
|
||||
cp->elements = kcalloc(nr_cpu_ids,
|
||||
sizeof(struct cpudl_item),
|
||||
GFP_KERNEL);
|
||||
if (!cp->elements)
|
||||
return -ENOMEM;
|
||||
|
||||
if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
|
||||
kfree(cp->elements);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
for_each_possible_cpu(i)
|
||||
cp->elements[i].idx = IDX_INVALID;
|
||||
|
||||
cpumask_setall(cp->free_cpus);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* cpudl_cleanup - clean up the cpudl structure
|
||||
* @cp: the cpudl max-heap context
|
||||
*/
|
||||
void cpudl_cleanup(struct cpudl *cp)
|
||||
{
|
||||
free_cpumask_var(cp->free_cpus);
|
||||
kfree(cp->elements);
|
||||
}
|
||||
33
kernel/sched/cpudeadline.h
Normal file
33
kernel/sched/cpudeadline.h
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
#ifndef _LINUX_CPUDL_H
|
||||
#define _LINUX_CPUDL_H
|
||||
|
||||
#include <linux/sched.h>
|
||||
|
||||
/* Marks a CPU that currently has no slot in the heap. */
#define IDX_INVALID		(-1)
|
||||
|
||||
struct cpudl_item {
|
||||
u64 dl;
|
||||
int cpu;
|
||||
int idx;
|
||||
};
|
||||
|
||||
struct cpudl {
|
||||
raw_spinlock_t lock;
|
||||
int size;
|
||||
cpumask_var_t free_cpus;
|
||||
struct cpudl_item *elements;
|
||||
};
|
||||
|
||||
|
||||
#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p,
	       struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
void cpudl_cleanup(struct cpudl *cp);
#else
/*
 * No-op stubs for !CONFIG_SMP. Their argument lists mirror the prototypes
 * above so that a call written against the SMP API also preprocesses here
 * (the stubs previously took 3 and 0 arguments respectively).
 */
#define cpudl_set(cp, cpu, dl, is_valid) do { } while (0)
#define cpudl_init(cp) do { } while (0)
#endif /* CONFIG_SMP */
|
||||
|
||||
#endif /* _LINUX_CPUDL_H */
|
||||
248
kernel/sched/cpupri.c
Normal file
248
kernel/sched/cpupri.c
Normal file
|
|
@ -0,0 +1,248 @@
|
|||
/*
|
||||
* kernel/sched/cpupri.c
|
||||
*
|
||||
* CPU priority management
|
||||
*
|
||||
* Copyright (C) 2007-2008 Novell
|
||||
*
|
||||
* Author: Gregory Haskins <ghaskins@novell.com>
|
||||
*
|
||||
* This code tracks the priority of each CPU so that global migration
|
||||
* decisions are easy to calculate. Each CPU can be in a state as follows:
|
||||
*
|
||||
* (INVALID), IDLE, NORMAL, RT1, ... RT99
|
||||
*
|
||||
* going from the lowest priority to the highest. CPUs in the INVALID state
|
||||
* are not eligible for routing. The system maintains this state with
|
||||
* a 2 dimensional bitmap (the first for priority class, the second for cpus
|
||||
* in that class). Therefore a typical application without affinity
|
||||
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
|
||||
* searches). For tasks with affinity restrictions, the algorithm has a
|
||||
* worst case complexity of O(min(102, nr_domcpus)), though the scenario that
|
||||
* yields the worst case search is fairly contrived.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; version 2
|
||||
* of the License.
|
||||
*/
|
||||
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/rt.h>
|
||||
#include <linux/slab.h>
|
||||
#include "cpupri.h"
|
||||
|
||||
/* Convert between a 140 based task->prio, and our 102 based cpupri */
|
||||
static int convert_prio(int prio)
|
||||
{
|
||||
int cpupri;
|
||||
|
||||
if (prio == CPUPRI_INVALID)
|
||||
cpupri = CPUPRI_INVALID;
|
||||
else if (prio == MAX_PRIO)
|
||||
cpupri = CPUPRI_IDLE;
|
||||
else if (prio >= MAX_RT_PRIO)
|
||||
cpupri = CPUPRI_NORMAL;
|
||||
else
|
||||
cpupri = MAX_RT_PRIO - prio + 1;
|
||||
|
||||
return cpupri;
|
||||
}
|
||||
|
||||
/**
|
||||
* cpupri_find - find the best (lowest-pri) CPU in the system
|
||||
* @cp: The cpupri context
|
||||
* @p: The task
|
||||
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
|
||||
*
|
||||
* Note: This function returns the recommended CPUs as calculated during the
|
||||
* current invocation. By the time the call returns, the CPUs may have in
|
||||
* fact changed priorities any number of times. While not ideal, it is not
|
||||
* an issue of correctness since the normal rebalancer logic will correct
|
||||
* any discrepancies created by racing against the uncertainty of the current
|
||||
* priority configuration.
|
||||
*
|
||||
* Return: (int)bool - CPUs were found
|
||||
*/
|
||||
int cpupri_find(struct cpupri *cp, struct task_struct *p,
|
||||
struct cpumask *lowest_mask)
|
||||
{
|
||||
int idx = 0;
|
||||
int task_pri = convert_prio(p->prio);
|
||||
|
||||
BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
|
||||
|
||||
for (idx = 0; idx < task_pri; idx++) {
|
||||
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
|
||||
int skip = 0;
|
||||
|
||||
if (!atomic_read(&(vec)->count))
|
||||
skip = 1;
|
||||
/*
|
||||
* When looking at the vector, we need to read the counter,
|
||||
* do a memory barrier, then read the mask.
|
||||
*
|
||||
* Note: This is still all racey, but we can deal with it.
|
||||
* Ideally, we only want to look at masks that are set.
|
||||
*
|
||||
* If a mask is not set, then the only thing wrong is that we
|
||||
* did a little more work than necessary.
|
||||
*
|
||||
* If we read a zero count but the mask is set, because of the
|
||||
* memory barriers, that can only happen when the highest prio
|
||||
* task for a run queue has left the run queue, in which case,
|
||||
* it will be followed by a pull. If the task we are processing
|
||||
* fails to find a proper place to go, that pull request will
|
||||
* pull this task if the run queue is running at a lower
|
||||
* priority.
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
/* Need to do the rmb for every iteration */
|
||||
if (skip)
|
||||
continue;
|
||||
|
||||
if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
|
||||
continue;
|
||||
|
||||
if (lowest_mask) {
|
||||
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
|
||||
|
||||
/*
|
||||
* We have to ensure that we have at least one bit
|
||||
* still set in the array, since the map could have
|
||||
* been concurrently emptied between the first and
|
||||
* second reads of vec->mask. If we hit this
|
||||
* condition, simply act as though we never hit this
|
||||
* priority level and continue on.
|
||||
*/
|
||||
if (cpumask_any(lowest_mask) >= nr_cpu_ids)
|
||||
continue;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* cpupri_set - update the cpu priority setting
|
||||
* @cp: The cpupri context
|
||||
* @cpu: The target cpu
|
||||
* @newpri: The priority (INVALID-RT99) to assign to this CPU
|
||||
*
|
||||
* Note: Assumes cpu_rq(cpu)->lock is locked
|
||||
*
|
||||
* Returns: (void)
|
||||
*/
|
||||
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
|
||||
{
|
||||
int *currpri = &cp->cpu_to_pri[cpu];
|
||||
int oldpri = *currpri;
|
||||
int do_mb = 0;
|
||||
|
||||
newpri = convert_prio(newpri);
|
||||
|
||||
BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
|
||||
|
||||
if (newpri == oldpri)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If the cpu was currently mapped to a different value, we
|
||||
* need to map it to the new value then remove the old value.
|
||||
* Note, we must add the new value first, otherwise we risk the
|
||||
* cpu being missed by the priority loop in cpupri_find.
|
||||
*/
|
||||
if (likely(newpri != CPUPRI_INVALID)) {
|
||||
struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
|
||||
|
||||
cpumask_set_cpu(cpu, vec->mask);
|
||||
/*
|
||||
* When adding a new vector, we update the mask first,
|
||||
* do a write memory barrier, and then update the count, to
|
||||
* make sure the vector is visible when count is set.
|
||||
*/
|
||||
smp_mb__before_atomic();
|
||||
atomic_inc(&(vec)->count);
|
||||
do_mb = 1;
|
||||
}
|
||||
if (likely(oldpri != CPUPRI_INVALID)) {
|
||||
struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
|
||||
|
||||
/*
|
||||
* Because the order of modification of the vec->count
|
||||
* is important, we must make sure that the update
|
||||
* of the new prio is seen before we decrement the
|
||||
* old prio. This makes sure that the loop sees
|
||||
* one or the other when we raise the priority of
|
||||
* the run queue. We don't care about when we lower the
|
||||
* priority, as that will trigger an rt pull anyway.
|
||||
*
|
||||
* We only need to do a memory barrier if we updated
|
||||
* the new priority vec.
|
||||
*/
|
||||
if (do_mb)
|
||||
smp_mb__after_atomic();
|
||||
|
||||
/*
|
||||
* When removing from the vector, we decrement the counter first
|
||||
* do a memory barrier and then clear the mask.
|
||||
*/
|
||||
atomic_dec(&(vec)->count);
|
||||
smp_mb__after_atomic();
|
||||
cpumask_clear_cpu(cpu, vec->mask);
|
||||
}
|
||||
|
||||
*currpri = newpri;
|
||||
}
|
||||
|
||||
/**
|
||||
* cpupri_init - initialize the cpupri structure
|
||||
* @cp: The cpupri context
|
||||
*
|
||||
* Return: -ENOMEM on memory allocation failure.
|
||||
*/
|
||||
int cpupri_init(struct cpupri *cp)
|
||||
{
|
||||
int i;
|
||||
|
||||
memset(cp, 0, sizeof(*cp));
|
||||
|
||||
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
|
||||
struct cpupri_vec *vec = &cp->pri_to_cpu[i];
|
||||
|
||||
atomic_set(&vec->count, 0);
|
||||
if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
|
||||
if (!cp->cpu_to_pri)
|
||||
goto cleanup;
|
||||
|
||||
for_each_possible_cpu(i)
|
||||
cp->cpu_to_pri[i] = CPUPRI_INVALID;
|
||||
|
||||
return 0;
|
||||
|
||||
cleanup:
|
||||
for (i--; i >= 0; i--)
|
||||
free_cpumask_var(cp->pri_to_cpu[i].mask);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/**
|
||||
* cpupri_cleanup - clean up the cpupri structure
|
||||
* @cp: The cpupri context
|
||||
*/
|
||||
void cpupri_cleanup(struct cpupri *cp)
|
||||
{
|
||||
int i;
|
||||
|
||||
kfree(cp->cpu_to_pri);
|
||||
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
|
||||
free_cpumask_var(cp->pri_to_cpu[i].mask);
|
||||
}
|
||||
34
kernel/sched/cpupri.h
Normal file
34
kernel/sched/cpupri.h
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
#ifndef _LINUX_CPUPRI_H
|
||||
#define _LINUX_CPUPRI_H
|
||||
|
||||
#include <linux/sched.h>
|
||||
|
||||
/* Number of cpupri levels: IDLE, NORMAL and one per RT priority. */
#define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO + 2)

/* Parenthesized so the negative value is safe in any expression context. */
#define CPUPRI_INVALID		(-1)
#define CPUPRI_IDLE		0
#define CPUPRI_NORMAL		1
/* values 2-101 are RT priorities 0-99 */
|
||||
|
||||
struct cpupri_vec {
|
||||
atomic_t count;
|
||||
cpumask_var_t mask;
|
||||
};
|
||||
|
||||
struct cpupri {
|
||||
struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
|
||||
int *cpu_to_pri;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_SMP
int  cpupri_find(struct cpupri *cp,
		 struct task_struct *p, struct cpumask *lowest_mask);
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int  cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
#else
/*
 * No-op stubs for !CONFIG_SMP. Their argument lists mirror the prototypes
 * above (cpupri_init previously took no argument).
 */
#define cpupri_set(cp, cpu, pri) do { } while (0)
#define cpupri_init(cp) do { } while (0)
#endif /* CONFIG_SMP */
|
||||
|
||||
#endif /* _LINUX_CPUPRI_H */
|
||||
852
kernel/sched/cputime.c
Normal file
852
kernel/sched/cputime.c
Normal file
|
|
@ -0,0 +1,852 @@
|
|||
#include <linux/export.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/tsacct_kern.h>
|
||||
#include <linux/kernel_stat.h>
|
||||
#include <linux/static_key.h>
|
||||
#include <linux/context_tracking.h>
|
||||
#include "sched.h"
|
||||
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
|
||||
/*
|
||||
* There are no locks covering percpu hardirq/softirq time.
|
||||
* They are only modified in vtime_account, on corresponding CPU
|
||||
* with interrupts disabled. So, writes are safe.
|
||||
* They are read and saved off onto struct rq in update_rq_clock().
|
||||
* This may result in other CPU reading this CPU's irq time and can
|
||||
* race with irq/vtime_account on this CPU. We would either get old
|
||||
* or new value with a side effect of accounting a slice of irq time to wrong
|
||||
* task when irq is in progress while we read rq->clock. That is a worthy
|
||||
* compromise in place of having locks on each irq in account_system_time.
|
||||
*/
|
||||
DEFINE_PER_CPU(u64, cpu_hardirq_time);
|
||||
DEFINE_PER_CPU(u64, cpu_softirq_time);
|
||||
|
||||
static DEFINE_PER_CPU(u64, irq_start_time);
|
||||
static int sched_clock_irqtime;
|
||||
|
||||
void enable_sched_clock_irqtime(void)
|
||||
{
|
||||
sched_clock_irqtime = 1;
|
||||
}
|
||||
|
||||
void disable_sched_clock_irqtime(void)
|
||||
{
|
||||
sched_clock_irqtime = 0;
|
||||
}
|
||||
|
||||
#ifndef CONFIG_64BIT
/* Per-cpu sequence counter, only defined on non-64-bit builds. */
DEFINE_PER_CPU(seqcount_t, irq_time_seq);
#endif /* !CONFIG_64BIT */
|
||||
|
||||
/*
|
||||
* Called before incrementing preempt_count on {soft,}irq_enter
|
||||
* and before decrementing preempt_count on {soft,}irq_exit.
|
||||
*/
|
||||
void irqtime_account_irq(struct task_struct *curr)
|
||||
{
|
||||
unsigned long flags;
|
||||
s64 delta;
|
||||
int cpu;
|
||||
|
||||
if (!sched_clock_irqtime)
|
||||
return;
|
||||
|
||||
local_irq_save(flags);
|
||||
|
||||
cpu = smp_processor_id();
|
||||
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
|
||||
__this_cpu_add(irq_start_time, delta);
|
||||
|
||||
irq_time_write_begin();
|
||||
/*
|
||||
* We do not account for softirq time from ksoftirqd here.
|
||||
* We want to continue accounting softirq time to ksoftirqd thread
|
||||
* in that case, so as not to confuse scheduler with a special task
|
||||
* that do not consume any time, but still wants to run.
|
||||
*/
|
||||
if (hardirq_count())
|
||||
__this_cpu_add(cpu_hardirq_time, delta);
|
||||
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
|
||||
__this_cpu_add(cpu_softirq_time, delta);
|
||||
|
||||
irq_time_write_end();
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(irqtime_account_irq);
|
||||
|
||||
static int irqtime_account_hi_update(void)
|
||||
{
|
||||
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
||||
unsigned long flags;
|
||||
u64 latest_ns;
|
||||
int ret = 0;
|
||||
|
||||
local_irq_save(flags);
|
||||
latest_ns = this_cpu_read(cpu_hardirq_time);
|
||||
if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
|
||||
ret = 1;
|
||||
local_irq_restore(flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int irqtime_account_si_update(void)
|
||||
{
|
||||
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
||||
unsigned long flags;
|
||||
u64 latest_ns;
|
||||
int ret = 0;
|
||||
|
||||
local_irq_save(flags);
|
||||
latest_ns = this_cpu_read(cpu_softirq_time);
|
||||
if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
|
||||
ret = 1;
|
||||
local_irq_restore(flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
|
||||
|
||||
#define sched_clock_irqtime (0)
|
||||
|
||||
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
|
||||
|
||||
static inline void task_group_account_field(struct task_struct *p, int index,
|
||||
u64 tmp)
|
||||
{
|
||||
/*
|
||||
* Since all updates are sure to touch the root cgroup, we
|
||||
* get ourselves ahead and touch it first. If the root cgroup
|
||||
* is the only cgroup, then nothing else should be necessary.
|
||||
*
|
||||
*/
|
||||
__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
|
||||
|
||||
cpuacct_account_field(p, index, tmp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Account user cpu time to a process.
|
||||
* @p: the process that the cpu time gets accounted to
|
||||
* @cputime: the cpu time spent in user space since the last update
|
||||
* @cputime_scaled: cputime scaled by cpu frequency
|
||||
*/
|
||||
void account_user_time(struct task_struct *p, cputime_t cputime,
|
||||
cputime_t cputime_scaled)
|
||||
{
|
||||
int index;
|
||||
|
||||
/* Add user time to process. */
|
||||
p->utime += cputime;
|
||||
p->utimescaled += cputime_scaled;
|
||||
account_group_user_time(p, cputime);
|
||||
|
||||
index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
|
||||
|
||||
/* Add user time to cpustat. */
|
||||
task_group_account_field(p, index, (__force u64) cputime);
|
||||
|
||||
/* Account for user time used */
|
||||
acct_account_cputime(p);
|
||||
}
|
||||
|
||||
/*
|
||||
* Account guest cpu time to a process.
|
||||
* @p: the process that the cpu time gets accounted to
|
||||
* @cputime: the cpu time spent in virtual machine since the last update
|
||||
* @cputime_scaled: cputime scaled by cpu frequency
|
||||
*/
|
||||
static void account_guest_time(struct task_struct *p, cputime_t cputime,
|
||||
cputime_t cputime_scaled)
|
||||
{
|
||||
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
||||
|
||||
/* Add guest time to process. */
|
||||
p->utime += cputime;
|
||||
p->utimescaled += cputime_scaled;
|
||||
account_group_user_time(p, cputime);
|
||||
p->gtime += cputime;
|
||||
|
||||
/* Add guest time to cpustat. */
|
||||
if (task_nice(p) > 0) {
|
||||
cpustat[CPUTIME_NICE] += (__force u64) cputime;
|
||||
cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
|
||||
} else {
|
||||
cpustat[CPUTIME_USER] += (__force u64) cputime;
|
||||
cpustat[CPUTIME_GUEST] += (__force u64) cputime;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Account system cpu time to a process and desired cpustat field
|
||||
* @p: the process that the cpu time gets accounted to
|
||||
* @cputime: the cpu time spent in kernel space since the last update
|
||||
* @cputime_scaled: cputime scaled by cpu frequency
|
||||
* @target_cputime64: pointer to cpustat field that has to be updated
|
||||
*/
|
||||
static inline
|
||||
void __account_system_time(struct task_struct *p, cputime_t cputime,
|
||||
cputime_t cputime_scaled, int index)
|
||||
{
|
||||
/* Add system time to process. */
|
||||
p->stime += cputime;
|
||||
p->stimescaled += cputime_scaled;
|
||||
account_group_system_time(p, cputime);
|
||||
|
||||
/* Add system time to cpustat. */
|
||||
task_group_account_field(p, index, (__force u64) cputime);
|
||||
|
||||
/* Account for system time used */
|
||||
acct_account_cputime(p);
|
||||
}
|
||||
|
||||
/*
|
||||
* Account system cpu time to a process.
|
||||
* @p: the process that the cpu time gets accounted to
|
||||
* @hardirq_offset: the offset to subtract from hardirq_count()
|
||||
* @cputime: the cpu time spent in kernel space since the last update
|
||||
* @cputime_scaled: cputime scaled by cpu frequency
|
||||
*/
|
||||
void account_system_time(struct task_struct *p, int hardirq_offset,
|
||||
cputime_t cputime, cputime_t cputime_scaled)
|
||||
{
|
||||
int index;
|
||||
|
||||
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
|
||||
account_guest_time(p, cputime, cputime_scaled);
|
||||
return;
|
||||
}
|
||||
|
||||
if (hardirq_count() - hardirq_offset)
|
||||
index = CPUTIME_IRQ;
|
||||
else if (in_serving_softirq())
|
||||
index = CPUTIME_SOFTIRQ;
|
||||
else
|
||||
index = CPUTIME_SYSTEM;
|
||||
|
||||
__account_system_time(p, cputime, cputime_scaled, index);
|
||||
}
|
||||
|
||||
/*
|
||||
* Account for involuntary wait time.
|
||||
* @cputime: the cpu time spent in involuntary wait
|
||||
*/
|
||||
void account_steal_time(cputime_t cputime)
|
||||
{
|
||||
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
||||
|
||||
cpustat[CPUTIME_STEAL] += (__force u64) cputime;
|
||||
}
|
||||
|
||||
/*
|
||||
* Account for idle time.
|
||||
* @cputime: the cpu time spent in idle wait
|
||||
*/
|
||||
void account_idle_time(cputime_t cputime)
|
||||
{
|
||||
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
||||
struct rq *rq = this_rq();
|
||||
|
||||
if (atomic_read(&rq->nr_iowait) > 0)
|
||||
cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
|
||||
else
|
||||
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
|
||||
}
|
||||
|
||||
/*
 * Account steal time reported by the paravirt steal clock.
 *
 * Only active under CONFIG_PARAVIRT with the paravirt_steal_enabled
 * static key on: the steal clock delta against rq->prev_steal_time is
 * converted to cputime, prev_steal_time is advanced by the converted
 * amount and the result is handed to account_steal_time().
 *
 * Returns true when a non-zero amount of steal time was accounted,
 * false otherwise (including when paravirt steal accounting is off).
 */
static __always_inline bool steal_account_process_tick(void)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal;
		cputime_t steal_ct;

		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;

		/*
		 * cputime_t may be less precise than nsecs (eg: if it's
		 * based on jiffies). Lets cast the result to cputime
		 * granularity and account the rest on the next rounds.
		 */
		steal_ct = nsecs_to_cputime(steal);
		this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);

		account_steal_time(steal_ct);
		return steal_ct;
	}
#endif
	return false;
}
|
||||
|
||||
/*
|
||||
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
|
||||
* tasks (sum on group iteration) belonging to @tsk's group.
|
||||
*/
|
||||
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
|
||||
{
|
||||
struct signal_struct *sig = tsk->signal;
|
||||
cputime_t utime, stime;
|
||||
struct task_struct *t;
|
||||
unsigned int seq, nextseq;
|
||||
unsigned long flags;
|
||||
|
||||
rcu_read_lock();
|
||||
/* Attempt a lockless read on the first round. */
|
||||
nextseq = 0;
|
||||
do {
|
||||
seq = nextseq;
|
||||
flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
|
||||
times->utime = sig->utime;
|
||||
times->stime = sig->stime;
|
||||
times->sum_exec_runtime = sig->sum_sched_runtime;
|
||||
|
||||
for_each_thread(tsk, t) {
|
||||
task_cputime(t, &utime, &stime);
|
||||
times->utime += utime;
|
||||
times->stime += stime;
|
||||
times->sum_exec_runtime += task_sched_runtime(t);
|
||||
}
|
||||
/* If lockless access failed, take the lock. */
|
||||
nextseq = 1;
|
||||
} while (need_seqretry(&sig->stats_lock, seq));
|
||||
done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
/*
|
||||
* Account a tick to a process and cpustat
|
||||
* @p: the process that the cpu time gets accounted to
|
||||
* @user_tick: is the tick from userspace
|
||||
* @rq: the pointer to rq
|
||||
*
|
||||
* Tick demultiplexing follows the order
|
||||
* - pending hardirq update
|
||||
* - pending softirq update
|
||||
* - user_time
|
||||
* - idle_time
|
||||
* - system time
|
||||
* - check for guest_time
|
||||
* - else account as system_time
|
||||
*
|
||||
* Check for hardirq is done both for system and user time as there is
|
||||
* no timer going off while we are on hardirq and hence we may never get an
|
||||
* opportunity to update it solely in system time.
|
||||
* p->stime and friends are only updated on system time and not on irq
|
||||
* softirq as those do not count in task exec_runtime any more.
|
||||
*/
|
||||
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
|
||||
struct rq *rq, int ticks)
|
||||
{
|
||||
cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
|
||||
u64 cputime = (__force u64) cputime_one_jiffy;
|
||||
u64 *cpustat = kcpustat_this_cpu->cpustat;
|
||||
|
||||
if (steal_account_process_tick())
|
||||
return;
|
||||
|
||||
cputime *= ticks;
|
||||
scaled *= ticks;
|
||||
|
||||
if (irqtime_account_hi_update()) {
|
||||
cpustat[CPUTIME_IRQ] += cputime;
|
||||
} else if (irqtime_account_si_update()) {
|
||||
cpustat[CPUTIME_SOFTIRQ] += cputime;
|
||||
} else if (this_cpu_ksoftirqd() == p) {
|
||||
/*
|
||||
* ksoftirqd time do not get accounted in cpu_softirq_time.
|
||||
* So, we have to handle it separately here.
|
||||
* Also, p->stime needs to be updated for ksoftirqd.
|
||||
*/
|
||||
__account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
|
||||
} else if (user_tick) {
|
||||
account_user_time(p, cputime, scaled);
|
||||
} else if (p == rq->idle) {
|
||||
account_idle_time(cputime);
|
||||
} else if (p->flags & PF_VCPU) { /* System time or guest time */
|
||||
account_guest_time(p, cputime, scaled);
|
||||
} else {
|
||||
__account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
|
||||
}
|
||||
}
|
||||
|
||||
static void irqtime_account_idle_ticks(int ticks)
|
||||
{
|
||||
struct rq *rq = this_rq();
|
||||
|
||||
irqtime_account_process_tick(current, 0, rq, ticks);
|
||||
}
|
||||
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
|
||||
/* Empty stand-ins used when CONFIG_IRQ_TIME_ACCOUNTING is not set. */
static inline void irqtime_account_idle_ticks(int ticks)
{
}

static inline void irqtime_account_process_tick(struct task_struct *p,
						int user_tick,
						struct rq *rq, int nr_ticks)
{
}
|
||||
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
|
||||
|
||||
/*
|
||||
* Use precise platform statistics if available:
|
||||
*/
|
||||
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
|
||||
|
||||
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
|
||||
/*
 * Flush the pending vtime of the task being switched out.
 * @prev: the task that is leaving the cpu
 *
 * An idle task is flushed as idle time, any other task as system time.
 * With CONFIG_VIRT_CPU_ACCOUNTING_NATIVE the user time is flushed too,
 * before handing over to arch_vtime_task_switch().
 */
void vtime_common_task_switch(struct task_struct *prev)
{
	if (is_idle_task(prev)) {
		vtime_account_idle(prev);
	} else {
		vtime_account_system(prev);
	}

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
	vtime_account_user(prev);
#endif
	arch_vtime_task_switch(prev);
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Archs that account the whole time spent in the idle task
|
||||
* (outside irq) as idle time can rely on this and just implement
|
||||
* vtime_account_system() and vtime_account_idle(). Archs that
|
||||
* have other meaning of the idle time (s390 only includes the
|
||||
* time spent by the CPU when it's in low power mode) must override
|
||||
* vtime_account().
|
||||
*/
|
||||
#ifndef __ARCH_HAS_VTIME_ACCOUNT
|
||||
/*
 * Flush the pending vtime of @tsk on irq entry.
 *
 * When already in interrupt context the time is system time. Otherwise
 * it is user time if context tracking says we were in user, idle time
 * if @tsk is an idle task, and system time in every other case.
 */
void vtime_common_account_irq_enter(struct task_struct *tsk)
{
	if (in_interrupt()) {
		vtime_account_system(tsk);
		return;
	}

	/*
	 * If we interrupted user, context_tracking_in_user()
	 * is 1 because the context tracking doesn't hook
	 * on irq entry/exit. This way we know if
	 * we need to flush user time on kernel entry.
	 */
	if (context_tracking_in_user()) {
		vtime_account_user(tsk);
		return;
	}

	if (is_idle_task(tsk)) {
		vtime_account_idle(tsk);
		return;
	}

	vtime_account_system(tsk);
}
EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
|
||||
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
|
||||
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
|
||||
|
||||
|
||||
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
|
||||
/*
 * CONFIG_VIRT_CPU_ACCOUNTING_NATIVE variant: report @p's utime and stime
 * through @ut and @st exactly as stored, without any scaling.
 */
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	*st = p->stime;
	*ut = p->utime;
}
|
||||
|
||||
/*
 * CONFIG_VIRT_CPU_ACCOUNTING_NATIVE variant: report the utime and stime
 * summed by thread_group_cputime() for @p's thread group through @ut and
 * @st, without any scaling.
 */
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime totals;

	thread_group_cputime(p, &totals);

	*ut = totals.utime;
	*st = totals.stime;
}
|
||||
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
|
||||
/*
|
||||
* Account a single tick of cpu time.
|
||||
* @p: the process that the cpu time gets accounted to
|
||||
* @user_tick: indicates if the tick is a user or a system tick
|
||||
*/
|
||||
void account_process_tick(struct task_struct *p, int user_tick)
|
||||
{
|
||||
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
|
||||
struct rq *rq = this_rq();
|
||||
|
||||
if (vtime_accounting_enabled())
|
||||
return;
|
||||
|
||||
if (sched_clock_irqtime) {
|
||||
irqtime_account_process_tick(p, user_tick, rq, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
if (steal_account_process_tick())
|
||||
return;
|
||||
|
||||
if (user_tick)
|
||||
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
|
||||
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
|
||||
account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
|
||||
one_jiffy_scaled);
|
||||
else
|
||||
account_idle_time(cputime_one_jiffy);
|
||||
}
|
||||
|
||||
/*
 * Account multiple ticks of steal time.
 * @ticks: number of stolen ticks
 *
 * The tick count is converted with jiffies_to_cputime() and passed to
 * account_steal_time().
 */
void account_steal_ticks(unsigned long ticks)
{
	account_steal_time(jiffies_to_cputime(ticks));
}
|
||||
|
||||
/*
|
||||
* Account multiple ticks of idle time.
|
||||
* @ticks: number of stolen ticks
|
||||
*/
|
||||
void account_idle_ticks(unsigned long ticks)
|
||||
{
|
||||
|
||||
if (sched_clock_irqtime) {
|
||||
irqtime_account_idle_ticks(ticks);
|
||||
return;
|
||||
}
|
||||
|
||||
account_idle_time(jiffies_to_cputime(ticks));
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform (stime * rtime) / total, but avoid multiplication overflow by
|
||||
* loosing precision when the numbers are big.
|
||||
*/
|
||||
static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
|
||||
{
|
||||
u64 scaled;
|
||||
|
||||
for (;;) {
|
||||
/* Make sure "rtime" is the bigger of stime/rtime */
|
||||
if (stime > rtime)
|
||||
swap(rtime, stime);
|
||||
|
||||
/* Make sure 'total' fits in 32 bits */
|
||||
if (total >> 32)
|
||||
goto drop_precision;
|
||||
|
||||
/* Does rtime (and thus stime) fit in 32 bits? */
|
||||
if (!(rtime >> 32))
|
||||
break;
|
||||
|
||||
/* Can we just balance rtime/stime rather than dropping bits? */
|
||||
if (stime >> 31)
|
||||
goto drop_precision;
|
||||
|
||||
/* We can grow stime and shrink rtime and try to make them both fit */
|
||||
stime <<= 1;
|
||||
rtime >>= 1;
|
||||
continue;
|
||||
|
||||
drop_precision:
|
||||
/* We drop from rtime, it has more bits than stime */
|
||||
rtime >>= 1;
|
||||
total >>= 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure gcc understands that this is a 32x32->64 multiply,
|
||||
* followed by a 64/32->64 divide.
|
||||
*/
|
||||
scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
|
||||
return (__force cputime_t) scaled;
|
||||
}
|
||||
|
||||
/*
|
||||
* Atomically advance counter to the new value. Interrupts, vcpu
|
||||
* scheduling, and scaling inaccuracies can cause cputime_advance
|
||||
* to be occasionally called with a new value smaller than counter.
|
||||
* Let's enforce atomicity.
|
||||
*
|
||||
* Normally a caller will only go through this loop once, or not
|
||||
* at all in case a previous caller updated counter the same jiffy.
|
||||
*/
|
||||
static void cputime_advance(cputime_t *counter, cputime_t new)
|
||||
{
|
||||
cputime_t old;
|
||||
|
||||
while (new > (old = ACCESS_ONCE(*counter)))
|
||||
cmpxchg_cputime(counter, old, new);
|
||||
}
|
||||
|
||||
/*
|
||||
* Adjust tick based cputime random precision against scheduler
|
||||
* runtime accounting.
|
||||
*/
|
||||
static void cputime_adjust(struct task_cputime *curr,
|
||||
struct cputime *prev,
|
||||
cputime_t *ut, cputime_t *st)
|
||||
{
|
||||
cputime_t rtime, stime, utime;
|
||||
|
||||
/*
|
||||
* Tick based cputime accounting depend on random scheduling
|
||||
* timeslices of a task to be interrupted or not by the timer.
|
||||
* Depending on these circumstances, the number of these interrupts
|
||||
* may be over or under-optimistic, matching the real user and system
|
||||
* cputime with a variable precision.
|
||||
*
|
||||
* Fix this by scaling these tick based values against the total
|
||||
* runtime accounted by the CFS scheduler.
|
||||
*/
|
||||
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
|
||||
|
||||
/*
|
||||
* Update userspace visible utime/stime values only if actual execution
|
||||
* time is bigger than already exported. Note that can happen, that we
|
||||
* provided bigger values due to scaling inaccuracy on big numbers.
|
||||
*/
|
||||
if (prev->stime + prev->utime >= rtime)
|
||||
goto out;
|
||||
|
||||
stime = curr->stime;
|
||||
utime = curr->utime;
|
||||
|
||||
if (utime == 0) {
|
||||
stime = rtime;
|
||||
} else if (stime == 0) {
|
||||
utime = rtime;
|
||||
} else {
|
||||
cputime_t total = stime + utime;
|
||||
|
||||
stime = scale_stime((__force u64)stime,
|
||||
(__force u64)rtime, (__force u64)total);
|
||||
utime = rtime - stime;
|
||||
}
|
||||
|
||||
cputime_advance(&prev->stime, stime);
|
||||
cputime_advance(&prev->utime, utime);
|
||||
|
||||
out:
|
||||
*ut = prev->utime;
|
||||
*st = prev->stime;
|
||||
}
|
||||
|
||||
/*
 * Report @p's user and system time through @ut and @st after running
 * them through cputime_adjust() against p->se.sum_exec_runtime and
 * p->prev_cputime.
 */
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime sample = {
		.sum_exec_runtime = p->se.sum_exec_runtime,
	};

	task_cputime(p, &sample.utime, &sample.stime);
	cputime_adjust(&sample, &p->prev_cputime, ut, st);
}
|
||||
|
||||
/*
 * Report the user and system time of @p's thread group through @ut and
 * @st: the totals from thread_group_cputime() are run through
 * cputime_adjust() against p->signal->prev_cputime.
 */
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime totals;

	thread_group_cputime(p, &totals);
	cputime_adjust(&totals, &p->signal->prev_cputime, ut, st);
}
|
||||
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
|
||||
|
||||
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
|
||||
static unsigned long long vtime_delta(struct task_struct *tsk)
|
||||
{
|
||||
unsigned long long clock;
|
||||
|
||||
clock = local_clock();
|
||||
if (clock < tsk->vtime_snap)
|
||||
return 0;
|
||||
|
||||
return clock - tsk->vtime_snap;
|
||||
}
|
||||
|
||||
static cputime_t get_vtime_delta(struct task_struct *tsk)
|
||||
{
|
||||
unsigned long long delta = vtime_delta(tsk);
|
||||
|
||||
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
|
||||
tsk->vtime_snap += delta;
|
||||
|
||||
/* CHECKME: always safe to convert nsecs to cputime? */
|
||||
return nsecs_to_cputime(delta);
|
||||
}
|
||||
|
||||
static void __vtime_account_system(struct task_struct *tsk)
|
||||
{
|
||||
cputime_t delta_cpu = get_vtime_delta(tsk);
|
||||
|
||||
account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
|
||||
}
|
||||
|
||||
void vtime_account_system(struct task_struct *tsk)
|
||||
{
|
||||
write_seqlock(&tsk->vtime_seqlock);
|
||||
__vtime_account_system(tsk);
|
||||
write_sequnlock(&tsk->vtime_seqlock);
|
||||
}
|
||||
|
||||
void vtime_gen_account_irq_exit(struct task_struct *tsk)
|
||||
{
|
||||
write_seqlock(&tsk->vtime_seqlock);
|
||||
__vtime_account_system(tsk);
|
||||
if (context_tracking_in_user())
|
||||
tsk->vtime_snap_whence = VTIME_USER;
|
||||
write_sequnlock(&tsk->vtime_seqlock);
|
||||
}
|
||||
|
||||
void vtime_account_user(struct task_struct *tsk)
|
||||
{
|
||||
cputime_t delta_cpu;
|
||||
|
||||
write_seqlock(&tsk->vtime_seqlock);
|
||||
delta_cpu = get_vtime_delta(tsk);
|
||||
tsk->vtime_snap_whence = VTIME_SYS;
|
||||
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
|
||||
write_sequnlock(&tsk->vtime_seqlock);
|
||||
}
|
||||
|
||||
void vtime_user_enter(struct task_struct *tsk)
|
||||
{
|
||||
write_seqlock(&tsk->vtime_seqlock);
|
||||
__vtime_account_system(tsk);
|
||||
tsk->vtime_snap_whence = VTIME_USER;
|
||||
write_sequnlock(&tsk->vtime_seqlock);
|
||||
}
|
||||
|
||||
void vtime_guest_enter(struct task_struct *tsk)
|
||||
{
|
||||
/*
|
||||
* The flags must be updated under the lock with
|
||||
* the vtime_snap flush and update.
|
||||
* That enforces a right ordering and update sequence
|
||||
* synchronization against the reader (task_gtime())
|
||||
* that can thus safely catch up with a tickless delta.
|
||||
*/
|
||||
write_seqlock(&tsk->vtime_seqlock);
|
||||
__vtime_account_system(tsk);
|
||||
current->flags |= PF_VCPU;
|
||||
write_sequnlock(&tsk->vtime_seqlock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(vtime_guest_enter);
|
||||
|
||||
void vtime_guest_exit(struct task_struct *tsk)
|
||||
{
|
||||
write_seqlock(&tsk->vtime_seqlock);
|
||||
__vtime_account_system(tsk);
|
||||
current->flags &= ~PF_VCPU;
|
||||
write_sequnlock(&tsk->vtime_seqlock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(vtime_guest_exit);
|
||||
|
||||
void vtime_account_idle(struct task_struct *tsk)
|
||||
{
|
||||
cputime_t delta_cpu = get_vtime_delta(tsk);
|
||||
|
||||
account_idle_time(delta_cpu);
|
||||
}
|
||||
|
||||
void arch_vtime_task_switch(struct task_struct *prev)
|
||||
{
|
||||
write_seqlock(&prev->vtime_seqlock);
|
||||
prev->vtime_snap_whence = VTIME_SLEEPING;
|
||||
write_sequnlock(&prev->vtime_seqlock);
|
||||
|
||||
write_seqlock(¤t->vtime_seqlock);
|
||||
current->vtime_snap_whence = VTIME_SYS;
|
||||
current->vtime_snap = sched_clock_cpu(smp_processor_id());
|
||||
write_sequnlock(¤t->vtime_seqlock);
|
||||
}
|
||||
|
||||
void vtime_init_idle(struct task_struct *t, int cpu)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
write_seqlock_irqsave(&t->vtime_seqlock, flags);
|
||||
t->vtime_snap_whence = VTIME_SYS;
|
||||
t->vtime_snap = sched_clock_cpu(cpu);
|
||||
write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
|
||||
}
|
||||
|
||||
cputime_t task_gtime(struct task_struct *t)
|
||||
{
|
||||
unsigned int seq;
|
||||
cputime_t gtime;
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&t->vtime_seqlock);
|
||||
|
||||
gtime = t->gtime;
|
||||
if (t->flags & PF_VCPU)
|
||||
gtime += vtime_delta(t);
|
||||
|
||||
} while (read_seqretry(&t->vtime_seqlock, seq));
|
||||
|
||||
return gtime;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fetch cputime raw values from fields of task_struct and
|
||||
* add up the pending nohz execution time since the last
|
||||
* cputime snapshot.
|
||||
*/
|
||||
static void
|
||||
fetch_task_cputime(struct task_struct *t,
|
||||
cputime_t *u_dst, cputime_t *s_dst,
|
||||
cputime_t *u_src, cputime_t *s_src,
|
||||
cputime_t *udelta, cputime_t *sdelta)
|
||||
{
|
||||
unsigned int seq;
|
||||
unsigned long long delta;
|
||||
|
||||
do {
|
||||
*udelta = 0;
|
||||
*sdelta = 0;
|
||||
|
||||
seq = read_seqbegin(&t->vtime_seqlock);
|
||||
|
||||
if (u_dst)
|
||||
*u_dst = *u_src;
|
||||
if (s_dst)
|
||||
*s_dst = *s_src;
|
||||
|
||||
/* Task is sleeping, nothing to add */
|
||||
if (t->vtime_snap_whence == VTIME_SLEEPING ||
|
||||
is_idle_task(t))
|
||||
continue;
|
||||
|
||||
delta = vtime_delta(t);
|
||||
|
||||
/*
|
||||
* Task runs either in user or kernel space, add pending nohz time to
|
||||
* the right place.
|
||||
*/
|
||||
if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
|
||||
*udelta = delta;
|
||||
} else {
|
||||
if (t->vtime_snap_whence == VTIME_SYS)
|
||||
*sdelta = delta;
|
||||
}
|
||||
} while (read_seqretry(&t->vtime_seqlock, seq));
|
||||
}
|
||||
|
||||
|
||||
/*
 * Report @t's user and system time, including the pending vtime delta.
 * Either of @utime and @stime may be NULL, in which case that value is
 * not reported.
 */
void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
{
	cputime_t user_pending, sys_pending;

	fetch_task_cputime(t, utime, stime, &t->utime, &t->stime,
			   &user_pending, &sys_pending);

	if (utime) {
		*utime += user_pending;
	}
	if (stime) {
		*stime += sys_pending;
	}
}
|
||||
|
||||
/*
 * Report @t's scaled user and system time, including the pending vtime
 * delta converted with cputime_to_scaled(). Either of @utimescaled and
 * @stimescaled may be NULL, in which case that value is not reported.
 */
void task_cputime_scaled(struct task_struct *t,
			 cputime_t *utimescaled, cputime_t *stimescaled)
{
	cputime_t user_pending, sys_pending;

	fetch_task_cputime(t, utimescaled, stimescaled,
			   &t->utimescaled, &t->stimescaled,
			   &user_pending, &sys_pending);

	if (utimescaled) {
		*utimescaled += cputime_to_scaled(user_pending);
	}
	if (stimescaled) {
		*stimescaled += cputime_to_scaled(sys_pending);
	}
}
|
||||
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
|
||||
1689
kernel/sched/deadline.c
Normal file
1689
kernel/sched/deadline.c
Normal file
File diff suppressed because it is too large
Load diff
663
kernel/sched/debug.c
Normal file
663
kernel/sched/debug.c
Normal file
|
|
@ -0,0 +1,663 @@
|
|||
/*
|
||||
* kernel/sched/debug.c
|
||||
*
|
||||
* Print the CFS rbtree
|
||||
*
|
||||
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/mempolicy.h>
|
||||
|
||||
#include "sched.h"
|
||||
|
||||
static DEFINE_SPINLOCK(sched_debug_lock);
|
||||
|
||||
/*
 * Print either to a seq_file (as used for /proc/sched_debug) or to the
 * console: a non-NULL @m goes through seq_printf(), a NULL @m through
 * printk().
 */
#define SEQ_printf(m, x...)			\
 do {						\
	if (!(m))				\
		printk(x);			\
	else					\
		seq_printf(m, x);		\
 } while (0)
|
||||
|
||||
/*
 * Ease the printing of nsec fields:
 *
 * nsec_high() returns @nsec divided by 1000000, keeping the sign when
 * @nsec is negative as a signed value.
 */
static long long nsec_high(unsigned long long nsec)
{
	int negative = (long long)nsec < 0;

	/* Divide the magnitude, then put the sign back. */
	if (negative)
		nsec = -nsec;
	do_div(nsec, 1000000);

	if (negative)
		return -nsec;

	return nsec;
}
|
||||
|
||||
/*
 * Return the remainder of the magnitude of @nsec (taken as a signed
 * value) divided by 1000000.
 */
static unsigned long nsec_low(unsigned long long nsec)
{
	unsigned long long magnitude = nsec;

	if ((long long)nsec < 0)
		magnitude = -nsec;

	/* do_div() leaves the quotient in place and yields the remainder. */
	return do_div(magnitude, 1000000);
}
|
||||
|
||||
#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
|
||||
{
|
||||
struct sched_entity *se = tg->se[cpu];
|
||||
|
||||
#define P(F) \
|
||||
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
|
||||
#define PN(F) \
|
||||
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
|
||||
|
||||
if (!se) {
|
||||
struct sched_avg *avg = &cpu_rq(cpu)->avg;
|
||||
P(avg->runnable_avg_sum);
|
||||
P(avg->runnable_avg_period);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
PN(se->exec_start);
|
||||
PN(se->vruntime);
|
||||
PN(se->sum_exec_runtime);
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
PN(se->statistics.wait_start);
|
||||
PN(se->statistics.sleep_start);
|
||||
PN(se->statistics.block_start);
|
||||
PN(se->statistics.sleep_max);
|
||||
PN(se->statistics.block_max);
|
||||
PN(se->statistics.exec_max);
|
||||
PN(se->statistics.slice_max);
|
||||
PN(se->statistics.wait_max);
|
||||
PN(se->statistics.wait_sum);
|
||||
P(se->statistics.wait_count);
|
||||
#endif
|
||||
P(se->load.weight);
|
||||
#ifdef CONFIG_SMP
|
||||
P(se->avg.runnable_avg_sum);
|
||||
P(se->avg.runnable_avg_period);
|
||||
P(se->avg.usage_avg_sum);
|
||||
P(se->avg.load_avg_contrib);
|
||||
P(se->avg.decay_count);
|
||||
#endif
|
||||
#undef PN
|
||||
#undef P
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
static char group_path[PATH_MAX];
|
||||
|
||||
static char *task_group_path(struct task_group *tg)
|
||||
{
|
||||
if (autogroup_path(tg, group_path, PATH_MAX))
|
||||
return group_path;
|
||||
|
||||
return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void
|
||||
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
if (rq->curr == p)
|
||||
SEQ_printf(m, "R");
|
||||
else
|
||||
SEQ_printf(m, " ");
|
||||
|
||||
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
|
||||
p->comm, task_pid_nr(p),
|
||||
SPLIT_NS(p->se.vruntime),
|
||||
(long long)(p->nvcsw + p->nivcsw),
|
||||
p->prio);
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
|
||||
SPLIT_NS(p->se.vruntime),
|
||||
SPLIT_NS(p->se.sum_exec_runtime),
|
||||
SPLIT_NS(p->se.statistics.sum_sleep_runtime));
|
||||
#else
|
||||
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
|
||||
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
|
||||
#endif
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
SEQ_printf(m, " %d", task_node(p));
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
SEQ_printf(m, " %s", task_group_path(task_group(p)));
|
||||
#endif
|
||||
|
||||
SEQ_printf(m, "\n");
|
||||
}
|
||||
|
||||
static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
|
||||
{
|
||||
struct task_struct *g, *p;
|
||||
|
||||
SEQ_printf(m,
|
||||
"\nrunnable tasks:\n"
|
||||
" task PID tree-key switches prio"
|
||||
" exec-runtime sum-exec sum-sleep\n"
|
||||
"------------------------------------------------------"
|
||||
"----------------------------------------------------\n");
|
||||
|
||||
rcu_read_lock();
|
||||
for_each_process_thread(g, p) {
|
||||
if (task_cpu(p) != rq_cpu)
|
||||
continue;
|
||||
|
||||
print_task(m, rq, p);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
|
||||
spread, rq0_min_vruntime, spread0;
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
struct sched_entity *last;
|
||||
unsigned long flags;
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
|
||||
#else
|
||||
SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
|
||||
#endif
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
|
||||
SPLIT_NS(cfs_rq->exec_clock));
|
||||
|
||||
raw_spin_lock_irqsave(&rq->lock, flags);
|
||||
if (cfs_rq->rb_leftmost)
|
||||
MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
|
||||
last = __pick_last_entity(cfs_rq);
|
||||
if (last)
|
||||
max_vruntime = last->vruntime;
|
||||
min_vruntime = cfs_rq->min_vruntime;
|
||||
rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
|
||||
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
|
||||
SPLIT_NS(MIN_vruntime));
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
|
||||
SPLIT_NS(min_vruntime));
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
|
||||
SPLIT_NS(max_vruntime));
|
||||
spread = max_vruntime - MIN_vruntime;
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
|
||||
SPLIT_NS(spread));
|
||||
spread0 = min_vruntime - rq0_min_vruntime;
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
|
||||
SPLIT_NS(spread0));
|
||||
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
|
||||
cfs_rq->nr_spread_over);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
|
||||
#ifdef CONFIG_SMP
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
|
||||
cfs_rq->runnable_load_avg);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
|
||||
cfs_rq->blocked_load_avg);
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
SEQ_printf(m, " .%-30s: %Ld\n", "tg_load_contrib",
|
||||
cfs_rq->tg_load_contrib);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
|
||||
cfs_rq->tg_runnable_contrib);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
|
||||
atomic_long_read(&cfs_rq->tg->load_avg));
|
||||
SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
|
||||
atomic_read(&cfs_rq->tg->runnable_avg));
|
||||
SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg",
|
||||
atomic_read(&cfs_rq->tg->usage_avg));
|
||||
#endif
|
||||
#endif
|
||||
#ifdef CONFIG_CFS_BANDWIDTH
|
||||
SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
|
||||
cfs_rq->tg->cfs_bandwidth.timer_active);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "throttled",
|
||||
cfs_rq->throttled);
|
||||
SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
|
||||
cfs_rq->throttle_count);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
print_cfs_group_stats(m, cpu, cfs_rq->tg);
|
||||
#endif
|
||||
}
|
||||
|
||||
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
|
||||
{
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
|
||||
#else
|
||||
SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
|
||||
#endif
|
||||
|
||||
#define P(x) \
|
||||
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
|
||||
#define PN(x) \
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
|
||||
|
||||
P(rt_nr_running);
|
||||
P(rt_throttled);
|
||||
PN(rt_time);
|
||||
PN(rt_runtime);
|
||||
|
||||
#undef PN
|
||||
#undef P
|
||||
}
|
||||
|
||||
extern __read_mostly int sched_clock_running;
|
||||
|
||||
static void print_cpu(struct seq_file *m, int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
unsigned long flags;
|
||||
|
||||
#ifdef CONFIG_X86
|
||||
{
|
||||
unsigned int freq = cpu_khz ? : 1;
|
||||
|
||||
SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
|
||||
cpu, freq / 1000, (freq % 1000));
|
||||
}
|
||||
#else
|
||||
SEQ_printf(m, "cpu#%d\n", cpu);
|
||||
#endif
|
||||
|
||||
#define P(x) \
|
||||
do { \
|
||||
if (sizeof(rq->x) == 4) \
|
||||
SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
|
||||
else \
|
||||
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
|
||||
} while (0)
|
||||
|
||||
#define PN(x) \
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
|
||||
|
||||
P(nr_running);
|
||||
SEQ_printf(m, " .%-30s: %lu\n", "load",
|
||||
rq->load.weight);
|
||||
P(nr_switches);
|
||||
P(nr_load_updates);
|
||||
P(nr_uninterruptible);
|
||||
PN(next_balance);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
|
||||
PN(clock);
|
||||
P(cpu_load[0]);
|
||||
P(cpu_load[1]);
|
||||
P(cpu_load[2]);
|
||||
P(cpu_load[3]);
|
||||
P(cpu_load[4]);
|
||||
#undef P
|
||||
#undef PN
|
||||
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
|
||||
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
|
||||
|
||||
P(yld_count);
|
||||
|
||||
P(sched_count);
|
||||
P(sched_goidle);
|
||||
#ifdef CONFIG_SMP
|
||||
P64(avg_idle);
|
||||
P64(max_idle_balance_cost);
|
||||
#endif
|
||||
|
||||
P(ttwu_count);
|
||||
P(ttwu_local);
|
||||
|
||||
#undef P
|
||||
#undef P64
|
||||
#endif
|
||||
spin_lock_irqsave(&sched_debug_lock, flags);
|
||||
print_cfs_stats(m, cpu);
|
||||
print_rt_stats(m, cpu);
|
||||
|
||||
print_rq(m, rq, cpu);
|
||||
spin_unlock_irqrestore(&sched_debug_lock, flags);
|
||||
SEQ_printf(m, "\n");
|
||||
}
|
||||
|
||||
/*
 * Human-readable names printed next to sysctl_sched_tunable_scaling in
 * the sched_debug header; the array is indexed by that sysctl's value.
 */
static const char *sched_tunable_scaling_names[] = {
	"none",
	"logarithmic",	/* was misspelled "logaritmic" */
	"linear"
};
|
||||
|
||||
static void sched_debug_header(struct seq_file *m)
|
||||
{
|
||||
u64 ktime, sched_clk, cpu_clk;
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
ktime = ktime_to_ns(ktime_get());
|
||||
sched_clk = sched_clock();
|
||||
cpu_clk = local_clock();
|
||||
local_irq_restore(flags);
|
||||
|
||||
SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
|
||||
init_utsname()->release,
|
||||
(int)strcspn(init_utsname()->version, " "),
|
||||
init_utsname()->version);
|
||||
|
||||
#define P(x) \
|
||||
SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
|
||||
#define PN(x) \
|
||||
SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
|
||||
PN(ktime);
|
||||
PN(sched_clk);
|
||||
PN(cpu_clk);
|
||||
P(jiffies);
|
||||
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
|
||||
P(sched_clock_stable());
|
||||
#endif
|
||||
#undef PN
|
||||
#undef P
|
||||
|
||||
SEQ_printf(m, "\n");
|
||||
SEQ_printf(m, "sysctl_sched\n");
|
||||
|
||||
#define P(x) \
|
||||
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
|
||||
#define PN(x) \
|
||||
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
|
||||
PN(sysctl_sched_latency);
|
||||
PN(sysctl_sched_min_granularity);
|
||||
PN(sysctl_sched_wakeup_granularity);
|
||||
P(sysctl_sched_child_runs_first);
|
||||
P(sysctl_sched_features);
|
||||
#undef PN
|
||||
#undef P
|
||||
|
||||
SEQ_printf(m, " .%-40s: %d (%s)\n",
|
||||
"sysctl_sched_tunable_scaling",
|
||||
sysctl_sched_tunable_scaling,
|
||||
sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
|
||||
SEQ_printf(m, "\n");
|
||||
}
|
||||
|
||||
/*
 * seq_file ->show callback. The iterator cookie @v encodes the position:
 * 1 selects the header, any other value is a cpu number plus 2.
 * Always returns 0.
 */
static int sched_debug_show(struct seq_file *m, void *v)
{
	int cpu = (unsigned long)v - 2;

	if (cpu == -1) {
		sched_debug_header(m);
		return 0;
	}

	print_cpu(m, cpu);
	return 0;
}
|
||||
|
||||
void sysrq_sched_debug_show(void)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
sched_debug_header(NULL);
|
||||
for_each_online_cpu(cpu)
|
||||
print_cpu(NULL, cpu);
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* This iterator needs some explanation.
|
||||
* It returns 1 for the header position.
|
||||
* This means 2 is cpu 0.
|
||||
* In a hotplugged system some cpus, including cpu 0, may be missing so we have
|
||||
* to use cpumask_* to iterate over the cpus.
|
||||
*/
|
||||
/*
 * seq_file ->start callback.
 *
 * Offset 0 yields the header cookie (1). Any other offset is mapped to
 * an online cpu: offset 1 picks the first online cpu, larger offsets
 * pick the next online cpu at or after (offset - 1). *offset is then
 * rewritten to that cpu + 1 and the cookie cpu + 2 is returned, or
 * NULL once the cpu number reaches nr_cpu_ids.
 */
static void *sched_debug_start(struct seq_file *file, loff_t *offset)
{
	unsigned long pos = *offset;
	unsigned long cpu;

	if (pos == 0)
		return (void *) 1;

	pos--;

	if (pos == 0)
		cpu = cpumask_first(cpu_online_mask);
	else
		cpu = cpumask_next(pos - 1, cpu_online_mask);

	*offset = cpu + 1;

	if (cpu >= nr_cpu_ids)
		return NULL;

	return (void *)(unsigned long)(cpu + 2);
}
|
||||
|
||||
/* seq_file ->next callback: bump the offset and reuse the ->start lookup. */
static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
{
	*offset += 1;

	return sched_debug_start(file, offset);
}
|
||||
|
||||
/* seq_file ->stop callback: intentionally empty, there is nothing to undo. */
static void sched_debug_stop(struct seq_file *file, void *data) { }
|
||||
|
||||
static const struct seq_operations sched_debug_sops = {
|
||||
.start = sched_debug_start,
|
||||
.next = sched_debug_next,
|
||||
.stop = sched_debug_stop,
|
||||
.show = sched_debug_show,
|
||||
};
|
||||
|
||||
/*
 * ->release handler: hand the file back to seq_release(). Its result is
 * not propagated; this handler always returns 0.
 */
static int sched_debug_release(struct inode *inode, struct file *file)
{
	(void)seq_release(inode, file);

	return 0;
}
|
||||
|
||||
static int sched_debug_open(struct inode *inode, struct file *filp)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
ret = seq_open(filp, &sched_debug_sops);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct file_operations sched_debug_fops = {
|
||||
.open = sched_debug_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = sched_debug_release,
|
||||
};
|
||||
|
||||
/*
 * Create the read-only (0444) "sched_debug" proc entry.
 * Returns 0 on success, -ENOMEM if proc_create() fails.
 */
static int __init init_sched_debug_procfs(void)
{
	if (!proc_create("sched_debug", 0444, NULL, &sched_debug_fops))
		return -ENOMEM;

	return 0;
}
|
||||
|
||||
__initcall(init_sched_debug_procfs);
|
||||
|
||||
/*
 * Per-task print helpers. The unprefixed forms read the field from the
 * task pointer 'p'; the __-prefixed forms take an arbitrary expression.
 * The *N variants split the value with SPLIT_NS() before printing.
 * All of them use the stringified argument as the label.
 */
#define __P(expr)	SEQ_printf(m, "%-45s:%21Ld\n", #expr, (long long)expr)
#define P(field)	SEQ_printf(m, "%-45s:%21Ld\n", #field, (long long)p->field)
#define __PN(expr)	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #expr, SPLIT_NS((long long)expr))
#define PN(field)	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #field, SPLIT_NS((long long)p->field))
|
||||
|
||||
|
||||
/*
 * Print NUMA balancing details for task @p; compiles to an empty
 * function without CONFIG_NUMA_BALANCING.
 *
 * Output: mm->numa_scan_seq (only when the task has an mm), the
 * migrated-page counter (which is reset to 0 by the xchg() that reads
 * it), and one "numa_faults_memory" line per online node for each of
 * the two per-node slots in p->numa_faults_memory.
 */
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
	struct mempolicy *policy;
	int nid, slot;

	if (p->mm)
		P(mm->numa_scan_seq);

	/*
	 * Grab the task's mempolicy under task_lock(); it is only kept
	 * when MPOL_F_MORON is set in its flags.
	 */
	task_lock(p);
	policy = p->mempolicy;
	if (policy && !(policy->flags & MPOL_F_MORON))
		policy = NULL;
	mpol_get(policy);
	task_unlock(p);

	SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));

	for_each_online_node(nid) {
		for (slot = 0; slot < 2; slot++) {
			/* Stays at -1 when the task has no fault array. */
			unsigned long faults = -1;
			int is_current, is_preferred;

			if (p->numa_faults_memory)
				faults = p->numa_faults_memory[2*nid + slot];

			/*
			 * Slot 0 compares against the node the task is on;
			 * slot 1 checks membership in the policy's node mask.
			 */
			if (!slot)
				is_current = (task_node(p) == nid);
			else
				is_current = (policy && node_isset(nid, policy->v.nodes));

			is_preferred = (p->numa_preferred_nid == nid);

			SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
				   slot, nid, is_current, is_preferred, faults);
		}
	}

	mpol_put(policy);
#endif
}
|
||||
|
||||
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
|
||||
{
|
||||
unsigned long nr_switches;
|
||||
|
||||
SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
|
||||
get_nr_threads(p));
|
||||
SEQ_printf(m,
|
||||
"---------------------------------------------------------"
|
||||
"----------\n");
|
||||
#define __P(F) \
|
||||
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
|
||||
#define P(F) \
|
||||
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
|
||||
#define __PN(F) \
|
||||
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
|
||||
#define PN(F) \
|
||||
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
|
||||
|
||||
PN(se.exec_start);
|
||||
PN(se.vruntime);
|
||||
PN(se.sum_exec_runtime);
|
||||
|
||||
nr_switches = p->nvcsw + p->nivcsw;
|
||||
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
PN(se.statistics.wait_start);
|
||||
PN(se.statistics.sleep_start);
|
||||
PN(se.statistics.block_start);
|
||||
PN(se.statistics.sleep_max);
|
||||
PN(se.statistics.block_max);
|
||||
PN(se.statistics.exec_max);
|
||||
PN(se.statistics.slice_max);
|
||||
PN(se.statistics.wait_max);
|
||||
PN(se.statistics.wait_sum);
|
||||
P(se.statistics.wait_count);
|
||||
PN(se.statistics.iowait_sum);
|
||||
P(se.statistics.iowait_count);
|
||||
P(se.nr_migrations);
|
||||
P(se.statistics.nr_migrations_cold);
|
||||
P(se.statistics.nr_failed_migrations_affine);
|
||||
P(se.statistics.nr_failed_migrations_running);
|
||||
P(se.statistics.nr_failed_migrations_hot);
|
||||
P(se.statistics.nr_forced_migrations);
|
||||
P(se.statistics.nr_wakeups);
|
||||
P(se.statistics.nr_wakeups_sync);
|
||||
P(se.statistics.nr_wakeups_migrate);
|
||||
P(se.statistics.nr_wakeups_local);
|
||||
P(se.statistics.nr_wakeups_remote);
|
||||
P(se.statistics.nr_wakeups_affine);
|
||||
P(se.statistics.nr_wakeups_affine_attempts);
|
||||
P(se.statistics.nr_wakeups_passive);
|
||||
P(se.statistics.nr_wakeups_idle);
|
||||
|
||||
{
|
||||
u64 avg_atom, avg_per_cpu;
|
||||
|
||||
avg_atom = p->se.sum_exec_runtime;
|
||||
if (nr_switches)
|
||||
avg_atom = div64_ul(avg_atom, nr_switches);
|
||||
else
|
||||
avg_atom = -1LL;
|
||||
|
||||
avg_per_cpu = p->se.sum_exec_runtime;
|
||||
if (p->se.nr_migrations) {
|
||||
avg_per_cpu = div64_u64(avg_per_cpu,
|
||||
p->se.nr_migrations);
|
||||
} else {
|
||||
avg_per_cpu = -1LL;
|
||||
}
|
||||
|
||||
__PN(avg_atom);
|
||||
__PN(avg_per_cpu);
|
||||
}
|
||||
#endif
|
||||
__P(nr_switches);
|
||||
SEQ_printf(m, "%-45s:%21Ld\n",
|
||||
"nr_voluntary_switches", (long long)p->nvcsw);
|
||||
SEQ_printf(m, "%-45s:%21Ld\n",
|
||||
"nr_involuntary_switches", (long long)p->nivcsw);
|
||||
|
||||
P(se.load.weight);
|
||||
#ifdef CONFIG_SMP
|
||||
P(se.avg.runnable_avg_sum);
|
||||
P(se.avg.runnable_avg_period);
|
||||
P(se.avg.load_avg_contrib);
|
||||
P(se.avg.decay_count);
|
||||
#endif
|
||||
P(policy);
|
||||
P(prio);
|
||||
#undef PN
|
||||
#undef __PN
|
||||
#undef P
|
||||
#undef __P
|
||||
|
||||
{
|
||||
unsigned int this_cpu = raw_smp_processor_id();
|
||||
u64 t0, t1;
|
||||
|
||||
t0 = cpu_clock(this_cpu);
|
||||
t1 = cpu_clock(this_cpu);
|
||||
SEQ_printf(m, "%-45s:%21Ld\n",
|
||||
"clock-delta", (long long)(t1-t0));
|
||||
}
|
||||
|
||||
sched_show_numa(p, m);
|
||||
}
|
||||
|
||||
/*
 * Zero the task's schedstat block (p->se.statistics). Does nothing
 * when CONFIG_SCHEDSTATS is not set.
 */
void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
}
|
||||
9976
kernel/sched/fair.c
Normal file
9976
kernel/sched/fair.c
Normal file
File diff suppressed because it is too large
Load diff
85
kernel/sched/features.h
Normal file
85
kernel/sched/features.h
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
/*
|
||||
* Only give sleepers 50% of their service deficit. This allows
|
||||
* them to run sooner, but does not allow tons of sleepers to
|
||||
* rip the spread apart.
|
||||
*/
|
||||
SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
|
||||
|
||||
/*
|
||||
* Place new tasks ahead so that they do not starve already running
|
||||
* tasks
|
||||
*/
|
||||
SCHED_FEAT(START_DEBIT, true)
|
||||
|
||||
/*
|
||||
* Prefer to schedule the task we woke last (assuming it failed
|
||||
* wakeup-preemption), since it's likely going to consume data we
|
||||
* touched, increases cache locality.
|
||||
*/
|
||||
SCHED_FEAT(NEXT_BUDDY, false)
|
||||
|
||||
/*
|
||||
* Prefer to schedule the task that ran last (when we did
|
||||
* wake-preempt) as that likely will touch the same data, increases
|
||||
* cache locality.
|
||||
*/
|
||||
SCHED_FEAT(LAST_BUDDY, true)
|
||||
|
||||
/*
|
||||
* Consider buddies to be cache hot, decreases the likelihood of a
|
||||
* cache buddy being migrated away, increases cache locality.
|
||||
*/
|
||||
SCHED_FEAT(CACHE_HOT_BUDDY, true)
|
||||
|
||||
/*
|
||||
* Allow wakeup-time preemption of the current task:
|
||||
*/
|
||||
SCHED_FEAT(WAKEUP_PREEMPTION, true)
|
||||
|
||||
/*
|
||||
* Use arch dependent cpu capacity functions
|
||||
*/
|
||||
SCHED_FEAT(ARCH_CAPACITY, true)
|
||||
|
||||
SCHED_FEAT(HRTICK, false)
|
||||
SCHED_FEAT(DOUBLE_TICK, false)
|
||||
SCHED_FEAT(LB_BIAS, true)
|
||||
|
||||
/*
|
||||
* Decrement CPU capacity based on time not spent running tasks
|
||||
*/
|
||||
SCHED_FEAT(NONTASK_CAPACITY, true)
|
||||
|
||||
/*
|
||||
* Queue remote wakeups on the target CPU and process them
|
||||
* using the scheduler IPI. Reduces rq->lock contention/bounces.
|
||||
*/
|
||||
SCHED_FEAT(TTWU_QUEUE, true)
|
||||
|
||||
SCHED_FEAT(FORCE_SD_OVERLAP, false)
|
||||
SCHED_FEAT(RT_RUNTIME_SHARE, true)
|
||||
SCHED_FEAT(LB_MIN, false)
|
||||
|
||||
/*
|
||||
* Apply the automatic NUMA scheduling policy. Enabled automatically
|
||||
* at runtime if running on a NUMA machine. Can be controlled via
|
||||
* numa_balancing=
|
||||
*/
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
SCHED_FEAT(NUMA, false)
|
||||
|
||||
/*
|
||||
* NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
|
||||
* higher number of hinting faults are recorded during active load
|
||||
* balancing.
|
||||
*/
|
||||
SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
|
||||
|
||||
/*
|
||||
* NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
|
||||
* lower number of hinting faults have been recorded. As this has
|
||||
* the potential to prevent a task ever migrating to a new node
|
||||
* due to CPU overload it is disabled by default.
|
||||
*/
|
||||
SCHED_FEAT(NUMA_RESIST_LOWER, false)
|
||||
#endif
|
||||
275
kernel/sched/idle.c
Normal file
275
kernel/sched/idle.c
Normal file
|
|
@ -0,0 +1,275 @@
|
|||
/*
|
||||
* Generic entry point for the idle threads
|
||||
*/
|
||||
#include <linux/sched.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpuidle.h>
|
||||
#include <linux/tick.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/stackprotector.h>
|
||||
|
||||
#include <asm/tlb.h>
|
||||
|
||||
#include <trace/events/power.h>
|
||||
|
||||
#include "sched.h"
|
||||
|
||||
static int __read_mostly cpu_idle_force_poll;
|
||||
|
||||
/*
 * Adjust the forced-poll counter: @enable increments
 * cpu_idle_force_poll, otherwise it is decremented and a one-time
 * warning fires if it drops below zero.
 */
void cpu_idle_poll_ctrl(bool enable)
{
	if (enable) {
		cpu_idle_force_poll++;
		return;
	}

	cpu_idle_force_poll--;
	WARN_ON_ONCE(cpu_idle_force_poll < 0);
}
|
||||
|
||||
#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
|
||||
static int __init cpu_idle_poll_setup(char *__unused)
|
||||
{
|
||||
cpu_idle_force_poll = 1;
|
||||
return 1;
|
||||
}
|
||||
__setup("nohlt", cpu_idle_poll_setup);
|
||||
|
||||
static int __init cpu_idle_nopoll_setup(char *__unused)
|
||||
{
|
||||
cpu_idle_force_poll = 0;
|
||||
return 1;
|
||||
}
|
||||
__setup("hlt", cpu_idle_nopoll_setup);
|
||||
#endif
|
||||
|
||||
static inline int cpu_idle_poll(void)
|
||||
{
|
||||
rcu_idle_enter();
|
||||
trace_cpu_idle_rcuidle(0, smp_processor_id());
|
||||
local_irq_enable();
|
||||
while (!tif_need_resched())
|
||||
cpu_relax();
|
||||
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
|
||||
rcu_idle_exit();
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Weak implementations for optional arch specific functions */
|
||||
void __weak arch_cpu_idle_prepare(void) { }
|
||||
void __weak arch_cpu_idle_enter(void) { }
|
||||
void __weak arch_cpu_idle_exit(void) { }
|
||||
void __weak arch_cpu_idle_dead(void) { }
|
||||
/*
 * Default arch idle routine, used when the architecture provides none:
 * switch the idle loop to forced polling and re-enable interrupts.
 */
void __weak arch_cpu_idle(void)
{
	cpu_idle_force_poll = 1;

	local_irq_enable();
}
|
||||
|
||||
/**
|
||||
* cpuidle_idle_call - the main idle function
|
||||
*
|
||||
* NOTE: no locks or semaphores should be used here
|
||||
*
|
||||
* On archs that support TIF_POLLING_NRFLAG, is called with polling
|
||||
* set, and it returns with polling set. If it ever stops polling, it
|
||||
* must clear the polling bit.
|
||||
*/
|
||||
static void cpuidle_idle_call(void)
|
||||
{
|
||||
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
|
||||
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
|
||||
int next_state, entered_state;
|
||||
unsigned int broadcast;
|
||||
|
||||
/*
|
||||
* Check if the idle task must be rescheduled. If it is the
|
||||
* case, exit the function after re-enabling the local irq.
|
||||
*/
|
||||
if (need_resched()) {
|
||||
local_irq_enable();
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* During the idle period, stop measuring the disabled irqs
|
||||
* critical sections latencies
|
||||
*/
|
||||
stop_critical_timings();
|
||||
|
||||
/*
|
||||
* Tell the RCU framework we are entering an idle section,
|
||||
* so no more rcu read side critical sections and one more
|
||||
* step to the grace period
|
||||
*/
|
||||
rcu_idle_enter();
|
||||
|
||||
/*
|
||||
* Ask the cpuidle framework to choose a convenient idle state.
|
||||
* Fall back to the default arch idle method on errors.
|
||||
*/
|
||||
next_state = cpuidle_select(drv, dev);
|
||||
if (next_state < 0) {
|
||||
use_default:
|
||||
/*
|
||||
* We can't use the cpuidle framework, let's use the default
|
||||
* idle routine.
|
||||
*/
|
||||
if (current_clr_polling_and_test())
|
||||
local_irq_enable();
|
||||
else
|
||||
arch_cpu_idle();
|
||||
|
||||
goto exit_idle;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* The idle task must be scheduled, it is pointless to
|
||||
* go to idle, just update no idle residency and get
|
||||
* out of this function
|
||||
*/
|
||||
if (current_clr_polling_and_test()) {
|
||||
dev->last_residency = 0;
|
||||
entered_state = next_state;
|
||||
local_irq_enable();
|
||||
goto exit_idle;
|
||||
}
|
||||
|
||||
broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
|
||||
|
||||
/*
|
||||
* Tell the time framework to switch to a broadcast timer
|
||||
* because our local timer will be shutdown. If a local timer
|
||||
* is used from another cpu as a broadcast timer, this call may
|
||||
* fail if it is not available
|
||||
*/
|
||||
if (broadcast &&
|
||||
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
|
||||
goto use_default;
|
||||
|
||||
/* Take note of the planned idle state. */
|
||||
idle_set_state(this_rq(), &drv->states[next_state]);
|
||||
|
||||
/*
|
||||
* Enter the idle state previously returned by the governor decision.
|
||||
* This function will block until an interrupt occurs and will take
|
||||
* care of re-enabling the local interrupts
|
||||
*/
|
||||
entered_state = cpuidle_enter(drv, dev, next_state);
|
||||
|
||||
/* The cpu is no longer idle or about to enter idle. */
|
||||
idle_set_state(this_rq(), NULL);
|
||||
|
||||
if (broadcast)
|
||||
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
|
||||
|
||||
/*
|
||||
* Give the governor an opportunity to reflect on the outcome
|
||||
*/
|
||||
cpuidle_reflect(dev, entered_state);
|
||||
|
||||
exit_idle:
|
||||
__current_set_polling();
|
||||
|
||||
/*
|
||||
* It is up to the idle functions to reenable local interrupts
|
||||
*/
|
||||
if (WARN_ON_ONCE(irqs_disabled()))
|
||||
local_irq_enable();
|
||||
|
||||
rcu_idle_exit();
|
||||
start_critical_timings();
|
||||
}
|
||||
|
||||
/*
|
||||
* Generic idle loop implementation
|
||||
*
|
||||
* Called with polling cleared.
|
||||
*/
|
||||
static void cpu_idle_loop(void)
|
||||
{
|
||||
while (1) {
|
||||
/*
|
||||
* If the arch has a polling bit, we maintain an invariant:
|
||||
*
|
||||
* Our polling bit is clear if we're not scheduled (i.e. if
|
||||
* rq->curr != rq->idle). This means that, if rq->idle has
|
||||
* the polling bit set, then setting need_resched is
|
||||
* guaranteed to cause the cpu to reschedule.
|
||||
*/
|
||||
|
||||
__current_set_polling();
|
||||
tick_nohz_idle_enter();
|
||||
|
||||
while (!need_resched()) {
|
||||
check_pgt_cache();
|
||||
rmb();
|
||||
|
||||
if (cpu_is_offline(smp_processor_id()))
|
||||
arch_cpu_idle_dead();
|
||||
|
||||
local_irq_disable();
|
||||
arch_cpu_idle_enter();
|
||||
|
||||
/*
|
||||
* In poll mode we reenable interrupts and spin.
|
||||
*
|
||||
* Also if we detected in the wakeup from idle
|
||||
* path that the tick broadcast device expired
|
||||
* for us, we don't want to go deep idle as we
|
||||
* know that the IPI is going to arrive right
|
||||
* away
|
||||
*/
|
||||
if (cpu_idle_force_poll || tick_check_broadcast_expired())
|
||||
cpu_idle_poll();
|
||||
else
|
||||
cpuidle_idle_call();
|
||||
|
||||
arch_cpu_idle_exit();
|
||||
}
|
||||
|
||||
/*
|
||||
* Since we fell out of the loop above, we know
|
||||
* TIF_NEED_RESCHED must be set, propagate it into
|
||||
* PREEMPT_NEED_RESCHED.
|
||||
*
|
||||
* This is required because for polling idle loops we will
|
||||
* not have had an IPI to fold the state for us.
|
||||
*/
|
||||
preempt_set_need_resched();
|
||||
tick_nohz_idle_exit();
|
||||
__current_clr_polling();
|
||||
|
||||
/*
|
||||
* We promise to call sched_ttwu_pending and reschedule
|
||||
* if need_resched is set while polling is set. That
|
||||
* means that clearing polling needs to be visible
|
||||
* before doing these things.
|
||||
*/
|
||||
smp_mb__after_atomic();
|
||||
|
||||
sched_ttwu_pending();
|
||||
schedule_preempt_disabled();
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Entry point into the idle loop: run the arch prepare hook and then
 * cpu_idle_loop(). @state is not used by this function.
 */
void cpu_startup_entry(enum cpuhp_state state)
{
	/*
	 * This #ifdef needs to die, but it's too late in the cycle to
	 * make this generic (arm and sh have never invoked the canary
	 * init for the non boot cpus!). Will be fixed in 3.11
	 */
#ifdef CONFIG_X86
	/*
	 * A non-boot CPU arrives here without a stack canary. The boot
	 * CPU already has one, but initializing it again is harmless.
	 * This is a good place to update it: we won't ever return from
	 * this function, so the stale canaries already on the stack
	 * won't ever trigger.
	 */
	boot_init_stack_canary();
#endif

	arch_cpu_idle_prepare();
	cpu_idle_loop();
}
|
||||
109
kernel/sched/idle_task.c
Normal file
109
kernel/sched/idle_task.c
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
#include "sched.h"
|
||||
|
||||
/*
|
||||
* idle-task scheduling class.
|
||||
*
|
||||
* (NOTE: these are not related to SCHED_IDLE tasks which are
|
||||
* handled in sched/fair.c)
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/* Idle tasks are never migrated: always answer with the task's current cpu. */
static int
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	return task_cpu(p);
}
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
/*
|
||||
* Idle tasks are unconditionally rescheduled:
|
||||
*/
|
||||
/* Whatever @p is, request a reschedule of the runqueue's current task. */
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
	resched_curr(rq);
}
|
||||
|
||||
static struct task_struct *
|
||||
pick_next_task_idle(struct rq *rq, struct task_struct *prev)
|
||||
{
|
||||
put_prev_task(rq, prev);
|
||||
|
||||
schedstat_inc(rq, sched_goidle);
|
||||
return rq->idle;
|
||||
}
|
||||
|
||||
/*
|
||||
* It is not legal to sleep in the idle task - print a warning
|
||||
* message if some code attempts to do it:
|
||||
*/
|
||||
static void
|
||||
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
raw_spin_unlock_irq(&rq->lock);
|
||||
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
|
||||
dump_stack();
|
||||
raw_spin_lock_irq(&rq->lock);
|
||||
}
|
||||
|
||||
/* The idle task is being switched out: call idle_exit_fair() and reset the rq's last-tick state. */
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
	idle_exit_fair(rq);

	rq_last_tick_reset(rq);
}
|
||||
|
||||
/* Tick hook for the idle class: nothing to do. */
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { }
|
||||
|
||||
/* set_curr_task hook for the idle class: nothing to do. */
static void set_curr_task_idle(struct rq *rq) { }
|
||||
|
||||
/* Reaching this hook is treated as a fatal error: it unconditionally BUG()s. */
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
	BUG();
}
|
||||
|
||||
/* Reaching this hook is treated as a fatal error: it unconditionally BUG()s. */
static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
	BUG();
}
|
||||
|
||||
/* Round-robin interval query for the idle class: always reports 0. */
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
	const unsigned int interval = 0;

	return interval;
}
|
||||
|
||||
/* update_curr hook for the idle class: nothing to do. */
static void update_curr_idle(struct rq *rq) { }
|
||||
|
||||
/*
|
||||
* Simple, special scheduling class for the per-CPU idle tasks:
|
||||
*/
|
||||
const struct sched_class idle_sched_class = {
|
||||
/* .next is NULL */
|
||||
/* no enqueue/yield_task for idle tasks */
|
||||
|
||||
/* dequeue is not valid, we print a debug message there: */
|
||||
.dequeue_task = dequeue_task_idle,
|
||||
|
||||
.check_preempt_curr = check_preempt_curr_idle,
|
||||
|
||||
.pick_next_task = pick_next_task_idle,
|
||||
.put_prev_task = put_prev_task_idle,
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
.select_task_rq = select_task_rq_idle,
|
||||
#endif
|
||||
|
||||
.set_curr_task = set_curr_task_idle,
|
||||
.task_tick = task_tick_idle,
|
||||
|
||||
.get_rr_interval = get_rr_interval_idle,
|
||||
|
||||
.prio_changed = prio_changed_idle,
|
||||
.switched_to = switched_to_idle,
|
||||
.update_curr = update_curr_idle,
|
||||
};
|
||||
584
kernel/sched/proc.c
Normal file
584
kernel/sched/proc.c
Normal file
|
|
@ -0,0 +1,584 @@
|
|||
/*
|
||||
* kernel/sched/proc.c
|
||||
*
|
||||
* Kernel load calculations, forked from sched/core.c
|
||||
*/
|
||||
|
||||
#include <linux/export.h>
|
||||
|
||||
#include "sched.h"
|
||||
|
||||
/*
|
||||
* Global load-average calculations
|
||||
*
|
||||
* We take a distributed and async approach to calculating the global load-avg
|
||||
* in order to minimize overhead.
|
||||
*
|
||||
* The global load average is an exponentially decaying average of nr_running +
|
||||
* nr_uninterruptible.
|
||||
*
|
||||
* Once every LOAD_FREQ:
|
||||
*
|
||||
* nr_active = 0;
|
||||
* for_each_possible_cpu(cpu)
|
||||
* nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible;
|
||||
*
|
||||
* avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
|
||||
*
|
||||
* Due to a number of reasons the above turns in the mess below:
|
||||
*
|
||||
* - for_each_possible_cpu() is prohibitively expensive on machines with
|
||||
* serious number of cpus, therefore we need to take a distributed approach
|
||||
* to calculating nr_active.
|
||||
*
|
||||
* \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
|
||||
* = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
|
||||
*
|
||||
* So assuming nr_active := 0 when we start out -- true per definition, we
|
||||
* can simply take per-cpu deltas and fold those into a global accumulate
|
||||
* to obtain the same result. See calc_load_fold_active().
|
||||
*
|
||||
* Furthermore, in order to avoid synchronizing all per-cpu delta folding
|
||||
* across the machine, we assume 10 ticks is sufficient time for every
|
||||
* cpu to have completed this task.
|
||||
*
|
||||
* This places an upper-bound on the IRQ-off latency of the machine. Then
|
||||
* again, being late doesn't lose the delta, just wrecks the sample.
|
||||
*
|
||||
* - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
|
||||
* this would add another cross-cpu cacheline miss and atomic operation
|
||||
* to the wakeup path. Instead we increment on whatever cpu the task ran
|
||||
* when it went into uninterruptible state and decrement on whatever cpu
|
||||
* did the wakeup. This means that only the sum of nr_uninterruptible over
|
||||
* all cpus yields the correct result.
|
||||
*
|
||||
* This covers the NO_HZ=n code, for extra head-aches, see the comment below.
|
||||
*/
|
||||
|
||||
/* Variables and functions for calc_load */
|
||||
atomic_long_t calc_load_tasks;
|
||||
unsigned long calc_load_update;
|
||||
unsigned long avenrun[3];
|
||||
EXPORT_SYMBOL(avenrun); /* should be removed */
|
||||
|
||||
/**
|
||||
* get_avenrun - get the load average array
|
||||
* @loads: pointer to dest load array
|
||||
* @offset: offset to add
|
||||
* @shift: shift count to shift the result left
|
||||
*
|
||||
* These values are estimates at best, so no need for locking.
|
||||
*/
|
||||
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
|
||||
{
|
||||
loads[0] = (avenrun[0] + offset) << shift;
|
||||
loads[1] = (avenrun[1] + offset) << shift;
|
||||
loads[2] = (avenrun[2] + offset) << shift;
|
||||
}
|
||||
|
||||
long calc_load_fold_active(struct rq *this_rq)
|
||||
{
|
||||
long nr_active, delta = 0;
|
||||
|
||||
nr_active = this_rq->nr_running;
|
||||
nr_active += (long) this_rq->nr_uninterruptible;
|
||||
|
||||
if (nr_active != this_rq->calc_load_active) {
|
||||
delta = nr_active - this_rq->calc_load_active;
|
||||
this_rq->calc_load_active = nr_active;
|
||||
}
|
||||
|
||||
return delta;
|
||||
}
|
||||
|
||||
/*
|
||||
* a1 = a0 * e + a * (1 - e)
|
||||
*/
|
||||
static unsigned long
|
||||
calc_load(unsigned long load, unsigned long exp, unsigned long active)
|
||||
{
|
||||
load *= exp;
|
||||
load += active * (FIXED_1 - exp);
|
||||
load += 1UL << (FSHIFT - 1);
|
||||
return load >> FSHIFT;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
/*
|
||||
* Handle NO_HZ for the global load-average.
|
||||
*
|
||||
* Since the above described distributed algorithm to compute the global
|
||||
* load-average relies on per-cpu sampling from the tick, it is affected by
|
||||
* NO_HZ.
|
||||
*
|
||||
* The basic idea is to fold the nr_active delta into a global idle-delta upon
|
||||
* entering NO_HZ state such that we can include this as an 'extra' cpu delta
|
||||
* when we read the global state.
|
||||
*
|
||||
* Obviously reality has to ruin such a delightfully simple scheme:
|
||||
*
|
||||
* - When we go NO_HZ idle during the window, we can negate our sample
|
||||
* contribution, causing under-accounting.
|
||||
*
|
||||
* We avoid this by keeping two idle-delta counters and flipping them
|
||||
* when the window starts, thus separating old and new NO_HZ load.
|
||||
*
|
||||
* The only trick is the slight shift in index flip for read vs write.
|
||||
*
|
||||
* 0s 5s 10s 15s
|
||||
* +10 +10 +10 +10
|
||||
* |-|-----------|-|-----------|-|-----------|-|
|
||||
* r:0 0 1 1 0 0 1 1 0
|
||||
* w:0 1 1 0 0 1 1 0 0
|
||||
*
|
||||
* This ensures we'll fold the old idle contribution in this window while
|
||||
* accumulating the new one.
|
||||
*
|
||||
* - When we wake up from NO_HZ idle during the window, we push up our
|
||||
* contribution, since we effectively move our sample point to a known
|
||||
* busy state.
|
||||
*
|
||||
* This is solved by pushing the window forward, and thus skipping the
|
||||
* sample, for this cpu (effectively using the idle-delta for this cpu which
|
||||
* was in effect at the time the window opened). This also solves the issue
|
||||
* of having to deal with a cpu having been in NOHZ idle for multiple
|
||||
* LOAD_FREQ intervals.
|
||||
*
|
||||
* When making the ILB scale, we should try to pull this in as well.
|
||||
*/
|
||||
static atomic_long_t calc_load_idle[2];
|
||||
static int calc_load_idx;
|
||||
|
||||
static inline int calc_load_write_idx(void)
|
||||
{
|
||||
int idx = calc_load_idx;
|
||||
|
||||
/*
|
||||
* See calc_global_nohz(), if we observe the new index, we also
|
||||
* need to observe the new update time.
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
/*
|
||||
* If the folding window started, make sure we start writing in the
|
||||
* next idle-delta.
|
||||
*/
|
||||
if (!time_before(jiffies, calc_load_update))
|
||||
idx++;
|
||||
|
||||
return idx & 1;
|
||||
}
|
||||
|
||||
static inline int calc_load_read_idx(void)
|
||||
{
|
||||
return calc_load_idx & 1;
|
||||
}
|
||||
|
||||
void calc_load_enter_idle(void)
|
||||
{
|
||||
struct rq *this_rq = this_rq();
|
||||
long delta;
|
||||
|
||||
/*
|
||||
* We're going into NOHZ mode, if there's any pending delta, fold it
|
||||
* into the pending idle delta.
|
||||
*/
|
||||
delta = calc_load_fold_active(this_rq);
|
||||
if (delta) {
|
||||
int idx = calc_load_write_idx();
|
||||
atomic_long_add(delta, &calc_load_idle[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
void calc_load_exit_idle(void)
|
||||
{
|
||||
struct rq *this_rq = this_rq();
|
||||
|
||||
/*
|
||||
* If we're still before the sample window, we're done.
|
||||
*/
|
||||
if (time_before(jiffies, this_rq->calc_load_update))
|
||||
return;
|
||||
|
||||
/*
|
||||
* We woke inside or after the sample window, this means we're already
|
||||
* accounted through the nohz accounting, so skip the entire deal and
|
||||
* sync up for the next window.
|
||||
*/
|
||||
this_rq->calc_load_update = calc_load_update;
|
||||
if (time_before(jiffies, this_rq->calc_load_update + 10))
|
||||
this_rq->calc_load_update += LOAD_FREQ;
|
||||
}
|
||||
|
||||
static long calc_load_fold_idle(void)
|
||||
{
|
||||
int idx = calc_load_read_idx();
|
||||
long delta = 0;
|
||||
|
||||
if (atomic_long_read(&calc_load_idle[idx]))
|
||||
delta = atomic_long_xchg(&calc_load_idle[idx], 0);
|
||||
|
||||
return delta;
|
||||
}
|
||||
|
||||
/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power, as a fixed-point number
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * Binary exponentiation: writing n as \Sum n_i * 2^i with n_i in {0, 1}
 * gives x^n = \Prod x^(n_i * 2^i). We walk the bits of n from the least
 * significant end, squaring x at every step and multiplying it into the
 * result whenever the current bit is set, so the loop runs once per bit
 * of n. Every product is rounded to nearest before being scaled back by
 * @frac_bits.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	/* 1.0 in the fixed-point format; this is also the answer for n == 0. */
	unsigned long acc = 1UL << frac_bits;

	while (n) {
		if (n & 1) {
			/* acc *= x, rounded to nearest */
			acc = (acc * x + (1UL << (frac_bits - 1))) >> frac_bits;
		}

		n >>= 1;
		if (!n)
			break;

		/* x *= x, rounded to nearest, ready for the next bit */
		x = (x * x + (1UL << (frac_bits - 1))) >> frac_bits;
	}

	return acc;
}
|
||||
|
||||
/*
|
||||
* a1 = a0 * e + a * (1 - e)
|
||||
*
|
||||
* a2 = a1 * e + a * (1 - e)
|
||||
* = (a0 * e + a * (1 - e)) * e + a * (1 - e)
|
||||
* = a0 * e^2 + a * (1 - e) * (1 + e)
|
||||
*
|
||||
* a3 = a2 * e + a * (1 - e)
|
||||
* = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
|
||||
* = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
|
||||
*
|
||||
* ...
|
||||
*
|
||||
* an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
|
||||
* = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
|
||||
* = a0 * e^n + a * (1 - e^n)
|
||||
*
|
||||
* [1] application of the geometric series:
|
||||
*
|
||||
*              n         1 - x^(n+1)
*     S_n := \Sum x^i = -------------
*             i=0          1 - x
|
||||
*/
|
||||
static unsigned long
|
||||
calc_load_n(unsigned long load, unsigned long exp,
|
||||
unsigned long active, unsigned int n)
|
||||
{
|
||||
|
||||
return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
|
||||
}
|
||||
|
||||
/*
|
||||
* NO_HZ can leave us missing all per-cpu ticks calling
|
||||
* calc_load_account_active(), but since an idle CPU folds its delta into
|
||||
* calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
|
||||
* in the pending idle delta if our idle period crossed a load cycle boundary.
|
||||
*
|
||||
* Once we've updated the global active value, we need to apply the exponential
|
||||
* weights adjusted to the number of cycles missed.
|
||||
*/
|
||||
static void calc_global_nohz(void)
|
||||
{
|
||||
long delta, active, n;
|
||||
|
||||
if (!time_before(jiffies, calc_load_update + 10)) {
|
||||
/*
|
||||
* Catch-up, fold however many we are behind still
|
||||
*/
|
||||
delta = jiffies - calc_load_update - 10;
|
||||
n = 1 + (delta / LOAD_FREQ);
|
||||
|
||||
active = atomic_long_read(&calc_load_tasks);
|
||||
active = active > 0 ? active * FIXED_1 : 0;
|
||||
|
||||
avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
|
||||
avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
|
||||
avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
|
||||
|
||||
calc_load_update += n * LOAD_FREQ;
|
||||
}
|
||||
|
||||
/*
|
||||
* Flip the idle index...
|
||||
*
|
||||
* Make sure we first write the new time then flip the index, so that
|
||||
* calc_load_write_idx() will see the new time when it reads the new
|
||||
* index, this avoids a double flip messing things up.
|
||||
*/
|
||||
smp_wmb();
|
||||
calc_load_idx++;
|
||||
}
|
||||
#else /* !CONFIG_NO_HZ_COMMON */
|
||||
|
||||
/* Without NO_HZ there is never an idle delta to fold. */
static inline long calc_load_fold_idle(void)
{
	return 0;
}

/* Without NO_HZ there are no missed cycles to catch up on. */
static inline void calc_global_nohz(void)
{
}
|
||||
|
||||
#endif /* CONFIG_NO_HZ_COMMON */
|
||||
|
||||
/*
|
||||
* calc_load - update the avenrun load estimates 10 ticks after the
|
||||
* CPUs have updated calc_load_tasks.
|
||||
*/
|
||||
void calc_global_load(unsigned long ticks)
|
||||
{
|
||||
long active, delta;
|
||||
|
||||
if (time_before(jiffies, calc_load_update + 10))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Fold the 'old' idle-delta to include all NO_HZ cpus.
|
||||
*/
|
||||
delta = calc_load_fold_idle();
|
||||
if (delta)
|
||||
atomic_long_add(delta, &calc_load_tasks);
|
||||
|
||||
active = atomic_long_read(&calc_load_tasks);
|
||||
active = active > 0 ? active * FIXED_1 : 0;
|
||||
|
||||
avenrun[0] = calc_load(avenrun[0], EXP_1, active);
|
||||
avenrun[1] = calc_load(avenrun[1], EXP_5, active);
|
||||
avenrun[2] = calc_load(avenrun[2], EXP_15, active);
|
||||
|
||||
calc_load_update += LOAD_FREQ;
|
||||
|
||||
/*
|
||||
* In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
|
||||
*/
|
||||
calc_global_nohz();
|
||||
}
|
||||
|
||||
/*
|
||||
* Called from update_cpu_load() to periodically update this CPU's
|
||||
* active count.
|
||||
*/
|
||||
static void calc_load_account_active(struct rq *this_rq)
|
||||
{
|
||||
long delta;
|
||||
|
||||
if (time_before(jiffies, this_rq->calc_load_update))
|
||||
return;
|
||||
|
||||
delta = calc_load_fold_active(this_rq);
|
||||
if (delta)
|
||||
atomic_long_add(delta, &calc_load_tasks);
|
||||
|
||||
this_rq->calc_load_update += LOAD_FREQ;
|
||||
}
|
||||
|
||||
/*
|
||||
* End of global load-average stuff
|
||||
*/
|
||||
|
||||
/*
|
||||
* The exact cpuload at various idx values, calculated at every tick would be
|
||||
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
|
||||
*
|
||||
* If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
|
||||
* on nth tick when cpu may be busy, then we have:
|
||||
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
||||
* load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
|
||||
*
|
||||
* decay_load_missed() below does efficient calculation of
|
||||
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
||||
* avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
|
||||
*
|
||||
* The calculation is approximated on a 128 point scale.
|
||||
* degrade_zero_ticks is the number of ticks after which load at any
|
||||
* particular idx is approximated to be zero.
|
||||
* degrade_factor is a precomputed table, a row for each load idx.
|
||||
* Each column corresponds to degradation factor for a power of two ticks,
|
||||
* based on 128 point scale.
|
||||
* Example:
|
||||
* row 2, col 3 (=12) says that the degradation at load idx 2 after
|
||||
* 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
|
||||
*
|
||||
* With this power of 2 load factors, we can degrade the load n times
|
||||
* by looking at 1 bits in n and doing as many mult/shift instead of
|
||||
* n mult/shifts needed by the exact degradation.
|
||||
*/
|
||||
#define DEGRADE_SHIFT 7
|
||||
static const unsigned char
|
||||
degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
|
||||
static const unsigned char
|
||||
degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
{64, 32, 8, 0, 0, 0, 0, 0},
|
||||
{96, 72, 40, 12, 1, 0, 0},
|
||||
{112, 98, 75, 43, 15, 1, 0},
|
||||
{120, 112, 98, 76, 45, 16, 2} };
|
||||
|
||||
/*
|
||||
* Update cpu_load for any missed ticks, due to tickless idle. The backlog
|
||||
* would be when CPU is idle and so we just decay the old load without
|
||||
* adding any new load.
|
||||
*/
|
||||
static unsigned long
|
||||
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
|
||||
{
|
||||
int j = 0;
|
||||
|
||||
if (!missed_updates)
|
||||
return load;
|
||||
|
||||
if (missed_updates >= degrade_zero_ticks[idx])
|
||||
return 0;
|
||||
|
||||
if (idx == 1)
|
||||
return load >> missed_updates;
|
||||
|
||||
while (missed_updates) {
|
||||
if (missed_updates % 2)
|
||||
load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
|
||||
|
||||
missed_updates >>= 1;
|
||||
j++;
|
||||
}
|
||||
return load;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update rq->cpu_load[] statistics. This function is usually called every
|
||||
* scheduler tick (TICK_NSEC). With tickless idle this will not be called
|
||||
* every tick. We fix it up based on jiffies.
|
||||
*/
|
||||
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
|
||||
unsigned long pending_updates)
|
||||
{
|
||||
int i, scale;
|
||||
|
||||
this_rq->nr_load_updates++;
|
||||
|
||||
/* Update our load: */
|
||||
this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
|
||||
for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
|
||||
unsigned long old_load, new_load;
|
||||
|
||||
/* scale is effectively 1 << i now, and >> i divides by scale */
|
||||
|
||||
old_load = this_rq->cpu_load[i];
|
||||
old_load = decay_load_missed(old_load, pending_updates - 1, i);
|
||||
new_load = this_load;
|
||||
/*
|
||||
* Round up the averaging division if load is increasing. This
|
||||
* prevents us from getting stuck on 9 if the load is 10, for
|
||||
* example.
|
||||
*/
|
||||
if (new_load > old_load)
|
||||
new_load += scale - 1;
|
||||
|
||||
this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
|
||||
}
|
||||
|
||||
sched_avg_update(this_rq);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static inline unsigned long get_rq_runnable_load(struct rq *rq)
|
||||
{
|
||||
return rq->cfs.runnable_load_avg;
|
||||
}
|
||||
#else
|
||||
static inline unsigned long get_rq_runnable_load(struct rq *rq)
|
||||
{
|
||||
return rq->load.weight;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
/*
|
||||
* There is no sane way to deal with nohz on smp when using jiffies because the
|
||||
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
|
||||
* causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
|
||||
*
|
||||
* Therefore we cannot use the delta approach from the regular tick since that
|
||||
* would seriously skew the load calculation. However we'll make do for those
|
||||
* updates happening while idle (nohz_idle_balance) or coming out of idle
|
||||
* (tick_nohz_idle_exit).
|
||||
*
|
||||
* This means we might still be one tick off for nohz periods.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Called from nohz_idle_balance() to update the load ratings before doing the
|
||||
* idle balance.
|
||||
*/
|
||||
void update_idle_cpu_load(struct rq *this_rq)
|
||||
{
|
||||
unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
|
||||
unsigned long load = get_rq_runnable_load(this_rq);
|
||||
unsigned long pending_updates;
|
||||
|
||||
/*
|
||||
* bail if there's load or we're actually up-to-date.
|
||||
*/
|
||||
if (load || curr_jiffies == this_rq->last_load_update_tick)
|
||||
return;
|
||||
|
||||
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
|
||||
this_rq->last_load_update_tick = curr_jiffies;
|
||||
|
||||
__update_cpu_load(this_rq, load, pending_updates);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
|
||||
*/
|
||||
void update_cpu_load_nohz(void)
|
||||
{
|
||||
struct rq *this_rq = this_rq();
|
||||
unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
|
||||
unsigned long pending_updates;
|
||||
|
||||
if (curr_jiffies == this_rq->last_load_update_tick)
|
||||
return;
|
||||
|
||||
raw_spin_lock(&this_rq->lock);
|
||||
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
|
||||
if (pending_updates) {
|
||||
this_rq->last_load_update_tick = curr_jiffies;
|
||||
/*
|
||||
* We were idle, this means load 0, the current load might be
|
||||
* !0 due to remote wakeups and the sort.
|
||||
*/
|
||||
__update_cpu_load(this_rq, 0, pending_updates);
|
||||
}
|
||||
raw_spin_unlock(&this_rq->lock);
|
||||
}
|
||||
#endif /* CONFIG_NO_HZ_COMMON */
|
||||
|
||||
/*
|
||||
* Called from scheduler_tick()
|
||||
*/
|
||||
void update_cpu_load_active(struct rq *this_rq)
|
||||
{
|
||||
unsigned long load = get_rq_runnable_load(this_rq);
|
||||
/*
|
||||
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
|
||||
*/
|
||||
this_rq->last_load_update_tick = jiffies;
|
||||
__update_cpu_load(this_rq, load, 1);
|
||||
|
||||
calc_load_account_active(this_rq);
|
||||
}
|
||||
2148
kernel/sched/rt.c
Normal file
2148
kernel/sched/rt.c
Normal file
File diff suppressed because it is too large
Load diff
1580
kernel/sched/sched.h
Normal file
1580
kernel/sched/sched.h
Normal file
File diff suppressed because it is too large
Load diff
145
kernel/sched/stats.c
Normal file
145
kernel/sched/stats.c
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
|
||||
#include <linux/slab.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/proc_fs.h>
|
||||
|
||||
#include "sched.h"
|
||||
|
||||
/*
|
||||
* bump this up when changing the output format or the meaning of an existing
|
||||
* format, so that tools can adapt (or abort)
|
||||
*/
|
||||
#define SCHEDSTAT_VERSION 15
|
||||
|
||||
static int show_schedstat(struct seq_file *seq, void *v)
|
||||
{
|
||||
int cpu;
|
||||
int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
|
||||
char *mask_str = kmalloc(mask_len, GFP_KERNEL);
|
||||
|
||||
if (mask_str == NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
if (v == (void *)1) {
|
||||
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
|
||||
seq_printf(seq, "timestamp %lu\n", jiffies);
|
||||
} else {
|
||||
struct rq *rq;
|
||||
#ifdef CONFIG_SMP
|
||||
struct sched_domain *sd;
|
||||
int dcount = 0;
|
||||
#endif
|
||||
cpu = (unsigned long)(v - 2);
|
||||
rq = cpu_rq(cpu);
|
||||
|
||||
/* runqueue-specific stats */
|
||||
seq_printf(seq,
|
||||
"cpu%d %u 0 %u %u %u %u %llu %llu %lu",
|
||||
cpu, rq->yld_count,
|
||||
rq->sched_count, rq->sched_goidle,
|
||||
rq->ttwu_count, rq->ttwu_local,
|
||||
rq->rq_cpu_time,
|
||||
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
|
||||
|
||||
seq_printf(seq, "\n");
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/* domain-specific stats */
|
||||
rcu_read_lock();
|
||||
for_each_domain(cpu, sd) {
|
||||
enum cpu_idle_type itype;
|
||||
|
||||
cpumask_scnprintf(mask_str, mask_len,
|
||||
sched_domain_span(sd));
|
||||
seq_printf(seq, "domain%d %s", dcount++, mask_str);
|
||||
for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
|
||||
itype++) {
|
||||
seq_printf(seq, " %u %u %u %u %u %u %u %u",
|
||||
sd->lb_count[itype],
|
||||
sd->lb_balanced[itype],
|
||||
sd->lb_failed[itype],
|
||||
sd->lb_imbalance[itype],
|
||||
sd->lb_gained[itype],
|
||||
sd->lb_hot_gained[itype],
|
||||
sd->lb_nobusyq[itype],
|
||||
sd->lb_nobusyg[itype]);
|
||||
}
|
||||
seq_printf(seq,
|
||||
" %u %u %u %u %u %u %u %u %u %u %u %u\n",
|
||||
sd->alb_count, sd->alb_failed, sd->alb_pushed,
|
||||
sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
|
||||
sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
|
||||
sd->ttwu_wake_remote, sd->ttwu_move_affine,
|
||||
sd->ttwu_move_balance);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
#endif
|
||||
}
|
||||
kfree(mask_str);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* This itererator needs some explanation.
|
||||
* It returns 1 for the header position.
|
||||
* This means 2 is cpu 0.
|
||||
* In a hotplugged system some cpus, including cpu 0, may be missing so we have
|
||||
* to use cpumask_* to iterate over the cpus.
|
||||
*/
|
||||
static void *schedstat_start(struct seq_file *file, loff_t *offset)
|
||||
{
|
||||
unsigned long n = *offset;
|
||||
|
||||
if (n == 0)
|
||||
return (void *) 1;
|
||||
|
||||
n--;
|
||||
|
||||
if (n > 0)
|
||||
n = cpumask_next(n - 1, cpu_online_mask);
|
||||
else
|
||||
n = cpumask_first(cpu_online_mask);
|
||||
|
||||
*offset = n + 1;
|
||||
|
||||
if (n < nr_cpu_ids)
|
||||
return (void *)(unsigned long)(n + 2);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* seq_file ->next: advance the position by one and look it up again. */
static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
{
	++*offset;

	return schedstat_start(file, offset);
}
|
||||
|
||||
/* seq_file ->stop: intentionally empty. */
static void schedstat_stop(struct seq_file *file, void *data)
{
}
|
||||
|
||||
static const struct seq_operations schedstat_sops = {
|
||||
.start = schedstat_start,
|
||||
.next = schedstat_next,
|
||||
.stop = schedstat_stop,
|
||||
.show = show_schedstat,
|
||||
};
|
||||
|
||||
static int schedstat_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open(file, &schedstat_sops);
|
||||
}
|
||||
|
||||
static const struct file_operations proc_schedstat_operations = {
|
||||
.open = schedstat_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
/*
 * Register the "schedstat" proc entry. The result of proc_create() is not
 * checked; this initcall always reports success.
 */
static int __init proc_schedstat_init(void)
{
	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);

	return 0;
}
|
||||
subsys_initcall(proc_schedstat_init);
|
||||
267
kernel/sched/stats.h
Normal file
267
kernel/sched/stats.h
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
|
||||
/*
|
||||
* Expects runqueue lock to be held for atomicity of update
|
||||
*/
|
||||
static inline void
|
||||
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
|
||||
{
|
||||
if (rq) {
|
||||
rq->rq_sched_info.run_delay += delta;
|
||||
rq->rq_sched_info.pcount++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Expects runqueue lock to be held for atomicity of update
|
||||
*/
|
||||
static inline void
|
||||
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
|
||||
{
|
||||
if (rq)
|
||||
rq->rq_cpu_time += delta;
|
||||
}
|
||||
|
||||
static inline void
|
||||
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
|
||||
{
|
||||
if (rq)
|
||||
rq->rq_sched_info.run_delay += delta;
|
||||
}
|
||||
/* Statistics update helpers for builds with CONFIG_SCHEDSTATS. */
# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
# define schedstat_set(var, val)	do { var = (val); } while (0)
|
||||
#else /* !CONFIG_SCHEDSTATS */
|
||||
/* Without CONFIG_SCHEDSTATS all runqueue statistics hooks are no-ops. */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{
}

static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{
}

static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
}

# define schedstat_inc(rq, field)	do { } while (0)
# define schedstat_add(rq, field, amt)	do { } while (0)
# define schedstat_set(var, val)	do { } while (0)
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
|
||||
static inline void sched_info_reset_dequeued(struct task_struct *t)
|
||||
{
|
||||
t->sched_info.last_queued = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* We are interested in knowing how long it was from the *first* time a
|
||||
* task was queued to the time that it finally hit a cpu, we call this routine
|
||||
* from dequeue_task() to account for possible rq->clock skew across cpus. The
|
||||
* delta taken on each cpu would annul the skew.
|
||||
*/
|
||||
static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
|
||||
{
|
||||
unsigned long long now = rq_clock(rq), delta = 0;
|
||||
|
||||
if (unlikely(sched_info_on()))
|
||||
if (t->sched_info.last_queued)
|
||||
delta = now - t->sched_info.last_queued;
|
||||
sched_info_reset_dequeued(t);
|
||||
t->sched_info.run_delay += delta;
|
||||
|
||||
rq_sched_info_dequeued(rq, delta);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called when a task finally hits the cpu. We can now calculate how
|
||||
* long it was waiting to run. We also note when it began so that we
|
||||
* can keep stats on how long its timeslice is.
|
||||
*/
|
||||
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
|
||||
{
|
||||
unsigned long long now = rq_clock(rq), delta = 0;
|
||||
|
||||
if (t->sched_info.last_queued)
|
||||
delta = now - t->sched_info.last_queued;
|
||||
sched_info_reset_dequeued(t);
|
||||
t->sched_info.run_delay += delta;
|
||||
t->sched_info.last_arrival = now;
|
||||
t->sched_info.pcount++;
|
||||
|
||||
rq_sched_info_arrive(rq, delta);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is only called from enqueue_task(), but also only updates
|
||||
* the timestamp if it is already not set. It's assumed that
|
||||
* sched_info_dequeued() will clear that stamp when appropriate.
|
||||
*/
|
||||
static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
|
||||
{
|
||||
if (unlikely(sched_info_on()))
|
||||
if (!t->sched_info.last_queued)
|
||||
t->sched_info.last_queued = rq_clock(rq);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called when a process ceases being the active-running process involuntarily
|
||||
* due, typically, to expiring its time slice (this may also be called when
|
||||
* switching to the idle task). Now we can calculate how long we ran.
|
||||
* Also, if the process is still in the TASK_RUNNING state, call
|
||||
* sched_info_queued() to mark that it has now again started waiting on
|
||||
* the runqueue.
|
||||
*/
|
||||
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
|
||||
{
|
||||
unsigned long long delta = rq_clock(rq) -
|
||||
t->sched_info.last_arrival;
|
||||
|
||||
rq_sched_info_depart(rq, delta);
|
||||
|
||||
if (t->state == TASK_RUNNING)
|
||||
sched_info_queued(rq, t);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called when tasks are switched involuntarily due, typically, to expiring
|
||||
* their time slice. (This may also be called when switching to or from
|
||||
* the idle task.) We are only called when prev != next.
|
||||
*/
|
||||
static inline void
|
||||
__sched_info_switch(struct rq *rq,
|
||||
struct task_struct *prev, struct task_struct *next)
|
||||
{
|
||||
/*
|
||||
* prev now departs the cpu. It's not interesting to record
|
||||
* stats about how efficient we were at scheduling the idle
|
||||
* process, however.
|
||||
*/
|
||||
if (prev != rq->idle)
|
||||
sched_info_depart(rq, prev);
|
||||
|
||||
if (next != rq->idle)
|
||||
sched_info_arrive(rq, next);
|
||||
}
|
||||
/* Record the switch from @prev to @next, but only when sched_info_on(). */
static inline void
sched_info_switch(struct rq *rq,
		  struct task_struct *prev, struct task_struct *next)
{
	if (!unlikely(sched_info_on()))
		return;

	__sched_info_switch(rq, prev, next);
}
|
||||
#else
|
||||
/* No scheduling info is collected in this configuration: all hooks vanish. */
#define sched_info_queued(rq, t)		do { } while (0)
#define sched_info_reset_dequeued(t)		do { } while (0)
#define sched_info_dequeued(rq, t)		do { } while (0)
#define sched_info_depart(rq, t)		do { } while (0)
#define sched_info_arrive(rq, next)		do { } while (0)
#define sched_info_switch(rq, t, next)		do { } while (0)
|
||||
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
|
||||
|
||||
/*
|
||||
* The following are functions that support scheduler-internal time accounting.
|
||||
* These functions are generally called at the timer tick. None of this depends
|
||||
* on CONFIG_SCHEDSTATS.
|
||||
*/
|
||||
|
||||
/**
|
||||
* cputimer_running - return true if cputimer is running
|
||||
*
|
||||
* @tsk: Pointer to target task.
|
||||
*/
|
||||
static inline bool cputimer_running(struct task_struct *tsk)
|
||||
|
||||
{
|
||||
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
|
||||
|
||||
if (!cputimer->running)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
|
||||
* in __exit_signal(), we won't account to the signal struct further
|
||||
* cputime consumed by that task, even though the task can still be
|
||||
* ticking after __exit_signal().
|
||||
*
|
||||
* In order to keep a consistent behaviour between thread group cputime
|
||||
* and thread group cputimer accounting, lets also ignore the cputime
|
||||
* elapsing after __exit_signal() in any thread group timer running.
|
||||
*
|
||||
* This makes sure that POSIX CPU clocks and timers are synchronized, so
|
||||
* that a POSIX CPU timer won't expire while the corresponding POSIX CPU
|
||||
* clock delta is behind the expiring timer value.
|
||||
*/
|
||||
if (unlikely(!tsk->sighand))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* account_group_user_time - Maintain utime for a thread group.
|
||||
*
|
||||
* @tsk: Pointer to task structure.
|
||||
* @cputime: Time value by which to increment the utime field of the
|
||||
* thread_group_cputime structure.
|
||||
*
|
||||
* If thread group time is being maintained, get the structure for the
|
||||
* running CPU and update the utime field there.
|
||||
*/
|
||||
static inline void account_group_user_time(struct task_struct *tsk,
|
||||
cputime_t cputime)
|
||||
{
|
||||
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
|
||||
|
||||
if (!cputimer_running(tsk))
|
||||
return;
|
||||
|
||||
raw_spin_lock(&cputimer->lock);
|
||||
cputimer->cputime.utime += cputime;
|
||||
raw_spin_unlock(&cputimer->lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* account_group_system_time - Maintain stime for a thread group.
|
||||
*
|
||||
* @tsk: Pointer to task structure.
|
||||
* @cputime: Time value by which to increment the stime field of the
|
||||
* thread_group_cputime structure.
|
||||
*
|
||||
* If thread group time is being maintained, get the structure for the
|
||||
* running CPU and update the stime field there.
|
||||
*/
|
||||
static inline void account_group_system_time(struct task_struct *tsk,
|
||||
cputime_t cputime)
|
||||
{
|
||||
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
|
||||
|
||||
if (!cputimer_running(tsk))
|
||||
return;
|
||||
|
||||
raw_spin_lock(&cputimer->lock);
|
||||
cputimer->cputime.stime += cputime;
|
||||
raw_spin_unlock(&cputimer->lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* account_group_exec_runtime - Maintain exec runtime for a thread group.
|
||||
*
|
||||
* @tsk: Pointer to task structure.
|
||||
* @ns: Time value by which to increment the sum_exec_runtime field
|
||||
* of the thread_group_cputime structure.
|
||||
*
|
||||
* If thread group time is being maintained, get the structure for the
|
||||
* running CPU and update the sum_exec_runtime field there.
|
||||
*/
|
||||
static inline void account_group_exec_runtime(struct task_struct *tsk,
|
||||
unsigned long long ns)
|
||||
{
|
||||
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
|
||||
|
||||
if (!cputimer_running(tsk))
|
||||
return;
|
||||
|
||||
raw_spin_lock(&cputimer->lock);
|
||||
cputimer->cputime.sum_exec_runtime += ns;
|
||||
raw_spin_unlock(&cputimer->lock);
|
||||
}
|
||||
136
kernel/sched/stop_task.c
Normal file
136
kernel/sched/stop_task.c
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
#include "sched.h"
|
||||
|
||||
/*
|
||||
* stop-task scheduling class.
|
||||
*
|
||||
* The stop task is the highest priority task in the system, it preempts
|
||||
* everything and will be preempted by nothing.
|
||||
*
|
||||
* See kernel/stop_machine.c
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_SMP
/* Stop tasks never migrate: always answer with the cpu @p is already on. */
static int
select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	int home = task_cpu(p);

	return home;
}
#endif /* CONFIG_SMP */
|
||||
|
||||
/* Nothing to check: the stop task is never preempted. */
static void
check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
{
}
|
||||
|
||||
static struct task_struct *
|
||||
pick_next_task_stop(struct rq *rq, struct task_struct *prev)
|
||||
{
|
||||
struct task_struct *stop = rq->stop;
|
||||
|
||||
if (!stop || !task_on_rq_queued(stop))
|
||||
return NULL;
|
||||
|
||||
put_prev_task(rq, prev);
|
||||
|
||||
stop->se.exec_start = rq_clock_task(rq);
|
||||
|
||||
return stop;
|
||||
}
|
||||
|
||||
/* Enqueueing the stop task only bumps the runqueue's nr_running. */
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
	add_nr_running(rq, 1);
}
|
||||
|
||||
/* Dequeueing the stop task only drops the runqueue's nr_running. */
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
	sub_nr_running(rq, 1);
}
|
||||
|
||||
/* The stop task should never yield, it's pointless; treat it as a bug. */
static void yield_task_stop(struct rq *rq)
{
	BUG();
}
|
||||
|
||||
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
|
||||
{
|
||||
struct task_struct *curr = rq->curr;
|
||||
u64 delta_exec;
|
||||
|
||||
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
|
||||
if (unlikely((s64)delta_exec < 0))
|
||||
delta_exec = 0;
|
||||
|
||||
schedstat_set(curr->se.statistics.exec_max,
|
||||
max(curr->se.statistics.exec_max, delta_exec));
|
||||
|
||||
curr->se.sum_exec_runtime += delta_exec;
|
||||
account_group_exec_runtime(curr, delta_exec);
|
||||
|
||||
curr->se.exec_start = rq_clock_task(rq);
|
||||
cpuacct_charge(curr, delta_exec);
|
||||
}
|
||||
|
||||
/* The stop class does no per-tick work. */
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
|
||||
|
||||
static void set_curr_task_stop(struct rq *rq)
|
||||
{
|
||||
struct task_struct *stop = rq->stop;
|
||||
|
||||
stop->se.exec_start = rq_clock_task(rq);
|
||||
}
|
||||
|
||||
/* It's impossible to change to this class, so getting here is a bug. */
static void switched_to_stop(struct rq *rq, struct task_struct *p)
{
	BUG();
}
|
||||
|
||||
/* how!?, what priority? -- a priority change on a stop task is a bug. */
static void
prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
{
	BUG();
}
|
||||
|
||||
/* The stop class always reports a round-robin interval of 0. */
static unsigned int
get_rr_interval_stop(struct rq *rq, struct task_struct *task)
{
	const unsigned int interval = 0;

	return interval;
}
|
||||
|
||||
/* Intentionally empty: the stop class does nothing in update_curr. */
static void update_curr_stop(struct rq *rq)
{
}
|
||||
|
||||
/*
|
||||
* Simple, special scheduling class for the per-CPU stop tasks:
|
||||
*/
|
||||
const struct sched_class stop_sched_class = {
|
||||
.next = &dl_sched_class,
|
||||
|
||||
.enqueue_task = enqueue_task_stop,
|
||||
.dequeue_task = dequeue_task_stop,
|
||||
.yield_task = yield_task_stop,
|
||||
|
||||
.check_preempt_curr = check_preempt_curr_stop,
|
||||
|
||||
.pick_next_task = pick_next_task_stop,
|
||||
.put_prev_task = put_prev_task_stop,
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
.select_task_rq = select_task_rq_stop,
|
||||
#endif
|
||||
|
||||
.set_curr_task = set_curr_task_stop,
|
||||
.task_tick = task_tick_stop,
|
||||
|
||||
.get_rr_interval = get_rr_interval_stop,
|
||||
|
||||
.prio_changed = prio_changed_stop,
|
||||
.switched_to = switched_to_stop,
|
||||
.update_curr = update_curr_stop,
|
||||
};
|
||||
619
kernel/sched/wait.c
Normal file
619
kernel/sched/wait.c
Normal file
|
|
@ -0,0 +1,619 @@
|
|||
/*
|
||||
* Generic waiting primitives.
|
||||
*
|
||||
* (C) 2004 Nadia Yvette Chambers, Oracle
|
||||
*/
|
||||
#include <linux/init.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/hash.h>
|
||||
|
||||
/*
 * Initialise wait queue head @q: set up its spinlock, attach the lockdep
 * class @key under the name @name, and start with an empty waiter list.
 */
void __init_waitqueue_head(wait_queue_head_t *q, const char *name,
			   struct lock_class_key *key)
{
	spin_lock_init(&q->lock);
	lockdep_set_class_and_name(&q->lock, key, name);
	INIT_LIST_HEAD(&q->task_list);
}
|
||||
|
||||
EXPORT_SYMBOL(__init_waitqueue_head);
|
||||
|
||||
/*
 * Queue @wait on @q as a non-exclusive waiter: the exclusive flag is
 * cleared and the entry is inserted with __add_wait_queue() while
 * holding q->lock.
 */
void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
	unsigned long irqflags;

	wait->flags &= ~WQ_FLAG_EXCLUSIVE;

	spin_lock_irqsave(&q->lock, irqflags);
	__add_wait_queue(q, wait);
	spin_unlock_irqrestore(&q->lock, irqflags);
}
|
||||
EXPORT_SYMBOL(add_wait_queue);
|
||||
|
||||
/*
 * Queue @wait on @q as an exclusive waiter: the exclusive flag is set
 * and the entry is inserted with __add_wait_queue_tail() while holding
 * q->lock.
 */
void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
	unsigned long irqflags;

	wait->flags |= WQ_FLAG_EXCLUSIVE;

	spin_lock_irqsave(&q->lock, irqflags);
	__add_wait_queue_tail(q, wait);
	spin_unlock_irqrestore(&q->lock, irqflags);
}
|
||||
EXPORT_SYMBOL(add_wait_queue_exclusive);
|
||||
|
||||
/* Unlink @wait from @q with __remove_wait_queue() while holding q->lock. */
void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
	unsigned long irqflags;

	spin_lock_irqsave(&q->lock, irqflags);
	__remove_wait_queue(q, wait);
	spin_unlock_irqrestore(&q->lock, irqflags);
}
|
||||
EXPORT_SYMBOL(remove_wait_queue);
|
||||
|
||||
|
||||
/*
|
||||
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
|
||||
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
|
||||
* number) then we wake all the non-exclusive tasks and one exclusive task.
|
||||
*
|
||||
* There are circumstances in which we can try to wake a task which has already
|
||||
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
|
||||
* zero in this (rare) case, and we handle it by continuing to scan the queue.
|
||||
*/
|
||||
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
|
||||
int nr_exclusive, int wake_flags, void *key)
|
||||
{
|
||||
wait_queue_t *curr, *next;
|
||||
|
||||
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
|
||||
unsigned flags = curr->flags;
|
||||
|
||||
if (curr->func(curr, mode, wake_flags, key) &&
|
||||
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* __wake_up - wake up threads blocked on a waitqueue.
|
||||
* @q: the waitqueue
|
||||
* @mode: which threads
|
||||
* @nr_exclusive: how many wake-one or wake-many threads to wake up
|
||||
* @key: is directly passed to the wakeup function
|
||||
*
|
||||
* It may be assumed that this function implies a write memory barrier before
|
||||
* changing the task state if and only if any tasks are woken up.
|
||||
*/
|
||||
void __wake_up(wait_queue_head_t *q, unsigned int mode,
|
||||
int nr_exclusive, void *key)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&q->lock, flags);
|
||||
__wake_up_common(q, mode, nr_exclusive, 0, key);
|
||||
spin_unlock_irqrestore(&q->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(__wake_up);
|
||||
|
||||
/*
|
||||
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
|
||||
*/
|
||||
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
|
||||
{
|
||||
__wake_up_common(q, mode, nr, 0, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__wake_up_locked);
|
||||
|
||||
/*
 * Keyed wakeup that does not take q->lock itself: runs the common
 * wakeup loop with nr_exclusive == 1 and forwards @key to the wake
 * functions.
 */
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
	__wake_up_common(q, mode, 1, 0, key);
}
|
||||
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
|
||||
|
||||
/**
|
||||
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
|
||||
* @q: the waitqueue
|
||||
* @mode: which threads
|
||||
* @nr_exclusive: how many wake-one or wake-many threads to wake up
|
||||
* @key: opaque value to be passed to wakeup targets
|
||||
*
|
||||
* The sync wakeup differs that the waker knows that it will schedule
|
||||
* away soon, so while the target thread will be woken up, it will not
|
||||
* be migrated to another CPU - ie. the two threads are 'synchronized'
|
||||
* with each other. This can prevent needless bouncing between CPUs.
|
||||
*
|
||||
* On UP it can prevent extra preemption.
|
||||
*
|
||||
* It may be assumed that this function implies a write memory barrier before
|
||||
* changing the task state if and only if any tasks are woken up.
|
||||
*/
|
||||
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
|
||||
int nr_exclusive, void *key)
|
||||
{
|
||||
unsigned long flags;
|
||||
int wake_flags = 1; /* XXX WF_SYNC */
|
||||
|
||||
if (unlikely(!q))
|
||||
return;
|
||||
|
||||
if (unlikely(nr_exclusive != 1))
|
||||
wake_flags = 0;
|
||||
|
||||
spin_lock_irqsave(&q->lock, flags);
|
||||
__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
|
||||
spin_unlock_irqrestore(&q->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
|
||||
|
||||
/*
|
||||
* __wake_up_sync - see __wake_up_sync_key()
|
||||
*/
|
||||
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
|
||||
{
|
||||
__wake_up_sync_key(q, mode, nr_exclusive, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
|
||||
|
||||
/*
|
||||
* Note: we use "set_current_state()" _after_ the wait-queue add,
|
||||
* because we need a memory barrier there on SMP, so that any
|
||||
* wake-function that tests for the wait-queue being active
|
||||
* will be guaranteed to see waitqueue addition _or_ subsequent
|
||||
* tests in this thread will see the wakeup having taken place.
|
||||
*
|
||||
* The spin_unlock() itself is semi-permeable and only protects
|
||||
* one way (it only protects stuff inside the critical region and
|
||||
* stops them from bleeding out - it would still allow subsequent
|
||||
* loads to move into the critical region).
|
||||
*/
|
||||
void
|
||||
prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
|
||||
spin_lock_irqsave(&q->lock, flags);
|
||||
if (list_empty(&wait->task_list))
|
||||
__add_wait_queue(q, wait);
|
||||
set_current_state(state);
|
||||
spin_unlock_irqrestore(&q->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(prepare_to_wait);
|
||||
|
||||
void
|
||||
prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
wait->flags |= WQ_FLAG_EXCLUSIVE;
|
||||
spin_lock_irqsave(&q->lock, flags);
|
||||
if (list_empty(&wait->task_list))
|
||||
__add_wait_queue_tail(q, wait);
|
||||
set_current_state(state);
|
||||
spin_unlock_irqrestore(&q->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(prepare_to_wait_exclusive);
|
||||
|
||||
/*
 * Prepare the current task to sleep on @q in @state.
 *
 * Returns -ERESTARTSYS without queueing anything when
 * signal_pending_state() reports a pending signal for @state, and 0
 * otherwise.  On the 0 path @wait is bound to the current task with
 * autoremove_wake_function() as its callback and, if not yet queued, is
 * added via __add_wait_queue_tail() when its exclusive flag is set and
 * via __add_wait_queue() otherwise.
 */
long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
	unsigned long irqflags;

	if (signal_pending_state(state, current))
		return -ERESTARTSYS;

	wait->private = current;
	wait->func = autoremove_wake_function;

	spin_lock_irqsave(&q->lock, irqflags);
	if (list_empty(&wait->task_list)) {
		if (!(wait->flags & WQ_FLAG_EXCLUSIVE))
			__add_wait_queue(q, wait);
		else
			__add_wait_queue_tail(q, wait);
	}
	set_current_state(state);
	spin_unlock_irqrestore(&q->lock, irqflags);

	return 0;
}
|
||||
EXPORT_SYMBOL(prepare_to_wait_event);
|
||||
|
||||
/**
|
||||
* finish_wait - clean up after waiting in a queue
|
||||
* @q: waitqueue waited on
|
||||
* @wait: wait descriptor
|
||||
*
|
||||
* Sets current thread back to running state and removes
|
||||
* the wait descriptor from the given waitqueue if still
|
||||
* queued.
|
||||
*/
|
||||
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
__set_current_state(TASK_RUNNING);
|
||||
/*
|
||||
* We can check for list emptiness outside the lock
|
||||
* IFF:
|
||||
* - we use the "careful" check that verifies both
|
||||
* the next and prev pointers, so that there cannot
|
||||
* be any half-pending updates in progress on other
|
||||
* CPU's that we haven't seen yet (and that might
|
||||
* still change the stack area.
|
||||
* and
|
||||
* - all other users take the lock (ie we can only
|
||||
* have _one_ other CPU that looks at or modifies
|
||||
* the list).
|
||||
*/
|
||||
if (!list_empty_careful(&wait->task_list)) {
|
||||
spin_lock_irqsave(&q->lock, flags);
|
||||
list_del_init(&wait->task_list);
|
||||
spin_unlock_irqrestore(&q->lock, flags);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(finish_wait);
|
||||
|
||||
/**
|
||||
* abort_exclusive_wait - abort exclusive waiting in a queue
|
||||
* @q: waitqueue waited on
|
||||
* @wait: wait descriptor
|
||||
* @mode: runstate of the waiter to be woken
|
||||
* @key: key to identify a wait bit queue or %NULL
|
||||
*
|
||||
* Sets current thread back to running state and removes
|
||||
* the wait descriptor from the given waitqueue if still
|
||||
* queued.
|
||||
*
|
||||
* Wakes up the next waiter if the caller is concurrently
|
||||
* woken up through the queue.
|
||||
*
|
||||
* This prevents waiter starvation where an exclusive waiter
|
||||
* aborts and is woken up concurrently and no one wakes up
|
||||
* the next waiter.
|
||||
*/
|
||||
void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
|
||||
unsigned int mode, void *key)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
__set_current_state(TASK_RUNNING);
|
||||
spin_lock_irqsave(&q->lock, flags);
|
||||
if (!list_empty(&wait->task_list))
|
||||
list_del_init(&wait->task_list);
|
||||
else if (waitqueue_active(q))
|
||||
__wake_up_locked_key(q, mode, key);
|
||||
spin_unlock_irqrestore(&q->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(abort_exclusive_wait);
|
||||
|
||||
/*
 * Wake callback that runs default_wake_function() and, when that reports
 * a wakeup (non-zero), also unlinks @wait from its queue.  Returns the
 * value of default_wake_function().
 */
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync,
			     void *key)
{
	int woken = default_wake_function(wait, mode, sync, key);

	if (!woken)
		return woken;

	list_del_init(&wait->task_list);
	return woken;
}
|
||||
EXPORT_SYMBOL(autoremove_wake_function);
|
||||
|
||||
|
||||
/*
|
||||
* DEFINE_WAIT_FUNC(wait, woken_wake_func);
|
||||
*
|
||||
* add_wait_queue(&wq, &wait);
|
||||
* for (;;) {
|
||||
* if (condition)
|
||||
* break;
|
||||
*
|
||||
* p->state = mode; condition = true;
|
||||
* smp_mb(); // A smp_wmb(); // C
|
||||
* if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN;
|
||||
* schedule() try_to_wake_up();
|
||||
* p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
|
||||
* wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
|
||||
* smp_mb() // B smp_wmb(); // C
|
||||
* wait->flags |= WQ_FLAG_WOKEN;
|
||||
* }
|
||||
* remove_wait_queue(&wq, &wait);
|
||||
*
|
||||
*/
|
||||
long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
|
||||
{
|
||||
set_current_state(mode); /* A */
|
||||
/*
|
||||
* The above implies an smp_mb(), which matches with the smp_wmb() from
|
||||
* woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
|
||||
* also observe all state before the wakeup.
|
||||
*/
|
||||
if (!(wait->flags & WQ_FLAG_WOKEN))
|
||||
timeout = schedule_timeout(timeout);
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
/*
|
||||
* The below implies an smp_mb(), it too pairs with the smp_wmb() from
|
||||
* woken_wake_function() such that we must either observe the wait
|
||||
* condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
|
||||
* an event.
|
||||
*/
|
||||
set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
|
||||
|
||||
return timeout;
|
||||
}
|
||||
EXPORT_SYMBOL(wait_woken);
|
||||
|
||||
/*
 * Wake callback paired with wait_woken(): marks @wait with
 * WQ_FLAG_WOKEN and then performs the default wakeup, whose result is
 * returned.
 */
int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	/*
	 * This runs under the waitqueue lock, but LOCK does not imply a
	 * write barrier, and users expect write barrier semantics from
	 * wakeup functions.  The smp_wmb() below is equivalent to the
	 * smp_wmb() in try_to_wake_up() and pairs with the set_mb() in
	 * wait_woken().
	 */
	/* C */
	smp_wmb();
	wait->flags |= WQ_FLAG_WOKEN;

	return default_wake_function(wait, mode, sync, key);
}
|
||||
EXPORT_SYMBOL(woken_wake_function);
|
||||
|
||||
/*
 * Wake callback for bit waiters.  @arg is the struct wait_bit_key the
 * waker passed.  The waiter is only woken (and auto-removed) when the
 * key names the same word and bit number it waits on and that bit is
 * currently clear; otherwise 0 is returned and nothing is done.
 */
int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
{
	struct wait_bit_key *wake_key = arg;
	struct wait_bit_queue *waiter =
		container_of(wait, struct wait_bit_queue, wait);

	if (waiter->key.flags != wake_key->flags)
		return 0;
	if (waiter->key.bit_nr != wake_key->bit_nr)
		return 0;
	if (test_bit(wake_key->bit_nr, wake_key->flags))
		return 0;

	return autoremove_wake_function(wait, mode, sync, wake_key);
}
|
||||
EXPORT_SYMBOL(wake_bit_function);
|
||||
|
||||
/*
|
||||
* To allow interruptible waiting and asynchronous (i.e. nonblocking)
|
||||
* waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
|
||||
* permitted return codes. Nonzero return codes halt waiting and return.
|
||||
*/
|
||||
int __sched
|
||||
__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
|
||||
wait_bit_action_f *action, unsigned mode)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
do {
|
||||
prepare_to_wait(wq, &q->wait, mode);
|
||||
if (test_bit(q->key.bit_nr, q->key.flags))
|
||||
ret = (*action)(&q->key, mode);
|
||||
} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
|
||||
finish_wait(wq, &q->wait);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(__wait_on_bit);
|
||||
|
||||
/*
 * Out-of-line wait for bit @bit of @word: looks up the wait queue with
 * bit_waitqueue(), builds an on-stack wait entry and runs
 * __wait_on_bit(), whose result is returned.
 */
int __sched out_of_line_wait_on_bit(void *word, int bit,
				    wait_bit_action_f *action, unsigned mode)
{
	wait_queue_head_t *head = bit_waitqueue(word, bit);
	DEFINE_WAIT_BIT(wbq, word, bit);

	return __wait_on_bit(head, &wbq, action, mode);
}
|
||||
EXPORT_SYMBOL(out_of_line_wait_on_bit);
|
||||
|
||||
/*
 * Like out_of_line_wait_on_bit(), but additionally records an absolute
 * deadline (jiffies + @timeout) in the wait key before waiting.  The
 * *_timeout bit-wait actions in this file read that field.
 */
int __sched out_of_line_wait_on_bit_timeout(void *word, int bit,
					    wait_bit_action_f *action,
					    unsigned mode,
					    unsigned long timeout)
{
	wait_queue_head_t *head = bit_waitqueue(word, bit);
	DEFINE_WAIT_BIT(wbq, word, bit);

	wbq.key.timeout = jiffies + timeout;

	return __wait_on_bit(head, &wbq, action, mode);
}
|
||||
EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
|
||||
|
||||
int __sched
|
||||
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
|
||||
wait_bit_action_f *action, unsigned mode)
|
||||
{
|
||||
do {
|
||||
int ret;
|
||||
|
||||
prepare_to_wait_exclusive(wq, &q->wait, mode);
|
||||
if (!test_bit(q->key.bit_nr, q->key.flags))
|
||||
continue;
|
||||
ret = action(&q->key, mode);
|
||||
if (!ret)
|
||||
continue;
|
||||
abort_exclusive_wait(wq, &q->wait, mode, &q->key);
|
||||
return ret;
|
||||
} while (test_and_set_bit(q->key.bit_nr, q->key.flags));
|
||||
finish_wait(wq, &q->wait);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(__wait_on_bit_lock);
|
||||
|
||||
/*
 * Out-of-line bit-lock wait for bit @bit of @word: looks up the wait
 * queue with bit_waitqueue(), builds an on-stack wait entry and runs
 * __wait_on_bit_lock(), whose result is returned.
 */
int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
					 wait_bit_action_f *action,
					 unsigned mode)
{
	wait_queue_head_t *head = bit_waitqueue(word, bit);
	DEFINE_WAIT_BIT(wbq, word, bit);

	return __wait_on_bit_lock(head, &wbq, action, mode);
}
|
||||
EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
|
||||
|
||||
/*
 * Wake one waiter (nr_exclusive == 1, TASK_NORMAL) on @wq for the
 * (@word, @bit) key.  Does nothing when waitqueue_active() reports no
 * waiters.
 */
void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
{
	struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);

	if (!waitqueue_active(wq))
		return;

	__wake_up(wq, TASK_NORMAL, 1, &key);
}
|
||||
EXPORT_SYMBOL(__wake_up_bit);
|
||||
|
||||
/**
|
||||
* wake_up_bit - wake up a waiter on a bit
|
||||
* @word: the word being waited on, a kernel virtual address
|
||||
* @bit: the bit of the word being waited on
|
||||
*
|
||||
* There is a standard hashed waitqueue table for generic use. This
|
||||
* is the part of the hashtable's accessor API that wakes up waiters
|
||||
* on a bit. For instance, if one were to have waiters on a bitflag,
|
||||
* one would call wake_up_bit() after clearing the bit.
|
||||
*
|
||||
* In order for this to function properly, as it uses waitqueue_active()
|
||||
* internally, some kind of memory barrier must be done prior to calling
|
||||
* this. Typically, this will be smp_mb__after_atomic(), but in some
|
||||
* cases where bitflags are manipulated non-atomically under a lock, one
|
||||
* may need to use a less regular barrier, such fs/inode.c's smp_mb(),
|
||||
* because spin_unlock() does not guarantee a memory barrier.
|
||||
*/
|
||||
void wake_up_bit(void *word, int bit)
|
||||
{
|
||||
__wake_up_bit(bit_waitqueue(word, bit), word, bit);
|
||||
}
|
||||
EXPORT_SYMBOL(wake_up_bit);
|
||||
|
||||
wait_queue_head_t *bit_waitqueue(void *word, int bit)
|
||||
{
|
||||
const int shift = BITS_PER_LONG == 32 ? 5 : 6;
|
||||
const struct zone *zone = page_zone(virt_to_page(word));
|
||||
unsigned long val = (unsigned long)word << shift | bit;
|
||||
|
||||
return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
|
||||
}
|
||||
EXPORT_SYMBOL(bit_waitqueue);
|
||||
|
||||
/*
|
||||
* Manipulate the atomic_t address to produce a better bit waitqueue table hash
|
||||
* index (we're keying off bit -1, but that would produce a horrible hash
|
||||
* value).
|
||||
*/
|
||||
static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
|
||||
{
|
||||
if (BITS_PER_LONG == 64) {
|
||||
unsigned long q = (unsigned long)p;
|
||||
return bit_waitqueue((void *)(q & ~1), q & 1);
|
||||
}
|
||||
return bit_waitqueue(p, 0);
|
||||
}
|
||||
|
||||
/*
 * Wake callback for atomic_t waiters.  @arg is the struct wait_bit_key
 * passed by the waker; its ->flags member carries the atomic_t pointer.
 * The waiter is only woken (and auto-removed) when the key matches the
 * one it waits on and the atomic_t currently reads 0; otherwise 0 is
 * returned.
 */
static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
				  void *arg)
{
	struct wait_bit_key *wake_key = arg;
	struct wait_bit_queue *waiter =
		container_of(wait, struct wait_bit_queue, wait);
	atomic_t *counter = wake_key->flags;

	if (waiter->key.flags != wake_key->flags)
		return 0;
	if (waiter->key.bit_nr != wake_key->bit_nr)
		return 0;
	if (atomic_read(counter) != 0)
		return 0;

	return autoremove_wake_function(wait, mode, sync, wake_key);
}
|
||||
|
||||
/*
|
||||
* To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
|
||||
* the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
|
||||
* return codes halt waiting and return.
|
||||
*/
|
||||
static __sched
|
||||
int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
|
||||
int (*action)(atomic_t *), unsigned mode)
|
||||
{
|
||||
atomic_t *val;
|
||||
int ret = 0;
|
||||
|
||||
do {
|
||||
prepare_to_wait(wq, &q->wait, mode);
|
||||
val = q->key.flags;
|
||||
if (atomic_read(val) == 0)
|
||||
break;
|
||||
ret = (*action)(val);
|
||||
} while (!ret && atomic_read(val) != 0);
|
||||
finish_wait(wq, &q->wait);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Define a struct wait_bit_queue called @name for waiting on atomic_t
 * @p: keyed with __WAIT_ATOMIC_T_KEY_INITIALIZER(p), owned by the
 * current task, using wake_atomic_t_function() as its wake callback and
 * starting with an empty list linkage.
 */
#define DEFINE_WAIT_ATOMIC_T(name, p)					\
	struct wait_bit_queue name = {					\
		.key	= __WAIT_ATOMIC_T_KEY_INITIALIZER(p),		\
		.wait	= {						\
			.private	= current,			\
			.func		= wake_atomic_t_function,	\
			.task_list	=				\
				LIST_HEAD_INIT((name).wait.task_list),	\
		},							\
	}
|
||||
|
||||
__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
|
||||
unsigned mode)
|
||||
{
|
||||
wait_queue_head_t *wq = atomic_t_waitqueue(p);
|
||||
DEFINE_WAIT_ATOMIC_T(wait, p);
|
||||
|
||||
return __wait_on_atomic_t(wq, &wait, action, mode);
|
||||
}
|
||||
EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
|
||||
|
||||
/**
|
||||
* wake_up_atomic_t - Wake up a waiter on a atomic_t
|
||||
* @p: The atomic_t being waited on, a kernel virtual address
|
||||
*
|
||||
* Wake up anyone waiting for the atomic_t to go to zero.
|
||||
*
|
||||
* Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
|
||||
* check is done by the waiter's wake function, not the by the waker itself).
|
||||
*/
|
||||
void wake_up_atomic_t(atomic_t *p)
|
||||
{
|
||||
__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
|
||||
}
|
||||
EXPORT_SYMBOL(wake_up_atomic_t);
|
||||
|
||||
__sched int bit_wait(struct wait_bit_key *word, int mode)
|
||||
{
|
||||
schedule();
|
||||
if (signal_pending_state(mode, current))
|
||||
return -EINTR;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(bit_wait);
|
||||
|
||||
__sched int bit_wait_io(struct wait_bit_key *word, int mode)
|
||||
{
|
||||
io_schedule();
|
||||
if (signal_pending_state(mode, current))
|
||||
return -EINTR;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(bit_wait_io);
|
||||
|
||||
__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
|
||||
{
|
||||
unsigned long now = ACCESS_ONCE(jiffies);
|
||||
if (time_after_eq(now, word->timeout))
|
||||
return -EAGAIN;
|
||||
schedule_timeout(word->timeout - now);
|
||||
if (signal_pending_state(mode, current))
|
||||
return -EINTR;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bit_wait_timeout);
|
||||
|
||||
__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
|
||||
{
|
||||
unsigned long now = ACCESS_ONCE(jiffies);
|
||||
if (time_after_eq(now, word->timeout))
|
||||
return -EAGAIN;
|
||||
io_schedule_timeout(word->timeout - now);
|
||||
if (signal_pending_state(mode, current))
|
||||
return -EINTR;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
|
||||
Loading…
Add table
Add a link
Reference in a new issue