[PATCH v4 0/9] Steal time series again

July 01st, 2011 - 05:40 pm ET by Glauber Costa | Report spam
Here follows the fourth version of the steal time series.
Hope it is acceptable for all involved parties now. The main differences
from v3 are:

* The Changelogs seem to have been writen by an actual person now, not of a
monkey. Yet, I am the aforementioned person, so don't expect much.
* Forcing delayacct on the hypervisor side allow us to simplify the guest
code dramatically, since now we don't need to test for is_idle: if we're idle,
we won't have steal time and end of story.

Hope you enjoy.

Glauber Costa (8):
KVM-HDR Add constant to represent KVM MSRs enabled bit
KVM-HDR: KVM Steal time implementation
KVM-HV: KVM Steal time implementation
KVM-GST: Add a pv_ops stub for steal time
add jump labels for ia64 paravirt
KVM-GST: KVM Steal time accounting
KVM-GST: adjust scheduler cpu power
KVM-GST: KVM Steal time registration

Gleb Natapov (1):
introduce kvm_read_guest_cached

Documentation/kernel-parameters.txt | 4 ++
Documentation/virtual/kvm/msr.txt | 35 ++++++++++++++
arch/ia64/include/asm/paravirt.h | 4 ++
arch/ia64/kernel/paravirt.c | 2 +
arch/x86/Kconfig | 12 +++++
arch/x86/include/asm/kvm_host.h | 8 +++
arch/x86/include/asm/kvm_para.h | 15 ++++++
arch/x86/include/asm/paravirt.h | 9 ++++
arch/x86/include/asm/paravirt_types.h | 1 +
arch/x86/kernel/kvm.c | 73 ++++++++++++++++++++++++++++++
arch/x86/kernel/kvmclock.c | 2 +
arch/x86/kernel/paravirt.c | 9 ++++
arch/x86/kvm/Kconfig | 1 +
arch/x86/kvm/x86.c | 56 ++++++++++++++++++++++-
include/linux/kvm_host.h | 2 +
kernel/sched.c | 80 ++++++++++++++++++++++++++++-
kernel/sched_features.h | 4 +-
virt/kvm/kvm_main.c | 20 ++++++++
18 files changed, 322 insertions(+), 15 deletions(-)

1.7.3.4

To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
email Follow the discussionReplies 5 repliesReplies Make a reply

Similar topics

Replies

#1 Glauber Costa
July 01st, 2011 - 05:40 pm ET | Report spam
This patch implements the kvm bits of the steal time infrastructure.
The most important part of it, is the steal time clock. It is an
continuous clock that shows the accumulated amount of steal time
since vcpu creation. It is supposed to survive cpu offlining/onlining.

Signed-off-by: Glauber Costa
CC: Rik van Riel
CC: Jeremy Fitzhardinge
CC: Peter Zijlstra
CC: Avi Kivity
CC: Anthony Liguori
CC: Eric B Munson

Documentation/kernel-parameters.txt | 4 ++
arch/x86/include/asm/kvm_para.h | 1 +
arch/x86/kernel/kvm.c | 73 +++++++++++++++++++++++++++++++++++
arch/x86/kernel/kvmclock.c | 2 +
4 files changed, 80 insertions(+), 0 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fd248a31..a722574 100644
a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1737,6 +1737,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
fault handling.

+ no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting.
+ steal time is computed, but won't influence scheduler
+ behaviour
+
nolapic [X86-32,APIC] Do not enable or use the local APIC.

nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c484ba8..35d732d 100644
a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -94,6 +94,7 @@ struct kvm_vcpu_pv_apf_data {

extern void kvmclock_init(void);
extern int kvm_register_clock(char *txt);
+extern void kvm_disable_steal_time(void);


/* This instruction is vmcall. On non-VT architectures, it will generate a
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33c07b0..58331c2 100644
a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg)

early_param("no-kvmapf", parse_no_kvmapf);

+static int steal_acc = 1;
+static int parse_no_stealacc(char *arg)
+{
+ steal_acc = 0;
+ return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
struct kvm_para_state {
u8 mmu_queue[MMU_QUEUE_SIZE];
int mmu_queue_len;
@@ -58,6 +67,8 @@ struct kvm_para_state {

static DEFINE_PER_CPU(struct kvm_para_state, para_state);
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock = 0;

static struct kvm_para_state *kvm_para_state(void)
{
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void)
#endif
}

+static void kvm_register_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+ if (!has_steal_clock)
+ return;
+
+ memset(st, 0, sizeof(*st));
+
+ wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
+ printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx",
+ cpu, __pa(st));
+}
+
void __cpuinit kvm_guest_cpu_init(void)
{
if (!kvm_para_available())
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void)
printk(KERN_INFO"KVM setup async PF for cpu %d",
smp_processor_id());
}
+
+ if (has_steal_clock)
+ kvm_register_steal_time();
}

static void kvm_pv_disable_apf(void *unused)
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = {
.notifier_call = kvm_pv_reboot_notify,
};

+static u64 kvm_steal_clock(int cpu)
+{
+ u64 steal;
+ struct kvm_steal_time *src;
+ int version;
+
+ src = &per_cpu(steal_time, cpu);
+ do {
+ version = src->version;
+ rmb();
+ steal = src->steal;
+ rmb();
+ } while ((version & 1) || (version != src->version));
+
+ return steal;
+}
+
+void kvm_disable_steal_time(void)
+{
+ if (!has_steal_clock)
+ return;
+
+ wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+}
+
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)

static void kvm_guest_cpu_offline(void *dummy)
{
+ kvm_disable_steal_time();
kvm_pv_disable_apf(NULL);
apf_task_wake_all();
}
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
x86_init.irqs.trap_init = kvm_apf_trap_init;

+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ has_steal_clock = 1;
+ pv_time_ops.steal_clock = kvm_steal_clock;
+ }
+
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
register_cpu_notifier(&kvm_cpu_notifier);
@@ -555,3 +615,16 @@ void __init kvm_guest_init(void)
kvm_guest_cpu_init();
#endif
}
+
+static __init int activate_jump_labels(void)
+{
+ if (has_steal_clock) {
+ jump_label_inc(&paravirt_steal_enabled);
+ if (steal_acc)
+ jump_label_inc(&paravirt_steal_rq_enabled);
+ }
+
+ return 0;
+}
+arch_initcall(activate_jump_labels);
+
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 6389a6b..c1a0188 100644
a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -160,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
static void kvm_crash_shutdown(struct pt_regs *regs)
{
native_write_msr(msr_kvm_system_time, 0, 0);
+ kvm_disable_steal_time();
native_machine_crash_shutdown(regs);
}
#endif
@@ -167,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
static void kvm_shutdown(void)
{
native_write_msr(msr_kvm_system_time, 0, 0);
+ kvm_disable_steal_time();
native_machine_shutdown();
}

1.7.3.4

To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Replies Reply to this message
#2 Peter Zijlstra
July 02nd, 2011 - 06:30 am ET | Report spam
On Fri, 2011-07-01 at 17:22 -0400, Glauber Costa wrote:
@@ -1971,8 +1974,14 @@ static inline u64 steal_ticks(u64 steal)

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- s64 irq_delta;
-
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ s64 steal = 0, irq_delta = 0;
+#endif



So I wanted to ask a GCC person (Hi Jeff) about this.

+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

/*
@@ -1995,12 +2004,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)

rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_branch((&paravirt_steal_rq_enabled))) {
+ u64 st;
+
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ st = steal_ticks(steal);
+ steal = st * TICK_NSEC;
+
+ rq->prev_steal_time_rq += steal;
+
+ delta -= steal;
+ }
+#endif
+
rq->clock_task += delta;

- if (irq_delta && sched_feat(NONIRQ_POWER))
- sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ sched_rt_avg_update(rq, irq_delta + steal);
+#endif
}



In case of !CONFIG_IRQ_TIME_ACCOUNTING && !
CONFIG_PARAVIRT_TIME_ACCOUNTING, irq_delta and steal will both always be
0.

The function will basically look like:

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
s64 irq_delta = 0, steal = 0;

rq->clock_task += delta;

if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
sched_rt_avg_update(rq, irq_delta + steal);
}

And we want it to emit the equivalent of:

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
rq->clock_task += delta;
}

Now Glauber is properly paranoid and doesn't trust his compiler (this is
very hot code in the kernel so any extra code emitted here is sad) and
chose the heavy handed CPP solution.

Now without checking a all relevant gcc versions on all relevant
architectures (see you in a few weeks etc..) can we actually rely on gcc
doing such relatively simple things correct, or should we stick with CPP
just to make sure?
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Replies Reply to this message
#3 Peter Zijlstra
July 02nd, 2011 - 06:40 am ET | Report spam
On Fri, 2011-07-01 at 17:22 -0400, Glauber Costa wrote:
@@ -3929,6 +3945,23 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}

+#ifdef CONFIG_PARAVIRT
+ if (static_branch(&paravirt_steal_enabled)) {
+ u64 steal, st = 0;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ steal -= this_rq()->prev_steal_time;
+
+ st = steal_ticks(steal);
+ this_rq()->prev_steal_time += st * TICK_NSEC;
+
+ if (st) {
+ account_steal_time(st);
+ return;
+ }
+ }
+#endif
+
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))



So I was about to send an Ack for this patch, when I noticed that this
will all be dead code when CONFIG_IRQ_TIME_ACCOUNTING &&
sched_clock_irqtime.

I think irqtime_account_process_tick() wants a similar hunk (which
suggests splitting it out into an inline function).
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Replies Reply to this message
#4 Peter Zijlstra
July 02nd, 2011 - 06:40 am ET | Report spam
On Fri, 2011-07-01 at 17:22 -0400, Glauber Costa wrote:
This patch makes update_rq_clock() aware of steal time.
The mechanism of operation is not different from irq_time,
and follows the same principles. This lives in a CONFIG
option itself, and can be compiled out independently of
the rest of steal time reporting. The effect of disabling it
is that the scheduler will still report steal time (that cannot be
disabled), but won't use this information for cpu power adjustments.

Everytime update_rq_clock_task() is invoked, we query information
about how much time was stolen since last call, and feed it into
sched_rt_avg_update().

Although steal time reporting in account_process_tick() keeps
track of the last time we read the steal clock, in prev_steal_time,
this patch do it independently using another field,
prev_steal_time_rq. This is because otherwise, information about time
accounted in update_process_tick() would never reach us in update_rq_clock().

Signed-off-by: Glauber Costa
CC: Rik van Riel
CC: Jeremy Fitzhardinge
CC: Peter Zijlstra
CC: Avi Kivity
CC: Anthony Liguori
CC: Eric B Munson



Acked-by: Peter Zijlstra

Cleaning up that CPP mess when the GCC people come back saying their
compiler will properly do that optimization can always be done in a
follow up patch.

Thanks!
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Replies Reply to this message
#5 Avi Kivity
July 03rd, 2011 - 09:30 am ET | Report spam
On 07/02/2011 01:24 PM, Peter Zijlstra wrote:
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
s64 irq_delta = 0, steal = 0;

rq->clock_task += delta;

if ((irq_delta + steal)&& sched_feat(NONTASK_POWER))
sched_rt_avg_update(rq, irq_delta + steal);
}

And we want it to emit the equivalent of:

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
rq->clock_task += delta;
}

Now Glauber is properly paranoid and doesn't trust his compiler (this is
very hot code in the kernel so any extra code emitted here is sad) and
chose the heavy handed CPP solution.

Now without checking a all relevant gcc versions on all relevant
architectures (see you in a few weeks etc..) can we actually rely on gcc
doing such relatively simple things correct, or should we stick with CPP
just to make sure?



I'm pretty sure any relevant version of gcc will do the right optimizations.

error compiling committee.c: too many arguments to function

To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
email Follow the discussion Replies Reply to this message
Help Create a new topicReplies Make a reply
Search Make your own search