[PATCH] x86, AMD: Correct F15h IC aliasing issue

July 22nd, 2011 - 09:20 am ET by Borislav Petkov
From: Borislav Petkov <borislav.petkov@amd.com>

This patch provides performance tuning for the "Bulldozer" CPU. With its
shared instruction cache there is a chance of generating an excessive
number of cache cross-invalidates when running specific workloads on the
cores of a compute module.

This excessive number of cross-invalidations can be observed if cache
lines backed by shared physical memory alias in bits [14:12] of their
virtual addresses, as those bits are used for the index generation.

This patch addresses the issue by zeroing out the slice [14:12] of
the file mapping's virtual address at generation time, thus forcing
those bits to be the same for all mappings of a single shared library
across processes and, in doing so, avoiding instruction cache aliases.
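
To illustrate (an example added for this write-up, not part of the
patch; the macro and helper names are made up), two virtual mappings of
the same physical line land in different IC index sets exactly when
they differ in those three bits:

	#define PAGE_SHIFT	12
	#define IC_IDX_BITS	(0x7UL << PAGE_SHIFT)	/* bits [14:12] */

	/* same physical backing assumed: the mappings alias in the IC
	 * iff their index-generation bits differ */
	static int ic_aliases(unsigned long va1, unsigned long va2)
	{
		return ((va1 ^ va2) & IC_IDX_BITS) != 0;
	}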

It also adds the kernel command line option
"unalias_va_addr=(32|64|off)" with which virtual address unaliasing
can be enabled for 32-bit or 64-bit x86 individually, or be completely
disabled.
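
For example, to turn the unaliasing off at boot (a hypothetical GRUB
entry; kernel image and root device are placeholders):

	linux /boot/vmlinuz root=/dev/sda1 ro unalias_va_addr=off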

This change leaves virtual region address allocation on other families
and/or vendors unaffected.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Martin Pohlack <martin.pohlack@amd.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

Documentation/kernel-parameters.txt | 6 ++++
arch/x86/include/asm/elf.h | 36 +++++++++++++++++++++++++++
arch/x86/kernel/cpu/amd.c | 7 +++++
arch/x86/kernel/sys_x86_64.c | 46 ++++++++++++++++++++++++++++++++--
arch/x86/mm/mmap.c | 15 ---------------
arch/x86/vdso/vma.c | 10 +++++++
6 files changed, 102 insertions(+), 18 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fd248a31..17eda04 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2535,6 +2535,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Note that genuine overcurrent events won't be
reported either.

+ unalias_va_addr
+ [X86-64] Unalias VA address by clearing slice [14:12]
+ 1: only on 32-bit
+ 2: only on 64-bit
+ off: disabled on both 32 and 64-bit
+
unknown_nmi_panic
[X86] Cause panic on unknown NMI.

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..89e38a4 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -4,6 +4,7 @@
/*
* ELF register definitions..
*/
+#include <linux/thread_info.h>

#include <asm/ptrace.h>
#include <asm/user.h>
@@ -320,4 +321,39 @@ extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
extern unsigned long arch_randomize_brk(struct mm_struct *mm);
#define arch_randomize_brk arch_randomize_brk

+/*
+ * True on X86_32 or when emulating IA32 on X86_64
+ */
+static inline int mmap_is_ia32(void)
+{
+#ifdef CONFIG_X86_32
+ return 1;
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ if (test_thread_flag(TIF_IA32))
+ return 1;
+#endif
+ return 0;
+}
+
+#define UNALIAS_VA_32 1
+#define UNALIAS_VA_64 2
+
+extern int unalias_va_addr;
+static inline unsigned long unalias_addr(unsigned long addr, bool incr)
+{
+ /* handle both 32- and 64-bit with a single conditional */
+ if (!(unalias_va_addr & (2 - mmap_is_ia32())))
+ return addr;
+
+ /* check if [14:12] are already cleared */
+ if (!(addr & (0x7 << PAGE_SHIFT)))
+ return addr;
+
+ addr = addr & ~(0x7 << PAGE_SHIFT);
+ if (incr)
+ addr += (0x8 << PAGE_SHIFT);
+
+ return addr;
+}
#endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index b13ed39..2d380c6 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -458,6 +458,13 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
"with P0 frequency!");
}
}
+
+ if (unalias_va_addr == -1) {
+ if (c->x86 == 0x15)
+ unalias_va_addr = UNALIAS_VA_32 | UNALIAS_VA_64;
+ else
+ unalias_va_addr = 0;
+ }
}

static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index ff14a50..323c411 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -18,6 +18,30 @@
#include <asm/ia32.h>
#include <asm/syscalls.h>

+/* see Documentation/kernel-parameters.txt */
+int unalias_va_addr = -1;
+
+static int __init control_va_addr_unaliasing(char *str)
+{
+ if (*str == 0)
+ return 1;
+
+ if (*str == '=')
+ str++;
+
+ if (!strcmp(str, "32"))
+ unalias_va_addr = UNALIAS_VA_32;
+ else if (!strcmp(str, "64"))
+ unalias_va_addr = UNALIAS_VA_64;
+ else if (!strcmp(str, "off"))
+ unalias_va_addr = 0;
+ else
+ return 0;
+
+ return 1;
+}
+__setup("unalias_va_addr", control_va_addr_unaliasing);
+
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
@@ -92,6 +116,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
start_addr = addr;

full_search:
+ if (filp)
+ addr = unalias_addr(addr, true);
+
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
if (end - len < addr) {
@@ -117,6 +144,9 @@ full_search:
mm->cached_hole_size = vma->vm_start - addr;

addr = vma->vm_end;
+
+ if (filp)
+ addr = unalias_addr(addr, true);
}
}

@@ -161,10 +191,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,

/* make sure it can fit in the remaining address space */
if (addr > len) {
- vma = find_vma(mm, addr-len);
- if (!vma || addr <= vma->vm_start)
+ unsigned long tmp_addr = addr;
+
+ if (filp)
+ tmp_addr = unalias_addr(addr - len, false);
+ else
+ tmp_addr = tmp_addr - len;
+
+ vma = find_vma(mm, tmp_addr);
+ if (!vma || tmp_addr + len <= vma->vm_start)
/* remember the address as a hint for next time */
- return mm->free_area_cache = addr-len;
+ return mm->free_area_cache = tmp_addr;
}

if (mm->mmap_base < len)
@@ -173,6 +210,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = mm->mmap_base-len;

do {
+ if (filp)
+ addr = unalias_addr(addr, false);
+
/*
* Lookup failure means no vma is above this address,
* else if new region fits below vma->vm_start,
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 1dab519..d4c0736 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -51,21 +51,6 @@ static unsigned int stack_maxrandom_size(void)
#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
#define MAX_GAP (TASK_SIZE/6*5)

-/*
- * True on X86_32 or when emulating IA32 on X86_64
- */
-static int mmap_is_ia32(void)
-{
-#ifdef CONFIG_X86_32
- return 1;
-#endif
-#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
- return 1;
-#endif
- return 0;
-}
-
static int mmap_is_legacy(void)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7abd2be..a48dfd2 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -69,6 +69,16 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
addr = start + (offset << PAGE_SHIFT);
if (addr >= end)
addr = end;
+
+ if (unalias_va_addr) {
+ /*
+ * page-align it here so that get_unmapped_area doesn't
+ * align it wrongfully again to the next page
+ */
+ addr = PAGE_ALIGN(addr);
+ addr = unalias_addr(addr, false);
+ }
+
return addr;
}

1.7.4.rc2

#1 Ingo Molnar
July 24th, 2011 - 07:20 am ET
(Cc:-ed Linus and akpm)

* Borislav Petkov wrote:

From: Borislav Petkov

This patch provides performance tuning for the "Bulldozer" CPU. With its
shared instruction cache there is a chance of generating an excessive
number of cache cross-invalidates when running specific workloads on the
cores of a compute module.

This excessive number of cross-invalidations can be observed if cache
lines backed by shared physical memory alias in bits [14:12] of their
virtual addresses, as those bits are used for the index generation.

This patch addresses the issue by zeroing out the slice [14:12] of
the file mapping's virtual address at generation time, thus forcing
those bits to be the same for all mappings of a single shared library
across processes and, in doing so, avoiding instruction cache aliases.



So all commonly-aliased virtual mappings of the same pages should be
32K granular aligned for good performance? Seems rather unfortunate
for a modern CPU - is this going to be the case for all family 0x15
CPUs?

It also adds the kernel command line option
"unalias_va_addr=(32|64|off)" with which virtual address unaliasing
can be enabled for 32-bit or 64-bit x86 individually, or be completely
disabled.



So the default behavior is to have this aligning enabled.


This change leaves virtual region address allocation on other families
and/or vendors unaffected.

Signed-off-by: Andre Przywara
Signed-off-by: Martin Pohlack
Signed-off-by: Borislav Petkov

Documentation/kernel-parameters.txt | 6 ++++
arch/x86/include/asm/elf.h | 36 +++++++++++++++++++++++++++
arch/x86/kernel/cpu/amd.c | 7 +++++
arch/x86/kernel/sys_x86_64.c | 46 ++++++++++++++++++++++++++++++++--
arch/x86/mm/mmap.c | 15 ---------------
arch/x86/vdso/vma.c | 10 +++++++
6 files changed, 102 insertions(+), 18 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fd248a31..17eda04 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2535,6 +2535,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Note that genuine overcurrent events won't be
reported either.

+ unalias_va_addr
+ [X86-64] Unalias VA address by clearing slice [14:12]
+ 1: only on 32-bit
+ 2: only on 64-bit
+ off: disabled on both 32 and 64-bit



This says nothing about why it's cleared, what it does and why one
would want to handle it differently on 32-bit and 64-bit.

+
unknown_nmi_panic
[X86] Cause panic on unknown NMI.

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..89e38a4 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -4,6 +4,7 @@
/*
* ELF register definitions..
*/
+#include <linux/thread_info.h>

#include <asm/ptrace.h>
#include <asm/user.h>
@@ -320,4 +321,39 @@ extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
extern unsigned long arch_randomize_brk(struct mm_struct *mm);
#define arch_randomize_brk arch_randomize_brk

+/*
+ * True on X86_32 or when emulating IA32 on X86_64
+ */
+static inline int mmap_is_ia32(void)
+{
+#ifdef CONFIG_X86_32
+ return 1;
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ if (test_thread_flag(TIF_IA32))
+ return 1;
+#endif
+ return 0;
+}
+
+#define UNALIAS_VA_32 1
+#define UNALIAS_VA_64 2
+
+extern int unalias_va_addr;
+static inline unsigned long unalias_addr(unsigned long addr, bool incr)
+{
+ /* handle both 32- and 64-bit with a single conditional */
+ if (!(unalias_va_addr & (2 - mmap_is_ia32())))
+ return addr;
+
+ /* check if [14:12] are already cleared */
+ if (!(addr & (0x7 << PAGE_SHIFT)))
+ return addr;
+
+ addr = addr & ~(0x7 << PAGE_SHIFT);
+ if (incr)
+ addr += (0x8 << PAGE_SHIFT);
+
+ return addr;
+}



Looks too big to be inlined.

Also, there's no explanation what the 'incr' logic is about and why
it adds 32K to the address. Is this round-up logic in disguise?

#endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index b13ed39..2d380c6 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -458,6 +458,13 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
"with P0 frequency!");
}
}
+
+ if (unalias_va_addr == -1) {
+ if (c->x86 == 0x15)
+ unalias_va_addr = UNALIAS_VA_32 | UNALIAS_VA_64;
+ else
+ unalias_va_addr = 0;



the placement here feels a bit wrong - don't we have run-once CPU
quirks?

+ }
}

static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index ff14a50..323c411 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -18,6 +18,30 @@
#include <asm/ia32.h>
#include <asm/syscalls.h>

+/* see Documentation/kernel-parameters.txt */
+int unalias_va_addr = -1;



used by hot VM code so this really wants to be __read_mostly ...

+
+static int __init control_va_addr_unaliasing(char *str)
+{
+ if (*str == 0)
+ return 1;
+
+ if (*str == '=')
+ str++;
+
+ if (!strcmp(str, "32"))
+ unalias_va_addr = UNALIAS_VA_32;
+ else if (!strcmp(str, "64"))
+ unalias_va_addr = UNALIAS_VA_64;
+ else if (!strcmp(str, "off"))
+ unalias_va_addr = 0;
+ else
+ return 0;



there's no 'both'/'all' setting: while it can be achieved by leaving
out this boot option (i.e. using the default) we generally try to
allow the setting of all combinations.

+
+ return 1;
+}
+__setup("unalias_va_addr", control_va_addr_unaliasing);
+
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
@@ -92,6 +116,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
start_addr = addr;

full_search:
+ if (filp)
+ addr = unalias_addr(addr, true);
+
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
if (end - len < addr) {
@@ -117,6 +144,9 @@ full_search:
mm->cached_hole_size = vma->vm_start - addr;

addr = vma->vm_end;
+
+ if (filp)
+ addr = unalias_addr(addr, true);



Instead of polluting the code with 'if (filp)' conditions (which is
really a property specific to this VA placement quirk) please push it
into the address alignment function so that it looks like this:

addr = cache_aliasing_quirk(addr, filp);

or so. (see further below about the 'unalias' naming.)
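
One way the helper could look (a sketch only; 'cache_aliasing_quirk'
is the name Ingo suggests, while the body and the assumption that the
round-up/round-down flag still has to be passed through are mine):

	static unsigned long cache_aliasing_quirk(unsigned long addr,
						  struct file *filp,
						  bool incr)
	{
		/* the quirk applies to file-backed mappings only */
		if (!filp)
			return addr;

		return unalias_addr(addr, incr);
	}

with which the call sites above collapse to an unconditional

	addr = cache_aliasing_quirk(addr, filp, true);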

}
}

@@ -161,10 +191,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,

/* make sure it can fit in the remaining address space */
if (addr > len) {
- vma = find_vma(mm, addr-len);
- if (!vma || addr <= vma->vm_start)
+ unsigned long tmp_addr = addr;
+
+ if (filp)
+ tmp_addr = unalias_addr(addr - len, false);
+ else
+ tmp_addr = tmp_addr - len;



This too could be written cleaner with the updated helper.

+
+ vma = find_vma(mm, tmp_addr);
+ if (!vma || tmp_addr + len <= vma->vm_start)
/* remember the address as a hint for next time */
- return mm->free_area_cache = addr-len;
+ return mm->free_area_cache = tmp_addr;
}

if (mm->mmap_base < len)
@@ -173,6 +210,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = mm->mmap_base-len;

do {
+ if (filp)
+ addr = unalias_addr(addr, false);
+
/*
* Lookup failure means no vma is above this address,
* else if new region fits below vma->vm_start,
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 1dab519..d4c0736 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -51,21 +51,6 @@ static unsigned int stack_maxrandom_size(void)
#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
#define MAX_GAP (TASK_SIZE/6*5)

-/*
- * True on X86_32 or when emulating IA32 on X86_64
- */
-static int mmap_is_ia32(void)
-{
-#ifdef CONFIG_X86_32
- return 1;
-#endif
-#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
- return 1;
-#endif
- return 0;
-}
-
static int mmap_is_legacy(void)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7abd2be..a48dfd2 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -69,6 +69,16 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
addr = start + (offset << PAGE_SHIFT);
if (addr >= end)
addr = end;
+
+ if (unalias_va_addr) {
+ /*
+ * page-align it here so that get_unmapped_area doesn't
+ * align it wrongfully again to the next page
+ */
+ addr = PAGE_ALIGN(addr);
+ addr = unalias_addr(addr, false);
+ }



Well, this cuts 3 bits from the randomization granularity. It would
be better to extend the scope/range of randomization beyond the
current PMD_SIZE range instead of cutting it.

(Arguably it would also make sense to increase vdso randomization
beyond a 2MB range as well, but that's beyond this patch.)

also, the PAGE_ALIGN() complication here looks unnecessary - can the
vdso 'addr' ever be not 4K aligned above?

Thirdly, we stick this vdso_addr() result into a get_unmapped_area()
call - which does an unalias_addr() call yet another time, right? So
why the two separate calls?

+
return addr;
}



Also, a naming nit: i find the whole 'unalias' language above rather
confusing in this context: AFAICS the point is to intentionally
*align* those mappings. While that technically 'unaliases' them in
CPU cache coherency lingo, the code you are modifying here is VM code
where 'unaliasing' can mean a number of other things, none of which
are related to this problem.

'Aligning' a mapping to a given granularity is a common expression
though.

Thanks,

Ingo
#2 Borislav Petkov
July 24th, 2011 - 09:50 am ET
Hi Ingo,

thanks for taking the time to look at this in detail.

On Sun, Jul 24, 2011 at 07:13:50AM -0400, Ingo Molnar wrote:

(Cc:-ed Linus and akpm)

* Borislav Petkov wrote:

> From: Borislav Petkov
>
> This patch provides performance tuning for the "Bulldozer" CPU. With its
> shared instruction cache there is a chance of generating an excessive
> number of cache cross-invalidates when running specific workloads on the
> cores of a compute module.
>
> This excessive number of cross-invalidations can be observed if cache
> lines backed by shared physical memory alias in bits [14:12] of their
> virtual addresses, as those bits are used for the index generation.
>
> This patch addresses the issue by zeroing out the slice [14:12] of
> the file mapping's virtual address at generation time, thus forcing
> those bits to be the same for all mappings of a single shared library
> across processes and, in doing so, avoiding instruction cache aliases.

So all commonly-aliased virtual mappings of the same pages should be
32K granular aligned for good performance? Seems rather unfortunate
for a modern CPU - is this going to be the case for all family 0x15
CPUs?



Right, this gets enabled on all F15h by default for now... The thing
is, the invalidations appear only in certain cases and when the two
tasks run on the cores of one compute module. However, due to the
scheduler migrating them, we need to have those VA assignments aligned
(or unaliased) at process creation.

Another possibility would be to not 32K-align but to never schedule
the tasks on the same compute module; that won't fly, however, in an
n-tasks-on-n-cores scenario.

> It also adds the kernel command line option
> "unalias_va_addr=(32|64|off)" with which virtual address unaliasing
> can be enabled for 32-bit or 64-bit x86 individually, or be completely
> disabled.

So the default behavior is to have this aligning enabled.



Yes, it gets enabled on F15h by default.

> This change leaves virtual region address allocation on other families
> and/or vendors unaffected.
>
> Signed-off-by: Andre Przywara
> Signed-off-by: Martin Pohlack
> Signed-off-by: Borislav Petkov
>
> Documentation/kernel-parameters.txt | 6 ++++
> arch/x86/include/asm/elf.h | 36 +++++++++++++++++++++++++++
> arch/x86/kernel/cpu/amd.c | 7 +++++
> arch/x86/kernel/sys_x86_64.c | 46 ++++++++++++++++++++++++++++++++--
> arch/x86/mm/mmap.c | 15 ---------------
> arch/x86/vdso/vma.c | 10 +++++++
> 6 files changed, 102 insertions(+), 18 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index fd248a31..17eda04 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -2535,6 +2535,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> Note that genuine overcurrent events won't be
> reported either.
>
> + unalias_va_addr
> + [X86-64] Unalias VA address by clearing slice [14:12]
> + 1: only on 32-bit
> + 2: only on 64-bit
> + off: disabled on both 32 and 64-bit

This says nothing about why it's cleared, what it does and why one
would want to handle it differently on 32-bit and 64-bit.



This is there in case users care more about the 3 bits of ASLR
granularity than about the number of invalidations, for example. I'll
make this more descriptive.

[..]

> +extern int unalias_va_addr;
> +static inline unsigned long unalias_addr(unsigned long addr, bool incr)
> +{
> + /* handle both 32- and 64-bit with a single conditional */
> + if (!(unalias_va_addr & (2 - mmap_is_ia32())))
> + return addr;
> +
> + /* check if [14:12] are already cleared */
> + if (!(addr & (0x7 << PAGE_SHIFT)))
> + return addr;
> +
> + addr = addr & ~(0x7 << PAGE_SHIFT);
> + if (incr)
> + addr += (0x8 << PAGE_SHIFT);
> +
> + return addr;
> +}

Looks too big to be inlined.



Ok, will change.

Also, there's no explanation what the 'incr' logic is about and why it
adds 32K to the address. Is this round-up logic in disguise?



Basically yes. Round-up and -down, actually. The idea
is for the unalias_addr() helper to be called both from
arch_get_unmapped_area_topdown() and arch_get_unmapped_area() depending
on the search direction we take in the VA space. So 'incr' says whether
we increment the last cached VA addr (mm->free_area_cache) or we
decrement. In both cases, it changes addr so that its bits [14:12] are
zeros.
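
A worked example (addresses invented for illustration), with
PAGE_SHIFT == 12 so that bits [14:12] span the 32K slice:

	/* incr == false: round down, i.e. just clear bits [14:12] */
	unalias_addr(0x7f1942b2b000, false) == 0x7f1942b28000

	/* incr == true: clear bits [14:12], then add 32K (0x8000) */
	unalias_addr(0x7f1942b2b000, true)  == 0x7f1942b30000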

> #endif /* _ASM_X86_ELF_H */
> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
> index b13ed39..2d380c6 100644
> --- a/arch/x86/kernel/cpu/amd.c
> +++ b/arch/x86/kernel/cpu/amd.c
> @@ -458,6 +458,13 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
> "with P0 frequency!");
> }
> }
> +
> + if (unalias_va_addr == -1) {
> + if (c->x86 == 0x15)
> + unalias_va_addr = UNALIAS_VA_32 | UNALIAS_VA_64;
> + else
> + unalias_va_addr = 0;

the placement here feels a bit wrong - don't we have run-once CPU
quirks?



Maybe another func ptr in struct x86_cpuinit_ops
(arch/x86/kernel/x86_init.c) would be a better place?

> + }
> }
>
> static void __cpuinit init_amd(struct cpuinfo_x86 *c)
> diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
> index ff14a50..323c411 100644
> --- a/arch/x86/kernel/sys_x86_64.c
> +++ b/arch/x86/kernel/sys_x86_64.c
> @@ -18,6 +18,30 @@
> #include <asm/ia32.h>
> #include <asm/syscalls.h>
>
> +/* see Documentation/kernel-parameters.txt */
> +int unalias_va_addr = -1;

used by hot VM code so this really wants to be __read_mostly ...



Ok.

> +
> +static int __init control_va_addr_unaliasing(char *str)
> +{
> + if (*str == 0)
> + return 1;
> +
> + if (*str == '=')
> + str++;
> +
> + if (!strcmp(str, "32"))
> + unalias_va_addr = UNALIAS_VA_32;
> + else if (!strcmp(str, "64"))
> + unalias_va_addr = UNALIAS_VA_64;
> + else if (!strcmp(str, "off"))
> + unalias_va_addr = 0;
> + else
> + return 0;

there's no 'both'/'all' setting: while it can be achieved by leaving
out this boot option (i.e. using the default) we generally try to
allow the setting of all combinations.



My reasoning here was that it gets enabled on F15h by default but we
don't want it on the remaining families. This option is additionally
there to tweak the behavior on F15h, though theoretically you could
enable it on everything else too. I think I want to restrict the
visibility/modifiability of this option to F15h only.

> +
> + return 1;
> +}
> +__setup("unalias_va_addr", control_va_addr_unaliasing);
> +
> SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
> unsigned long, prot, unsigned long, flags,
> unsigned long, fd, unsigned long, off)
> @@ -92,6 +116,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
> start_addr = addr;
>
> full_search:
> + if (filp)
> + addr = unalias_addr(addr, true);
> +
> for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
> /* At this point: (!vma || addr < vma->vm_end). */
> if (end - len < addr) {
> @@ -117,6 +144,9 @@ full_search:
> mm->cached_hole_size = vma->vm_start - addr;
>
> addr = vma->vm_end;
> +
> + if (filp)
> + addr = unalias_addr(addr, true);

Instead of polluting the code with 'if (filp)' conditions (which is
really a property specific to this VA placement quirk) please push it
into the address alignment function so that it looks like this:

addr = cache_aliasing_quirk(addr, filp);

or so. (see further below about the 'unalias' naming.)



Ok, will do.

I was also thinking of doing something similar to ALTERNATIVE_JUMP
but instead of looking at a CPU feature bit, it could check a generic
conditional evaluated at runtime and patch in an unconditional 'jmp' to
make this codepath even cheaper. But that can come later.

>
> @@ -161,10 +191,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
>
> /* make sure it can fit in the remaining address space */
> if (addr > len) {
> - vma = find_vma(mm, addr-len);
> - if (!vma || addr <= vma->vm_start)
> + unsigned long tmp_addr = addr;
> +
> + if (filp)
> + tmp_addr = unalias_addr(addr - len, false);
> + else
> + tmp_addr = tmp_addr - len;

This too could be written cleaner with the updated helper.



Sure, will do.

[..]

> diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
> index 7abd2be..a48dfd2 100644
> --- a/arch/x86/vdso/vma.c
> +++ b/arch/x86/vdso/vma.c
> @@ -69,6 +69,16 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
> addr = start + (offset << PAGE_SHIFT);
> if (addr >= end)
> addr = end;
> +
> + if (unalias_va_addr) {
> + /*
> + * page-align it here so that get_unmapped_area doesn't
> + * align it wrongfully again to the next page
> + */
> + addr = PAGE_ALIGN(addr);
> + addr = unalias_addr(addr, false);
> + }

Well, this cuts 3 bits from the randomization granularity. It would
be better to extend the scope/range of randomization beyond the
current PMD_SIZE range instead of cutting it.

(Arguably it would also make sense to increase vdso randomization
beyond a 2MB range as well, but that's beyond this patch.)



AFAIU, this would cost us 2 pages: a PMD and a PTE if we don't keep the
vDSO in the same PTE as the comment above vdso_addr() puts it:

/* Put the vdso above the (randomized) stack with another randomized offset.
This way there is no hole in the middle of address space.
To save memory make sure it is still in the same PTE as the stack top.
This doesn't give that many random bits */

Hmm..

also, the PAGE_ALIGN() complication here looks unnecessary - can the
vdso 'addr' ever be not 4K aligned above?



Yeah, it can. I did some trace_printk runs and 'addr' wasn't 4K aligned
in some cases. I think this is because of the stack randomization we do
and we use mm->start_stack to get the vdso base address. Unfortunately,
I can't find those traces anymore but will do some again tomorrow.
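
A worked example of that ordering problem (addresses invented for
illustration):

	addr = 0x7ffff7ffe123			/* not 4K aligned */

	/* unalias first, align after: bits [14:12] get dirtied again */
	unalias_addr(addr, false)	    == 0x7ffff7ff8123
	PAGE_ALIGN(0x7ffff7ff8123)	    == 0x7ffff7ff9000	/* bit 12 set */

	/* align first, unalias after: bits [14:12] stay clear */
	PAGE_ALIGN(addr)		    == 0x7ffff7fff000
	unalias_addr(0x7ffff7fff000, false) == 0x7ffff7ff8000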

Thirdly, we stick this vdso_addr() result into a get_unmapped_area()
call - which does an unalias_addr() call yet another time, right? So
why the two separate calls?



Right, in order to unalias the address, I check if we have a file
mapping which backs it - if (filp) - and this is not the case for the
vDSO. And if the address is not 4K aligned and we unalias it first and
_then_ align it, we end up with a result we don't want.

But speaking of file mappings, the better fix should be to pass
protection flags down to get_unmapped_area() from do_mmap_pgoff()
so that we can check against PROT_EXEC and only then change the
address. However, this would require changes all over the place since
->get_unmapped_area is part of fs file_operations, etc, etc.

It still works this way too, anyway, because for example:

$ cat /proc/self/maps
...

7f19429b1000-7f1942b2b000 r-xp 00000000 08:01 4112535 /lib/x86_64-linux-gnu/libc-2.13.so
7f1942b2b000-7f1942d2b000 ---p 0017a000 08:01 4112535 /lib/x86_64-linux-gnu/libc-2.13.so
7f1942d2b000-7f1942d2f000 r--p 0017a000 08:01 4112535 /lib/x86_64-linux-gnu/libc-2.13.so
7f1942d2f000-7f1942d30000 rw-p 0017e000 08:01 4112535 /lib/x86_64-linux-gnu/libc-2.13.so

when we do the first "r-xp" mapping above, we do

get_unmapped_area(filp != NULL, addr = 0...)

and we change the address accordingly, but the follow-up mappings come
in either with MAP_FIXED set so we return the requested address early
or they're anonymous mappings for .bss and RO data which have no file
backing anyways.

Also, a naming nit: i find the whole 'unalias' language above rather
confusing in this context: AFAICS the point is to intentionally
*align* those mappings. While that technically 'unaliases' them in
CPU cache coherency lingo, the code you are modifying here is VM code
where 'unaliasing' can mean a number of other things, none of which
are related to this problem.

'Aligning' a mapping to a given granularity is a common expression
though.



Yes, makes sense, will change.

Thanks again for the review.

Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
#3 Ingo Molnar
July 24th, 2011 - 09:50 am ET
* Borislav Petkov wrote:

> > diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
> > index b13ed39..2d380c6 100644
> > --- a/arch/x86/kernel/cpu/amd.c
> > +++ b/arch/x86/kernel/cpu/amd.c
> > @@ -458,6 +458,13 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
> > "with P0 frequency!");
> > }
> > }
> > +
> > + if (unalias_va_addr == -1) {
> > + if (c->x86 == 0x15)
> > + unalias_va_addr = UNALIAS_VA_32 | UNALIAS_VA_64;
> > + else
> > + unalias_va_addr = 0;
>
> the placement here feels a bit wrong - don't we have run-once CPU
> quirks?

Maybe another func ptr in struct x86_cpuinit_ops
(arch/x86/kernel/x86_init.c) would be a better place?



Yeah.

Assuming no objections come i'd suggest to do that as an add-on patch
- so the first patch can be backported, the second patch cleans up
the x86_cpuinit_ops aspect.

Thanks,

Ingo
#4 Linus Torvalds
July 24th, 2011 - 12:10 pm ET
Argh. This is a small disaster, you know that, right? Suddenly we have
user-visible allocation changes depending on which CPU you are running
on. I just hope that the address-space randomization has caught all
the code that depended on specific layouts.

And even with ASLR, I wouldn't be surprised if there are binaries out
there that "know" that they get dense virtual memory when they do
back-to-back allocations, even when they don't pass in the address
explicitly.

How much testing has AMD done with this change and various legacy
Linux distros? The 32-bit case in particular makes me nervous, that's
where I'd expect a higher likelihood of binaries that depend on the
layout.

You guys do realize that we had to disable ASLR on many machines?

So at a MINIMUM, I would say that this is acceptable only when the
process doing the allocation hasn't got ASLR disabled.

On Fri, Jul 22, 2011 at 6:15 AM, Borislav Petkov wrote:
+static inline unsigned long unalias_addr(unsigned long addr, bool incr)
+{
+       /* handle both 32- and 64-bit with a single conditional */
+       if (!(unalias_va_addr & (2 - mmap_is_ia32())))
+               return addr;



Ugh. I guess it works, but the actual values you used did not have a
comment about those particular values being magical. You should do
that, otherwise somebody will start adding bits and moving things
around, and breaking this "bits 2/1" logic.
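
Spelled out, the comment being asked for could read (wording mine, not
from the patch):

	/*
	 * UNALIAS_VA_32 == 1 == bit 0, UNALIAS_VA_64 == 2 == bit 1, and
	 * mmap_is_ia32() returns 1 on 32-bit and 0 on 64-bit, so
	 * (2 - mmap_is_ia32()) yields UNALIAS_VA_32 on 32-bit and
	 * UNALIAS_VA_64 on 64-bit - exactly the flag for the current mode.
	 */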

+       /* check if [14:12] are already cleared */
+       if (!(addr & (0x7 << PAGE_SHIFT)))
+               return addr;
+
+       addr = addr & ~(0x7 << PAGE_SHIFT);
+       if (incr)
+               addr += (0x8 << PAGE_SHIFT);



This is just really hard to look at. First you talk about "bits
14:12", and then you use odd values like "8 << PAGE_SHIFT".

Yes, yes, I can do the math in my head, and say "8 is 1<<3, and
PAGE_SHIFT is 12, so he's adding things up to the next bit 15".

But is that really sensible?

If we don't already have helpers for this, it would still be prettier
with something like

#define BIT(a) (1ul << (a))
#define BITS(a,b) (BIT((a)+1) - BIT(b))

and then that "0x7 << PAGE_SHIFT" ends up being BITS(14,12) like in
the comment (you should really double-check that I got it right
though).
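
Double-checking those macros (illustrative only):

	#define BIT(a)		(1ul << (a))
	#define BITS(a, b)	(BIT((a) + 1) - BIT(b))

	/*
	 * BITS(14, 12) == (1ul << 15) - (1ul << 12)
	 *              == 0x8000 - 0x1000 == 0x7000 == 0x7 << 12,
	 * which matches "0x7 << PAGE_SHIFT" with PAGE_SHIFT == 12,
	 * so the macros do produce the [14:12] mask as written.
	 */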

Or alternatively, make the comment match the code, and explain the
14:12 with something like "the three bits above the page mask",
although that just sounds odd.

Anyway, I seriously think that this patch is completely unacceptable
in this form, and is quite possibly going to break real applications.
Maybe most of the applications that had problems with ASLR only had
trouble with anonymous memory, and the fact that you only do this for
file mappings might mean that it's ok. But I'd be really worried.
Changing address space layout is not a small decision.

Linus
#5 Andrew Morton
July 24th, 2011 - 12:20 pm ET
On Sun, 24 Jul 2011 15:40:46 +0200 Borislav Petkov wrote:

> > + unalias_va_addr
> > + [X86-64] Unalias VA address by clearing slice [14:12]
> > + 1: only on 32-bit
> > + 2: only on 64-bit
> > + off: disabled on both 32 and 64-bit



How much performance difference does this actually make?

> This says nothing about why it's cleared, what it does and why one
> would want to handle it differently on 32-bit and 64-bit.

This is there in case users care more about the 3 bits of ASLR
granularity than about the number of invalidations, for example.



To make this decision the users will want to know how much their
computers will slow down. We should tell them.