From 090a3f615524c3f75d09fdb37f15ea1868d79f7e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 12 Jan 2015 18:19:40 +0100 Subject: [PATCH] x86/lib/copy_page_64.S: Use generic ALTERNATIVE macro ... instead of the semi-version with the spelled out sections. What is more, make the REP_GOOD version be the default copy_page() version as the majority of the relevant x86 CPUs do set X86_FEATURE_REP_GOOD. Thus, copy_page gets compiled to: ffffffff8130af80 : ffffffff8130af80: e9 0b 00 00 00 jmpq ffffffff8130af90 ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) ffffffff8130af8d: c3 retq ffffffff8130af8e: 66 90 xchg %ax,%ax ffffffff8130af90 : ... and after the alternatives have run, the JMP to the old, unrolled version gets NOPed out: ffffffff8130af80 : ffffffff8130af80: 66 66 90 xchg %ax,%ax ffffffff8130af83: 66 90 xchg %ax,%ax ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) ffffffff8130af8d: c3 retq On modern uarches, those NOPs are cheaper than the unconditional JMP previously. Signed-off-by: Borislav Petkov --- arch/x86/lib/copy_page_64.S | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index f1ffdbb07755..8239dbcbf984 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -2,23 +2,26 @@ #include #include +#include #include +/* + * Some CPUs run faster using the string copy instructions (sane microcode). + * It is also a lot simpler. Use this when possible. But, don't use streaming + * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the + * prefetch distance based on SMP/UP. + */ ALIGN -copy_page_rep: +ENTRY(copy_page) CFI_STARTPROC + ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret CFI_ENDPROC -ENDPROC(copy_page_rep) - -/* - * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. - * Could vary the prefetch distance based on SMP/UP. -*/ +ENDPROC(copy_page) -ENTRY(copy_page) +ENTRY(copy_page_regs) CFI_STARTPROC subq $2*8, %rsp CFI_ADJUST_CFA_OFFSET 2*8 @@ -90,21 +93,5 @@ ENTRY(copy_page) addq $2*8, %rsp CFI_ADJUST_CFA_OFFSET -2*8 ret -.Lcopy_page_end: CFI_ENDPROC -ENDPROC(copy_page) - - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - -#include - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ - .Lcopy_page_end-copy_page, 2b-1b, 0 - .previous +ENDPROC(copy_page_regs) -- 2.34.1