arch/x86/lib/memcpy_64.S
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

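/*
 * linkage.h supplies ENTRY()/ENDPROC(), dwarf2.h the CFI_* unwind
 * annotations, cpufeature.h the X86_FEATURE_* bits, and
 * alternative-asm.h the altinstruction_entry macro used for the
 * patch entries at the bottom of this file.
 */
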
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
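/*
 * These are the System V AMD64 argument and return registers for
 * void *memcpy(void *dest, const void *src, size_t n): the result in
 * %rax is the original destination.  %rcx and %r8-%r11 are used as
 * scratch registers; all of them are caller-saved in this ABI.
 */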

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax			/* return the original destination */
	movq %rdx, %rcx
	shrq $3, %rcx			/* quadword count */
	andl $7, %edx			/* 0..7 tail bytes */
	rep movsq
	movl %edx, %ecx
	rep movsb			/* copy the tail byte by byte */
	ret
.Lmemcpy_e:
	.previous

/*
 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
 * memcpy_c. Use memcpy_c_e when possible.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
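/*
 * With ERMS, the CPU's REP MOVSB implementation handles alignment and
 * short lengths efficiently on its own, so a single REP MOVSB covers
 * the whole count and no quadword/remainder split is needed.
 */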
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c_e:
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
.Lmemcpy_e_e:
	.previous

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail		/* fewer than 32 bytes: the tail code handles it all */

	/*
	 * Check whether a memory false dependence could occur, then jump
	 * to the corresponding copy mode: the low bytes of the source and
	 * destination addresses are compared (a signed compare of %sil
	 * with %dil), and when the source byte is the lower one we copy
	 * backward instead of forward.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail
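	/*
	 * Loop-termination trick: %rdx was biased down by 0x20 before the
	 * loop, and the subq at the top of each iteration is the only
	 * flag-setting instruction in the body (movq/leaq leave the flags
	 * alone), so the jae above still tests that subtraction.  The
	 * addl then restores the 0..31 byte tail count for .Lhandle_tail.
	 */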

.Lcopy_backward:
	/*
	 * Advance both pointers past the end of the region so the copy
	 * can run from the tail back toward the head.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
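	/*
	 * The backward loop below terminates the same way as the forward
	 * one: the subq at its top is the only flag-writing instruction,
	 * so the jae at the bottom tests it directly, and %rdx carries
	 * the same 0x20 bias.
	 */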
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Restore the 0..31 byte tail count and rewind both pointers
	 * back to the head of the remaining data.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
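	/*
	 * Tail handling: every case below loads from both the head and
	 * the very end of the remaining region, so the loads and stores
	 * may overlap and each size class needs only a fixed number of
	 * moves, with no loop.
	 */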
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Copy 16 to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Copy 8 to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Copy 4 to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx			/* sets CF if the length was 0, ZF if it was 1 */
	jb .Lend			/* nothing to copy */
	/*
	 * Copy 1 to 3 bytes.
	 */
	movzbl (%rsi), %ecx		/* first byte (mov leaves the flags untouched) */
	jz .Lstore_1byte		/* length was exactly 1 */
	movzbq 1(%rsi), %r8		/* second byte */
	movzbq (%rsi, %rdx), %r9	/* last byte (%rdx is now length-1) */
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * Some CPUs provide the enhanced REP MOVSB/STOSB (ERMS) feature.
 * If the feature is supported, memcpy_c_e() is the first choice.
 * If enhanced REP MOVSB copy is not available, use the fast string
 * copy memcpy_c() when possible; it is faster and simpler than the
 * original memcpy().
 * Otherwise, the original memcpy() is used.
 * In the .altinstructions section, the ERMS entry is placed after the
 * REP_GOOD entry so that the patches are applied in the right order.
 *
 * Replace only the beginning: memcpy itself is used while alternatives
 * are applied, so padding the rest of it with NOPs would be silly -
 * a reboot would be the only outcome...
 */
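/*
 * Each altinstruction_entry below records the location to patch
 * (memcpy), the replacement code, the CPU feature bit that selects it,
 * and two lengths.  Both lengths are given as the size of the
 * replacement, so only that many bytes at the start of memcpy are
 * rewritten and nothing is padded with NOPs.
 */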
	.section .altinstructions, "a"
	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
	.previous