/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, on CPUs
 * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed to a jmp to memcpy_erms, which does the copy with
 * REP; MOVSB.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
        ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
                      "jmp memcpy_erms", X86_FEATURE_ERMS
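        /*
         * The ALTERNATIVE_2 above resolves to one of three paths at boot:
         * by default it is a jmp to memcpy_orig; on REP_GOOD CPUs the jmp
         * is patched to NOPs so we fall through to the rep movsq body
         * below; on ERMS CPUs it becomes a jmp to memcpy_erms.
         *
         * The fall-through body copies count/8 qwords with rep movsq and
         * the remaining count%8 bytes with rep movsb, returning the
         * original destination in rax.
         */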

        movq %rdi, %rax
        movq %rdx, %rcx
        shrq $3, %rcx
        andl $7, %edx
        rep movsq
        movl %edx, %ecx
        rep movsb
        ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
        movq %rdi, %rax
        movq %rdx, %rcx
        rep movsb
        ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
        movq %rdi, %rax

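        /* Copies shorter than 32 bytes are handled entirely by the tail code. */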
        cmpq $0x20, %rdx
        jb .Lhandle_tail

        /*
         * Check whether a memory false dependence could occur,
         * then jump to the corresponding copy mode.
         */
        cmp %dil, %sil
        jl .Lcopy_backward
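        /*
         * Only the low bytes of src and dst are compared above: a cheap
         * heuristic meant to catch the case where the source lies just
         * below the destination, so that a forward copy would keep
         * reloading data that was only just stored (a store-forwarding
         * false dependence). In that case we copy backward instead.
         */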
        subq $0x20, %rdx
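        /*
         * %rdx is kept biased by -0x20: the subq at the top of the loop
         * both counts down and sets the flags (the movq/leaq in between
         * do not touch them), so jae keeps looping while at least one
         * more 32-byte block remains. The addl $0x20 after the loop
         * restores the true tail length (0..31) for .Lhandle_tail.
         */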
.Lcopy_forward_loop:
        subq $0x20, %rdx

        /*
         * Move in blocks of 4x8 bytes:
         */
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq 2*8(%rsi), %r10
        movq 3*8(%rsi), %r11
        leaq 4*8(%rsi), %rsi

        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, 2*8(%rdi)
        movq %r11, 3*8(%rdi)
        leaq 4*8(%rdi), %rdi
        jae .Lcopy_forward_loop
        addl $0x20, %edx
        jmp .Lhandle_tail

.Lcopy_backward:
        /*
         * Calculate copy position to tail.
         */
        addq %rdx, %rsi
        addq %rdx, %rdi
        subq $0x20, %rdx
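        /*
         * Same biased-counter scheme as the forward loop, but %rsi/%rdi
         * now point past the end of their buffers and the copy runs from
         * the tail toward the head using negative offsets.
         */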
        /*
         * At most 3 ALU operations in one cycle,
         * so append NOPs in the same 16-byte chunk.
         */
        .p2align 4
.Lcopy_backward_loop:
        subq $0x20, %rdx
        movq -1*8(%rsi), %r8
        movq -2*8(%rsi), %r9
        movq -3*8(%rsi), %r10
        movq -4*8(%rsi), %r11
        leaq -4*8(%rsi), %rsi
        movq %r8, -1*8(%rdi)
        movq %r9, -2*8(%rdi)
        movq %r10, -3*8(%rdi)
        movq %r11, -4*8(%rdi)
        leaq -4*8(%rdi), %rdi
        jae .Lcopy_backward_loop

        /*
         * Calculate copy position to head.
         */
        addl $0x20, %edx
        subq %rdx, %rsi
        subq %rdx, %rdi
.Lhandle_tail:
        cmpl $16, %edx
        jb .Lless_16bytes

        /*
         * Move data from 16 bytes to 31 bytes.
         */
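        /*
         * The 16..31 byte tail is copied as two possibly-overlapping
         * 16-byte pieces: the first 16 bytes of the range and the last
         * 16 bytes (addressed relative to %rdx). All loads complete
         * before any store, so the overlap is harmless. The 8..15 and
         * 4..7 byte cases below use the same trick with smaller pieces.
         */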
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq -2*8(%rsi, %rdx), %r10
        movq -1*8(%rsi, %rdx), %r11
        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, -2*8(%rdi, %rdx)
        movq %r11, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_16bytes:
        cmpl $8, %edx
        jb .Lless_8bytes
        /*
         * Move data from 8 bytes to 15 bytes.
         */
        movq 0*8(%rsi), %r8
        movq -1*8(%rsi, %rdx), %r9
        movq %r8, 0*8(%rdi)
        movq %r9, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_8bytes:
        cmpl $4, %edx
        jb .Lless_3bytes

        /*
         * Move data from 4 bytes to 7 bytes.
         */
        movl (%rsi), %ecx
        movl -4(%rsi, %rdx), %r8d
        movl %ecx, (%rdi)
        movl %r8d, -4(%rdi, %rdx)
        retq
        .p2align 4
.Lless_3bytes:
        subl $1, %edx
        jb .Lend
        /*
         * Move data from 1 byte to 3 bytes.
         */
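        /*
         * At this point %edx holds length-1 (0..2) and the flags from
         * the subl above are still live: jb caught length 0, and the jz
         * below catches length 1. For lengths 2 and 3 we store byte 1
         * and byte length-1 (which coincide when the length is 2), then
         * fall through to store byte 0.
         */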
        movzbl (%rsi), %ecx
        jz .Lstore_1byte
        movzbq 1(%rsi), %r8
        movzbq (%rsi, %rdx), %r9
        movb %r8b, 1(%rdi)
        movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
        movb %cl, (%rdi)

.Lend:
        retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
ENTRY(memcpy_mcsafe)
        cmpl $8, %edx
        /* Less than 8 bytes? Go to byte copy loop */
        jb .L_no_whole_words

        /* Check for bad alignment of source */
        testl $7, %esi
        /* Already aligned */
        jz .L_8byte_aligned

        /* Copy one byte at a time until source is 8-byte aligned */
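        /*
         * %ecx = 8 - (src & 7), i.e. the 1..7 bytes needed to reach
         * 8-byte alignment (the already-aligned case was handled above);
         * subtract that from the remaining count before the byte loop.
         */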
        movl %esi, %ecx
        andl $7, %ecx
        subl $8, %ecx
        negl %ecx
        subl %ecx, %edx
.L_copy_leading_bytes:
        movb (%rsi), %al
        movb %al, (%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz .L_copy_leading_bytes

.L_8byte_aligned:
        /* Figure out how many whole cache lines (64 bytes each) to copy */
        movl %edx, %ecx
        andl $63, %edx
        shrl $6, %ecx
        jz .L_no_whole_cache_lines

        /* Loop copying whole cache lines */
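        /*
         * Each load below gets its own .L_cache_wN label so the
         * exception table entries at the bottom of the file can direct
         * a machine check on any individual read to the fixup path.
         */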
.L_cache_w0:    movq (%rsi), %r8
.L_cache_w1:    movq 1*8(%rsi), %r9
.L_cache_w2:    movq 2*8(%rsi), %r10
.L_cache_w3:    movq 3*8(%rsi), %r11
        movq %r8, (%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, 2*8(%rdi)
        movq %r11, 3*8(%rdi)
.L_cache_w4:    movq 4*8(%rsi), %r8
.L_cache_w5:    movq 5*8(%rsi), %r9
.L_cache_w6:    movq 6*8(%rsi), %r10
.L_cache_w7:    movq 7*8(%rsi), %r11
        movq %r8, 4*8(%rdi)
        movq %r9, 5*8(%rdi)
        movq %r10, 6*8(%rdi)
        movq %r11, 7*8(%rdi)
        leaq 64(%rsi), %rsi
        leaq 64(%rdi), %rdi
        decl %ecx
        jnz .L_cache_w0

        /* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
        movl %edx, %ecx
        andl $7, %edx
        shrl $3, %ecx
        jz .L_no_whole_words

        /* Copy trailing words */
.L_copy_trailing_words:
        movq (%rsi), %r8
        mov %r8, (%rdi)
        leaq 8(%rsi), %rsi
        leaq 8(%rdi), %rdi
        decl %ecx
        jnz .L_copy_trailing_words

        /* Any trailing bytes? */
.L_no_whole_words:
        andl %edx, %edx
        jz .L_done_memcpy_trap

        /* Copy trailing bytes */
        movl %edx, %ecx
.L_copy_trailing_bytes:
        movb (%rsi), %al
        movb %al, (%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz .L_copy_trailing_bytes

        /* Copy successful. Return zero */
.L_done_memcpy_trap:
        xorq %rax, %rax
        ret
ENDPROC(memcpy_mcsafe)

        .section .fixup, "ax"
        /* Return a non-zero value for any failure */
.L_memcpy_mcsafe_fail:
        mov $1, %rax
        ret

        .previous

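        /*
         * Each _ASM_EXTABLE_FAULT() entry below maps one of the tagged
         * source reads to .L_memcpy_mcsafe_fail, so a machine check
         * taken while reading the source resumes at the fixup instead
         * of being fatal: one entry per load that can fault.
         */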
        _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif