[PATCH] annotate arch/x86_64/lib/*.S
arch/x86_64/lib/memset.S
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/config.h>
#include <linux/linkage.h>
#include <asm/dwarf2.h>

/*
 * ISO C memset - set a memory block to a byte value.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
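 *
 * C prototype: void *memset(void *s, int c, size_t n); only the low
 * byte of c is used, and the original s comes back in rax.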
 */
        ALIGN
memset_c:
        CFI_STARTPROC
        movq %rdi,%r9
        movl %edx,%r8d
        andl $7,%r8d
        movl %edx,%ecx
        shrl $3,%ecx
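        /* ecx = count/8 qwords for rep stosq,
           r8d = count%8 tail bytes for rep stosb */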
        /* expand byte value */
        movzbl %sil,%esi
        movabs $0x0101010101010101,%rax
        mulq %rsi               /* with rax, clobbers rdx */
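        /* e.g. 0xAB * 0x0101010101010101 = 0xABABABABABABABAB;
           the high half left in rdx is zero */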
        rep stosq
        movl %r8d,%ecx
        rep stosb
        movq %r9,%rax
        ret
        CFI_ENDPROC
ENDPROC(memset_c)

ENTRY(memset)
ENTRY(__memset)
        CFI_STARTPROC
        movq %rdi,%r10
        movq %rdx,%r11
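        /* r10 = original dest (for the return value), r11 = count;
           rdx is clobbered by the mul below */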

        /* expand byte value */
        movzbl %sil,%ecx
        movabs $0x0101010101010101,%rax
        mul %rcx                /* with rax, clobbers rdx */

        /* align dst */
        movl %edi,%r9d
        andl $7,%r9d
        jnz .Lbad_alignment
        CFI_REMEMBER_STATE
.Lafter_bad_alignment:

        movl %r11d,%ecx
        shrl $6,%ecx
        jz .Lhandle_tail

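        /* main loop: clear 64 bytes per iteration with eight qword
           stores; ecx holds the number of 64-byte blocks */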
        .p2align 4
.Lloop_64:
        decl %ecx
        movq %rax,(%rdi)
        movq %rax,8(%rdi)
        movq %rax,16(%rdi)
        movq %rax,24(%rdi)
        movq %rax,32(%rdi)
        movq %rax,40(%rdi)
        movq %rax,48(%rdi)
        movq %rax,56(%rdi)
        leaq 64(%rdi),%rdi
        jnz .Lloop_64

        /* Handle the tail in loops. The loops should be faster than
           hard-to-predict jump tables. */
        .p2align 4
.Lhandle_tail:
        movl %r11d,%ecx
        andl $63&(~7),%ecx
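        /* ecx = tail bytes rounded down to whole qwords (0, 8, ..., 56) */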
        jz .Lhandle_7
        shrl $3,%ecx
        .p2align 4
.Lloop_8:
        decl %ecx
        movq %rax,(%rdi)
        leaq 8(%rdi),%rdi
        jnz .Lloop_8

.Lhandle_7:
        movl %r11d,%ecx
        andl $7,%ecx
        jz .Lende
        .p2align 4
.Lloop_1:
        decl %ecx
        movb %al,(%rdi)
        leaq 1(%rdi),%rdi
        jnz .Lloop_1

.Lende:
        movq %r10,%rax
        ret

        CFI_RESTORE_STATE
.Lbad_alignment:
        cmpq $7,%r11
        jbe .Lhandle_7
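        /* store 8 bytes unaligned, then advance dst to the next 8-byte
           boundary; the aligned stores that follow rewrite the overlap */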
        movq %rax,(%rdi)        /* unaligned store */
        movq $8,%r8
        subq %r9,%r8
        addq %r8,%rdi
        subq %r8,%r11
        jmp .Lafter_bad_alignment
.Lfinal:
        CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)

        /* Some CPUs run faster using the string instructions;
           the string version is also a lot simpler. Use it when
           possible. */

#include <asm/cpufeature.h>

        .section .altinstr_replacement,"ax"
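        /* replacement body: a two-byte short jmp that redirects memset
           to the rep-stos variant memset_c above */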
1:      .byte 0xeb                              /* jmp <disp8> */
        .byte (memset_c - memset) - (2f - 1b)   /* offset */
2:
        .previous
        .section .altinstructions,"a"
        .align 8
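        /* entry layout: address of the original code, address of the
           replacement, required CPU feature bit, length of the
           original, length of the replacement */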
        .quad memset
        .quad 1b
        .byte X86_FEATURE_REP_GOOD
        .byte .Lfinal - memset
        .byte 2b - 1b
        .previous
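
The byte-expansion multiply that both variants rely on can be checked
with a small standalone C program (an illustrative sketch; expand_byte
is a made-up name, not part of the kernel source):

#include <stdint.h>
#include <stdio.h>

/* Replicate one byte across all eight bytes of a 64-bit word, the
   same trick as the movabs/mul pair in memset.S. */
static uint64_t expand_byte(uint8_t c)
{
        return (uint64_t)c * 0x0101010101010101ULL;
}

int main(void)
{
        /* prints abababababababab */
        printf("%016llx\n", (unsigned long long)expand_byte(0xAB));
        return 0;
}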