2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
16 * Copyright (C) IBM Corporation, 2012
18 * Author: Anton Blanchard <anton@au.ibm.com>
20 #include <asm/ppc_asm.h>

/* Size of the stack frame opened by this routine. */
22 #define STACKFRAMESIZE 256
/*
 * STK_REG(i): save-area slot for non-volatile GPR i (i >= 14) within the
 * frame.  Offset 112 presumably skips the ppc64 ELF ABI frame header and
 * parameter save area -- TODO confirm against the full source.
 */
23 #define STK_REG(i) (112 + ((i)-14)*8)

/*
 * memcpy_power7 - POWER7-optimised memcpy.
 *
 * NOTE(review): this extract is fragmentary -- many instruction lines are
 * missing between those shown, and the numeric prefix on each line is an
 * extraction artifact, not part of the code.  Comments below describe only
 * what the visible instructions establish; everything else is hedged.
 *
 * Presumably takes the usual ppc64 memcpy arguments (r3 = dest, r4 = src,
 * r5 = length) per the ELF ABI -- TODO confirm from the full source.
 */
25 _GLOBAL(memcpy_power7)
43 /* Get the source 8B aligned */

/*
 * Open a stack frame and save the non-volatile GPRs r14-r22 that the
 * unrolled copy loop uses as scratch.
 */
71 stdu r1,-STACKFRAMESIZE(r1)
72 std r14,STK_REG(r14)(r1)
73 std r15,STK_REG(r15)(r1)
74 std r16,STK_REG(r16)(r1)
75 std r17,STK_REG(r17)(r1)
76 std r18,STK_REG(r18)(r1)
77 std r19,STK_REG(r19)(r1)
78 std r20,STK_REG(r20)(r1)
79 std r21,STK_REG(r21)(r1)
80 std r22,STK_REG(r22)(r1)
/* r0 stashed just above our frame -- presumably the caller's LR save slot
 * at 16(r1) of the old frame; confirm against the full source. */
81 std r0,STACKFRAMESIZE+16(r1)
86 /* Now do cacheline (128B) sized loads and stores. */

/* Bulk loop done: restore the saved non-volatiles and pop the frame. */
127 ld r14,STK_REG(r14)(r1)
128 ld r15,STK_REG(r15)(r1)
129 ld r16,STK_REG(r16)(r1)
130 ld r17,STK_REG(r17)(r1)
131 ld r18,STK_REG(r18)(r1)
132 ld r19,STK_REG(r19)(r1)
133 ld r20,STK_REG(r20)(r1)
134 ld r21,STK_REG(r21)(r1)
135 ld r22,STK_REG(r22)(r1)
136 addi r1,r1,STACKFRAMESIZE
138 /* Up to 127B to go */
162 /* Up to 63B to go */
175 /* Up to 31B to go */
/* Keep only the low 4 bits of the length: bytes remaining mod 16. */
184 9: clrldi r5,r5,(64-4)
186 /* Up to 15B to go */
190 lwz r0,0(r4) /* Less chance of a reject with word ops */

/* Entered when the VMX path must be abandoned: just pop our frame. */
216 .Lunwind_stack_nonvmx_copy:
217 addi r1,r1,STACKFRAMESIZE

220 #ifdef CONFIG_ALTIVEC
/* VMX (AltiVec) copy path: open a fresh frame... */
226 stdu r1,-STACKFRAMESIZE(r1)
/* ...and reload r0/r3/r4/r5 from the caller's frame -- presumably saved
 * there before entering the VMX setup code (not visible in this extract). */
229 ld r0,STACKFRAMESIZE+16(r1)
230 ld r3,STACKFRAMESIZE+48(r1)
231 ld r4,STACKFRAMESIZE+56(r1)
232 ld r5,STACKFRAMESIZE+64(r1)
236 * We prefetch both the source and destination using enhanced touch
237 * instructions. We use a stream ID of 0 for the load side and
238 * 1 for the store side.
242 ori r9,r9,1 /* stream=1 */
244 srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */
248 1: lis r0,0x0E00 /* depth=7 */
251 ori r10,r7,1 /* stream=1 */
253 lis r8,0x8000 /* GO=1 */
/* TH=0b01010: enhanced data-stream touch encoding (Power ISA Book II);
 * dcbtst programs the store-side stream, dcbt below starts the streams. */
261 dcbtst r0,r10,0b01010
263 dcbt r0,r8,0b01010 /* GO */
266 beq .Lunwind_stack_nonvmx_copy
269 * If source and destination are not relatively aligned we use a
270 * slower permute loop.
/* Isolate the low 4 bits of r6 (src/dst relative 16B alignment) and set
 * CR0; non-zero means the two pointers differ modulo 16. */
273 rldicl. r6,r6,0,(64-4)
274 bne .Lvmx_unaligned_copy
276 /* Get the destination 16B aligned */
307 /* Get the destination 128B aligned */

/* Aligned VMX loop also needs non-volatiles r14-r16 as scratch. */
346 std r14,STK_REG(r14)(r1)
347 std r15,STK_REG(r15)(r1)
348 std r16,STK_REG(r16)(r1)
358 * Now do cacheline sized loads and stores. By this stage the
359 * cacheline stores are also cacheline aligned.
383 ld r14,STK_REG(r14)(r1)
384 ld r15,STK_REG(r15)(r1)
385 ld r16,STK_REG(r16)(r1)
387 /* Up to 127B to go */
418 /* Up to 15B to go */
419 11: clrldi r5,r5,(64-4)
/* Pop the frame and tail-call out of the VMX context. */
443 15: addi r1,r1,STACKFRAMESIZE
445 b .exit_vmx_copy /* tail call optimise */

/*
 * Source and destination are not relatively 16B aligned: copy via
 * lvsl/vperm, merging each pair of adjacent 16B loads into one aligned
 * store.
 */
447 .Lvmx_unaligned_copy:
448 /* Get the destination 16B aligned */
472 lwz r0,0(r4) /* Less chance of a reject with word ops */
481 /* Get the destination 128B aligned */
491 lvsl vr16,0,r4 /* Setup permute control vector */
497 vperm vr8,vr0,vr1,vr16
505 vperm vr8,vr0,vr1,vr16
507 vperm vr9,vr1,vr0,vr16
515 vperm vr8,vr0,vr3,vr16
517 vperm vr9,vr3,vr2,vr16
519 vperm vr10,vr2,vr1,vr16
521 vperm vr11,vr1,vr0,vr16
/* Unaligned cacheline loop needs r14-r16 as well. */
532 std r14,STK_REG(r14)(r1)
533 std r15,STK_REG(r15)(r1)
534 std r16,STK_REG(r16)(r1)
544 * Now do cacheline sized loads and stores. By this stage the
545 * cacheline stores are also cacheline aligned.
/* Eight-deep vperm pipeline: each result merges two consecutive source
 * vectors through the lvsl-derived control vector vr16. */
550 vperm vr8,vr0,vr7,vr16
552 vperm vr9,vr7,vr6,vr16
554 vperm vr10,vr6,vr5,vr16
556 vperm vr11,vr5,vr4,vr16
558 vperm vr12,vr4,vr3,vr16
560 vperm vr13,vr3,vr2,vr16
562 vperm vr14,vr2,vr1,vr16
564 vperm vr15,vr1,vr0,vr16
577 ld r14,STK_REG(r14)(r1)
578 ld r15,STK_REG(r15)(r1)
579 ld r16,STK_REG(r16)(r1)
581 /* Up to 127B to go */
588 vperm vr8,vr0,vr3,vr16
590 vperm vr9,vr3,vr2,vr16
592 vperm vr10,vr2,vr1,vr16
594 vperm vr11,vr1,vr0,vr16
604 vperm vr8,vr0,vr1,vr16
606 vperm vr9,vr1,vr0,vr16
614 vperm vr8,vr0,vr1,vr16
619 /* Up to 15B to go */
/* Remaining length mod 16, then undo the read-ahead bias on r4. */
620 11: clrldi r5,r5,(64-4)
621 addi r4,r4,-16 /* Unwind the +16 load offset */
624 lwz r0,0(r4) /* Less chance of a reject with word ops */
/* Pop the frame and tail-call out of the VMX context. */
647 15: addi r1,r1,STACKFRAMESIZE
649 b .exit_vmx_copy /* tail call optimise */
650 #endif /* CONFIG_ALTIVEC */