Merge remote-tracking branch 'asoc/topic/topology' into asoc-next
[deliverable/linux.git] / arch / powerpc / lib / copy_32.S
CommitLineData
14cf11af
PM
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
14cf11af
PM
11#include <asm/processor.h>
12#include <asm/cache.h>
13#include <asm/errno.h>
14#include <asm/ppc_asm.h>
15
/*
 * COPY_16_BYTES - copy four words from 4(r4)..16(r4) to 4(r6)..16(r6).
 *
 * All four loads are issued before the four stores.  The last load and
 * the last store use the update form, so r4 and r6 are both advanced by
 * 16 on exit.  Clobbers r7-r10.
 */
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

/*
 * COPY_16_BYTES_WITHEX(n) - same instruction sequence as COPY_16_BYTES,
 * but every access carries its own numeric local label so that it can be
 * referenced from an exception table entry:
 *
 *	8<n>0 .. 8<n>3	the four loads
 *	8<n>4 .. 8<n>7	the four stores
 *
 * The matching table entries and fixup code are emitted by
 * COPY_16_BYTES_EXCODE(n).  Clobbers r7-r10; advances r4 and r6 by 16.
 */
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

/*
 * COPY_16_BYTES_EXCODE(n) - fixup code and exception table entries for
 * the accesses labelled by COPY_16_BYTES_WITHEX(n).
 *
 *	9<n>0	target for a fault on any of the four loads  (8<n>0-8<n>3);
 *		subtracts 16 * n from r5, then branches to 104f.
 *	9<n>1	target for a fault on any of the four stores (8<n>4-8<n>7);
 *		subtracts 16 * n from r5, then branches to 105f.
 *
 * Labels 104 and 105 must be defined after the point of expansion.
 * The macro leaves the assembler in the .text section.
 */
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

63 .text
64 .stabs "arch/powerpc/lib/",N_SO,0,0,0f
025c0186 65 .stabs "copy_32.S",N_SO,0,0,0f
14cf11af
PM
660:
67
7dffb720
SR
68CACHELINE_BYTES = L1_CACHE_BYTES
69LG_CACHELINE_BYTES = L1_CACHE_SHIFT
70CACHELINE_MASK = (L1_CACHE_BYTES-1)
14cf11af 71
df087e45
LC
/*
 * memset: fill r5 bytes starting at r3 with the low byte of r4.
 *
 * In:	r3 = destination, r4 = fill value (low byte), r5 = byte count
 * Out:	r3 is not modified by this routine
 * Clobbers: r0, r4-r9, ctr, cr0
 *
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore skip the optimised block that uses dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL(memset)
	/* Replicate the low byte of r4 into all four bytes of r4. */
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	addi	r6,r3,-4	/* bias by -4 for the 4(r6) update forms */
	cmplwi	0,r5,4
	blt	7f		/* fewer than 4 bytes: byte loop only */
	stwu	r4,4(r6)	/* first word, possibly unaligned */
	beqlr			/* count was exactly 4 (cr0 from cmplwi) */
	/*
	 * Round r6 down to a word boundary and grow r5 by the same amount,
	 * so r5 now counts bytes from the aligned r6.  The word at r6 is
	 * treated as already done; the bdz at 2: below skips it.
	 */
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	cmplwi	0,r4,0
	bne	2f	/* Use normal procedure if r4 is not zero */
_GLOBAL(memset_nocache_branch)
	b	2f	/* Skip optimised block until cache is enabled */

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES	/* r7 = offset of r6 in its line */
	add	r8,r7,r5			/* r8 = that offset + byte count */
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	/* r0 = word stores needed to reach the end of r6's cache line */
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4		/* dcbz operates on the line holding r6 + 4 */
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES	/* bytes left after the last line */
	addi	r5,r5,4		/* +4: the bdz at 2: skips one word */

	/* Word loop: performs (r5 >> 2) - 1 stores, then r5 & 3 tail bytes. */
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3		/* rebias so 1(r6) is the next byte to write */
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * memmove / memcpy: copy r5 bytes from r4 to r3.
 *
 * In:	r3 = destination, r4 = source, r5 = byte count
 * Out:	r3 is not modified by this code
 * Clobbers: r0, r4-r11, ctr, cr0, cr1
 *
 * memmove branches to backwards_memcpy when the destination address is
 * above the source address (unsigned compare) and otherwise falls
 * through into memcpy.
 *
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	b	generic_memcpy		/* the early-init jump described above */
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4		/* bias both pointers by -4 for the */
	addi	r6,r3,-4		/* 4(rN) update-form accesses below */
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5		/* r5 = bytes left once dest is line-aligned */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)		/* 3(r6): r6 has already been advanced */
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4			/* dcbz operates on the line holding r6 + 4 */
	mtctr	r0
	beq	63f
	/* One iteration per destination cache line: dcbz it, then fill it. */
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

	/* Remaining whole words. */
63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

	/* Remaining 0-3 bytes. */
64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3			/* rebias so 1(rN) is the next byte */
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr

/*
 * generic_memcpy: forward copy of r5 bytes from r4 to r3 that does not
 * use dcbz.
 *
 * In:	r3 = destination, r4 = source, r5 = byte count
 * Out:	r3 is not modified by this routine
 * Clobbers: r0, r4-r8, ctr, cr0
 *
 * The main loop at 1: moves 8 bytes per iteration and is entered with
 * the destination word-aligned; the code at 5: copies the leading bytes
 * needed to reach that alignment first.
 */
_GLOBAL(generic_memcpy)
	srwi.	r7,r5,3			/* r7 = number of 8-byte chunks */
	addi	r6,r3,-4		/* bias both pointers by -4 for the */
	addi	r4,r4,-4		/* 4(rN) / 8(rN) accesses below */
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7			/* bytes left after the 8-byte chunks */
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)		/* one more whole word */
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3			/* rebias so 1(rN) is the next byte */
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
	/* Destination not word aligned: copy 4 - (r6 & 3) bytes first. */
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5		/* account for the bytes just copied */
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b
	mtctr	r7
	b	1b

/*
 * backwards_memcpy: copy r5 bytes from r4 to r3, working from the end
 * of both buffers towards the start.  memmove branches here when the
 * destination address is above the source address.
 *
 * In:	r3 = destination, r4 = source, r5 = byte count
 * Out:	r3 is not modified by this routine
 * Clobbers: r0, r4-r8, ctr, cr0
 *
 * The main loop at 1: moves 8 bytes per iteration and is entered with
 * the end-of-destination pointer word-aligned; the code at 5: copies
 * the trailing bytes needed to reach that alignment first.
 */
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5		/* r6 = one past the end of dest */
	add	r4,r4,r5		/* r4 = one past the end of src */
	beq	2f			/* less than 8 bytes to do */
	andi.	r0,r6,3			/* is the dest end word aligned? */
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7			/* bytes left after the 8-byte chunks */
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)		/* one more whole word */
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
	/* Dest end not word aligned: copy r0 = (r6 & 3) bytes first. */
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5		/* account for the bytes just copied */
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b
	mtctr	r7
	b	1b

/*
 * __copy_tofrom_user: copy r5 bytes from r4 to r3, with every load and
 * store covered by an exception table entry.
 *
 * In:	r3 = destination, r4 = source, r5 = byte count
 * Out:	r3 = 0 if everything was copied, otherwise the number of bytes
 *	that were not copied
 * Clobbers: r0, r4-r11, ctr, cr0
 *
 * Structure: bytes/words up to the next destination cache line, then
 * whole cache lines (dcbt on the source, dcbz on the destination), then
 * trailing words and bytes.  Each phase has its own pair of fixup labels
 * (read fault / write fault) which set r9 and r3 and converge at 99:.
 */
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4		/* bias both pointers by -4 for the */
	addi	r6,r3,-4		/* 4(rN) accesses below */
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
	/*
	 * r5 is only reduced here, after the byte loop; the fixup at 90:
	 * relies on that when it subtracts r8 from r5.
	 */
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4			/* dcbz operates on the line holding r6 + 4 */
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

	/*
	 * Run the cacheline loop r0 - r7 times, keeping r7 lines in r0 for
	 * a second pass (entered again via "bne 114b" with r7 = 0).
	 */
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

	/* Remaining whole words. */
63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

	/* Remaining 0-3 bytes. */
64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0			/* success: nothing left uncopied */
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8		/* add the lines held back in r0 at 114: */
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f			/* write fault: return the count in r3 */
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3			/* r3 = bytes still uncopied (return value) */
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	/*
	 * Fixups for the final word/byte loops and for the recovery code
	 * above; a fault while storing during recovery simply returns.
	 */
	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
This page took 0.698449 seconds and 5 git commands to generate.