Merge branch 'nfs-for-2.6.37' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6
[deliverable/linux.git] / arch / powerpc / lib / checksum_64.S
1 /*
2 * This file contains assembly-language implementations
3 * of IP-style 1's complement checksum routines.
4 *
5 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
13 */
14
15 #include <linux/sys.h>
16 #include <asm/processor.h>
17 #include <asm/errno.h>
18 #include <asm/ppc_asm.h>
19
/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 *
 * Returns the 16-bit one's complement checksum in r3.
 * Clobbers r0, r4, r5, ctr, XER[CA].
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)		/* first word */
	lwzu	r5,4(r3)		/* second word; r3 advances to it (update form) */
	addic.	r4,r4,-2		/* two words consumed; CR0 tested by blelr below */
	addc	r0,r0,r5		/* start the carry chain (addc ignores incoming CA) */
	mtctr	r4			/* ctr = remaining words */
	blelr-				/* len <= 2: bail out early (can't happen for a valid header, len >= 5) */
1:	lwzu	r4,4(r3)		/* sum remaining words, accumulating carries via CA */
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32		/* 32-bit sum (plus folded carries) now in low word */
	rlwinm	r3,r0,16,0,31		/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3			/* one's complement */
	srwi	r3,r3,16		/* final 16-bit checksum in low half of r3 */
	blr
46
/*
 * Compute checksum of TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 *
 * Returns the folded, complemented 16-bit checksum in r3.
 * Clobbers r0, r4, r5, XER[CA].
 */
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7	/* carries accumulate in bits 32+ of the doubleword */
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32	/* 32-bit sum now in low word */
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3		/* one's complement */
	srwi	r3,r3,16	/* final 16-bit checksum */
	blr
67
#define STACKFRAMESIZE 256
#define STK_REG(i)	(112 + ((i)-14)*8)	/* save-area slot for GPR i within our frame */

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 *
 * Returns the 32-bit partial checksum in r3 (not complemented,
 * not folded to 16 bits). Clobbers r0, r6-r12, ctr, XER[CA];
 * r14-r16 are saved/restored around the unrolled loop.
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0		/* clear carry */

	srdi.	r6,r4,3		/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2	/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6	/* halfwords needed to reach doubleword alignment */
	mtctr	r6

1:
	lhz	r6,0(r3)	/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords /* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1		/* iterations = len/64 - 1; last 64 bytes summed in the exit limb */
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)	/* need non-volatile r14-r16 as extra scratch */
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	ld	r6,0(r3)	/* entry limb: preload first half of the iteration */
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6	/* loads are interleaved between addes to hide the XER serialization */
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)	/* preload next iteration's first 32 bytes */
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6	/* exit limb: fold in the final 64 bytes */
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)	/* restore non-volatiles, pop frame */
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63	/* bytes still to sum */

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8		/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32	/* 32-bit partial checksum returned in r3 */
	blr
230
231
/*
 * Exception-table helpers. Each macro drops a numeric label on the
 * following load/store and records a fixup entry for it.
 *
 * Two variants per direction are needed because a fault can happen
 * either while our temporary stack frame is active (inside the
 * unrolled 64-byte loop region, where r14-r16 are live and r1 has
 * been pushed) or outside it. The "nr" (no-restore) variants are for
 * accesses made with no frame; the plain variants route through
 * fixup code that first restores r14-r16 and pops the frame.
 * Previously all faults went to a single handler that returned
 * without restoring, corrupting the caller's non-volatile registers
 * and stack pointer on a mid-loop fault.
 */
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively (when the pointer is non-NULL).
 * The caller must take any action required in this case (zeroing
 * memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 *
 * Returns the 32-bit partial checksum in r3. Structure mirrors
 * csum_partial above: halfword alignment pass, 64-byte unrolled
 * loop (using r14-r16 saved in a temporary frame), then
 * doubleword/word/halfword/byte tails.
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcopy_aligned

	li	r7,4
	sub	r6,r7,r6		/* halfwords needed to align the source */
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1			/* last 64 bytes handled by the exit limb */
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)	/* frame active from here: faults must restore */
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

source;	ld	r6,0(r3)		/* entry limb preload */
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)		/* preload next iteration */
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6		/* exit limb: final 64 bytes */
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)	/* frame torn down: later faults need no restore */
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

/*
 * Fault while the temporary frame was active: restore the caller's
 * r14-r16 and pop the frame before reporting, then fall through to
 * the common no-frame handler.
 */
.Lsrc_error:
	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0			/* src_err pointer supplied? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)		/* *src_err = -EFAULT */
	blr

.Ldest_error:
	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0			/* dst_err pointer supplied? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)		/* *dst_err = -EFAULT */
	blr
This page took 0.049918 seconds and 5 git commands to generate.