/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
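/*
 * For orientation, a rough C model of what this routine computes
 * (illustrative sketch only; names are invented and the alignment and
 * sub-8-byte tail handling below is ignored):
 *
 *	u64 acc = sum;
 *	for (i = 0; i < len / 8; i++) {
 *		u64 w = ((const u64 *)buff)[i];
 *		acc += w;
 *		if (acc < w)	carry out of bit 63: end-around carry,
 *			acc++;	kept in XER[CA] by the adde/addze chain
 *	}
 *	return fold64(acc);	fold to 32 bits, see .Lcsum_finish
 *
 * The ones' complement sum is unchanged by the end-around carry, which
 * is why the block can be summed eight bytes at a time.
 */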
_GLOBAL(__csum_partial)
 addic r0,r5,0 /* clear carry */

 srdi. r6,r4,3 /* less than 8 bytes? */
 beq .Lcsum_tail_word

 /*
  * If only halfword aligned, align to a double word. Since odd
  * aligned addresses should be rare and they would require more
  * work to calculate the correct checksum, we ignore that case
  * and take the potential slowdown of unaligned loads.
  */
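 /*
  * A worked example of the fix-up below (illustrative): the rldicl
  * pulls out how many halfwords past the last doubleword boundary the
  * buffer starts, and 4 minus that count is loaded into ctr. For a
  * buffer address ending in ...2 that gives ctr = 3, so the loop at 1:
  * consumes 6 bytes and leaves r3 doubleword aligned.
  */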
 rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */
 beq .Lcsum_aligned

 li r7,4
 sub r6,r7,r6
 mtctr r6

1:
 lhz r6,0(r3) /* align to doubleword */
 subi r4,r4,2
 addi r3,r3,2
 adde r0,r0,r6
 bdnz 1b

.Lcsum_aligned:
 /*
  * We unroll the loop such that each iteration is 64 bytes with an
  * entry and exit limb of 64 bytes, meaning a minimum size of
  * 128 bytes.
  */
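 /*
  * Counter arithmetic for reference: the srdi. test below bails out to
  * the tail code when len < 128; otherwise ctr is set to len/64 - 1, and
  * the loop at 2: plus the straight-line exit limb after the bdnz
  * together cover every complete 64 byte block. The remaining len % 64
  * bytes are handled from .Lcsum_tail_doublewords onwards.
  */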
 srdi. r6,r4,7
 beq .Lcsum_tail_doublewords /* len < 128 */

 srdi r6,r4,6
 subi r6,r6,1
 mtctr r6

 stdu r1,-STACKFRAMESIZE(r1)
 std r14,STK_REG(R14)(r1)
 std r15,STK_REG(R15)(r1)
 std r16,STK_REG(R16)(r1)

 ld r6,0(r3)
 ld r9,8(r3)

 ld r10,16(r3)
 ld r11,24(r3)

 /*
  * On POWER6 and POWER7 back to back addes take 2 cycles because of
  * the XER dependency. This means the fastest this loop can go is
  * 16 cycles per iteration. The scheduling of the loop below has
  * been shown to hit this on both POWER6 and POWER7.
  */
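 /*
  * Note on the scheduling (descriptive, not from the original author):
  * the first four doublewords of each 64 byte block are loaded ahead of
  * time (by the entry limb or by the end of the previous iteration) and
  * the second four are loaded early in the iteration, so the serialised
  * adde chain is not normally left waiting on a load.
  */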
 .align 5
2:
 adde r0,r0,r6
 ld r12,32(r3)
 ld r14,40(r3)

 adde r0,r0,r9
 ld r15,48(r3)
 ld r16,56(r3)
 addi r3,r3,64

 adde r0,r0,r10

 adde r0,r0,r11

 adde r0,r0,r12

 adde r0,r0,r14

 adde r0,r0,r15
 ld r6,0(r3)
 ld r9,8(r3)

 adde r0,r0,r16
 ld r10,16(r3)
 ld r11,24(r3)
 bdnz 2b


 adde r0,r0,r6
 ld r12,32(r3)
 ld r14,40(r3)

 adde r0,r0,r9
 ld r15,48(r3)
 ld r16,56(r3)
 addi r3,r3,64

 adde r0,r0,r10
 adde r0,r0,r11
 adde r0,r0,r12
 adde r0,r0,r14
 adde r0,r0,r15
 adde r0,r0,r16

 ld r14,STK_REG(R14)(r1)
 ld r15,STK_REG(R15)(r1)
 ld r16,STK_REG(R16)(r1)
 addi r1,r1,STACKFRAMESIZE

 andi. r4,r4,63

.Lcsum_tail_doublewords: /* Up to 127 bytes to go */
 srdi. r6,r4,3
 beq .Lcsum_tail_word

 mtctr r6
3:
 ld r6,0(r3)
 addi r3,r3,8
 adde r0,r0,r6
 bdnz 3b

 andi. r4,r4,7

.Lcsum_tail_word: /* Up to 7 bytes to go */
 srdi. r6,r4,2
 beq .Lcsum_tail_halfword

 lwz r6,0(r3)
 addi r3,r3,4
 adde r0,r0,r6
 subi r4,r4,4

.Lcsum_tail_halfword: /* Up to 3 bytes to go */
 srdi. r6,r4,1
 beq .Lcsum_tail_byte

 lhz r6,0(r3)
 addi r3,r3,2
 adde r0,r0,r6
 subi r4,r4,2

.Lcsum_tail_byte: /* Up to 1 byte to go */
 andi. r6,r4,1
 beq .Lcsum_finish

 lbz r6,0(r3)
 sldi r9,r6,8 /* Pad the byte out to 16 bits */
 adde r0,r0,r9

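 /*
  * Fold the 64-bit accumulator in r0 down to the 32-bit result.
  * Roughly, in C (illustrative sketch only):
  *
  *	u64 t = (acc >> 32) + (acc & 0xffffffff);
  *	return (u32)((t >> 32) + t);
  *
  * The rldicl below swaps the two 32-bit halves of r0; adding the
  * swapped copy back to r0 leaves that end-around-carry sum of the
  * halves in the upper 32 bits, which srdi then moves down into r3.
  */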
.Lcsum_finish:
 addze r0,r0 /* add in final carry */
 rldicl r4,r0,32,0 /* fold two 32 bit halves together */
 add r3,r4,r0
 srdi r3,r3,32
 blr

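/*
 * Exception table helpers for the copying variant below (descriptive
 * comment, not from the original). Each macro plants a local label in
 * front of the load or store that follows it on the same line and emits
 * a __ex_table entry directing a fault at that instruction to the
 * matching fixup: source/dest are used while r14-r16 are saved on the
 * stack and so go to fixups that restore those registers and pop the
 * frame first, while srcnr/dstnr are used outside that region and go
 * straight to the error reporting code.
 */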
 .macro srcnr
100:
 .section __ex_table,"a"
 .align 3
 .llong 100b,.Lsrc_error_nr
 .previous
 .endm

 .macro source
150:
 .section __ex_table,"a"
 .align 3
 .llong 150b,.Lsrc_error
 .previous
 .endm

 .macro dstnr
200:
 .section __ex_table,"a"
 .align 3
 .llong 200b,.Ldest_error_nr
 .previous
 .endm

 .macro dest
250:
 .section __ex_table,"a"
 .align 3
 .llong 250b,.Ldest_error
 .previous
 .endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
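/*
 * Illustrative view of the contract (sketch only, not the generic C
 * implementation): behave like memcpy(dst, src, len) combined with
 * __csum_partial(src, len, sum), except that a fault on the source or
 * destination stores -EFAULT through src_err or dst_err (when that
 * pointer is non-NULL) and returns early, leaving any clean-up such as
 * zeroing the destination to the caller.
 */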
_GLOBAL(csum_partial_copy_generic)
 addic r0,r6,0 /* clear carry */

 srdi. r6,r5,3 /* less than 8 bytes? */
 beq .Lcopy_tail_word

 /*
  * If only halfword aligned, align to a double word. Since odd
  * aligned addresses should be rare and they would require more
  * work to calculate the correct checksum, we ignore that case
  * and take the potential slowdown of unaligned loads.
  *
  * If the source and destination are relatively unaligned we only
  * align the source. This keeps things simple.
  */
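 /*
  * Note: this alignment loop runs before the stack frame is created,
  * so it uses the srcnr/dstnr exception entries, whose fixups do not
  * try to restore r14-r16 or pop a frame.
  */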
 rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */
 beq .Lcopy_aligned

 li r9,4
 sub r6,r9,r6
 mtctr r6

1:
srcnr; lhz r6,0(r3) /* align to doubleword */
 subi r5,r5,2
 addi r3,r3,2
 adde r0,r0,r6
dstnr; sth r6,0(r4)
 addi r4,r4,2
 bdnz 1b

.Lcopy_aligned:
 /*
  * We unroll the loop such that each iteration is 64 bytes with an
  * entry and exit limb of 64 bytes, meaning a minimum size of
  * 128 bytes.
  */
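 /*
  * The loop below mirrors the 64 byte unrolled loop in __csum_partial,
  * with the stores of each block interleaved between the addes so that
  * the checksum chain, the loads and the stores can all overlap
  * (descriptive note, not from the original author).
  */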
 srdi. r6,r5,7
 beq .Lcopy_tail_doublewords /* len < 128 */

 srdi r6,r5,6
 subi r6,r6,1
 mtctr r6

 stdu r1,-STACKFRAMESIZE(r1)
 std r14,STK_REG(R14)(r1)
 std r15,STK_REG(R15)(r1)
 std r16,STK_REG(R16)(r1)

source; ld r6,0(r3)
source; ld r9,8(r3)

source; ld r10,16(r3)
source; ld r11,24(r3)

 /*
  * On POWER6 and POWER7 back to back addes take 2 cycles because of
  * the XER dependency. This means the fastest this loop can go is
  * 16 cycles per iteration. The scheduling of the loop below has
  * been shown to hit this on both POWER6 and POWER7.
  */
 .align 5
2:
 adde r0,r0,r6
source; ld r12,32(r3)
source; ld r14,40(r3)

 adde r0,r0,r9
source; ld r15,48(r3)
source; ld r16,56(r3)
 addi r3,r3,64

 adde r0,r0,r10
dest; std r6,0(r4)
dest; std r9,8(r4)

 adde r0,r0,r11
dest; std r10,16(r4)
dest; std r11,24(r4)

 adde r0,r0,r12
dest; std r12,32(r4)
dest; std r14,40(r4)

 adde r0,r0,r14
dest; std r15,48(r4)
dest; std r16,56(r4)
 addi r4,r4,64

 adde r0,r0,r15
source; ld r6,0(r3)
source; ld r9,8(r3)

 adde r0,r0,r16
source; ld r10,16(r3)
source; ld r11,24(r3)
 bdnz 2b


 adde r0,r0,r6
source; ld r12,32(r3)
source; ld r14,40(r3)

 adde r0,r0,r9
source; ld r15,48(r3)
source; ld r16,56(r3)
 addi r3,r3,64

 adde r0,r0,r10
dest; std r6,0(r4)
dest; std r9,8(r4)

 adde r0,r0,r11
dest; std r10,16(r4)
dest; std r11,24(r4)

 adde r0,r0,r12
dest; std r12,32(r4)
dest; std r14,40(r4)

 adde r0,r0,r14
dest; std r15,48(r4)
dest; std r16,56(r4)
 addi r4,r4,64

 adde r0,r0,r15
 adde r0,r0,r16

 ld r14,STK_REG(R14)(r1)
 ld r15,STK_REG(R15)(r1)
 ld r16,STK_REG(R16)(r1)
 addi r1,r1,STACKFRAMESIZE

 andi. r5,r5,63

.Lcopy_tail_doublewords: /* Up to 127 bytes to go */
 srdi. r6,r5,3
 beq .Lcopy_tail_word

 mtctr r6
3:
srcnr; ld r6,0(r3)
 addi r3,r3,8
 adde r0,r0,r6
dstnr; std r6,0(r4)
 addi r4,r4,8
 bdnz 3b

 andi. r5,r5,7

.Lcopy_tail_word: /* Up to 7 bytes to go */
 srdi. r6,r5,2
 beq .Lcopy_tail_halfword

srcnr; lwz r6,0(r3)
 addi r3,r3,4
 adde r0,r0,r6
dstnr; stw r6,0(r4)
 addi r4,r4,4
 subi r5,r5,4

.Lcopy_tail_halfword: /* Up to 3 bytes to go */
 srdi. r6,r5,1
 beq .Lcopy_tail_byte

srcnr; lhz r6,0(r3)
 addi r3,r3,2
 adde r0,r0,r6
dstnr; sth r6,0(r4)
 addi r4,r4,2
 subi r5,r5,2

.Lcopy_tail_byte: /* Up to 1 byte to go */
 andi. r6,r5,1
 beq .Lcopy_finish

srcnr; lbz r6,0(r3)
 sldi r9,r6,8 /* Pad the byte out to 16 bits */
 adde r0,r0,r9
dstnr; stb r6,0(r4)

.Lcopy_finish:
 addze r0,r0 /* add in final carry */
 rldicl r4,r0,32,0 /* fold two 32 bit halves together */
 add r3,r4,r0
 srdi r3,r3,32
 blr

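/*
 * Fault fixups (descriptive comment): faults taken inside the unrolled
 * copy loop, while the stack frame is live and r14-r16 are saved, enter
 * at .Lsrc_error / .Ldest_error so those registers are restored and the
 * frame popped; faults taken elsewhere enter at the _nr labels directly.
 * Both paths then store -EFAULT through the corresponding error pointer
 * if the caller supplied one, and return.
 */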
.Lsrc_error:
 ld r14,STK_REG(R14)(r1)
 ld r15,STK_REG(R15)(r1)
 ld r16,STK_REG(R16)(r1)
 addi r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
 cmpdi 0,r7,0
 beqlr
 li r6,-EFAULT
 stw r6,0(r7)
 blr

.Ldest_error:
 ld r14,STK_REG(R14)(r1)
 ld r15,STK_REG(R15)(r1)
 ld r16,STK_REG(R16)(r1)
 addi r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
 cmpdi 0,r8,0
 beqlr
 li r6,-EFAULT
 stw r6,0(r8)
 blr