Commit | Line | Data |
---|---|---|
14cf11af PM |
1 | /* |
2 | * This file contains assembly-language implementations | |
3 | * of IP-style 1's complement checksum routines. | |
4 | * | |
5 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation; either version | |
10 | * 2 of the License, or (at your option) any later version. | |
11 | * | |
12 | * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). | |
13 | */ | |
14 | ||
15 | #include <linux/sys.h> | |
16 | #include <asm/processor.h> | |
7aef4136 | 17 | #include <asm/cache.h> |
14cf11af PM |
18 | #include <asm/errno.h> |
19 | #include <asm/ppc_asm.h> | |
20 | ||
21 | .text | |
22 | ||
/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 *
 * In:   r3 = buff, r4 = len (bytes), r5 = sum (running 32-bit 1's
 *       complement partial sum).
 * Out:  r3 = updated partial sum (final carry folded in via addze).
 * Note: r3 is pre-biased by -4 so every access below uses a 4(r3)
 *       displacement, letting lwzu both load and advance the pointer.
 *       The carry chain (adde) must not be broken between the first
 *       srawi. (which clears CA) and the final addze.
 */
_GLOBAL(__csum_partial)
	subi	r3,r3,4
	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f		/* word count already a multiple of 4 */
	mtctr	r6
2:	lwzu	r0,4(r3)	/* leftover words, one at a time */
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	mtctr	r6
22:	lwz	r0,4(r3)	/* unrolled by 4: one lwzu advances r3 by 16 */
	lwz	r6,8(r3)
	lwz	r7,12(r3)
	lwzu	r8,16(r3)
	adde	r5,r5,r0
	adde	r5,r5,r6
	adde	r5,r5,r7
	adde	r5,r5,r8
	bdnz	22b
3:	andi.	r0,r4,2		/* trailing halfword? */
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1		/* trailing byte? */
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* Upper byte of word */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr
71 | ||
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */

/*
 * Copy+checksum 16 bytes (4 words) from 4(r4) to 4(r6), advancing both
 * pointers via the lwzu/stwu forms and accumulating into r12.
 * Each access carries a numeric label 8<n><0..7> so the matching
 * CSUM_COPY_16_BYTES_EXCODE(n) expansion can register it in __ex_table:
 * labels 0-3 are loads (fault -> src_error), 4-7 are stores (-> dst_error).
 */
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:				\
	lwz	r7,4(r4);		\
8 ## n ## 1:				\
	lwz	r8,8(r4);		\
8 ## n ## 2:				\
	lwz	r9,12(r4);		\
8 ## n ## 3:				\
	lwzu	r10,16(r4);		\
8 ## n ## 4:				\
	stw	r7,4(r6);		\
	adde	r12,r12,r7;		\
8 ## n ## 5:				\
	stw	r8,8(r6);		\
	adde	r12,r12,r8;		\
8 ## n ## 6:				\
	stw	r9,12(r6);		\
	adde	r12,r12,r9;		\
8 ## n ## 7:				\
	stwu	r10,16(r6);		\
	adde	r12,r12,r10

/*
 * Emit the __ex_table fixup entries for CSUM_COPY_16_BYTES_WITHEX(n):
 * each faulting address is paired with the handler the fixup code
 * should branch to (src_error for loads, dst_error for stores).
 */
#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,src_error;		\
	.long	8 ## n ## 1b,src_error;		\
	.long	8 ## n ## 2b,src_error;		\
	.long	8 ## n ## 3b,src_error;		\
	.long	8 ## n ## 4b,dst_error;		\
	.long	8 ## n ## 5b,dst_error;		\
	.long	8 ## n ## 6b,dst_error;		\
	.long	8 ## n ## 7b,dst_error;		\
	.text
	.text
	/* stabs source-file markers for debuggers */
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

/* Cacheline geometry, taken from <asm/cache.h> */
CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)
/*
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 *
 * In:   r3 = src, r4 = dst, r5 = len, r6 = sum,
 *       r7 = src_err pointer (may be 0), r8 = dst_err pointer (may be 0).
 * Out:  r3 = checksum of the copied block added to sum; byte-swapped
 *       (rlwinm) when dst was odd-aligned, per cr7 set at entry.
 * r7/r8 are spilled to the 16-byte stack frame so the src_error /
 * dst_error fixup handlers can find them after a fault.
 * r12 holds the running sum; r4/r6 are rebiased by -4 so the
 * lwzu/stwu 4(rX) idiom advances them.
 */
_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)
	stw	r7,12(r1)	/* save src_err for src_error handler */
	stw	r8,8(r1)	/* save dst_err for dst_error handler */

	andi.	r0,r4,1			/* is destination address even ? */
	cmplwi	cr7,r0,0		/* cr7 kept live until the final beqlr+ */
	addic	r12,r6,0		/* r12 = sum; addic clears carry */
	addi	r6,r4,-4		/* bias dst for stwu */
	neg	r0,r4
	addi	r4,r3,-4		/* bias src for lwzu */
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8			/* accumulate bytes into a word in r3 */
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5		/* len -= alignment prologue */
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES /* r5 = residual bytes */
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4			/* warm the prefetch window */
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0		/* lines to copy now = total - prefetched */
	mr	r0,r7			/* r0 = prefetched lines left for the tail */
	mtctr	r8

53:	dcbt	r3,r4			/* prefetch next src line */
54:	dcbz	r11,r6			/* zero dst line to avoid the read-for-write;
					   faults here go to dst_error via __ex_table */
	/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0			/* drain the already-prefetched lines */
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2			/* residual whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2			/* residual halfword */
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1			/* residual byte */
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8			/* byte goes in the upper half of the word */
	adde	r12,r12,r0
66:	addze	r3,r12			/* fold in final carry */
	addi	r1,r1,16		/* pop frame */
	beqlr+	cr7			/* even dst: done */
	rlwinm	r3,r3,8,0,31		/* swap bytes for odd destination */
	blr

/* read fault: report -EFAULT through *src_err if the pointer was non-NULL */
src_error:
	lwz	r7,12(r1)		/* reload saved src_err */
	addi	r1,r1,16
	cmpwi	cr0,r7,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r7)
	blr
/* write fault: report -EFAULT through *dst_err if the pointer was non-NULL */
dst_error:
	lwz	r8,8(r1)		/* reload saved dst_err */
	addi	r1,r1,16
	cmpwi	cr0,r8,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r8)
	blr
/*
 * Exception-table entries for the alignment prologue and the
 * word/halfword/byte tail loops of csum_partial_copy_generic:
 * each pair is (faulting insn address, fixup handler).
 */
	.section __ex_table,"a"
	.align	2
	.long	70b,src_error
	.long	71b,dst_error
	.long	72b,src_error
	.long	73b,dst_error
	.long	54b,dst_error	/* the dcbz in the cacheline loop */
	.text

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * src_error (if in read part) or dst_error (if in write part)
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* Fixups for the residual word/halfword/byte copy labels */
	.section __ex_table,"a"
	.align	2
	.long	30b,src_error
	.long	31b,dst_error
	.long	40b,src_error
	.long	41b,dst_error
	.long	50b,src_error
	.long	51b,dst_error