Commit | Line | Data |
---|---|---|
249ac17e CZ |
1 | /* |
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | |
3 | * operating system. INET is implemented using the BSD Socket | |
4 | * interface as the means of communication with the user level. | |
5 | * | |
6 | * IP/TCP/UDP checksumming routines | |
7 | * | |
8 | * Xtensa version: Copyright (C) 2001 Tensilica, Inc. by Kevin Chea | |
9 | * Optimized by Joe Taylor | |
10 | * | |
11 | * This program is free software; you can redistribute it and/or | |
12 | * modify it under the terms of the GNU General Public License | |
13 | * as published by the Free Software Foundation; either version | |
14 | * 2 of the License, or (at your option) any later version. | |
15 | */ | |
16 | ||
17 | #include <asm/errno.h> | |
18 | #include <linux/linkage.h> | |
173d6681 | 19 | #include <asm/variant/core.h> |
249ac17e CZ |
20 | |
21 | /* | |
22 | * computes a partial checksum, e.g. for TCP/UDP fragments | |
23 | */ | |
24 | ||
/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function is optimized for 2- or 4-byte aligned buffers.  Odd
 * addresses are handled as well, but only through a much slower path
 * that assembles each word from individual byte/halfword loads.
 */
34 | ||
/*
 * ONES_ADD(acc, term): acc += term using ones-complement (end-around
 * carry) arithmetic on top of the twos-complement "add".
 *
 * If the 32-bit add wrapped, the result is smaller (unsigned) than the
 * value just added; in that case the lost carry is added back as +1.
 * acc and term must be different registers, otherwise the carry test
 * can never fire.  Uses local label 99.
 */
#define ONES_ADD(acc, term)		\
	add	acc, acc, term ;	\
	bgeu	acc, term, 99f ;	\
	addi	acc, acc, 1 ;		\
99:	;
41 | ||
/*
 * csum_partial: add the bytes of a buffer into a running 32-bit
 * ones-complement sum (see ONES_ADD).
 *
 * In:   a2 = buf, a3 = len, a4 = sum
 * Out:  a2 = updated sum (32 bits, not folded to 16 bits here)
 * Temp: a5 = chunk count, or (without zero-overhead loops) end address
 *       a6, a7, a8 = loaded data
 *
 * Paths:
 *   - buf 4-byte aligned: 32-byte chunks, then 4-byte words, then an
 *     optional halfword and an optional trailing byte.
 *   - buf 2-byte aligned: one halfword is consumed to reach 4-byte
 *     alignment, then the aligned path is used.
 *   - buf odd: each word is assembled from byte/halfword loads (slow).
 *
 * Without XCHAL_HAVE_LOOPS the loops are closed with an unsigned
 * address comparison (bltu), so a buffer that straddles 0x80000000 is
 * processed completely.
 */
	.text
ENTRY(csum_partial)
	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	entry	sp, 32
	extui	a5, a2, 0, 2		/* a5 = buf & 3 */
	bnez	a5, 8f			/* branch if buf is not 4-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5		/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2		/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop1		/* addresses: compare unsigned */
#endif
2:
	extui	a5, a3, 2, 3		/* remaining 4-byte chunks (0..7) */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2		/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop2		/* addresses: compare unsigned */
#endif
3:
	_bbci.l	a3, 1, 5f		/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f		/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8		/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4			/* return the accumulated sum */
	retw

	/* uncommon case, buf is not 4-byte aligned */
8:
	beqz	a3, 7b			/* branch if len == 0 */
	beqi	a3, 1, 6b		/* branch if len == 1 */

	extui	a5, a2, 0, 1		/* a5 = buf & 1 */
	bnez	a5, 8f			/* branch if buf is odd */

	l16ui	a6, a2, 0		/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2		/* adjust buf */
	addi	a3, a3, -2		/* adjust len */
	j	1b			/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
8:
	srli	a5, a3, 2		/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2		/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	/*
	 * Build one 32-bit word from bytes 0, 1..2 and 3.  a2 is odd, so
	 * a2 + 1 is a halfword-aligned address.
	 */
	l8ui	a6, a2, 0		/* first byte: bits 24..31 (BE), 0..7 (LE) */
	l16ui	a7, a2, 1		/* middle halfword: bits 8..23 */
	l8ui	a8, a2, 3		/* last byte: bits 0..7 (BE), 24..31 (LE) */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop3		/* addresses: compare unsigned */
#endif
2:
	_bbci.l	a3, 1, 3f		/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6		/* a7 = halfword in memory byte order */
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b			/* branch to handle the remaining byte */
172 | ||
173 | ||
174 | ||
/*
 * Copy from the source buffer while checksumming, otherwise like
 * csum_partial.
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so that a custom exception handler can be called for each access type.
 */
181 | ||
/*
 * CSUM_EX_ENTRY(fixup, insn...) emits insn at local label 9999 and
 * records the pair (address of insn, fixup) in the __ex_table section,
 * then switches back to the previous section.
 *
 * SRC() wraps an access to the source buffer (fixup label 6001),
 * DST() wraps an access to the destination buffer (fixup label 6002).
 */
#define CSUM_EX_ENTRY(fixup, insn...)		\
9999:	insn ;					\
	.section __ex_table, "a" ;		\
	.long	9999b, fixup ;			\
	.previous

#define SRC(y...)	CSUM_EX_ENTRY(6001f, y)

#define DST(y...)	CSUM_EX_ENTRY(6002f, y)
193 | ||
/*
 * unsigned int csum_partial_copy_generic (const char *src, char *dst,
 *					    int len, int sum,
 *					    int *src_err_ptr, int *dst_err_ptr)
 *
 * Copies len bytes from src to dst and adds them into sum with
 * ones-complement arithmetic (ONES_ADD).  Returns the updated sum in a2.
 *
 *	a2  = src
 *	a3  = dst
 *	a4  = len
 *	a5  = sum
 *	a6  = src_err_ptr
 *	a7  = dst_err_ptr
 *	a8  = temp
 *	a9  = temp
 *	a10 = temp
 *	a11 = original len for exception handling
 *	a12 = original dst for exception handling
 *
 * Every load from src is wrapped in SRC() and every store to dst in
 * DST(), so a faulting access is dispatched through the exception table.
 *
 * This function is optimized for 4-byte aligned addresses.  Other
 * alignments work, but not nearly as efficiently.
 *
 * Without XCHAL_HAVE_LOOPS the loops are closed with an unsigned
 * address comparison (bltu), so a source buffer that straddles
 * 0x80000000 is processed completely.
 */

ENTRY(csum_partial_copy_generic)
	entry	sp, 32
	mov	a12, a3			/* keep original dst for the fixup code */
	mov	a11, a4			/* keep original len for the fixup code */
	or	a10, a2, a3		/* low bits set if either pointer is unaligned */

	/* We optimize the following alignment tests for the 4-byte
	   aligned case.  Two bbsi.l instructions might seem more optimal
	   (commented out below).  However, both labels 5: and 3: are out
	   of the imm8 range, so the assembler relaxes them into
	   equivalent bbci.l, j combinations, which is actually
	   slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f			/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f		/* branch if one address is odd */
	j	3f			/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */		/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */		/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5		/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2		/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop5		/* addresses: compare unsigned */
#endif
2:
	extui	a10, a4, 2, 3		/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2		/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2		/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop6		/* addresses: compare unsigned */
#endif
3:
	/*
	 * Control comes to here in two cases: (1) It may fall through
	 * to here from the 4-byte alignment case to process, at most,
	 * one 2-byte chunk.  (2) It branches to here from above if
	 * either src or dst is 2-byte aligned, and we process all bytes
	 * here, except for perhaps a trailing odd byte.  It's
	 * inefficient, so align your addresses to 4-byte boundaries.
	 *
	 *	a2 = src
	 *	a3 = dst
	 *	a4 = len
	 *	a5 = sum
	 */
	srli	a10, a4, 1		/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2		/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop7		/* addresses: compare unsigned */
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f		/* 1-byte chunk */
SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8		/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5			/* return the accumulated sum */
	retw

5:
	/* Control branches to here when either src or dst is odd.  We
	   process all bytes using 8-bit accesses.  Grossly inefficient,
	   so don't feed us an odd address. */

	srli	a10, a4, 1		/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2		/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8		/* combine into a single 16-bit value */
#else					/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop8		/* addresses: compare unsigned */
#endif
6:
	j	4b			/* process the possible trailing odd byte */
369 | ||
370 | ||
/*
 * Exception handlers for csum_partial_copy_generic.
 *
 * 6001 is the fixup target recorded by SRC(): a load from the source
 *      faulted.  Stores -EFAULT through src_err_ptr, zeroes the whole
 *      destination buffer and returns 0.
 * 6002 is the fixup target recorded by DST(): a store to the destination
 *      faulted.  Stores -EFAULT through dst_err_ptr and returns 0.
 *
 * Both handlers return with retw on behalf of the faulting function.
 */
	.section .fixup, "ax"
/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling
*/

6001:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0		/* src_err_ptr */

	/* clear the complete destination - computing the rest
	   is too much work */
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	blti	a11, 1, 2f		/* nothing to clear if len <= 0 (as loopgtz) */
	add	a11, a11, a12		/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	bltu	a12, a11, .Leloop	/* addresses: compare unsigned */
#endif
2:
	retw				/* a2 == 0 */

6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0		/* dst_err_ptr */
	movi	a2, 0
	retw

	.previous
409 |