/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
16 | ||
17 | #include <asm/errno.h> | |
18 | #include <linux/linkage.h> | |
367b8112 | 19 | #include <variant/core.h> |
249ac17e CZ |
20 | |
21 | /* | |
22 | * computes a partial checksum, e.g. for TCP/UDP fragments | |
23 | */ | |
24 | ||

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */
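
/*
 * Editor's note -- a hedged usage sketch, not code from this file: the
 * 32-bit partial sum returned here is typically narrowed to the final
 * 16-bit Internet checksum by the generic csum_fold(), roughly:
 *
 *	sum = (sum & 0xffff) + (sum >> 16);	// fold high half into low
 *	sum = (sum & 0xffff) + (sum >> 16);	// absorb a possible carry
 *	return (__sum16)~sum;			// ones-complement result
 */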
34 | ||
35 | /* ONES_ADD converts twos-complement math to ones-complement. */ | |
36 | #define ONES_ADD(sum, val) \ | |
37 | add sum, sum, val ; \ | |
38 | bgeu sum, val, 99f ; \ | |
39 | addi sum, sum, 1 ; \ | |
40 | 99: ; | |
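
/*
 * In C terms (a sketch): the bgeu/addi pair implements the end-around
 * carry, which is what turns the CPU's twos-complement add into a
 * ones-complement (Internet checksum) add:
 *
 *	sum += val;
 *	if (sum < val)	// unsigned overflow: a carry left bit 31
 *		sum++;	// fold it back into bit 0
 */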
41 | ||
42 | .text | |
43 | ENTRY(csum_partial) | |
c4c4594b CZ |
44 | |
45 | /* | |
46 | * Experiments with Ethernet and SLIP connections show that buf | |
47 | * is aligned on either a 2-byte or 4-byte boundary. | |
48 | */ | |
249ac17e CZ |
49 | entry sp, 32 |
50 | extui a5, a2, 0, 2 | |
51 | bnez a5, 8f /* branch if 2-byte aligned */ | |
52 | /* Fall-through on common case, 4-byte alignment */ | |
53 | 1: | |
54 | srli a5, a3, 5 /* 32-byte chunks */ | |
55 | #if XCHAL_HAVE_LOOPS | |
56 | loopgtz a5, 2f | |
57 | #else | |
58 | beqz a5, 2f | |
59 | slli a5, a5, 5 | |
60 | add a5, a5, a2 /* a5 = end of last 32-byte chunk */ | |
61 | .Loop1: | |
62 | #endif | |
63 | l32i a6, a2, 0 | |
64 | l32i a7, a2, 4 | |
65 | ONES_ADD(a4, a6) | |
66 | ONES_ADD(a4, a7) | |
67 | l32i a6, a2, 8 | |
68 | l32i a7, a2, 12 | |
69 | ONES_ADD(a4, a6) | |
70 | ONES_ADD(a4, a7) | |
71 | l32i a6, a2, 16 | |
72 | l32i a7, a2, 20 | |
73 | ONES_ADD(a4, a6) | |
74 | ONES_ADD(a4, a7) | |
75 | l32i a6, a2, 24 | |
76 | l32i a7, a2, 28 | |
77 | ONES_ADD(a4, a6) | |
78 | ONES_ADD(a4, a7) | |
79 | addi a2, a2, 4*8 | |
80 | #if !XCHAL_HAVE_LOOPS | |
81 | blt a2, a5, .Loop1 | |
82 | #endif | |
83 | 2: | |
84 | extui a5, a3, 2, 3 /* remaining 4-byte chunks */ | |
85 | #if XCHAL_HAVE_LOOPS | |
86 | loopgtz a5, 3f | |
87 | #else | |
88 | beqz a5, 3f | |
89 | slli a5, a5, 2 | |
90 | add a5, a5, a2 /* a5 = end of last 4-byte chunk */ | |
91 | .Loop2: | |
92 | #endif | |
93 | l32i a6, a2, 0 | |
94 | ONES_ADD(a4, a6) | |
95 | addi a2, a2, 4 | |
96 | #if !XCHAL_HAVE_LOOPS | |
97 | blt a2, a5, .Loop2 | |
98 | #endif | |
99 | 3: | |
100 | _bbci.l a3, 1, 5f /* remaining 2-byte chunk */ | |
101 | l16ui a6, a2, 0 | |
102 | ONES_ADD(a4, a6) | |
103 | addi a2, a2, 2 | |
104 | 5: | |
105 | _bbci.l a3, 0, 7f /* remaining 1-byte chunk */ | |
106 | 6: l8ui a6, a2, 0 | |
107 | #ifdef __XTENSA_EB__ | |
108 | slli a6, a6, 8 /* load byte into bits 8..15 */ | |
109 | #endif | |
110 | ONES_ADD(a4, a6) | |
111 | 7: | |
112 | mov a2, a4 | |
113 | retw | |
114 | ||
115 | /* uncommon case, buf is 2-byte aligned */ | |
116 | 8: | |
117 | beqz a3, 7b /* branch if len == 0 */ | |
118 | beqi a3, 1, 6b /* branch if len == 1 */ | |
119 | ||
120 | extui a5, a2, 0, 1 | |
121 | bnez a5, 8f /* branch if 1-byte aligned */ | |
122 | ||
123 | l16ui a6, a2, 0 /* common case, len >= 2 */ | |
124 | ONES_ADD(a4, a6) | |
125 | addi a2, a2, 2 /* adjust buf */ | |
126 | addi a3, a3, -2 /* adjust len */ | |
127 | j 1b /* now buf is 4-byte aligned */ | |
128 | ||
129 | /* case: odd-byte aligned, len > 1 | |
130 | * This case is dog slow, so don't give us an odd address. | |
131 | * (I don't think this ever happens, but just in case.) | |
132 | */ | |
133 | 8: | |
134 | srli a5, a3, 2 /* 4-byte chunks */ | |
135 | #if XCHAL_HAVE_LOOPS | |
136 | loopgtz a5, 2f | |
137 | #else | |
138 | beqz a5, 2f | |
139 | slli a5, a5, 2 | |
140 | add a5, a5, a2 /* a5 = end of last 4-byte chunk */ | |
141 | .Loop3: | |
142 | #endif | |
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0.. 7 */
#ifdef __XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef __XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)

/*
 * Copy from src while checksumming, otherwise like csum_partial.
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so we can call a custom exception handler for each access type.
 */

#define SRC(y...)	\
	9999: y;	\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(y...)	\
	9999: y;	\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous
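
/*
 * Editor's note -- a sketch of the mechanism, assuming the classic
 * two-word absolute __ex_table layout used by older kernels: each
 * ".long 9999b, 6001f" pair records the address of a potentially
 * faulting load/store and the fixup code to jump to if it faults:
 *
 *	struct exception_table_entry {
 *		unsigned long insn;	// 9999b: the guarded instruction
 *		unsigned long fixup;	// 6001f/6002f: recovery code
 *	};
 *
 * On a fault, the trap handler searches this table for the faulting PC
 * and, if it finds a match, resumes execution at the fixup address
 * instead of oopsing.
 */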
194 | ||
195 | /* | |
196 | unsigned int csum_partial_copy_generic (const char *src, char *dst, int len, | |
197 | int sum, int *src_err_ptr, int *dst_err_ptr) | |
198 | a2 = src | |
199 | a3 = dst | |
200 | a4 = len | |
201 | a5 = sum | |
202 | a6 = src_err_ptr | |
203 | a7 = dst_err_ptr | |
204 | a8 = temp | |
205 | a9 = temp | |
206 | a10 = temp | |
207 | a11 = original len for exception handling | |
208 | a12 = original dst for exception handling | |
209 | ||
210 | This function is optimized for 4-byte aligned addresses. Other | |
211 | alignments work, but not nearly as efficiently. | |
212 | */ | |
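
/*
 * Editor's note -- a hedged C model of the contract (a sketch only;
 * the fault paths are simplified, see the fixup code at the end of
 * this file for the real behavior):
 *
 *	unsigned int csum_partial_copy_ref(const char *src, char *dst,
 *					   int len, int sum,
 *					   int *src_err_ptr, int *dst_err_ptr)
 *	{
 *		// on a faulting read:  *src_err_ptr = -EFAULT and
 *		//   dst[0..len) is zeroed; on a faulting write:
 *		//   *dst_err_ptr = -EFAULT; otherwise behaves like:
 *		memcpy(dst, src, len);
 *		return csum_partial((const unsigned char *)dst, len, sum);
 *	}
 */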
213 | ||
214 | ENTRY(csum_partial_copy_generic) | |
d1538c46 | 215 | |
249ac17e CZ |
216 | entry sp, 32 |
217 | mov a12, a3 | |
218 | mov a11, a4 | |
219 | or a10, a2, a3 | |
220 | ||
221 | /* We optimize the following alignment tests for the 4-byte | |
222 | aligned case. Two bbsi.l instructions might seem more optimal | |
223 | (commented out below). However, both labels 5: and 3: are out | |
224 | of the imm8 range, so the assembler relaxes them into | |
225 | equivalent bbci.l, j combinations, which is actually | |
226 | slower. */ | |
227 | ||
228 | extui a9, a10, 0, 2 | |
229 | beqz a9, 1f /* branch if both are 4-byte aligned */ | |
230 | bbsi.l a10, 0, 5f /* branch if one address is odd */ | |
231 | j 3f /* one address is 2-byte aligned */ | |
232 | ||
233 | /* _bbsi.l a10, 0, 5f */ /* branch if odd address */ | |
234 | /* _bbsi.l a10, 1, 3f */ /* branch if 2-byte-aligned address */ | |
235 | ||
236 | 1: | |
237 | /* src and dst are both 4-byte aligned */ | |
238 | srli a10, a4, 5 /* 32-byte chunks */ | |
239 | #if XCHAL_HAVE_LOOPS | |
240 | loopgtz a10, 2f | |
241 | #else | |
242 | beqz a10, 2f | |
243 | slli a10, a10, 5 | |
244 | add a10, a10, a2 /* a10 = end of last 32-byte src chunk */ | |
245 | .Loop5: | |
246 | #endif | |
247 | SRC( l32i a9, a2, 0 ) | |
248 | SRC( l32i a8, a2, 4 ) | |
249 | DST( s32i a9, a3, 0 ) | |
250 | DST( s32i a8, a3, 4 ) | |
251 | ONES_ADD(a5, a9) | |
252 | ONES_ADD(a5, a8) | |
253 | SRC( l32i a9, a2, 8 ) | |
254 | SRC( l32i a8, a2, 12 ) | |
255 | DST( s32i a9, a3, 8 ) | |
256 | DST( s32i a8, a3, 12 ) | |
257 | ONES_ADD(a5, a9) | |
258 | ONES_ADD(a5, a8) | |
259 | SRC( l32i a9, a2, 16 ) | |
260 | SRC( l32i a8, a2, 20 ) | |
261 | DST( s32i a9, a3, 16 ) | |
262 | DST( s32i a8, a3, 20 ) | |
263 | ONES_ADD(a5, a9) | |
264 | ONES_ADD(a5, a8) | |
265 | SRC( l32i a9, a2, 24 ) | |
266 | SRC( l32i a8, a2, 28 ) | |
267 | DST( s32i a9, a3, 24 ) | |
268 | DST( s32i a8, a3, 28 ) | |
269 | ONES_ADD(a5, a9) | |
270 | ONES_ADD(a5, a8) | |
271 | addi a2, a2, 32 | |
272 | addi a3, a3, 32 | |
273 | #if !XCHAL_HAVE_LOOPS | |
274 | blt a2, a10, .Loop5 | |
275 | #endif | |
276 | 2: | |
277 | extui a10, a4, 2, 3 /* remaining 4-byte chunks */ | |
278 | extui a4, a4, 0, 2 /* reset len for general-case, 2-byte chunks */ | |
279 | #if XCHAL_HAVE_LOOPS | |
280 | loopgtz a10, 3f | |
281 | #else | |
282 | beqz a10, 3f | |
283 | slli a10, a10, 2 | |
284 | add a10, a10, a2 /* a10 = end of last 4-byte src chunk */ | |
285 | .Loop6: | |
286 | #endif | |
287 | SRC( l32i a9, a2, 0 ) | |
288 | DST( s32i a9, a3, 0 ) | |
289 | ONES_ADD(a5, a9) | |
290 | addi a2, a2, 4 | |
291 | addi a3, a3, 4 | |
292 | #if !XCHAL_HAVE_LOOPS | |
293 | blt a2, a10, .Loop6 | |
294 | #endif | |
295 | 3: | |
296 | /* | |
297 | Control comes to here in two cases: (1) It may fall through | |
298 | to here from the 4-byte alignment case to process, at most, | |
299 | one 2-byte chunk. (2) It branches to here from above if | |
300 | either src or dst is 2-byte aligned, and we process all bytes | |
301 | here, except for perhaps a trailing odd byte. It's | |
302 | inefficient, so align your addresses to 4-byte boundaries. | |
303 | ||
304 | a2 = src | |
305 | a3 = dst | |
306 | a4 = len | |
307 | a5 = sum | |
308 | */ | |
309 | srli a10, a4, 1 /* 2-byte chunks */ | |
310 | #if XCHAL_HAVE_LOOPS | |
311 | loopgtz a10, 4f | |
312 | #else | |
313 | beqz a10, 4f | |
314 | slli a10, a10, 1 | |
315 | add a10, a10, a2 /* a10 = end of last 2-byte src chunk */ | |
316 | .Loop7: | |
317 | #endif | |
318 | SRC( l16ui a9, a2, 0 ) | |
319 | DST( s16i a9, a3, 0 ) | |
320 | ONES_ADD(a5, a9) | |
321 | addi a2, a2, 2 | |
322 | addi a3, a3, 2 | |
323 | #if !XCHAL_HAVE_LOOPS | |
324 | blt a2, a10, .Loop7 | |
325 | #endif | |
326 | 4: | |
327 | /* This section processes a possible trailing odd byte. */ | |
328 | _bbci.l a4, 0, 8f /* 1-byte chunk */ | |
329 | SRC( l8ui a9, a2, 0 ) | |
330 | DST( s8i a9, a3, 0 ) | |
331 | #ifdef __XTENSA_EB__ | |
332 | slli a9, a9, 8 /* shift byte to bits 8..15 */ | |
333 | #endif | |
334 | ONES_ADD(a5, a9) | |
335 | 8: | |
336 | mov a2, a5 | |
337 | retw | |
338 | ||
339 | 5: | |
	/* Control branches to here when either src or dst is odd.  We
	   process all bytes using 8-bit accesses.  Grossly inefficient,
	   so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)

# Exception handler:
.section .fixup, "ax"
/*
 * a6  = src_err_ptr
 * a7  = dst_err_ptr
 * a11 = original len for exception handling
 * a12 = original dst for exception handling
 */

6001:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	# clear the complete destination - computing the rest
	# is too much work
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	blt	a12, a11, .Leloop
#endif
2:
	retw
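
/*
 * Editor's note -- in C terms, a sketch of what the 6001 fixup above
 * does on a faulting source access (assuming a11/a12 still hold the
 * len and dst saved on entry):
 *
 *	*src_err_ptr = -EFAULT;
 *	memset(original_dst, 0, original_len);	// don't leave stale bytes
 *	return 0;
 */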
405 | ||
406 | 6002: | |
407 | movi a2, -EFAULT | |
408 | s32i a2, a7, 0 /* dst_err_ptr */ | |
409 | movi a2, 0 | |
410 | retw | |
411 | ||
412 | .previous |