Commit | Line | Data |
---|---|---|
6a8ce1ef TC |
1 | /* |
2 | * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) | |
3 | * | |
918731fa | 4 | * The white papers on CRC32C calculations with PCLMULQDQ instruction can be |
6a8ce1ef | 5 | * downloaded from: |
918731fa TC |
6 | * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf |
7 | * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf | |
6a8ce1ef TC |
8 | * |
9 | * Copyright (C) 2012 Intel Corporation. | |
10 | * | |
11 | * Authors: | |
12 | * Wajdi Feghali <wajdi.k.feghali@intel.com> | |
13 | * James Guilford <james.guilford@intel.com> | |
14 | * David Cote <david.m.cote@intel.com> | |
15 | * Tim Chen <tim.c.chen@linux.intel.com> | |
16 | * | |
17 | * This software is available to you under a choice of one of two | |
18 | * licenses. You may choose to be licensed under the terms of the GNU | |
19 | * General Public License (GPL) Version 2, available from the file | |
20 | * COPYING in the main directory of this source tree, or the | |
21 | * OpenIB.org BSD license below: | |
22 | * | |
23 | * Redistribution and use in source and binary forms, with or | |
24 | * without modification, are permitted provided that the following | |
25 | * conditions are met: | |
26 | * | |
27 | * - Redistributions of source code must retain the above | |
28 | * copyright notice, this list of conditions and the following | |
29 | * disclaimer. | |
30 | * | |
31 | * - Redistributions in binary form must reproduce the above | |
32 | * copyright notice, this list of conditions and the following | |
33 | * disclaimer in the documentation and/or other materials | |
34 | * provided with the distribution. | |
35 | * | |
36 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
37 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
38 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
39 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
40 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
41 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
42 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
43 | * SOFTWARE. | |
44 | */ | |
45 | ||
57ae1b05 | 46 | #include <asm/inst.h> |
698a5abb JK |
47 | #include <linux/linkage.h> |
48 | ||
6a8ce1ef TC |
49 | ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction |
50 | ||
# Emit a label whose name is the two arguments joined, e.g. "LABEL crc_ 5" defines crc_5.
51 | .macro LABEL prefix n | |
52 | \prefix\n\(): | |
53 | .endm | |
54 | ||
# Emit one 16-bit jump-table entry: the offset of label crc_\i from crc_array.
55 | .macro JMPTBL_ENTRY i | |
56 | .word crc_\i - crc_array | |
57 | .endm | |
58 | ||
# Emit "jnc less_than_\j": branch to that label when the carry flag is clear.
59 | .macro JNC_LESS_THAN j | |
60 | jnc less_than_\j | |
61 | .endm | |
62 | ||
63 | # Define threshold where buffers are considered "small" and routed to more | |
64 | # efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so | |
65 | # SMALL_SIZE can be no larger than 255. | |
66 | ||
67 | #define SMALL_SIZE 200 | |
68 | ||
69 | .if (SMALL_SIZE > 255) | |
70 | .error "SMALL_SIZE must be < 256" | |
71 | .endif | |
72 | ||
73 | # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); | |
# Arguments arrive in %rdi (buffer), %rsi (len) and %rdx (crc_init); the CRC is returned in %rax.
# NOTE(review): len is declared int, but the code below compares the full 64-bit %rsi -- confirm callers never leave stale upper bits.
74 | ||
473946e6 | 75 | .text |
698a5abb | 76 | ENTRY(crc_pcl) |
6a8ce1ef TC |
77 | #define bufp %rdi |
78 | #define bufp_dw %edi | |
79 | #define bufp_w %di | |
80 | #define bufp_b %dil | |
81 | #define bufptmp %rcx | |
82 | #define block_0 %rcx | |
83 | #define block_1 %rdx | |
84 | #define block_2 %r11 | |
85 | #define len %rsi | |
86 | #define len_dw %esi | |
87 | #define len_w %si | |
88 | #define len_b %sil | |
89 | #define crc_init_arg %rdx | |
90 | #define tmp %rbx | |
91 | #define crc_init %r8 | |
92 | #define crc_init_dw %r8d | |
93 | #define crc1 %r9 | |
94 | #define crc2 %r10 | |
95 | ||
96 | pushq %rbx | |
97 | pushq %rdi | |
98 | pushq %rsi | |
99 | ||
100 | ## Copy crc_init out of its argument register (%rdx), which is reused as block_1 | |
101 | mov crc_init_arg, crc_init | |
102 | ||
103 | ################################################################ | |
104 | ## 1) ALIGN: | |
105 | ################################################################ | |
106 | ||
107 | mov bufp, bufptmp # bufptmp = buffer pointer (bufp is reused as a counter) | |
108 | neg bufp | |
109 | and $7, bufp # bufp = (-buf) & 7: number of bytes needed to | |
110 | # reach 8-byte alignment | |
111 | je proc_block # Skip if aligned | |
112 | ||
113 | ## If len is less than 8 and we're unaligned, we need to jump | |
114 | ## to special code to avoid reading beyond the end of the buffer | |
115 | cmp $8, len | |
116 | jae do_align | |
117 | # less_than_8 expects length in upper 3 bits of len_dw | |
118 | # less_than_8_post_shl1 expects length = carryflag * 4 + len_dw[31:30] | |
119 | shl $32-3+1, len_dw | |
120 | jmp less_than_8_post_shl1 | |
121 | ||
122 | do_align: | |
123 | #### Calculate CRC of unaligned bytes of the buffer (if any) | |
124 | movq (bufptmp), tmp # load a quadword from the buffer (len >= 8 here) | |
125 | add bufp, bufptmp # align buffer pointer for quadword | |
126 | # processing | |
127 | sub bufp, len # update buffer length | |
128 | align_loop: | |
129 | crc32b %bl, crc_init_dw # compute crc32 of the low byte of tmp | |
130 | shr $8, tmp # get next byte | |
131 | dec bufp | |
132 | jne align_loop | |
133 | ||
134 | proc_block: | |
135 | ||
136 | ################################################################ | |
137 | ## 2) PROCESS BLOCKS: | |
138 | ################################################################ | |
139 | ||
140 | ## compute num of bytes to be processed | |
141 | movq len, tmp # tmp = number of bytes still to process | |
142 | ||
143 | cmpq $128*24, len | |
144 | jae full_block | |
145 | ||
146 | continue_block: | |
147 | cmpq $SMALL_SIZE, len | |
148 | jb small | |
149 | ||
150 | ## SMALL_SIZE <= len < 128*24 here; the mul below writes %edx:%eax, block_1 (%rdx) is set afterwards | |
151 | movq $2731, %rax # 2731 = ceil(2^16 / 24) | |
152 | mul len_dw | |
153 | shrq $16, %rax | |
154 | ||
155 | ## eax contains floor(bytes / 24) = num 24-byte chunks to do | |
156 | ||
157 | ## process rax 24-byte chunks (128 >= rax >= 0) | |
158 | ||
159 | ## compute end address of each block | |
160 | ## block 0 (base addr + RAX * 8) | |
161 | ## block 1 (base addr + RAX * 16) | |
162 | ## block 2 (base addr + RAX * 24) | |
163 | lea (bufptmp, %rax, 8), block_0 | |
164 | lea (block_0, %rax, 8), block_1 | |
165 | lea (block_1, %rax, 8), block_2 | |
166 | ||
167 | xor crc1, crc1 | |
168 | xor crc2, crc2 | |
169 | ||
170 | ## branch into array: jump_table[rax] holds the 16-bit offset of label crc_(rax) from crc_array | |
171 | lea jump_table(%rip), bufp | |
172 | movzxw (bufp, %rax, 2), len | |
f66f6191 JP |
173 | lea crc_array(%rip), bufp |
174 | lea (bufp, len, 1), bufp | |
6a8ce1ef TC |
175 | jmp *bufp |
176 | ||
177 | ################################################################ | |
178 | ## 2a) PROCESS FULL BLOCKS: 128 chunks of 24 bytes, i.e. three 128*8-byte blocks | |
179 | ################################################################ | |
180 | full_block: | |
a734b4a2 | 181 | movl $128,%eax |
6a8ce1ef TC |
182 | lea 128*8*2(block_0), block_1 |
183 | lea 128*8*3(block_0), block_2 | |
184 | add $128*8*1, block_0 | |
185 | ||
186 | xor crc1,crc1 | |
187 | xor crc2,crc2 | |
188 | ||
189 | # Fall through into top of crc array (crc_128) | |
190 | ||
191 | ################################################################ | |
192 | ## 3) CRC Array: unrolled entries crc_128 .. crc_1, each consuming one quadword from each of the three blocks | |
193 | ################################################################ | |
194 | ||
195 | crc_array: | |
196 | i=128 | |
197 | .rept 128-1 | |
198 | .altmacro | |
199 | LABEL crc_ %i | |
200 | .noaltmacro | |
201 | crc32q -i*8(block_0), crc_init | |
202 | crc32q -i*8(block_1), crc1 | |
203 | crc32q -i*8(block_2), crc2 | |
204 | i=(i-1) | |
205 | .endr | |
206 | ||
207 | .altmacro | |
208 | LABEL crc_ %i | |
209 | .noaltmacro | |
210 | crc32q -i*8(block_0), crc_init | |
211 | crc32q -i*8(block_1), crc1 | |
212 | # SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet: the last quadword of block 2 is consumed while combining the results | |
213 | ||
214 | mov block_2, block_0 | |
215 | ||
216 | ################################################################ | |
217 | ## 4) Combine three results: | |
218 | ################################################################ | |
219 | ||
473946e6 | 220 | lea (K_table-8)(%rip), bufp # first entry is for idx 1 |
6a8ce1ef | 221 | shlq $3, %rax # rax *= 8 |
473946e6 GS |
222 | pmovzxdq (bufp,%rax), %xmm0 # load 2 32-bit consts, zero-extended to 64 bits: K1:K2 |
223 | leal (%eax,%eax,2), %eax # rax *= 3 (total *24) | |
224 | subq %rax, tmp # tmp -= 24 * (number of chunks just processed) | |
6a8ce1ef TC |
225 |
226 | movq crc_init, %xmm1 # CRC of the first block (accumulated via block_0) | |
57ae1b05 | 227 | PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 |
6a8ce1ef TC |
228 |
229 | movq crc1, %xmm2 # CRC of the second block (accumulated via block_1) | |
57ae1b05 | 230 | PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1 |
6a8ce1ef TC |
231 |
232 | pxor %xmm2,%xmm1 | |
233 | movq %xmm1, %rax | |
234 | xor -i*8(block_2), %rax | |
235 | mov crc2, crc_init | |
236 | crc32 %rax, crc_init | |
237 | ||
473946e6 GS |
238 | ################################################################ |
239 | ## 5) Check for end: tmp holds the number of bytes still unprocessed | |
240 | ################################################################ | |
6a8ce1ef TC |
241 |
242 | LABEL crc_ 0 | |
243 | mov tmp, len | |
244 | cmp $128*24, tmp | |
245 | jae full_block | |
246 | cmp $24, tmp | |
247 | jae continue_block | |
248 | ||
249 | less_than_24: | |
250 | shl $32-4, len_dw # less_than_16 expects length | |
251 | # in upper 4 bits of len_dw | |
252 | jnc less_than_16 | |
253 | crc32q (bufptmp), crc_init | |
254 | crc32q 8(bufptmp), crc_init | |
255 | jz do_return | |
256 | add $16, bufptmp | |
257 | # fewer than 8 bytes remain if we got here | |
258 | # less_than_8 expects length in upper 3 bits of len_dw | |
259 | # less_than_8_post_shl1 expects length = carryflag * 4 + len_dw[31:30] | |
260 | shl $2, len_dw | |
261 | jmp less_than_8_post_shl1 | |
262 | ||
263 | ####################################################################### | |
264 | ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) | |
265 | ####################################################################### | |
266 | small: | |
267 | shl $32-8, len_dw # Move the low 8 bits of len to the top of len_dw for less_than_256 | |
268 | j=256 | |
269 | .rept 5 # j = {256, 128, 64, 32, 16} | |
270 | .altmacro | |
271 | LABEL less_than_ %j # less_than_j: Length should be in | |
272 | # upper lg(j) bits of len_dw | |
273 | j=(j/2) | |
274 | shl $1, len_dw # Get next MSB | |
275 | JNC_LESS_THAN %j | |
276 | .noaltmacro | |
277 | i=0 | |
278 | .rept (j/8) | |
279 | crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data | |
280 | i=i+8 | |
281 | .endr | |
282 | jz do_return # Return if remaining length is zero (ZF set by the shl above) | |
283 | add $j, bufptmp # Advance buf | |
284 | .endr | |
285 | ||
286 | less_than_8: # Length should be stored in | |
287 | # upper 3 bits of len_dw | |
288 | shl $1, len_dw | |
289 | less_than_8_post_shl1: | |
290 | jnc less_than_4 | |
291 | crc32l (bufptmp), crc_init_dw # CRC of 4 bytes | |
292 | jz do_return # return if remaining data is zero | |
293 | add $4, bufptmp | |
294 | less_than_4: # Length should be stored in | |
295 | # upper 2 bits of len_dw | |
296 | shl $1, len_dw | |
297 | jnc less_than_2 | |
298 | crc32w (bufptmp), crc_init_dw # CRC of 2 bytes | |
299 | jz do_return # return if remaining data is zero | |
300 | add $2, bufptmp | |
301 | less_than_2: # Length should be stored in the MSB | |
302 | # of len_dw | |
303 | shl $1, len_dw | |
304 | jnc less_than_1 | |
305 | crc32b (bufptmp), crc_init_dw # CRC of 1 byte | |
306 | less_than_1: # Length should be zero | |
307 | do_return: | |
308 | movq crc_init, %rax | |
309 | popq %rsi | |
310 | popq %rdi | |
311 | popq %rbx | |
312 | ret | |
f66f6191 | 313 | ENDPROC(crc_pcl) |
6a8ce1ef | 314 | |
f66f6191 | 315 | .section .rodata, "a", %progbits |
6a8ce1ef TC |
316 | ################################################################ |
317 | ## jump table: 129 entries x 2 bytes each; entry i is the offset of crc_i from crc_array | |
318 | ################################################################ | |
319 | .align 4 | |
320 | jump_table: | |
321 | i=0 | |
322 | .rept 129 | |
323 | .altmacro | |
324 | JMPTBL_ENTRY %i | |
325 | .noaltmacro | |
326 | i=i+1 | |
327 | .endr | |
698a5abb | 328 | |
698a5abb | 329 | |
6a8ce1ef TC |
330 | ################################################################ |
331 | ## PCLMULQDQ tables | |
473946e6 | 332 | ## Table is 128 entries x 2 32-bit constants (8 bytes) each; entry n-1 holds the pair used for n 24-byte chunks |
6a8ce1ef | 333 | ################################################################ |
473946e6 | 334 | .align 8 |
6a8ce1ef | 335 | K_table: |
473946e6 GS |
336 | .long 0x493c7d27, 0x00000001 |
337 | .long 0xba4fc28e, 0x493c7d27 | |
338 | .long 0xddc0152b, 0xf20c0dfe | |
339 | .long 0x9e4addf8, 0xba4fc28e | |
340 | .long 0x39d3b296, 0x3da6d0cb | |
341 | .long 0x0715ce53, 0xddc0152b | |
342 | .long 0x47db8317, 0x1c291d04 | |
343 | .long 0x0d3b6092, 0x9e4addf8 | |
344 | .long 0xc96cfdc0, 0x740eef02 | |
345 | .long 0x878a92a7, 0x39d3b296 | |
346 | .long 0xdaece73e, 0x083a6eec | |
347 | .long 0xab7aff2a, 0x0715ce53 | |
348 | .long 0x2162d385, 0xc49f4f67 | |
349 | .long 0x83348832, 0x47db8317 | |
350 | .long 0x299847d5, 0x2ad91c30 | |
351 | .long 0xb9e02b86, 0x0d3b6092 | |
352 | .long 0x18b33a4e, 0x6992cea2 | |
353 | .long 0xb6dd949b, 0xc96cfdc0 | |
354 | .long 0x78d9ccb7, 0x7e908048 | |
355 | .long 0xbac2fd7b, 0x878a92a7 | |
356 | .long 0xa60ce07b, 0x1b3d8f29 | |
357 | .long 0xce7f39f4, 0xdaece73e | |
358 | .long 0x61d82e56, 0xf1d0f55e | |
359 | .long 0xd270f1a2, 0xab7aff2a | |
360 | .long 0xc619809d, 0xa87ab8a8 | |
361 | .long 0x2b3cac5d, 0x2162d385 | |
362 | .long 0x65863b64, 0x8462d800 | |
363 | .long 0x1b03397f, 0x83348832 | |
364 | .long 0xebb883bd, 0x71d111a8 | |
365 | .long 0xb3e32c28, 0x299847d5 | |
366 | .long 0x064f7f26, 0xffd852c6 | |
367 | .long 0xdd7e3b0c, 0xb9e02b86 | |
368 | .long 0xf285651c, 0xdcb17aa4 | |
369 | .long 0x10746f3c, 0x18b33a4e | |
370 | .long 0xc7a68855, 0xf37c5aee | |
371 | .long 0x271d9844, 0xb6dd949b | |
372 | .long 0x8e766a0c, 0x6051d5a2 | |
373 | .long 0x93a5f730, 0x78d9ccb7 | |
374 | .long 0x6cb08e5c, 0x18b0d4ff | |
375 | .long 0x6b749fb2, 0xbac2fd7b | |
376 | .long 0x1393e203, 0x21f3d99c | |
377 | .long 0xcec3662e, 0xa60ce07b | |
378 | .long 0x96c515bb, 0x8f158014 | |
379 | .long 0xe6fc4e6a, 0xce7f39f4 | |
380 | .long 0x8227bb8a, 0xa00457f7 | |
381 | .long 0xb0cd4768, 0x61d82e56 | |
382 | .long 0x39c7ff35, 0x8d6d2c43 | |
383 | .long 0xd7a4825c, 0xd270f1a2 | |
384 | .long 0x0ab3844b, 0x00ac29cf | |
385 | .long 0x0167d312, 0xc619809d | |
386 | .long 0xf6076544, 0xe9adf796 | |
387 | .long 0x26f6a60a, 0x2b3cac5d | |
388 | .long 0xa741c1bf, 0x96638b34 | |
389 | .long 0x98d8d9cb, 0x65863b64 | |
390 | .long 0x49c3cc9c, 0xe0e9f351 | |
391 | .long 0x68bce87a, 0x1b03397f | |
392 | .long 0x57a3d037, 0x9af01f2d | |
393 | .long 0x6956fc3b, 0xebb883bd | |
394 | .long 0x42d98888, 0x2cff42cf | |
395 | .long 0x3771e98f, 0xb3e32c28 | |
396 | .long 0xb42ae3d9, 0x88f25a3a | |
397 | .long 0x2178513a, 0x064f7f26 | |
398 | .long 0xe0ac139e, 0x4e36f0b0 | |
399 | .long 0x170076fa, 0xdd7e3b0c | |
400 | .long 0x444dd413, 0xbd6f81f8 | |
401 | .long 0x6f345e45, 0xf285651c | |
402 | .long 0x41d17b64, 0x91c9bd4b | |
403 | .long 0xff0dba97, 0x10746f3c | |
404 | .long 0xa2b73df1, 0x885f087b | |
405 | .long 0xf872e54c, 0xc7a68855 | |
406 | .long 0x1e41e9fc, 0x4c144932 | |
407 | .long 0x86d8e4d2, 0x271d9844 | |
408 | .long 0x651bd98b, 0x52148f02 | |
409 | .long 0x5bb8f1bc, 0x8e766a0c | |
410 | .long 0xa90fd27a, 0xa3c6f37a | |
411 | .long 0xb3af077a, 0x93a5f730 | |
412 | .long 0x4984d782, 0xd7c0557f | |
413 | .long 0xca6ef3ac, 0x6cb08e5c | |
414 | .long 0x234e0b26, 0x63ded06a | |
415 | .long 0xdd66cbbb, 0x6b749fb2 | |
416 | .long 0x4597456a, 0x4d56973c | |
417 | .long 0xe9e28eb4, 0x1393e203 | |
418 | .long 0x7b3ff57a, 0x9669c9df | |
419 | .long 0xc9c8b782, 0xcec3662e | |
420 | .long 0x3f70cc6f, 0xe417f38a | |
421 | .long 0x93e106a4, 0x96c515bb | |
422 | .long 0x62ec6c6d, 0x4b9e0f71 | |
423 | .long 0xd813b325, 0xe6fc4e6a | |
424 | .long 0x0df04680, 0xd104b8fc | |
425 | .long 0x2342001e, 0x8227bb8a | |
426 | .long 0x0a2a8d7e, 0x5b397730 | |
427 | .long 0x6d9a4957, 0xb0cd4768 | |
428 | .long 0xe8b6368b, 0xe78eb416 | |
429 | .long 0xd2c3ed1a, 0x39c7ff35 | |
430 | .long 0x995a5724, 0x61ff0e01 | |
431 | .long 0x9ef68d35, 0xd7a4825c | |
432 | .long 0x0c139b31, 0x8d96551c | |
433 | .long 0xf2271e60, 0x0ab3844b | |
434 | .long 0x0b0bf8ca, 0x0bf80dd2 | |
435 | .long 0x2664fd8b, 0x0167d312 | |
436 | .long 0xed64812d, 0x8821abed | |
437 | .long 0x02ee03b2, 0xf6076544 | |
438 | .long 0x8604ae0f, 0x6a45d2b2 | |
439 | .long 0x363bd6b3, 0x26f6a60a | |
440 | .long 0x135c83fd, 0xd8d26619 | |
441 | .long 0x5fabe670, 0xa741c1bf | |
442 | .long 0x35ec3279, 0xde87806c | |
443 | .long 0x00bcf5f6, 0x98d8d9cb | |
444 | .long 0x8ae00689, 0x14338754 | |
445 | .long 0x17f27698, 0x49c3cc9c | |
446 | .long 0x58ca5f00, 0x5bd2011f | |
447 | .long 0xaa7c7ad5, 0x68bce87a | |
448 | .long 0xb5cfca28, 0xdd07448e | |
449 | .long 0xded288f8, 0x57a3d037 | |
450 | .long 0x59f229bc, 0xdde8f5b9 | |
451 | .long 0x6d390dec, 0x6956fc3b | |
452 | .long 0x37170390, 0xa3e3e02c | |
453 | .long 0x6353c1cc, 0x42d98888 | |
454 | .long 0xc4584f5c, 0xd73c7bea | |
455 | .long 0xf48642e9, 0x3771e98f | |
456 | .long 0x531377e2, 0x80ff0093 | |
457 | .long 0xdd35bc8d, 0xb42ae3d9 | |
458 | .long 0xb25b29f2, 0x8fe4c34d | |
459 | .long 0x9a5ede41, 0x2178513a | |
460 | .long 0xa563905d, 0xdf99fc11 | |
461 | .long 0x45cddf4e, 0xe0ac139e | |
462 | .long 0xacfa3103, 0x6c23e841 | |
463 | .long 0xa51b6135, 0x170076fa |