Commit | Line | Data |
---|---|---|
6bc9a396 CL |
1 | /* |
2 | * arch/score/lib/csum_partial.S | |
3 | * | |
4 | * Score Processor version. | |
5 | * | |
6 | * Copyright (C) 2009 Sunplus Core Technology Co., Ltd. | |
7 | * Lennox Wu <lennox.wu@sunplusct.com> | |
8 | * Chen Liqin <liqin.chen@sunplusct.com> | |
9 | * | |
10 | * This program is free software; you can redistribute it and/or modify | |
11 | * it under the terms of the GNU General Public License as published by | |
12 | * the Free Software Foundation; either version 2 of the License, or | |
13 | * (at your option) any later version. | |
14 | * | |
15 | * This program is distributed in the hope that it will be useful, | |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 | * GNU General Public License for more details. | |
19 | * | |
20 | * You should have received a copy of the GNU General Public License | |
21 | * along with this program; if not, see the file COPYING, or write | |
22 | * to the Free Software Foundation, Inc., | |
23 | * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | |
24 | */ | |
25 | #include <linux/linkage.h> | |
26 | ||
27 | #define ADDC(sum,reg) /* sum += reg with end-around carry: 1 is added back when the add wrapped around */ \ |
28 | add sum, sum, reg; \ |
29 | cmp.c reg, sum; /* sets the condition flag tested by bleu */ \ |
30 | bleu 9f; /* reg <= sum: the add did not wrap, skip the increment */ \ |
31 | addi sum, 0x1; /* reg > sum: the add wrapped, fold the carry back in */ \ |
32 | 9: /* numeric local label, so the macro can be expanded many times in one file */ |
33 | ||
34 | #define CSUM_BIGCHUNK(src, offset, sum) /* add the 8 words (32 bytes) at src+offset into sum, clobbers r8-r11 and the condition flag */ \ |
35 | lw r8, [src, offset + 0x00]; /* first four words are loaded, then accumulated */ \ |
36 | lw r9, [src, offset + 0x04]; \ |
37 | lw r10, [src, offset + 0x08]; \ |
38 | lw r11, [src, offset + 0x0c]; \ |
39 | ADDC(sum, r8); \ |
40 | ADDC(sum, r9); \ |
41 | ADDC(sum, r10); \ |
42 | ADDC(sum, r11); \ |
43 | lw r8, [src, offset + 0x10]; /* second four words reuse the same scratch registers */ \ |
44 | lw r9, [src, offset + 0x14]; \ |
45 | lw r10, [src, offset + 0x18]; \ |
46 | lw r11, [src, offset + 0x1c]; \ |
47 | ADDC(sum, r8); \ |
48 | ADDC(sum, r9); \ |
49 | ADDC(sum, r10); \ |
50 | ADDC(sum, r11); /* src itself is not advanced, the caller does that */ \ |
51 | ||
52 | #define src r4 /* pointer to the next byte to be summed */ |
53 | #define dest r5 /* alias not used by the visible code, which uses r5 as the byte count */ |
54 | #define sum r27 /* running checksum accumulator */ |
55 | ||
56 | .text |
57 | /* small_csumcpy: sum the last r10 bytes (fewer than 8) at src with unknown alignment, fold sum to 16 bits, add the partial sum passed in r6 and leave the result in r4 */ |
58 | small_csumcpy: |
59 | mv r5, r10 /* r5 = remaining byte count, supplied in r10 */ |
60 | ldi r9, 0x0 /* clear the scratch data register */ |
61 | cmpi.c r25, 0x1 /* r25 == 1: csum_partial already consumed a leading odd byte */ |
62 | beq pass_small_set_t7 /* already set: keep r25, the flag stays equal for the beq below */ |
63 | andri.c r25,r4 , 0x1 /* r25 = src & 1: is src 2-byte aligned? */ |
64 | ||
65 | pass_small_set_t7: |
66 | beq aligned /* taken when r25 was already 1 or when src is even */ |
67 | cmpi.c r5, 0x0 |
68 | beq fold /* odd src but nothing to sum */ |
69 | lbu r9, [src] /* leading odd byte */ |
70 | slli r9,r9, 0x8 /* the odd byte goes into bits 15..8 (little endian) */ |
71 | ADDC(sum, r9) |
72 | addi src, 0x1 |
73 | subi.c r5, 0x1 |
74 | ||
75 | /* src is treated as 2-byte aligned from here on. Bits 2, 1 and 0 of r5 select a 4-byte, a 2-byte and a 1-byte step */ |
76 | aligned: |
77 | andri.c r8, r5, 0x4 /* bit 2 of len set, i.e. at least 4 bytes left? */ |
78 | beq len_less_4bytes |
79 | ||
80 | /* A full 4-byte word remains: one lw if src is 4-byte aligned, otherwise two lhu */ |
81 | andri.c r8, src, 0x3 /* src 4-byte aligned? then lw can be used */ |
82 | beq four_byte_aligned |
83 | lhu r9, [src] /* low halfword */ |
84 | addi src, 2 |
85 | ADDC(sum, r9) |
86 | lhu r9, [src] /* high halfword */ |
87 | addi src, 2 |
88 | ADDC(sum, r9) |
89 | b len_less_4bytes |
90 | ||
91 | four_byte_aligned: /* at least 4 bytes left and src is 4-byte aligned */ |
92 | lw r9, [src] |
93 | addi src, 4 |
94 | ADDC(sum, r9) |
95 | ||
96 | len_less_4bytes: /* src is 2-byte aligned and fewer than 4 bytes are left */ |
97 | andri.c r8, r5, 0x2 /* bit 1 of len set: one halfword to add */ |
98 | beq len_less_2bytes |
99 | lhu r9, [src] |
100 | addi src, 0x2 /* step past the halfword */ |
101 | ADDC(sum, r9) |
102 | ||
103 | len_less_2bytes: /* at most one byte left */ |
104 | andri.c r8, r5, 0x1 |
105 | beq fold /* bit 0 of len clear: nothing left, go fold */ |
106 | lbu r9, [src] /* trailing byte, added unshifted */ |
107 | ||
108 | fold_ADDC: /* label is not referenced in the visible code */ |
109 | ADDC(sum, r9) |
110 | fold: |
111 | /* fold the 32-bit sum to 16 bits: low half is added to the high half, with end-around carry */ |
112 | slli r26, sum, 16 /* r26 = low half moved to the high half */ |
113 | add sum, sum, r26 /* high half of sum now holds high + low */ |
114 | cmp.c r26, sum /* flag is consumed by the bleu two rows down */ |
115 | srli sum, sum, 16 /* keep only the 16-bit result */ |
116 | bleu 1f /* r26 <= sum: the add did not wrap */ |
117 | addi sum, 0x1 /* r26 > sum: add the carry back */ |
118 | 1: |
119 | /* odd buffer alignment? r25 was set in csum_partial or at the top of small_csumcpy */ |
120 | cmpi.c r25, 0x0 |
121 | beq 1f /* even start: no byte swap needed */ |
122 | slli r26, sum, 8 /* odd start: swap the two bytes of the 16-bit sum */ |
123 | srli sum, sum, 8 |
124 | or sum, sum, r26 |
125 | andi sum, 0xffff |
126 | 1: |
127 | .set optimize |
128 | /* Add the partial checksum passed in r6. */ |
129 | ADDC(sum, r6) |
130 | mv r4, sum /* result is handed back in r4 */ |
131 | br r3 /* branch to the address in r3, presumably the return address */ |
132 | .set volatile |
133 | ||
134 | .align 5 |
135 | ENTRY(csum_partial) /* r4 = src pointer, r5 = byte count, r6 = partial sum to add in, result is left in r4 by small_csumcpy. Presumably csum_partial(buff, len, sum), confirm against the C prototype */ |
136 | ldi sum, 0 |
137 | ldi r25, 0 /* r25 = odd-start flag, cleared for now */ |
138 | mv r10, r5 /* small_csumcpy takes its byte count in r10 */ |
139 | cmpi.c r5, 0x8 |
8a38db13 | 140 | blt small_csumcpy /* fewer than 8 (signed) bytes to sum */ |
6bc9a396 CL |
141 | cmpi.c r5, 0x0 /* NOTE(review): len >= 8 here if blt is a signed less-than, so this test and the out path look unreachable */ |
142 | beq out |
143 | andri.c r25, src, 0x1 /* r25 = src & 1: odd buffer? */ |
144 | ||
145 | beq word_align |
146 | hword_align: /* consume 1 byte so that src becomes 2-byte aligned */ |
147 | lbu r8, [src] |
148 | subi r5, 0x1 |
149 | slli r8, r8, 8 /* the odd byte goes into bits 15..8 */ |
150 | ADDC(sum, r8) |
151 | addi src, 0x1 |
152 | ||
153 | word_align: /* src is 2-byte aligned here */ |
154 | andri.c r8, src, 0x2 /* bit 1 of src: already 4-byte aligned? */ |
155 | beq dword_align /* bit 1 clear: src is 4-byte aligned */ |
156 | lhu r8, [src] /* consume one halfword to reach 4-byte alignment */ |
157 | subi r5, 0x2 |
158 | ADDC(sum, r8) |
159 | addi src, 0x2 |
160 | ||
161 | dword_align: /* src is 4-byte aligned here */ |
162 | mv r26, r5 /* r26 = len, consumed by do_end_words when len < 56 */ |
163 | ldi r8, 56 |
164 | cmp.c r8, r5 |
165 | bgtu do_end_words /* taken if 56 > len (unsigned) */ |
166 | andri.c r26, src, 0x4 /* bit 2 of src: already 8-byte aligned? */ |
167 | beq qword_align |
168 | lw r8, [src] |
169 | subi r5, 0x4 |
170 | ADDC(sum, r8) |
171 | addi src, 0x4 |
172 | ||
173 | qword_align: /* src is 8-byte aligned here */ |
174 | andri.c r26, src, 0x8 /* bit 3 of src: already 16-byte aligned? */ |
175 | beq oword_align |
176 | lw r8, [src, 0x0] |
177 | lw r9, [src, 0x4] |
178 | subi r5, 0x8 /* 8 bytes consumed */ |
179 | ADDC(sum, r8) |
180 | ADDC(sum, r9) |
181 | addi src, 0x8 |
182 | ||
183 | oword_align: /* src is 16-byte aligned here */ |
184 | andri.c r26, src, 0x10 /* bit 4 of src: already 32-byte aligned? */ |
185 | beq begin_movement |
186 | lw r10, [src, 0x08] |
187 | lw r11, [src, 0x0c] |
188 | lw r8, [src, 0x00] |
189 | lw r9, [src, 0x04] |
190 | ADDC(sum, r10) |
191 | ADDC(sum, r11) |
192 | ADDC(sum, r8) |
193 | ADDC(sum, r9) |
194 | subi r5, 0x10 |
195 | addi src, 0x10 |
196 | ||
197 | begin_movement: |
198 | srli.c r26, r5, 0x7 /* r26 = len / 128, the number of 128-byte chunks */ |
199 | beq 1f /* len < 128: no full chunk */ |
200 | ||
201 | /* r26 = len / 128 as computed in begin_movement */ |
202 | move_128bytes: |
203 | CSUM_BIGCHUNK(src, 0x00, sum) |
204 | CSUM_BIGCHUNK(src, 0x20, sum) |
205 | CSUM_BIGCHUNK(src, 0x40, sum) |
206 | CSUM_BIGCHUNK(src, 0x60, sum) |
207 | subi.c r26, 0x01 /* one 128-byte chunk done */ |
208 | addi src, 0x80 |
209 | bne move_128bytes /* uses the flag from subi.c, addi is assumed to leave it untouched */ |
210 | ||
211 | 1: /* r5 was not reduced by the loop above, bit 6 of len selects one 64-byte step */ |
212 | andri.c r10, r5, 0x40 |
213 | beq 1f |
214 | ||
215 | move_64bytes: |
216 | CSUM_BIGCHUNK(src, 0x00, sum) |
217 | CSUM_BIGCHUNK(src, 0x20, sum) |
218 | addi src, 0x40 |
219 | ||
220 | 1: /* bit 5 of len selects one 32-byte step */ |
221 | andri r26, r5, 0x1c /* r26 = len & 0x1c, the bytes left in whole words below 32 */ |
222 | andri.c r10, r5, 0x20 |
223 | beq do_end_words /* decided by the andri.c on bit 5 just above */ |
224 | ||
225 | move_32bytes: |
226 | CSUM_BIGCHUNK(src, 0x00, sum) |
227 | andri r26, r5, 0x1c /* recompute r26 = len & 0x1c */ |
228 | addri src, src, 0x20 |
229 | ||
230 | do_end_words: /* whole words below 32 bytes, or everything when len < 56 */ |
231 | /* r26 = byte count to sum word by word: len from dword_align (len < 56) or len & 0x1c */ |
232 | cmpi.c r26, 0x0 |
233 | beq maybe_end_cruft /* no whole word left */ |
234 | srli r26, r26, 0x2 /* bytes to words */ |
235 | ||
236 | end_words: |
237 | lw r8, [src] |
238 | subi.c r26, 0x1 /* one 4-byte word consumed */ |
239 | ADDC(sum, r8) |
240 | addi src, 0x4 |
241 | cmpi.c r26, 0x0 /* ADDC overwrote the flag, so test the counter again */ |
242 | bne end_words /* loop while r26 != 0 */ |
243 | ||
244 | maybe_end_cruft: /* fewer than 4 bytes left */ |
245 | andri r10, r5, 0x3 /* r10 = trailing byte count, 0 to 3 */ |
246 | ||
247 | small_memcpy: /* label is not referenced in the visible code */ |
248 | mv r5, r10 |
249 | j small_csumcpy /* tail bytes, fold, add r6 and return happen there */ |
250 | ||
251 | out: /* NOTE(review): returns sum without adding r6, and looks unreachable, see the len test after blt */ |
252 | mv r4, sum |
253 | br r3 |
254 | ||
255 | END(csum_partial) |