Commit | Line | Data |
---|---|---|
77a746ce JN |
1 | /* A memset for CRIS. |
2 | Copyright (C) 1999-2005 Axis Communications. | |
3 | All rights reserved. | |
4 | ||
5 | Redistribution and use in source and binary forms, with or without | |
6 | modification, are permitted provided that the following conditions | |
7 | are met: | |
8 | ||
9 | 1. Redistributions of source code must retain the above copyright | |
10 | notice, this list of conditions and the following disclaimer. | |
11 | ||
12 | 2. Neither the name of Axis Communications nor the names of its | |
13 | contributors may be used to endorse or promote products derived | |
14 | from this software without specific prior written permission. | |
15 | ||
16 | THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS | |
17 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS | |
20 | COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, | |
21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
24 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
25 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING | |
26 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
27 | POSSIBILITY OF SUCH DAMAGE. */ | |
28 | ||
29 | /* FIXME: This file should really only be used for reference, as the | |
30 | result depends somewhat on gcc generating what we expect rather | |
31 | than what we describe. An assembly file should be used instead. */ | |
32 | ||
33 | /* Note the multiple occurrences of the expression "12*4", including in the | |
34 | asm. It is hard to get it into the asm in a good way. Thus better to | |
35 | expose the problem everywhere: no macro. */ | |
36 | ||
37 | /* Assuming one cycle per dword written or read (ok, not really true; the | |
38 | world is not ideal), and one cycle per instruction, then 43+3*(n/48-1) | |
39 | <= 24+24*(n/48-1) so n >= 45.7; n >= 0.9; we win on the first full | |
40 | 48-byte block to set. */ | |
41 | ||
42 | #define MEMSET_BY_BLOCK_THRESHOLD (1 * 48) | |
43 | ||
44 | /* No name ambiguities in this file. */ | |
45 | __asm__ (".syntax no_register_prefix"); | |
46 | ||
47 | void *memset(void *pdst, int c, unsigned int plen)
1da177e4 | 48 | {
77a746ce JN |
49 | /* Fill plen bytes starting at pdst with the low byte of c and return pdst. Now we want the parameters in special registers. Make sure the |
50 | compiler does something usable with this. NOTE(review): n is a signed int initialised from the unsigned plen and is compared as signed below -- confirm no caller passes a length that does not fit in int. */ | |
1da177e4 LT |
51 | |
52 | register char *return_dst __asm__ ("r10") = pdst; | |
53 | register int n __asm__ ("r12") = plen; | |
54 | register int lc __asm__ ("r11") = c; | |
55 | ||
77a746ce JN |
56 | /* Most apps use memset sanely. Memsetting about 3..4 bytes or less gets |
57 | penalized here compared to the generic implementation. */ | |
1da177e4 | 58 | |
77a746ce JN |
59 | /* Replicate the low byte of lc into all four bytes of lc, using r13 as scratch. This is fragile performancewise at best. Check with newer GCC |
60 | releases, if they compile cascaded "x |= x << 8" to sane code. */ | |
61 | __asm__("movu.b %0,r13 \n\ | |
62 | lslq 8,r13 \n\ | |
63 | move.b %0,r13 \n\ | |
64 | move.d r13,%0 \n\ | |
65 | lslq 16,r13 \n\ | |
66 | or.d r13,%0" | |
67 | : "=r" (lc) /* Outputs. */ | |
68 | : "0" (lc) /* Inputs. */ | |
69 | : "r13"); /* Trash. */ | |
1da177e4 LT |
70 | |
71 | { | |
72 | register char *dst __asm__ ("r13") = pdst; | |
2e2cd8ba | 73 | |
77a746ce JN |
74 | if (((unsigned long) pdst & 3) != 0 |
75 | /* n = 0 must be a valid call, regardless of alignment, so only align when at least 3 bytes remain: the byte and short fix-ups below write at most 3 bytes together. */ | |
76 | && n >= 3) | |
77 | { | |
78 | if ((unsigned long) dst & 1) | |
79 | { | |
80 | *dst = (char) lc; | |
81 | n--; | |
82 | dst++; | |
83 | } | |
1da177e4 | 84 | |
77a746ce JN |
85 | if ((unsigned long) dst & 2) |
86 | { | |
87 | *(short *) dst = lc; | |
88 | n -= 2; | |
89 | dst += 2; | |
90 | } | |
91 | } | |
1da177e4 | 92 | |
77a746ce JN |
93 | /* Decide which setting method to use. */ |
94 | if (n >= MEMSET_BY_BLOCK_THRESHOLD) | |
95 | { | |
96 | /* It is not optimal to tell the compiler about clobbering any | |
97 | registers; that will move the saving/restoring of those registers | |
98 | to the function prologue/epilogue, and make non-block sizes | |
99 | suboptimal. */ | |
100 | __asm__ volatile | |
101 | ("\ | |
102 | ;; GCC does promise correct register allocations, but let's \n\ | |
103 | ;; make sure it keeps its promises. \n\ | |
104 | .ifnc %0-%1-%4,$r13-$r12-$r11 \n\ | |
105 | .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\ | |
106 | .endif \n\ | |
107 | \n\ | |
108 | ;; Save the registers we'll clobber in the movem process \n\ | |
109 | ;; on the stack. Don't mention them to gcc, it will only be \n\ | |
110 | ;; upset. \n\ | |
111 | subq 11*4,sp \n\ | |
112 | movem r10,[sp] \n\ | |
2e2cd8ba | 113 | \n\ |
77a746ce JN |
114 | move.d r11,r0 \n\ |
115 | move.d r11,r1 \n\ | |
116 | move.d r11,r2 \n\ | |
117 | move.d r11,r3 \n\ | |
118 | move.d r11,r4 \n\ | |
119 | move.d r11,r5 \n\ | |
120 | move.d r11,r6 \n\ | |
121 | move.d r11,r7 \n\ | |
122 | move.d r11,r8 \n\ | |
123 | move.d r11,r9 \n\ | |
124 | move.d r11,r10 \n\ | |
2e2cd8ba | 125 | \n\ |
77a746ce JN |
126 | ;; Now we've got this: \n\ |
127 | ;; r13 - dst \n\ | |
128 | ;; r12 - n \n\ | |
2e2cd8ba | 129 | \n\ |
77a746ce JN |
130 | ;; Update n for the first loop \n\ |
131 | subq 12*4,r12 \n\ | |
2e2cd8ba | 132 | 0: \n\ |
77a746ce JN |
133 | " |
134 | #ifdef __arch_common_v10_v32 | |
135 | /* Cater to branch offset difference between v32 and v10. We | |
136 | assume the branch below has an 8-bit offset. */ | |
137 | " setf\n" | |
138 | #endif | |
139 | " subq 12*4,r12 \n\ | |
140 | bge 0b \n\ | |
141 | movem r11,[r13+] \n\ | |
2e2cd8ba | 142 | \n\ |
77a746ce JN |
143 | ;; Compensate for last loop underflowing n. \n\ |
144 | addq 12*4,r12 \n\ | |
2e2cd8ba | 145 | \n\ |
77a746ce JN |
146 | ;; Restore registers from stack. \n\ |
147 | movem [sp+],r10" | |
1da177e4 | 148 | |
77a746ce JN |
149 | /* Outputs: the advanced dst and the remaining n. */ |
150 | : "=r" (dst), "=r" (n) | |
2e2cd8ba | 151 | |
77a746ce JN |
152 | /* Inputs: dst and n are tied to the outputs above; lc is the fill word. */ |
153 | : "0" (dst), "1" (n), "r" (lc)); | |
154 | } | |
155 | ||
156 | /* An ad-hoc unroll, used for the remaining 4*12-1..16 bytes: four dword stores per iteration. */ | |
157 | while (n >= 16) | |
158 | { | |
159 | *(long *) dst = lc; dst += 4; | |
160 | *(long *) dst = lc; dst += 4; | |
161 | *(long *) dst = lc; dst += 4; | |
162 | *(long *) dst = lc; dst += 4; | |
163 | n -= 16; | |
164 | } | |
1da177e4 | 165 | |
1da177e4 | 166 | switch (n) /* Fewer than 16 bytes remain here; each case stores exactly n bytes using dword, short and byte writes. */
77a746ce | 167 | {
1da177e4 LT |
168 | case 0:
169 | break; | |
77a746ce | 170 | |
1da177e4 | 171 | case 1:
77a746ce | 172 | *dst = (char) lc;
1da177e4 | 173 | break;
77a746ce | 174 | |
1da177e4 | 175 | case 2:
77a746ce | 176 | *(short *) dst = (short) lc;
1da177e4 | 177 | break;
77a746ce | 178 | |
1da177e4 | 179 | case 3:
77a746ce JN |
180 | *(short *) dst = (short) lc; dst += 2;
181 | *dst = (char) lc; | |
1da177e4 | 182 | break;
77a746ce | 183 | |
1da177e4 | 184 | case 4:
77a746ce | 185 | *(long *) dst = lc;
1da177e4 | 186 | break;
77a746ce | 187 | |
1da177e4 | 188 | case 5:
77a746ce JN |
189 | *(long *) dst = lc; dst += 4;
190 | *dst = (char) lc; | |
1da177e4 | 191 | break;
77a746ce | 192 | |
1da177e4 | 193 | case 6:
77a746ce JN |
194 | *(long *) dst = lc; dst += 4;
195 | *(short *) dst = (short) lc; | |
1da177e4 | 196 | break;
77a746ce | 197 | |
1da177e4 | 198 | case 7:
77a746ce JN |
199 | *(long *) dst = lc; dst += 4;
200 | *(short *) dst = (short) lc; dst += 2; | |
201 | *dst = (char) lc; | |
1da177e4 | 202 | break;
77a746ce | 203 | |
1da177e4 | 204 | case 8:
77a746ce JN |
205 | *(long *) dst = lc; dst += 4;
206 | *(long *) dst = lc; | |
1da177e4 | 207 | break;
77a746ce | 208 | |
1da177e4 | 209 | case 9:
77a746ce JN |
210 | *(long *) dst = lc; dst += 4;
211 | *(long *) dst = lc; dst += 4; | |
212 | *dst = (char) lc; | |
1da177e4 | 213 | break;
77a746ce | 214 | |
1da177e4 | 215 | case 10:
77a746ce JN |
216 | *(long *) dst = lc; dst += 4;
217 | *(long *) dst = lc; dst += 4; | |
218 | *(short *) dst = (short) lc; | |
1da177e4 | 219 | break;
77a746ce | 220 | |
1da177e4 | 221 | case 11:
77a746ce JN |
222 | *(long *) dst = lc; dst += 4;
223 | *(long *) dst = lc; dst += 4; | |
224 | *(short *) dst = (short) lc; dst += 2; | |
225 | *dst = (char) lc; | |
1da177e4 | 226 | break;
77a746ce | 227 | |
1da177e4 | 228 | case 12:
77a746ce JN |
229 | *(long *) dst = lc; dst += 4;
230 | *(long *) dst = lc; dst += 4; | |
231 | *(long *) dst = lc; | |
1da177e4 | 232 | break;
77a746ce | 233 | |
1da177e4 | 234 | case 13:
77a746ce JN |
235 | *(long *) dst = lc; dst += 4;
236 | *(long *) dst = lc; dst += 4; | |
237 | *(long *) dst = lc; dst += 4; | |
238 | *dst = (char) lc; | |
1da177e4 | 239 | break;
77a746ce | 240 | |
1da177e4 | 241 | case 14:
77a746ce JN |
242 | *(long *) dst = lc; dst += 4;
243 | *(long *) dst = lc; dst += 4; | |
244 | *(long *) dst = lc; dst += 4; | |
245 | *(short *) dst = (short) lc; | |
1da177e4 | 246 | break;
77a746ce | 247 | |
1da177e4 | 248 | case 15:
77a746ce JN |
249 | *(long *) dst = lc; dst += 4;
250 | *(long *) dst = lc; dst += 4; | |
251 | *(long *) dst = lc; dst += 4; | |
252 | *(short *) dst = (short) lc; dst += 2; | |
253 | *dst = (char) lc; | |
1da177e4 | 254 | break;
77a746ce | 255 | }
1da177e4 LT |
256 | }
257 | ||
77a746ce JN |
258 | return return_dst;
259 | }