Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * This routine clears to zero a linear memory buffer in user space. | |
3 | * | |
4 | * Inputs: | |
5 | * in0: address of buffer | |
6 | * in1: length of buffer in bytes | |
7 | * Outputs: | |
8 | * r8: number of bytes that didn't get cleared due to a fault | |
9 | * | |
10 | * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co | |
11 | * Stephane Eranian <eranian@hpl.hp.com> | |
12 | */ | |
13 | ||
14 | #include <asm/asmmacro.h> | |
15 | ||
16 | // | |
17 | // arguments | |
18 | // | |
19 | #define buf r32 | |
20 | #define len r33 | |
21 | ||
22 | // | |
23 | // local registers | |
24 | // | |
25 | #define cnt r16 | |
26 | #define buf2 r17 | |
27 | #define saved_lc r18 | |
28 | #define saved_pfs r19 | |
29 | #define tmp r20 | |
30 | #define len2 r21 | |
31 | #define len3 r22 | |
32 | ||
33 | // | |
34 | // Theory of operations: | |
35 | // - we check whether or not the buffer is small, i.e., less than 17 | |
36 | // in which case we do the byte by byte loop. | |
37 | // | |
38 | // - Otherwise we go progressively from 1 byte store to 8byte store in | |
39 | // the head part, the body is a 16byte store loop and we finish with the | |
40 | // tail for the last 15 bytes. | |
41 | // The good point about this breakdown is that the long buffer handling | |
42 | // contains only 2 branches. | |
43 | // | |
44 | // The reason for not using shifting & masking for both the head and the | |
45 | // tail is to stay semantically correct. This routine is not supposed | |
46 | // to write bytes outside of the buffer. While most of the time this would | |
47 | // be ok, we can't tolerate a mistake. A classical example is the case | |
48 | // of multithreaded code where the extra bytes touched are actually owned | |
49 | // by another thread which runs concurrently with ours. Another, less likely, | |
50 | // example is with device drivers where reading an I/O mapped location may | |
51 | // have side effects (same thing for writing). | |
52 | // | |
53 | ||
//
// __do_clear_user(buf, len): zero `len' bytes starting at user address `buf'.
//
// In:	in0 (buf) = address of buffer
//	in1 (len) = length of buffer in bytes
// Out:	r8 (ret0) = 0 on success, otherwise the number of bytes that did
//		    not get cleared because a store faulted
// Scratch: cnt, buf2, tmp, len2, len3 (r16-r17, r20-r22), p6, p7
// Preserved: ar.lc is saved in saved_lc on entry and written back on
//	      every return path taken after it has been modified.
//
// Every store that touches the user buffer is wrapped in EX(label, insn)
// (macro from <asm/asmmacro.h>); as described by the error handling
// comments below, a faulting store continues at `label', where the
// remaining byte count is turned into the return value.
//
GLOBAL_ENTRY(__do_clear_user)
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,2,0,0,0
	cmp.eq p6,p0=r0,len		// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc		// preserve ar.lc (slow)
	.body
	;;				// avoid WAW on CFM
	adds tmp=-1,len			// br.cloop is repeat/until: lc = iterations-1
	mov ret0=len			// return value is length at this point
(p6)	br.ret.spnt.many rp		// len == 0: done, ar.lc not touched yet
	;;
	cmp.lt p6,p0=16,len		// if len > 16 then long clear
	mov ar.lc=tmp			// initialize lc for small count
					// (executed on the long path as well)
(p6)	br.cond.dptk .long_do_clear
	;;				// WAR on ar.lc
	//
	// worst case 16 iterations, avg 8 iterations
	//
	// We could have played with the predicates to use the extra
	// M slot for 2 stores/iteration but the cost of initializing
	// the various counters compared to how long the loop is supposed
	// to last on average does not make this solution viable.
	//
1:
	EX( .Lexit1, st1 [buf]=r0,1 )
	adds len=-1,len			// countdown length using len
	br.cloop.dptk 1b
	;;				// avoid RAW on ar.lc
	//
	// .Lexit1: normal end of, or fault in, the byte by byte loop
	//	    len contains bytes left (0 when the loop ran to completion)
	//
.Lexit1:
	mov ret0=len			// faster than using ar.lc
	mov ar.lc=saved_lc
	br.ret.sptk.many rp		// end of short clear_user


	//
	// At this point we know we have more than 16 bytes to clear
	// so we focus on alignment (no branches required).
	// Each step stores 1, 2, 4 then 8 bytes if the matching address
	// bit is set, which leaves buf 16-byte aligned afterwards.
	//
	// The use of len/len2 for countdown of the number of bytes left
	// instead of ret0 is due to the fact that the exception code
	// changes the values of r8.
	//
.long_do_clear:
	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
	;;
	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
(p6)	adds len=-1,len;;		// sync because buf is modified
	tbit.nz p6,p0=buf,1
	;;
	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
(p6)	adds len=-2,len;;
	tbit.nz p6,p0=buf,2
	;;
	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
(p6)	adds len=-4,len;;
	tbit.nz p6,p0=buf,3
	;;
	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
(p6)	adds len=-8,len;;
	shr.u cnt=len,4				// number of 128-bit (2x64bit) words
	;;
	cmp.eq p6,p0=r0,cnt
	adds tmp=-1,cnt
(p6)	br.cond.dpnt .dotail			// we have less than 16 bytes left
	;;
	adds buf2=8,buf				// setup second base pointer
	mov ar.lc=tmp
	;;

	//
	// 16bytes/iteration core loop
	//
	// The second store can never generate a fault because
	// we come into the loop only when we are 16-byte aligned.
	// This means that if we cross a page then it will always be
	// in the first store and never in the second.
	//
	//
	// We need to keep track of the remaining length. A possible (optimistic)
	// way would be to use ar.lc and derive how many bytes were left by
	// doing: left = 16*ar.lc + 16. This would avoid the addition at
	// every iteration.
	// However we need to keep the synchronization point. A template
	// M;;MB does not exist and thus we can keep the addition at no
	// extra cycle cost (use a nop slot anyway). It also simplifies the
	// (unlikely) error recovery code.
	//

2:	EX(.Lexit3, st8 [buf]=r0,16 )
	;;				// needed to get len correct when error
	st8 [buf2]=r0,16
	adds len=-16,len
	br.cloop.dptk 2b
	;;				// len and ar.lc settled before the tail
	//
	// tail correction based on len only
	//
	// We alternate the use of len3,len2 to allow parallelism and correct
	// error handling. We also reuse p6/p7 to return correct value.
	// The addition of len2/len3 does not cost anything more compared to
	// the regular memset as we had empty slots.
	//
	// NOTE: this label is reached both by falling out of the core loop
	// and directly from the head when fewer than 16 bytes remain. In both
	// cases ar.lc no longer holds the caller's value (it was loaded right
	// before the branch to .long_do_clear), so it is restored in the
	// success epilogue below rather than only after the core loop.
	//
.dotail:
	mov len2=len			// for parallelization of error handling
	mov len3=len
	tbit.nz p6,p0=len,3
	;;
	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
(p6)	adds len3=-8,len2
	tbit.nz p7,p6=len,2
	;;
	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
(p7)	adds len2=-4,len3
	tbit.nz p6,p7=len,1
	;;
	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
(p6)	adds len3=-2,len2
	tbit.nz p7,p6=len,0
	;;
	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
	mov ret0=r0				// success
	mov ar.lc=saved_lc			// give the caller its ar.lc back
	br.ret.sptk.many rp			// end of most likely path

	//
	// Outlined error handling code
	//

	//
	// .Lexit2: comes from the tail stores
	//	if p6 -> coming from st8 or st2 : len2 contains what's left
	//	if p7 -> coming from st4 or st1 : len3 contains what's left
	//	Falls through to .Lexit3 to return len and restore ar.lc.
	//
.Lexit2:
	.pred.rel "mutex", p6, p7
(p6)	mov len=len2
(p7)	mov len=len3
	;;
	//
	// .Lexit3: comes from the head stores and from the core loop
	//	    len contains bytes left; ar.lc must be restored
	//
.Lexit3:
	mov ret0=len
	mov ar.lc=saved_lc
	br.ret.sptk.many rp
END(__do_clear_user)