Rewrite SPU overlay handling code. Put overlay calls stubs in the
[deliverable/binutils-gdb.git] / ld / emultempl / spu_ovl.S
1 /* Overlay manager for SPU.
2
3 Copyright 2006, 2007 Free Software Foundation, Inc.
4
5 This file is part of the GNU Binutils.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
20 MA 02110-1301, USA. */
21
22 /* MFC DMA defn's.
   MFC_GET_CMD is the command word written to $MFC_Cmd for every chunk
   of the overlay transfer.  MFC_MAX_DMA_SIZE is the largest chunk the
   transfer loop requests with a single command.  MFC_TAG_UPDATE_ALL is
   the value written to $MFC_WrTagUpdate before $MFC_RdTagStat is read.
   MFC_TAG_ID is the tag group written to $MFC_TagId for the transfer;
   it also selects the single bit placed in the tag mask while waiting
   (see the "#if MFC_TAG_ID < 16" test in __ovly_load_event). */
23 #define MFC_GET_CMD 0x40
24 #define MFC_MAX_DMA_SIZE 0x4000
25 #define MFC_TAG_UPDATE_ALL 2
26 #define MFC_TAG_ID 0
27
28 /* Register usage.
   Every name below is an alias for one of eight physical registers.
   Several aliases share one register; the code uses them at different
   times, so an alias is only meaningful between the instruction that
   writes it and the last instruction that reads it.

   $75-$79 (reserved1-reserved5) are written without being saved.
   $72-$74 (save1-save3) are stored to -16($sp), -32($sp) and -48($sp)
   before they are used as scratch, and reloaded from there afterwards. */
/* $75: table base pointers, the _ovly_table entry under examination,
   and the tag mask.  "parm" is the entry argument of the 8-byte stub
   variant of __ovly_load. */
29 #define reserved1 $75
30 #define parm $75
31 #define tab1 reserved1
32 #define tab2 reserved1
33 #define vma reserved1
34 #define oldvma reserved1
35 #define newmask reserved1
36 #define map reserved1
37
/* $76: table offsets, "present" flags, DMA chunk size and carry bits,
   and the tag status value. */
38 #define reserved2 $76
39 #define off1 reserved2
40 #define off2 reserved2
41 #define present1 reserved2
42 #define present2 reserved2
43 #define sz reserved2
44 #define cmp reserved2
45 #define add64 reserved2
46 #define cgbits reserved2
47 #define off3 reserved2
48 #define off4 reserved2
49 #define off5 reserved2
50 #define tagstat reserved2
51
/* $77: rotated copies of a table entry, the shuffle pattern used to
   build $lr, DMA address/command values, and table base pointers in
   __ovly_load_event. */
52 #define reserved3 $77
53 #define buf1 reserved3
54 #define buf2 reserved3
55 #define rv3 reserved3
56 #define ealo reserved3
57 #define cmd reserved3
58 #define off64 reserved3
59 #define tab3 reserved3
60 #define tab4 reserved3
61 #define tab5 reserved3
62
/* $78: the overlay index being switched to, plus short-lived
   temporaries. */
63 #define reserved4 $78
64 #define ovl reserved4
65 #define rv2 reserved4
66 #define rv5 reserved4
67 #define cgshuf reserved4
68 #define newovl reserved4
69
/* $79: the address finally branched to; it is not reused for anything
   else. */
70 #define reserved5 $79
71 #define target reserved5
72
/* $72: saved at -16($sp). */
73 #define save1 $72
74 #define rv4 save1
75 #define rv7 save1
76 #define tagid save1
77 #define maxsize save1
78 #define pbyte save1
79 #define pbit save1
80
/* $73: saved at -32($sp). */
81 #define save2 $73
82 #define cur save2
83 #define rv6 save2
84 #define osize save2
85 #define zovl save2
86 #define oldovl save2
87 #define newvma save2
88
/* $74: saved at -48($sp). */
89 #define save3 $74
90 #define rv1 save3
91 #define ea64 save3
92 #define buf3 save3
93 #define genwi save3
94 #define newmap save3
95 #define oldmask save3
96
97
/* Data used by the overlay manager.  It is emitted in .text, ahead of
   __ovly_return, and the code addresses each object as
   (object - __ovly_return) relative to a register holding the address
   of __ovly_return. */
98 .text
99 .align 4
/* __rv_pattern: shufb control quadword used by __ovly_load as
   "shufb rv4, rv1, cur, rv3".  Bytes 00-03 pick word 0 of the first
   source (the address of __ovly_return), bytes 10-13 pick word 0 of
   the second source (the previous __ovly_current), and the 0x80 bytes
   yield zero, giving the first two slots of the new $lr. */
100 .type __rv_pattern, @object
101 .size __rv_pattern, 16
102 __rv_pattern:
103 .word 0x00010203, 0x10111213, 0x80808080, 0x80808080
104
/* __cg_pattern: shufb control quadword used in __ovly_xfer_loop.  It
   copies word 1 of the carry vector produced by "cg" into word 0 and
   zeroes the other words, so that "addx" propagates the carry from the
   low word of the 64-bit address into the high word. */
105 .type __cg_pattern, @object
106 .size __cg_pattern, 16
107 __cg_pattern:
108 .word 0x04050607, 0x80808080, 0x80808080, 0x80808080
109
/* __ovly_current: one quadword, written with "stqd ovl"/"stqa ovl" by
   __ovly_return and __ovly_load, so its first word holds the index of
   the overlay most recently switched to. */
110 .type __ovly_current, @object
111 .size __ovly_current, 16
112 __ovly_current:
113 .space 16
114
115 /*
116 * __ovly_return - stub for returning from overlay functions.
117 *
118 * On entry the four slots of $lr are:
119 * __ovly_return, prev ovl index, caller return addr, undefined.
120 *
121 * Load the previous overlay and jump to the caller return address.
122 * Updates __ovly_current.
 *
 * Registers written: tab1/vma ($75), off1/present1 ($76), buf1 ($77),
 * ovl ($78) and target ($79).  save1-save3 ($72-$74) are only stored
 * to the stack here; if the overlay has to be loaded,
 * __ovly_load_event uses them as scratch and reloads them from those
 * stack slots.
 *
 * If the low bit of the .buf word of the previous overlay's
 * _ovly_table entry is set, the overlay is treated as already present
 * and control goes straight to the caller return address; otherwise
 * __ovly_load_event is entered with "vma" holding the table entry and
 * "target" holding the return address.
 *
 * NOTE(review): the trailing "# p,l c" annotations look like issue
 * pipeline, result latency and issue cycle, and the commented-out
 * #nop/#lnop lines look like markers for unused issue slots.  Confirm
 * against the SPU pipeline documentation before rescheduling.
123 */
124 .align 4
125 .global __ovly_return
126 .type __ovly_return, @function
127 __ovly_return:
/* tab1 is biased by one entry so that (index << 4) added to it yields
   _ovly_table + 16 * (index - 1). */
128 ila tab1, _ovly_table - 16 # 0,2 0
/* Shift $lr left by 4 and 8 bytes to bring slot 1 (previous overlay
   index) and slot 2 (caller return address) into the first slot. */
129 shlqbyi ovl, $lr, 4 # 1,4 0
130 #nop
131 shlqbyi target, $lr, 8 # 1,4 1
132 #nop; lnop
133 #nop; lnop
/* off1 = ovl * 16, the size of one _ovly_table entry. */
134 shli off1, ovl, 4 # 0,4 4
135 #lnop
136 #nop
/* Hint the "bi target" at ovly_ret9. */
137 hbr ovly_ret9, target # 1,15 5
138 #nop; lnop
139 #nop; lnop
140 #nop
/* vma = the whole 16-byte table entry for the previous overlay. */
141 lqx vma, tab1, off1 # 1,6 8
142 #nop; lnop
143 #nop; lnop
144 #nop; lnop
145 #nop; lnop
146 #nop; lnop
147 #nop
/* Rotate by 12 bytes so the fourth word of the entry (.buf) is in the
   first slot. */
148 rotqbyi buf1, vma, 12 # 1,4 14
149 #nop
/* Spill save1-save3 below $sp so __ovly_load_event may use them. */
150 stqd save3, -48($sp) # 1,6 15
151 #nop
152 stqd save2, -32($sp) # 1,6 16
153 #nop
154 stqd save1, -16($sp) # 1,6 17
/* present1 = low bit of .buf, the "already loaded" flag. */
155 andi present1, buf1, 1 # 0,2 18
/* Record the overlay being returned to in __ovly_current.  $lr's first
   slot holds the address of __ovly_return, so it serves as the base. */
156 stqd ovl, (__ovly_current - __ovly_return)($lr) # 1,6 18
157 #nop; lnop
158 #nop
/* Not present: go and load it. */
159 brz present1, __ovly_load_event # 1,4 20
160 ovly_ret9:
161 #nop
162 bi target # 1,4 21
163
164 /*
165 * __ovly_load - copy an overlay partition to local store.
166 *
167 * On entry $75 points to a word consisting of the overlay index in
168 * the top 14 bits, and the target address in the bottom 18 bits.
 * That describes the OVL_STUB_SIZE == 8 variant.  The other variant
 * reads "ovl" ($78) and "target" ($79) without computing them, so they
 * must already hold the overlay index and the target address on entry
 * (presumably set by the 16-byte call stub; confirm against the stub
 * generator).
169 *
170 * Sets up $lr to return via __ovly_return.
171 * Updates __ovly_current.
 *
 * If the first slot of $lr already equals the address of
 * __ovly_return, $lr is left unchanged.  Otherwise $lr becomes
 * { __ovly_return, previous __ovly_current, old $lr slot 0,
 * old $lr slot 1 }.
 *
 * save1-save3 ($72-$74) are stored below $sp, used as scratch and
 * reloaded before either branch at the end.  If the overlay's .buf low
 * bit is clear, __ovly_load_event is entered with "vma" holding the
 * _ovly_table entry and "target" holding the address to branch to.
172 */
173 .align 3
174 .global __ovly_load
175 .type __ovly_load, @function
176 __ovly_load:
177 #if OVL_STUB_SIZE == 8
178 ########
179 #nop
/* Fetch the quadword containing the word at parm, then rotate that
   word into the first slot. */
180 lqd target, 0(parm) # 1,6 -11
181 #nop; lnop
182 #nop; lnop
183 #nop; lnop
184 #nop; lnop
185 #nop; lnop
186 #nop
187 rotqby target, target, parm # 1,4 -5
/* tab2 is biased by one entry, so (index << 4) added to it yields
   _ovly_table + 16 * (index - 1). */
188 ila tab2, _ovly_table - 16 # 0,2 -4
189 stqd save3, -48($sp) # 1,6 -4
190 #nop
191 stqd save2, -32($sp) # 1,6 -3
192 #nop
193 stqd save1, -16($sp) # 1,6 -2
/* ovl = target >> 18, i.e. the top 14 bits of the stub word.  target
   keeps the full word and is used as the branch address as it is. */
194 rotmi ovl, target, -18 # 0,4 -1
195 hbr ovly_load9, target # 1,15 -1
196 ila rv1, __ovly_return # 0,2 0
197 #lnop
198 #nop; lnop
199 #nop
/* cur = overlay index that was current before this call; it is read
   before __ovly_current is overwritten with the new index. */
200 lqd cur, (__ovly_current - __ovly_return)(rv1) # 1,6 2
201 shli off2, ovl, 4 # 0,4 3
202 stqd ovl, (__ovly_current - __ovly_return)(rv1) # 1,6 3
/* rv2 = per-word compare of $lr against the address of __ovly_return. */
203 ceq rv2, $lr, rv1 # 0,2 4
204 lqd rv3, (__rv_pattern - __ovly_return)(rv1) # 1,6 4
205 #nop; lnop
206 #nop; lnop
207 #nop
/* vma = the whole 16-byte _ovly_table entry for the new overlay. */
208 lqx vma, tab2, off2 # 1,6 7
209 ########
210 #else /* OVL_STUB_SIZE == 16 */
211 ########
/* Same steps as the 8-byte variant, except that ovl and target are
   taken as given and __ovly_current is accessed with absolute
   lqa/stqa addressing. */
212 ila tab2, _ovly_table - 16 # 0,2 0
213 stqd save3, -48($sp) # 1,6 0
214 ila rv1, __ovly_return # 0,2 1
215 stqd save2, -32($sp) # 1,6 1
216 shli off2, ovl, 4 # 0,4 2
217 lqa cur, __ovly_current # 1,6 2
218 nop
219 stqa ovl, __ovly_current # 1,6 3
220 ceq rv2, $lr, rv1 # 0,2 4
221 lqd rv3, (__rv_pattern - __ovly_return)(rv1) # 1,6 4
222 #nop
223 hbr ovly_load9, target # 1,15 5
224 #nop
225 lqx vma, tab2, off2 # 1,6 6
226 #nop
227 stqd save1, -16($sp) # 1,6 7
228 ########
229 #endif
230
231 #nop; lnop
232 #nop; lnop
233 #nop
/* rv4 = { __ovly_return, cur, 0, 0 } via the __rv_pattern shuffle. */
234 shufb rv4, rv1, cur, rv3 # 1,4 10
235 #nop
/* Expand the compare result into a byte mask for the selb below. */
236 fsmb rv5, rv2 # 1,4 11
237 #nop
/* rv6 = $lr shifted right by 8 bytes: the old slots 0 and 1 move to
   slots 2 and 3, and slots 0 and 1 become zero. */
238 rotqmbyi rv6, $lr, -8 # 1,4 12
239 #nop
/* Bring the .buf word of the table entry into the first slot. */
240 rotqbyi buf2, vma, 12 # 1,4 13
241 #nop
/* The saved registers are reloaded as soon as the alias living in each
   one has been read for the last time. */
242 lqd save3, -48($sp) # 1,6 14
243 #nop; lnop
/* rv7 = candidate new $lr. */
244 or rv7, rv4, rv6 # 0,2 16
245 lqd save2, -32($sp) # 1,6 16
/* present2 = low bit of .buf, the "already loaded" flag. */
246 andi present2, buf2, 1 # 0,2 17
247 lnop # 1,0 17
/* Keep $lr where the mask is set (it already was __ovly_return),
   otherwise take rv7. */
248 selb $lr, rv7, $lr, rv5 # 0,2 18
249 lqd save1, -16($sp) # 1,6 18
250 #nop
/* Not present: go and load it. */
251 brz present2, __ovly_load_event # 1,4 19
252 ovly_load9:
253 #nop
254 bi target # 1,4 20
255
256 /* If we get here, we are about to load a new overlay.
257 * "vma" contains the relevant entry from _ovly_table[].
258 * extern struct {
259 * u32 vma;
260 * u32 size;
261 * u32 file_offset;
262 * u32 buf;
263 * } _ovly_table[];
 *
 * Also expected on entry, as set up by __ovly_return and __ovly_load:
 * "target" holds the address to branch to when done, the first slot of
 * $lr holds the address of __ovly_return (it is used as a base
 * register below), __ovly_current holds the index of the overlay being
 * loaded, and the caller's save1-save3 are stored at -16($sp),
 * -32($sp) and -48($sp).  Those three registers are used as scratch
 * here and reloaded from the stack before the final branch.
264 */
265 .align 3
266 .global __ovly_load_event
267 .type __ovly_load_event, @function
268 __ovly_load_event:
269 #nop
/* sz = .file_offset and osize = .size, each rotated to the first slot. */
270 rotqbyi sz, vma, 8 # 1,4 0
271 #nop
272 rotqbyi osize, vma, 4 # 1,4 1
273 #nop
/* ea64 = the quadword at _EAR_; its first two words are used as the
   high and low halves of the DMA effective address.
   NOTE(review): presumably the main-memory address of the overlay
   image; _EAR_ is defined outside this file. */
274 lqa ea64, _EAR_ # 1,6 2
275 #nop
276 lqd cgshuf, (__cg_pattern - __ovly_return)($lr) # 1,6 3
277
278 /* We could predict the branch at the end of this loop by adding a few
279 instructions, and there are plenty of free cycles to do so without
280 impacting loop execution time. However, it doesn't make a great
281 deal of sense since we need to wait for the dma to complete anyway. */
/* Each pass advances the 64-bit address by "sz" and queues one
   transfer of at most MFC_MAX_DMA_SIZE bytes.  On the first pass sz is
   the file offset; on later passes it is the size of the chunk just
   queued. */
282 __ovly_xfer_loop:
283 #nop
/* off64 = sz moved from word 0 to word 1, with word 0 zeroed: a 64-bit
   increment laid out as high:low in words 0:1. */
284 rotqmbyi off64, sz, -4 # 1,4 4
285 #nop; lnop
286 #nop; lnop
287 #nop; lnop
/* 64-bit add: generate the per-word carries, move the carry of the low
   word into the high word position, then add with carry-in. */
288 cg cgbits, ea64, off64 # 0,2 8
289 #lnop
290 #nop; lnop
291 #nop
292 shufb add64, cgbits, cgbits, cgshuf # 1,4 10
293 #nop; lnop
294 #nop; lnop
295 #nop; lnop
296 addx add64, ea64, off64 # 0,2 14
297 #lnop
298 ila maxsize, MFC_MAX_DMA_SIZE # 0,2 15
299 lnop
/* ea64 = updated address; ealo = its low word in the first slot. */
300 ori ea64, add64, 0 # 0,2 16
301 rotqbyi ealo, add64, 4 # 1,4 16
302 cgt cmp, osize, maxsize # 0,2 17
303 wrch $MFC_LSA, vma # 1,6 17
304 #nop; lnop
/* sz = osize > maxsize ? maxsize : osize. */
305 selb sz, osize, maxsize, cmp # 0,2 19
306 wrch $MFC_EAH, ea64 # 1,6 19
307 ila tagid, MFC_TAG_ID # 0,2 20
308 wrch $MFC_EAL, ealo # 1,6 20
309 ila cmd, MFC_GET_CMD # 0,2 21
310 wrch $MFC_Size, sz # 1,6 21
/* osize -= sz (bytes still to transfer); vma += sz (local store
   destination of the next chunk). */
311 sf osize, sz, osize # 0,2 22
312 wrch $MFC_TagId, tagid # 1,6 22
313 a vma, vma, sz # 0,2 23
/* Writing the command word queues the transfer. */
314 wrch $MFC_Cmd, cmd # 1,6 23
315 #nop
316 brnz osize, __ovly_xfer_loop # 1,4 24
317
318 /* Now update our data structures while waiting for DMA to complete.
319 Low bit of .buf needs to be cleared on the _ovly_table entry
320 corresponding to the evicted overlay, and set on the entry for the
321 newly loaded overlay. Note that no overlay may in fact be evicted
322 as _ovly_buf_table[] starts with all zeros. Don't zap .buf entry
323 for zero index! Also of course update the _ovly_buf_table entry. */
324 #nop
/* newovl = index of the overlay being loaded, as stored in
   __ovly_current by the caller. */
325 lqd newovl, (__ovly_current - __ovly_return)($lr) # 1,6 25
326 #nop; lnop
327 #nop; lnop
328 #nop; lnop
329 #nop; lnop
330 #nop; lnop
331 shli off3, newovl, 4 # 0,4 31
332 #lnop
333 ila tab3, _ovly_table - 16 # 0,2 32
334 #lnop
335 #nop
/* pbyte has only byte 15 set to 0xff; the andi below turns that into
   pbit = { 0, 0, 0, 1 }, i.e. the low bit of the fourth (.buf) word. */
336 fsmbi pbyte, 1 # 1,4 33
337 #nop; lnop
338 #nop
/* Re-read the table entry: vma was advanced by the transfer loop. */
339 lqx vma, tab3, off3 # 1,6 35
340 #nop; lnop
341 andi pbit, pbyte, 1 # 0,2 37
342 lnop
343 #nop; lnop
344 #nop; lnop
345 #nop; lnop
/* newvma = table entry with the .buf low bit set; buf3 = the .buf word
   (flag still clear) rotated into the first slot. */
346 or newvma, vma, pbit # 0,2 41
347 rotqbyi buf3, vma, 12 # 1,4 41
348 #nop; lnop
349 #nop
350 stqx newvma, tab3, off3 # 1,6 43
351 #nop; lnop
/* off4 = .buf * 4, the byte offset of this buffer's word relative to
   tab4.
   NOTE(review): unlike _ovly_table, the base below is not biased by
   one element; confirm that matches how .buf values are numbered. */
352 shli off4, buf3, 2 # 1,4 45
353 #lnop
354 ila tab4, _ovly_buf_table # 0,2 46
355 #lnop
356 #nop; lnop
357 #nop; lnop
358 #nop
/* map = quadword containing the buffer's word; genwi = shuffle control
   that inserts a word at that position. */
359 lqx map, tab4, off4 # 1,6 49
360 #nop
361 cwx genwi, tab4, off4 # 1,4 50
362 #nop; lnop
363 #nop; lnop
364 #nop; lnop
365 #nop; lnop
366 #nop
/* oldovl = overlay index previously recorded for this buffer.
   NOTE(review): the rotate count comes from off4 alone, so this relies
   on _ovly_buf_table being 16-byte aligned; confirm. */
367 rotqby oldovl, map, off4 # 1,4 55
368 nop
/* newmap = map with the buffer's word replaced by newovl. */
369 shufb newmap, newovl, map, genwi # 0,4 56
/* newmask = 1 << MFC_TAG_ID, built with whichever immediate form can
   express the bit. */
370 #if MFC_TAG_ID < 16
371 ila newmask, 1 << MFC_TAG_ID # 0,2 57
372 #else
373 ilhu newmask, 1 << (MFC_TAG_ID - 16) # 0,2 57
374 #endif
375 #lnop
376 #nop; lnop
377 #nop; lnop
378 stqx newmap, tab4, off4 # 1,6 60
379
380 /* Save app's tagmask, wait for DMA complete, restore mask. */
381 ila tagstat, MFC_TAG_UPDATE_ALL # 0,2 61
382 rdch oldmask, $MFC_RdTagMask # 1,6 61
383 #nop
384 wrch $MFC_WrTagMask, newmask # 1,6 62
385 #nop
386 wrch $MFC_WrTagUpdate, tagstat # 1,6 63
387 #nop
/* The value read is not used; the read is what waits for the tag
   group. */
388 rdch tagstat, $MFC_RdTagStat # 1,6 64
389 #nop
390 sync # 1,4 65
391 /* Any hint prior to the sync is lost. A hint here allows the branch
392 to complete 15 cycles after the hint. With no hint the branch will
393 take 18 or 19 cycles. */
394 ila tab5, _ovly_table - 16 # 0,2 66
395 hbr do_load99, target # 1,15 66
396 shli off5, oldovl, 4 # 0,4 67
397 wrch $MFC_WrTagMask, oldmask # 1,6 67
/* zovl = all ones when no overlay was evicted (oldovl == 0).  The fsm
   below spreads that result over all four words so that it reaches
   word 3, where the flag bit of pbit lives. */
398 ceqi zovl, oldovl, 0 # 0,2 68
399 #lnop
400 #nop; lnop
401 #nop
402 fsm zovl, zovl # 1,4 70
403 #nop
/* oldvma = table entry of the evicted overlay.  For oldovl == 0 this
   reads the 16 bytes just before _ovly_table. */
404 lqx oldvma, tab5, off5 # 1,6 71
405 #nop
406 lqd save3, -48($sp) # 1,6 72
407 #nop; lnop
/* Drop the flag bit from pbit when oldovl == 0, so the andc that
   follows leaves that entry unchanged. */
408 andc pbit, pbit, zovl # 0,2 74
409 lqd save2, -32($sp) # 1,6 74
410 #nop; lnop
411 #nop; lnop
/* Clear the .buf low bit of the evicted overlay's entry. */
412 andc oldvma, oldvma, pbit # 0,2 77
413 lqd save1, -16($sp) # 1,6 77
414 #nop; lnop
415 nop
416 stqx oldvma, tab5, off5 # 1,6 79
417 #nop; lnop
418
/* _ovly_debug_event is a global label on a single nop, reached after
   the tables have been updated and the saved registers reloaded.
   NOTE(review): presumably a place for a debugger to set a breakpoint;
   nothing in this file references it. */
419 .global _ovly_debug_event
420 .type _ovly_debug_event, @function
421 _ovly_debug_event:
422 nop
423 /* Branch to target address. */
424 do_load99:
425 bi target # 1,4 81
426
/* The size recorded for __ovly_load runs from its label to this point,
   so it also covers __ovly_load_event and _ovly_debug_event. */
427 .size __ovly_load, . - __ovly_load
This page took 0.045618 seconds and 5 git commands to generate.