#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/reg.h>
#include <asm/asm-offsets.h>
#include <asm/cputable.h>
#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/ptrace.h>

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/* void do_load_up_transact_altivec(struct thread_struct *thread)
 *
 * This is similar to load_up_altivec but for the transactional version of the
 * vector regs.  It doesn't mess with the task MSR or valid flags.
 * Furthermore, VEC laziness is not supported with TM currently.
 */
_GLOBAL(do_load_up_transact_altivec)
	mfmsr	r6
	oris	r5,r6,MSR_VEC@h
	MTMSRD(r5)
	isync

	li	r4,1
	stw	r4,THREAD_USED_VR(r3)

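	/*
	 * Restore the VSCR image first, then VR0-VR31, all from the
	 * transactional save area at THREAD_TRANSACT_VRSTATE.
	 */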
	li	r10,THREAD_TRANSACT_VRSTATE+VRSTATE_VSCR
	lvx	v0,r10,r3
	mtvscr	v0
	addi	r10,r3,THREAD_TRANSACT_VRSTATE
	REST_32VRS(0,r4,r10)

	blr
#endif

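/*
 * Note: the VSCR can only be moved to/from a vector register (mtvscr/mfvscr),
 * so its in-memory copy lives in a full 16-byte slot of the vr_state at
 * offset VRSTATE_VSCR and is transferred with lvx/stvx via v0 below.
 */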
/*
 * Load state from memory into VMX registers including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(load_vr_state)
	li	r4,VRSTATE_VSCR
	lvx	v0,r4,r3
	mtvscr	v0
	REST_32VRS(0,r4,r3)
	blr

/*
 * Store VMX state into memory, including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(store_vr_state)
	SAVE_32VRS(0, r4, r3)
	mfvscr	v0
	li	r4, VRSTATE_VSCR
	stvx	v0, r4, r3
	blr

/*
 * Disable VMX for the task which had it previously,
 * and save its vector registers in its thread_struct.
 * Enables the VMX for use in the kernel on return.
 * On SMP we know the VMX is free, since we give it up every
 * switch (ie, no lazy save of the vector registers).
 *
 * Note that on 32-bit this can only use registers that will be
 * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
 */
_GLOBAL(load_up_altivec)
	mfmsr	r5			/* grab the current MSR */
	oris	r5,r5,MSR_VEC@h
	MTMSRD(r5)			/* enable use of AltiVec now */
	isync

	/*
	 * While userspace in general ignores VRSAVE, glibc uses it as a boolean
	 * to optimise userspace context save/restore. Whenever we take an
	 * altivec unavailable exception we must set VRSAVE to something non
	 * zero. Set it to all 1s. See also the programming note in the ISA.
	 */
	mfspr	r4,SPRN_VRSAVE
	cmpwi	0,r4,0
	bne+	1f
	li	r4,-1
	mtspr	SPRN_VRSAVE,r4
1:
	/* enable use of VMX after return */
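	/*
	 * Set MSR_VEC in the MSR image the interrupted context will resume
	 * with: on 32-bit that image is carried in r9 for the
	 * fast_exception_return path, on 64-bit it is written back to the
	 * exception frame at _MSR(r1).
	 */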
#ifdef CONFIG_PPC32
	mfspr	r5,SPRN_SPRG_THREAD	/* current task's THREAD (phys) */
	oris	r9,r9,MSR_VEC@h
#else
	ld	r4,PACACURRENT(r13)
	addi	r5,r4,THREAD		/* Get THREAD */
	oris	r12,r12,MSR_VEC@h
	std	r12,_MSR(r1)
#endif
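	/*
	 * Bump thread.load_vec; it is used as a heuristic by restore_math()
	 * when deciding whether to restore VMX state eagerly on return to
	 * userspace.
	 */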
	/* Don't care if r4 overflows, this is desired behaviour */
	lbz	r4,THREAD_LOAD_VEC(r5)
	addi	r4,r4,1
	stb	r4,THREAD_LOAD_VEC(r5)
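	/* Mark the thread as having used VMX, then reload VSCR and VR0-VR31 */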
	addi	r6,r5,THREAD_VRSTATE
	li	r4,1
	li	r10,VRSTATE_VSCR
	stw	r4,THREAD_USED_VR(r5)
	lvx	v0,r10,r6
	mtvscr	v0
	REST_32VRS(0,r4,r6)
	/* restore registers and return */
	blr

/*
 * save_altivec(tsk)
 * Save the vector registers to its thread_struct
 */
_GLOBAL(save_altivec)
	addi	r3,r3,THREAD		/* want THREAD of task */
	PPC_LL	r7,THREAD_VRSAVEAREA(r3)
	PPC_LL	r5,PT_REGS(r3)
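	/*
	 * Save to thread.vr_save_area if one is set (non-NULL), otherwise
	 * into thread.vr_state.
	 */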
	PPC_LCMPI	0,r7,0
	bne	2f
	addi	r7,r3,THREAD_VRSTATE
2:	SAVE_32VRS(0,r4,r7)
	mfvscr	v0
	li	r4,VRSTATE_VSCR
	stvx	v0,r4,r7
	blr

#ifdef CONFIG_VSX

#ifdef CONFIG_PPC32
#error This asm code isn't ready for 32-bit kernels
#endif

/*
 * load_up_vsx(unused, unused, tsk)
 * Disable VSX for the task which had it previously,
 * and save its vector registers in its thread_struct.
 * Reuse the fp and vsx saves, but first check to see if they have
 * been saved already.
 */
_GLOBAL(load_up_vsx)
/* Load FP and VSX registers if they haven't been done yet */
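/*
 * beql branches-and-links only when the preceding andi./andis. result is
 * zero, i.e. load_up_fpu/load_up_altivec are called only when the
 * corresponding MSR bit was clear and that state still needs loading.
 */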
	andi.	r5,r12,MSR_FP
	beql+	load_up_fpu		/* skip if already loaded */
	andis.	r5,r12,MSR_VEC@h
	beql+	load_up_altivec		/* skip if already loaded */

	ld	r4,PACACURRENT(r13)
	addi	r4,r4,THREAD		/* Get THREAD */
	li	r6,1
	stw	r6,THREAD_USED_VSR(r4)	/* ... also set thread used vsr */
	/* enable use of VSX after return */
	oris	r12,r12,MSR_VSX@h
	std	r12,_MSR(r1)
	b	fast_exception_return

#endif /* CONFIG_VSX */


/*
 * The routines below are in assembler so we can closely control the
 * usage of floating-point registers.  These routines must be called
 * with preempt disabled.
 */
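/*
 * The constants are kept as single-precision values in .data on 32-bit
 * (loaded with lis/lfs) and as double-precision TOC entries on 64-bit
 * (loaded with lfd); LDCONST() hides the difference.
 */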
#ifdef CONFIG_PPC32
	.data
fpzero:
	.long	0
fpone:
	.long	0x3f800000	/* 1.0 in single-precision FP */
fphalf:
	.long	0x3f000000	/* 0.5 in single-precision FP */

#define LDCONST(fr, name)	\
	lis	r11,name@ha;	\
	lfs	fr,name@l(r11)
#else

	.section ".toc","aw"
fpzero:
	.tc	FD_0_0[TC],0
fpone:
	.tc	FD_3ff00000_0[TC],0x3ff0000000000000	/* 1.0 */
fphalf:
	.tc	FD_3fe00000_0[TC],0x3fe0000000000000	/* 0.5 */

#define LDCONST(fr, name)	\
	lfd	fr,name@toc(r2)
#endif

	.text
/*
 * Internal routine to enable floating point and set FPSCR to 0.
 * Don't call it from C; it doesn't use the normal calling convention.
 */
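/*
 * fpenable saves fr0, fr1, fr31 and the caller's FPSCR in a small stack
 * frame and then clears the FPSCR, so the loops below run with default
 * rounding and no FP exceptions enabled; fpdisable restores them along
 * with the original MSR and returns via the LR value stashed in r12.
 */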
fpenable:
#ifdef CONFIG_PPC32
	stwu	r1,-64(r1)
#else
	stdu	r1,-64(r1)
#endif
	mfmsr	r10
	ori	r11,r10,MSR_FP
	mtmsr	r11
	isync
	stfd	fr0,24(r1)
	stfd	fr1,16(r1)
	stfd	fr31,8(r1)
	LDCONST(fr1, fpzero)
	mffs	fr31
	MTFSF_L(fr1)
	blr

fpdisable:
	mtlr	r12
	MTFSF_L(fr31)
	lfd	fr31,8(r1)
	lfd	fr1,16(r1)
	lfd	fr0,24(r1)
	mtmsr	r10
	isync
	addi	r1,r1,64
	blr

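/*
 * Each helper below gets the destination vector in r3 and the source
 * vector(s) in r4, r5 (and r6 for the multiply-add forms), and walks the
 * four 32-bit elements with scalar FP.  For vaddfp this is roughly
 *	for (i = 0; i < 4; i++) dst[i] = a[i] + b[i];
 */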
/*
 * Vector add, floating point.
 */
_GLOBAL(vaddfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fadds	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector subtract, floating point.
 */
_GLOBAL(vsubfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fsubs	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector multiply and add, floating point.
 */
_GLOBAL(vmaddfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
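	/* fmadds computes fr0*fr2 + fr1, i.e. vmaddfp's vA*vC + vB ordering */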
	fmadds	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector negative multiply and subtract, floating point.
 */
_GLOBAL(vnmsubfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
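	/* fnmsubs computes -(fr0*fr2 - fr1), i.e. vnmsubfp's -(vA*vC - vB) */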
	fnmsubs	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector reciprocal estimate.  We just compute 1.0/x.
 * r3 -> destination, r4 -> source.
 */
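/*
 * A full fdivs gives a correctly rounded result, far more accurate than
 * the estimate the architecture actually requires of vrefp.
 */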
_GLOBAL(vrefp)
	mflr	r12
	bl	fpenable
	li	r0,4
	LDCONST(fr1, fpone)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	fdivs	fr0,fr1,fr0
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector reciprocal square-root estimate, floating point.
 * We use the frsqrte instruction for the initial estimate followed
 * by 2 iterations of Newton-Raphson to get sufficient accuracy.
 * r3 -> destination, r4 -> source.
 */
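/*
 * With f(r) = 1/r^2 - s, one Newton-Raphson step is
 *	r' = r + (r/2) * (1 - s*r*r)
 * which is exactly the fmuls/fnmsubs/fmadds sequence in the loop below.
 * Each step roughly doubles the number of correct bits, so frsqrte's
 * initial estimate plus two steps comfortably meets the accuracy required
 * of the vrsqrtefp estimate.
 */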
_GLOBAL(vrsqrtefp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	stfd	fr3,40(r1)
	stfd	fr4,48(r1)
	stfd	fr5,56(r1)
	li	r0,4
	LDCONST(fr4, fpone)
	LDCONST(fr5, fphalf)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	frsqrte	fr1,fr0		/* r = frsqrte(s) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	stfsx	fr1,r3,r6
	addi	r6,r6,4
	bdnz	1b
	lfd	fr5,56(r1)
	lfd	fr4,48(r1)
	lfd	fr3,40(r1)
	lfd	fr2,32(r1)
	b	fpdisable