powerpc: Explicit alignment for .data.cacheline_aligned
[deliverable/linux.git] / arch / powerpc / kernel / vector.S
CommitLineData
14cf11af 1#include <asm/ppc_asm.h>
b3b8dc6c 2#include <asm/reg.h>
14cf11af
PM
3
4/*
5 * The routines below are in assembler so we can closely control the
6 * usage of floating-point registers. These routines must be called
7 * with preempt disabled.
8 */
/*
 * Floating-point constants 0.0, 1.0 and 0.5 (fpzero, fpone, fphalf) and
 * the LDCONST(fr, name) macro that loads one of them into FP register fr.
 * Two layouts exist, selected by CONFIG_PPC32.
 */
9#ifdef CONFIG_PPC32
10 .data
11fpzero:
12 .long 0 /* 0.0 in single-precision FP */
13fpone:
14 .long 0x3f800000 /* 1.0 in single-precision FP */
15fphalf:
16 .long 0x3f000000 /* 0.5 in single-precision FP */
17
/*
 * CONFIG_PPC32 variant: forms the constant's address with lis/@ha + @l and
 * loads it as a single (lfs).  Clobbers r11.
 */
18#define LDCONST(fr, name) \
19 lis r11,name@ha; \
20 lfs fr,name@l(r11)
21#else
22
/* Non-PPC32 variant: the constants are 64-bit entries in the .toc section. */
23 .section ".toc","aw"
24fpzero:
25 .tc FD_0_0[TC],0 /* 0.0 */
26fpone:
27 .tc FD_3ff00000_0[TC],0x3ff0000000000000 /* 1.0 */
28fphalf:
29 .tc FD_3fe00000_0[TC],0x3fe0000000000000 /* 0.5 */
30
/*
 * Non-PPC32 variant: loads the constant as a double (lfd) using a
 * @toc offset from r2.  No scratch register is needed.
 */
31#define LDCONST(fr, name) \
32 lfd fr,name@toc(r2)
33#endif
34
35 .text
36/*
37 * Internal routine to enable floating point and set FPSCR to 0.
38 * Don't call it from C; it doesn't use the normal calling convention.
39 */
40fpenable:
/*
 * Reached with "bl fpenable" from the vector routines below, each of which
 * first copies its own return address into r12 (mflr r12).
 *
 * State left behind for fpdisable:
 *   r10       = MSR value on entry
 *   fr31      = FPSCR value on entry
 *   64-byte stack frame with fr31 at 8(r1), fr1 at 16(r1), fr0 at 24(r1)
 * Offsets 32..56 of the frame are used by callers that need to save
 * further FP registers (see vmaddfp, vnmsubfp, vrsqrtefp).
 * Clobbers r11; returns with fr1 = 0.0.
 */
41#ifdef CONFIG_PPC32
42 stwu r1,-64(r1) /* push a 64-byte frame (word store with update) */
43#else
44 stdu r1,-64(r1) /* push a 64-byte frame (doubleword store with update) */
45#endif
46 mfmsr r10 /* r10 = MSR on entry; fpdisable writes it back */
47 ori r11,r10,MSR_FP /* r11 = entry MSR with MSR_FP set */
48 mtmsr r11 /* enable floating point */
49 isync
50 stfd fr0,24(r1) /* save fr0, fr1, fr31 in the new frame; */
51 stfd fr1,16(r1) /* fpdisable reloads them from the same slots */
52 stfd fr31,8(r1)
53 LDCONST(fr1, fpzero) /* fr1 = 0.0 */
54 mffs fr31 /* fr31 = FPSCR on entry */
3a2c48cf 55 MTFSF_L(fr1) /* presumably writes fr1 (0.0) into FPSCR, matching "set FPSCR to 0" above; macro is defined outside this file -- confirm */
14cf11af
PM
56 blr /* back to the vector routine, LR still its bl return point */
57
58fpdisable:
/*
 * Common exit path: the vector routines end with "b fpdisable" rather than
 * returning themselves.  Expects the state fpenable set up (r10 = saved
 * MSR, fr31 = saved FPSCR, 64-byte frame at r1) and r12 = the vector
 * routine's return address.  Undoes fpenable and returns to that address.
 */
59 mtlr r12 /* LR = return address the routine stashed with mflr r12 */
3a2c48cf 60 MTFSF_L(fr31) /* presumably restores FPSCR from fr31 (captured by mffs in fpenable); macro is defined outside this file -- confirm */
14cf11af
PM
61 lfd fr31,8(r1) /* reload the FP registers fpenable saved */
62 lfd fr1,16(r1)
63 lfd fr0,24(r1)
64 mtmsr r10 /* restore the MSR saved by fpenable */
65 isync
66 addi r1,r1,64 /* pop the 64-byte frame */
67 blr /* return to the vector routine's caller */
68
69/*
70 * Vector add, floating point.
71 */
72_GLOBAL(vaddfp)
/*
 * r3 -> destination, r4 -> first source (a), r5 -> second source (b).
 * Computes dst[i] = a[i] + b[i] for four consecutive single-precision
 * elements.  Uses r0, r6, r12, CTR and (via fpenable) r10, r11.
 */
73 mflr r12 /* keep our return address; fpdisable returns through r12 */
74 bl fpenable
75 li r0,4
76 mtctr r0 /* CTR = 4 elements to process */
77 li r6,0 /* r6 = byte offset of the current element */
781: lfsx fr0,r4,r6 /* fr0 = a[i] */
79 lfsx fr1,r5,r6 /* fr1 = b[i] */
80 fadds fr0,fr0,fr1 /* fr0 = a[i] + b[i] */
81 stfsx fr0,r3,r6 /* dst[i] = fr0 */
82 addi r6,r6,4 /* advance to the next 4-byte element */
83 bdnz 1b /* loop until CTR reaches zero */
84 b fpdisable /* tail-branch: fpdisable cleans up and returns */
85
86/*
87 * Vector subtract, floating point.
88 */
89_GLOBAL(vsubfp)
/*
 * r3 -> destination, r4 -> first source (a), r5 -> second source (b).
 * Computes dst[i] = a[i] - b[i] for four consecutive single-precision
 * elements.  Uses r0, r6, r12, CTR and (via fpenable) r10, r11.
 */
90 mflr r12 /* keep our return address; fpdisable returns through r12 */
91 bl fpenable
92 li r0,4
93 mtctr r0 /* CTR = 4 elements to process */
94 li r6,0 /* r6 = byte offset of the current element */
951: lfsx fr0,r4,r6 /* fr0 = a[i] */
96 lfsx fr1,r5,r6 /* fr1 = b[i] */
97 fsubs fr0,fr0,fr1 /* fr0 = a[i] - b[i] */
98 stfsx fr0,r3,r6 /* dst[i] = fr0 */
99 addi r6,r6,4 /* advance to the next 4-byte element */
100 bdnz 1b /* loop until CTR reaches zero */
101 b fpdisable /* tail-branch: fpdisable cleans up and returns */
102
103/*
104 * Vector multiply and add, floating point.
105 */
106_GLOBAL(vmaddfp)
/*
 * r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
 * Computes dst[i] = a[i] * c[i] + b[i] for four consecutive
 * single-precision elements.  Uses r0, r7, r12, CTR and (via fpenable)
 * r10, r11.  fr2 is saved and restored here because fpenable only
 * saves fr0, fr1 and fr31.
 */
107 mflr r12 /* keep our return address; fpdisable returns through r12 */
108 bl fpenable
109 stfd fr2,32(r1) /* save fr2 in the frame pushed by fpenable */
110 li r0,4
111 mtctr r0 /* CTR = 4 elements to process */
112 li r7,0 /* r7 = byte offset of the current element */
1131: lfsx fr0,r4,r7 /* fr0 = a[i] */
114 lfsx fr1,r5,r7 /* fr1 = b[i] */
115 lfsx fr2,r6,r7 /* fr2 = c[i] */
116 fmadds fr0,fr0,fr2,fr1 /* fr0 = a[i] * c[i] + b[i] */
117 stfsx fr0,r3,r7 /* dst[i] = fr0 */
118 addi r7,r7,4 /* advance to the next 4-byte element */
119 bdnz 1b /* loop until CTR reaches zero */
120 lfd fr2,32(r1) /* restore fr2 before the frame is popped */
121 b fpdisable /* tail-branch: fpdisable cleans up and returns */
122
123/*
124 * Vector negative multiply and subtract, floating point.
125 */
126_GLOBAL(vnmsubfp)
/*
 * r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
 * Computes dst[i] = -(a[i] * c[i] - b[i]) for four consecutive
 * single-precision elements.  Uses r0, r7, r12, CTR and (via fpenable)
 * r10, r11.  fr2 is saved and restored here because fpenable only
 * saves fr0, fr1 and fr31.
 */
127 mflr r12 /* keep our return address; fpdisable returns through r12 */
128 bl fpenable
129 stfd fr2,32(r1) /* save fr2 in the frame pushed by fpenable */
130 li r0,4
131 mtctr r0 /* CTR = 4 elements to process */
132 li r7,0 /* r7 = byte offset of the current element */
1331: lfsx fr0,r4,r7 /* fr0 = a[i] */
134 lfsx fr1,r5,r7 /* fr1 = b[i] */
135 lfsx fr2,r6,r7 /* fr2 = c[i] */
136 fnmsubs fr0,fr0,fr2,fr1 /* fr0 = -(a[i] * c[i] - b[i]) */
137 stfsx fr0,r3,r7 /* dst[i] = fr0 */
138 addi r7,r7,4 /* advance to the next 4-byte element */
139 bdnz 1b /* loop until CTR reaches zero */
140 lfd fr2,32(r1) /* restore fr2 before the frame is popped */
141 b fpdisable /* tail-branch: fpdisable cleans up and returns */
142
143/*
144 * Vector reciprocal estimate. We just compute 1.0/x.
145 * r3 -> destination, r4 -> source.
146 */
147_GLOBAL(vrefp)
/*
 * Computes dst[i] = 1.0 / src[i] with a real divide (fdivs) for four
 * consecutive single-precision elements.  Uses r0, r6, r12, CTR and
 * (via fpenable / LDCONST) r10, r11.
 */
148 mflr r12 /* keep our return address; fpdisable returns through r12 */
149 bl fpenable
150 li r0,4
151 LDCONST(fr1, fpone) /* fr1 = 1.0, loop-invariant numerator */
152 mtctr r0 /* CTR = 4 elements to process */
153 li r6,0 /* r6 = byte offset of the current element */
1541: lfsx fr0,r4,r6 /* fr0 = src[i] */
155 fdivs fr0,fr1,fr0 /* fr0 = 1.0 / src[i] */
156 stfsx fr0,r3,r6 /* dst[i] = fr0 */
157 addi r6,r6,4 /* advance to the next 4-byte element */
158 bdnz 1b /* loop until CTR reaches zero */
159 b fpdisable /* tail-branch: fpdisable cleans up and returns */
160
161/*
162 * Vector reciprocal square-root estimate, floating point.
163 * We use the frsqrte instruction for the initial estimate followed
164 * by 2 iterations of Newton-Raphson to get sufficient accuracy.
165 * r3 -> destination, r4 -> source.
166 */
167_GLOBAL(vrsqrtefp)
/*
 * Processes four consecutive single-precision elements.  For each source
 * value s it takes r = frsqrte(s) and refines it twice with
 *     r = r + 0.5 * r * (1 - s * r * r)
 * before storing r to the destination.
 * Uses r0, r6, r12, CTR and (via fpenable / LDCONST) r10, r11.
 * fr2-fr5 are saved and restored here because fpenable only saves
 * fr0, fr1 and fr31.
 */
168 mflr r12 /* keep our return address; fpdisable returns through r12 */
169 bl fpenable
170 stfd fr2,32(r1) /* save fr2-fr5 in the frame pushed by fpenable */
171 stfd fr3,40(r1)
172 stfd fr4,48(r1)
173 stfd fr5,56(r1)
174 li r0,4
175 LDCONST(fr4, fpone) /* fr4 = 1.0 (loop-invariant) */
176 LDCONST(fr5, fphalf) /* fr5 = 0.5 (loop-invariant) */
177 mtctr r0 /* CTR = 4 elements to process */
178 li r6,0 /* r6 = byte offset of the current element */
1791: lfsx fr0,r4,r6 /* fr0 = s = src[i] */
180 frsqrte fr1,fr0 /* r = frsqrte(s) */
181 fmuls fr3,fr1,fr0 /* r * s */
182 fmuls fr2,fr1,fr5 /* r * 0.5 */
183 fnmsubs fr3,fr1,fr3,fr4 /* 1 - s * r * r */
184 fmadds fr1,fr2,fr3,fr1 /* r = r + 0.5 * r * (1 - s * r * r) */
185 fmuls fr3,fr1,fr0 /* r * s */
186 fmuls fr2,fr1,fr5 /* r * 0.5 */
187 fnmsubs fr3,fr1,fr3,fr4 /* 1 - s * r * r */
188 fmadds fr1,fr2,fr3,fr1 /* r = r + 0.5 * r * (1 - s * r * r) */
189 stfsx fr1,r3,r6 /* dst[i] = refined r */
190 addi r6,r6,4 /* advance to the next 4-byte element */
191 bdnz 1b /* loop until CTR reaches zero */
192 lfd fr5,56(r1) /* restore fr2-fr5 before the frame is popped */
193 lfd fr4,48(r1)
194 lfd fr3,40(r1)
195 lfd fr2,32(r1)
196 b fpdisable /* tail-branch: fpdisable cleans up and returns */
This page took 0.349756 seconds and 5 git commands to generate.