Commit | Line | Data |
---|---|---|
086e9dc0 JH |
1 | ! Copyright (C) 2008-2012 Imagination Technologies Ltd. |
2 | ||
3 | .text | |
4 | .global _memcpy | |
5 | .type _memcpy,function | |
6 | ! _memcpy copies cnt bytes from src to dst and returns dst. D1Ar1 = dst (in) | |
7 | ! D0Ar2 = src (in), loaded into A1.2 below | |
8 | ! D1Ar3 = cnt (in), number of bytes to copy | |
9 | ! D0Re0 = dst (out), the original destination pointer | |
10 | _memcpy: | |
11 | CMP D1Ar3, #16 | |
12 | MOV A1.2, D0Ar2 ! A1.2 = running source pointer | |
13 | MOV A0.2, D1Ar1 ! A0.2 = running destination pointer | |
14 | MOV A0.3, D1Ar1 ! A0.3 = dst kept for the return value (D1Ar1 is reused as scratch later) | |
15 | ! If there are fewer than 16 bytes to copy use the byte copy loop (BGE acts on the CMP at the top) | |
16 | BGE $Llong_copy | |
17 | ||
18 | $Lbyte_copy: | |
19 | ! Simply copy a byte at a time: TXRPT = count - 1 (presumably the repeat count for BR); a negative result skips the loop | |
20 | SUBS TXRPT, D1Ar3, #1 | |
21 | BLT $Lend | |
22 | $Lloop_byte: | |
23 | GETB D1Re0, [A1.2++] | |
24 | SETB [A0.2++], D1Re0 | |
25 | BR $Lloop_byte | |
26 | ||
27 | $Lend: | |
28 | ! Finally set the return value (the original dst saved in A0.3) and return via D1RtP | |
29 | MOV D0Re0, A0.3 | |
30 | MOV PC, D1RtP | |
31 | ||
32 | $Llong_copy: | |
33 | ANDS D1Ar5, D1Ar1, #7 ! D1Ar5 = dst & 7; zero means dst is already 8 byte aligned | |
34 | BZ $Laligned_dst | |
35 | ||
36 | ! The destination address is not 8 byte aligned. We will copy bytes from | |
37 | ! the source to the destination until the remaining data has an 8 byte | |
38 | ! destination address alignment (i.e. we should never copy more than 7 | |
39 | ! bytes here). | |
40 | $Lalign_dst: | |
41 | GETB D0Re0, [A1.2++] | |
42 | ADD D1Ar5, D1Ar5, #1 ! dest is aligned when D1Ar5 reaches #8 | |
43 | SUB D1Ar3, D1Ar3, #1 ! decrement count of remaining bytes | |
44 | SETB [A0.2++], D0Re0 | |
45 | CMP D1Ar5, #8 | |
46 | BNE $Lalign_dst | |
47 | ||
48 | ! We have at least (16 - 7) = 9 bytes left to copy - calculate the number of 8 byte | |
49 | ! blocks, then jump to the unaligned copy loop or fall through to the aligned | |
50 | ! copy loop as appropriate. | |
51 | $Laligned_dst: | |
52 | MOV D0Ar4, A1.2 | |
53 | LSR D1Ar5, D1Ar3, #3 ! D1Ar5 = number of 8 byte blocks (loop count for the unaligned loops) | |
54 | ANDS D0Ar4, D0Ar4, #7 ! test source alignment | |
55 | BNZ $Lunaligned_copy ! if unaligned, use unaligned copy loop | |
56 | ||
57 | ! Both source and destination are 8 byte aligned - the easy case: copy 32 bytes per loop iteration. | |
58 | $Laligned_copy: | |
59 | LSRS D1Ar5, D1Ar3, #5 ! D1Ar5 = number of 32 byte blocks; if zero, byte copy everything | |
60 | BZ $Lbyte_copy | |
61 | SUB TXRPT, D1Ar5, #1 | |
62 | ||
63 | $Laligned_32: | |
64 | GETL D0Re0, D1Re0, [A1.2++] | |
65 | GETL D0Ar6, D1Ar5, [A1.2++] | |
66 | SETL [A0.2++], D0Re0, D1Re0 | |
67 | SETL [A0.2++], D0Ar6, D1Ar5 | |
68 | GETL D0Re0, D1Re0, [A1.2++] | |
69 | GETL D0Ar6, D1Ar5, [A1.2++] | |
70 | SETL [A0.2++], D0Re0, D1Re0 | |
71 | SETL [A0.2++], D0Ar6, D1Ar5 | |
72 | BR $Laligned_32 | |
73 | ||
74 | ! If there are any remaining bytes (count & 0x1f) use the byte copy loop, otherwise we are done | |
75 | ANDS D1Ar3, D1Ar3, #0x1f | |
76 | BNZ $Lbyte_copy | |
77 | B $Lend | |
78 | ||
79 | ! The destination is 8 byte aligned but the source is not, and there are 8 | |
80 | ! or more bytes to be copied. | |
81 | $Lunaligned_copy: | |
82 | ! Adjust the source pointer (A1.2) to the 8 byte boundary before its | |
83 | ! current value (D0Ar6 keeps the unadjusted address for the subtraction below) | |
84 | MOV D0Ar4, A1.2 | |
85 | MOV D0Ar6, A1.2 | |
86 | ANDMB D0Ar4, D0Ar4, #0xfff8 | |
87 | MOV A1.2, D0Ar4 | |
88 | ! Save the number of bytes of mis-alignment in D0Ar4 for use later (added back to A1.2 at $Lunaligned_end) | |
89 | SUBS D0Ar6, D0Ar6, D0Ar4 | |
90 | MOV D0Ar4, D0Ar6 | |
91 | ! If there is no mis-alignment after all, use the aligned copy loop (NOTE(review): in the code shown this path is only entered via the BNZ above, so this looks purely defensive) | |
92 | BZ $Laligned_copy | |
93 | ||
94 | ! Preload the 8 bytes at the rounded-down source address into D0Re0, D1Re0 | |
95 | GETL D0Re0, D1Re0, [A1.2] | |
96 | ||
97 | SUB TXRPT, D1Ar5, #1 | |
98 | ||
99 | ! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly | |
100 | ! 4 bytes, and more than 4 bytes. D0Ar6 holds the mis-alignment in bytes here. | |
101 | CMP D0Ar6, #4 | |
102 | BLT $Lunaligned_1_2_3 ! use 1-3 byte mis-alignment loop | |
103 | BZ $Lunaligned_4 ! use 4 byte mis-alignment loop | |
104 | ||
105 | ! The mis-alignment is more than 4 bytes | |
106 | $Lunaligned_5_6_7: | |
107 | SUB D0Ar6, D0Ar6, #4 | |
108 | ! Calculate the bit offsets required for the shift operations necessary | |
109 | ! to align the data. | |
110 | ! D0Ar6 = bit offset = 8 * (mis-alignment - 4), D1Ar5 = (32 - bit offset) | |
111 | MULW D0Ar6, D0Ar6, #8 | |
112 | MOV D1Ar5, #32 | |
113 | SUB D1Ar5, D1Ar5, D0Ar6 | |
114 | ! Move the preloaded data along by 4 bytes (D1Re0 -> D0Re0) before we enter the main loop | |
115 | MOV D0Re0, D1Re0 | |
116 | ||
117 | $Lloop_5_6_7: | |
118 | GETL D0Ar2, D1Ar1, [++A1.2] | |
119 | ! form 64-bit data in D0Re0, D1Re0 from the carried-over word in D0Re0 and the newly loaded D0Ar2, D1Ar1 | |
120 | LSR D0Re0, D0Re0, D0Ar6 | |
121 | MOV D1Re0, D0Ar2 | |
122 | LSL D1Re0, D1Re0, D1Ar5 | |
123 | ADD D0Re0, D0Re0, D1Re0 | |
124 | ||
125 | LSR D0Ar2, D0Ar2, D0Ar6 | |
126 | LSL D1Re0, D1Ar1, D1Ar5 | |
127 | ADD D1Re0, D1Re0, D0Ar2 | |
128 | ||
129 | SETL [A0.2++], D0Re0, D1Re0 | |
130 | MOV D0Re0, D1Ar1 | |
131 | BR $Lloop_5_6_7 | |
132 | ||
133 | B $Lunaligned_end | |
134 | ||
135 | $Lunaligned_1_2_3: | |
136 | ! Calculate the bit offsets required for the shift operations necessary | |
137 | ! to align the data. | |
138 | ! D0Ar6 = bit offset = 8 * mis-alignment, D1Ar5 = (32 - bit offset) | |
139 | MULW D0Ar6, D0Ar6, #8 | |
140 | MOV D1Ar5, #32 | |
141 | SUB D1Ar5, D1Ar5, D0Ar6 | |
142 | ||
143 | $Lloop_1_2_3: | |
144 | ! form 64-bit data in D0Re0,D1Re0 from the carried-over pair in D0Re0, D1Re0 and the next 8 bytes loaded below | |
145 | LSR D0Re0, D0Re0, D0Ar6 | |
146 | LSL D1Ar1, D1Re0, D1Ar5 | |
147 | ADD D0Re0, D0Re0, D1Ar1 | |
148 | MOV D0Ar2, D1Re0 | |
149 | LSR D0FrT, D0Ar2, D0Ar6 | |
150 | GETL D0Ar2, D1Ar1, [++A1.2] | |
151 | ||
152 | MOV D1Re0, D0Ar2 | |
153 | LSL D1Re0, D1Re0, D1Ar5 | |
154 | ADD D1Re0, D1Re0, D0FrT | |
155 | ||
156 | SETL [A0.2++], D0Re0, D1Re0 | |
157 | MOV D0Re0, D0Ar2 | |
158 | MOV D1Re0, D1Ar1 | |
159 | BR $Lloop_1_2_3 | |
160 | ||
161 | B $Lunaligned_end | |
162 | ||
163 | ! The 4 byte mis-alignment case - this does not require any shifting, just a | |
164 | ! shuffling of registers (D1Re0 -> D0Re0, then each new D0Ar2, D1Ar1 pair is split across two stores). | |
165 | $Lunaligned_4: | |
166 | MOV D0Re0, D1Re0 | |
167 | $Lloop_4: | |
168 | GETL D0Ar2, D1Ar1, [++A1.2] | |
169 | MOV D1Re0, D0Ar2 | |
170 | SETL [A0.2++], D0Re0, D1Re0 | |
171 | MOV D0Re0, D1Ar1 | |
172 | BR $Lloop_4 | |
173 | ||
174 | $Lunaligned_end: | |
175 | ! If there are no remaining bytes to copy (count & 7 is zero), we are done. | |
176 | ANDS D1Ar3, D1Ar3, #7 | |
177 | BZ $Lend | |
178 | ! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte | |
179 | ! address of the remaining bytes by adding the saved mis-alignment (D0Ar4), and branch to the byte copy loop. | |
180 | MOV D0Ar6, A1.2 | |
181 | ADD D1Ar5, D0Ar4, D0Ar6 | |
182 | MOV A1.2, D1Ar5 | |
183 | B $Lbyte_copy | |
184 | ||
185 | .size _memcpy,.-_memcpy |