// Complex multiply-accumulate of four neighbour half-spinors by four gauge
// links: UChi_{0..3}* += U(g0..g3) * Chi_{0..3}*.  Each colour column first
// shuffles the spinor (VSHUF), multiplies by the duplicated imaginary parts
// of the link (VMADDSUBIDUP), then by the duplicated real parts (VMADDSUBRDUP).
#define MULT_ADD_LS(g0,g1,g2,g3) \
  asm ( "movq %0, %%r8 \n\t" \
        "movq %1, %%r9 \n\t" \
        "movq %2, %%r10 \n\t" \
        "movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" ); \
  __asm__ ( \
    VSHUF(Chi_00,T0)   VSHUF(Chi_10,T1) \
    VSHUF(Chi_20,T2)   VSHUF(Chi_30,T3) \
    VMADDSUBIDUP(0,%r8,T0,UChi_00)  VMADDSUBIDUP(0,%r9,T1,UChi_10) \
    VMADDSUBIDUP(3,%r8,T0,UChi_01)  VMADDSUBIDUP(3,%r9,T1,UChi_11) \
    VMADDSUBIDUP(6,%r8,T0,UChi_02)  VMADDSUBIDUP(6,%r9,T1,UChi_12) \
    VMADDSUBIDUP(0,%r10,T2,UChi_20) VMADDSUBIDUP(0,%r11,T3,UChi_30) \
    VMADDSUBIDUP(3,%r10,T2,UChi_21) VMADDSUBIDUP(3,%r11,T3,UChi_31) \
    VMADDSUBIDUP(6,%r10,T2,UChi_22) VMADDSUBIDUP(6,%r11,T3,UChi_32) \
    VMADDSUBRDUP(0,%r8,Chi_00,UChi_00)  VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
    VMADDSUBRDUP(3,%r8,Chi_00,UChi_01)  VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
    VMADDSUBRDUP(6,%r8,Chi_00,UChi_02)  VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
    VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
    VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
    VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
    VSHUF(Chi_01,T0)   VSHUF(Chi_11,T1) \
    VSHUF(Chi_21,T2)   VSHUF(Chi_31,T3) \
    VMADDSUBIDUP(1,%r8,T0,UChi_00)  VMADDSUBIDUP(1,%r9,T1,UChi_10) \
    VMADDSUBIDUP(4,%r8,T0,UChi_01)  VMADDSUBIDUP(4,%r9,T1,UChi_11) \
    VMADDSUBIDUP(7,%r8,T0,UChi_02)  VMADDSUBIDUP(7,%r9,T1,UChi_12) \
    VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \
    VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \
    VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \
    VMADDSUBRDUP(1,%r8,Chi_01,UChi_00)  VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
    VMADDSUBRDUP(4,%r8,Chi_01,UChi_01)  VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
    VMADDSUBRDUP(7,%r8,Chi_01,UChi_02)  VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
    VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
    VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
    VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
    VSHUF(Chi_02,T0)   VSHUF(Chi_12,T1) \
    VSHUF(Chi_22,T2)   VSHUF(Chi_32,T3) \
    VMADDSUBIDUP(2,%r8,T0,UChi_00)  VMADDSUBIDUP(2,%r9,T1,UChi_10) \
    VMADDSUBIDUP(5,%r8,T0,UChi_01)  VMADDSUBIDUP(5,%r9,T1,UChi_11) \
    VMADDSUBIDUP(8,%r8,T0,UChi_02)  VMADDSUBIDUP(8,%r9,T1,UChi_12) \
    VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \
    VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \
    VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \
    VMADDSUBRDUP(2,%r8,Chi_02,UChi_00)  VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
    VMADDSUBRDUP(5,%r8,Chi_02,UChi_01)  VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
    VMADDSUBRDUP(8,%r8,Chi_02,UChi_02)  VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
    VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
    VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
    VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
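
// MULT_LS: same structure as MULT_ADD_LS, but the first IDUP pass uses
// VMULIDUP so UChi_* is initialised rather than accumulated.  In both macros
// the IDUP pass multiplies the (im,re)-swapped spinor by the duplicated
// imaginary parts of the link, and the RDUP pass multiplies the unswapped
// spinor by the duplicated real parts; the addsub sign pattern combines the
// two passes into the complex product u*chi = (ur*cr - ui*ci) + i*(ur*ci + ui*cr).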
#define MULT_LS(g0,g1,g2,g3) \
  asm ( "movq %0, %%r8 \n\t" \
        "movq %1, %%r9 \n\t" \
        "movq %2, %%r10 \n\t" \
        "movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" ); \
  __asm__ ( \
    VSHUF(Chi_00,T0)   VSHUF(Chi_10,T1) \
    VSHUF(Chi_20,T2)   VSHUF(Chi_30,T3) \
    VMULIDUP(0,%r8,T0,UChi_00)  VMULIDUP(0,%r9,T1,UChi_10) \
    VMULIDUP(3,%r8,T0,UChi_01)  VMULIDUP(3,%r9,T1,UChi_11) \
    VMULIDUP(6,%r8,T0,UChi_02)  VMULIDUP(6,%r9,T1,UChi_12) \
    VMULIDUP(0,%r10,T2,UChi_20) VMULIDUP(0,%r11,T3,UChi_30) \
    VMULIDUP(3,%r10,T2,UChi_21) VMULIDUP(3,%r11,T3,UChi_31) \
    VMULIDUP(6,%r10,T2,UChi_22) VMULIDUP(6,%r11,T3,UChi_32) \
    VMADDSUBRDUP(0,%r8,Chi_00,UChi_00)  VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
    VMADDSUBRDUP(3,%r8,Chi_00,UChi_01)  VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
    VMADDSUBRDUP(6,%r8,Chi_00,UChi_02)  VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
    VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
    VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
    VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
    VSHUF(Chi_01,T0)   VSHUF(Chi_11,T1) \
    VSHUF(Chi_21,T2)   VSHUF(Chi_31,T3) \
    VMADDSUBIDUP(1,%r8,T0,UChi_00)  VMADDSUBIDUP(1,%r9,T1,UChi_10) \
    VMADDSUBIDUP(4,%r8,T0,UChi_01)  VMADDSUBIDUP(4,%r9,T1,UChi_11) \
    VMADDSUBIDUP(7,%r8,T0,UChi_02)  VMADDSUBIDUP(7,%r9,T1,UChi_12) \
    VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \
    VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \
    VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \
    VMADDSUBRDUP(1,%r8,Chi_01,UChi_00)  VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
    VMADDSUBRDUP(4,%r8,Chi_01,UChi_01)  VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
    VMADDSUBRDUP(7,%r8,Chi_01,UChi_02)  VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
    VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
    VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
    VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
    VSHUF(Chi_02,T0)   VSHUF(Chi_12,T1) \
    VSHUF(Chi_22,T2)   VSHUF(Chi_32,T3) \
    VMADDSUBIDUP(2,%r8,T0,UChi_00)  VMADDSUBIDUP(2,%r9,T1,UChi_10) \
    VMADDSUBIDUP(5,%r8,T0,UChi_01)  VMADDSUBIDUP(5,%r9,T1,UChi_11) \
    VMADDSUBIDUP(8,%r8,T0,UChi_02)  VMADDSUBIDUP(8,%r9,T1,UChi_12) \
    VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \
    VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \
    VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \
    VMADDSUBRDUP(2,%r8,Chi_02,UChi_00)  VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
    VMADDSUBRDUP(5,%r8,Chi_02,UChi_01)  VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
    VMADDSUBRDUP(8,%r8,Chi_02,UChi_02)  VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
    VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
    VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
    VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
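
// XYZT variants act on two neighbours at a time (two link pointers in %r8 and
// %r9).  This form stages the duplicated link elements through registers
// Z0..Z5 and refreshes the shuffled spinors in T0/T1 once per colour column.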
#define MULT_ADD_XYZTa(g0,g1) \
  asm ( "movq %0, %%r8 \n\t" \
        "movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9");\
  __asm__ ( \
    VSHUF(Chi_00,T0) \
    VSHUF(Chi_10,T1) \
    VMOVIDUP(0,%r8,Z0 ) \
    VMOVIDUP(3,%r8,Z1 ) \
    VMOVIDUP(6,%r8,Z2 ) \
    VMADDSUB(Z0,T0,UChi_00) \
    VMADDSUB(Z1,T0,UChi_01) \
    VMADDSUB(Z2,T0,UChi_02) \
    VMOVIDUP(0,%r9,Z0 ) \
    VMOVIDUP(3,%r9,Z1 ) \
    VMOVIDUP(6,%r9,Z2 ) \
    VMADDSUB(Z0,T1,UChi_10) \
    VMADDSUB(Z1,T1,UChi_11) \
    VMADDSUB(Z2,T1,UChi_12) \
    VMOVRDUP(0,%r8,Z3 ) \
    VMOVRDUP(3,%r8,Z4 ) \
    VMOVRDUP(6,%r8,Z5 ) \
    VMADDSUB(Z3,Chi_00,UChi_00) \
    VMADDSUB(Z4,Chi_00,UChi_01) \
    VMADDSUB(Z5,Chi_00,UChi_02) \
    VMOVRDUP(0,%r9,Z3 ) \
    VMOVRDUP(3,%r9,Z4 ) \
    VMOVRDUP(6,%r9,Z5 ) \
    VMADDSUB(Z3,Chi_10,UChi_10) \
    VMADDSUB(Z4,Chi_10,UChi_11) \
    VMADDSUB(Z5,Chi_10,UChi_12) \
    VSHUF(Chi_01,T0) \
    VSHUF(Chi_11,T1) \
    VMOVIDUP(1,%r8,Z0 ) \
    VMOVIDUP(4,%r8,Z1 ) \
    VMOVIDUP(7,%r8,Z2 ) \
    VMADDSUB(Z0,T0,UChi_00) \
    VMADDSUB(Z1,T0,UChi_01) \
    VMADDSUB(Z2,T0,UChi_02) \
    VMOVIDUP(1,%r9,Z0 ) \
    VMOVIDUP(4,%r9,Z1 ) \
    VMOVIDUP(7,%r9,Z2 ) \
    VMADDSUB(Z0,T1,UChi_10) \
    VMADDSUB(Z1,T1,UChi_11) \
    VMADDSUB(Z2,T1,UChi_12) \
    VMOVRDUP(1,%r8,Z3 ) \
    VMOVRDUP(4,%r8,Z4 ) \
    VMOVRDUP(7,%r8,Z5 ) \
    VMADDSUB(Z3,Chi_01,UChi_00) \
    VMADDSUB(Z4,Chi_01,UChi_01) \
    VMADDSUB(Z5,Chi_01,UChi_02) \
    VMOVRDUP(1,%r9,Z3 ) \
    VMOVRDUP(4,%r9,Z4 ) \
    VMOVRDUP(7,%r9,Z5 ) \
    VMADDSUB(Z3,Chi_11,UChi_10) \
    VMADDSUB(Z4,Chi_11,UChi_11) \
    VMADDSUB(Z5,Chi_11,UChi_12) \
    VSHUF(Chi_02,T0) \
    VSHUF(Chi_12,T1) \
    VMOVIDUP(2,%r8,Z0 ) \
    VMOVIDUP(5,%r8,Z1 ) \
    VMOVIDUP(8,%r8,Z2 ) \
    VMADDSUB(Z0,T0,UChi_00) \
    VMADDSUB(Z1,T0,UChi_01) \
    VMADDSUB(Z2,T0,UChi_02) \
    VMOVIDUP(2,%r9,Z0 ) \
    VMOVIDUP(5,%r9,Z1 ) \
    VMOVIDUP(8,%r9,Z2 ) \
    VMADDSUB(Z0,T1,UChi_10) \
    VMADDSUB(Z1,T1,UChi_11) \
    VMADDSUB(Z2,T1,UChi_12) \
    VMOVRDUP(2,%r8,Z3 ) \
    VMOVRDUP(5,%r8,Z4 ) \
    VMOVRDUP(8,%r8,Z5 ) \
    VMADDSUB(Z3,Chi_02,UChi_00) \
    VMADDSUB(Z4,Chi_02,UChi_01) \
    VMADDSUB(Z5,Chi_02,UChi_02) \
    VMOVRDUP(2,%r9,Z3 ) \
    VMOVRDUP(5,%r9,Z4 ) \
    VMOVRDUP(8,%r9,Z5 ) \
    VMADDSUB(Z3,Chi_12,UChi_10) \
    VMADDSUB(Z4,Chi_12,UChi_11) \
    VMADDSUB(Z5,Chi_12,UChi_12) );
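
// Memory-operand form of the XYZT multiply-accumulate: VSHUFMEM/VMADDMEM/
// VMADDSUBMEM consume link elements straight from memory, and the final
// column is accumulated through Z1..Z6 before being added into UChi_*.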
#define MULT_ADD_XYZT(g0,g1) \
  asm ( "movq %0, %%r8 \n\t" \
        "movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9");\
  __asm__ ( \
    VSHUFMEM(0,%r8,Z00)  VSHUFMEM(0,%r9,Z10) \
    VRDUP(Chi_00,T0)     VIDUP(Chi_00,Chi_00) \
    VRDUP(Chi_10,T1)     VIDUP(Chi_10,Chi_10) \
    VMUL(Z00,Chi_00,Z1)  VMUL(Z10,Chi_10,Z2) \
    VSHUFMEM(3,%r8,Z00)  VSHUFMEM(3,%r9,Z10) \
    VMUL(Z00,Chi_00,Z3)  VMUL(Z10,Chi_10,Z4) \
    VSHUFMEM(6,%r8,Z00)  VSHUFMEM(6,%r9,Z10) \
    VMUL(Z00,Chi_00,Z5)  VMUL(Z10,Chi_10,Z6) \
    VMADDMEM(0,%r8,T0,UChi_00) VMADDMEM(0,%r9,T1,UChi_10) \
    VMADDMEM(3,%r8,T0,UChi_01) VMADDMEM(3,%r9,T1,UChi_11) \
    VMADDMEM(6,%r8,T0,UChi_02) VMADDMEM(6,%r9,T1,UChi_12) \
    VSHUFMEM(1,%r8,Z00)  VSHUFMEM(1,%r9,Z10) \
    VRDUP(Chi_01,T0)     VIDUP(Chi_01,Chi_01) \
    VRDUP(Chi_11,T1)     VIDUP(Chi_11,Chi_11) \
    VMADD(Z00,Chi_01,Z1) VMADD(Z10,Chi_11,Z2) \
    VSHUFMEM(4,%r8,Z00)  VSHUFMEM(4,%r9,Z10) \
    VMADD(Z00,Chi_01,Z3) VMADD(Z10,Chi_11,Z4) \
    VSHUFMEM(7,%r8,Z00)  VSHUFMEM(7,%r9,Z10) \
    VMADD(Z00,Chi_01,Z5) VMADD(Z10,Chi_11,Z6) \
    VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10) \
    VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11) \
    VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12) \
    VSHUFMEM(2,%r8,Z00)  VSHUFMEM(2,%r9,Z10) \
    VRDUP(Chi_02,T0)     VIDUP(Chi_02,Chi_02) \
    VRDUP(Chi_12,T1)     VIDUP(Chi_12,Chi_12) \
    VMADD(Z00,Chi_02,Z1) VMADD(Z10,Chi_12,Z2) \
    VSHUFMEM(5,%r8,Z00)  VSHUFMEM(5,%r9,Z10) \
    VMADD(Z00,Chi_02,Z3) VMADD(Z10,Chi_12,Z4) \
    VSHUFMEM(8,%r8,Z00)  VSHUFMEM(8,%r9,Z10) \
    VMADD(Z00,Chi_02,Z5) VMADD(Z10,Chi_12,Z6) \
    VMADDSUBMEM(2,%r8,T0,Z1) VMADDSUBMEM(2,%r9,T1,Z2) \
    VMADDSUBMEM(5,%r8,T0,Z3) VMADDSUBMEM(5,%r9,T1,Z4) \
    VMADDSUBMEM(8,%r8,T0,Z5) VMADDSUBMEM(8,%r9,T1,Z6) \
    VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
    VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
    VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) );
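
// Non-accumulating XYZT multiply (register-staged): the first colour column
// initialises UChi_* with VMUL, later columns accumulate with VMADDSUB.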
#define MULT_XYZT(g0,g1) \
  asm ( "movq %0, %%r8 \n\t" \
        "movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9" ); \
  __asm__ ( \
    VSHUF(Chi_00,T0) \
    VSHUF(Chi_10,T1) \
    VMOVIDUP(0,%r8,Z0 ) \
    VMOVIDUP(3,%r8,Z1 ) \
    VMOVIDUP(6,%r8,Z2 ) \
    VMUL(Z0,T0,UChi_00) \
    VMUL(Z1,T0,UChi_01) \
    VMUL(Z2,T0,UChi_02) \
    VMOVIDUP(0,%r9,Z0 ) \
    VMOVIDUP(3,%r9,Z1 ) \
    VMOVIDUP(6,%r9,Z2 ) \
    VMUL(Z0,T1,UChi_10) \
    VMUL(Z1,T1,UChi_11) \
    VMUL(Z2,T1,UChi_12) \
    VMOVRDUP(0,%r8,Z3 ) \
    VMOVRDUP(3,%r8,Z4 ) \
    VMOVRDUP(6,%r8,Z5 ) \
    VMADDSUB(Z3,Chi_00,UChi_00) \
    VMADDSUB(Z4,Chi_00,UChi_01) \
    VMADDSUB(Z5,Chi_00,UChi_02) \
    VMOVRDUP(0,%r9,Z3 ) \
    VMOVRDUP(3,%r9,Z4 ) \
    VMOVRDUP(6,%r9,Z5 ) \
    VMADDSUB(Z3,Chi_10,UChi_10) \
    VMADDSUB(Z4,Chi_10,UChi_11) \
    VMADDSUB(Z5,Chi_10,UChi_12) \
    VMOVIDUP(1,%r8,Z0 ) \
    VMOVIDUP(4,%r8,Z1 ) \
    VMOVIDUP(7,%r8,Z2 ) \
    VSHUF(Chi_01,T0) \
    VSHUF(Chi_11,T1) \
    VMADDSUB(Z0,T0,UChi_00) \
    VMADDSUB(Z1,T0,UChi_01) \
    VMADDSUB(Z2,T0,UChi_02) \
    VMOVIDUP(1,%r9,Z0 ) \
    VMOVIDUP(4,%r9,Z1 ) \
    VMOVIDUP(7,%r9,Z2 ) \
    VMADDSUB(Z0,T1,UChi_10) \
    VMADDSUB(Z1,T1,UChi_11) \
    VMADDSUB(Z2,T1,UChi_12) \
    VMOVRDUP(1,%r8,Z3 ) \
    VMOVRDUP(4,%r8,Z4 ) \
    VMOVRDUP(7,%r8,Z5 ) \
    VMADDSUB(Z3,Chi_01,UChi_00) \
    VMADDSUB(Z4,Chi_01,UChi_01) \
    VMADDSUB(Z5,Chi_01,UChi_02) \
    VMOVRDUP(1,%r9,Z3 ) \
    VMOVRDUP(4,%r9,Z4 ) \
    VMOVRDUP(7,%r9,Z5 ) \
    VMADDSUB(Z3,Chi_11,UChi_10) \
    VMADDSUB(Z4,Chi_11,UChi_11) \
    VMADDSUB(Z5,Chi_11,UChi_12) \
    VSHUF(Chi_02,T0) \
    VSHUF(Chi_12,T1) \
    VMOVIDUP(2,%r8,Z0 ) \
    VMOVIDUP(5,%r8,Z1 ) \
    VMOVIDUP(8,%r8,Z2 ) \
    VMADDSUB(Z0,T0,UChi_00) \
    VMADDSUB(Z1,T0,UChi_01) \
    VMADDSUB(Z2,T0,UChi_02) \
    VMOVIDUP(2,%r9,Z0 ) \
    VMOVIDUP(5,%r9,Z1 ) \
    VMOVIDUP(8,%r9,Z2 ) \
    VMADDSUB(Z0,T1,UChi_10) \
    VMADDSUB(Z1,T1,UChi_11) \
    VMADDSUB(Z2,T1,UChi_12) \
    VMOVRDUP(2,%r8,Z3 ) \
    VMOVRDUP(5,%r8,Z4 ) \
    VMOVRDUP(8,%r8,Z5 ) \
    VMADDSUB(Z3,Chi_02,UChi_00) \
    VMADDSUB(Z4,Chi_02,UChi_01) \
    VMADDSUB(Z5,Chi_02,UChi_02) \
    VMOVRDUP(2,%r9,Z3 ) \
    VMOVRDUP(5,%r9,Z4 ) \
    VMOVRDUP(8,%r9,Z5 ) \
    VMADDSUB(Z3,Chi_12,UChi_10) \
    VMADDSUB(Z4,Chi_12,UChi_11) \
    VMADDSUB(Z5,Chi_12,UChi_12) );
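
// Non-accumulating memory-operand XYZT multiply: mirrors MULT_ADD_XYZT but
// the first colour column initialises with VMUL/VMULMEM.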
#define MULT_XYZTa(g0,g1) \
  asm ( "movq %0, %%r8 \n\t" \
        "movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9" ); \
  __asm__ ( \
    VSHUFMEM(0,%r8,Z00)  VSHUFMEM(0,%r9,Z10) \
    VRDUP(Chi_00,T0)     VIDUP(Chi_00,Chi_00) \
    VRDUP(Chi_10,T1)     VIDUP(Chi_10,Chi_10) \
    VMUL(Z00,Chi_00,Z1)  VMUL(Z10,Chi_10,Z2) \
    VSHUFMEM(3,%r8,Z00)  VSHUFMEM(3,%r9,Z10) \
    VMUL(Z00,Chi_00,Z3)  VMUL(Z10,Chi_10,Z4) \
    VSHUFMEM(6,%r8,Z00)  VSHUFMEM(6,%r9,Z10) \
    VMUL(Z00,Chi_00,Z5)  VMUL(Z10,Chi_10,Z6) \
    VMULMEM(0,%r8,T0,UChi_00) VMULMEM(0,%r9,T1,UChi_10) \
    VMULMEM(3,%r8,T0,UChi_01) VMULMEM(3,%r9,T1,UChi_11) \
    VMULMEM(6,%r8,T0,UChi_02) VMULMEM(6,%r9,T1,UChi_12) \
    VSHUFMEM(1,%r8,Z00)  VSHUFMEM(1,%r9,Z10) \
    VRDUP(Chi_01,T0)     VIDUP(Chi_01,Chi_01) \
    VRDUP(Chi_11,T1)     VIDUP(Chi_11,Chi_11) \
    VMADD(Z00,Chi_01,Z1) VMADD(Z10,Chi_11,Z2) \
    VSHUFMEM(4,%r8,Z00)  VSHUFMEM(4,%r9,Z10) \
    VMADD(Z00,Chi_01,Z3) VMADD(Z10,Chi_11,Z4) \
    VSHUFMEM(7,%r8,Z00)  VSHUFMEM(7,%r9,Z10) \
    VMADD(Z00,Chi_01,Z5) VMADD(Z10,Chi_11,Z6) \
    VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10) \
    VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11) \
    VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12) \
    VSHUFMEM(2,%r8,Z00)  VSHUFMEM(2,%r9,Z10) \
    VRDUP(Chi_02,T0)     VIDUP(Chi_02,Chi_02) \
    VRDUP(Chi_12,T1)     VIDUP(Chi_12,Chi_12) \
    VMADD(Z00,Chi_02,Z1) VMADD(Z10,Chi_12,Z2) \
    VSHUFMEM(5,%r8,Z00)  VSHUFMEM(5,%r9,Z10) \
    VMADD(Z00,Chi_02,Z3) VMADD(Z10,Chi_12,Z4) \
    VSHUFMEM(8,%r8,Z00)  VSHUFMEM(8,%r9,Z10) \
    VMADD(Z00,Chi_02,Z5) VMADD(Z10,Chi_12,Z6) \
    VMADDSUBMEM(2,%r8,T0,Z1) VMADDSUBMEM(2,%r9,T1,Z2) \
    VMADDSUBMEM(5,%r8,T0,Z3) VMADDSUBMEM(5,%r9,T1,Z4) \
    VMADDSUBMEM(8,%r8,T0,Z5) VMADDSUBMEM(8,%r9,T1,Z6) \
    VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
    VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
    VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) );
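
// Load neighbour half-spinors into the pChi_* register sets: LOAD_CHI fetches
// four neighbours for the LS kernel, LOAD_CHIa two for the XYZT kernel.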
#define LOAD_CHI(a0,a1,a2,a3) \
  asm ( \
       "movq %0, %%r8 \n\t" \
       VLOAD(0,%%r8,pChi_00) \
       VLOAD(1,%%r8,pChi_01) \
       VLOAD(2,%%r8,pChi_02) \
       : : "r" (a0) : "%r8" ); \
  asm ( \
       "movq %0, %%r8 \n\t" \
       VLOAD(0,%%r8,pChi_10) \
       VLOAD(1,%%r8,pChi_11) \
       VLOAD(2,%%r8,pChi_12) \
       : : "r" (a1) : "%r8" ); \
  asm ( \
       "movq %0, %%r8 \n\t" \
       VLOAD(0,%%r8,pChi_20) \
       VLOAD(1,%%r8,pChi_21) \
       VLOAD(2,%%r8,pChi_22) \
       : : "r" (a2) : "%r8" ); \
  asm ( \
       "movq %0, %%r8 \n\t" \
       VLOAD(0,%%r8,pChi_30) \
       VLOAD(1,%%r8,pChi_31) \
       VLOAD(2,%%r8,pChi_32) \
       : : "r" (a3) : "%r8" );
#define LOAD_CHIa(a0,a1) \
  asm ( \
       "movq %0, %%r8 \n\t" \
       VLOAD(0,%%r8,pChi_00) \
       VLOAD(1,%%r8,pChi_01) \
       VLOAD(2,%%r8,pChi_02) \
       : : "r" (a0) : "%r8" ); \
  asm ( \
       "movq %0, %%r8 \n\t" \
       VLOAD(0,%%r8,pChi_10) \
       VLOAD(1,%%r8,pChi_11) \
       VLOAD(2,%%r8,pChi_12) \
       : : "r" (a1) : "%r8" );
518 "movq %0, %%r8 \n\t" \
522 : : "r" (a0) : "%r8" ); \
#define PF_GAUGE_XYZT(a0)
#define PF_GAUGE_XYZTa(a0) \
  asm ( \
       "movq %0, %%r8 \n\t" \
       VPREFETCH1(0,%%r8) \
       VPREFETCH1(1,%%r8) \
       VPREFETCH1(2,%%r8) \
       VPREFETCH1(3,%%r8) \
       VPREFETCH1(4,%%r8) \
       VPREFETCH1(5,%%r8) \
       VPREFETCH1(6,%%r8) \
       VPREFETCH1(7,%%r8) \
       VPREFETCH1(8,%%r8) \
       : : "r" (a0) : "%r8" );
#define PF_GAUGE_LS(a0)
#define PF_GAUGE_LSa(a0) \
  asm ( \
       "movq %0, %%r8 \n\t" \
       VPREFETCH1(0,%%r8) \
       VPREFETCH1(1,%%r8) \
       : : "r" (a0) : "%r8" );
#define REDUCE(out) \
  asm ( \
       VADD(UChi_00,UChi_10,UChi_00) \
       VADD(UChi_01,UChi_11,UChi_01) \
       VADD(UChi_02,UChi_12,UChi_02) \
       VADD(UChi_30,UChi_20,UChi_30) \
       VADD(UChi_31,UChi_21,UChi_31) \
       VADD(UChi_32,UChi_22,UChi_32) \
       VADD(UChi_00,UChi_30,UChi_00) \
       VADD(UChi_01,UChi_31,UChi_01) \
       VADD(UChi_02,UChi_32,UChi_02) ); \
  asm ( \
       VSTORE(0,%0,pUChi_00) \
       VSTORE(1,%0,pUChi_01) \
       VSTORE(2,%0,pUChi_02) \
       : : "r" (out) : "memory" );
#define nREDUCE(out) \
  asm ( \
       VADD(UChi_00,UChi_10,UChi_00) \
       VADD(UChi_01,UChi_11,UChi_01) \
       VADD(UChi_02,UChi_12,UChi_02) \
       VADD(UChi_30,UChi_20,UChi_30) \
       VADD(UChi_31,UChi_21,UChi_31) \
       VADD(UChi_32,UChi_22,UChi_32) \
       VADD(UChi_00,UChi_30,UChi_00) \
       VADD(UChi_01,UChi_31,UChi_01) \
       VADD(UChi_02,UChi_32,UChi_02) ); \
  asm ( \
       VSUB(UChi_00,Chi_00,UChi_00) \
       VSUB(UChi_01,Chi_00,UChi_01) \
       VSUB(UChi_02,Chi_00,UChi_02) ); \
  asm ( \
       VSTORE(0,%0,pUChi_00) \
       VSTORE(1,%0,pUChi_01) \
       VSTORE(2,%0,pUChi_02) \
       : : "r" (out) : "memory" );
#define REDUCEa(out) \
  asm ( \
       VADD(UChi_00,UChi_10,UChi_00) \
       VADD(UChi_01,UChi_11,UChi_01) \
       VADD(UChi_02,UChi_12,UChi_02) ); \
  asm ( \
       VSTORE(0,%0,pUChi_00) \
       VSTORE(1,%0,pUChi_01) \
       VSTORE(2,%0,pUChi_02) \
       : : "r" (out) : "memory" );
#define nREDUCEa(out) \
  asm ( \
       VADD(UChi_00,UChi_10,UChi_00) \
       VADD(UChi_01,UChi_11,UChi_01) \
       VADD(UChi_02,UChi_12,UChi_02) ); \
  asm ( \
       VSUB(UChi_00,Chi_00,UChi_00) \
       VSUB(UChi_01,Chi_00,UChi_01) \
       VSUB(UChi_02,Chi_00,UChi_02) ); \
  asm ( \
       VSTORE(0,%0,pUChi_00) \
       VSTORE(1,%0,pUChi_01) \
       VSTORE(2,%0,pUChi_02) \
       : : "r" (out) : "memory" );
#define PERMUTE_DIR(dir) \
  permute##dir(Chi_0,Chi_0); \
  permute##dir(Chi_1,Chi_1); \
  permute##dir(Chi_2,Chi_2);
                                         DoubledGaugeFieldView &U,
                                         DoubledGaugeFieldView &UUU,
                                         SiteSpinor *buf, int sF, int sU,
                                         const FermionFieldView &in, FermionFieldView &out,
                                         int dag)
#define CONDITIONAL_MOVE(l,o,out) { const SiteSpinor *ptr = l ? in_p : buf; out = (uint64_t) &ptr[o]; }
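
// Resolve the four stencil entries for one site: fetch offset/local/permute
// flags from the stencil, turn them into load addresses (CONDITIONAL_MOVE
// picks the interior field or the comms buffer), and record the four gauge
// pointers.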
#define PREPARE_XYZT(X,Y,Z,T,skew,UU) \
  PREPARE(X,Y,Z,T,skew,UU); \
  PF_GAUGE_XYZT(gauge0); \
  PF_GAUGE_XYZT(gauge1); \
  PF_GAUGE_XYZT(gauge2); \
  PF_GAUGE_XYZT(gauge3);
#define PREPARE_LS(X,Y,Z,T,skew,UU) \
  PREPARE(X,Y,Z,T,skew,UU); \
  PF_GAUGE_LS(gauge0); \
  PF_GAUGE_LS(gauge1); \
  PF_GAUGE_LS(gauge2); \
  PF_GAUGE_LS(gauge3);
#define PREPARE(X,Y,Z,T,skew,UU) \
  SE0=st.GetEntry(ptype,X+skew,sF); \
  o0 = SE0->_offset; \
  l0 = SE0->_is_local; \
  p0 = SE0->_permute; \
  CONDITIONAL_MOVE(l0,o0,addr0); \
  PF_CHI(addr0); \
  SE1=st.GetEntry(ptype,Y+skew,sF); \
  o1 = SE1->_offset; \
  l1 = SE1->_is_local; \
  p1 = SE1->_permute; \
  CONDITIONAL_MOVE(l1,o1,addr1); \
  PF_CHI(addr1); \
  SE2=st.GetEntry(ptype,Z+skew,sF); \
  o2 = SE2->_offset; \
  l2 = SE2->_is_local; \
  p2 = SE2->_permute; \
  CONDITIONAL_MOVE(l2,o2,addr2); \
  PF_CHI(addr2); \
  SE3=st.GetEntry(ptype,T+skew,sF); \
  o3 = SE3->_offset; \
  l3 = SE3->_is_local; \
  p3 = SE3->_permute; \
  CONDITIONAL_MOVE(l3,o3,addr3); \
  PF_CHI(addr3); \
  gauge0 =(uint64_t)&UU[sU]( X ); \
  gauge1 =(uint64_t)&UU[sU]( Y ); \
  gauge2 =(uint64_t)&UU[sU]( Z ); \
  gauge3 =(uint64_t)&UU[sU]( T );
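
// Kernel bodies (excerpts).  Each resolves the stencil with
// PREPARE/PREPARE_LS, loads and permutes the neighbour spinors, multiplies
// by the gauge links (MULT_LS / MULT_XYZT), and reduces into out[sF].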
                                         DoubledGaugeFieldView &U,
                                         DoubledGaugeFieldView &UUU,
                                         SiteSpinor *buf, int sF, int sU,
                                         const FermionFieldView &in, FermionFieldView &out,
                                         int dag)
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
  const SiteSpinor *in_p = &in[0];

  MULT_LS(gauge0,gauge1,gauge2,gauge3);

  addr0 = (uint64_t) &out[sF];
                                         DoubledGaugeFieldView &U,
                                         DoubledGaugeFieldView &UUU,
                                         SiteSpinor *buf, int sF, int sU,
                                         const FermionFieldView &in, FermionFieldView &out,
                                         int dag)
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
  const SiteSpinor *in_p = &in[0];

  MULT_LS(gauge0,gauge1,gauge2,gauge3);

  addr0 = (uint64_t) &out[sF];
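
// Assembly lane permutes for the XYZT kernel: DIR3/DIR1 act on the first
// neighbour pair (Chi_0*), DIR2/DIR0 on the second (Chi_1*).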
#define PERMUTE_DIR3 __asm__ ( \
  VPERM3(Chi_00,Chi_00) \
  VPERM3(Chi_01,Chi_01) \
  VPERM3(Chi_02,Chi_02) );

#define PERMUTE_DIR2 __asm__ ( \
  VPERM2(Chi_10,Chi_10) \
  VPERM2(Chi_11,Chi_11) \
  VPERM2(Chi_12,Chi_12) );

#define PERMUTE_DIR1 __asm__ ( \
  VPERM1(Chi_00,Chi_00) \
  VPERM1(Chi_01,Chi_01) \
  VPERM1(Chi_02,Chi_02) );

#define PERMUTE_DIR0 __asm__ ( \
  VPERM0(Chi_10,Chi_10) \
  VPERM0(Chi_11,Chi_11) \
  VPERM0(Chi_12,Chi_12) );
  if ( p0 ) { PERMUTE_DIR3; } \
  if ( p1 ) { PERMUTE_DIR2; }

  if ( p2 ) { PERMUTE_DIR1; } \
  if ( p3 ) { PERMUTE_DIR0; }
                                         DoubledGaugeFieldView &U,
                                         DoubledGaugeFieldView &UUU,
                                         SiteSpinor *buf, int sF, int sU,
                                         const FermionFieldView &in, FermionFieldView &out,
                                         int dag)
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
  const SiteSpinor *in_p = &in[0];

  addr0 = (uint64_t) &out[sF];
                                         DoubledGaugeFieldView &U,
                                         DoubledGaugeFieldView &UUU,
                                         SiteSpinor *buf, int sF, int sU,
                                         const FermionFieldView &in, FermionFieldView &out,
                                         int dag)
  uint64_t gauge0,gauge1,gauge2,gauge3;
  uint64_t addr0,addr1,addr2,addr3;
  const SiteSpinor *in_p = &in[0];

  addr0 = (uint64_t) &out[sF];