Grid 0.7.0
Intel512avx.h
Go to the documentation of this file.
1 /*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/simd/Avx512Asm.h
6
7 Copyright (C) 2015
8
9Author: paboyle <paboyle@ph.ed.ac.uk>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26 *************************************************************************************/
27 /* END LEGAL */
28#ifndef GRID_ASM_AV512_H
29#define GRID_ASM_AV512_H
30
32// Knights Landing specials
34
// ZLOADf / ZLOADd (single / double precision).
// Each expands to VLOAD*(OFF,PTR,ir) immediately followed by VSHUF*(ir,ri),
// so `ir` is the operand handed to the load and `ri` is then derived from
// `ir` by the shuffle.
// VLOAD* and VSHUF* are defined outside this section; presumably VSHUF*
// swaps the real/imaginary parts of each complex element -- TODO confirm.
35#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
36#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
37
// ZMULf / ZMULd (single / double precision).
// Expands to two VMUL* invocations that share operand B:
//   VMUL*(Ari,B,Criir) then VMUL*(Air,B,Ciirr).
// Ari/Air are the two forms of A (cf. the ri/ir outputs of ZLOAD*), and
// Criir/Ciirr receive the two partial products.
// VMUL* is defined outside this section; its operand order is not visible here.
38#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
39#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
40
// ZMADDf / ZMADDd (single / double precision).
// Same shape as ZMUL*, but built from VMADD*:
//   VMADD*(Ari,B,Criir) then VMADD*(Air,B,Ciirr).
// VMADD* is defined outside this section; presumably it accumulates into its
// last argument -- TODO confirm.
41#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
42#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
43
// ZENDf / ZENDd (single / double precision).
// Expands to ZEND1* followed by ZEND2* with the same three arguments.
// Both halves write Criir (under masks %k6 and %k7 respectively) and use tmp
// as scratch, so after ZEND* the combined result is in Criir.
44#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
45#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
46
// ZMULMEM2SPf: multiply one memory operand, addressed by (O,P), against two
// operands B and C (single precision).
// Emits, in this order:
//   VSHUFMEMf(O,P,tmp)      - shuffled copy of the memory operand into tmp
//   VMULMEMf(O,P,B,Biirr)   - memory operand with B -> Biirr
//   VMULMEMf(O,P,C,Ciirr)   - memory operand with C -> Ciirr
//   VMULf(tmp,B,Briir)      - shuffled operand with B -> Briir
//   VMULf(tmp,C,Criir)      - shuffled operand with C -> Criir
// The helper macros are defined outside this section; the "->" destinations
// above assume their last argument is the output -- TODO confirm.
47#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
48 VSHUFMEMf(O,P,tmp) \
49 VMULMEMf(O,P,B,Biirr) \
50 VMULMEMf(O,P,C,Ciirr) \
51 VMULf(tmp,B,Briir) \
52 VMULf(tmp,C,Criir)
53
// ZMULMEM2SPd: double-precision form of ZMULMEM2SPf.
// Emits VSHUFMEMd(O,P,tmp), then two VMULMEMd on the memory operand (with B
// and with C), then two VMULd on the shuffled copy in tmp (with B and with C).
// The helper macros are defined outside this section.
54#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
55 VSHUFMEMd(O,P,tmp) \
56 VMULMEMd(O,P,B,Biirr) \
57 VMULMEMd(O,P,C,Ciirr) \
58 VMULd(tmp,B,Briir) \
59 VMULd(tmp,C,Criir)
60
// ZMADDMEM2SPf: same five-step sequence as ZMULMEM2SPf, but using the
// multiply-add helpers (VMADDMEMf / VMADDf) in place of the plain multiplies.
// Emits VSHUFMEMf(O,P,tmp), two VMADDMEMf on the memory operand (with B and
// with C), then two VMADDf on the shuffled copy in tmp.
// The helper macros are defined outside this section; presumably the
// *iirr / *riir arguments are accumulators -- TODO confirm.
61#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
62 VSHUFMEMf(O,P,tmp) \
63 VMADDMEMf(O,P,B,Biirr) \
64 VMADDMEMf(O,P,C,Ciirr) \
65 VMADDf(tmp,B,Briir) \
66 VMADDf(tmp,C,Criir)
67
// ZMADDMEM2SPd: double-precision form of ZMADDMEM2SPf.
// Emits VSHUFMEMd(O,P,tmp), two VMADDMEMd on the memory operand (with B and
// with C), then two VMADDd on the shuffled copy in tmp.
// The helper macros are defined outside this section.
68#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
69 VSHUFMEMd(O,P,tmp) \
70 VMADDMEMd(O,P,B,Biirr) \
71 VMADDMEMd(O,P,C,Ciirr) \
72 VMADDd(tmp,B,Briir) \
73 VMADDd(tmp,C,Criir)
74
75// Merges accumulation for complex dot chain; less efficient under avx512
// Each macro below expands to a string literal holding two assembler
// statements; arguments are pasted in with the # (stringify) operator.
//
// ZEND1f: "vshufps $0xb1" of Criir with itself into tmp, then a "vaddps" of
//         tmp and Criir written back to Criir under write-mask %k6.
//         The Ciirr argument is accepted but not used.
76#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\
77 "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
78
// ZEND2f: "vshufps $0xb1" of Ciirr with itself into tmp, then a "vsubps" of
//         tmp and Ciirr written to Criir under write-mask %k7.
//         The contents of %k6 / %k7 are set up outside this section.
79#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\
80 "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"
81
// ZEND1d: double-precision counterpart of ZEND1f.
// Expands to a string literal holding two assembler statements:
//   vshufpd $0x55,Criir,Criir,tmp     - shuffled copy of Criir into tmp
//   vaddpd  tmp,Criir,Criir{%k6}      - add, written to Criir under mask %k6
// The add must be the packed-double form (vaddpd): the operands are produced
// by vshufpd and the sibling ZEND2d uses vsubpd, so a packed-single add would
// treat each double as two floats and apply %k6 per 32-bit lane.
// The Ciirr argument is accepted for signature symmetry with ZEND2d but is
// not used. %k6 is set up outside this macro.
#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
 "vaddpd " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
84
// ZEND2d: double-precision counterpart of ZEND2f.
// Expands to a string literal holding two assembler statements:
// "vshufpd $0x55" of Ciirr with itself into tmp, then a "vsubpd" of tmp and
// Ciirr written to Criir under write-mask %k7 (set up outside this section).
85#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\
86 "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
87
// Load-and-duplicate from memory at byte offset OFF*64 from the base register.
//   VMOVRDUPd / VMOVIDUPd : "vpshufd" with immediate 0x44 / 0xee, i.e. each
//                           128-bit group repeats its low / high 64-bit half.
//   VMOVRDUPf / VMOVIDUPf : "vmovsldup" / "vmovshdup".
// "R" / "I" presumably stand for the real / imaginary part of interleaved
// complex data -- TODO confirm against callers.
88#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,1,0
89#define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2
90#define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
91#define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"
92
// Register-to-register forms of the VMOV*DUP* macros: same instructions and
// immediates, but the source is the operand SRC instead of a memory address.
//   VRDUPd / VIDUPd : "vpshufd" with immediate 0x44 / 0xee.
//   VRDUPf / VIDUPf : "vmovsldup" / "vmovshdup".
93#define VRDUPd(SRC,DEST) "vpshufd $0x44," #SRC"," #DEST ";\n" // 32 bit level: 1,0,1,0
94#define VRDUPf(SRC,DEST) "vmovsldup " #SRC ", " #DEST ";\n"
95#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
96#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
97
// Broadcast-from-memory helpers. Each emits one broadcast instruction whose
// source address is a byte displacement computed from OFF, applied to the
// base register.
//   VBCASTRDUPd / VBCASTIDUPd : vbroadcastsd from OFF*16+0 / OFF*16+8
//   VBCASTRDUPf / VBCASTIDUPf : vbroadcastss from OFF*8+0  / OFF*8+4
//   VBCASTCDUPf               : vbroadcastsd     from OFF*64
//   VBCASTZDUPf               : vbroadcastf32x4  from OFF*64
//   VBCASTCDUP / VBCASTZDUP   : aliases for the two macros above.
// The 16- and 8-byte strides match one (re,im) pair of doubles / floats.
// NOTE(review): the C/Z variants scale OFF by 64 bytes while the R/I variants
// scale by one complex element (8 or 16 bytes) -- confirm this is intended.
98#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
99#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
100#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
101#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
102#define VBCASTCDUPf(OFF,A,DEST) "vbroadcastsd (" #OFF "*64 )(" #A ")," #DEST ";\n"
103#define VBCASTZDUPf(OFF,A,DEST) "vbroadcastf32x4 (" #OFF "*64 )(" #A ")," #DEST ";\n"
104#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST)
105#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST)
106
// Fused multiply with alternating add/subtract ("vfmaddsub231ps" / "...pd").
//   VMADDSUBf / VMADDSUBd       : operands A, B, accum, all passed by caller.
//   VMADDSUBMEMf / VMADDSUBMEMd : first operand is memory at byte offset O*64
//                                 from base register P.
// accum is emitted as the last operand of the instruction.
107#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
108#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
109#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum ";\n"
110#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
111
112
// Single-precision operations whose first operand is one 32-bit memory
// element broadcast to all lanes via the {1to16} embedded-broadcast modifier.
// The element is addressed at byte offset O*8+0 ("R" variants) or O*8+4
// ("I" variants) from base register P, i.e. the first or second float of the
// O-th 8-byte pair.
//   VMADD*DUPf    : vfmadd231ps
//   VMADDSUB*DUPf : vfmaddsub231ps
//   VMUL*DUPf     : vmulps (the last operand, named accum, is the destination)
113#define VMADDRDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
114#define VMADDIDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
115#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
116#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
117#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
118#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
119
// Double-precision operations whose first operand is one 64-bit memory
// element broadcast to all lanes via the {1to8} embedded-broadcast modifier.
// The element is addressed at byte offset O*16+0 ("R" variants) or O*16+8
// ("I" variants) from base register P, i.e. the first or second double of the
// O-th 16-byte pair.
//   VMADD*DUPd    : vfmadd231pd
//   VMADDSUB*DUPd : vfmaddsub231pd
//   VMUL*DUPd     : vmulpd (the last operand, named accum, is the destination)
120#define VMADDRDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
121#define VMADDIDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
122#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
123#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
124#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
125#define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
126 /*
127 * TimesI is used only in the XP recon
128 * Could zero the regs and use RECON_ACCUM
129 */
130
// VTIMESI{0,1,2}{f,d}: three-step sequence, meant to be issued in order.
//   step 0: VSHUF*(A,DEST)            - shuffled copy of A into DEST
//   step 1: vadd DEST,Z,DEST{%k6}     - masked add of DEST and Z into DEST
//   step 2: vsub DEST,Z,DEST{%k7}     - masked subtract of DEST and Z into DEST
// Step 0 ignores Z; steps 1 and 2 ignore A. All three take the same argument
// list so they can be invoked uniformly.
// Presumably Z holds zeros and %k6 / %k7 select the real / imaginary lanes,
// so the net effect is a per-lane sign flip -- TODO confirm against callers.
// VSHUF* is defined outside this section.
131#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
132#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
133#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
134
135#define VTIMESI0d(A,DEST, Z) VSHUFd(A,DEST)
136#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
137#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
138
// VTIMESMINUSI{0,1,2}{f,d}: same three-step shape as VTIMESI*, with the add
// and subtract exchanged between the two masks:
//   step 0: VSHUF*(A,DEST)
//   step 1: vsub DEST,Z,DEST{%k6}
//   step 2: vadd DEST,Z,DEST{%k7}
// Step 0 ignores Z; steps 1 and 2 ignore A.
// VSHUF* and the contents of %k6 / %k7 are defined outside this section.
139#define VTIMESMINUSI0f(A,DEST,Z) VSHUFf(A,DEST)
140#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
141#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
142
143#define VTIMESMINUSI0d(A,DEST,Z) VSHUFd(A,DEST)
144#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
145#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
146
// VACCTIMES{MINUS}I{0,1,2}{f,d}: accumulate a shuffled copy of A into ACC.
// Two alternative implementations are kept; the "#if 0" selects the second.
//
// Disabled variant: step 0 shuffles A into tmp, then steps 1 and 2 apply a
// masked add / subtract of tmp into ACC under %k6 and %k7.
147#if 0
148
149#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
150#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
151#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
152
153#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
154#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
155#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
156
157#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
158#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
159#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
160
161#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
162#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
163#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
164
165#else
166
// Active variant: step 0 shuffles A into tmp, step 1 is a single
// VMADDMEM*(slot,%r10,tmp,ACC), and step 2 expands to nothing (it is kept so
// call sites can still invoke all three steps).
// The MINUSI macros pass slot 1 and the I macros pass slot 0.
// VMADDMEM* is defined outside this section; presumably it is a fused
// multiply-add of tmp with the memory operand at slot*64(%r10) into ACC, and
// the two slots hold per-lane +1/-1 patterns -- TODO confirm.
// NOTE(review): the comment below names "o_p", but the macros hard-code %r10;
// callers must have the constant table's address in %r10.
167// o_p must point to floating 1.0f/d
168//
169// Ai, Ar -> tmp (r i)
170// tmp *1.0
171// ACC i - Ar ; ACC r + Ai
172#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
173#define VACCTIMESMINUSI1f(A,ACC,tmp) VMADDMEMf(1,%r10,tmp,ACC)
174#define VACCTIMESMINUSI2f(A,ACC,tmp)
175
176
177#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
178#define VACCTIMESMINUSI1d(A,ACC,tmp) VMADDMEMd(1,%r10,tmp,ACC)
179#define VACCTIMESMINUSI2d(A,ACC,tmp)
180
181// Ai, Ar -> tmp (r i)
182// tmp *1.0
183// ACC i + Ar ; ACC r - Ai
184#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
185#define VACCTIMESI1f(A,ACC,tmp) VMADDMEMf(0,%r10,tmp,ACC)
186#define VACCTIMESI2f(A,ACC,tmp)
187
188#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
189#define VACCTIMESI1d(A,ACC,tmp) VMADDMEMd(0,%r10,tmp,ACC)
190#define VACCTIMESI2d(A,ACC,tmp)
191
192#endif
193
// Single-precision permutes. Each emits one shuffle with operands A, B, B,
// so B is both the second source and the destination.
//   VPERM0f : vshuff32x4 $0x4e  (128-bit blocks)
//   VPERM1f : vshuff32x4 $0xb1  (128-bit blocks)
//   VPERM2f : vshufps    $0x4e  (32-bit elements within each 128-bit block)
//   VPERM3f : vshufps    $0xb1  (32-bit elements within each 128-bit block)
// NOTE(review): the result is only a pure permutation of A if callers pass
// the same register for A and B -- confirm against call sites.
194#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
195#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
196#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
197#define VPERM3f(A,B) "vshufps $0xb1," #A "," #B "," #B ";\n"
198
// Double-precision permutes, mirroring VPERM*f.
//   VPERM0d : vshuff64x2 $0x4e  (128-bit blocks)
//   VPERM1d : vshuff64x2 $0xb1  (128-bit blocks)
//   VPERM2d : vshufpd    $0x55  (64-bit elements within each 128-bit block)
//   VPERM3d : expands to VMOVd(A,B), defined outside this section; presumably
//             a plain copy, since there is no finer double-precision level
//             left to permute -- TODO confirm.
199#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n"
200#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n"
201#define VPERM2d(A,B) "vshufpd $0x55," #A "," #B "," #B ";\n"
202#define VPERM3d(A,B) VMOVd(A,B)
203
204
205#endif