Grid 0.7.0
Intel512common.h
Go to the documentation of this file.
/*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid

 Source file: ./lib/simd/Avx512Asm.h

 Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

 See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
28#ifndef GRID_ASM_INTEL_COMMON_512_H
29#define GRID_ASM_INTEL_COMMON_512_H
30
// Performance options
// AVX512_PF_L2_WRITE is explicitly undefined at this point, so every later
// `#ifdef AVX512_PF_L2_WRITE` in this header takes its #else branch.
#undef AVX512_PF_L2_WRITE

// Opcodes common

// MASK_REGS: a complete inline-asm statement (the expansion already ends in ';')
// that loads two AVX-512 opmask registers through %eax:
//   k6 <- 0xAAAA  (bits 1,3,5,...,15 set)
//   k7 <- 0x5555  (bits 0,2,4,...,14 set)
// Only %eax is declared as clobbered.
// NOTE(review): k6/k7 are written but not listed as clobbers; presumably later
// asm blocks rely on these values persisting - confirm the compiler never
// allocates k6/k7 for its own use in the enclosing code.
#define MASK_REGS \
  __asm__ ("mov $0xAAAA, %%eax \n" \
           "kmovw %%eax, %%k6 \n" \
           "mov $0x5555, %%eax \n" \
           "kmovw %%eax, %%k7 \n" : : : "%eax");

//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" );

// VZEROf / VZEROd: asm text that XORs register A with itself into A (vpxorq),
// i.e. clears A. Single and double precision emit the same instruction.
// A is stringized verbatim, so it must be spelled as the assembler expects.
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"

// VTIMESIf / VTIMESId (A, DEST, Z): concatenation of the three stage macros
// VTIMESI0*, VTIMESI1*, VTIMESI2*, each called with the same arguments.
// The stage macros are not defined in this part of the header and must be
// provided before these are expanded.
// NOTE(review): presumably DEST = i * A with Z as a helper register (inferred
// from the names only) - confirm against the stage definitions.
#define VTIMESIf(A,DEST, Z) \
  VTIMESI0f(A,DEST, Z) \
  VTIMESI1f(A,DEST, Z) \
  VTIMESI2f(A,DEST, Z)

#define VTIMESId(A,DEST, Z) \
  VTIMESI0d(A,DEST, Z) \
  VTIMESI1d(A,DEST, Z) \
  VTIMESI2d(A,DEST, Z)

// VTIMESMINUSIf / VTIMESMINUSId (A, DEST, Z): concatenation of the three stage
// macros VTIMESMINUSI0*, VTIMESMINUSI1*, VTIMESMINUSI2*, each called with the
// same arguments. The stage macros are not defined in this part of the header.
// NOTE(review): presumably DEST = -i * A (inferred from the names only) -
// confirm against the stage definitions.
#define VTIMESMINUSIf(A,DEST, Z) \
  VTIMESMINUSI0f(A,DEST, Z) \
  VTIMESMINUSI1f(A,DEST, Z) \
  VTIMESMINUSI2f(A,DEST, Z)

#define VTIMESMINUSId(A,DEST, Z) \
  VTIMESMINUSI0d(A,DEST, Z) \
  VTIMESMINUSI1d(A,DEST, Z) \
  VTIMESMINUSI2d(A,DEST, Z)

// VACCTIMESIf / VACCTIMESId (A, ACC, tmp): concatenation of the three stage
// macros VACCTIMESI0*, VACCTIMESI1*, VACCTIMESI2*, each called with the same
// arguments. The stage macros are not defined in this part of the header.
// NOTE(review): presumably ACC += i * A using tmp as scratch (inferred from
// the names only) - confirm against the stage definitions.
#define VACCTIMESIf(A,ACC,tmp) \
  VACCTIMESI0f(A,ACC,tmp) \
  VACCTIMESI1f(A,ACC,tmp) \
  VACCTIMESI2f(A,ACC,tmp)

#define VACCTIMESId(A,ACC,tmp) \
  VACCTIMESI0d(A,ACC,tmp) \
  VACCTIMESI1d(A,ACC,tmp) \
  VACCTIMESI2d(A,ACC,tmp)

// VACCTIMESMINUSIf / VACCTIMESMINUSId (A, ACC, tmp): concatenation of the three
// stage macros VACCTIMESMINUSI0*, VACCTIMESMINUSI1*, VACCTIMESMINUSI2*, each
// called with the same arguments. The stage macros are not defined in this
// part of the header.
// NOTE(review): presumably ACC += -i * A using tmp as scratch (inferred from
// the names only) - confirm against the stage definitions.
#define VACCTIMESMINUSIf(A,ACC,tmp) \
  VACCTIMESMINUSI0f(A,ACC,tmp) \
  VACCTIMESMINUSI1f(A,ACC,tmp) \
  VACCTIMESMINUSI2f(A,ACC,tmp)

#define VACCTIMESMINUSId(A,ACC,tmp) \
  VACCTIMESMINUSI0d(A,ACC,tmp) \
  VACCTIMESMINUSI1d(A,ACC,tmp) \
  VACCTIMESMINUSI2d(A,ACC,tmp)

// LOAD64(A,ptr): a complete inline-asm statement (expansion ends in ';') that
// moves the 64-bit value `ptr` into register A with movq.
//   LOAD64a - the asm template followed by the operand lists: no outputs,
//             `ptr` as input in a general register ("r"), and A (stringized)
//             as the sole clobber.
//   LOAD64i - wraps LOAD64a in __asm__( ... );
//   LOAD64  - extra macro level so that macro arguments are fully expanded
//             before LOAD64a stringizes A.
// The template puts a literal '%' in front of #A.
// NOTE(review): presumably A is passed already carrying one '%' (e.g. %r8), so
// the template becomes "%%r8", the extended-asm spelling of a literal register
// name - confirm against callers.
#define LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A
#define LOAD64i(A,ptr) __asm__ ( LOAD64a(A,ptr));
#define LOAD64(A,ptr) LOAD64i(A,ptr)

// VMOVf / VMOVd: asm text for a register move from A to DEST, using vmovaps
// (single precision) or vmovapd (double precision).
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"

// Prefetch / evict hints. The address operand is always O*64(A): base register
// A plus an offset O counted in units of 64 bytes.
//   VPREFETCH1   - emits prefetcht0
//   VPREFETCH2   - emits prefetcht1
//   VPREFETCHW   - emits prefetchwt1 only when AVX512_PF_L2_WRITE is defined,
//                  otherwise expands to nothing
//   VPREFETCHNTA, VPREFETCH, VEVICT - always expand to nothing
#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n"
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
#ifdef AVX512_PF_L2_WRITE
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
#else
#define VPREFETCHW(O,A)
#endif
#define VPREFETCHNTA(O,A)
#define VPREFETCH(O,A)

#define VEVICT(O,A)

// Instruction strings kept for reference (not used by any macro above):
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
// "clevict0 "#O"*64("#A");\n"

// VLOADf / VLOADd: asm text loading DEST from memory at OFF*64(PTR), i.e. base
// register PTR plus OFF units of 64 bytes, using vmovups (single precision)
// or vmovupd (double precision).
#define VLOADf(OFF,PTR,DEST) "vmovups " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VLOADd(OFF,PTR,DEST) "vmovupd " #OFF "*64(" #PTR "), " #DEST ";\n"

// Register-register add / subtract. Operands are emitted in the order
// A, B, DEST exactly as given.
// NOTE(review): with AT&T operand order VSUB* should compute DEST = B - A -
// confirm against the assembler syntax in use.
#define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n"
#define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n"

#define VSUBf(A,B,DEST) "vsubps " #A "," #B "," #DEST ";\n"
#define VSUBd(A,B,DEST) "vsubpd " #A "," #B "," #DEST ";\n"

// Add / subtract with a memory first operand at O*64(A): base register A plus
// O units of 64 bytes. B and DEST are emitted as the remaining operands.
// NOTE(review): with AT&T operand order VSUBMEM* should compute
// DEST = B - mem - confirm against the assembler syntax in use.
#define VADDMEMf(O,A,B,DEST) "vaddps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VADDMEMd(O,A,B,DEST) "vaddpd "#O"*64("#A ")," #B "," #DEST ";\n"

#define VSUBMEMf(O,A,B,DEST) "vsubps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VSUBMEMd(O,A,B,DEST) "vsubpd "#O"*64("#A ")," #B "," #DEST ";\n"

// Register-register multiply (vmulps / vmulpd) and fused multiply-add
// (vfmadd231ps / vfmadd231pd). Operands are emitted in the order A, B, DEST.
// NOTE(review): the 231 form should accumulate into the last operand written
// here, i.e. DEST += A * B - confirm against the instruction reference.
#define VMULf(A,B,DEST) "vmulps " #A "," #B "," #DEST ";\n"
#define VMULd(A,B,DEST) "vmulpd " #A "," #B "," #DEST ";\n"

#define VMADDf(A,B,DEST) "vfmadd231ps " #A "," #B "," #DEST ";\n"
#define VMADDd(A,B,DEST) "vfmadd231pd " #A "," #B "," #DEST ";\n"

// Multiply and fused multiply-add with a memory first operand at O*64(A):
// base register A plus O units of 64 bytes. B and DEST follow as the
// remaining operands.
// NOTE(review): VMADDMEM* should compute DEST += mem * B - confirm against the
// instruction reference.
#define VMULMEMf(O,A,B,DEST) "vmulps " #O"*64("#A ")," #B "," #DEST ";\n"
#define VMULMEMd(O,A,B,DEST) "vmulpd " #O"*64("#A ")," #B "," #DEST ";\n"

#define VMADDMEMf(O,A,B,DEST) "vfmadd231ps " #O"*64("#A "),"#B "," #DEST ";\n"
#define VMADDMEMd(O,A,B,DEST) "vfmadd231pd " #O"*64("#A "),"#B "," #DEST ";\n"

// ZLOADf / ZLOADd (OFF, PTR, ri, ir): load followed by shuffle.
// First VLOAD* loads OFF*64(PTR) into register `ir`, then VSHUF* writes the
// shuffled copy of `ir` into register `ri`. Both registers are live afterwards.
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)

// VSTOREf / VSTOREd: asm text storing register SRC to memory at OFF*64(PTR),
// i.e. base register PTR plus OFF units of 64 bytes.
// STREAM_STORE is defined unconditionally just below, so the vmovntps /
// vmovntpd variants are the ones selected; the vmovups / vmovupd variants are
// only reached if that #define is removed.
// NOTE(review): the vmovnt* forms presumably require the target address to be
// 64-byte aligned for 512-bit registers - confirm against the instruction
// reference before using with unaligned buffers.
#define STREAM_STORE
#ifdef STREAM_STORE
#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#else
#define VSTOREf(OFF,PTR,SRC) "vmovups " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovupd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#endif

// Swaps Re/Im ; could unify this with IMCI
// All four macros emit vpshufd with a fixed immediate:
//   0x4e (double variants) - 2-bit selector fields, high to low: 1,0,3,2
//   0xb1 (float variants)  - 2-bit selector fields, high to low: 2,3,0,1
// VSHUF* take a register source A; VSHUFMEM* take a memory source at
// OFF*64(A), i.e. base register A plus OFF units of 64 bytes.
#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n"
#define VSHUFf(A,DEST) "vpshufd $0xb1," #A "," #DEST ";\n"
#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1

// TRAP: asm text for a single `int3` instruction, for splicing into an asm
// sequence (e.g. to stop in a debugger at that point).
#define TRAP " int3 ;\n"

159#endif