Go to the documentation of this file.
28#ifndef GRID_ASM_INTEL_COMMON_512_H
29#define GRID_ASM_INTEL_COMMON_512_H
// Performance option: AVX512_PF_L2_WRITE is explicitly undefined at this
// point, so any later `#ifdef AVX512_PF_L2_WRITE` in this file evaluates
// false unless the symbol is re-defined in between.
#undef AVX512_PF_L2_WRITE
// MASK_REGS: statement macro that initialises two AVX-512 opmask registers.
//   k6 <- 0xAAAA  (bits 1,3,...,15 set: the odd-numbered 16-bit-mask lanes)
//   k7 <- 0x5555  (bits 0,2,...,14 set: the even-numbered lanes)
// Each constant is staged through %eax, which is why %eax is the only entry
// in the clobber list; k6 and k7 themselves are not declared to the compiler.
// The expansion already ends in ';'.
// NOTE(review): the `#define` header line was absent from this listing (the
// body consisted only of continuation lines); the name MASK_REGS is a
// reconstruction and must be confirmed against the callers.
#define MASK_REGS \
  __asm__ ("mov $0xAAAA, %%eax \n" \
           "kmovw %%eax, %%k6 \n" \
           "mov $0x5555, %%eax \n" \
           "kmovw %%eax, %%k7 \n" : : : "%eax");
// VZEROf(A) / VZEROd(A): expand to the text of one instruction that clears
// vector register A by XOR-ing it with itself (vpxorq A,A,A).
// The single- and double-precision spellings produce identical text; both
// names exist so callers can select by precision suffix.
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"
// VTIMESIf / VTIMESId (A, DEST, Z): concatenate the three sub-step macros
// VTIMESI0x, VTIMESI1x and VTIMESI2x (x = f or d) into one instruction
// sequence. The sub-steps are not defined in the visible part of this file;
// they are expected to be supplied by a precision-specific header.
// All three arguments are forwarded unchanged to every sub-step.
// NOTE(review): presumably "multiply A by i into DEST, with Z as a helper
// register" -- confirm against the sub-step definitions.
// NOTE(review): the third sub-step is included to match the three-step shape
// of VTIMESMINUSI*/VACCTIMESI*; confirm VTIMESI2f / VTIMESI2d exist.
#define VTIMESIf(A,DEST, Z) \
  VTIMESI0f(A,DEST, Z) \
  VTIMESI1f(A,DEST, Z) \
  VTIMESI2f(A,DEST, Z)

#define VTIMESId(A,DEST, Z) \
  VTIMESI0d(A,DEST, Z) \
  VTIMESI1d(A,DEST, Z) \
  VTIMESI2d(A,DEST, Z)
// VTIMESMINUSIf / VTIMESMINUSId (A, DEST, Z): concatenate the sub-step
// macros VTIMESMINUSI0x, VTIMESMINUSI1x and VTIMESMINUSI2x (x = f or d), in
// that order, into one instruction sequence. The sub-steps are not defined
// in the visible part of this file. All arguments are forwarded unchanged.
// NOTE(review): presumably "multiply A by -i into DEST" -- confirm against
// the sub-step definitions.
#define VTIMESMINUSIf(A,DEST, Z) \
  VTIMESMINUSI0f(A,DEST, Z) \
  VTIMESMINUSI1f(A,DEST, Z) \
  VTIMESMINUSI2f(A,DEST, Z)

#define VTIMESMINUSId(A,DEST, Z) \
  VTIMESMINUSI0d(A,DEST, Z) \
  VTIMESMINUSI1d(A,DEST, Z) \
  VTIMESMINUSI2d(A,DEST, Z)
// VACCTIMESIf / VACCTIMESId (A, ACC, tmp): concatenate the sub-step macros
// VACCTIMESI0x, VACCTIMESI1x and VACCTIMESI2x (x = f or d), in that order,
// into one instruction sequence. The sub-steps are not defined in the
// visible part of this file. All arguments are forwarded unchanged.
// NOTE(review): presumably "accumulate i*A into ACC using tmp as scratch"
// -- confirm against the sub-step definitions.
#define VACCTIMESIf(A,ACC,tmp) \
  VACCTIMESI0f(A,ACC,tmp) \
  VACCTIMESI1f(A,ACC,tmp) \
  VACCTIMESI2f(A,ACC,tmp)

#define VACCTIMESId(A,ACC,tmp) \
  VACCTIMESI0d(A,ACC,tmp) \
  VACCTIMESI1d(A,ACC,tmp) \
  VACCTIMESI2d(A,ACC,tmp)
// VACCTIMESMINUSIf / VACCTIMESMINUSId (A, ACC, tmp): concatenate the
// sub-step macros VACCTIMESMINUSI0x, VACCTIMESMINUSI1x and
// VACCTIMESMINUSI2x (x = f or d), in that order, into one instruction
// sequence. The sub-steps are not defined in the visible part of this file.
// All arguments are forwarded unchanged.
// NOTE(review): presumably "accumulate -i*A into ACC using tmp as scratch"
// -- confirm against the sub-step definitions.
#define VACCTIMESMINUSIf(A,ACC,tmp) \
  VACCTIMESMINUSI0f(A,ACC,tmp) \
  VACCTIMESMINUSI1f(A,ACC,tmp) \
  VACCTIMESMINUSI2f(A,ACC,tmp)

#define VACCTIMESMINUSId(A,ACC,tmp) \
  VACCTIMESMINUSI0d(A,ACC,tmp) \
  VACCTIMESMINUSI1d(A,ACC,tmp) \
  VACCTIMESMINUSI2d(A,ACC,tmp)
// LOAD64(A, ptr): statement macro that copies the value of the C expression
// `ptr` into the register named by A with a single movq.
//
//   LOAD64a  builds the inside of the asm statement: the template, an empty
//            output list, `ptr` as an "r" input (operand %0), and A in the
//            clobber list.
//   LOAD64i  wraps that in `__asm__ ( ... );` -- the expansion already ends
//            in ';'.
//   LOAD64   is the entry point; going through the intermediate macros means
//            an argument that is itself a macro is expanded before `#A`
//            stringizes it.
//
// A must be spelled with a single leading '%' (e.g. %r8): the template
// supplies the second '%' needed inside extended asm, while the clobber
// entry uses #A verbatim.
#define LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A
#define LOAD64i(A,ptr) __asm__ ( LOAD64a(A,ptr));
#define LOAD64(A,ptr) LOAD64i(A,ptr)
// VMOVf(A, DEST) / VMOVd(A, DEST): expand to the text of a vector move from
// A to DEST (AT&T operand order: source first). The f variant emits vmovaps,
// the d variant vmovapd. These are the aligned-move mnemonics, so if either
// operand is a memory reference it must satisfy the instruction's alignment
// requirement.
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
// Prefetch helpers. (O, A) name the address O*64(A): a displacement of O
// 64-byte units from the address held in register A.
//
//   VPREFETCH1    prefetcht0
//   VPREFETCH2    prefetcht1
//   VPREFETCHW    prefetchwt1 when AVX512_PF_L2_WRITE is defined, otherwise
//                 expands to nothing
//   VPREFETCHNTA  always expands to nothing
//   VPREFETCH     always expands to nothing
//
// The empty variants let call sites keep the prefetch in place while
// emitting no instruction.
#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n"
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
#ifdef AVX512_PF_L2_WRITE
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
#else
#define VPREFETCHW(O,A)
#endif
#define VPREFETCHNTA(O,A)
#define VPREFETCH(O,A)
// VLOADf / VLOADd (OFF, PTR, DEST): expand to the text of an unaligned
// vector load (vmovups / vmovupd) from OFF*64(PTR) into DEST, i.e. OFF
// counts 64-byte units relative to the address held in register PTR.
#define VLOADf(OFF,PTR,DEST) "vmovups " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VLOADd(OFF,PTR,DEST) "vmovupd " #OFF "*64(" #PTR "), " #DEST ";\n"
// Register-register add/subtract, single (f) and double (d) precision.
// Operands are emitted in AT&T order (sources first, destination last):
//   VADDx(A,B,DEST)   DEST = B + A
//   VSUBx(A,B,DEST)   DEST = B - A   (A is the subtrahend)
#define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n"
#define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n"

#define VSUBf(A,B,DEST) "vsubps " #A "," #B "," #DEST ";\n"
#define VSUBd(A,B,DEST) "vsubpd " #A "," #B "," #DEST ";\n"
// Add/subtract with a memory source operand at O*64(A) -- O counts 64-byte
// units from the address held in register A.
//   VADDMEMx(O,A,B,DEST)   DEST = B + mem
//   VSUBMEMx(O,A,B,DEST)   DEST = B - mem   (memory operand is subtracted)
#define VADDMEMf(O,A,B,DEST) "vaddps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VADDMEMd(O,A,B,DEST) "vaddpd "#O"*64("#A ")," #B "," #DEST ";\n"

#define VSUBMEMf(O,A,B,DEST) "vsubps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VSUBMEMd(O,A,B,DEST) "vsubpd "#O"*64("#A ")," #B "," #DEST ";\n"
// Register-register multiply and fused multiply-add, single (f) and double
// (d) precision, AT&T operand order (destination last):
//   VMULx(A,B,DEST)    DEST = A * B
//   VMADDx(A,B,DEST)   DEST = A * B + DEST   (vfmadd231 form: the
//                      destination is also the addend)
#define VMULf(A,B,DEST) "vmulps " #A "," #B "," #DEST ";\n"
#define VMULd(A,B,DEST) "vmulpd " #A "," #B "," #DEST ";\n"

#define VMADDf(A,B,DEST) "vfmadd231ps " #A "," #B "," #DEST ";\n"
#define VMADDd(A,B,DEST) "vfmadd231pd " #A "," #B "," #DEST ";\n"
// Multiply and fused multiply-add with a memory source operand at O*64(A)
// -- O counts 64-byte units from the address held in register A.
//   VMULMEMx(O,A,B,DEST)    DEST = mem * B
//   VMADDMEMx(O,A,B,DEST)   DEST = mem * B + DEST   (vfmadd231 form)
#define VMULMEMf(O,A,B,DEST) "vmulps " #O"*64("#A ")," #B "," #DEST ";\n"
#define VMULMEMd(O,A,B,DEST) "vmulpd " #O"*64("#A ")," #B "," #DEST ";\n"

#define VMADDMEMf(O,A,B,DEST) "vfmadd231ps " #O"*64("#A "),"#B "," #DEST ";\n"
#define VMADDMEMd(O,A,B,DEST) "vfmadd231pd " #O"*64("#A "),"#B "," #DEST ";\n"
// ZLOADf / ZLOADd (OFF, PTR, ri, ir): two-step load.
//   1. VLOADx loads the vector at OFF*64(PTR) into register `ir`.
//   2. VSHUFx writes a shuffled copy of `ir` into register `ri`.
// After the sequence both registers are live: `ir` holds the data as stored
// in memory, `ri` the permuted copy.
// NOTE(review): the names suggest ir = (imag,real) and ri = (real,imag)
// orderings of complex data -- confirm against callers.
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
// VSTOREf / VSTOREd (OFF, PTR, SRC): expand to the text of a vector store of
// register SRC to OFF*64(PTR) -- OFF counts 64-byte units from the address
// held in register PTR.
//
// Two implementations are provided:
//   default        vmovups / vmovupd   (ordinary unaligned store)
//   STREAM_STORE   vmovntps / vmovntpd (non-temporal store; these
//                  instructions require an aligned destination)
//
// Exactly one pair is defined, so the macros are never redefined with a
// different body.
// NOTE(review): the conditional that originally separated the two pairs was
// not present in this listing. The guard name STREAM_STORE is a
// reconstruction; the default is the pair that came last (and therefore
// took effect). Confirm both the guard name and the intended default.
#ifdef STREAM_STORE
#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#else
#define VSTOREf(OFF,PTR,SRC) "vmovups " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovupd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#endif
// In-lane permutes built on vpshufd, which rearranges the four 32-bit
// elements inside every 128-bit lane according to the immediate.
//   $0x4e  selects elements 2,3,0,1 -> swaps the two 64-bit halves of each
//          lane (used by the double-precision variants)
//   $0xb1  selects elements 1,0,3,2 -> swaps each adjacent pair of 32-bit
//          elements (used by the single-precision variants)
//
//   VSHUFx(A,DEST)          source is register A
//   VSHUFMEMx(OFF,A,DEST)   source is memory at OFF*64(A)
// NOTE(review): presumably used to exchange the real and imaginary parts of
// packed complex numbers -- confirm against callers.
#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n"
#define VSHUFf(A,DEST) "vpshufd $0xb1," #A "," #DEST ";\n"
#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n"
#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n"
// TRAP: expands to the text of an int3 (breakpoint) instruction, for
// splicing into an asm sequence while debugging.
#define TRAP " int3 ;\n"