Grid 0.7.0
Lattice_basis.h
Go to the documentation of this file.
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/lattice/Lattice_basis.h

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution
    directory
*************************************************************************************/
/*  END LEGAL */
31
#pragma once
// Project w into the orthogonal complement of span{basis[0..k-1]}
// (one classical Gram-Schmidt sweep; basis vectors are not modified).
//
// NOTE(perf): if the basis[j] were assumed orthonormal, all inner products
// could be taken in parallel, saving ~2x bandwidth, and fusing the update
// would save ~3x on the subtraction -- perhaps 2.5x speed up here and
// roughly 2x overall in Multigrid Lanczos. Left as a future optimisation.
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
  for(int j=0; j<k; ++j){
    // subtract the component of w along basis[j]
    w = w - innerProduct(basis[j],w)*basis[j];
  }
}
49
// In-place rotation of a set of lattice vectors by a coefficient matrix:
//
//   basis[jj] <- sum_{k=k0}^{k1-1} Qt(jj,k) * basis[k]   for jj in [j0,j1)
//
// Qt is an Nm x Nm matrix (only rows [j0,j1) x columns [k0,k1) are used).
// Work is blocked over lattice sites so the temporary accumulator Bt stays
// bounded (at most ~one extra vector's worth of storage). Per-site results
// are fully accumulated into Bt before being written back, so outputs in
// [j0,j1) may overlap inputs in [k0,k1) site-safely within a block.
template<class VField, class Matrix>
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
{
  typedef decltype(basis[0]) Field;
  typedef decltype(basis[0].View(AcceleratorRead)) View;

  // Host-side view handles plus a device-resident mirror so accelerator
  // kernels can index basis_vp[k] directly.
  hostVector<View> h_basis_v(basis.size());
  deviceVector<View> d_basis_v(basis.size());
  typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
  typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;

  GridBase* grid = basis[0].Grid();

  // Open accelerator-writable views on every vector; copy each handle to device.
  for(int k=0;k<basis.size();k++){
    h_basis_v[k] = basis[k].View(AcceleratorWrite);
    acceleratorPut(d_basis_v[k],h_basis_v[k]);
  }

  View *basis_vp = &d_basis_v[0];

  int nrot = j1-j0;
  if (!nrot) // edge case not handled gracefully by Cuda
    return;

  uint64_t oSites =grid->oSites();
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead

  // Accumulator: siteBlock sites x nrot rotated outputs, device resident.
  deviceVector <vobj> Bt(siteBlock * nrot);
  auto Bp=&Bt[0];

  // GPU readable copy of matrix
  hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
  deviceVector<Coeff_t> Qt_jv(Nm*Nm);
  Coeff_t *Qt_p = & Qt_jv[0];
  thread_for(i,Nm*Nm,{
      int j = i/Nm;
      int k = i%Nm;
      h_Qt_jv[i]=Qt(j,k);
  });
  acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));

  // Block the loop to keep storage footprint down
  for(uint64_t s=0;s<oSites;s+=siteBlock){

    // remaining work in this block
    int ssites=MIN(siteBlock,oSites-s);

    // zero out the accumulators (whole buffer, even in a partial last block)
    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
	decltype(coalescedRead(Bp[ss])) z;
	z=Zero();
	coalescedWrite(Bp[ss],z);
    });

    // Accumulate Bt[ss,j] = sum_k Qt(j0+j,k) * basis[k][s+ss]
    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{

	int j =sj%nrot;
	int jj  =j0+j;
	int ss =sj/nrot;
	int sss=ss+s;

	for(int k=k0; k<k1; ++k){
	  auto tmp = coalescedRead(Bp[ss*nrot+j]);
	  coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_vp[k][sss]));
	}
    });

    // Write the fully accumulated rotations back over basis[jj] for this block.
    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
	int j =sj%nrot;
	int jj =j0+j;
	int ss =sj/nrot;
	int sss=ss+s;
	coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
    });
  }

  // Release all views opened above.
  for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
}
128
// Extract a single rotated vector:
//   result = sum_{k=k0}^{k1-1} Qt(j,k) * basis[k]
// i.e. apply row j of the (real, Nm-column) coefficient matrix Qt.
// basis is only read; result inherits basis[0]'s checkerboard.
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
  typedef decltype(basis[0].View(AcceleratorRead)) View;
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();

  result.Checkerboard() = basis[0].Checkerboard();

  // Read-only views on the basis vectors, mirrored to device memory so the
  // kernel below can index basis_vp[k].
  hostVector<View> h_basis_v(basis.size());
  deviceVector<View> d_basis_v(basis.size());
  for(int k=0;k<basis.size();k++){
    h_basis_v[k]=basis[k].View(AcceleratorRead);
    acceleratorPut(d_basis_v[k],h_basis_v[k]);
  }

  // NOTE(review): zz appears unused below (the kernel builds its own zzz);
  // kept as-is to preserve the code exactly.
  vobj zz=Zero();
  // Device copy of row j of Qt, one coefficient at a time.
  deviceVector<double> Qt_jv(Nm);
  double * Qt_j = & Qt_jv[0];
  for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));

  auto basis_vp=& d_basis_v[0];
  autoView(result_v,result,AcceleratorWrite);
  // Per site: accumulate the row-j linear combination and write it out.
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
    vobj zzz=Zero();
    auto B=coalescedRead(zzz);
    for(int k=k0; k<k1; ++k){
      B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
    }
    coalescedWrite(result_v[ss], B);
  });
  // Release all views opened above.
  for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
}
163
164template<class Field>
165void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
166{
167 int vlen = idx.size();
168
169 assert(vlen>=1);
170 assert(vlen<=sort_vals.size());
171 assert(vlen<=_v.size());
172
173 for (size_t i=0;i<vlen;i++) {
174
175 if (idx[i] != i) {
176
178 // idx[i] is a table of desired sources giving a permutation.
179 // Swap v[i] with v[idx[i]].
180 // Find j>i for which _vnew[j] = _vold[i],
181 // track the move idx[j] => idx[i]
182 // track the move idx[i] => i
184 size_t j;
185 for (j=i;j<idx.size();j++)
186 if (idx[j]==i)
187 break;
188
189 assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
190
191 swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
192 std::swap(sort_vals[i],sort_vals[idx[i]]);
193
194 idx[j] = idx[i];
195 idx[i] = i;
196 }
197 }
198}
199
200inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
201{
202 std::vector<int> idx(sort_vals.size());
203 std::iota(idx.begin(), idx.end(), 0);
204
205 // sort indexes based on comparing values in v
206 std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
207 return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
208 });
209 return idx;
210}
211
212template<class Field>
213void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
214{
215 std::vector<int> idx = basisSortGetIndex(sort_vals);
216 if (reverse)
217 std::reverse(idx.begin(), idx.end());
218
219 basisReorderInPlace(_v,sort_vals,idx);
220}
221
222// PAB: faster to compute the inner products first then fuse loops.
223// If performance critical can improve.
224template<class Field>
225void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
226 result = Zero();
227 assert(_v.size()==eval.size());
228 int N = (int)_v.size();
229 for (int i=0;i<N;i++) {
230 Field& tmp = _v[i];
231 axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
232 }
233}
234
void acceleratorPut(T &dev, const T &host)
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
#define accelerator_for(iterator, num, nsimd,...)
std::vector< T, devAllocator< T > > deviceVector
std::vector< T, alignedAllocator< T > > hostVector
B
accelerator_inline sobj eval(const uint64_t ss, const sobj &arg)
Definition Lattice_ET.h:103
void axpy(Lattice< vobj > &ret, sobj a, const Lattice< vobj > &x, const Lattice< vobj > &y)
void basisRotate(VField &basis, Matrix &Qt, int j0, int j1, int k0, int k1, int Nm)
void basisDeflate(const std::vector< Field > &_v, const std::vector< RealD > &eval, const Field &src_orig, Field &result)
void basisOrthogonalize(std::vector< Field > &basis, Field &w, int k)
std::vector< int > basisSortGetIndex(std::vector< RealD > &sort_vals)
void basisReorderInPlace(std::vector< Field > &_v, std::vector< RealD > &sort_vals, std::vector< int > &idx)
void basisSortInPlace(std::vector< Field > &_v, std::vector< RealD > &sort_vals, bool reverse)
void basisRotateJ(Field &result, std::vector< Field > &basis, Eigen::MatrixXd &Qt, int j, int k0, int k1, int Nm)
ComplexD innerProduct(const Lattice< vobj > &left, const Lattice< vobj > &right)
#define autoView(l_v, l, mode)
@ AcceleratorRead
@ AcceleratorWrite
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:87
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
Definition Tensor_SIMT.h:61
accelerator_inline std::enable_if<!isGridTensor< T >::value, T >::type TensorRemove(T arg)
#define thread_for(i, num,...)
Definition Threads.h:60
#define MIN(a, b)
Definition Zolotarev.cc:23
int oSites(void) const
Definition Simd.h:194