Grid 0.7.0
ImplicitlyRestartedBlockLanczos.h
Go to the documentation of this file.
1 /*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
6
7 Copyright (C) 2015
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10Author: Yong-Chull Jang <ypj@quark.phy.bnl.gov>
11Author: Chulwoo Jung <chulwoo@bnl.gov>
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2 of the License, or
16 (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License along
24 with this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26
27 See the full license in the file "LICENSE" in the top level distribution directory
28 *************************************************************************************/
29 /* END LEGAL */
30#ifndef GRID_IRBL_H
31#define GRID_IRBL_H
32
33#include <string.h> //memset
34#ifdef USE_LAPACK
35#include <mkl_lapack.h>
36#endif
37
38#undef USE_LAPACK
39#define Glog std::cout << GridLogMessage
40
41#ifdef GRID_CUDA
42#include "cublas_v2.h"
43#endif
44
45#if 0
46#define CUDA_COMPLEX cuDoubleComplex
47#define CUDA_FLOAT double
48#define MAKE_CUDA_COMPLEX make_cuDoubleComplex
49#define CUDA_GEMM cublasZgemm
50#else
51#define CUDA_COMPLEX cuComplex
52#define CUDA_FLOAT float
53#define MAKE_CUDA_COMPLEX make_cuComplex
54#define CUDA_GEMM cublasCgemm
55#endif
56
57namespace Grid {
58
60// Helper class for sorting the evalues AND evectors by Field
61// Use pointer swizzle on vectors SHOULD GET RID OF IT SOON!
63template<class Field>
64class SortEigen {
65 private:
66 static bool less_lmd(RealD left,RealD right){
67 return left > right;
68 }
69 static bool less_pair(std::pair<RealD,Field const*>& left,
70 std::pair<RealD,Field const*>& right){
71 return left.first > (right.first);
72 }
73
74 public:
75 void push(std::vector<RealD>& lmd,std::vector<Field>& evec,int N) {
76
78 // PAB: FIXME: VERY VERY VERY wasteful: takes a copy of the entire vector set.
79 // : The vector reorder should be done by pointer swizzle somehow
81 std::vector<Field> cpy(lmd.size(),evec[0].Grid());
82 for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
83
84 std::vector<std::pair<RealD, Field const*> > emod(lmd.size());
85
86 for(int i=0;i<lmd.size();++i) emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
87
88 partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
89
90 typename std::vector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
91 for(int i=0;i<N;++i){
92 lmd[i]=it->first;
93 evec[i]=*(it->second);
94 ++it;
95 }
96 }
97 void push(std::vector<RealD>& lmd,int N) {
98 std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
99 }
100 bool saturated(RealD lmd, RealD thrs) {
101 return fabs(lmd) > fabs(thrs);
102 }
103};
104
105enum class LanczosType { irbl, rbl };
106
112
114// Implicitly restarted block lanczos
116template<class Field>
118
119private:
120
121 std::string cname = std::string("ImplicitlyRestartedBlockLanczos");
122 int MaxIter; // Max iterations
123 int Nstop; // Number of evecs checked for convergence
124 int Nu; // Number of vecs in the unit block
125 int Nk; // Number of converged sought
126 int Nm; // total number of vectors
127 int Nblock_k; // Nk/Nu
128 int Nblock_m; // Nm/Nu
129 int Nconv_test_interval; // Number of skipped vectors when checking a convergence
133 // Embedded objects
141 int mrhs;
143 // BLAS objects
145#ifdef GRID_CUDA
146 cudaError_t cudaStat;
147 CUDA_COMPLEX *w_acc, *evec_acc, *c_acc;
148#endif
149 int Nevec_acc; // Number of eigenvectors stored in the buffer evec_acc
150
152 // Constructor
154public:
155 int split_test; //test split in the first iteration
157 LinearOperatorBase<Field> &SLinop, // op
158 GridRedBlackCartesian * FrbGrid,
159 GridRedBlackCartesian * SFrbGrid,
160 int _mrhs,
161 OperatorFunction<Field> & poly, // polynomial
162 int _Nstop, // really sought vecs
163 int _Nconv_test_interval, // conv check interval
164 int _Nu, // vecs in the unit block
165 int _Nk, // sought vecs
166 int _Nm, // total vecs
167 RealD _eresid, // resid in lmd deficit
168 int _MaxIter, // Max iterations
170 : _Linop(Linop), _SLinop(SLinop), _poly(poly),sf_grid(SFrbGrid),f_grid(FrbGrid),
171 Nstop(_Nstop), Nconv_test_interval(_Nconv_test_interval), mrhs(_mrhs),
172 Nu(_Nu), Nk(_Nk), Nm(_Nm),
173 Nblock_m(_Nm/_Nu), Nblock_k(_Nk/_Nu),
174 //eresid(_eresid), MaxIter(10),
175 eresid(_eresid), MaxIter(_MaxIter),
176 diagonalisation(_diagonalisation),split_test(0),
177 Nevec_acc(_Nu)
178 { assert( (Nk%Nu==0) && (Nm%Nu==0) ); };
179
181 // Helpers
183 static RealD normalize(Field& v, int if_print=0)
184 {
185 RealD nn = norm2(v);
186 nn = sqrt(nn);
187 v = v * (1.0/nn);
188 return nn;
189 }
190
191 void orthogonalize(Field& w, std::vector<Field>& evec, int k, int if_print=0)
192 {
193 typedef typename Field::scalar_type MyComplex;
194// MyComplex ip;
195 ComplexD ip;
196
197 for(int j=0; j<k; ++j){
198 ip = innerProduct(evec[j],w);
199 if(if_print)
200 if( norm(ip)/norm2(w) > 1e-14)
201 Glog<<"orthogonalize before: "<<j<<" of "<<k<<" "<< ip <<std::endl;
202 w = w - ip * evec[j];
203 if(if_print) {
204 ip = innerProduct(evec[j],w);
205 if( norm(ip)/norm2(w) > 1e-14)
206 Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
207 }
208 }
209 assert(normalize(w,if_print) != 0);
210 }
211 void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
212 {
213 orthogonalize(w, evec, k,1);
214 }
215
216 void orthogonalize(std::vector<Field>& w, int _Nu, std::vector<Field>& evec, int k, int if_print=0)
217 {
218 typedef typename Field::scalar_type MyComplex;
219 MyComplex ip;
220// ComplexD ip;
221
222 for(int j=0; j<k; ++j){
223 for(int i=0; i<_Nu; ++i){
224 ip = innerProduct(evec[j],w[i]);
225 w[i] = w[i] - ip * evec[j];
226 }}
227 for(int i=0; i<_Nu; ++i)
228 assert(normalize(w[i],if_print) !=0);
229 }
230
231
232 void orthogonalize_blas(std::vector<Field>& w, std::vector<Field>& evec, int R, int do_print=0)
233 {
234#ifdef GRID_CUDA
235 Glog << "cuBLAS orthogonalize" << std::endl;
236
237 typedef typename Field::vector_object vobj;
238 typedef typename vobj::scalar_type scalar_type;
239 typedef typename vobj::vector_type vector_type;
240
241 typedef typename Field::scalar_type MyComplex;
242
243 GridBase *grid = w[0].Grid();
244 const uint64_t sites = grid->lSites();
245
246 int Nbatch = R/Nevec_acc;
247 assert( R%Nevec_acc == 0 );
248// Glog << "nBatch, Nevec_acc, R, Nu = "
249// << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl;
250
251 for (int col=0; col<Nu; ++col) {
252// auto w_v = w[col].View();
253 autoView( w_v,w[col], AcceleratorWrite);
254 CUDA_COMPLEX *z = reinterpret_cast<CUDA_COMPLEX*>(&w_v[0]);
255// Glog << "col= "<<col <<" z= "<<z <<std::endl;
256 for (size_t row=0; row<sites*12; ++row) {
257 w_acc[col*sites*12+row] = z[row];
258 }
259 }
260 cublasHandle_t handle;
261 cublasStatus_t stat;
262
263 stat = cublasCreate(&handle);
264
265 Glog << "cuBLAS Zgemm"<< std::endl;
266
267 for (int b=0; b<Nbatch; ++b) {
268 for (int col=0; col<Nevec_acc; ++col) {
269// auto evec_v = evec[b*Nevec_acc+col].View();
270 autoView( evec_v,evec[b*Nevec_acc+col], AcceleratorWrite);
271 CUDA_COMPLEX *z = reinterpret_cast<CUDA_COMPLEX*>(&evec_v[0]);
272// Glog << "col= "<<col <<" z= "<<z <<std::endl;
273 for (size_t row=0; row<sites*12; ++row) {
274 evec_acc[col*sites*12+row] = z[row];
275 }
276 }
277 CUDA_COMPLEX alpha = MAKE_CUDA_COMPLEX(1.0,0.0);
278 CUDA_COMPLEX beta = MAKE_CUDA_COMPLEX(0.0,0.0);
279 stat = CUDA_GEMM(handle, CUBLAS_OP_C, CUBLAS_OP_N, Nevec_acc, Nu, 12*sites,
280 &alpha,
281 evec_acc, 12*sites, w_acc, 12*sites,
282 &beta,
283 c_acc, Nevec_acc);
284 //Glog << stat << std::endl;
285
286 grid->GlobalSumVector((CUDA_FLOAT*)c_acc,2*Nu*Nevec_acc);
287 alpha = MAKE_CUDA_COMPLEX(-1.0,0.0);
288 beta = MAKE_CUDA_COMPLEX(1.0,0.0);
289 stat = CUDA_GEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, 12*sites, Nu, Nevec_acc,
290 &alpha,
291 evec_acc, 12*sites, c_acc, Nevec_acc,
292 &beta,
293 w_acc, 12*sites);
294 //Glog << stat << std::endl;
295 }
296 for (int col=0; col<Nu; ++col) {
297// auto w_v = w[col].View();
298 autoView( w_v,w[col], AcceleratorWrite);
299 CUDA_COMPLEX *z = reinterpret_cast<CUDA_COMPLEX*>(&w_v[0]);
300 for (size_t row=0; row<sites*12; ++row) {
301 z[row] = w_acc[col*sites*12+row];
302 }
303 }
304 for (int i=0; i<Nu; ++i) {
305 assert(normalize(w[i],do_print)!=0);
306 }
307
308 Glog << "cuBLAS Zgemm done"<< std::endl;
309
310 cublasDestroy(handle);
311
312 Glog << "cuBLAS orthogonalize done" << std::endl;
313#else
314 Glog << "BLAS wrapper is not implemented" << std::endl;
315 exit(1);
316#endif
317 }
318
319
320
321 void orthogonalize_blockhead(Field& w, std::vector<Field>& evec, int k, int Nu)
322 {
323 typedef typename Field::scalar_type MyComplex;
324 MyComplex ip;
325
326 for(int j=0; j<k; ++j){
327 ip = innerProduct(evec[j*Nu],w);
328 w = w - ip * evec[j*Nu];
329 }
330 normalize(w);
331 }
332
333 void calc(std::vector<RealD>& eval,
334 std::vector<Field>& evec,
335 const std::vector<Field>& src, int& Nconv, LanczosType Impl)
336 {
337#ifdef GRID_CUDA
338 GridBase *grid = src[0].Grid();
339 grid->show_decomposition();
340
341// printf("GRID_CUDA\n");
342
343 // set eigenvector buffers for the cuBLAS calls
344 //const uint64_t nsimd = grid->Nsimd();
345 const uint64_t sites = grid->lSites();
346
347 cudaStat = cudaMallocManaged((void **)&w_acc, Nu*sites*12*sizeof(CUDA_COMPLEX));
348// Glog << "w_acc= "<<w_acc << " "<< cudaStat << std::endl;
349cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_COMPLEX));
350// Glog << "evec_acc= "<<evec_acc << " "<< cudaStat << std::endl;
351 cudaStat = cudaMallocManaged((void **)&c_acc, Nu*Nevec_acc*sizeof(CUDA_COMPLEX));
352// Glog << "c_acc= "<<c_acc << " "<< cudaStat << std::endl;
353#endif
354 switch (Impl) {
355 case LanczosType::irbl:
356 calc_irbl(eval,evec,src,Nconv);
357 break;
358
359 case LanczosType::rbl:
360 calc_rbl(eval,evec,src,Nconv);
361 break;
362 }
363#ifdef GRID_CUDA
364 // free eigenvector buffers for the cuBLAS calls
365 cudaFree(w_acc);
366 cudaFree(evec_acc);
367 cudaFree(c_acc);
368#endif
369 }
370
371 void calc_irbl(std::vector<RealD>& eval,
372 std::vector<Field>& evec,
373 const std::vector<Field>& src, int& Nconv)
374 {
375 std::string fname = std::string(cname+"::calc_irbl()");
376 GridBase *grid = evec[0].Grid();
377 assert(grid == src[0].Grid());
378 assert( Nu = src.size() );
379
380 Glog << std::string(74,'*') << std::endl;
381 Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl;
382 Glog << std::string(74,'*') << std::endl;
383 Glog <<" -- seek Nk = "<< Nk <<" vectors"<< std::endl;
384 Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl;
385 Glog <<" -- total Nm = "<< Nm <<" vectors"<< std::endl;
386 Glog <<" -- size of eval = "<< eval.size() << std::endl;
387 Glog <<" -- size of evec = "<< evec.size() << std::endl;
389 Glog << "Diagonalisation is Eigen "<< std::endl;
390#ifdef USE_LAPACK
391 } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) {
392 Glog << "Diagonalisation is LAPACK "<< std::endl;
393#endif
394 } else {
395 abort();
396 }
397 Glog << std::string(74,'*') << std::endl;
398
399 assert(Nm == evec.size() && Nm == eval.size());
400
401 std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
402 std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
403 std::vector<std::vector<ComplexD>> lmd2(Nu,std::vector<ComplexD>(Nm,0.0));
404 std::vector<std::vector<ComplexD>> lme2(Nu,std::vector<ComplexD>(Nm,0.0));
405 std::vector<RealD> eval2(Nm);
406 std::vector<RealD> resid(Nk);
407
408 Eigen::MatrixXcd Qt = Eigen::MatrixXcd::Zero(Nm,Nm);
409 Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
410
411 std::vector<int> Iconv(Nm);
412 std::vector<Field> B(Nm,grid); // waste of space replicating
413
414 std::vector<Field> f(Nu,grid);
415 std::vector<Field> f_copy(Nu,grid);
416 Field v(grid);
417
418 Nconv = 0;
419
420 RealD beta_k;
421
422 // set initial vector
423 for (int i=0; i<Nu; ++i) {
424 Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
425 evec[i] = src[i];
426 orthogonalize(evec[i],evec,i);
427 Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
428 }
429
430 // initial Nblock_k steps
431 for(int b=0; b<Nblock_k; ++b) blockwiseStep(lmd,lme,evec,f,f_copy,b);
432
433 // restarting loop begins
434 int iter;
435 for(iter = 0; iter<MaxIter; ++iter){
436
437 Glog <<"#Restart iteration = "<< iter << std::endl;
438 // additional (Nblock_m - Nblock_k) steps
439 for(int b=Nblock_k; b<Nblock_m; ++b) blockwiseStep(lmd,lme,evec,f,f_copy,b);
440
441 // getting eigenvalues
442 for(int u=0; u<Nu; ++u){
443 for(int k=0; k<Nm; ++k){
444 lmd2[u][k] = lmd[u][k];
445 lme2[u][k] = lme[u][k];
446 }
447 }
448 Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
449 diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
450 _sort.push(eval2,Nm);
451 Glog << "#Ritz value before shift: "<< std::endl;
452 for(int i=0; i<Nm; ++i){
453 std::cout.precision(13);
454 std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
455 std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
456 }
457
458 //----------------------------------------------------------------------
459 if ( Nm>Nk ) {
460 Glog <<" #Apply shifted QR transformations "<<std::endl;
461 //int k2 = Nk+Nu;
462 int k2 = Nk;
463
464 Eigen::MatrixXcd BTDM = Eigen::MatrixXcd::Identity(Nm,Nm);
465 Q = Eigen::MatrixXcd::Identity(Nm,Nm);
466
468
469 for(int ip=Nk; ip<Nm; ++ip){
470 shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
471 }
472
474
475 for(int i=0; i<k2; ++i) B[i] = 0.0;
476 for(int j=0; j<k2; ++j){
477 for(int k=0; k<Nm; ++k){
478 B[j].Checkerboard() = evec[k].Checkerboard();
479 B[j] += evec[k]*Q(k,j);
480 }
481 }
482 for(int i=0; i<k2; ++i) evec[i] = B[i];
483
484 // reconstruct initial vector for additional pole space
485 blockwiseStep(lmd,lme,evec,f,f_copy,Nblock_k-1);
486
487 // getting eigenvalues
488 for(int u=0; u<Nu; ++u){
489 for(int k=0; k<Nm; ++k){
490 lmd2[u][k] = lmd[u][k];
491 lme2[u][k] = lme[u][k];
492 }
493 }
494 Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
495 diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
496 _sort.push(eval2,Nk);
497 Glog << "#Ritz value after shift: "<< std::endl;
498 for(int i=0; i<Nk; ++i){
499 std::cout.precision(13);
500 std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
501 std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
502 }
503 }
504 //----------------------------------------------------------------------
505
506 // Convergence test
507 Glog <<" #Convergence test: "<<std::endl;
508 for(int k = 0; k<Nk; ++k) B[k]=0.0;
509 for(int j = 0; j<Nk; ++j){
510 for(int k = 0; k<Nk; ++k){
511 B[j].Checkerboard() = evec[k].Checkerboard();
512 B[j] += evec[k]*Qt(k,j);
513 }
514 }
515
516 Nconv = 0;
517 for(int i=0; i<Nk; ++i){
518
519 _Linop.HermOp(B[i],v);
520 RealD vnum = real(innerProduct(B[i],v)); // HermOp.
521 RealD vden = norm2(B[i]);
522 eval2[i] = vnum/vden;
523 v -= eval2[i]*B[i];
524 RealD vv = norm2(v);
525 resid[i] = vv;
526
527 std::cout.precision(13);
528 std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
529 std::cout << "eval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i];
530 std::cout << " resid^2 = "<< std::setw(20)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
531
532 // change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
533 //if( (vv<eresid*eresid) && (i == Nconv) ){
534 if (vv<eresid*eresid) {
535 Iconv[Nconv] = i;
536 ++Nconv;
537 }
538
539 } // i-loop end
540
541 Glog <<" #modes converged: "<<Nconv<<std::endl;
542 for(int i=0; i<Nconv; ++i){
543 std::cout.precision(13);
544 std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<Iconv[i]<<"] ";
545 std::cout << "eval_conv = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[Iconv[i]];
546 std::cout << " resid^2 = "<< std::setw(20)<< std::setiosflags(std::ios_base::right)<< resid[Iconv[i]]<< std::endl;
547 }
548
549 if ( Nconv>=Nstop ) break;
550
551 } // end of iter loop
552
553 Glog << std::string(74,'*') << std::endl;
554 if ( Nconv<Nstop ) {
555 Glog << fname + " NOT converged ; Summary :\n";
556 } else {
557 Glog << fname + " CONVERGED ; Summary :\n";
558 // Sort convered eigenpairs.
559 eval.resize(Nconv);
560 evec.resize(Nconv,grid);
561 for(int i=0; i<Nconv; ++i){
562 eval[i] = eval2[Iconv[i]];
563 evec[i] = B[Iconv[i]];
564 }
565 _sort.push(eval,evec,Nconv);
566 }
567 Glog << std::string(74,'*') << std::endl;
568 Glog << " -- Iterations = "<< iter << "\n";
569 //Glog << " -- beta(k) = "<< beta_k << "\n";
570 Glog << " -- Nconv = "<< Nconv << "\n";
571 Glog << std::string(74,'*') << std::endl;
572
573 }
574
575
576 void calc_rbl(std::vector<RealD>& eval,
577 std::vector<Field>& evec,
578 const std::vector<Field>& src, int& Nconv)
579 {
580 std::string fname = std::string(cname+"::calc_rbl()");
581 GridBase *grid = evec[0].Grid();
582 assert(grid == src[0].Grid());
583 assert( Nu = src.size() );
584
585 int Np = (Nm-Nk);
586 if (Np > 0 && MaxIter > 1) Np /= MaxIter;
587 int Nblock_p = Np/Nu;
588 for(int i=0;i< evec.size();i++) evec[0].Advise()=AdviseInfrequentUse;
589
590 Glog << std::string(74,'*') << std::endl;
591 Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl;
592 Glog << std::string(74,'*') << std::endl;
593 Glog <<" -- seek (min) Nk = "<< Nk <<" vectors"<< std::endl;
594 Glog <<" -- seek (inc) Np = "<< Np <<" vectors"<< std::endl;
595 Glog <<" -- seek (max) Nm = "<< Nm <<" vectors"<< std::endl;
596 Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl;
597 Glog <<" -- size of eval = "<< eval.size() << std::endl;
598 Glog <<" -- size of evec = "<< evec.size() << std::endl;
600 Glog << "Diagonalisation is Eigen "<< std::endl;
601#ifdef USE_LAPACK
602 } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) {
603 Glog << "Diagonalisation is LAPACK "<< std::endl;
604#endif
605 } else {
606 abort();
607 }
608 Glog << std::string(74,'*') << std::endl;
609
610 assert(Nm == evec.size() && Nm == eval.size());
611
612 std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
613 std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
614 std::vector<std::vector<ComplexD>> lmd2(Nu,std::vector<ComplexD>(Nm,0.0));
615 std::vector<std::vector<ComplexD>> lme2(Nu,std::vector<ComplexD>(Nm,0.0));
616 std::vector<RealD> eval2(Nk);
617 std::vector<RealD> resid(Nm);
618
619 Eigen::MatrixXcd Qt = Eigen::MatrixXcd::Zero(Nm,Nm);
620 Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
621
622 std::vector<int> Iconv(Nm);
623// int Ntest=Nu;
624// std::vector<Field> B(Nm,grid); // waste of space replicating
625 std::vector<Field> B(1,grid); // waste of space replicating
626
627 std::vector<Field> f(Nu,grid);
628 std::vector<Field> f_copy(Nu,grid);
629 Field v(grid);
630
631 Nconv = 0;
632
633// RealD beta_k;
634
635 // set initial vector
636 for (int i=0; i<Nu; ++i) {
637 Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
638 evec[i] = src[i];
639 orthogonalize(evec[i],evec,i);
640 Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
641 }
642// exit(-43);
643
644 // initial Nblock_k steps
645 for(int b=0; b<Nblock_k; ++b) blockwiseStep(lmd,lme,evec,f,f_copy,b);
646
647 // restarting loop begins
648 int iter;
649 int Nblock_l, Nblock_r;
650 int Nl, Nr;
651 int Nconv_guess = 0;
652
653 for(iter = 0; iter<MaxIter; ++iter){
654
655 Glog <<"#Restart iteration = "<< iter << std::endl;
656
657 Nblock_l = Nblock_k + iter*Nblock_p;
658 Nblock_r = Nblock_l + Nblock_p;
659 Nl = Nblock_l*Nu;
660 Nr = Nblock_r*Nu;
661 eval2.resize(Nr);
662
663 // additional Nblock_p steps
664 for(int b=Nblock_l; b<Nblock_r; ++b) blockwiseStep(lmd,lme,evec,f,f_copy,b);
665
666 // getting eigenvalues
667 for(int u=0; u<Nu; ++u){
668 for(int k=0; k<Nr; ++k){
669 lmd2[u][k] = lmd[u][k];
670 lme2[u][k] = lme[u][k];
671 }
672 }
673 Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
674 diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
675 _sort.push(eval2,Nr);
676 Glog << "#Ritz value: "<< std::endl;
677 for(int i=0; i<Nr; ++i){
678 std::cout.precision(13);
679 std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
680 std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
681 }
682
683 // Convergence test
684 Glog <<" #Convergence test: "<<std::endl;
685 Nconv = 0;
686 for(int j = 0; j<Nr; j+=Nconv_test_interval){
687 B[0]=0.0;
688 if ( j/Nconv_test_interval == Nconv ) {
689 Glog <<" #rotation for next check point evec"
690 << std::setw(4)<< std::setiosflags(std::ios_base::right)
691 << "["<< j <<"]" <<std::endl;
692 for(int k = 0; k<Nr; ++k){
693 B[0].Checkerboard() = evec[k].Checkerboard();
694 B[0] += evec[k]*Qt(k,j);
695 }
696
697 _Linop.HermOp(B[0],v);
698 RealD vnum = real(innerProduct(B[0],v)); // HermOp.
699 RealD vden = norm2(B[0]);
700 eval2[j] = vnum/vden;
701 v -= eval2[j]*B[0];
702 RealD vv = norm2(v);
703 resid[j] = vv;
704
705 std::cout.precision(13);
706 std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<j<<"] ";
707 std::cout << "eval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[j];
708 std::cout << " resid^2 = "<< std::setw(20)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
709
710 // change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
711 //if( (vv<eresid*eresid) && (i == Nconv) ){
712 if (vv<eresid*eresid) {
713 Iconv[Nconv] = j;
714 ++Nconv;
715 }
716 } else {
717 break;
718 }
719 } // j-loop end
720
721 Glog <<" #modes converged: "<<Nconv<<std::endl;
722 for(int i=0; i<Nconv; ++i){
723 std::cout.precision(13);
724 std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<Iconv[i]<<"] ";
725 std::cout << "eval_conv = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[Iconv[i]];
726 std::cout << " resid^2 = "<< std::setw(20)<< std::setiosflags(std::ios_base::right)<< resid[Iconv[i]]<< std::endl;
727 }
728
729 (Nconv > 0 ) ? Nconv_guess = 1 + (Nconv-1)*Nconv_test_interval : Nconv_guess = 0;
730 if ( Nconv_guess >= Nstop ) break;
731
732 } // end of iter loop
733
734 Glog << std::string(74,'*') << std::endl;
735 if ( Nconv_guess < Nstop ) {
736 Glog << fname + " NOT converged ; Summary :\n";
737 } else {
738 Glog << fname + " CONVERGED ; Summary :\n";
739 // Sort convered eigenpairs.
740 std::vector<Field> Btmp(Nstop,grid); // waste of space replicating
741
742 for(int i=0; i<Nstop; ++i){
743 Btmp[i]=0.;
744 for(int k = 0; k<Nr; ++k){
745 Btmp[i].Checkerboard() = evec[k].Checkerboard();
746 Btmp[i] += evec[k]*Qt(k,i);
747 }
748 _Linop.HermOp(Btmp[i],v);
749 RealD vnum = real(innerProduct(Btmp[i],v)); // HermOp.
750 RealD vden = norm2(Btmp[i]);
751// eval2[j] = vnum/vden;
752 v -= vnum/vden*Btmp[i];
753 RealD vv = norm2(v);
754// resid[j] = vv;
755
756 std::cout.precision(13);
757 std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
758 std::cout << "eval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< vnum/vden;
759 std::cout << " resid^2 = "<< std::setw(20)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
760 eval[i] = vnum/vden;
761 }
762 for(int i=0; i<Nstop; ++i) evec[i] = Btmp[i];
763 eval.resize(Nstop);
764 evec.resize(Nstop,grid);
765 _sort.push(eval,evec,Nstop);
766 }
767 Glog << std::string(74,'*') << std::endl;
768 Glog << " -- Iterations = "<< iter << "\n";
769 //Glog << " -- beta(k) = "<< beta_k << "\n";
770 Glog << " -- Nconv = "<< Nconv << "\n";
771 Glog << " -- Nconv (guess) = "<< Nconv_guess << "\n";
772 Glog << std::string(74,'*') << std::endl;
773
774 }
775
776private:
777 void blockwiseStep(std::vector<std::vector<ComplexD>>& lmd,
778 std::vector<std::vector<ComplexD>>& lme,
779 std::vector<Field>& evec,
780 std::vector<Field>& w,
781 std::vector<Field>& w_copy,
782 int b)
783 {
784 const RealD tiny = 1.0e-20;
785
786 int Nu = w.size();
787 int Nm = evec.size();
788 assert( b < Nm/Nu );
789// GridCartesian *grid = evec[0]._grid;
790
791 // converts block index to full indicies for an interval [L,R)
792 int L = Nu*b;
793 int R = Nu*(b+1);
794
795 Real beta;
796
797 Glog << "Using split grid"<< std::endl;
798// LatticeGaugeField s_Umu(SGrid);
799 assert((Nu%mrhs)==0);
800 std::vector<Field> in(mrhs,f_grid);
801
802 Field s_in(sf_grid);
803 Field s_out(sf_grid);
804 // unnecessary copy. Can or should it be avoided?
805 int k_start = 0;
806while ( k_start < Nu) {
807 Glog << "k_start= "<<k_start<< std::endl;
808 for (int u=0; u<mrhs; ++u) in[u] = evec[L+k_start+u];
809Glog << "Split "<< std::endl;
810 Grid_split(in, s_in);
811Glog << "Split done "<< std::endl;
812 _poly(_SLinop,s_in,s_out);
813Glog << "Unsplit "<< std::endl;
814 Grid_unsplit(in,s_out);
815Glog << "Unsplit done "<< std::endl;
816 for (int u=0; u<mrhs; ++u) w[k_start+u] = in[u];
817 k_start +=mrhs;
818}
819 Glog << "Using split grid done "<< std::endl;
820
821// test split in the first iteration
822if(split_test){
823 Glog << "Split grid testing "<< std::endl;
824 // 3. wk:=Avkβkv_{k1}
825 for (int k=L, u=0; k<R; ++k, ++u) {
826 _poly(_Linop,evec[k],w_copy[u]);
827 }
828 for (int u=0; u<Nu; ++u) {
829 w_copy[u] -= w[u];
830 Glog << "diff(split - non_split) "<<u<<" " << norm2(w_copy[u]) << std::endl;
831 Glog << "Split grid testing done"<< std::endl;
832 }
833 split_test=0;
834}
835 Glog << "Poly done"<< std::endl;
836 Glog << "LinAlg "<< std::endl;
837// exit(-42);
838
839 if (b>0) {
840 for (int u=0; u<Nu; ++u) {
841 //for (int k=L-Nu; k<L; ++k) {
842 for (int k=L-Nu+u; k<L; ++k) {
843 w[u] = w[u] - evec[k] * conjugate(lme[u][k]);
844 }
845 }
846 }
847
848 // 4. αk:=(vk,wk)
849 //for (int u=0; u<Nu; ++u) {
850 // for (int k=L; k<R; ++k) {
851 // lmd[u][k] = innerProduct(evec[k],w[u]); // lmd = transpose of alpha
852 // }
853 // lmd[u][L+u] = real(lmd[u][L+u]); // force diagonal to be real
854 //}
855 for (int u=0; u<Nu; ++u) {
856 for (int k=L+u; k<R; ++k) {
857 lmd[u][k] = innerProduct(evec[k],w[u]); // lmd = transpose of alpha
858// Glog <<"lmd "<<u<<" "<<k<<lmd[u][k] -conjugate(innerProduct(evec[u+L],w[k-L]))<<std::endl;
859 lmd[k-L][u+L] = conjugate(lmd[u][k]); // force hermicity
860 }
861 lmd[u][L+u] = real(lmd[u][L+u]); // force diagonal to be real
862 }
863
864 // 5. wk:=wk−αkvk
865 for (int u=0; u<Nu; ++u) {
866 for (int k=L; k<R; ++k) {
867 w[u] = w[u] - evec[k]*lmd[u][k];
868 }
869 w_copy[u] = w[u];
870 }
871 Glog << "LinAlg done"<< std::endl;
872
873 // In block version, the steps 6 and 7 in Lanczos construction is
874 // replaced by the QR decomposition of new basis block.
875 // It results block version beta and orthonormal block basis.
876 // Here, QR decomposition is done by using Gram-Schmidt.
877 for (int u=0; u<Nu; ++u) {
878 for (int k=L; k<R; ++k) {
879 lme[u][k] = 0.0;
880 }
881 }
882
883 // re-orthogonalization for numerical stability
884#if 1
885 Glog << "Gram Schmidt"<< std::endl;
886 orthogonalize(w,Nu,evec,R);
887#else
888 Glog << "Gram Schmidt using cublas"<< std::endl;
889 orthogonalize_blas(w,evec,R);
890#endif
891 // QR part
892 for (int u=1; u<Nu; ++u) {
893 orthogonalize(w[u],w,u);
894 }
895 Glog << "Gram Schmidt done "<< std::endl;
896
897 Glog << "LinAlg "<< std::endl;
898 for (int u=0; u<Nu; ++u) {
899 //for (int v=0; v<Nu; ++v) {
900 for (int v=u; v<Nu; ++v) {
901 lme[u][L+v] = innerProduct(w[u],w_copy[v]);
902 }
903 lme[u][L+u] = real(lme[u][L+u]); // force diagonal to be real
904 }
905 //lme[0][L] = beta;
906
907 for (int u=0; u<Nu; ++u) {
908// Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
909 assert (!isnan(norm2(w[u])));
910 for (int k=L+u; k<R; ++k) {
911 Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
912 }
913 }
914 Glog << "LinAlg done "<< std::endl;
915
916 if (b < Nm/Nu-1) {
917 for (int u=0; u<Nu; ++u) {
918 evec[R+u] = w[u];
919 }
920 }
921
922 }
923
924
925 void diagonalize_Eigen(std::vector<RealD>& eval,
926 std::vector<std::vector<ComplexD>>& lmd,
927 std::vector<std::vector<ComplexD>>& lme,
928 int Nu, int Nk, int Nm,
929 Eigen::MatrixXcd & Qt, // Nm x Nm
930 GridBase *grid)
931 {
932 assert( Nk%Nu == 0 && Nm%Nu == 0 );
933 assert( Nk <= Nm );
934 Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
935
936 for ( int u=0; u<Nu; ++u ) {
937 for (int k=0; k<Nk; ++k ) {
938 BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k];
939 }
940 }
941
942 for ( int u=0; u<Nu; ++u ) {
943 for (int k=Nu; k<Nk; ++k ) {
944 BlockTriDiag(k-Nu,u+(k/Nu)*Nu) = conjugate(lme[u][k-Nu]);
945 BlockTriDiag(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
946 }
947 }
948 //std::cout << BlockTriDiag << std::endl;
949
950 Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(BlockTriDiag);
951
952 for (int i = 0; i < Nk; i++) {
953 eval[Nk-1-i] = eigensolver.eigenvalues()(i);
954 }
955 for (int i = 0; i < Nk; i++) {
956 for (int j = 0; j < Nk; j++) {
957 Qt(j,Nk-1-i) = eigensolver.eigenvectors()(j,i);
958 //Qt(Nk-1-i,j) = eigensolver.eigenvectors()(i,j);
959 //Qt(i,j) = eigensolver.eigenvectors()(i,j);
960 }
961 }
962 }
963
964#ifdef USE_LAPACK
965 void diagonalize_lapack(std::vector<RealD>& eval,
966 std::vector<std::vector<ComplexD>>& lmd,
967 std::vector<std::vector<ComplexD>>& lme,
968 int Nu, int Nk, int Nm,
969 Eigen::MatrixXcd & Qt, // Nm x Nm
970 GridBase *grid)
971 {
972 Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl;
973 assert( Nk%Nu == 0 && Nm%Nu == 0 );
974 assert( Nk <= Nm );
975 Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
976
977 for ( int u=0; u<Nu; ++u ) {
978 for (int k=0; k<Nk; ++k ) {
979// Glog << "lmd "<<u<<" "<<k<<" "<<lmd[u][k] -conjugate(lmd[u][k])<<std::endl;
980 BlockTriDiag(k,u+(k/Nu)*Nu) = lmd[u][k];
981 }
982 }
983
984 for ( int u=0; u<Nu; ++u ) {
985 for (int k=Nu; k<Nk; ++k ) {
986// Glog << "lme "<<u<<" "<<k<<" "<<lme[u][k] -conjugate(lme[u][k])<<std::endl;
987 BlockTriDiag(k-Nu,u+(k/Nu)*Nu) = conjugate(lme[u][k-Nu]);
988 BlockTriDiag(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
989 }
990 }
991 //std::cout << BlockTriDiag << std::endl;
992//#ifdef USE_LAPACK
993 const int size = Nm;
994 MKL_INT NN = Nk;
995// double evals_tmp[NN];
996// double evec_tmp[NN][NN];
997 double *evals_tmp = (double *) malloc(NN*sizeof(double));
998 MKL_Complex16 *evec_tmp = (MKL_Complex16 *) malloc(NN*NN*sizeof(MKL_Complex16));
999 MKL_Complex16 *DD = (MKL_Complex16 *) malloc(NN*NN*sizeof(MKL_Complex16));
1000 for (int i = 0; i< NN; i++) {
1001 for (int j = 0; j <NN ; j++) {
1002 evec_tmp[i*NN+j].real=0.;
1003 evec_tmp[i*NN+j].imag=0.;
1004 DD[i*NN+j].real=BlockTriDiag(i,j).real();
1005 DD[i*NN+j].imag=BlockTriDiag(i,j).imag();
1006 }
1007 }
1008 MKL_INT evals_found;
1009 MKL_INT lwork = (3*NN);
1010 MKL_INT lrwork = (24*NN);
1011 MKL_INT liwork = NN*10 ;
1012 MKL_INT iwork[liwork];
1013 double rwork[lrwork];
1014// double work[lwork];
1015 MKL_Complex16 *work = (MKL_Complex16 *) malloc(lwork*sizeof(MKL_Complex16));
1016 MKL_INT isuppz[2*NN];
1017 char jobz = 'V'; // calculate evals & evecs
1018 char range = 'I'; // calculate all evals
1019 // char range = 'A'; // calculate all evals
1020 char uplo = 'U'; // refer to upper half of original matrix
1021 char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
1022 int ifail[NN];
1023 MKL_INT info;
1024 int total = grid->_Nprocessors;
1025 int node = grid->_processor;
1026 int interval = (NN/total)+1;
1027 double vl = 0.0, vu = 0.0;
1028 MKL_INT il = interval*node+1 , iu = interval*(node+1);
1029 if (iu > NN) iu=NN;
1030 Glog << "node "<<node<<"il "<<il<<"iu "<<iu<<std::endl;
1031 double tol = 0.0;
1032 if (1) {
1033 memset(evals_tmp,0,sizeof(double)*NN);
1034 if ( il <= NN){
1035 zheevr(&jobz, &range, &uplo, &NN,
1036 DD, &NN,
1037 &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
1038 &tol, // tolerance
1039 &evals_found, evals_tmp, (MKL_Complex16*)evec_tmp, &NN,
1040 isuppz,
1041 work, &lwork,
1042 rwork, &lrwork,
1043 iwork, &liwork,
1044 &info);
1045// (double*)EE,
1046 for (int i = iu-1; i>= il-1; i--){
1047 evals_tmp[i] = evals_tmp[i - (il-1)];
1048 if (il>1) evals_tmp[i-(il-1)]=0.;
1049 for (int j = 0; j< NN; j++){
1050 evec_tmp[i*NN+j] = evec_tmp[(i - (il-1))*NN+j];
1051 if (il>1) {
1052 evec_tmp[(i-(il-1))*NN+j].imag=0.;
1053 evec_tmp[(i-(il-1))*NN+j].real=0.;
1054 }
1055 }
1056 }
1057 }
1058 {
1059 grid->GlobalSumVector(evals_tmp,NN);
1060 grid->GlobalSumVector((double*)evec_tmp,2*NN*NN);
1061 }
1062 }
1063 for (int i = 0; i < Nk; i++)
1064 eval[Nk-1-i] = evals_tmp[i];
1065 for (int i = 0; i < Nk; i++) {
1066 for (int j = 0; j < Nk; j++) {
1067// Qt(j,Nk-1-i) = eigensolver.eigenvectors()(j,i);
1068 Qt(j,Nk-1-i)=std::complex<double>
1069 ( evec_tmp[i*Nk+j].real,
1070 evec_tmp[i*Nk+j].imag);
1071// ( evec_tmp[(Nk-1-j)*Nk+Nk-1-i].real,
1072// evec_tmp[(Nk-1-j)*Nk+Nk-1-i].imag);
1073
1074 }
1075 }
1076
1077if (1){
1078 Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(BlockTriDiag);
1079
1080 for (int i = 0; i < Nk; i++) {
1081 Glog << "eval = "<<i<<" " <<eval[Nk-1-i] <<" "<< eigensolver.eigenvalues()(i) <<std::endl;
1082// eval[Nk-1-i] = eigensolver.eigenvalues()(i);
1083 }
1084 for (int i = 0; i < Nk; i++) {
1085 for (int j = 0; j < Nk; j++) {
1086// Qt(j,Nk-1-i) = eigensolver.eigenvectors()(j,i);
1087// Glog<<"Qt "<<j<<" "<<Nk-1-i<<" = " <<Qt(j,Nk-1-i) <<" "<<eigensolver.eigenvectors()(j,i) <<std::endl;
1088 MKL_Complex16 tmp = evec_tmp[i*Nk+j];
1089// Glog<<"Qt "<<j<<" "<<Nk-1-i<<" = " <<evec_tmp[(Nk-1-j)*Nk+Nk-1-i].real<<" "<<
1090//evec_tmp[(Nk-1-j)*Nk+Nk-1-i].imag <<" "<<eigensolver.eigenvectors()(j,i) <<std::endl;
1091 if ( (i<5)&& (j<5))
1092 Glog<<"Qt "<<j<<" "<<Nk-1-i<<" = " << norm(Qt(j,Nk-1-i))<<" "<<
1093 norm(eigensolver.eigenvectors()(j,i)) <<std::endl;
1094 }
1095 }
1096}
1097// exit(-43);
1098
1099 free (evals_tmp);
1100 free (evec_tmp);
1101 free (DD);
1102 free (work);
1103 }
1104#endif
1105
1106
1107 void diagonalize(std::vector<RealD>& eval,
1108 std::vector<std::vector<ComplexD>>& lmd,
1109 std::vector<std::vector<ComplexD>>& lme,
1110 int Nu, int Nk, int Nm,
1111 Eigen::MatrixXcd & Qt,
1112 GridBase *grid)
1113 {
1114 Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
1116 diagonalize_Eigen(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
1117#ifdef USE_LAPACK
1118 } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) {
1119 diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
1120#endif
1121 } else {
1122 assert(0);
1123 }
1124 }
1125
1126
1128 std::vector<std::vector<ComplexD>>& lmd,
1129 std::vector<std::vector<ComplexD>>& lme,
1130 int Nu, int Nb, int Nk, int Nm,
1131 Eigen::MatrixXcd& M)
1132 {
1133 //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
1134 assert( Nk%Nu == 0 && Nm%Nu == 0 );
1135 assert( Nk <= Nm );
1136 M = Eigen::MatrixXcd::Zero(Nk,Nk);
1137
1138 // rearrange
1139 for ( int u=0; u<Nu; ++u ) {
1140 for (int k=0; k<Nk; ++k ) {
1141 M(k,u+(k/Nu)*Nu) = lmd[u][k];
1142 }
1143 }
1144
1145 for ( int u=0; u<Nu; ++u ) {
1146 for (int k=Nu; k<Nk; ++k ) {
1147 M(k-Nu,u+(k/Nu)*Nu) = conjugate(lme[u][k-Nu]);
1148 M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
1149 }
1150 }
1151 //Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl;
1152 }
1153
1154
1156 std::vector<std::vector<ComplexD>>& lmd,
1157 std::vector<std::vector<ComplexD>>& lme,
1158 int Nu, int Nb, int Nk, int Nm,
1159 Eigen::MatrixXcd& M)
1160 {
1161 //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
1162 assert( Nk%Nu == 0 && Nm%Nu == 0 );
1163 assert( Nk <= Nm );
1164
1165 // rearrange
1166 for ( int u=0; u<Nu; ++u ) {
1167 for (int k=0; k<Nk; ++k ) {
1168 lmd[u][k] = M(k,u+(k/Nu)*Nu);
1169 }
1170 }
1171
1172 for ( int u=0; u<Nu; ++u ) {
1173 for (int k=Nu; k<Nk; ++k ) {
1174 lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
1175 }
1176 }
1177 //Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl;
1178 }
1179
1180
1181 // assume the input matrix M is a band matrix
1182 void shiftedQRDecompEigen(Eigen::MatrixXcd& M, int Nu, int Nm,
1183 RealD Dsh,
1184 Eigen::MatrixXcd& Qprod)
1185 {
1186 //Glog << "shiftedQRDecompEigen() begin" << '\n';
1187 Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
1188 Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
1189 Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
1190
1191 Mtmp = M;
1192 for (int i=0; i<Nm; ++i ) {
1193 Mtmp(i,i) = M(i,i) - Dsh;
1194 }
1195
1196 Eigen::HouseholderQR<Eigen::MatrixXcd> QRD(Mtmp);
1197 Q = QRD.householderQ();
1198 R = QRD.matrixQR(); // upper triangular part is the R matrix.
1199 // lower triangular part used to represent series
1200 // of Q sequence.
1201
1202 // equivalent operation of Qprod *= Q
1203 //M = Eigen::MatrixXcd::Zero(Nm,Nm);
1204
1205 //for (int i=0; i<Nm; ++i) {
1206 // for (int j=0; j<Nm-2*(Nu+1); ++j) {
1207 // for (int k=0; k<2*(Nu+1)+j; ++k) {
1208 // M(i,j) += Qprod(i,k)*Q(k,j);
1209 // }
1210 // }
1211 //}
1212 //for (int i=0; i<Nm; ++i) {
1213 // for (int j=Nm-2*(Nu+1); j<Nm; ++j) {
1214 // for (int k=0; k<Nm; ++k) {
1215 // M(i,j) += Qprod(i,k)*Q(k,j);
1216 // }
1217 // }
1218 //}
1219
1220 Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
1221
1222 for (int i=0; i<Nm; ++i) {
1223 for (int j=0; j<Nm-(Nu+1); ++j) {
1224 for (int k=0; k<Nu+1+j; ++k) {
1225 Mtmp(i,j) += Qprod(i,k)*Q(k,j);
1226 }
1227 }
1228 }
1229 for (int i=0; i<Nm; ++i) {
1230 for (int j=Nm-(Nu+1); j<Nm; ++j) {
1231 for (int k=0; k<Nm; ++k) {
1232 Mtmp(i,j) += Qprod(i,k)*Q(k,j);
1233 }
1234 }
1235 }
1236
1237 //static int ntimes = 2;
1238 //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
1239 // for (int i=ntimes*Nu+j; i<Nm; ++i) {
1240 // Mtmp(i,j) = 0.0;
1241 // }
1242 //}
1243 //ntimes++;
1244
1245 Qprod = Mtmp;
1246
1247 // equivalent operation of M = Q.adjoint()*(M*Q)
1248 Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
1249
1250 for (int a=0, i=0, kmax=0; a<Nu+1; ++a) {
1251 for (int j=0; j<Nm-a; ++j) {
1252 i = j+a;
1253 kmax = (Nu+1)+j;
1254 if (kmax > Nm) kmax = Nm;
1255 for (int k=i; k<kmax; ++k) {
1256 Mtmp(i,j) += R(i,k)*Q(k,j);
1257 }
1258 Mtmp(j,i) = conj(Mtmp(i,j));
1259 }
1260 }
1261
1262 for (int i=0; i<Nm; ++i) {
1263 Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
1264 }
1265
1266 M = Mtmp;
1267
1268 //M = Q.adjoint()*(M*Q);
1269 //for (int i=0; i<Nm; ++i) {
1270 // for (int j=0; j<Nm; ++j) {
1271 // if (i==j) M(i,i) = real(M(i,i));
1272 // if (j>i) M(i,j) = conj(M(j,i));
1273 // if (i-j > Nu || j-i > Nu) M(i,j) = 0.;
1274 // }
1275 //}
1276
1277 //Glog << "shiftedQRDecompEigen() end" << endl;
1278 }
1279
1281 {
1282 Eigen::MatrixXd A = Eigen::MatrixXd::Zero(3,3);
1283 Eigen::MatrixXd Q = Eigen::MatrixXd::Zero(3,3);
1284 Eigen::MatrixXd R = Eigen::MatrixXd::Zero(3,3);
1285 Eigen::MatrixXd P = Eigen::MatrixXd::Zero(3,3);
1286
1287 A(0,0) = 12.0;
1288 A(0,1) = -51.0;
1289 A(0,2) = 4.0;
1290 A(1,0) = 6.0;
1291 A(1,1) = 167.0;
1292 A(1,2) = -68.0;
1293 A(2,0) = -4.0;
1294 A(2,1) = 24.0;
1295 A(2,2) = -41.0;
1296
1297 Glog << "matrix A before ColPivHouseholder" << std::endl;
1298 for ( int i=0; i<3; i++ ) {
1299 for ( int j=0; j<3; j++ ) {
1300 Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n';
1301 }
1302 }
1303 Glog << std::endl;
1304
1305 Eigen::ColPivHouseholderQR<Eigen::MatrixXd> QRD(A);
1306
1307 Glog << "matrix A after ColPivHouseholder" << std::endl;
1308 for ( int i=0; i<3; i++ ) {
1309 for ( int j=0; j<3; j++ ) {
1310 Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n';
1311 }
1312 }
1313 Glog << std::endl;
1314
1315 Glog << "HouseholderQ with sequence lenth = nonzeroPiviots" << std::endl;
1316 Q = QRD.householderQ().setLength(QRD.nonzeroPivots());
1317 for ( int i=0; i<3; i++ ) {
1318 for ( int j=0; j<3; j++ ) {
1319 Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n';
1320 }
1321 }
1322 Glog << std::endl;
1323
1324 Glog << "HouseholderQ with sequence lenth = 1" << std::endl;
1325 Q = QRD.householderQ().setLength(1);
1326 for ( int i=0; i<3; i++ ) {
1327 for ( int j=0; j<3; j++ ) {
1328 Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n';
1329 }
1330 }
1331 Glog << std::endl;
1332
1333 Glog << "HouseholderQ with sequence lenth = 2" << std::endl;
1334 Q = QRD.householderQ().setLength(2);
1335 for ( int i=0; i<3; i++ ) {
1336 for ( int j=0; j<3; j++ ) {
1337 Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n';
1338 }
1339 }
1340 Glog << std::endl;
1341
1342 Glog << "matrixR" << std::endl;
1343 R = QRD.matrixR();
1344 for ( int i=0; i<3; i++ ) {
1345 for ( int j=0; j<3; j++ ) {
1346 Glog << "R[" << i << "," << j << "] = " << R(i,j) << '\n';
1347 }
1348 }
1349 Glog << std::endl;
1350
1351 Glog << "rank = " << QRD.rank() << std::endl;
1352 Glog << "threshold = " << QRD.threshold() << std::endl;
1353
1354 Glog << "matrixP" << std::endl;
1355 P = QRD.colsPermutation();
1356 for ( int i=0; i<3; i++ ) {
1357 for ( int j=0; j<3; j++ ) {
1358 Glog << "P[" << i << "," << j << "] = " << P(i,j) << '\n';
1359 }
1360 }
1361 Glog << std::endl;
1362
1363
1364 Glog << "QR decomposition without column pivoting" << std::endl;
1365
1366 A(0,0) = 12.0;
1367 A(0,1) = -51.0;
1368 A(0,2) = 4.0;
1369 A(1,0) = 6.0;
1370 A(1,1) = 167.0;
1371 A(1,2) = -68.0;
1372 A(2,0) = -4.0;
1373 A(2,1) = 24.0;
1374 A(2,2) = -41.0;
1375
1376 Glog << "matrix A before Householder" << std::endl;
1377 for ( int i=0; i<3; i++ ) {
1378 for ( int j=0; j<3; j++ ) {
1379 Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n';
1380 }
1381 }
1382 Glog << std::endl;
1383
1384 Eigen::HouseholderQR<Eigen::MatrixXd> QRDplain(A);
1385
1386 Glog << "HouseholderQ" << std::endl;
1387 Q = QRDplain.householderQ();
1388 for ( int i=0; i<3; i++ ) {
1389 for ( int j=0; j<3; j++ ) {
1390 Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n';
1391 }
1392 }
1393 Glog << std::endl;
1394
1395 Glog << "matrix A after Householder" << std::endl;
1396 for ( int i=0; i<3; i++ ) {
1397 for ( int j=0; j<3; j++ ) {
1398 Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n';
1399 }
1400 }
1401 Glog << std::endl;
1402 }
1403
1404 };
1405}
1406#undef Glog
1407#undef USE_LAPACK
1408#undef CUDA_COMPLEX
1409#undef CUDA_FLOAT
1410#undef MAKE_CUDA_COMPLEX
1411#undef CUDA_GEMM
1412#endif
#define conj(a, b, i)
accelerator_inline Grid_simd< S, V > sqrt(const Grid_simd< S, V > &r)
#define MAKE_CUDA_COMPLEX
B
accelerator_inline sobj eval(const uint64_t ss, const sobj &arg)
Definition Lattice_ET.h:103
Lattice< vobj > real(const Lattice< vobj > &lhs)
Lattice< vobj > imag(const Lattice< vobj > &lhs)
Lattice< vobj > conjugate(const Lattice< vobj > &lhs)
ComplexD innerProduct(const Lattice< vobj > &left, const Lattice< vobj > &right)
RealD norm2(const Lattice< vobj > &arg)
void Grid_unsplit(std::vector< Lattice< Vobj > > &full, Lattice< Vobj > &split)
void Grid_split(std::vector< Lattice< Vobj > > &full, Lattice< Vobj > &split)
#define autoView(l_v, l, mode)
@ AcceleratorWrite
@ AdviseInfrequentUse
RealF Real
Definition Simd.h:65
std::complex< RealD > ComplexD
Definition Simd.h:79
double RealD
Definition Simd.h:61
void GlobalSumVector(RealF *, int N)
int lSites(void) const
void show_decomposition()
void calc(std::vector< RealD > &eval, std::vector< Field > &evec, const std::vector< Field > &src, int &Nconv, LanczosType Impl)
void orthogonalize(Field &w, std::vector< Field > &evec, int k, int if_print=0)
void orthogonalize(std::vector< Field > &w, int _Nu, std::vector< Field > &evec, int k, int if_print=0)
void diagonalize_Eigen(std::vector< RealD > &eval, std::vector< std::vector< ComplexD > > &lmd, std::vector< std::vector< ComplexD > > &lme, int Nu, int Nk, int Nm, Eigen::MatrixXcd &Qt, GridBase *grid)
void packHermitBlockTriDiagMatfromEigen(std::vector< std::vector< ComplexD > > &lmd, std::vector< std::vector< ComplexD > > &lme, int Nu, int Nb, int Nk, int Nm, Eigen::MatrixXcd &M)
static RealD normalize(Field &v, int if_print=0)
void reorthogonalize(Field &w, std::vector< Field > &evec, int k)
void orthogonalize_blockhead(Field &w, std::vector< Field > &evec, int k, int Nu)
void calc_rbl(std::vector< RealD > &eval, std::vector< Field > &evec, const std::vector< Field > &src, int &Nconv)
void shiftedQRDecompEigen(Eigen::MatrixXcd &M, int Nu, int Nm, RealD Dsh, Eigen::MatrixXcd &Qprod)
void calc_irbl(std::vector< RealD > &eval, std::vector< Field > &evec, const std::vector< Field > &src, int &Nconv)
void diagonalize(std::vector< RealD > &eval, std::vector< std::vector< ComplexD > > &lmd, std::vector< std::vector< ComplexD > > &lme, int Nu, int Nk, int Nm, Eigen::MatrixXcd &Qt, GridBase *grid)
ImplicitlyRestartedBlockLanczos(LinearOperatorBase< Field > &Linop, LinearOperatorBase< Field > &SLinop, GridRedBlackCartesian *FrbGrid, GridRedBlackCartesian *SFrbGrid, int _mrhs, OperatorFunction< Field > &poly, int _Nstop, int _Nconv_test_interval, int _Nu, int _Nk, int _Nm, RealD _eresid, int _MaxIter, IRBLdiagonalisation _diagonalisation=IRBLdiagonaliseWithEigen)
void blockwiseStep(std::vector< std::vector< ComplexD > > &lmd, std::vector< std::vector< ComplexD > > &lme, std::vector< Field > &evec, std::vector< Field > &w, std::vector< Field > &w_copy, int b)
void unpackHermitBlockTriDiagMatToEigen(std::vector< std::vector< ComplexD > > &lmd, std::vector< std::vector< ComplexD > > &lme, int Nu, int Nb, int Nk, int Nm, Eigen::MatrixXcd &M)
void orthogonalize_blas(std::vector< Field > &w, std::vector< Field > &evec, int R, int do_print=0)
static bool less_lmd(RealD left, RealD right)
bool saturated(RealD lmd, RealD thrs)
static bool less_pair(std::pair< RealD, Field const * > &left, std::pair< RealD, Field const * > &right)
void push(std::vector< RealD > &lmd, std::vector< Field > &evec, int N)
void push(std::vector< RealD > &lmd, int N)