293 const FermionField& in,
295 const CloverDiagonalField& diagonal,
296 const CloverTriangleField& triangle) {
302 typedef SiteSpinor CalcSpinor;
304#if defined(A64FX) || defined(A64FXFIXEDSIZE)
305#define PREFETCH_CLOVER(BASE) { \
307 int pf_dist_L1 = 1; \
308 int pf_dist_L2 = -5; \
310 if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \
311 base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \
312 svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL1STRM); \
313 svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL1STRM); \
314 svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL1STRM); \
315 svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL1STRM); \
316 svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
317 svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
320 if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \
321 base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \
322 svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL2STRM); \
323 svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL2STRM); \
324 svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL2STRM); \
325 svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL2STRM); \
326 svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
327 svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
359#define PREFETCH_CLOVER(BASE)
362 const uint64_t NN = Nsite * Ls;
368 CalcSpinor in_t = in_v[sF];
369 auto diag_t = diagonal_v[sU];
370 auto triangle_t = triangle_v[sU];
375 auto in_cc_0_0 =
conjugate(in_t()(0)(0));
376 auto in_cc_0_1 =
conjugate(in_t()(0)(1));
377 auto in_cc_0_2 =
conjugate(in_t()(0)(2));
378 auto in_cc_1_0 =
conjugate(in_t()(1)(0));
379 auto in_cc_1_1 =
conjugate(in_t()(1)(1));
381 res()(0)(0) = diag_t()(0)( 0) * in_t()(0)(0)
382 + triangle_t()(0)( 0) * in_t()(0)(1)
383 + triangle_t()(0)( 1) * in_t()(0)(2)
384 + triangle_t()(0)( 2) * in_t()(1)(0)
385 + triangle_t()(0)( 3) * in_t()(1)(1)
386 + triangle_t()(0)( 4) * in_t()(1)(2);
388 res()(0)(1) = triangle_t()(0)( 0) * in_cc_0_0;
389 res()(0)(1) = diag_t()(0)( 1) * in_t()(0)(1)
390 + triangle_t()(0)( 5) * in_t()(0)(2)
391 + triangle_t()(0)( 6) * in_t()(1)(0)
392 + triangle_t()(0)( 7) * in_t()(1)(1)
393 + triangle_t()(0)( 8) * in_t()(1)(2)
396 res()(0)(2) = triangle_t()(0)( 1) * in_cc_0_0
397 + triangle_t()(0)( 5) * in_cc_0_1;
398 res()(0)(2) = diag_t()(0)( 2) * in_t()(0)(2)
399 + triangle_t()(0)( 9) * in_t()(1)(0)
400 + triangle_t()(0)(10) * in_t()(1)(1)
401 + triangle_t()(0)(11) * in_t()(1)(2)
404 res()(1)(0) = triangle_t()(0)( 2) * in_cc_0_0
405 + triangle_t()(0)( 6) * in_cc_0_1
406 + triangle_t()(0)( 9) * in_cc_0_2;
407 res()(1)(0) = diag_t()(0)( 3) * in_t()(1)(0)
408 + triangle_t()(0)(12) * in_t()(1)(1)
409 + triangle_t()(0)(13) * in_t()(1)(2)
412 res()(1)(1) = triangle_t()(0)( 3) * in_cc_0_0
413 + triangle_t()(0)( 7) * in_cc_0_1
414 + triangle_t()(0)(10) * in_cc_0_2
415 + triangle_t()(0)(12) * in_cc_1_0;
416 res()(1)(1) = diag_t()(0)( 4) * in_t()(1)(1)
417 + triangle_t()(0)(14) * in_t()(1)(2)
420 res()(1)(2) = triangle_t()(0)( 4) * in_cc_0_0
421 + triangle_t()(0)( 8) * in_cc_0_1
422 + triangle_t()(0)(11) * in_cc_0_2
423 + triangle_t()(0)(13) * in_cc_1_0
424 + triangle_t()(0)(14) * in_cc_1_1;
425 res()(1)(2) = diag_t()(0)( 5) * in_t()(1)(2)
428 vstream(out_v[sF]()(0)(0), res()(0)(0));
429 vstream(out_v[sF]()(0)(1), res()(0)(1));
430 vstream(out_v[sF]()(0)(2), res()(0)(2));
431 vstream(out_v[sF]()(1)(0), res()(1)(0));
432 vstream(out_v[sF]()(1)(1), res()(1)(1));
433 vstream(out_v[sF]()(1)(2), res()(1)(2));
438 auto in_cc_2_0 =
conjugate(in_t()(2)(0));
439 auto in_cc_2_1 =
conjugate(in_t()(2)(1));
440 auto in_cc_2_2 =
conjugate(in_t()(2)(2));
441 auto in_cc_3_0 =
conjugate(in_t()(3)(0));
442 auto in_cc_3_1 =
conjugate(in_t()(3)(1));
444 res()(2)(0) = diag_t()(1)( 0) * in_t()(2)(0)
445 + triangle_t()(1)( 0) * in_t()(2)(1)
446 + triangle_t()(1)( 1) * in_t()(2)(2)
447 + triangle_t()(1)( 2) * in_t()(3)(0)
448 + triangle_t()(1)( 3) * in_t()(3)(1)
449 + triangle_t()(1)( 4) * in_t()(3)(2);
451 res()(2)(1) = triangle_t()(1)( 0) * in_cc_2_0;
452 res()(2)(1) = diag_t()(1)( 1) * in_t()(2)(1)
453 + triangle_t()(1)( 5) * in_t()(2)(2)
454 + triangle_t()(1)( 6) * in_t()(3)(0)
455 + triangle_t()(1)( 7) * in_t()(3)(1)
456 + triangle_t()(1)( 8) * in_t()(3)(2)
459 res()(2)(2) = triangle_t()(1)( 1) * in_cc_2_0
460 + triangle_t()(1)( 5) * in_cc_2_1;
461 res()(2)(2) = diag_t()(1)( 2) * in_t()(2)(2)
462 + triangle_t()(1)( 9) * in_t()(3)(0)
463 + triangle_t()(1)(10) * in_t()(3)(1)
464 + triangle_t()(1)(11) * in_t()(3)(2)
467 res()(3)(0) = triangle_t()(1)( 2) * in_cc_2_0
468 + triangle_t()(1)( 6) * in_cc_2_1
469 + triangle_t()(1)( 9) * in_cc_2_2;
470 res()(3)(0) = diag_t()(1)( 3) * in_t()(3)(0)
471 + triangle_t()(1)(12) * in_t()(3)(1)
472 + triangle_t()(1)(13) * in_t()(3)(2)
475 res()(3)(1) = triangle_t()(1)( 3) * in_cc_2_0
476 + triangle_t()(1)( 7) * in_cc_2_1
477 + triangle_t()(1)(10) * in_cc_2_2
478 + triangle_t()(1)(12) * in_cc_3_0;
479 res()(3)(1) = diag_t()(1)( 4) * in_t()(3)(1)
480 + triangle_t()(1)(14) * in_t()(3)(2)
483 res()(3)(2) = triangle_t()(1)( 4) * in_cc_2_0
484 + triangle_t()(1)( 8) * in_cc_2_1
485 + triangle_t()(1)(11) * in_cc_2_2
486 + triangle_t()(1)(13) * in_cc_3_0
487 + triangle_t()(1)(14) * in_cc_3_1;
488 res()(3)(2) = diag_t()(1)( 5) * in_t()(3)(2)
491 vstream(out_v[sF]()(2)(0), res()(2)(0));
492 vstream(out_v[sF]()(2)(1), res()(2)(1));
493 vstream(out_v[sF]()(2)(2), res()(2)(2));
494 vstream(out_v[sF]()(3)(0), res()(3)(0));
495 vstream(out_v[sF]()(3)(1), res()(3)(1));
496 vstream(out_v[sF]()(3)(2), res()(3)(2));
513 static void Invert(
const CloverDiagonalField& diagonal,
514 const CloverTriangleField& triangle,
515 CloverDiagonalField& diagonalInv,
516 CloverTriangleField& triangleInv) {
521 diagonalInv.Checkerboard() = diagonal.Checkerboard();
522 triangleInv.Checkerboard() = triangle.Checkerboard();
526 long lsites = grid->
lSites();
528 typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
529 typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
537 Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(
Ns*
Nc,
Ns*
Nc);
538 Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(
Ns*
Nc,
Ns*
Nc);
540 scalar_object_diagonal diagonal_tmp =
Zero();
541 scalar_object_diagonal diagonal_inv_tmp =
Zero();
542 scalar_object_triangle triangle_tmp =
Zero();
543 scalar_object_triangle triangle_inv_tmp =
Zero();
552 for (
long s_row=0;s_row<
Ns;s_row++) {
553 for (
long s_col=0;s_col<
Ns;s_col++) {
554 if(
abs(s_row - s_col) > 1 || s_row + s_col == 3)
continue;
555 int block = s_row /
Nhs;
556 int s_row_block = s_row %
Nhs;
557 int s_col_block = s_col %
Nhs;
558 for (
long c_row=0;c_row<
Nc;c_row++) {
559 for (
long c_col=0;c_col<
Nc;c_col++) {
560 int i = s_row_block *
Nc + c_row;
561 int j = s_col_block *
Nc + c_col;
571 clover_inv_eigen = clover_eigen.inverse();
573 for (
long s_row=0;s_row<
Ns;s_row++) {
574 for (
long s_col=0;s_col<
Ns;s_col++) {
575 if(
abs(s_row - s_col) > 1 || s_row + s_col == 3)
continue;
576 int block = s_row /
Nhs;
577 int s_row_block = s_row %
Nhs;
578 int s_col_block = s_col %
Nhs;
579 for (
long c_row=0;c_row<
Nc;c_row++) {
580 for (
long c_col=0;c_col<
Nc;c_col++) {
581 int i = s_row_block *
Nc + c_row;
582 int j = s_col_block *
Nc + c_col;
584 diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*
Nc+c_row, s_col*
Nc+c_col);
586 triangle_inv_tmp()(block)(
triangle_index(i, j)) = clover_inv_eigen(s_row*
Nc+c_row, s_col*
Nc+c_col);
600 CloverDiagonalField& diagonal,
601 CloverTriangleField& triangle) {
605 diagonal.Checkerboard() = full.Checkerboard();
606 triangle.Checkerboard() = full.Checkerboard();
614 for(int s_row = 0; s_row < Ns; s_row++) {
615 for(int s_col = 0; s_col < Ns; s_col++) {
616 if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
617 int block = s_row / Nhs;
618 int s_row_block = s_row % Nhs;
619 int s_col_block = s_col % Nhs;
620 for(int c_row = 0; c_row < Nc; c_row++) {
621 for(int c_col = 0; c_col < Nc; c_col++) {
622 int i = s_row_block * Nc + c_row;
623 int j = s_col_block * Nc + c_col;
625 diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
627 triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
639 const CloverTriangleField& triangle,
644 full.Checkerboard() = diagonal.Checkerboard();
654 for(int s_row = 0; s_row < Ns; s_row++) {
655 for(int s_col = 0; s_col < Ns; s_col++) {
656 if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
657 int block = s_row / Nhs;
658 int s_row_block = s_row % Nhs;
659 int s_col_block = s_col % Nhs;
660 for(int c_row = 0; c_row < Nc; c_row++) {
661 for(int c_col = 0; c_col < Nc; c_col++) {
662 int i = s_row_block * Nc + c_row;
663 int j = s_col_block * Nc + c_col;
665 full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
667 full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
690 CloverTriangleField zeroTriangle(grid);
691 zeroTriangle.Checkerboard() = triangle.Checkerboard();
692 zeroTriangle =
Zero();
693 triangle = where(t_coor == 0, zeroTriangle, triangle);
694 triangle = where(t_coor == T-1, zeroTriangle, triangle);
698 CloverDiagonalField tmp(grid);
699 tmp.Checkerboard() = diagonal.Checkerboard();
700 tmp = -1.0 * csw_t + diag_mass;
701 diagonal = where(t_coor == 0, tmp, diagonal);
702 diagonal = where(t_coor == T-1, tmp, diagonal);
709 diagonal = where(t_coor == 1, tmp, diagonal);
710 diagonal = where(t_coor == T-2, tmp, diagonal);
716 std::cout <<
GridLogMessage <<
"CompactWilsonCloverHelpers::ModifyBoundaries timings:"
717 <<
" checks = " << (t1 - t0) / 1e6
718 <<
", coordinate = " << (t2 - t1) / 1e6
719 <<
", off-diag zero = " << (t3 - t2) / 1e6
720 <<
", diagonal unity = " << (t4 - t3) / 1e6
721 <<
", near-boundary = " << (t5 - t4) / 1e6
722 <<
", total = " << (t5 - t0) / 1e6