Grid 0.7.0
FlightRecorder.cc
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: FlightRecorder.cc
6
7 Copyright (C) 2015
8
9Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
10Author: Peter Boyle <paboyle@ph.ed.ac.uk>
11Author: Peter Boyle <peterboyle@MacBook-Pro.local>
12Author: paboyle <paboyle@ph.ed.ac.uk>
13
14 This program is free software; you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation; either version 2 of the License, or
17 (at your option) any later version.
18
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License along
25 with this program; if not, write to the Free Software Foundation, Inc.,
26 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27
28 See the full license in the file "LICENSE" in the top level distribution directory
29*************************************************************************************/
30/* END LEGAL */
31#include <Grid/Grid.h>
32
35// Grid Norm logging for repro testing
// Out-of-class definitions of FlightRecorder static data members.
// NOTE: original lines 43-49 are not present in this listing, so further
// static definitions may exist between StepName and the vectors below.
// Pointer to the name most recently passed to StepLog().
42const char * FlightRecorder::StepName;
// Recorded sequences, one vector per kind of logged quantity.
50std::vector<double> FlightRecorder::NormLogVector;
51std::vector<double> FlightRecorder::ReductionLogVector;
52std::vector<uint64_t> FlightRecorder::CsumLogVector;
53std::vector<uint64_t> FlightRecorder::XmitLogVector;
54std::vector<uint64_t> FlightRecorder::RecvLogVector;
55
67{
// Empties all five log vectors (xmit, recv, norm, csum, reduction).
// NOTE(review): the signature line and original line 68 are missing from
// this listing; presumably this is FlightRecorder::Truncate -- confirm.
69 XmitLogVector.resize(0);
70 RecvLogVector.resize(0);
71 NormLogVector.resize(0);
72 CsumLogVector.resize(0);
73 ReductionLogVector.resize(0);
74}
76{
// Dispatches on `mode`. NOTE(review): the signature line and the labels and
// bodies of the first three cases (original lines 78-79, 81-82, 84-85) are
// missing from this listing; presumably FlightRecorder::SetLoggingMode --
// confirm against the real source.
77 switch ( mode ) {
80 break;
83 break;
86 break;
// Logging off: store the mode and discard all recorded vectors.
87 case LoggingModeNone:
88 LoggingMode = mode;
89 Truncate();
90 break;
// Any unrecognised mode is treated as a programming error.
91 default:
92 assert(0);
93 }
94}
// Records the name of the current step.
// Stores the pointer `name` itself in StepName (the string is not copied)
// and always returns true.
// NOTE: original line 98 is missing from this listing.
95bool FlightRecorder::StepLog(const char *name)
96{
97 StepName = name;
99 return true;
100}
101
103{
// Announces print mode on std::cout, then clears all log vectors via Truncate().
// NOTE(review): the signature line and original line 106 are missing from
// this listing; presumably FlightRecorder::SetLoggingModePrint -- confirm.
104 std::cout << " FlightRecorder: set to print output " <<std::endl;
105 Truncate();
107}
109{
// Announces RECORD mode on std::cout, then clears all log vectors via Truncate().
// NOTE(review): the signature line and original line 112 are missing from
// this listing; presumably FlightRecorder::SetLoggingModeRecord -- confirm.
110 std::cout << " FlightRecorder: set to RECORD " <<std::endl;
111 Truncate();
113}
115{
// Announces VERIFY mode on std::cout together with the number of norm
// entries currently held in NormLogVector. No Truncate() call is visible here.
// NOTE(review): the signature line and original lines 117-118 are missing
// from this listing; presumably FlightRecorder::SetLoggingModeVerify -- confirm.
116 std::cout << " FlightRecorder: set to VERIFY " << NormLogVector.size()<< " log entries "<<std::endl;
119}
121{
// Returns the current value of ErrorCounter.
// NOTE(review): the signature line is missing from this listing;
// presumably FlightRecorder::ErrorCount -- confirm.
122 return ErrorCounter;
123}
// Logs one norm value: prints it, records it, or compares it with a
// previously recorded value.
// Returns true on every visible path except a mismatch with ContinueOnFail
// unset, which returns false.
// NOTE: the lines that guard each branch (original lines 127, 129, 132, 135,
// 138, 140, 146-147, 156-157, 177) are missing from this listing, so the exact
// conditions and counter updates cannot be read from here.
124bool FlightRecorder::NormLog(double value)
125{
// Bit pattern of `value`, used for hex printing.
// NOTE(review): pointer-cast type punning; std::memcpy into a uint64_t would
// avoid strict-aliasing concerns.
126 uint64_t hex = * ( (uint64_t *)&value );
// First branch (guard not visible): print counter and bit pattern, return true.
128 std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
130 return true;
131 }
// Recording branch (guard not visible): print, then append value to NormLogVector.
133 std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
134 NormLogVector.push_back(value);
136 return true;
137 }
139
// Comparison branch: bit pattern of the stored reference entry.
// NOTE(review): any bounds check on NormLoggingCounter would be on missing
// line 140 -- confirm it exists before this index.
141 uint64_t hexref = * ( (uint64_t *)&NormLogVector[NormLoggingCounter] );
142
// Mismatch when the value differs from the stored one or is NaN.
143 if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
144
145 fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
// NOTE(review): std::hexfloat is set on std::cerr here and is not reset by any
// visible line (std::dec does not affect it), so later floating-point output on
// std::cerr stays in hexfloat form.
148 std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
149 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
150 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
151
152 std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<NormLogVector[NormLoggingCounter] <<std::endl;
153
154 fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n",
155 GridHostname(),
158 value, NormLogVector[NormLoggingCounter]); fflush(stderr);
159
160 BACKTRACEFP(stderr);
161
// Fail immediately unless asked to continue; otherwise count the error.
162 if(!ContinueOnFail) return false;
163
164 ErrorCounter++;
165 } else {
// Match: only reported when PrintEntireLog is set.
166 if ( PrintEntireLog ) {
167 std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex
168 <<" "<<hex<<" "<<hexref
169 <<" "<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::dec<<std::endl;
170 }
171 }
172
173 }
// Counter has reached the end of the recorded sequence.
174 if ( NormLogVector.size()==NormLoggingCounter ) {
175 std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl;
176 }
178 }
179 return true;
180}
// Logs one 64-bit checksum: prints it, records it, or compares it with a
// previously recorded checksum.
// Returns true on every visible path except a mismatch with ContinueOnFail
// unset, which returns false.
// NOTE: the lines that guard each branch (original lines 183, 185, 189, 192,
// 196, 205-206, 212, 232) are missing from this listing.
181bool FlightRecorder::CsumLog(uint64_t hex)
182{
// First branch (guard not visible): print counter and checksum, return true.
184 std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
186 return true;
187 }
188
// Recording branch (guard not visible): print, then append to CsumLogVector.
// The message reports the checksum counter; it previously printed
// NormLoggingCounter, which belongs to NormLog.
190 std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
191 CsumLogVector.push_back(hex);
193 return true;
194 }
195
197
// Comparison branch: only compare while a recorded entry exists.
198 if(CsumLoggingCounter < CsumLogVector.size()) {
199
200 uint64_t hexref = CsumLogVector[CsumLoggingCounter] ;
201
202 if ( hex != hexref ) {
203
204 fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
207 std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
208 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
209
210 fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %lx expect %lx\n",
211 GridHostname(),
213 CsumLoggingCounter,hex, hexref);
214 BACKTRACEFP(stderr);
215 fflush(stderr);
216
// Fail immediately unless asked to continue; otherwise count the error.
217 if(!ContinueOnFail) return false;
218
219 ErrorCounter++;
220
221 } else {
222
// Match: only reported when PrintEntireLog is set.
223 if ( PrintEntireLog ) {
224 std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex
225 <<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
226 }
227 }
228 }
// Counter has reached the end of the recorded sequence.
229 if ( CsumLogVector.size()==CsumLoggingCounter ) {
230 std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl;
231 }
233 }
234 return true;
235}
236
// Logs one reduction result: prints local and global values, records the
// global value, or compares the global value with a recorded one.
// NOTE: the lines that guard each branch and the comparison itself (original
// lines 241, 243, 245, 248, 250-252, 254-255, 258-259, 272, 275) are missing
// from this listing.
237void FlightRecorder::ReductionLog(double local,double global)
238{
// Bit patterns of both inputs, used for hex printing.
// NOTE(review): pointer-cast type punning; std::memcpy would avoid
// strict-aliasing concerns.
239 uint64_t hex_l = * ( (uint64_t *)&local );
240 uint64_t hex_g = * ( (uint64_t *)&global );
// First branch (guard not visible): print counter and both bit patterns.
242 std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
244 }
// Recording branch (guard not visible): only the global value is stored.
246 std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
247 ReductionLogVector.push_back(global);
249 }
// Mismatch reporting (the comparison that leads here is not visible).
253 fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
256 fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
257 GridHostname(),
260 global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
261 BACKTRACEFP(stderr);
262
// Unlike NormLog/CsumLog this function returns void, so a failure without
// ContinueOnFail asserts instead of returning false.
263 if ( !ContinueOnFail ) assert(0);
264
265 ErrorCounter++;
266 } else {
// Match: only reported when PrintEntireLog is set.
// NOTE(review): std::hexfloat is set on std::cerr and not reset by any visible line.
267 if ( PrintEntireLog ) {
268 std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl;
269 }
270 }
271 }
273 std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl;
274 }
276 }
277}
// Logs an XOR checksum of an outgoing communication buffer: prints it,
// records it, or compares it with a previously recorded value.
// Does nothing when logging is off or ChecksumCommsSend is unset; the
// checksum code is compiled only when GRID_SYCL is defined.
// NOTE: the lines that guard each branch (original lines 289, 291, 293, 296,
// 298-299, 302-303, 306-307, 323) are missing from this listing.
278void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
279{
// Single early exit when logging is disabled. A second identical check that
// used to follow inside the ChecksumCommsSend block was unreachable and has
// been removed.
280 if(LoggingMode == LoggingModeNone) return;
281
282 if ( ChecksumCommsSend ){
283
286#ifdef GRID_SYCL
// XOR over whole 64-bit words; the integer division drops any trailing
// bytes beyond a multiple of sizeof(uint64_t).
287 uint64_t *ubuf = (uint64_t *)buf;
288 uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
// First branch (guard not visible): print counter and checksum.
290 std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
292 }
// Recording branch (guard not visible): append checksum to XmitLogVector.
294 std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
295 XmitLogVector.push_back(_xor);
297 }
// Comparison against the recorded checksum.
300 if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
301 fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
304 fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
305 GridHostname(),
308 _xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
309 BACKTRACEFP(stderr);
310
311 if ( !ContinueOnFail ) assert(0);
312
313 ErrorCounter++;
314 } else {
// Match: only reported when PrintEntireLog is set. Checksums are integers,
// so std::hex is used (std::hexfloat has no effect on integer output), and
// the base is restored with std::dec as in the other messages here.
315 if ( PrintEntireLog ) {
316 std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hex << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::dec <<std::endl;
317 }
318 }
319 }
// Counter has reached the end of the recorded sequence; the message names
// this function rather than ReductionLog.
320 if ( XmitLogVector.size()==XmitLoggingCounter ) {
321 std::cout << "FlightRecorder::xmitLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl;
322 }
324 }
325#endif
326 }
327}
// Logs an XOR checksum of a received communication buffer: prints it,
// records it, or compares it with a previously recorded value. `rank` is
// only used in the mismatch message.
// Does nothing when ChecksumComms is unset or logging is off; the checksum
// code is compiled only when GRID_SYCL is defined.
// NOTE: the lines that guard each branch (original lines 335, 337, 339, 342,
// 344-345, 348-349, 352-353, 369) are missing from this listing.
328void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
329{
330 if ( ChecksumComms ){
331 if(LoggingMode == LoggingModeNone) return;
332#ifdef GRID_SYCL
// XOR over whole 64-bit words; the integer division drops any trailing
// bytes beyond a multiple of sizeof(uint64_t).
333 uint64_t *ubuf = (uint64_t *)buf;
334 uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
// First branch (guard not visible): print counter and checksum.
336 std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
338 }
// Recording branch (guard not visible): append checksum to RecvLogVector.
340 std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
341 RecvLogVector.push_back(_xor);
343 }
// Comparison against the recorded checksum.
346 if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
347 fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
350 fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
351 GridHostname(),
354 _xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
355 BACKTRACEFP(stderr);
356
357 if ( !ContinueOnFail ) assert(0);
358
359 ErrorCounter++;
360 } else {
// Match: only reported when PrintEntireLog is set. Checksums are integers,
// so std::hex is used (std::hexfloat has no effect on integer output), and
// the base is restored with std::dec as in the other messages here.
361 if ( PrintEntireLog ) {
362 std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hex << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::dec <<std::endl;
363 }
364 }
365 }
// Counter has reached the end of the recorded sequence; the message names
// this function and counts receives (it previously said ReductionLog / sends).
366 if ( RecvLogVector.size()==RecvLoggingCounter ) {
367 std::cout << "FlightRecorder::recvLog : Verified entire sequence of "<<RecvLoggingCounter<<" receives "<<std::endl;
368 }
370 }
371#endif
372 }
373}
374
char * GridHostname(void)
Definition Init.cc:103
Word svm_xor(Word *vec, uint64_t L)
#define BACKTRACEFP(fp)
Definition Log.h:250
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
static const char * StepName
static void SetLoggingModeRecord(void)
static bool StepLog(const char *name)
static void SetLoggingModePrint(void)
static void ReductionLog(double lcl, double glbl)
static void Truncate(void)
static int32_t StepLoggingCounter
static int PrintEntireLog
static bool CsumLog(uint64_t csum)
static std::vector< double > NormLogVector
static uint64_t ErrorCount(void)
static std::vector< double > ReductionLogVector
static int32_t ReductionLoggingCounter
static int ContinueOnFail
static void SetLoggingMode(LoggingMode_t mode)
static int LoggingMode
static int ChecksumCommsSend
static void recvLog(void *, uint64_t bytes, int rank)
static void ResetCounters(void)
static int32_t CsumLoggingCounter
static bool NormLog(double value)
static std::vector< uint64_t > XmitLogVector
static std::vector< uint64_t > CsumLogVector
static std::vector< uint64_t > RecvLogVector
static int32_t XmitLoggingCounter
static void xmitLog(void *, uint64_t bytes)
static uint64_t ErrorCounter
static int32_t RecvLoggingCounter
static int32_t NormLoggingCounter
static int ChecksumComms
static void SetLoggingModeVerify(void)
static int WorldShmRank