3838#include < cublas_v2.h>
3939#include < chrono>
4040
41+ // DEBUG ONLY (development build): timing measurements are appended to debug_timings.txt
42+ #include < fstream>
43+
44+ std::ofstream debugLog (" debug_timings.txt" , std::ios::app);
45+
4146#define BLOCK_SIZE 32
4247
4348// Existing SAFE_CALL (for cudaError_t)
@@ -144,51 +149,6 @@ __device__ __host__ float unnormalized_sinc(float t)
144149 }
145150}
146151
/**
 * Naive dense matrix product C = A * B on half-precision data.
 *
 * Each thread computes one element of C: the thread's global y index selects
 * the row and its global x index selects the column, so the launch must use a
 * 2D grid covering at least k threads in x and m threads in y. Threads that
 * fall outside the m x k output do nothing.
 *
 * @param a  input matrix, read as a[row * n + i]   (m rows, n columns)
 * @param b  input matrix, read as b[i * k + col]   (n rows, k columns)
 * @param c  output matrix, written as c[row * k + col] (m rows, k columns)
 * @param m  number of rows of a and c
 * @param n  inner (shared) dimension
 * @param k  number of columns of b and c
 */
__global__ void gpu_matrix_mult(const __half * a, const __half * b, __half * c, int m, int n, int k)
{
  const int row = blockIdx.y * blockDim.y + threadIdx.y;
  const int col = blockIdx.x * blockDim.x + threadIdx.x;

  if (col < k && row < m)
  {
    // Accumulate in float: a running __half sum rounds after every add and
    // saturates at 65504, which corrupts long dot products. The product of
    // two halves is exactly representable in float, so only the final
    // conversion back to __half rounds.
    float sum = 0.0f;

    // Flat offsets are computed in size_t so row * n and i * k cannot
    // overflow a 32-bit int for large matrices.
    const size_t a_row_offset = static_cast<size_t>(row) * n;
    for (int i = 0; i < n; ++i)
    {
      const float valA = __half2float(a[a_row_offset + i]);
      const float valB = __half2float(b[static_cast<size_t>(i) * k + col]);
      sum += valA * valB;
    }
    c[static_cast<size_t>(row) * k + col] = __float2half(sum);
  }
}
164-
/**
 * Naive dense matrix product C = A * B on single-precision data.
 *
 * Each thread computes one element of C: the thread's global y index selects
 * the row and its global x index selects the column, so the launch must use a
 * 2D grid covering at least k threads in x and m threads in y. Threads that
 * fall outside the m x k output do nothing.
 *
 * @param a  input matrix, read as a[row * n + i]   (m rows, n columns)
 * @param b  input matrix, read as b[i * k + col]   (n rows, k columns)
 * @param c  output matrix, written as c[row * k + col] (m rows, k columns)
 * @param m  number of rows of a and c
 * @param n  inner (shared) dimension
 * @param k  number of columns of b and c
 */
__global__ void gpu_matrix_mult_float(float * a, float * b, float * c, int m, int n, int k)
{
  const int row = blockIdx.y * blockDim.y + threadIdx.y;
  const int col = blockIdx.x * blockDim.x + threadIdx.x;

  if (col < k && row < m)
  {
    // Flat offsets are computed in size_t so row * n, i * k and row * k
    // cannot overflow a 32-bit int when the matrices are large.
    const size_t a_row_offset = static_cast<size_t>(row) * n;

    float sum = 0.0f;
    for (int i = 0; i < n; i++)
    {
      sum += a[a_row_offset + i] * b[static_cast<size_t>(i) * k + col];
    }
    c[static_cast<size_t>(row) * k + col] = sum;
  }
}
179-
/**
 * Scales, in place, every stored value of a row by that row's diagonal factor.
 *
 * One thread handles one row (1D launch; only the x indices are used). For
 * row r the entries Val[RowPtr[r]] .. Val[RowPtr[r + 1] - 1] are each
 * multiplied by diagVals[r]. Threads with r >= total_rows do nothing.
 * NOTE(review): the RowPtr/Val layout looks like CSR storage -- confirm
 * against the caller.
 *
 * @param Val         values array, modified in place
 * @param RowPtr      per-row start offsets into Val; RowPtr[r + 1] is read as
 *                    the end of row r, so it needs total_rows + 1 entries
 * @param diagVals    one scale factor per row
 * @param total_rows  number of rows to process
 */
__global__ void gpu_diag_matrix_mult(float * Val, int * RowPtr, float * diagVals, int total_rows)
{
  const int r = threadIdx.x + blockIdx.x * blockDim.x;

  // Guard the grid tail: surplus threads have no row to scale.
  if (r >= total_rows)
  {
    return;
  }

  int idx = RowPtr[r];
  while (idx < RowPtr[r + 1])
  {
    Val[idx] = diagVals[r] * Val[idx];
    ++idx;
  }
}
191-
192152__global__ void reduce_beams_kernel (
193153 const thrust::complex <float > * __restrict__ d_P_Beams, float * d_P_Beams_F_real,
194154 float * d_P_Beams_F_imag, int nBeams, int nFreq, int nRaysSkipped)
@@ -460,7 +420,6 @@ CArray2D sonar_calculation_wrapper(
460420
461421 if (!memory_initialized)
462422 {
463- std::cout << " Initializing..." << std::endl;
464423 SAFE_CALL (
465424 cudaMalloc ((void **)&d_depth_image, depth_image.step * depth_image.rows ), " depth malloc" );
466425 SAFE_CALL (
@@ -482,8 +441,6 @@ CArray2D sonar_calculation_wrapper(
482441 " P_Beams malloc device" );
483442 SAFE_CALL (cudaMalloc (&d_P_Beams_F_real, sizeof (float ) * nBeams * nFreq), " beam real malloc" );
484443 SAFE_CALL (cudaMalloc (&d_P_Beams_F_imag, sizeof (float ) * nBeams * nFreq), " beam imag malloc" );
485- std::cout << " Middle..." << std::endl;
486-
487444 SAFE_CALL (
488445 cudaMallocHost ((void **)&P_Ray_real, P_Ray_Bytes), " CUDA MallocHost Failed for P_Ray_real" );
489446 SAFE_CALL (
@@ -507,8 +464,6 @@ CArray2D sonar_calculation_wrapper(
507464 SAFE_CALL (
508465 cudaMallocHost ((void **)&beamCorrector_lin_h, beamCorrector_lin_Bytes),
509466 " CUDA MallocHost Failed for beamCorrector_lin_h" );
510- std::cout << " DONE..." << std::endl;
511-
512467 SAFE_CALL (
513468 cudaMalloc ((void **)&d_P_Beams_Cor_real, P_Beams_Cor_Bytes),
514469 " CUDA Malloc Failed for d_P_Beams_Cor_real" );
@@ -524,8 +479,6 @@ CArray2D sonar_calculation_wrapper(
524479 SAFE_CALL (
525480 cudaMalloc ((void **)&d_beamCorrector_lin, beamCorrector_lin_Bytes),
526481 " CUDA Malloc Failed for d_beamCorrector_lin" );
527- std::cout << " realdone..." << std::endl;
528-
529482 memory_initialized = true ;
530483 }
531484
@@ -594,14 +547,20 @@ CArray2D sonar_calculation_wrapper(
594547 SAFE_CALL (
595548 cudaMemcpy (P_Beams, d_P_Beams, P_Beams_Bytes, cudaMemcpyDeviceToHost), " CUDA Memcpy Failed" );
596549
597- // For calc time measure
598550 if (debugFlag)
599551 {
600552 stop = std::chrono::high_resolution_clock::now ();
601553 duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
602- printf (
603- " GPU Sonar Computation Time %lld/100 [s]\n " ,
604- static_cast <long long int >(duration.count () / 10000 ));
554+
555+ long long dcount = duration.count ();
556+ float ms = static_cast <float >(dcount) / 1000 .0f ;
557+
558+ printf (" GPU Sonar Computation Time %lld/100 [s]\n " , dcount / 10000 );
559+ printf (" GPU Sonar Summation Time: %.3f ms\n " , ms);
560+
561+ debugLog << " GPU Sonar Computation Time " << dcount / 10000 << " /100 [s]\n " ;
562+ debugLog << " GPU Sonar Summation Time: " << ms << " ms\n " ;
563+
605564 start = std::chrono::high_resolution_clock::now ();
606565 }
607566
@@ -641,9 +600,15 @@ CArray2D sonar_calculation_wrapper(
641600 {
642601 stop = std::chrono::high_resolution_clock::now ();
643602 duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
644- printf (
645- " Sonar Ray Summation %lld/100 [s]\n " , static_cast <long long int >(duration.count () / 10000 ));
646- printf (" Sonar Ray Summation Time: %.3f ms\n " , static_cast <float >(duration.count ()) / 1000 .0f );
603+
604+ long long dcount = duration.count ();
605+
606+ printf (" Sonar Ray Summation %lld/100 [s]\n " , dcount / 10000 );
607+ printf (" Sonar Ray Summation Time: %.3f ms\n " , static_cast <float >(dcount) / 1000 .0f );
608+
609+ debugLog << " Sonar Ray Summation " << dcount / 10000 << " /100 [s]\n " ;
610+ debugLog << " Sonar Ray Summation Time: " << static_cast <float >(dcount) / 1000 .0f << " ms\n " ;
611+
647612 start = std::chrono::high_resolution_clock::now ();
648613 }
649614
@@ -747,15 +712,21 @@ CArray2D sonar_calculation_wrapper(
747712 P_Beams_Cor_imag_h[f * nBeams + beam] / beamCorrectorSum);
748713 }
749714 }
750- // For calc time measure
715+
751716 if (debugFlag)
752717 {
753718 stop = std::chrono::high_resolution_clock::now ();
754719 duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
755- printf (
756- " GPU Window & Correction %lld/100 [s]\n " ,
757- static_cast <long long int >(duration.count () / 10000 ));
758- printf (" GPU Window & Correction: %.3f ms\n " , static_cast <float >(duration.count ()) / 1000 .0f );
720+
721+ long long dcount = duration.count ();
722+
723+ printf (" GPU Window & Correction %lld/100 [s]\n " , dcount / 10000 );
724+ printf (" GPU Window & Correction: %.3f ms\n " , static_cast <float >(dcount) / 1000 .0f );
725+
726+ // Write to file
727+ debugLog << " GPU Window & Correction " << dcount / 10000 << " /100 [s]\n " ;
728+ debugLog << " GPU Window & Correction: " << static_cast <float >(dcount) / 1000 .0f << " ms\n " ;
729+
759730 start = std::chrono::high_resolution_clock::now ();
760731 }
761732
@@ -797,14 +768,6 @@ CArray2D sonar_calculation_wrapper(
797768 }
798769 }
799770
800- if (debugFlag)
801- {
802- stop = std::chrono::high_resolution_clock::now ();
803- std::chrono::duration<double > duration = stop - start;
804- printf (" GPU LOOP Computation Time: %.6f seconds\n " , duration.count ());
805- start = std::chrono::high_resolution_clock::now ();
806- }
807-
808771 // --- Device side input data allocation and initialization
809772 cufftComplex * deviceInputData;
810773 SAFE_CALL (
@@ -862,14 +825,22 @@ CArray2D sonar_calculation_wrapper(
862825 }
863826 }
864827
865- // For calc time measure
866828 if (debugFlag)
867829 {
868830 stop = std::chrono::high_resolution_clock::now ();
869831 duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
870- printf (" GPU FFT Calc Time Time: %.3f ms\n " , static_cast <float >(duration.count ()) / 1000 .0f );
871- printf (
872- " GPU FFT Calc Time %lld/100 [s]\n " , static_cast <long long int >(duration.count () / 10000 ));
832+
833+ long long dcount = duration.count ();
834+ float ms = static_cast <float >(dcount) / 1000 .0f ;
835+
836+ printf (" GPU FFT Calc Time: %.3f ms\n " , ms);
837+ printf (" GPU FFT Calc Time %lld/100 [s]\n " , dcount / 10000 );
838+
839+ // Write to file
840+ debugLog << " GPU FFT Calc Time: " << ms << " ms\n " ;
841+ debugLog << " GPU FFT Calc Time " << dcount / 10000 << " /100 [s]\n " ;
842+
843+ start = std::chrono::high_resolution_clock::now ();
873844 }
874845
875846 auto total_stop_time = std::chrono::high_resolution_clock::now ();
@@ -878,9 +849,11 @@ CArray2D sonar_calculation_wrapper(
878849
879850 if (debugFlag)
880851 {
881- printf (
882- " Total Sonar Calculation Wrapper Time: %.3f ms\n " ,
883- static_cast <float >(total_duration.count ()) / 1000 .0f );
852+ float ms = static_cast <float >(total_duration.count ()) / 1000 .0f ;
853+
854+ printf (" Total Sonar Calculation Wrapper Time: %.3f ms\n " , ms);
855+
856+ debugLog << " Total Sonar Calculation Wrapper Time: " << ms << " ms\n " ;
884857 }
885858
886859 return P_Beams_F;
0 commit comments