#include <time.h>
#include <cuda_runtime.h>

// Matrix dimensions for the computation:
// an M x K matrix (A) multiplied by a K x N matrix (B) produces an M x N matrix (C).
#define M 256          // Number of rows in matrix A and result matrix C
#define K 512          // Number of columns in A and rows in B (must match for a valid multiplication)
#define N 256          // Number of columns in matrix B and result matrix C
#define BLOCK_SIZE 32  // Size of thread blocks (32x32 threads per block is common for good occupancy)

/* Matrix multiplication example to illustrate the computation:
   a 3x2 matrix A multiplied by a 2x4 matrix B produces a 3x4 matrix C.

   A = [[1, 2],    B = [[ 7,  8,  9, 10],    C = A * B = [[ 29,  32,  35,  38],
        [3, 4],         [11, 12, 13, 14]]                 [ 65,  72,  79,  86],
        [5, 6]]                                           [101, 112, 123, 134]]

   Each element C[i,j] is the dot product of row i from A and column j from B,
   e.g. C[0][0] = 1*7 + 2*11 = 29.
*/

/**
 * CPU implementation of matrix multiplication
 * @param A Input matrix A
 * @param B Input matrix B
 * @param C Output matrix C = A * B
 * @param m Number of rows in A
 * @param k Number of columns in A / rows in B
 * @param n Number of columns in B
 */
void matmul_cpu(float *A, float *B, float *C, int m, int k, int n) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            float sum = 0.0f;
            // Compute dot product of row i from A and column j from B
            for (int l = 0; l < k; l++) {
                sum += A[i * k + l] * B[l * n + j];
            }
            C[i * n + j] = sum;
        }
    }
}

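// For the fixed sizes above, each call to matmul_cpu performs M * N * K =
// 256 * 256 * 512 = 33,554,432 iterations of the inner loop (one multiply and one add
// each), which is the work the benchmark in main() times below.
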
/**
 * CUDA kernel for parallel matrix multiplication.
 * Each thread computes one element of the output matrix C.
 * @param A Input matrix A in device memory
 * @param B Input matrix B in device memory
 * @param C Output matrix C in device memory
 * @param m Number of rows in A
 * @param k Number of columns in A / rows in B
 * @param n Number of columns in B
 */
__global__ void matmul_gpu(float *A, float *B, float *C, int m, int k, int n) {
    // Calculate the global row and column index for this thread
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Only compute if within matrix bounds
    if (row < m && col < n) {
        float sum = 0.0f;
        // Compute dot product of a row from A and a column from B
        for (int l = 0; l < k; l++) {
            sum += A[row * k + l] * B[l * n + col];
        }
        C[row * n + col] = sum;
    }
}

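// Worked example of the index math above (values chosen purely for illustration): with
// BLOCK_SIZE = 32, the thread at threadIdx = (x=3, y=5) in block blockIdx = (x=2, y=1)
// computes row = 1 * 32 + 5 = 37 and col = 2 * 32 + 3 = 67, i.e. the element of C at
// row 37, column 67.
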
/**
 * Initialize a matrix with random float values between 0 and 1
 * @param mat  Pointer to the matrix to initialize
 * @param rows Number of rows
 * @param cols Number of columns
 */
void init_matrix(float *mat, int rows, int cols) {
    for (int i = 0; i < rows * cols; i++) {
        mat[i] = (float)rand() / RAND_MAX;
    }
}

/**
 * Get the current time with nanosecond precision
 * @return Current time in seconds
 */
double get_time() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main() {
    // Host (CPU) and device (GPU) matrix pointers
    float *h_A, *h_B, *h_C_cpu, *h_C_gpu;  // Host matrices
    float *d_A, *d_B, *d_C;                // Device matrices

    // Calculate required memory sizes in bytes
    int size_A = M * K * sizeof(float);
    int size_B = K * N * sizeof(float);
    int size_C = M * N * sizeof(float);

    // Allocate host memory for the matrices
    h_A = (float *)malloc(size_A);
    h_B = (float *)malloc(size_B);
    h_C_cpu = (float *)malloc(size_C);
    h_C_gpu = (float *)malloc(size_C);

    // Initialize input matrices with random values
    srand(time(NULL));
    init_matrix(h_A, M, K);
    init_matrix(h_B, K, N);

    // Allocate device (GPU) memory
    cudaMalloc(&d_A, size_A);
    cudaMalloc(&d_B, size_B);
    cudaMalloc(&d_C, size_C);

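    // Note: the cudaError_t return values of the CUDA API calls in this example are not
    // checked; production code would typically verify each cudaMalloc/cudaMemcpy result.
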
    // Copy input matrices from host to device
    cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice);

    // Configure CUDA kernel execution parameters
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);           // 32x32 threads per block
    dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE,  // Ceiling division so the grid covers the whole matrix
                 (M + BLOCK_SIZE - 1) / BLOCK_SIZE);
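    // With the sizes defined above (M = N = 256, BLOCK_SIZE = 32) this launches an
    // 8 x 8 grid of 64 blocks, each with 32 x 32 = 1024 threads: 65,536 threads in
    // total, exactly one thread per element of the 256 x 256 output matrix C.
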
    // Perform warm-up runs to stabilize GPU clock speeds
    printf("Performing warm-up runs...\n");
    for (int i = 0; i < 3; i++) {
        matmul_cpu(h_A, h_B, h_C_cpu, M, K, N);
        matmul_gpu<<<gridDim, blockDim>>>(d_A, d_B, d_C, M, K, N);
        cudaDeviceSynchronize();
    }

    // Benchmark CPU implementation (average over 20 runs)
    printf("Benchmarking CPU implementation...\n");
    double cpu_total_time = 0.0;
    for (int i = 0; i < 20; i++) {
        // Time one full CPU matrix multiplication and accumulate the elapsed time
        double start_time = get_time();
        matmul_cpu(h_A, h_B, h_C_cpu, M, K, N);
        double end_time = get_time();
        cpu_total_time += end_time - start_time;
    }
    double cpu_avg_time = cpu_total_time / 20.0;

    // Benchmark GPU implementation (average over 20 runs)
    printf("Benchmarking GPU implementation...\n");
    double gpu_total_time = 0.0;
    for (int i = 0; i < 20; i++) {
        // Time one kernel launch; synchronize so the measurement covers the full computation
        double start_time = get_time();
        matmul_gpu<<<gridDim, blockDim>>>(d_A, d_B, d_C, M, K, N);
        cudaDeviceSynchronize();
        double end_time = get_time();
        gpu_total_time += end_time - start_time;
    }
    double gpu_avg_time = gpu_total_time / 20.0;

    // Print benchmark results
    printf("CPU average time: %f microseconds\n", (cpu_avg_time * 1e6f));
    printf("GPU average time: %f microseconds\n", (gpu_avg_time * 1e6f));
    printf("Speedup: %fx\n", cpu_avg_time / gpu_avg_time);

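    // One hypothetical way to turn these timings into throughput (not printed by this
    // program): each multiplication does 2 * M * N * K floating-point operations, so
    // GFLOP/s = (2.0 * M * N * K) / (avg_time_in_seconds * 1e9).
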
    // Free allocated host and device memory
    free(h_A);
    free(h_B);
    free(h_C_cpu);
    free(h_C_gpu);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}