Skip to content

Commit b12c9e2

Browse files
authored
Update 02 matmul.cu
1 parent 5ebe284 commit b12c9e2

File tree

1 file changed

+69
-41
lines changed

1 file changed

+69
-41
lines changed

05 Writing your First Kernels/02 Kernels/02 matmul.cu

Lines changed: 69 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,33 +3,37 @@
33
#include <time.h>
44
#include <cuda_runtime.h>
55

6-
#define M 256 // Number of rows in A and C
7-
#define K 512 // Number of columns in A and rows in B
8-
#define N 256 // Number of columns in B and C
9-
#define BLOCK_SIZE 32
10-
11-
// Example 3x2 @ 2x4 = 3x4 -> (M x K) @ (K x N) = (M x N)
12-
// A = [[1, 2],
13-
// [3, 4],
14-
// [5, 6]]
15-
16-
// B = [[7, 8, 9, 10],
17-
// [11, 12, 13, 14]]
18-
19-
// C = A * B = [[1*7 + 2*11, 1*8 + 2*12, 1*9 + 2*13, 1*10 + 2*14],
20-
// [3*7 + 4*11, 3*8 + 4*12, 3*9 + 4*13, 3*10 + 4*14],
21-
// [5*7 + 6*11, 5*8 + 6*12, 5*9 + 6*13, 5*10 + 6*14]]
22-
23-
// C = [[29, 32, 35, 38],
24-
// [65, 72, 79, 86],
25-
// [101, 112, 123, 134]]
26-
27-
28-
// CPU matrix multiplication
6+
// Matrix dimensions for the computation
7+
// M x K matrix (A) multiplied by K x N matrix (B) produces M x N matrix (C)
8+
#define M 256 // Number of rows in matrix A and result matrix C
9+
#define K 512 // Number of columns in A and rows in B (must match for valid multiplication)
10+
#define N 256 // Number of columns in matrix B and result matrix C
11+
#define BLOCK_SIZE 32 // Size of thread blocks (32x32 threads per block is common for good occupancy)
12+
13+
/* Matrix multiplication example to illustrate the computation:
14+
3x2 matrix A multiplied by 2x4 matrix B produces 3x4 matrix C
15+
16+
A = [[1, 2], B = [[7, 8, 9, 10], C = A * B = [[29, 32, 35, 38],
17+
[3, 4], [11, 12, 13, 14]] [65, 72, 79, 86],
18+
[5, 6]] [101, 112, 123, 134]]
19+
20+
Each element C[i,j] is computed as the dot product of row i from A and column j from B
21+
*/
22+
23+
/**
24+
* CPU implementation of matrix multiplication
25+
* @param A Input matrix A
26+
* @param B Input matrix B
27+
* @param C Output matrix C = A * B
28+
* @param m Number of rows in A
29+
* @param k Number of columns in A / rows in B
30+
* @param n Number of columns in B
31+
*/
2932
void matmul_cpu(float *A, float *B, float *C, int m, int k, int n) {
3033
for (int i = 0; i < m; i++) {
3134
for (int j = 0; j < n; j++) {
3235
float sum = 0.0f;
36+
// Compute dot product of row i from A and column j from B
3337
for (int l = 0; l < k; l++) {
3438
sum += A[i * k + l] * B[l * n + j];
3539
}
@@ -38,74 +42,98 @@ void matmul_cpu(float *A, float *B, float *C, int m, int k, int n) {
3842
}
3943
}
4044

41-
// CUDA kernel for matrix multiplication
45+
/**
46+
* CUDA kernel for parallel matrix multiplication
47+
* Each thread computes one element of the output matrix C
48+
* @param A Input matrix A in device memory
49+
* @param B Input matrix B in device memory
50+
* @param C Output matrix C in device memory
51+
* @param m Number of rows in A
52+
* @param k Number of columns in A / rows in B
53+
* @param n Number of columns in B
54+
*/
4255
__global__ void matmul_gpu(float *A, float *B, float *C, int m, int k, int n) {
56+
// Calculate global row and column index for this thread
4357
int row = blockIdx.y * blockDim.y + threadIdx.y;
4458
int col = blockIdx.x * blockDim.x + threadIdx.x;
4559

60+
// Only compute if within matrix bounds
4661
if (row < m && col < n) {
4762
float sum = 0.0f;
63+
// Compute dot product of row from A and column from B
4864
for (int l = 0; l < k; l++) {
4965
sum += A[row * k + l] * B[l * n + col];
5066
}
5167
C[row * n + col] = sum;
5268
}
5369
}
5470

55-
// Initialize matrix with random values
71+
/**
72+
* Initialize matrix with random float values between 0 and 1
73+
* @param mat Pointer to matrix to initialize
74+
* @param rows Number of rows
75+
* @param cols Number of columns
76+
*/
5677
void init_matrix(float *mat, int rows, int cols) {
5778
for (int i = 0; i < rows * cols; i++) {
5879
mat[i] = (float)rand() / RAND_MAX;
5980
}
6081
}
6182

62-
// Function to measure execution time
83+
/**
84+
* Get current time with nanosecond precision
85+
* @return Current time in seconds
86+
*/
6387
double get_time() {
6488
struct timespec ts;
6589
clock_gettime(CLOCK_MONOTONIC, &ts);
6690
return ts.tv_sec + ts.tv_nsec * 1e-9;
6791
}
6892

6993
int main() {
70-
float *h_A, *h_B, *h_C_cpu, *h_C_gpu;
71-
float *d_A, *d_B, *d_C;
94+
// Host (CPU) and device (GPU) matrix pointers
95+
float *h_A, *h_B, *h_C_cpu, *h_C_gpu; // Host matrices
96+
float *d_A, *d_B, *d_C; // Device matrices
97+
98+
// Calculate required memory sizes
7299
int size_A = M * K * sizeof(float);
73100
int size_B = K * N * sizeof(float);
74101
int size_C = M * N * sizeof(float);
75102

76-
// Allocate host memory
103+
// Allocate host memory for matrices
77104
h_A = (float*)malloc(size_A);
78105
h_B = (float*)malloc(size_B);
79106
h_C_cpu = (float*)malloc(size_C);
80107
h_C_gpu = (float*)malloc(size_C);
81108

82-
// Initialize matrices
109+
// Initialize input matrices with random values
83110
srand(time(NULL));
84111
init_matrix(h_A, M, K);
85112
init_matrix(h_B, K, N);
86113

87-
// Allocate device memory
114+
// Allocate device (GPU) memory
88115
cudaMalloc(&d_A, size_A);
89116
cudaMalloc(&d_B, size_B);
90117
cudaMalloc(&d_C, size_C);
91118

92-
// Copy data to device
119+
// Copy input matrices from host to device
93120
cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice);
94121
cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice);
95122

96-
// Define grid and block dimensions
97-
dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
98-
dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE);
123+
// Configure CUDA kernel execution parameters
124+
dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); // 32x32 threads per block
125+
dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, // Ceiling division to cover matrix
126+
(M + BLOCK_SIZE - 1) / BLOCK_SIZE);
99127

100-
// Warm-up runs
128+
// Perform warm-up runs to stabilize GPU clock speeds
101129
printf("Performing warm-up runs...\n");
102130
for (int i = 0; i < 3; i++) {
103131
matmul_cpu(h_A, h_B, h_C_cpu, M, K, N);
104132
matmul_gpu<<<gridDim, blockDim>>>(d_A, d_B, d_C, M, K, N);
105133
cudaDeviceSynchronize();
106134
}
107135

108-
// Benchmark CPU implementation
136+
// Benchmark CPU implementation (average over 20 runs)
109137
printf("Benchmarking CPU implementation...\n");
110138
double cpu_total_time = 0.0;
111139
for (int i = 0; i < 20; i++) {
@@ -116,7 +144,7 @@ int main() {
116144
}
117145
double cpu_avg_time = cpu_total_time / 20.0;
118146

119-
// Benchmark GPU implementation
147+
// Benchmark GPU implementation (average over 20 runs)
120148
printf("Benchmarking GPU implementation...\n");
121149
double gpu_total_time = 0.0;
122150
for (int i = 0; i < 20; i++) {
@@ -128,12 +156,12 @@ int main() {
128156
}
129157
double gpu_avg_time = gpu_total_time / 20.0;
130158

131-
// Print results
159+
// Print benchmark results
132160
printf("CPU average time: %f microseconds\n", (cpu_avg_time * 1e6f));
133161
printf("GPU average time: %f microseconds\n", (gpu_avg_time * 1e6f));
134162
printf("Speedup: %fx\n", cpu_avg_time / gpu_avg_time);
135163

136-
// Free memory
164+
// Free allocated memory
137165
free(h_A);
138166
free(h_B);
139167
free(h_C_cpu);
@@ -143,4 +171,4 @@ int main() {
143171
cudaFree(d_C);
144172

145173
return 0;
146-
}
174+
}

0 commit comments

Comments
 (0)