-
Notifications
You must be signed in to change notification settings - Fork 0
/
Matrix Multiplication 4B.txt
64 lines (52 loc) · 1.68 KB
/
Matrix Multiplication 4B.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#include <stdio.h>
#include <stdlib.h>
#define N 1024
#define BLOCK_SIZE 16
// Computes C = A * B for square width x width matrices of ints (row-major).
//
// Expected launch: 2D grid of 2D blocks; each thread produces one output
// element c[row][col]. The host sizes the grid with ceil-div, so the grid
// may overhang the matrix — out-of-range threads must exit without touching
// memory (the original kernel lacked this guard and wrote out of bounds for
// any width not divisible by the block dimensions).
__global__ void matrixMul(int *a, int *b, int *c, int width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard: required because gridDim * blockDim >= width, not ==.
    if (row >= width || col >= width) {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    int sum = 0;
    for (int i = 0; i < width; i++) {
        sum += a[row * width + i] * b[i * width + col];
    }
    c[row * width + col] = sum;
}
// Every CUDA runtime call returns cudaError_t, and kernel launches fail
// asynchronously — check everything so failures are reported at the call
// site instead of surfacing as garbage output later.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                            \
        }                                                                   \
    } while (0)

// Multiplies two N x N int matrices on the GPU and prints the first and
// last elements of the result. Returns 0 on success; aborts with a
// diagnostic on any allocation or CUDA failure.
int main() {
    int *a, *b, *c;          // host matrices (row-major, N x N)
    int *d_a, *d_b, *d_c;    // device copies
    int size = N * N * sizeof(int);

    // Allocate memory on host; malloc can fail for large N, so check it.
    a = (int*)malloc(size);
    b = (int*)malloc(size);
    c = (int*)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation of %d bytes failed\n", size);
        exit(EXIT_FAILURE);
    }

    // Initialize matrices with a deterministic pattern.
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i * N + j] = i + j;
            b[i * N + j] = i - j;
        }
    }

    // Allocate memory on device.
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    // Copy input data from host to device.
    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // Launch kernel with a 2D grid of 2D blocks; ceil-div rounds the grid
    // up so every element is covered even when N % BLOCK_SIZE != 0.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);
    matrixMul<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    // Launch-configuration errors only show up via cudaGetLastError();
    // in-kernel faults surface at the next synchronizing call.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy result from device to host (cudaMemcpy is blocking, so the
    // result is complete once it returns).
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

    // Print first and last elements of result.
    printf("c[0][0] = %d, c[%d][%d] = %d\n", c[0], N-1, N-1, c[(N-1) * N + (N-1)]);

    // Free memory.
    free(a);
    free(b);
    free(c);
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    return 0;
}