Matrix Addition
add two matrices
November 12,2025
#include <cuda_runtime.h>
__global__ void matrix_add(const float* A, const float* B, float* C, int N) {
int id = blockDim.x * blockIdx.x + threadIdx.x;
if (id<N) C[id] = A[id]+B[id];
}
// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int N) {
int threadsPerBlock = 256;
int blocksPerGrid = (N * N + threadsPerBlock - 1) / threadsPerBlock;
matrix_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
cudaDeviceSynchronize();
}