#include <cstdio>

// Element-wise integer addition: out[i] = a[i] + b[i] for every i in [0, N).
//
// Launch layout: any 1-D grid of 1-D blocks whose total thread count is at
// least N. Each thread handles the single element at its global index;
// threads whose index falls at or beyond N do nothing, so the grid does not
// have to divide N evenly. No shared memory is used.
//
// Parameters:
//   N   - number of elements to process.
//   a   - first input array (read only), at least N ints.
//   b   - second input array (read only), at least N ints.
//   out - output array, at least N ints.
__global__ void add(int N, const int* a, const int* b, int* out) {
    // Global index; reduces to blockIdx.x when launched with 1 thread per block.
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < N) {
        out[id] = a[id] + b[id];
    }
}

// Returns true when `status` is cudaSuccess; otherwise reports the failed
// operation named by `what` on stderr and returns false.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Fills a host array with 0..N-1, copies it into two device arrays, adds them
// on the device into a third array, copies the sums back and prints them on
// one line. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    // All locals are declared before the first `goto cleanup` so that no jump
    // bypasses an initialization.
    constexpr int N = 100;
    constexpr std::size_t bytes = N * sizeof(int);
    bool success = false;
    int host_array[N] = {0};
    int* dev_arrays[3] = {nullptr};

    // Allocate device arrays.
    for (int i = 0; i < 3; ++i) {
        if (!cudaOk(cudaMalloc(&dev_arrays[i], bytes), "cudaMalloc")) {
            goto cleanup;
        }
    }

    // Fill the host array with values 0..N-1.
    for (int i = 0; i < N; ++i) {
        host_array[i] = i;
    }

    // Copy the host array to each of the first two device arrays.
    for (int i = 0; i < 2; ++i) {
        if (!cudaOk(cudaMemcpy(dev_arrays[i], host_array, bytes,
                               cudaMemcpyHostToDevice),
                    "cudaMemcpy (host to device)")) {
            goto cleanup;
        }
    }

    // Add the first two arrays.
    // N blocks, 1 thread per block.
    add<<<N, 1>>>(N, dev_arrays[0], dev_arrays[1], dev_arrays[2]);

    // A kernel launch returns nothing; configuration errors are only visible
    // through cudaGetLastError().
    if (!cudaOk(cudaGetLastError(), "add kernel launch")) {
        goto cleanup;
    }

    // Copy the result from the third array to the host. This blocking copy
    // also waits for the kernel to finish, so an execution fault inside the
    // kernel is reported here.
    if (!cudaOk(cudaMemcpy(host_array, dev_arrays[2], bytes,
                           cudaMemcpyDeviceToHost),
                "cudaMemcpy (device to host)")) {
        goto cleanup;
    }

    // Print the result.
    for (int i = 0; i < N; ++i) {
        std::printf("%d ", host_array[i]);
    }
    std::printf("\n");

    success = true;

cleanup:
    // Release whatever was allocated; entries still null were never allocated.
    // Free errors are deliberately ignored: the process is about to exit.
    for (int i = 0; i < 3; ++i) {
        if (dev_arrays[i] != nullptr) {
            cudaFree(dev_arrays[i]);
        }
    }

    return success ? 0 : 1;
}