This program adds together all elements of an array using the function parallelReduce. It is tested on an array initialized to all 1's, and the speed-up over a sequential sum is reported. Only the kernel itself is optimized — not the data transfer (which could be avoided, for example, by initializing the data on the GPU).

```
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
#include <assert.h>
// To make error checking easier
// Wraps a CUDA runtime call: on failure, prints a readable message to
// stderr and aborts via assert; on success, passes the status through
// so call sites can remain expressions.
inline cudaError_t checkCuda(cudaError_t result)
{
    if (result != cudaSuccess) {
        // Fixed: "%sn" was missing the backslash, printing a literal 'n'
        // instead of a newline.
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
    return result;
}
/*
Parallel reduce helper kernel. When launched with any grid/block
configuration (grid-stride loop), transforms array a into a' such that
the sum of the first n elements of a equals the sum of the first
ceil(n/2) elements of a'.
a must be a device pointer; n is the current logical length.
*/
__global__ void reduce(int* a, int n)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x;
    // Fold the back half onto the front half: j in [0, n/2) pairs with
    // n-1-j in (n/2, n-1], so no element is both read and written and
    // there is no race. For odd n the middle element a[n/2] is left in
    // place, which is why the host rounds the new length up to ceil(n/2).
    for (int j = i; j < n / 2; j += stride)
    {
        // Fixed: a(j) / a(n - 1 - j) was function-call syntax and does
        // not compile; indexing requires square brackets.
        a[j] += a[n - 1 - j];
    }
}
/*
For a device array a of length n, puts the sum of all elements in a[0].
Blocks until the result is available in device memory.
*/
void parallelReduce(int* a, int n)
{
    // The launch configuration depends only on the device, so query the
    // properties once, outside the loop (the query is not free), and
    // check its status like every other runtime call.
    cudaDeviceProp prop;
    checkCuda(cudaGetDeviceProperties(&prop, 0));
    const int threadsPerBlock = 256;
    const int numberOfBlocks = 32 * prop.multiProcessorCount;
    // Repeatedly use the helper kernel to condense a to its first element.
    while (n > 1) {
        reduce<<<numberOfBlocks, threadsPerBlock>>>(a, n);
        checkCuda(cudaGetLastError()); // catches launch-configuration errors
        n = (n + n % 2) / 2; // ceil(n/2): the kernel leaves ceil(n/2) partial sums.
    }
    // Kernels issued on the same (default) stream execute in order, so a
    // per-iteration cudaDeviceSynchronize is unnecessary; one sync at the
    // end both orders the host and surfaces any execution errors.
    checkCuda(cudaDeviceSynchronize());
}
int main()
{
    // Initialize a pinned host vector with N 1's, so the expected sum is N.
    // N is odd on purpose, exercising the kernel's round-up path.
    const int N = (2 << 27) + 1; // 2^28 + 1; still fits in int, as does the sum.
    const size_t size = (size_t)N * sizeof(int);
    int* h_a;
    // Pinned host memory (cudaMallocHost) gives higher transfer bandwidth.
    checkCuda(cudaMallocHost(&h_a, size));
    for (int i = 0; i < N; i++) {
        h_a[i] = 1; // fixed: h_a(i) was function-call syntax
    }
    // Copy to device (could be done asynchronously to hide transfer time,
    // but that would skew the kernel timing below).
    int* d_a;
    checkCuda(cudaMalloc(&d_a, size));
    checkCuda(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    // Sequential reference sum, timed in milliseconds.
    auto tic = std::chrono::high_resolution_clock::now();
    int hostSolution = 0;
    for (int i = 0; i < N; i++)
    {
        hostSolution += h_a[i]; // fixed: h_a(i) was function-call syntax
    }
    auto toc = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(toc - tic).count();
    std::cout << "The sequential function says the answer is " << hostSolution
              << " this took " << duration << " ms." << std::endl;
    // Timed device-side reduction (parallelReduce synchronizes internally).
    tic = std::chrono::high_resolution_clock::now();
    parallelReduce(d_a, N);
    toc = std::chrono::high_resolution_clock::now();
    auto parallelDuration = std::chrono::duration_cast<std::chrono::milliseconds>(toc - tic).count();
    // Copy the single-int result back to the host (blocking memcpy, so no
    // extra sync is needed before reading it).
    int solution;
    checkCuda(cudaMemcpy(&solution, d_a, sizeof(int), cudaMemcpyDeviceToHost));
    std::cout << "The parallel function says the answer is " << solution
              << " this took " << parallelDuration << " ms." << std::endl;
    // Guard against dividing by zero when the parallel run takes < 1 ms,
    // and use floating point so the speed-up is not truncated to an int.
    if (parallelDuration > 0) {
        std::cout << "This means we have achieved a speed up of "
                  << static_cast<double>(duration) / parallelDuration << std::endl;
    } else {
        std::cout << "Parallel time was below timer resolution; speed up not computed." << std::endl;
    }
    // Release device and pinned host allocations (previously leaked).
    checkCuda(cudaFree(d_a));
    checkCuda(cudaFreeHost(h_a));
    return 0;
}
```