#include "IntentiveCacheCleaning.h"
#include "CUDADriverInfo.h"
#include "CUDAEventTimer.h"
#include "CUDAMemoryHandlers.h"
#include "CUDAKernelLauncher.h"
#include "CUDAStreamsHandler.h"
#include "UtilityFunctions.h"
#include <benchmark/benchmark.h>
#include <memory>
#include <vector>

using namespace std;
using namespace UtilsCUDA;
using namespace Utils::UtilityFunctions;

namespace
{
  // Bundles every per-device object the benchmark needs (stream handler, GPU timer,
  // host/device buffer and cache cleaner) so that all devices can be prepared once,
  // before the single Google Benchmark state loop starts.
  // Members are declared in the same order the original code constructed them, so
  // they are destroyed in the same (reverse) order as before.
  struct AtomicBenchmarkDeviceContext final
  {
    const int device;                    // device index this context is bound to
    const std::size_t numberOfElements;  // number of uint32_t elements in inData
    const CUDAStreamsHandler streams;
    CUDAEventTimer gpuTimer;
    HostDeviceMemory<uint32_t> inData;
    IntentiveCacheCleaning cacheCleaner;

    // Constructs all resources for 'deviceIndex' and puts the buffer into its initial
    // (zeroed, uploaded, cache-cleaned) state.
    AtomicBenchmarkDeviceContext(const CUDADriverInfo& cudaDriverInfo, int deviceIndex, std::size_t numberOfCUDAStreams,
                                 std::size_t elementCount, std::size_t cacheCleanerDataSize)
      : device(deviceIndex)
      , numberOfElements(elementCount)
      , streams(cudaDriverInfo, deviceIndex, numberOfCUDAStreams)
      , gpuTimer(deviceIndex, streams[0])
      , inData(elementCount, deviceIndex)
      , cacheCleaner(cacheCleanerDataSize, streams[0], deviceIndex)
    {
      resetData();
    }

    AtomicBenchmarkDeviceContext(const AtomicBenchmarkDeviceContext&) = delete;
    AtomicBenchmarkDeviceContext& operator=(const AtomicBenchmarkDeviceContext&) = delete;

    // Zeroes the buffer, uploads it to the device on stream 0 and runs the cache cleaner,
    // i.e. the exact three-call sequence the benchmark used before every measurement.
    void resetData()
    {
      inData.memsetAsync(0, streams[0], true);
      inData.copyHostToDeviceAsync(streams[0]);
      cacheCleaner.cleanCache();
    }

    // Copies the buffer back to the host and returns the index of the first element that
    // differs from 'expectedValue', or numberOfElements when every element matches.
    // The original author annotated this copy as "enforce further synchronization", i.e. it
    // is also relied upon as the sync point after the asynchronous kernel launch.
    std::size_t downloadAndFindFirstMismatch(uint32_t expectedValue)
    {
      inData.copyDeviceToHost(streams[0]);
      for (std::size_t i = 0; i < numberOfElements; ++i)
      {
        if (inData[i] != expectedValue)
        {
          return i;
        }
      }
      return numberOfElements;
    }
  };
}

// Benchmarks a plain increment against atomicAdd() over a large global-memory buffer,
// using a single GPU thread (1x1x1 grid, 1x1x1 block) per device so that only the cost of
// the memory operation itself differs between the two kernels.
//
// For every benchmark iteration and every device reported by CUDADriverInfo:
//   1. time the non-atomic kernel, print the result, verify every element equals 1, reset;
//   2. time the atomic kernel,     print the result, verify every element equals 1, reset.
//
// The Google Benchmark state loop is entered exactly once per invocation: the library does
// not support re-entering 'for (auto _ : state)', which the previous per-device nesting did
// on multi-GPU machines. All per-device resources are therefore created up front.
// NOTE(review): this keeps the buffers of all devices alive simultaneously, so host memory
// usage now scales with the device count - confirm this is acceptable on multi-GPU hosts.
//
// state: Google Benchmark state; its Arg value is not read by this benchmark.
void BM_Atomic_Global_Memory_Access(benchmark::State& state)
{
  constexpr size_t numberOfCUDAStreams = 1;
  const CUDADriverInfo cudaDriverInfo(cudaDeviceScheduleAuto, true);

  const size_t numberOfElements =  128 * 1024 * 1024;
  const size_t dataSize         = 1024 * 1024 * 1024; // size handed to the cache cleaner

  // set up input and output data (and the cache cleaner) for every device
  std::vector<std::unique_ptr<AtomicBenchmarkDeviceContext>> deviceContexts;
  for (int device = 0; device < cudaDriverInfo.getDeviceCount(); ++device)
  {
    deviceContexts.push_back(std::make_unique<AtomicBenchmarkDeviceContext>(cudaDriverInfo, device, numberOfCUDAStreams,
                                                                            numberOfElements, dataSize));
  }

  if (deviceContexts.empty())
  {
    // Report the problem instead of silently returning without running the state loop;
    // after SkipWithError() the loop below is not entered.
    state.SkipWithError("No CUDA device available for BM_Atomic_Global_Memory_Access");
  }

  for (auto _ : state)
  {
    for (const auto& deviceContext : deviceContexts)
    {
      AtomicBenchmarkDeviceContext& context = *deviceContext;
      const auto& stream = context.streams[0];

      context.gpuTimer.startTimer();
      // Increment data using non atomic operation
      // (presumably 'index' starts at 0 for the single launched work item - confirm against KernelLauncher)
      KernelLauncher::create().setGridAndBlock(make_tuple(dim3{1, 1, 1}, dim3{1, 1, 1}))
                              .setStream(stream)
                              .setDevice(context.device)
                              .asynchronous()
                              .runCUDAParallelFor(1, [] __device__ (size_t index, size_t dataLength, uint32_t* __restrict data)
      {
        while (index < dataLength)
        {
          ++data[index];
          ++index;
        }
      }, numberOfElements, context.inData.device());
      // NOTE(review): no explicit timer stop is issued before reading the elapsed time, as in the
      // original code; presumably getElapsedTimeInMilliSecs() finalizes the measurement - confirm.
      DebugConsole_consoleOutLine("Non-atomics: ", context.gpuTimer.getElapsedTimeInMilliSecs(), " ms.");

      const size_t nonAtomicMismatch = context.downloadAndFindFirstMismatch(1);
      if (nonAtomicMismatch < numberOfElements)
      {
        DebugConsole_consoleOutLine("Error on non-atomic operations at ", nonAtomicMismatch);
      }
      context.resetData();

      context.gpuTimer.startTimer();
      // Increment data using atomic operation
      KernelLauncher::create().setGridAndBlock(make_tuple(dim3{1, 1, 1}, dim3{1, 1, 1}))
                              .setStream(stream)
                              .setDevice(context.device)
                              .asynchronous()
                              .runCUDAParallelFor(1, [] __device__ (size_t index, size_t dataLength, uint32_t* __restrict data)
      {
        while (index < dataLength)
        {
          atomicAdd(&(data[index]), 1);
          ++index;
        }
      }, numberOfElements, context.inData.device());
      DebugConsole_consoleOutLine("Atomics: ", context.gpuTimer.getElapsedTimeInMilliSecs(), " ms.");

      const size_t atomicMismatch = context.downloadAndFindFirstMismatch(1);
      if (atomicMismatch < numberOfElements)
      {
        DebugConsole_consoleOutLine("Error on atomic operations at ", atomicMismatch);
      }
      context.resetData();
    }
  }
}

// Registers the benchmark with Google Benchmark: one argument value (1, not read by the
// benchmark body), a fixed count of 4 iterations, and wall-clock (real) time reporting.
BENCHMARK(BM_Atomic_Global_Memory_Access)->Arg(1u)->Iterations(4u)->UseRealTime();