#include "IntentiveCacheCleaning.h"
#include "CUDADeviceScopeGuard.h"
#include "CUDADriverInfo.h"
#include "CUDAEventTimer.h"
#include "CUDAMemoryHandlers.h"
#include "CUDAParallelFor.h"
#include "CUDAStreamsHandler.h"
#include "UtilityFunctions.h"
#include <benchmark/benchmark.h>

using namespace std;
using namespace UtilsCUDA;
using namespace UtilsCUDA::CUDAParallelFor;
using namespace Utils::UtilityFunctions;

/**
 * Benchmarks GPU buffer-to-buffer copies whose read and/or write index is shifted by 'offset' elements.
 *
 * For every device reported by CUDADriverInfo, each benchmark iteration runs three measurements:
 *   1. outData[index]          = inData[index + offset]  (shifted read,   unshifted write)
 *   2. outData[index + offset] = inData[index]           (unshifted read, shifted write)
 *   3. outData[index + offset] = inData[index + offset]  (shifted read,   shifted write)
 * Each measurement is bracketed by gpuTimer.startTimer() and gpuTimer.getElapsedTimeInMilliSecs(), so the
 * bracketed statements are the host-to-device copy request of inData plus the kernel launch. The elapsed time
 * is written to the debug console and cacheCleaner.cleanCache() is called before the next measurement.
 *
 * Both buffers hold (numberOfElements + offset) elements, so the largest index touched by any kernel
 * (numberOfElements - 1 + offset) stays inside the allocation.
 *
 * @param state Google Benchmark state object; state.range(0) supplies the offset in uint32_t elements.
 */
void BM_Misaligned_Global_Memory_Access(benchmark::State& state)
{
  constexpr size_t numberOfCUDAStreams = 1;
  const CUDADriverInfo cudaDriverInfo(cudaDeviceScheduleAuto, true);

  // state.range() returns a signed 64-bit value; the registered arguments are all non-negative
  const size_t offset           = static_cast<size_t>(state.range(0));
  const size_t numberOfElements =  128 * 1024 * 1024;
  const size_t dataSize         = 1024 * 1024 * 1024; // size handed to the cache cleaner

  DebugConsole_consoleOutLine("misalignment gap: ", offset);

  // NOTE(review): the 'for (auto _ : state)' loop below is entered once per device. Google Benchmark appears to
  // expect that loop to be run only once per benchmark invocation, so behavior on multi-GPU systems should be
  // confirmed (TODO verify against the Google Benchmark version in use).
  for (int device = 0; device < cudaDriverInfo.getDeviceCount(); ++device)
  {
    const CUDAStreamsHandler streams(cudaDriverInfo, device, numberOfCUDAStreams);
    CUDAEventTimer gpuTimer(device, streams[0]);

    // initialize input and output data (offset extra elements keep the shifted accesses in bounds)
    HostDeviceMemory<uint32_t>  inData(numberOfElements + offset, device);
    HostDeviceMemory<uint32_t> outData(numberOfElements + offset, device);
     inData.memsetAsync(0, streams[0], true);
    outData.memsetAsync(0, streams[0], true);

    // clear up L2 GPU cache
    IntentiveCacheCleaning cacheCleaner(dataSize, streams[0], device);
    cacheCleaner.cleanCache();

    for (auto _ : state)
    {
      // read misaligned data from GPU memory and write back on GPU memory
      gpuTimer.startTimer();
      inData.copyHostToDeviceAsync(streams[0]);
      {
        // choose which GPU to run the GPU kernel on for a multi-GPU system
        CUDADeviceScopeGuard deviceScopeGuard(device);
        launchCUDAParallelForInStream(numberOfElements, 0, streams[0], [] __device__(size_t index, uint32_t* __restrict inData, uint32_t* __restrict outData, size_t offset)
        {
          // shifted (misaligned) read, unshifted write
          outData[index] = inData[index + offset];
        }, inData.device(), outData.device(), offset);
      }
      DebugConsole_consoleOutLine("Read misaligned data from GPU memory and write back on GPU memory: ", gpuTimer.getElapsedTimeInMilliSecs(), " ms.");

      cacheCleaner.cleanCache();

      // read data from GPU memory and write back on misaligned GPU memory
      gpuTimer.startTimer();
      inData.copyHostToDeviceAsync(streams[0]);
      {
        // choose which GPU to run the GPU kernel on for a multi-GPU system
        CUDADeviceScopeGuard deviceScopeGuard(device);
        launchCUDAParallelForInStream(numberOfElements, 0, streams[0], [] __device__(size_t index, uint32_t* __restrict inData, uint32_t* __restrict outData, size_t offset)
        {
          // unshifted read, shifted (misaligned) write
          outData[index + offset] = inData[index];
        }, inData.device(), outData.device(), offset);
      }
      DebugConsole_consoleOutLine("Read data from GPU memory and write back on misaligned GPU memory: ", gpuTimer.getElapsedTimeInMilliSecs(), " ms.");

      cacheCleaner.cleanCache();

      // read misaligned data from GPU memory and write back on misaligned GPU memory
      gpuTimer.startTimer();
      inData.copyHostToDeviceAsync(streams[0]);
      {
        // choose which GPU to run the GPU kernel on for a multi-GPU system
        CUDADeviceScopeGuard deviceScopeGuard(device);
        launchCUDAParallelForInStream(numberOfElements, 0, streams[0], [] __device__(size_t index, uint32_t* __restrict inData, uint32_t* __restrict outData, size_t offset)
        {
          // shifted (misaligned) read and shifted (misaligned) write
          outData[index + offset] = inData[index + offset];
        }, inData.device(), outData.device(), offset);
      }
      DebugConsole_consoleOutLine("Read misaligned data from GPU memory and write back on misaligned GPU memory: ", gpuTimer.getElapsedTimeInMilliSecs(), " ms.");

      cacheCleaner.cleanCache();
    }
  }
}

// Register the benchmark once per element offset passed to state.range(0).
BENCHMARK(BM_Misaligned_Global_Memory_Access)
    ->Arg(0u)   // no shift
    ->Arg(1u)
    ->Arg(2u)
    ->Arg(4u)
    ->Arg(8u)
    ->Arg(16u)
    ->Arg(32u)
    ->Arg(64u)
    ->Iterations(8u) // fixed iteration count per argument
    ->UseRealTime();