#include "CUDADeviceScopeGuard.h"
#include "CUDADriverInfo.h"
#include "CUDAEventTimer.h"
#include "CUDAMemoryHandlers.h"
#include "CUDAParallelFor.h"
#include "CUDAStreamsHandler.h"
#include "CUDAUtilityDeviceFunctions.h"
#include "CPUParallelism/CPUParallelismNCP.h"
#include "CPUParallelism/CPUParallelismUtilityFunctions.h"
#include "CPUParallelism/ThreadPool.h"
#include "Randomizers.h"
#include "UtilityFunctions.h"
#include <benchmark/benchmark.h>
#include <future>
#include <memory>
#include <limits>
#include <string>

using namespace std;
using namespace UtilsCUDA;
using namespace UtilsCUDA::CUDAParallelFor;
using namespace Utils;
using namespace Utils::CPUParallelism;
using namespace Utils::Randomizers;
using namespace Utils::UtilityFunctions;

namespace // anonymous namespace: keeps the helper below local to this cpp file (used instead of the deprecated 'static' keyword)
{
#if __CUDA_ARCH__ >= 500 // requires SM50+: Maxwell GPUs onwards for good atomicMin/Max 64bit support
  // Folds uint64Values[index] into a running {min, max} pair:
  //   atomicMinMaxUint64[0] receives the atomic minimum,
  //   atomicMinMaxUint64[1] receives the atomic maximum.
  // The caller must therefore supply room for at least two uint64_t entries.
  __forceinline__ __device__
    void setAtomicMinMaxUint64(size_t index, uint64_t* __restrict atomicMinMaxUint64, const uint64_t* __restrict uint64Values)
  {
    // Note: the CUDA atomicMin/atomicMax API is declared for 'unsigned long long int' rather than uint64_t,
    //       so both the candidate value and the destination pointer are bridged to that type here
    using CudaUint64 = unsigned long long int;

    const CudaUint64  candidate   = static_cast<CudaUint64>(uint64Values[index]);
    CudaUint64* const minMaxSlots = reinterpret_cast<CudaUint64*>(atomicMinMaxUint64);

    atomicMin(minMaxSlots,     candidate); // global atomic min -> slot 0
    atomicMax(minMaxSlots + 1, candidate); // global atomic max -> slot 1
  }
#endif // __CUDA_ARCH__
}

// Google Benchmark body comparing CPU atomic min/max against GPU atomic min/max.
//
// For every CUDA device reported by CUDADriverInfo, and for every benchmark iteration, four cases are run:
//   1. float32 values in the [-1, +1] range,
//   2. float64 values in the [-1, +1] range,
//   3. float32 values stored as order-preserving uint32 (float32Flip/float32Unflip),
//   4. float64 values stored as order-preserving uint64 (float64Flip/float64Unflip), only in the
//      '__CUDA_ARCH__ >= 500' compilation path.
// Each case fills a host/device buffer with random data, reduces it to {min, max} on the CPU through the
// thread pool and on the GPU through a parallel-for kernel, then prints whether both results agree.
//
// state: benchmark state; state.range(0) is the element count expressed in units of 1024 * 1024 elements.
void BM_Atomic_Min_Max(benchmark::State& state)
{
  constexpr size_t numberOfCUDAStreams = 1;
  const CUDADriverInfo cudaDriverInfo(cudaDeviceScheduleAuto, true);

  const size_t currentRange     = state.range(0);
  const size_t numberOfElements = currentRange * 1024 * 1024;

  const size_t threadLocalSize  = numberOfHardwareThreads();
  // initialize thread pool with default parameters
  ThreadPool threadPool(threadLocalSize, AFFINITY_MASK_ALL, PRIORITY_NONE);

  // initialize thread local storage data: one random number generator per pool thread, indexed by threadIdx below
  const auto threadLocalStorage = make_unique<RandomRNGWELL512[]>(threadLocalSize);

  DebugConsole_consoleOutLine("Number of elements for this benchmark run: ", numberOfElements);

  // NOTE(review): the 'for (auto _ : state)' loop below is entered once per device; Google Benchmark is
  //               documented around entering the state loop once per run - confirm multi-GPU behavior.
  for (int device = 0; device < cudaDriverInfo.getDeviceCount(); ++device)
  {
    const CUDAStreamsHandler streams(cudaDriverInfo, device, numberOfCUDAStreams);

    for (auto _ : state)
    {
      // float32 representation case
      {
        HostDeviceMemory<float> memoryHandler;
        auto future1 = memoryHandler.allocateAsync(numberOfElements, device);
        const auto checkHostPtr = make_unique<float[]>(numberOfElements);
        // wait until the allocateAsync() allocation above is done
        future1.wait();
        parallelForThreadLocal(0, numberOfElements, [&](size_t i, size_t threadIdx)
        {
          checkHostPtr[i] = memoryHandler[i] = 2.0f * float(threadLocalStorage[threadIdx]()) - 1.0f; // -1.0f - +1.0f range
        }, threadPool);
        memoryHandler.copyHostToDeviceAsync(streams[0]);
        // run CPU atomicMinMaxFloat32 kernel
        // Note: the max is seeded with lowest() (most negative finite value); for floating point types
        //       numeric_limits<>::min() is the smallest POSITIVE normalized value & would be a wrong seed
        atomic<float> atomicMinFloat32(numeric_limits<float>::max());
        atomic<float> atomicMaxFloat32(numeric_limits<float>::lowest());
        parallelFor(0, numberOfElements, [&](size_t i)
        {
          // below a naïve but thread-safe way of finding min/max in parallel
          CPUParallelismUtilityFunctions::atomicMin(atomicMinFloat32, checkHostPtr[i]); // global atomic min
          CPUParallelismUtilityFunctions::atomicMax(atomicMaxFloat32, checkHostPtr[i]); // global atomic max
        }, threadPool);
        // run GPU atomicMinMaxFloat32 kernel, seeded with the same {max(), lowest()} pair as the CPU side
        array<float, 2> hostAtomicMinMaxFloat32 = { { numeric_limits<float>::max(), numeric_limits<float>::lowest() } };
        DeviceMemory<float> atomicMinMaxFloat32(2, device);
        atomicMinMaxFloat32.copyHostToDeviceAsync(hostAtomicMinMaxFloat32.data(), streams[0]);
        {
          ProfileGPUTimer profileGPUTimer("parallelForCUDAAtomicMinMaxFloat32<<<>>> kernel time taken:", device, streams[0]);
          // choose which GPU to run the GPU kernel on for a multi-GPU system
          CUDADeviceScopeGuard deviceScopeGuard(device);
          launchCUDAParallelForInStream(numberOfElements, 0, streams[0], [] __device__ (size_t index, float* __restrict atomicMinMaxFloat32, const float* __restrict float32Values)
          {
            // below a naïve but thread-safe way of finding min/max in parallel
            CUDAUtilityDeviceFunctions::atomicMin(&atomicMinMaxFloat32[0], float32Values[index]); // global atomic min, do NOT use this utility function for shared memory
            CUDAUtilityDeviceFunctions::atomicMax(&atomicMinMaxFloat32[1], float32Values[index]); // global atomic max, do NOT use this utility function for shared memory
          }, atomicMinMaxFloat32.device(), memoryHandler.device());
        }
        // the result readback is asynchronous; the non-Async copyDeviceToHost() that follows is relied on to synchronize
        atomicMinMaxFloat32.copyDeviceToHostAsync(hostAtomicMinMaxFloat32.data(), streams[0]);
        memoryHandler.copyDeviceToHost(streams[0]); // enforce further synchronization
        // test results
        const bool float32MinResult = MathFunctions::equal(atomicMinFloat32.load(), hostAtomicMinMaxFloat32[0]);
        const bool float32MaxResult = MathFunctions::equal(atomicMaxFloat32.load(), hostAtomicMinMaxFloat32[1]);

        DebugConsole_consoleOutLine("Float32 CPU-vs-GPU verification succeeded: ", StringAuxiliaryFunctions::toString<bool>(float32MinResult && float32MaxResult));
      }

      // float64 representation case
      {
        HostDeviceMemory<double> memoryHandler;
        auto future1 = memoryHandler.allocateAsync(numberOfElements, device);
        const auto checkHostPtr = make_unique<double[]>(numberOfElements);
        // wait until the allocateAsync() allocation above is done
        future1.wait();
        parallelForThreadLocal(0, numberOfElements, [&](size_t i, size_t threadIdx)
        {
          checkHostPtr[i] = memoryHandler[i] = 2.0 * threadLocalStorage[threadIdx]() - 1.0; // -1.0 - +1.0 range
        }, threadPool);
        memoryHandler.copyHostToDeviceAsync(streams[0]);
        // run CPU atomicMinMaxFloat64 kernel
        // Note: the max is seeded with lowest() (most negative finite value); for floating point types
        //       numeric_limits<>::min() is the smallest POSITIVE normalized value & would be a wrong seed
        atomic<double> atomicMinFloat64(numeric_limits<double>::max());
        atomic<double> atomicMaxFloat64(numeric_limits<double>::lowest());
        parallelFor(0, numberOfElements, [&](size_t i)
        {
          // below a naïve but thread-safe way of finding min/max in parallel
          CPUParallelismUtilityFunctions::atomicMin(atomicMinFloat64, checkHostPtr[i]); // global atomic min
          CPUParallelismUtilityFunctions::atomicMax(atomicMaxFloat64, checkHostPtr[i]); // global atomic max
        }, threadPool);
        // run GPU atomicMinMaxFloat64 kernel, seeded with the same {max(), lowest()} pair as the CPU side
        array<double, 2> hostAtomicMinMaxFloat64 = { { numeric_limits<double>::max(), numeric_limits<double>::lowest() } };
        DeviceMemory<double> atomicMinMaxFloat64(2, device);
        atomicMinMaxFloat64.copyHostToDeviceAsync(hostAtomicMinMaxFloat64.data(), streams[0]);
        {
          ProfileGPUTimer profileGPUTimer("parallelForCUDAAtomicMinMaxFloat64<<<>>> kernel time taken:", device, streams[0]);
          // choose which GPU to run the GPU kernel on for a multi-GPU system
          CUDADeviceScopeGuard deviceScopeGuard(device);
          launchCUDAParallelForInStream(numberOfElements, 0, streams[0], [] __device__ (size_t index, double* __restrict atomicMinMaxFloat64, const double* __restrict float64Values)
          {
            // below a naïve but thread-safe way of finding min/max in parallel
            CUDAUtilityDeviceFunctions::atomicMin(&atomicMinMaxFloat64[0], float64Values[index]); // global atomic min, do NOT use this utility function for shared memory
            CUDAUtilityDeviceFunctions::atomicMax(&atomicMinMaxFloat64[1], float64Values[index]); // global atomic max, do NOT use this utility function for shared memory
          }, atomicMinMaxFloat64.device(), memoryHandler.device());
        }
        // the result readback is asynchronous; the non-Async copyDeviceToHost() that follows is relied on to synchronize
        atomicMinMaxFloat64.copyDeviceToHostAsync(hostAtomicMinMaxFloat64.data(), streams[0]);
        memoryHandler.copyDeviceToHost(streams[0]); // enforce further synchronization
        // test results
        const bool float64MinResult = MathFunctions::equal(atomicMinFloat64.load(), hostAtomicMinMaxFloat64[0]);
        const bool float64MaxResult = MathFunctions::equal(atomicMaxFloat64.load(), hostAtomicMinMaxFloat64[1]);

        DebugConsole_consoleOutLine("Float64 CPU-vs-GPU verification succeeded: ", StringAuxiliaryFunctions::toString<bool>(float64MinResult && float64MaxResult));
      }

      // float32 -> uint32 representation case
      {
        HostDeviceMemory<uint32_t> memoryHandler;
        auto future1 = memoryHandler.allocateAsync(numberOfElements, device);
        const auto checkHostPtr = make_unique<uint32_t[]>(numberOfElements);
        // wait until the allocateAsync() allocation above is done
        future1.wait();
        parallelForThreadLocal(0, numberOfElements, [&](size_t i, size_t threadIdx)
        {
          checkHostPtr[i] = memoryHandler[i] = CUDAUtilityFunctions::float32Flip(2.0f * float(threadLocalStorage[threadIdx]()) - 1.0f); // -1.0f - +1.0f range
        }, threadPool);
        memoryHandler.copyHostToDeviceAsync(streams[0]);
        // run CPU atomicMinMaxUint32 kernel
        atomic<uint32_t> atomicMinUint32(numeric_limits<uint32_t>::max());
        atomic<uint32_t> atomicMaxUint32(numeric_limits<uint32_t>::lowest());
        parallelFor(0, numberOfElements, [&](size_t i)
        {
          // below a naïve but thread-safe way of finding min/max in parallel
          CPUParallelismUtilityFunctions::atomicMin(atomicMinUint32, checkHostPtr[i]); // global atomic min
          CPUParallelismUtilityFunctions::atomicMax(atomicMaxUint32, checkHostPtr[i]); // global atomic max
        }, threadPool);
        // run GPU atomicMinMaxUint32 kernel, seeded with the same {max(), lowest()} pair as the CPU side
        array<uint32_t, 2> hostAtomicMinMaxUint32 = { { numeric_limits<uint32_t>::max(), numeric_limits<uint32_t>::lowest() } };
        DeviceMemory<uint32_t> atomicMinMaxUint32(2, device);
        atomicMinMaxUint32.copyHostToDeviceAsync(hostAtomicMinMaxUint32.data(), streams[0]);
        {
          ProfileGPUTimer profileGPUTimer("parallelForCUDAAtomicMinMaxUint32<<<>>> kernel time taken:", device, streams[0]);
          // choose which GPU to run the GPU kernel on for a multi-GPU system
          CUDADeviceScopeGuard deviceScopeGuard(device);
          launchCUDAParallelForInStream(numberOfElements, 0, streams[0], [] __device__ (size_t index, uint32_t* __restrict atomicMinMaxUint32, const uint32_t* __restrict uint32Values)
          {
            // below a naïve but thread-safe way of finding min/max in parallel
            atomicMin(&atomicMinMaxUint32[0], uint32Values[index]); // global atomic min
            atomicMax(&atomicMinMaxUint32[1], uint32Values[index]); // global atomic max
          }, atomicMinMaxUint32.device(), memoryHandler.device());
        }
        // the result readback is asynchronous; the non-Async copyDeviceToHost() that follows is relied on to synchronize
        atomicMinMaxUint32.copyDeviceToHostAsync(hostAtomicMinMaxUint32.data(), streams[0]);
        memoryHandler.copyDeviceToHost(streams[0]); // enforce further synchronization
        // test results: compared both as unflipped floats (Result1) & as raw uint32 bit patterns (Result2)
        const bool uint32MinResult1 = MathFunctions::equal(CUDAUtilityFunctions::float32Unflip(atomicMinUint32.load()), CUDAUtilityFunctions::float32Unflip(hostAtomicMinMaxUint32[0]));
        const bool uint32MaxResult1 = MathFunctions::equal(CUDAUtilityFunctions::float32Unflip(atomicMaxUint32.load()), CUDAUtilityFunctions::float32Unflip(hostAtomicMinMaxUint32[1]));
        const bool uint32MinResult2 = (atomicMinUint32.load() == hostAtomicMinMaxUint32[0]);
        const bool uint32MaxResult2 = (atomicMaxUint32.load() == hostAtomicMinMaxUint32[1]);

        DebugConsole_consoleOutLine("Uint32 CPU-vs-GPU verification succeeded: ", StringAuxiliaryFunctions::toString<bool>(uint32MinResult1 && uint32MaxResult1 && uint32MinResult2 && uint32MaxResult2));
      }

      // NOTE(review): __CUDA_ARCH__ is only defined during device compilation passes, so guarding host code
      //               (and a __device__ lambda) with it looks fragile - confirm against the CUDA C++
      //               Programming Guide restrictions on __CUDA_ARCH__ & extended lambdas.
#if __CUDA_ARCH__ >= 500 // Maxwell GPUs onwards for good atomicMin/Max 64bit support
      // Maxwell GPUs onwards for good atomicMin/Max 64bit support
      // NOTE(review): the capability check below queries device 0 while the work runs on 'device' - confirm intent.
      if (cudaDriverInfo.getIsAtLeastMaxwell(0)) // 1st GPU only runs this test
      {
        // float64 -> uint64 representation case
        {
          HostDeviceMemory<uint64_t> memoryHandler;
          auto future1 = memoryHandler.allocateAsync(numberOfElements, device);
          const auto checkHostPtr = make_unique<uint64_t[]>(numberOfElements);
          // wait until the allocateAsync() allocation above is done
          future1.wait();
          parallelForThreadLocal(0, numberOfElements, [&](size_t i, size_t threadIdx)
          {
            checkHostPtr[i] = memoryHandler[i] = CUDAUtilityFunctions::float64Flip(2.0 * threadLocalStorage[threadIdx]() - 1.0); // -1.0 - +1.0 range
          }, threadPool);
          memoryHandler.copyHostToDeviceAsync(streams[0]);
          // run CPU atomicMinMaxUint64 kernel
          atomic<uint64_t> atomicMinUint64(numeric_limits<uint64_t>::max());
          atomic<uint64_t> atomicMaxUint64(numeric_limits<uint64_t>::lowest());
          parallelFor(0, numberOfElements, [&](size_t i)
          {
            // below a naïve but thread-safe way of finding min/max in parallel
            CPUParallelismUtilityFunctions::atomicMin(atomicMinUint64, checkHostPtr[i]); // global atomic min
            CPUParallelismUtilityFunctions::atomicMax(atomicMaxUint64, checkHostPtr[i]); // global atomic max
          }, threadPool);
          // run GPU atomicMinMaxUint64 kernel, seeded with the same {max(), lowest()} pair as the CPU side
          array<uint64_t, 2> hostAtomicMinMaxUint64 = { { numeric_limits<uint64_t>::max(), numeric_limits<uint64_t>::lowest() } };
          DeviceMemory<uint64_t> atomicMinMaxUint64(2, device);
          atomicMinMaxUint64.copyHostToDeviceAsync(hostAtomicMinMaxUint64.data(), streams[0]);
          {
            ProfileGPUTimer profileGPUTimer("parallelForCUDAAtomicMinMaxUint64<<<>>> kernel time taken:", device, streams[0]);
            // choose which GPU to run the GPU kernel on for a multi-GPU system
            CUDADeviceScopeGuard deviceScopeGuard(device);
            launchCUDAParallelForInStream(numberOfElements, 0, streams[0], [] __device__ (size_t index, uint64_t* __restrict atomicMinMaxUint64, const uint64_t* __restrict uint64Values)
            {
              // below a naïve but thread-safe way of finding min/max in parallel
              // (the destination is the lambda's own 'atomicMinMaxUint64' parameter: slot 0 = min, slot 1 = max)
              setAtomicMinMaxUint64(index, atomicMinMaxUint64, uint64Values);
            }, atomicMinMaxUint64.device(), memoryHandler.device());
          }
          // the result readback is asynchronous; the non-Async copyDeviceToHost() that follows is relied on to synchronize
          atomicMinMaxUint64.copyDeviceToHostAsync(hostAtomicMinMaxUint64.data(), streams[0]);
          memoryHandler.copyDeviceToHost(streams[0]); // enforce further synchronization
          // test results: compared both as unflipped doubles (Result1) & as raw uint64 bit patterns (Result2)
          const bool uint64MinResult1 = MathFunctions::equal(CUDAUtilityFunctions::float64Unflip(atomicMinUint64.load()), CUDAUtilityFunctions::float64Unflip(hostAtomicMinMaxUint64[0]));
          const bool uint64MaxResult1 = MathFunctions::equal(CUDAUtilityFunctions::float64Unflip(atomicMaxUint64.load()), CUDAUtilityFunctions::float64Unflip(hostAtomicMinMaxUint64[1]));
          const bool uint64MinResult2 = (atomicMinUint64.load() == hostAtomicMinMaxUint64[0]);
          const bool uint64MaxResult2 = (atomicMaxUint64.load() == hostAtomicMinMaxUint64[1]);

          DebugConsole_consoleOutLine("Uint64 CPU-vs-GPU verification succeeded: ", StringAuxiliaryFunctions::toString<bool>(uint64MinResult1 && uint64MaxResult1 && uint64MinResult2 && uint64MaxResult2));
        }
      }
#endif // __CUDA_ARCH__
    }
  }
}

// Benchmark registration: state.range(0) sweeps the dense range 1..4, every run uses a fixed
// 8 iterations and the reported measurement is based on real (wall-clock) time.
BENCHMARK(BM_Atomic_Min_Max)
  ->DenseRange(1u, 4u)
  ->Iterations(8u)
  ->UseRealTime();