#include "GPUPrimerUnitTests.h"
#include "CUDADriverInfo.h"
#include "CUDAEventTimer.h"
#include "CUDAMemoryHandlers.h"
#include "CUDAParallelFor.h"
#include "CPUParallelism/CPUParallelismNCP.h"
#include "AccurateTimers.h"
#include "Randomizers.h"
#include "UtilityFunctions.h"
#include <gtest/gtest.h>
#include <cuda_runtime_api.h>
#include <vector_types.h>
#include <array>
#include <cstdint>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

using namespace std;
using namespace Tests;
using namespace UtilsCUDA;
using namespace Utils;
using namespace Utils::CPUParallelism;
using namespace Utils::AccurateTimers;
using namespace Utils::Randomizers;
using namespace Utils::UtilityFunctions;

namespace // anonymous namespace used instead of deprecated 'static' keyword used for cpp variable locality
{
  namespace PointCloudTransformCPU
  {
    // CPU function that rotates and then translates a single 3D point into the output array.
    //
    // points          : input coordinates, interleaved as x,y,z per point (read at [3 * index + 0..2])
    // rotation        : 9 floats of a 3x3 matrix in row-major order (row i is read at [3 * i + 0..2])
    // translation     : 3 floats that are added after the rotation
    // pointsTransform : output coordinates with the same interleaved layout as 'points' (written at [3 * index + 0..2])
    // index           : the index of the point to be transformed
    //
    // Note: the result is accumulated in a local variable and then assigned, so the output does not depend on the
    //       previous content of 'pointsTransform' (i.e. the function can safely be re-run on an already used buffer).
    //       For a zero-initialized output the very same additions are executed in the very same order as when
    //       accumulating directly into the output array.
    inline void transform(const float* __restrict__ points, const float* __restrict__ rotation, const float* __restrict__ translation, float* __restrict__ pointsTransform, size_t index)
    {
      const float* const point    = points          + 3U * index;
      float* const pointTransform = pointsTransform + 3U * index;
      for (size_t i = 0U; i < 3U; ++i)
      {
        // rotation: dot product of the i-th matrix row with the input point
        float accumulator = 0.0f;
        for (size_t j = 0U; j < 3U; ++j)
        {
          accumulator += (rotation[3U * i + j] * point[j]);
        }

        // translation
        pointTransform[i] = accumulator + translation[i];
      }
    }

    // Transforms all 'numberOfPoints' points one after the other on the calling thread.
    // See transform() above for the expected layout of the arrays.
    inline void transformSingleCore(size_t numberOfPoints, const float* __restrict__ points, const float* __restrict__ rotation, const float* __restrict__ translation, float* __restrict__ pointsTransform)
    {
      for (size_t index = 0U; index < numberOfPoints; ++index)
      {
        transform(points, rotation, translation, pointsTransform, index);
      }
    }

    // Transforms all 'numberOfPoints' points by handing the per-point work to parallelFor().
    // Every index writes only its own three output floats, so no two indices touch the same output element.
    // See transform() above for the expected layout of the arrays.
    inline void transformMultiCore(size_t numberOfPoints, const float* __restrict__ points, const float* __restrict__ rotation, const float* __restrict__ translation, float* __restrict__ pointsTransform)
    {
      parallelFor(0U, numberOfPoints, [=](size_t index) { transform(points, rotation, translation, pointsTransform, index); });
    }
  }

  namespace PointCloudTransformGPU
  {
    // Evaluates a CUDA runtime API expression once and hands any status other than 'cudaSuccess' to reportError().
    // Note: the body is wrapped in 'do { ... } while (0)' so that 'CHECK_CUDA_ERROR(call);' expands to exactly one
    //       statement and stays correct inside un-braced if/else branches.
    #define CHECK_CUDA_ERROR(...)                                                                                 \
    do                                                                                                            \
    {                                                                                                             \
      const cudaError_t checkedStatus = (__VA_ARGS__);                                                            \
      if (checkedStatus != cudaSuccess)                                                                           \
      {                                                                                                           \
        PointCloudTransformGPU::reportError(cudaGetErrorString(checkedStatus), __FILE__, __func__, __LINE__); \
      }                                                                                                           \
    } while (0)

    // Prints a CUDA error report (file, function, line & error text) to the debug console.
    //
    // error    : the human readable error description
    // file     : the source file where the error was detected
    // function : the function where the error was detected
    // line     : the source line where the error was detected
    // abort    : when true (the default) the process is terminated with EXIT_FAILURE after the report
    inline void reportError(const string& error, const char* file, const char* function, int line, bool abort = true)
    {
      ostringstream ss;
      ss << "\nFile: " << file << endl;
      ss << "Function: " << function << endl;
      ss << "Line: " << line << endl;
      ss << "CUDA error reported: " << error << endl;
      DebugConsole_consoleOutLine(ss.str());
      if (abort)
      {
        exit(EXIT_FAILURE);
      }
    }

    // GPU kernel function that rotates and then translates an array of 3D points into a new array.
    //
    // Launch layout   : 1D grid of 1D blocks with one thread per point, threads beyond 'numberOfPoints' exit early
    // numberOfPoints  : the number of 3D points to be transformed
    // points          : input coordinates, interleaved as x,y,z per point (read at [3 * index + 0..2])
    // rotation        : 9 floats of a 3x3 matrix in row-major order (row i is read at [3 * i + 0..2])
    // translation     : 3 floats that are added after the rotation
    // pointsTransform : output coordinates with the same interleaved layout as 'points' (written at [3 * index + 0..2])
    //
    // Note: the result is accumulated in a local variable and stored once per coordinate, so the output does not
    //       depend on the previous content of 'pointsTransform' (a prior memset to zero is no longer a requirement).
    __global__ void transform(size_t numberOfPoints, const float* __restrict__ points, const float* __restrict__ rotation, const float* __restrict__ translation, float* __restrict__ pointsTransform)
    {
      // widen to size_t before multiplying, 'blockIdx.x * blockDim.x' alone is evaluated in 32-bit unsigned arithmetic
      const size_t index = size_t(blockIdx.x) * blockDim.x + threadIdx.x;
      if (index >= numberOfPoints)
      {
        return;
      }

      const float* const point    = points          + 3U * index;
      float* const pointTransform = pointsTransform + 3U * index;
      for (size_t i = 0U; i < 3U; ++i)
      {
        // rotation: dot product of the i-th matrix row with the input point
        float accumulator = 0.0f;
        for (size_t j = 0U; j < 3U; ++j)
        {
          accumulator += (rotation[3U * i + j] * point[j]);
        }

        // translation
        pointTransform[i] = accumulator + translation[i];
      }
    }
  }

  namespace PointCloudTransformGPUDotRed
  {
    // GPU device function that rotates and then translates a single 3D point into the output array.
    //
    // index           : the index of the point to be transformed
    // points          : input coordinates, interleaved as x,y,z per point (read at [3 * index + 0..2])
    // rotation        : 9 floats of a 3x3 matrix in row-major order (row i is read at [3 * i + 0..2])
    // translation     : 3 floats that are added after the rotation
    // pointsTransform : output coordinates with the same interleaved layout as 'points' (written at [3 * index + 0..2])
    //
    // Note: the result is accumulated in a local variable and stored once per coordinate, so the output does not
    //       depend on the previous content of 'pointsTransform'. For a zero-initialized output the very same
    //       additions are executed in the very same order as when accumulating directly into the output array.
    __forceinline__ __device__ void transform(size_t index, const float* __restrict__ points, const float* __restrict__ rotation, const float* __restrict__ translation, float* __restrict__ pointsTransform)
    {
      const float* const point    = points          + 3U * index;
      float* const pointTransform = pointsTransform + 3U * index;
      for (size_t i = 0U; i < 3U; ++i)
      {
        // rotation: dot product of the i-th matrix row with the input point
        float accumulator = 0.0f;
        for (size_t j = 0U; j < 3U; ++j)
        {
          accumulator += (rotation[3U * i + j] * point[j]);
        }

        // translation
        pointTransform[i] = accumulator + translation[i];
      }
    }
  }
}

// Times the CPU point cloud transformation in single-core & multi-core mode and verifies both results.
// The example uses an identity rotation and a zero translation, so the expected output is the input itself.
void GPUPrimerGoogleTest01__PointCloudTransformCPU::executeTest()
{
  // number of points for test
  constexpr size_t numberOfPoints = 10000000U;
  // each point is 3D, so x,y,z coordinates are stored per point
  constexpr size_t numberOfCoordinates = 3U * numberOfPoints;

  // start the CPU timer
  AccurateCPUTimer timer;

  // allocate CPU memory
  timer.startTimer();
  vector<float> points(numberOfCoordinates);          // default constructor of a float primitive equals to zero
  vector<float> pointsTransform(numberOfCoordinates); // default constructor of a float primitive equals to zero
  timer.stopTimer();
  DebugConsole_consoleOutLine("CPU allocation and default construction took ", timer.getDecimalElapsedTimeInMilliSecs(), " ms.");

  // generate example data
  const array<float, 9U> rotationMatrix{ {1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f} };
  const array<float, 3U> translationMatrix{ {0.0f, 0.0f, 0.0f} };
  // generate random input point data
  UniformRandom random;
  // each point is 3D, so generate x,y,z coordinates
  for (size_t index = 0U; index < numberOfPoints; ++index)
  {
    points[3U * index + 0U] = float(random());
    points[3U * index + 1U] = float(random());
    points[3U * index + 2U] = float(random());
  }

  // execute transformation in single-core mode
  timer.startTimer();
  PointCloudTransformCPU::transformSingleCore(numberOfPoints, points.data(), rotationMatrix.data(), translationMatrix.data(), pointsTransform.data());
  timer.stopTimer();
  DebugConsole_consoleOutLine("CPU transformSingleCore() function took ", timer.getDecimalElapsedTimeInMilliSecs(), " ms.");

  // identity rotation & zero translation must reproduce the input exactly
  EXPECT_TRUE(pointsTransform == points) << "transformSingleCore() did not reproduce the input for the identity transform";

  // reset the output (outside of the timed region) so that the multi-core run starts from the same zeroed state
  // as the single-core run and does not build on top of the previous result
  pointsTransform.assign(numberOfCoordinates, 0.0f);

  // execute transformation in multi-core mode
  timer.startTimer();
  PointCloudTransformCPU::transformMultiCore(numberOfPoints, points.data(), rotationMatrix.data(), translationMatrix.data(), pointsTransform.data());
  timer.stopTimer();
  DebugConsole_consoleOutLine("CPU transformMultiCore() function took ", timer.getDecimalElapsedTimeInMilliSecs(), " ms.");

  // identity rotation & zero translation must reproduce the input exactly
  EXPECT_TRUE(pointsTransform == points) << "transformMultiCore() did not reproduce the input for the identity transform";

  // we reached the end with no problem
  EXPECT_TRUE(!points.empty());
  EXPECT_TRUE(!pointsTransform.empty());
}

// GoogleTest registration of the CPU point cloud transformation test, the test body simply forwards to executeTest().
TEST(GPUPrimerGoogleTest01__PointCloudTransformCPU, PointCloudTransformCPU)
{
  GPUPrimerGoogleTest01__PointCloudTransformCPU::executeTest();
}

// Times the point cloud transformation with the plain CUDA runtime API (allocation, upload, kernel, download)
// and verifies the downloaded result.
// The example uses an identity rotation and a zero translation, so the expected output is the input itself.
void GPUPrimerGoogleTest02__PointCloudTransformGPU::executeTest()
{
  // number of points for test
  constexpr size_t numberOfPoints = 10000000U;
  // kernel launch configuration: one thread per point with the number of blocks rounded up (ceiling division),
  // so that no surplus block is launched when the number of points is a multiple of the block size
  constexpr unsigned int threadsPerBlock = 256U;
  constexpr unsigned int numberOfBlocks  = static_cast<unsigned int>((numberOfPoints + threadsPerBlock - 1U) / threadsPerBlock);

  // Note: the canonical way to force runtime API context establishment is to call 'cudaFree(nullptr)'
  CHECK_CUDA_ERROR(cudaFree(nullptr));

  // use a CUDA event timer
  cudaEvent_t start{};
  cudaEvent_t stop{};
  CHECK_CUDA_ERROR(cudaEventCreate(&start));
  CHECK_CUDA_ERROR(cudaEventCreate(&stop));
  float elapsedTime = 0.0f;

  // generate example data
  const array<float, 9U> rotationMatrix{ {1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f} };
  const array<float, 3U> translationMatrix{ {0.0f, 0.0f, 0.0f} };
  vector<float> points(3U * numberOfPoints);          // default constructor of a float primitive equals to zero
  vector<float> pointsTransform(3U * numberOfPoints); // default constructor of a float primitive equals to zero
  UniformRandom random;
  // each point is 3D, so generate x,y,z coordinates
  for (size_t index = 0U; index < numberOfPoints; ++index)
  {
    points[3U * index + 0U] = float(random());
    points[3U * index + 1U] = float(random());
    points[3U * index + 2U] = float(random());
  }

  // allocate memory (all byte sizes are derived from the host containers to keep them in sync)
  float* pointsGPU = nullptr;
  float* pointsTransformGPU = nullptr;
  float* rotationMatrixGPU = nullptr;
  float* translationMatrixGPU = nullptr;
  const size_t pointsSizeBytes      = points.size()            * sizeof(float);
  const size_t rotationSizeBytes    = rotationMatrix.size()    * sizeof(float);
  const size_t translationSizeBytes = translationMatrix.size() * sizeof(float);
  CHECK_CUDA_ERROR(cudaEventRecord(start));
  CHECK_CUDA_ERROR(cudaMalloc(&pointsGPU, pointsSizeBytes));
  CHECK_CUDA_ERROR(cudaMalloc(&pointsTransformGPU, pointsSizeBytes));
  CHECK_CUDA_ERROR(cudaMemset(pointsTransformGPU, 0, pointsSizeBytes));
  CHECK_CUDA_ERROR(cudaMalloc(&rotationMatrixGPU, rotationSizeBytes));
  CHECK_CUDA_ERROR(cudaMalloc(&translationMatrixGPU, translationSizeBytes));
  CHECK_CUDA_ERROR(cudaEventRecord(stop));
  CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
  CHECK_CUDA_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
  DebugConsole_consoleOutLine("GPU allocation and memset function took ", elapsedTime, " ms.");

  // copy from CPU to GPU
  CHECK_CUDA_ERROR(cudaEventRecord(start));
  CHECK_CUDA_ERROR(cudaMemcpy(pointsGPU, points.data(), pointsSizeBytes, cudaMemcpyHostToDevice));
  CHECK_CUDA_ERROR(cudaMemcpy(rotationMatrixGPU, rotationMatrix.data(), rotationSizeBytes, cudaMemcpyHostToDevice));
  CHECK_CUDA_ERROR(cudaMemcpy(translationMatrixGPU, translationMatrix.data(), translationSizeBytes, cudaMemcpyHostToDevice));
  CHECK_CUDA_ERROR(cudaEventRecord(stop));
  CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
  CHECK_CUDA_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
  DebugConsole_consoleOutLine("Moving from CPU to GPU took ", elapsedTime, " ms.");

  // transform point cloud
  CHECK_CUDA_ERROR(cudaEventRecord(start));
  PointCloudTransformGPU::transform<<<numberOfBlocks, threadsPerBlock>>>(numberOfPoints, pointsGPU, rotationMatrixGPU, translationMatrixGPU, pointsTransformGPU);
  // a kernel launch returns no status by itself, so pick up any launch configuration error here
  CHECK_CUDA_ERROR(cudaGetLastError());
  CHECK_CUDA_ERROR(cudaEventRecord(stop));
  // waiting for the stop event also waits for the kernel that was enqueued before it
  CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
  CHECK_CUDA_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
  DebugConsole_consoleOutLine("GPU transform function took ", elapsedTime, " ms.");

  // get results to CPU
  CHECK_CUDA_ERROR(cudaEventRecord(start));
  CHECK_CUDA_ERROR(cudaMemcpy(pointsTransform.data(), pointsTransformGPU, pointsSizeBytes, cudaMemcpyDeviceToHost));
  CHECK_CUDA_ERROR(cudaEventRecord(stop));
  CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
  CHECK_CUDA_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
  DebugConsole_consoleOutLine("Moving from GPU to CPU took ", elapsedTime, " ms.");

  CHECK_CUDA_ERROR(cudaEventDestroy(start));
  CHECK_CUDA_ERROR(cudaEventDestroy(stop));

  CHECK_CUDA_ERROR(cudaFree(pointsGPU));
  CHECK_CUDA_ERROR(cudaFree(pointsTransformGPU));
  CHECK_CUDA_ERROR(cudaFree(rotationMatrixGPU));
  CHECK_CUDA_ERROR(cudaFree(translationMatrixGPU));

  // identity rotation & zero translation must reproduce the input exactly
  EXPECT_TRUE(pointsTransform == points) << "the GPU transform kernel did not reproduce the input for the identity transform";

  // we reached the end with no problem
  EXPECT_TRUE(!points.empty());
  EXPECT_TRUE(!pointsTransform.empty());
}

// GoogleTest registration of the raw CUDA point cloud transformation test, the test body simply forwards to executeTest().
TEST(GPUPrimerGoogleTest02__PointCloudTransformGPU, PointCloudTransformGPU)
{
  GPUPrimerGoogleTest02__PointCloudTransformGPU::executeTest();
}

// Times the point cloud transformation on the GPU through the memory handler, event timer & parallel-for
// helpers of the framework: allocation, upload, transformation and download are measured one after the other.
void GPUPrimerGoogleTest03__PointCloudTransformGPUDotRed::executeTest()
{
  // problem size: every point is 3D and contributes three consecutive coordinates (x,y,z)
  constexpr size_t numberOfPoints      = 10000000U;
  constexpr size_t numberOfCoordinates = 3U * numberOfPoints;

  // CUDA driver info for testing the GPU(s), kept alive for the whole test
  // NOTE(review): the second argument presumably enables the optional CUDA profiling - confirm against CUDADriverInfo
  const CUDADriverInfo cudaDriverInfo(cudaDeviceScheduleAuto, true);
  // the GPU timer that is reused for every measured stage below
  CUDAEventTimer gpuTimer;

  // stage 1: allocate the memory handlers & clear the output
  gpuTimer.startTimer();
  HostDeviceMemory<float> pointsHandler(3U * numberOfPoints);
  HostDeviceMemory<float> pointsTransformHandler(3U * numberOfPoints);
  DeviceMemory<float> rotationHandler(9U);
  DeviceMemory<float> translationHandler(3U);
  pointsTransformHandler.memset(0);
  gpuTimer.stopTimer();
  DebugConsole_consoleOutLine("GPU allocation and memset function took ", gpuTimer.getDecimalElapsedTimeInMilliSecs(), " ms.");

  // example transformation: identity rotation (3x3) & zero translation
  const array<float, 9U> rotationMatrix{ {1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f} };
  const array<float, 3U> translationMatrix{ {0.0f, 0.0f, 0.0f} };
  // random input point data, the coordinates are filled front to back (x,y,z of point 0, then x,y,z of point 1, ...)
  UniformRandom randomGenerator;
  for (size_t coordinate = 0U; coordinate < numberOfCoordinates; ++coordinate)
  {
    pointsHandler[coordinate] = float(randomGenerator());
  }

  // stage 2: upload the points and the transformation
  gpuTimer.startTimer();
  pointsHandler.copyHostToDevice();
  rotationHandler.copyHostToDevice(rotationMatrix.data());
  translationHandler.copyHostToDevice(translationMatrix.data());
  gpuTimer.stopTimer();
  DebugConsole_consoleOutLine("Moving from CPU to GPU took ", gpuTimer.getDecimalElapsedTimeInMilliSecs(), " ms.");

  // stage 3: transform the point cloud, the device lambda forwards every index to the device transform function
  gpuTimer.startTimer();
  CUDAParallelFor::launchCUDAParallelFor(numberOfPoints, [] __device__(size_t index, const float* __restrict__ points, const float* __restrict__ rotation, const float* __restrict__ translation, float* __restrict__ pointsTransform)
  {
    PointCloudTransformGPUDotRed::transform(index, points, rotation, translation, pointsTransform);
  },
  pointsHandler.device(), rotationHandler.device(), translationHandler.device(), pointsTransformHandler.device());
  gpuTimer.stopTimer();
  DebugConsole_consoleOutLine("GPU transform function took ", gpuTimer.getDecimalElapsedTimeInMilliSecs(), " ms.");

  // stage 4: download the results
  gpuTimer.startTimer();
  pointsTransformHandler.copyDeviceToHost();
  gpuTimer.stopTimer();
  DebugConsole_consoleOutLine("Moving from GPU to CPU took ", gpuTimer.getDecimalElapsedTimeInMilliSecs(), " ms.");

  // sanity checks: all handlers evaluate to true and report a non-zero number of elements
  EXPECT_TRUE(pointsHandler);
  EXPECT_TRUE(pointsTransformHandler);
  EXPECT_TRUE(rotationHandler);
  EXPECT_TRUE(translationHandler);
  EXPECT_TRUE(pointsHandler.getNumberOfElements() > 0U);
  EXPECT_TRUE(pointsTransformHandler.getNumberOfElements() > 0U);
  EXPECT_TRUE(rotationHandler.getNumberOfElements() > 0U);
  EXPECT_TRUE(translationHandler.getNumberOfElements() > 0U);
}

// GoogleTest registration of the framework based GPU point cloud transformation test, the test body simply forwards to executeTest().
TEST(GPUPrimerGoogleTest03__PointCloudTransformGPUDotRed, PointCloudTransformGPUDotRed)
{
  GPUPrimerGoogleTest03__PointCloudTransformGPUDotRed::executeTest();
}

// Entry point of the unit tests executable: initializes GoogleTest with the command line arguments,
// runs all registered tests and returns the GoogleTest result as the process exit code.
int main(int argc, char* argv[])
{
#ifdef GPU_FRAMEWORK_DEBUG
  // debug builds of the framework only: switch on the DebugConsole log file & set its file name
  DebugConsole::setUseLogFile(true);
  DebugConsole::setLogFileName("GPUPrimerUnitTests.log");
#endif // GPU_FRAMEWORK_DEBUG

  // GoogleTest consumes the command line flags it recognizes
  testing::InitGoogleTest(&argc, argv);

  const int testsResult = RUN_ALL_TESTS();
  return testsResult;
}