// Copyright 2019-2024, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#ifndef _NVHPC_OPENACC_SCAN_HPP
#define _NVHPC_OPENACC_SCAN_HPP

namespace std { namespace __stdpar { namespace __openacc {

//========== exclusive_scan ==========

// Chunked exclusive scan.  Writes to [__d_first, __d_first + N), where
// N = distance(__first, __last), the combination under __f of __init and the
// input elements that precede each position, and returns the iterator one
// past the last element written.
//
// __detail::__iterations_for_reduce_or_scan picks the number of chunks.  A
// result of zero selects the plain sequential loop.  Otherwise the work is
// done in three steps:
//   1. every chunk is scanned on its own (parallel loop), seeded with __init
//      for chunk 0 and with the last input element of the preceding chunk
//      for every other chunk;
//   2. the per-chunk totals are combined by a short sequential inclusive
//      scan;
//   3. every chunk after the first has the combined total of the chunks
//      before it folded into each of its elements (parallel loop).
// Steps 2 and 3 regroup the applications of __f, so the output matches the
// sequential loop only when __f is associative.  _T has to be default
// constructible and assignable because the per-chunk totals are kept in a
// new[]-allocated array.  Outside the sequential path both iterators are
// accessed through operator[].
template <class _FIt1, class _FIt2, class _T, class _BF>
_FIt2 exclusive_scan(_FIt1 __first, _FIt1 __last, _FIt2 __d_first, _T __init,
                     _BF __f) {
  using _Index = typename std::iterator_traits<_FIt1>::difference_type;
  _Index __n = std::distance(__first, __last);
  if (__n == 0) {
    return __d_first;
  }
  _Index __nchunks = __detail::__iterations_for_reduce_or_scan(__n);
  if (__nchunks == 0) {
    // Sequential path.  The input element is read before the output element
    // is written, so __first == __d_first is handled.
    for (; __first != __last; ++__first, (void)++__d_first) {
      _T __sum = __f(__init, *__first);
      *__d_first = std::move(__init);
      __init = std::move(__sum);
    }
    return __d_first;
  }
  _Index __per_chunk = __n / __nchunks;
  _Index __extra = __n % __nchunks;
  _T* __carry = new _T[__nchunks];
  // Capture each chunk's seed before any output is written: when the input
  // and output ranges coincide, the element just before a chunk would
  // otherwise be overwritten by the chunk that owns it.
  __carry[0] = std::move(__init);
  for (_Index __c = 1; __c < __nchunks; ++__c) {
    __carry[__c] =
        __first[__detail::__chunk_start(__c, __per_chunk, __extra) - 1];
  }
  // Step 1: scan every chunk independently, starting from its seed.
  #pragma acc_stdpar parallel loop present(__carry)
  for (_Index __c = 0; __c < __nchunks; ++__c) {
    _Index __lo = __detail::__chunk_start(__c, __per_chunk, __extra);
    _Index __hi = __detail::__chunk_end(__c, __per_chunk, __extra);
    _Index __tail = __hi - 1;
    _T __running = std::move(__carry[__c]);
    for (_Index __k = __lo; __k < __tail; ++__k) {
      _T __advanced = __f(__running, __first[__k]);
      __d_first[__k] = std::move(__running);
      __running = std::move(__advanced);
    }
    // The final value is both the chunk's last output and its total, so it
    // is copied to the output and then moved into the carry array.
    __d_first[__tail] = __running;
    __carry[__c] = std::move(__running);
  }
  // Step 2: inclusive scan over the chunk totals.  The last total is never
  // consumed, so it is left as is.
  for (_Index __c = 1; __c + 1 < __nchunks; ++__c) {
    __carry[__c] = __f(__carry[__c - 1], std::move(__carry[__c]));
  }
  // Step 3: fold the total of all preceding chunks into every element of
  // chunks 1 and up.  Chunk 0 is already final.
  #pragma acc_stdpar parallel loop
  for (_Index __c = 1; __c < __nchunks; ++__c) {
    _Index __lo = __detail::__chunk_start(__c, __per_chunk, __extra);
    _Index __hi = __detail::__chunk_end(__c, __per_chunk, __extra);
    _Index __len = __hi - __lo;
    #pragma acc_stdpar loop vector
    for (_Index __k = 0; __k < __len; ++__k) {
      _Index __pos = __lo + __k;
      __d_first[__pos] = __f(__carry[__c - 1], std::move(__d_first[__pos]));
    }
  }
  delete[] __carry;
  return __d_first + __n;
}

}}} // namespace std::__stdpar::__openacc

#endif
