43#if defined(SYCL_LANGUAGE_VERSION)
44#include <sycl/sycl.hpp>
57#include "cs_math_cuda.cuh"
73#if defined(SYCL_LANGUAGE_VERSION)
75#define CS_DISPATCH_REDUCER_TYPE(type) auto
79#define CS_DISPATCH_REDUCER_TYPE(type) type
104template <
class Derived>
110 template <
class F,
class... Args>
115 template <
class M,
class F,
class... Args>
122 template <
class M,
class F,
class... Args>
130 template <
class T,
class F,
class... Args>
137 template <
class T,
class R,
class F,
class... Args>
140 (
cs_lnum_t n, T& r, R& reducer, F&& f, Args&&... args) =
delete;
144 template <
class... Args>
165template <
class Derived>
166template <
class M,
class F,
class... Args>
168 (
const M* m, F&& f, Args&&... args) {
169 return static_cast<Derived*
>(
this)->parallel_for
172 static_cast<Args&&
>(args)...);
176template <
class Derived>
177template <
class M,
class F,
class... Args>
179 (
const M* m, F&& f, Args&&... args) {
180 return static_cast<Derived*
>(
this)->parallel_for
183 static_cast<Args&&
>(args)...);
187template <
class Derived>
190 ([[maybe_unused]]
const M* m,
197template <
class Derived>
200 ([[maybe_unused]]
const M* m,
220 : n_min_per_thread(
CS_THR_MIN), n_threads_(-1)
232#if defined(HAVE_OPENMP)
238 int n_t = n_threads_;
241 int n_t_l = n / n_min_per_thread;
255 [[maybe_unused]]
size_t type_size,
259#if defined(HAVE_OPENMP)
260 const int t_id = omp_get_thread_num();
261 const int n_t = omp_get_num_threads();
262 const cs_lnum_t t_n = (n + n_t - 1) / n_t;
266 e_id = (t_id+1) * t_n;
269 if (e_id > n) e_id = n;
281 this->n_min_per_thread = n;
287 return this->n_min_per_thread;
293 this->n_threads_ = n;
299 return this->n_threads_;
303 template <
class F,
class... Args>
307 #pragma omp parallel for num_threads(n_threads(n))
317 template <
class M,
class F,
class... Args>
320 const int n_i_groups = m->i_face_numbering->n_groups;
321 const int n_i_threads = m->i_face_numbering->n_threads;
323 for (
int g_id = 0; g_id < n_i_groups; g_id++) {
325 #pragma omp parallel for
327 for (
int t_id = 0; t_id < n_i_threads; t_id++) {
328 for (
cs_lnum_t f_id = i_group_index[(t_id * n_i_groups + g_id) * 2];
329 f_id < i_group_index[(t_id * n_i_groups + g_id) * 2 + 1];
340 template <
class M,
class F,
class... Args>
343 const int n_b_groups = m->b_face_numbering->n_groups;
344 const int n_b_threads = m->b_face_numbering->n_threads;
346 for (
int g_id = 0; g_id < n_b_groups; g_id++) {
348 #pragma omp parallel for
350 for (
int t_id = 0; t_id < n_b_threads; t_id++) {
351 for (
cs_lnum_t f_id = b_group_index[(t_id * n_b_groups + g_id) * 2];
352 f_id < b_group_index[(t_id * n_b_groups + g_id) * 2 + 1];
362 template <
class T,
class F,
class... Args>
372 #pragma omp parallel for reduction(+:sum) num_threads(n_threads(n))
379 #pragma omp parallel num_threads(n_threads(n))
383 thread_range(n, 4, s_id, e_id);
389 cs_lnum_t n_blocks = (_n + block_size - 1) / block_size;
390 n_sblocks = (n_blocks > 1) ? std::sqrt(n_blocks) : 1;
392 blocks_in_sblocks = (_n + n_b - 1) / n_b;
395 for (
cs_lnum_t sid = 0; sid < n_sblocks; sid++) {
398 for (
cs_lnum_t bid = 0; bid < blocks_in_sblocks; bid++) {
399 cs_lnum_t start_id = block_size * (blocks_in_sblocks*sid + bid) + s_id;
400 cs_lnum_t end_id = start_id + block_size;
404 for (
cs_lnum_t i = start_id; i < end_id; i++) {
405 f(i, sum_block, args...);
407 sum_sblock += sum_block;
424 template <
class T,
class R,
class F,
class... Args>
431 reducer.identity(result);
434 #pragma omp parallel num_threads(n_threads(n))
438 thread_range(n, 4, s_id, e_id);
444 cs_lnum_t n_blocks = (_n + block_size - 1) / block_size;
445 n_sblocks = (n_blocks > 1) ? std::sqrt(n_blocks) : 1;
447 blocks_in_sblocks = (_n + n_b - 1) / n_b;
450 for (
cs_lnum_t sid = 0; sid < n_sblocks; sid++) {
452 reducer.identity(result_sblock);
454 for (
cs_lnum_t bid = 0; bid < blocks_in_sblocks; bid++) {
455 cs_lnum_t start_id = block_size * (blocks_in_sblocks*sid + bid) + s_id;
456 cs_lnum_t end_id = start_id + block_size;
460 reducer.identity(result_block);
461 for (
cs_lnum_t i = start_id; i < end_id; i++) {
462 f(i, result_block, args...);
463 reducer.combine(result_sblock, result_block);
471 reducer.combine(result, result_sblock);
481 template <
class... Args>
507#if defined(__CUDACC__)
515template <
class F,
class... Args>
516__global__
void cs_cuda_kernel_parallel_for(
cs_lnum_t n, F f, Args... args) {
518 for (
cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x;
id < n;
519 id += blockDim.x * gridDim.x) {
531template <
class T,
class F,
class... Args>
533cs_cuda_kernel_parallel_for_reduce_sum(
cs_lnum_t n,
538 extern __shared__
int p_stmp[];
539 T *stmp =
reinterpret_cast<T *
>(p_stmp);
544 for (
cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x;
id < n;
545 id += blockDim.x * gridDim.x) {
546 f(
id, stmp[tid], args...);
549 switch (blockDim.x) {
551 cs_cuda_reduce_block_reduce_sum<1024, 1>(stmp, tid, b_res);
554 cs_cuda_reduce_block_reduce_sum<512, 1>(stmp, tid, b_res);
557 cs_cuda_reduce_block_reduce_sum<256, 1>(stmp, tid, b_res);
560 cs_cuda_reduce_block_reduce_sum<128, 1>(stmp, tid, b_res);
574template <
class T,
class R,
class F,
class... Args>
576cs_cuda_kernel_parallel_for_reduce(
cs_lnum_t n,
582 extern __shared__
int p_stmp[];
583 T *stmp =
reinterpret_cast<T *
>(p_stmp);
586 reducer.identity(stmp[tid]);
588 for (
cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x;
id < n;
589 id += blockDim.x * gridDim.x) {
600 switch (blockDim.x) {
602 cs_cuda_reduce_block_reduce<1024, R>(stmp, tid, b_res);
605 cs_cuda_reduce_block_reduce<512, R>(stmp, tid, b_res);
608 cs_cuda_reduce_block_reduce<256, R>(stmp, tid, b_res);
611 cs_cuda_reduce_block_reduce<128, R>(stmp, tid, b_res);
630 cudaStream_t stream_;
639 cs_device_context(
void)
640 : grid_size_(0), block_size_(256), stream_(cs_cuda_get_stream(0)),
641 device_(0), use_gpu_(true)
643 device_ = cs_glob_cuda_device_id;
646 cs_device_context(
long grid_size,
650 : grid_size_(grid_size), block_size_(block_size), stream_(stream),
651 device_(device), use_gpu_(true)
654 cs_device_context(
long grid_size,
657 : grid_size_(grid_size), block_size_(block_size), stream_(stream),
658 device_(0), use_gpu_(true)
660 device_ = cs_base_cuda_get_device();
663 cs_device_context(
long grid_size,
665 : grid_size_(grid_size), block_size_(block_size),
666 stream_(cs_cuda_get_stream(0)), device_(0), use_gpu_(true)
668 device_ = cs_base_cuda_get_device();
671 cs_device_context(cudaStream_t stream)
672 : grid_size_(0), block_size_(256), stream_(stream), device_(0),
675 device_ = cs_base_cuda_get_device();
685 cs_device_context(cs_device_context
const &) =
default;
687 cs_device_context(cs_device_context &&) =
default;
690 operator = (cs_device_context
const &) =
default;
693 operator = (cs_device_context &&) =
default;
703 set_cuda_grid(
long grid_size,
705 this->grid_size_ = (grid_size > 0) ? grid_size : -1;
706 this->block_size_ = block_size;
712 set_cuda_stream(cudaStream_t stream) {
713 this->stream_ = stream;
719 set_cuda_stream(
int stream_id) {
720 this->stream_ = cs_cuda_get_stream(stream_id);
727 return this->stream_;
733 set_cuda_device(
int device) {
734 this->device_ = device;
740 set_use_gpu(
bool use_gpu) {
741 this->use_gpu_ = use_gpu;
748 return (device_ >= 0 && use_gpu_);
761 alloc_mode(
bool readable_on_cpu) {
763 if (device_ >= 0 && use_gpu_) {
775 template <
class F,
class... Args>
778 if (device_ < 0 || use_gpu_ ==
false) {
782 long l_grid_size = grid_size_;
783 if (l_grid_size < 1) {
784 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
788 cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
789 (n,
static_cast<F&&
>(f),
static_cast<Args&&
>(args)...);
795 template <
class M,
class F,
class... Args>
799 if (device_ < 0 || use_gpu_ ==
false) {
803 long l_grid_size = grid_size_;
804 if (l_grid_size < 1) {
805 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
809 cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
810 (n,
static_cast<F&&
>(f),
static_cast<Args&&
>(args)...);
817 template <
class T,
class F,
class... Args>
823 if (device_ < 0 || use_gpu_ ==
false) {
829 long l_grid_size = grid_size_;
830 if (l_grid_size < 1) {
831 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
837 int stream_id = cs_cuda_get_stream_id(stream_);
841 T *r_grid_, *r_reduce_, *r_host_;
842 cs_cuda_get_2_stage_reduce_buffers
843 (stream_id, n,
sizeof(
sum), l_grid_size,
844 (
void *&)r_grid_, (
void *&)r_reduce_, (
void *&)r_host_);
846 int smem_size = block_size_ *
sizeof(T);
847 cs_cuda_kernel_parallel_for_reduce_sum
848 <<<l_grid_size, block_size_, smem_size, stream_>>>
849 (n, r_grid_,
static_cast<F&&
>(f),
static_cast<Args&&
>(args)...);
851#if defined(DEBUG) || !defined(NDEBUG)
852 cudaError_t retcode = cudaGetLastError();
853 if (retcode != cudaSuccess)
855 "[CUDA error] %d: %s\n"
856 "with grid size %ld, block size %ld, shared memory size %d.",
857 retcode, ::cudaGetErrorString(retcode),
858 l_grid_size, block_size_, smem_size);
861 switch (block_size_) {
863 cs_cuda_reduce_sum_single_block<1024, 1>
864 <<<1, block_size_, 0, stream_>>>
865 (l_grid_size, r_grid_, r_reduce_);
868 cs_cuda_reduce_sum_single_block<512, 1>
869 <<<1, block_size_, 0, stream_>>>
870 (l_grid_size, r_grid_, r_reduce_);
873 cs_cuda_reduce_sum_single_block<256, 1>
874 <<<1, block_size_, 0, stream_>>>
875 (l_grid_size, r_grid_, r_reduce_);
878 cs_cuda_reduce_sum_single_block<128, 1>
879 <<<1, block_size_, 0, stream_>>>
880 (l_grid_size, r_grid_, r_reduce_);
886 CS_CUDA_CHECK(cudaMemcpyAsync(r_host_, r_reduce_,
sizeof(
sum),
887 cudaMemcpyDeviceToHost, stream_));
889#if defined(DEBUG) || !defined(NDEBUG)
890 retcode = cudaGetLastError();
891 if (retcode != cudaSuccess)
893 "[CUDA error] %d: %s\n"
894 "with grid size %ld, block size %ld, shared memory size %d.",
895 retcode, ::cudaGetErrorString(retcode),
896 l_grid_size, block_size_, (
int)smem_size);
899 CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
900 CS_CUDA_CHECK(cudaGetLastError());
907 template <
class T,
class R,
class F,
class... Args>
914 if (device_ < 0 || use_gpu_ ==
false) {
918 reducer.identity(result);
920 long l_grid_size = grid_size_;
921 if (l_grid_size < 1) {
922 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
928 int stream_id = cs_cuda_get_stream_id(stream_);
932 T *r_grid_, *r_reduce_, *r_host_;
933 cs_cuda_get_2_stage_reduce_buffers
934 (stream_id, n,
sizeof(result), l_grid_size,
935 (
void *&)r_grid_, (
void *&)r_reduce_, (
void *&)r_host_);
937 int l_block_size = block_size_;
938 int smem_size = l_block_size *
sizeof(T);
939 while (smem_size > cs_glob_cuda_shared_mem_per_block) {
942 if (l_block_size < 2)
944 "Type of size %d exceeds capacity of "
945 "CUDA shared memory (%d).",
946 (
int)
sizeof(T), cs_glob_cuda_shared_mem_per_block);
948 smem_size = l_block_size *
sizeof(T);
951#if defined(DEBUG) || !defined(NDEBUG)
952 cudaError_t retcode = cudaSuccess;
955 cs_cuda_kernel_parallel_for_reduce<T, R>
956 <<<l_grid_size, l_block_size, smem_size, stream_>>>
957 (n, r_grid_, reducer,
static_cast<F&&
>(f),
958 static_cast<Args&&
>(args)...);
960#if defined(DEBUG) || !defined(NDEBUG)
961 retcode = cudaGetLastError();
962 if (retcode != cudaSuccess)
964 "[CUDA error] %d: %s\n"
965 "with grid size %ld, block size %d, shared memory size %d.",
966 retcode, ::cudaGetErrorString(retcode),
967 l_grid_size, l_block_size, smem_size);
970 switch (l_block_size) {
972 cs_cuda_reduce_single_block<1024, R>
973 <<<1, l_block_size, smem_size, stream_>>>
974 (l_grid_size, r_grid_, r_reduce_);
977 cs_cuda_reduce_single_block<512, R>
978 <<<1, l_block_size, smem_size, stream_>>>
979 (l_grid_size, r_grid_, r_reduce_);
982 cs_cuda_reduce_single_block<256, R>
983 <<<1, l_block_size, smem_size, stream_>>>
984 (l_grid_size, r_grid_, r_reduce_);
987 cs_cuda_reduce_single_block<128, R>
988 <<<1, l_block_size, smem_size, stream_>>>
989 (l_grid_size, r_grid_, r_reduce_);
995#if defined(DEBUG) || !defined(NDEBUG)
996 retcode = cudaGetLastError();
997 if (retcode != cudaSuccess)
999 "[CUDA error] %d: %s\n"
1000 "with grid size %ld, block size %d, shared memory size %d.",
1001 retcode, ::cudaGetErrorString(retcode),
1002 l_grid_size, l_block_size, (
int)smem_size);
1005 CS_CUDA_CHECK(cudaMemcpyAsync(r_host_, r_reduce_,
sizeof(result),
1006 cudaMemcpyDeviceToHost, stream_));
1008 CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
1009 CS_CUDA_CHECK(cudaGetLastError());
1010 result = r_host_[0];
1016 template <
class... Args>
1019 if (device_ > -1 && use_gpu_) {
1020 CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
1021 CS_CUDA_CHECK(cudaGetLastError());
1032 if (device_ < 0 || use_gpu_ ==
false) {
1045 if (device_ < 0 || use_gpu_ ==
false) {
1055#elif defined(SYCL_LANGUAGE_VERSION)
1058#if !defined(CS_GLOB_SYCL_QUEUE_IS_DEFINED)
1059extern sycl::queue cs_glob_sycl_queue;
1060#define CS_GLOB_SYCL_QUEUE_IS_DEFINED 1
1071 sycl::queue &queue_;
1080 cs_device_context(
void)
1081 : queue_(cs_glob_sycl_queue), is_gpu(false), use_gpu_(true)
1083 is_gpu = queue_.get_device().is_gpu();
1089 set_use_gpu(
bool use_gpu) {
1090 this->use_gpu_ = use_gpu;
1097 return (is_gpu && use_gpu_);
1110 alloc_mode([[maybe_unused]]
bool readable_on_cpu) {
1119 template <
class F,
class... Args>
1122 if (is_gpu ==
false || use_gpu_ ==
false) {
1126 queue_.parallel_for(n,
static_cast<F&&
>(f),
static_cast<Args&&
>(args)...);
1132 template <
class M,
class F,
class... Args>
1136 if (is_gpu ==
false || use_gpu_ ==
false) {
1140 queue_.parallel_for(n,
static_cast<F&&
>(f),
static_cast<Args&&
>(args)...);
1146 template <
class T,
class F,
class... Args>
1152 if (is_gpu ==
false || use_gpu_ ==
false) {
1160 T *sum_ptr = (T *)sycl::malloc_shared(
sizeof(T), queue_);
1162 queue_.parallel_for(n,
1163 sycl::reduction(sum_ptr, (T)0, sycl::plus<T>()),
1164 static_cast<F&&
>(f),
1165 static_cast<Args&&
>(args)...).wait();
1169 sycl::free((
void *)sum_ptr, queue_);
1175 template <
class T,
class R,
class F,
class... Args>
1188 template <
class... Args>
1191 if (is_gpu && use_gpu_) {
1203 if (is_gpu ==
false || use_gpu_ ==
false) {
1216 if (is_gpu ==
false || use_gpu_ ==
false) {
1226#elif defined(HAVE_OPENMP_TARGET)
1244 cs_device_context(
void)
1245 : is_gpu(false), use_gpu_(true)
1249 is_gpu = (omp_get_num_devices() > 1) ?
true :
false;
1255 set_use_gpu(
bool use_gpu) {
1256 this->use_gpu_ = use_gpu;
1263 return (is_gpu && use_gpu_);
1276 alloc_mode([[maybe_unused]]
bool readable_on_cpu) {
1285 template <
class F,
class... Args>
1288 if (is_gpu ==
false || use_gpu_ ==
false) {
1293# pragma omp target teams distribute parallel for
1302 template <
class T,
class F,
class... Args>
1308 if (is_gpu ==
false || use_gpu_ ==
false) {
1314# pragma omp target teams distribute parallel for reduction(+:sum)
1323 template <
class T,
class R,
class F,
class... Args>
1336 template <
class... Args>
1347 if (is_gpu ==
false || use_gpu_ ==
false) {
1360 if (is_gpu ==
false || use_gpu_ ==
false) {
1385#if !defined(__CUDACC__)
1396 [[maybe_unused]]
long block_size) {
1409#if !defined(__CUDACC__) \
1410 && !defined(SYCL_LANGUAGE_VERSION) \
1411 && !defined(HAVE_OPENMP_TARGET)
1443 template <
class F,
class... Args>
1445 [[maybe_unused]] F&& f,
1446 [[maybe_unused]] Args&&... args) {
1452 template <
class T,
class F,
class... Args>
1454 [[maybe_unused]] T&
sum,
1455 [[maybe_unused]] F&& f,
1456 [[maybe_unused]] Args&&... args) {
1462 template <
class T,
class R,
class F,
class... Args>
1464 [[maybe_unused]] T& result,
1465 [[maybe_unused]] R& reducer,
1466 [[maybe_unused]] F&& f,
1467 [[maybe_unused]] Args&&... args) {
1473 template <
class... Args>
1487template <
class... Contexts>
1490 public Contexts... {
1498 : Contexts(std::move(contexts))...
1522 template <
class M,
class F,
class... Args>
1524 bool launched =
false;
1525 [[maybe_unused]]
decltype(
nullptr) try_execute[] = {
1526 ( launched = launched
1527 || Contexts::parallel_for_i_faces(m, f, args...),
nullptr)...
1550 template <
class M,
class F,
class... Args>
1552 bool launched =
false;
1553 [[maybe_unused]]
decltype(
nullptr) try_execute[] = {
1554 ( launched = launched
1555 || Contexts::parallel_for_b_faces(m, f, args...),
nullptr)...
1569 template <
class F,
class... Args>
1571 bool launched =
false;
1572 [[maybe_unused]]
decltype(
nullptr) try_execute[] = {
1573 ( launched = launched
1574 || Contexts::parallel_for(n, f, args...),
nullptr)...
1591 template <
class T,
class F,
class... Args>
1594 bool launched =
false;
1595 [[maybe_unused]]
decltype(
nullptr) try_execute[] = {
1596 ( launched = launched
1597 || Contexts::parallel_for_reduce_sum(n,
sum, f, args...),
1617 template <
class T,
class R,
class F,
class... Args>
1619 (
cs_lnum_t n, T& result, R& reducer, F&& f, Args&&... args) {
1620 bool launched =
false;
1621 [[maybe_unused]]
decltype(
nullptr) try_execute[] = {
1622 ( launched = launched
1623 || Contexts::parallel_for_reduce(n, result, reducer, f, args...),
1637 [[maybe_unused]]
decltype(
nullptr) try_execute[] = {
1639 || Contexts::wait(),
nullptr)...
1660 [[maybe_unused]]
decltype(
nullptr) try_query[] = {
1662 || Contexts::try_get_parallel_for_i_faces_sum_type(m, sum_type),
1685 [[maybe_unused]]
decltype(
nullptr) try_query[] = {
1687 || Contexts::try_get_parallel_for_b_faces_sum_type(m, sum_type),
1703#if defined(__CUDACC__) \
1704 || defined(SYCL_LANGUAGE_VERSION) \
1705 || defined(HAVE_OPENMP_TARGET)
1715#if defined(__CUDACC__) \
1716 || defined(SYCL_LANGUAGE_VERSION) \
1717 || defined(HAVE_OPENMP_TARGET)
1725 using base_t::base_t;
1726 using base_t::operator=;
1773template <
typename T>
1774__device__
static void __forceinline__
1781 using sum_v = assembled_value<T>;
1785 sum_v::ref(*dest).conflict_free_add(-1u, v);
1787 atomicAdd(dest, src);
1795#elif defined(SYCL_LANGUAGE_VERSION)
1797template <
typename T>
1808 sycl::memory_order::relaxed,
1809 sycl::memory_scope::device> aref(*dest);
1810 aref.fetch_add(src);
1816template <
typename T>
1853template <
size_t dim,
typename T>
1854__device__
static void __forceinline__
1865#if __CUDA_ARCH__ >= 700
1866 using sum_v = assembled_value<T, dim>;
1869 for (
size_t i = 0; i < dim; i++) {
1870 v[i].get() = src[i];
1873 sum_v &vs =
reinterpret_cast<sum_v &
>(*dest);
1874 vs.conflict_free_add(-1u, v);
1878 for (
size_t i = 0; i < dim; i++) {
1879 atomicAdd(&dest[i], src[i]);
1885#elif defined(SYCL_LANGUAGE_VERSION)
1887template <
size_t dim,
typename T>
1894 for (
size_t i = 0; i < dim; i++) {
1899 for (
size_t i = 0; i < dim; i++) {
1901 sycl::memory_order::relaxed,
1902 sycl::memory_scope::device> aref(dest[i]);
1903 aref.fetch_add(src[i]);
1910template <
size_t dim,
typename T>
1917 for (
size_t i = 0; i < dim; i++) {
1922 for (
size_t i = 0; i < dim; i++) {
void bft_error(const char *const file_name, const int line_num, const int sys_error_code, const char *const format,...)
Calls the error handler (set by bft_error_handler_set() or default).
Definition: bft_error.cpp:193
cs_combined_context
Definition: cs_dispatch.h:1490
cs_combined_context()=default
auto parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&... args)
Definition: cs_dispatch.h:1593
cs_combined_context(Contexts... contexts)
Definition: cs_dispatch.h:1497
auto parallel_for_reduce(cs_lnum_t n, T &result, R &reducer, F &&f, Args &&... args)
Definition: cs_dispatch.h:1619
auto parallel_for_b_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:1551
auto parallel_for_i_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:1523
auto parallel_for(cs_lnum_t n, F &&f, Args &&... args)
Definition: cs_dispatch.h:1570
void wait(void)
Wait (synchronize) until launched computations have finished.
Definition: cs_dispatch.h:1635
cs_dispatch_sum_type_t get_parallel_for_b_faces_sum_type(const M *m)
Return sum type to be used with parallel_for_b_faces.
Definition: cs_dispatch.h:1682
cs_dispatch_sum_type_t get_parallel_for_i_faces_sum_type(const M *m)
Return sum type to be used with parallel_for_i_faces.
Definition: cs_dispatch.h:1657
Definition: cs_dispatch.h:105
decltype(auto) parallel_for(cs_lnum_t n, F &&f, Args &&... args)=delete
bool try_get_parallel_for_b_faces_sum_type(const M *m, cs_dispatch_sum_type_t &st)
Definition: cs_dispatch.h:200
decltype(auto) parallel_for_b_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:179
decltype(auto) parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&... args)=delete
bool try_get_parallel_for_i_faces_sum_type(const M *m, cs_dispatch_sum_type_t &st)
Definition: cs_dispatch.h:190
decltype(auto) wait(void)=delete
decltype(auto) parallel_for_i_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:168
decltype(auto) parallel_for_reduce(cs_lnum_t n, T &r, R &reducer, F &&f, Args &&... args)=delete
Definition: cs_dispatch.h:1711
cs_host_context
Definition: cs_dispatch.h:210
cs_host_context()
Definition: cs_dispatch.h:219
cs_lnum_t n_min_per_cpu_thread(void)
Get minimum number of elements threshold for CPU multithread execution.
Definition: cs_dispatch.h:286
bool parallel_for(cs_lnum_t n, F &&f, Args &&... args)
Iterate using a plain omp parallel for.
Definition: cs_dispatch.h:305
bool parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&... args)
Plain OpenMP parallel reduction with simple sum.
Definition: cs_dispatch.h:364
void set_n_cpu_threads(int n)
Set number of threads for CPU multithread execution.
Definition: cs_dispatch.h:292
bool wait(void)
Wait upon completion.
Definition: cs_dispatch.h:483
bool try_get_parallel_for_b_faces_sum_type(const M *m, cs_dispatch_sum_type_t &st)
Definition: cs_dispatch.h:499
bool try_get_parallel_for_i_faces_sum_type(const M *m, cs_dispatch_sum_type_t &st)
Definition: cs_dispatch.h:490
bool parallel_for_b_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:342
int n_cpu_threads(void)
Get number of threads for CPU multithread execution (-1 if automatic).
Definition: cs_dispatch.h:298
bool parallel_for_i_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:319
bool parallel_for_reduce(cs_lnum_t n, T &result, R &reducer, F &&f, Args &&... args)
OpenMP parallel reduction with general reducer.
Definition: cs_dispatch.h:426
void set_n_min_per_cpu_thread(cs_lnum_t n)
Set minimum number of elements threshold for CPU multithread execution.
Definition: cs_dispatch.h:280
cs_void_context
Definition: cs_dispatch.h:1376
bool parallel_for(cs_lnum_t n, F &&f, Args &&... args)
Definition: cs_dispatch.h:1444
void set_cuda_device(int device_id)
Definition: cs_dispatch.h:1404
cs_alloc_mode_t alloc_mode(bool readable_on_cpu)
Definition: cs_dispatch.h:1434
bool parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&... args)
Definition: cs_dispatch.h:1453
bool wait(void)
Definition: cs_dispatch.h:1475
void set_use_gpu(bool use_gpu)
Definition: cs_dispatch.h:1416
cs_void_context(void)
Constructor.
Definition: cs_dispatch.h:1382
cs_alloc_mode_t alloc_mode(void)
Check preferred allocation mode depending on execution policy.
Definition: cs_dispatch.h:1429
bool parallel_for_reduce(cs_lnum_t n, T &result, R &reducer, F &&f, Args &&... args)
Definition: cs_dispatch.h:1463
void set_cuda_stream(int stream_id)
Definition: cs_dispatch.h:1400
void set_cuda_grid(long grid_size, long block_size)
Definition: cs_dispatch.h:1395
bool use_gpu(void)
Check whether we are trying to run on GPU.
Definition: cs_dispatch.h:1422
#define cs_assert(expr)
Abort the program if the given assertion is false.
Definition: cs_assert.h:67
int cs_glob_n_threads
Definition: cs_defs.cpp:172
#define restrict
Definition: cs_defs.h:158
#define CS_THR_MIN
Definition: cs_defs.h:508
static cs_lnum_t cs_align(cs_lnum_t i, cs_lnum_t m)
Given a base index i, return the next index aligned with a size m.
Definition: cs_defs.h:669
int cs_lnum_t
Local mesh entity id.
Definition: cs_defs.h:350
#define CS_CL_SIZE
Definition: cs_defs.h:513
void cs_dispatch_sum(T *dest, const T src, cs_dispatch_sum_type_t sum_type)
Sum values using a chosen dispatch sum type.
Definition: cs_dispatch.h:1818
cs_dispatch_sum_type_t
Definition: cs_dispatch.h:90
@ CS_DISPATCH_SUM_SIMPLE
Definition: cs_dispatch.h:92
@ CS_DISPATCH_SUM_ATOMIC
Definition: cs_dispatch.h:94
#define cs_alloc_mode_device
Definition: cs_mem.h:189
cs_alloc_mode_t
Definition: cs_mem.h:50
@ CS_ALLOC_HOST
Definition: cs_mem.h:52
@ CS_ALLOC_HOST_DEVICE_SHARED
Definition: cs_mem.h:57
static void sum(const cs_execution_context *ec, T &first, Vals &... values)
Sum values of a given datatype over a given communicator.
Definition: cs_parall.h:881