code_saturne 9.1 — generated documentation: source listing for cs_dispatch.h
(see the file's documentation page for the rendered description).
1#ifndef CS_DISPATCH_H
2#define CS_DISPATCH_H
3
4/*============================================================================
5 * Class to dispatch computation using various runtimes (OpenMP, CUDA, ...)
6 *============================================================================*/
7
8/*
9 This file is part of code_saturne, a general-purpose CFD tool.
10
11 Copyright (C) 1998-2025 EDF S.A.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; either version 2 of the License, or (at your option) any later
16 version.
17
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21 details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25 Street, Fifth Floor, Boston, MA 02110-1301, USA.
26*/
27
28// Valid only for C++
29
30#ifdef __cplusplus
31
32/*----------------------------------------------------------------------------*/
33
34#include "base/cs_defs.h"
35
36/*----------------------------------------------------------------------------
37 * Standard C++ library headers
38 *----------------------------------------------------------------------------*/
39
40#include <utility>
41#include <cmath>
42
43#if defined(SYCL_LANGUAGE_VERSION)
44#include <sycl/sycl.hpp>
45#endif
46
47/*----------------------------------------------------------------------------
48 * Local headers
49 *----------------------------------------------------------------------------*/
50
51#include "base/cs_assert.h"
52#include "base/cs_mem.h"
53
54#ifdef __CUDACC__
55#include "base/cs_base_cuda.h"
56#include "base/cs_cuda_reduce.h"
57#include "cs_math_cuda.cuh"
58#endif
59
60/*=============================================================================
61 * Additional doxygen documentation
62 *============================================================================*/
63
69/*=============================================================================
70 * Macro definitions
71 *============================================================================*/
72
73#if defined(SYCL_LANGUAGE_VERSION)
74
75#define CS_DISPATCH_REDUCER_TYPE(type) auto
76
77#else
78
79#define CS_DISPATCH_REDUCER_TYPE(type) type
80
81#endif
82
83/*============================================================================
84 * Type definitions
85 *============================================================================*/
86
90typedef enum {
91
97
104template <class Derived>
106public:
107
108 // Loop over n elements
109 // Must be redefined by the child class
110 template <class F, class... Args>
111 decltype(auto)
112 parallel_for(cs_lnum_t n, F&& f, Args&&... args) = delete;
113
114 // Assembly loop over all internal faces
115 template <class M, class F, class... Args>
116 decltype(auto)
118 F&& f,
119 Args&&... args);
120
121 // Assembly loop over all boundary faces
122 template <class M, class F, class... Args>
123 decltype(auto)
125 F&& f,
126 Args&&... args);
127
128 // Parallel reduction with simple sum.
129 // Must be redefined by the child class
130 template <class T, class F, class... Args>
131 decltype(auto)
133 (cs_lnum_t n, T& sum, F&& f, Args&&... args) = delete;
134
135 // Parallel reduction with reducer template.
136 // Must be redefined by the child class
137 template <class T, class R, class F, class... Args>
138 decltype(auto)
140 (cs_lnum_t n, T& r, R& reducer, F&& f, Args&&... args) = delete;
141
142 // Wait upon completion
143 // Must be redefined by the child class
144 template <class... Args>
145 decltype(auto)
146 wait(void) = delete;
147
148 // Query sum type for assembly loop over all interior faces
149 // Must be redefined by the child class
150 template <class M>
151 bool
154
155 // Query sum type for assembly loop over all boundary faces
156 // Must be redefined by the child class
157 template <class M>
158 bool
161
162};
163
164// Default implementation of parallel_for_i_faces based on parallel_for
165template <class Derived>
166template <class M, class F, class... Args>
168 (const M* m, F&& f, Args&&... args) {
169 return static_cast<Derived*>(this)->parallel_for
170 (m->n_i_faces,
171 static_cast<F&&>(f),
172 static_cast<Args&&>(args)...);
173}
174
175// Default implementation of parallel_for_b_faces based on parallel_for_sum
176template <class Derived>
177template <class M, class F, class... Args>
179 (const M* m, F&& f, Args&&... args) {
180 return static_cast<Derived*>(this)->parallel_for
181 (m->n_b_faces,
182 static_cast<F&&>(f),
183 static_cast<Args&&>(args)...);
184}
185
186// Default implementation of get interior faces sum type
187template <class Derived>
188template <class M>
190 ([[maybe_unused]]const M* m,
193 return true;
194}
195
196// Default implementation of get boundary faces sum type
197template <class Derived>
198template <class M>
200 ([[maybe_unused]]const M* m,
203 return true;
204}
205
206/*
207 * cs_context to execute loops with OpenMP on the CPU
208 */
209
210class cs_host_context : public cs_dispatch_context_mixin<cs_host_context> {
211
212private:
213
214 cs_lnum_t n_min_per_thread;
215 int n_threads_;
217public:
218
220 : n_min_per_thread(CS_THR_MIN), n_threads_(-1)
221 {}
222
224 //
225 // \param[in] n size of array
226 // \param[in] type_size element type size (or multiple)
227 // \param[in, out] s_id start index for the current thread
228 // \param[in, out] e_id past-the-end index for the current thread
229
230private:
231
232#if defined(HAVE_OPENMP)
233
234 // Determine number of threads that should actually be used.
235 int
236 n_threads(cs_lnum_t n)
237 {
238 int n_t = n_threads_;
239 if (n_t < 0) {
240 n_t = cs_glob_n_threads;
241 int n_t_l = n / n_min_per_thread;
242 if (n_t_l < n_t)
243 n_t = n_t_l;
244 if (n_t < 1)
245 n_t = 1;
246 }
247 return n_t;
248 }
249
250#endif
251
  // Determine element range for current thread.
  //
  // The [0, n) range is split evenly across the OpenMP threads of the
  // current parallel region, then each thread's bounds are rounded with
  // cs_align to a multiple of CS_CL_SIZE / type_size elements so that
  // neighboring threads' ranges start on cache-line-sized boundaries
  // (presumably a false-sharing guard; assumes type_size divides
  // CS_CL_SIZE — TODO confirm for odd element sizes).
  //
  // \param[in]   n          total number of elements
  // \param[in]   type_size  element type size (unused without OpenMP)
  // \param[out]  s_id       start index for the calling thread
  // \param[out]  e_id       past-the-end index for the calling thread
  void
  thread_range(cs_lnum_t n,
               [[maybe_unused]] size_t type_size,
               cs_lnum_t &s_id,
               cs_lnum_t &e_id)
  {
#if defined(HAVE_OPENMP)
    const int t_id = omp_get_thread_num();
    const int n_t = omp_get_num_threads();
    // Per-thread chunk size: ceil(n / n_t).
    const cs_lnum_t t_n = (n + n_t - 1) / n_t;
    const cs_lnum_t cl_m = CS_CL_SIZE / type_size; /* Cache line multiple */

    s_id = t_id * t_n;
    e_id = (t_id+1) * t_n;
    s_id = cs_align(s_id, cl_m);
    e_id = cs_align(e_id, cl_m);
    // Clamp the last (possibly overshooting) chunk to n.
    if (e_id > n) e_id = n;
#else
    // Serial build: the single "thread" owns the whole range.
    s_id = 0;
    e_id = n;
#endif
  }
275
276public:
277
279 void
281 this->n_min_per_thread = n;
282 }
283
287 return this->n_min_per_thread;
288 }
289
291 void
293 this->n_threads_ = n;
294 }
295
297 int
299 return this->n_threads_;
300 }
301
303 template <class F, class... Args>
304 bool
305 parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
306 #ifdef _OPENMP
307 #pragma omp parallel for num_threads(n_threads(n))
308 #endif
309 for (cs_lnum_t i = 0; i < n; ++i) {
310 f(i, args...);
311 }
312 return true;
313 }
314
  // Loop over the interior faces of mesh m using the group/thread
  // decomposition stored in m->i_face_numbering (presumably arranged so
  // that faces processed concurrently do not update the same cells —
  // confirm against the mesh numbering documentation).
  //
  // Groups are processed sequentially; within a group each OpenMP thread
  // walks its own [start, end) face slice. group_index stores the pairs
  // laid out as group_index[(t_id * n_groups + g_id) * 2 + {0, 1}].
  template <class M, class F, class... Args>
  bool
  parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    const int n_i_groups = m->i_face_numbering->n_groups;
    const int n_i_threads = m->i_face_numbering->n_threads;
    const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index;
    for (int g_id = 0; g_id < n_i_groups; g_id++) {
      #ifdef _OPENMP
      #pragma omp parallel for
      #endif
      for (int t_id = 0; t_id < n_i_threads; t_id++) {
        // Face range owned by (thread t_id, group g_id).
        for (cs_lnum_t f_id = i_group_index[(t_id * n_i_groups + g_id) * 2];
             f_id < i_group_index[(t_id * n_i_groups + g_id) * 2 + 1];
             f_id++) {
          f(f_id, args...);
        }
      }
    }
    return true;
  }
337
340 template <class M, class F, class... Args>
341 bool
342 parallel_for_b_faces(const M* m, F&& f, Args&&... args) {
343 const int n_b_groups = m->b_face_numbering->n_groups;
344 const int n_b_threads = m->b_face_numbering->n_threads;
345 const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index;
346 for (int g_id = 0; g_id < n_b_groups; g_id++) {
347 #ifdef _OPENMP
348 #pragma omp parallel for
349 #endif
350 for (int t_id = 0; t_id < n_b_threads; t_id++) {
351 for (cs_lnum_t f_id = b_group_index[(t_id * n_b_groups + g_id) * 2];
352 f_id < b_group_index[(t_id * n_b_groups + g_id) * 2 + 1];
353 f_id++) {
354 f(f_id, args...);
355 }
356 }
357 }
358 return true;
359 }
360
362 template <class T, class F, class... Args>
363 bool
365 T& sum,
366 F&& f,
367 Args&&... args) {
368 sum = 0;
369
370#if 0
371 #ifdef _OPENMP
372 #pragma omp parallel for reduction(+:sum) num_threads(n_threads(n))
373 #endif
374 for (cs_lnum_t i = 0; i < n; ++i) {
375 f(i, sum, args...);
376 }
377#else
378 #ifdef _OPENMP
379 #pragma omp parallel num_threads(n_threads(n))
380 #endif
381 {
382 cs_lnum_t s_id, e_id;
383 thread_range(n, 4, s_id, e_id);
384
385 const cs_lnum_t _n = e_id - s_id;
386 cs_lnum_t n_sblocks, blocks_in_sblocks;
387 const cs_lnum_t block_size = 60;
388 { // superblock counts
389 cs_lnum_t n_blocks = (_n + block_size - 1) / block_size;
390 n_sblocks = (n_blocks > 1) ? std::sqrt(n_blocks) : 1;
391 cs_lnum_t n_b = block_size * n_sblocks;
392 blocks_in_sblocks = (_n + n_b - 1) / n_b;
393 }
394
395 for (cs_lnum_t sid = 0; sid < n_sblocks; sid++) {
396 T sum_sblock = 0;
397
398 for (cs_lnum_t bid = 0; bid < blocks_in_sblocks; bid++) {
399 cs_lnum_t start_id = block_size * (blocks_in_sblocks*sid + bid) + s_id;
400 cs_lnum_t end_id = start_id + block_size;
401 if (end_id > e_id)
402 end_id = e_id;
403 T sum_block = 0;
404 for (cs_lnum_t i = start_id; i < end_id; i++) {
405 f(i, sum_block, args...);
406 }
407 sum_sblock += sum_block;
408 }
409
410 #ifdef _OPENMP
411 #pragma omp atomic
412 #endif
413 sum += sum_sblock;
414 }
415 }
416
417#endif
418 return true;
419 }
420
422 // In case the reduction involves floating-point sums,
423 // we use a Superblock / block loop to reduce numerical error.
424 template <class T, class R, class F, class... Args>
425 bool
427 T& result,
428 R& reducer,
429 F&& f,
430 Args&&... args) {
431 reducer.identity(result);
432
433 #ifdef _OPENMP
434 #pragma omp parallel num_threads(n_threads(n))
435 #endif
436 {
437 cs_lnum_t s_id, e_id;
438 thread_range(n, 4, s_id, e_id);
439
440 const cs_lnum_t _n = e_id - s_id;
441 cs_lnum_t n_sblocks, blocks_in_sblocks;
442 const cs_lnum_t block_size = 60;
443 { // superblock counts
444 cs_lnum_t n_blocks = (_n + block_size - 1) / block_size;
445 n_sblocks = (n_blocks > 1) ? std::sqrt(n_blocks) : 1;
446 cs_lnum_t n_b = block_size * n_sblocks;
447 blocks_in_sblocks = (_n + n_b - 1) / n_b;
448 }
449
450 for (cs_lnum_t sid = 0; sid < n_sblocks; sid++) {
451 T result_sblock;
452 reducer.identity(result_sblock);
453
454 for (cs_lnum_t bid = 0; bid < blocks_in_sblocks; bid++) {
455 cs_lnum_t start_id = block_size * (blocks_in_sblocks*sid + bid) + s_id;
456 cs_lnum_t end_id = start_id + block_size;
457 if (end_id > e_id)
458 end_id = e_id;
459 T result_block;
460 reducer.identity(result_block);
461 for (cs_lnum_t i = start_id; i < end_id; i++) {
462 f(i, result_block, args...);
463 reducer.combine(result_sblock, result_block);
464 }
465 }
466
467 #ifdef _OPENMP
468 #pragma omp critical
469 #endif
470 {
471 reducer.combine(result, result_sblock);
472 }
473 }
474 }
475
476 return true;
477 }
478
480 // No-op here as Open-MP based methods used here have implicit barriers.
481 template <class... Args>
482 bool
483 wait(void) {
484 return true;
485 }
486
487 // Get interior faces sum type associated with this context
488 template <class M>
489 bool
490 try_get_parallel_for_i_faces_sum_type([[maybe_unused]]const M* m,
493 return true;
494 }
495
496 // Get boundary faces sum type associated with this context
497 template <class M>
498 bool
499 try_get_parallel_for_b_faces_sum_type([[maybe_unused]]const M* m,
502 return true;
503 }
504
505};
506
507#if defined(__CUDACC__)
508
509/* Default kernel that loops over an integer range and calls a device functor.
510 This kernel uses a grid_size-stride loop and thus guarantees that all
511 integers are processed, even if the grid is smaller.
512 All arguments *must* be passed by value to avoid passing CPU references
513 to the GPU. */
514
515template <class F, class... Args>
516__global__ void cs_cuda_kernel_parallel_for(cs_lnum_t n, F f, Args... args) {
517 // grid_size-stride loop
518 for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
519 id += blockDim.x * gridDim.x) {
520 f(id, args...);
521 }
522}
523
/* Default kernel that loops over an integer range and calls a device functor,
   also reducing a sum over all elements.
   This kernel uses a grid_size-stride loop and thus guarantees that all
   integers are processed, even if the grid is smaller.
   All arguments *must* be passed by value to avoid passing CPU references
   to the GPU.

   Launch requirements (mirroring the call sites in this file):
   - dynamic shared memory of blockDim.x * sizeof(T) bytes;
   - blockDim.x in {128, 256, 512, 1024}, otherwise the switch asserts.

   The functor f must ACCUMULATE into its second argument (the thread's
   shared-memory slot), i.e. behave like "sum += contribution(id)". */

template <class T, class F, class... Args>
__global__ void
cs_cuda_kernel_parallel_for_reduce_sum(cs_lnum_t n,
                                       T *b_res,
                                       F f,
                                       Args... args) {
  // grid_size-stride loop
  // Shared buffer is declared as int[] and reinterpreted as T;
  // NOTE(review): assumes T's alignment is satisfied by the shared-memory
  // base address — confirm for over-aligned T.
  extern __shared__ int p_stmp[];
  T *stmp = reinterpret_cast<T *>(p_stmp);
  const cs_lnum_t tid = threadIdx.x;

  // Zero-initialize so threads with no iteration contribute nothing.
  stmp[tid] = 0;

  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    f(id, stmp[tid], args...);
  }

  // Block-level tree reduction of per-thread partial sums into b_res.
  switch (blockDim.x) {
  case 1024:
    cs_cuda_reduce_block_reduce_sum<1024, 1>(stmp, tid, b_res);
    break;
  case 512:
    cs_cuda_reduce_block_reduce_sum<512, 1>(stmp, tid, b_res);
    break;
  case 256:
    cs_cuda_reduce_block_reduce_sum<256, 1>(stmp, tid, b_res);
    break;
  case 128:
    cs_cuda_reduce_block_reduce_sum<128, 1>(stmp, tid, b_res);
    break;
  default:
    assert(0);
  }
}
566
567/* Default kernel that loops over an integer range and calls a device functor.
568 also computing a reduction over all elements.
569 This kernel uses a grid_size-stride loop and thus guarantees that all
570 integers are processed, even if the grid is smaller.
571 All arguments *must* be passed by value to avoid passing CPU references
572 to the GPU. */
573
574template <class T, class R, class F, class... Args>
575__global__ void
576cs_cuda_kernel_parallel_for_reduce(cs_lnum_t n,
577 T *b_res,
578 R &reducer,
579 F f,
580 Args... args) {
581 // grid_size-stride loop
582 extern __shared__ int p_stmp[];
583 T *stmp = reinterpret_cast<T *>(p_stmp);
584 const cs_lnum_t tid = threadIdx.x;
585
586 reducer.identity(stmp[tid]);
587
588 for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
589 id += blockDim.x * gridDim.x) {
590 T rd;
591 /* It would be safer to call reducer.identyity() here in case all
592 values of rd are not set for each thread, but this might incurr
593 a small performance penalty, and is redundant in most cases,
594 so we consider all values of rd must be set by the caller. */
595 // reducer.identity(rd);
596 f(id, rd, args...);
597 stmp[tid] = rd;
598 }
599
600 switch (blockDim.x) {
601 case 1024:
602 cs_cuda_reduce_block_reduce<1024, R>(stmp, tid, b_res);
603 break;
604 case 512:
605 cs_cuda_reduce_block_reduce<512, R>(stmp, tid, b_res);
606 break;
607 case 256:
608 cs_cuda_reduce_block_reduce<256, R>(stmp, tid, b_res);
609 break;
610 case 128:
611 cs_cuda_reduce_block_reduce<128, R>(stmp, tid, b_res);
612 break;
613 default:
614 assert(0);
615 }
616}
617
622class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {
623
624private:
625
626 long grid_size_;
629 long block_size_;
630 cudaStream_t stream_;
631 int device_;
633 bool use_gpu_;
635public:
636
638
  //! Default constructor: automatic grid size (0 => computed per launch),
  //! 256-thread blocks, the module's default CUDA stream, and the device
  //! taken from cs_glob_cuda_device_id.
  //! NOTE(review): this constructor reads cs_glob_cuda_device_id while
  //! the partial constructors below call cs_base_cuda_get_device() —
  //! confirm the two are always consistent.
  cs_device_context(void)
    : grid_size_(0), block_size_(256), stream_(cs_cuda_get_stream(0)),
      device_(0), use_gpu_(true)
  {
    device_ = cs_glob_cuda_device_id;
  }

  //! Fully-specified constructor: caller provides grid size, block size,
  //! stream and device id.
  cs_device_context(long grid_size,
                    long block_size,
                    cudaStream_t stream,
                    int device)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(device), use_gpu_(true)
  {}

  //! Grid, block and stream specified; the device id is queried from the
  //! CUDA runtime.
  cs_device_context(long grid_size,
                    long block_size,
                    cudaStream_t stream)
    : grid_size_(grid_size), block_size_(block_size), stream_(stream),
      device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  //! Grid and block specified; default stream, queried device id.
  cs_device_context(long grid_size,
                    long block_size)
    : grid_size_(grid_size), block_size_(block_size),
      stream_(cs_cuda_get_stream(0)), device_(0), use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }

  //! Stream specified; automatic grid size, 256-thread blocks, queried
  //! device id.
  cs_device_context(cudaStream_t stream)
    : grid_size_(0), block_size_(256), stream_(stream), device_(0),
      use_gpu_(true)
  {
    device_ = cs_base_cuda_get_device();
  }
677
678#if 0 // Operators adding in process of cs_dispatch_queue addition,
679 // then marked as useless and removed.
680 // Not totally removed for now, but "quarantined", to be removed
681 // once we are sure they are not missed.
682
684
685 cs_device_context(cs_device_context const &) = default;
686
687 cs_device_context(cs_device_context &&) = default;
688
689 cs_device_context &
690 operator = (cs_device_context const &) = default;
691
692 cs_device_context &
693 operator = (cs_device_context &&) = default;
694
695#endif
696
698 //
699 // \param[in] grid_size CUDA grid size, or -1 for automatic choice
700 // \param[in] block_size CUDA block size (power of 2 if reduction is used)
701
702 void
703 set_cuda_grid(long grid_size,
704 long block_size) {
705 this->grid_size_ = (grid_size > 0) ? grid_size : -1;
706 this->block_size_ = block_size;
707 }
708
710
711 void
712 set_cuda_stream(cudaStream_t stream) {
713 this->stream_ = stream;
714 }
715
717
718 void
719 set_cuda_stream(int stream_id) {
720 this->stream_ = cs_cuda_get_stream(stream_id);
721 }
722
724
725 cudaStream_t
726 cuda_stream(void) {
727 return this->stream_;
728 }
729
731
732 void
733 set_cuda_device(int device) {
734 this->device_ = device;
735 }
736
738
739 void
740 set_use_gpu(bool use_gpu) {
741 this->use_gpu_ = use_gpu;
742 }
743
745
746 bool
747 use_gpu(void) {
748 return (device_ >= 0 && use_gpu_);
749 }
750
752
754 alloc_mode(void) {
755 cs_alloc_mode_t amode
756 = (device_ >= 0 && use_gpu_) ? cs_alloc_mode_device : CS_ALLOC_HOST;
757 return (amode);
758 }
759
761 alloc_mode(bool readable_on_cpu) {
763 if (device_ >= 0 && use_gpu_) {
764 if (readable_on_cpu)
766 else
767 amode = cs_alloc_mode_device;
768 }
769 return (amode);
770 }
771
772public:
773
775 template <class F, class... Args>
776 bool
777 parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
778 if (device_ < 0 || use_gpu_ == false) {
779 return false;
780 }
781
782 long l_grid_size = grid_size_;
783 if (l_grid_size < 1) {
784 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
785 }
786
787 if (n > 0)
788 cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
789 (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
790
791 return true;
792 }
793
  //! Launch f over all interior faces of mesh m on the device.
  //! Returns false (host fallback) when no usable GPU is configured.
  //! Unlike the host variant, no group/thread face renumbering is used:
  //! all faces are processed by a single flat kernel, so the functor must
  //! tolerate concurrent updates of shared cell data (presumably via
  //! atomic sums — confirm against this context's i-faces sum type).
  template <class M, class F, class... Args>
  bool
  parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
    const cs_lnum_t n = m->n_i_faces;
    if (device_ < 0 || use_gpu_ == false) {
      return false;
    }

    long l_grid_size = grid_size_;
    if (l_grid_size < 1) {
      // ceil(n / block_size_): enough blocks to cover every face.
      l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
    }

    if (n > 0)
      cs_cuda_kernel_parallel_for<<<l_grid_size, block_size_, 0, stream_>>>
        (n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
814
817 template <class T, class F, class... Args>
818 bool
820 T& sum,
821 F&& f,
822 Args&&... args) {
823 if (device_ < 0 || use_gpu_ == false) {
824 return false;
825 }
826
827 sum = 0;
828
829 long l_grid_size = grid_size_;
830 if (l_grid_size < 1) {
831 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
832 }
833 if (n == 0) {
834 return true;
835 }
836
837 int stream_id = cs_cuda_get_stream_id(stream_);
838 if (stream_id < 0)
839 stream_id = 0;
840
841 T *r_grid_, *r_reduce_, *r_host_;
842 cs_cuda_get_2_stage_reduce_buffers
843 (stream_id, n, sizeof(sum), l_grid_size,
844 (void *&)r_grid_, (void *&)r_reduce_, (void *&)r_host_);
845
846 int smem_size = block_size_ * sizeof(T);
847 cs_cuda_kernel_parallel_for_reduce_sum
848 <<<l_grid_size, block_size_, smem_size, stream_>>>
849 (n, r_grid_, static_cast<F&&>(f), static_cast<Args&&>(args)...);
850
851#if defined(DEBUG) || !defined(NDEBUG)
852 cudaError_t retcode = cudaGetLastError();
853 if (retcode != cudaSuccess)
854 bft_error(__FILE__, __LINE__, 0,
855 "[CUDA error] %d: %s\n"
856 "with grid size %ld, block size %ld, shared memory size %d.",
857 retcode, ::cudaGetErrorString(retcode),
858 l_grid_size, block_size_, smem_size);
859#endif
860
861 switch (block_size_) {
862 case 1024:
863 cs_cuda_reduce_sum_single_block<1024, 1>
864 <<<1, block_size_, 0, stream_>>>
865 (l_grid_size, r_grid_, r_reduce_);
866 break;
867 case 512:
868 cs_cuda_reduce_sum_single_block<512, 1>
869 <<<1, block_size_, 0, stream_>>>
870 (l_grid_size, r_grid_, r_reduce_);
871 break;
872 case 256:
873 cs_cuda_reduce_sum_single_block<256, 1>
874 <<<1, block_size_, 0, stream_>>>
875 (l_grid_size, r_grid_, r_reduce_);
876 break;
877 case 128:
878 cs_cuda_reduce_sum_single_block<128, 1>
879 <<<1, block_size_, 0, stream_>>>
880 (l_grid_size, r_grid_, r_reduce_);
881 break;
882 default:
883 cs_assert(0);
884 }
885
886 CS_CUDA_CHECK(cudaMemcpyAsync(r_host_, r_reduce_, sizeof(sum),
887 cudaMemcpyDeviceToHost, stream_));
888
889#if defined(DEBUG) || !defined(NDEBUG)
890 retcode = cudaGetLastError();
891 if (retcode != cudaSuccess)
892 bft_error(__FILE__, __LINE__, 0,
893 "[CUDA error] %d: %s\n"
894 "with grid size %ld, block size %ld, shared memory size %d.",
895 retcode, ::cudaGetErrorString(retcode),
896 l_grid_size, block_size_, (int)smem_size);
897#endif
898
899 CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
900 CS_CUDA_CHECK(cudaGetLastError());
901 sum = r_host_[0];
902
903 return true;
904 }
905
907 template <class T, class R, class F, class... Args>
908 bool
910 T& result,
911 R& reducer,
912 F&& f,
913 Args&&... args) {
914 if (device_ < 0 || use_gpu_ == false) {
915 return false;
916 }
917
918 reducer.identity(result);
919
920 long l_grid_size = grid_size_;
921 if (l_grid_size < 1) {
922 l_grid_size = (n % block_size_) ? n/block_size_ + 1 : n/block_size_;
923 }
924 if (n == 0) {
925 return true;
926 }
927
928 int stream_id = cs_cuda_get_stream_id(stream_);
929 if (stream_id < 0)
930 stream_id = 0;
931
932 T *r_grid_, *r_reduce_, *r_host_;
933 cs_cuda_get_2_stage_reduce_buffers
934 (stream_id, n, sizeof(result), l_grid_size,
935 (void *&)r_grid_, (void *&)r_reduce_, (void *&)r_host_);
936
937 int l_block_size = block_size_;
938 int smem_size = l_block_size * sizeof(T);
939 while (smem_size > cs_glob_cuda_shared_mem_per_block) {
940 // We should have a runtime failure if even blocks of size 64
941 // are too large relative to the available shared memory.
942 if (l_block_size < 2)
943 bft_error(__FILE__, __LINE__, 0,
944 "Type of size %d exceeds capacity of "
945 "CUDA shared memory (%d).",
946 (int)sizeof(T), cs_glob_cuda_shared_mem_per_block);
947 l_block_size /= 2;
948 smem_size = l_block_size * sizeof(T);
949 }
950
951#if defined(DEBUG) || !defined(NDEBUG)
952 cudaError_t retcode = cudaSuccess;
953#endif
954
955 cs_cuda_kernel_parallel_for_reduce<T, R>
956 <<<l_grid_size, l_block_size, smem_size, stream_>>>
957 (n, r_grid_, reducer, static_cast<F&&>(f),
958 static_cast<Args&&>(args)...);
959
960#if defined(DEBUG) || !defined(NDEBUG)
961 retcode = cudaGetLastError();
962 if (retcode != cudaSuccess)
963 bft_error(__FILE__, __LINE__, 0,
964 "[CUDA error] %d: %s\n"
965 "with grid size %ld, block size %d, shared memory size %d.",
966 retcode, ::cudaGetErrorString(retcode),
967 l_grid_size, l_block_size, smem_size);
968#endif
969
970 switch (l_block_size) {
971 case 1024:
972 cs_cuda_reduce_single_block<1024, R>
973 <<<1, l_block_size, smem_size, stream_>>>
974 (l_grid_size, r_grid_, r_reduce_);
975 break;
976 case 512:
977 cs_cuda_reduce_single_block<512, R>
978 <<<1, l_block_size, smem_size, stream_>>>
979 (l_grid_size, r_grid_, r_reduce_);
980 break;
981 case 256:
982 cs_cuda_reduce_single_block<256, R>
983 <<<1, l_block_size, smem_size, stream_>>>
984 (l_grid_size, r_grid_, r_reduce_);
985 break;
986 case 128:
987 cs_cuda_reduce_single_block<128, R>
988 <<<1, l_block_size, smem_size, stream_>>>
989 (l_grid_size, r_grid_, r_reduce_);
990 break;
991 default:
992 cs_assert(0);
993 }
994
995#if defined(DEBUG) || !defined(NDEBUG)
996 retcode = cudaGetLastError();
997 if (retcode != cudaSuccess)
998 bft_error(__FILE__, __LINE__, 0,
999 "[CUDA error] %d: %s\n"
1000 "with grid size %ld, block size %d, shared memory size %d.",
1001 retcode, ::cudaGetErrorString(retcode),
1002 l_grid_size, l_block_size, (int)smem_size);
1003#endif
1004
1005 CS_CUDA_CHECK(cudaMemcpyAsync(r_host_, r_reduce_, sizeof(result),
1006 cudaMemcpyDeviceToHost, stream_));
1007
1008 CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
1009 CS_CUDA_CHECK(cudaGetLastError());
1010 result = r_host_[0];
1011
1012 return true;
1013 }
1014
1016 template <class... Args>
1017 bool
1018 wait(void) {
1019 if (device_ > -1 && use_gpu_) {
1020 CS_CUDA_CHECK(cudaStreamSynchronize(stream_));
1021 CS_CUDA_CHECK(cudaGetLastError());
1022 return true;
1023 }
1024 return false;
1025 }
1026
1027 // Get interior faces sum type associated with this context
1028 template <class M>
1029 bool
1032 if (device_ < 0 || use_gpu_ == false) {
1033 return false;
1034 }
1035
1037 return true;
1038 }
1039
1040 // Get boundary faces sum type associated with this context
1041 template <class M>
1042 bool
1045 if (device_ < 0 || use_gpu_ == false) {
1046 return false;
1047 }
1048
1050 return true;
1051 }
1052
1053};
1054
1055#elif defined(SYCL_LANGUAGE_VERSION)
1056
1058#if !defined(CS_GLOB_SYCL_QUEUE_IS_DEFINED)
1059extern sycl::queue cs_glob_sycl_queue;
1060#define CS_GLOB_SYCL_QUEUE_IS_DEFINED 1
1061#endif
1062
1067class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {
1068
1069private:
1070
1071 sycl::queue &queue_;
1072 bool is_gpu;
1074 bool use_gpu_;
1076public:
1077
1079
1080 cs_device_context(void)
1081 : queue_(cs_glob_sycl_queue), is_gpu(false), use_gpu_(true)
1082 {
1083 is_gpu = queue_.get_device().is_gpu();
1084 }
1085
1087
1088 void
1089 set_use_gpu(bool use_gpu) {
1090 this->use_gpu_ = use_gpu;
1091 }
1092
1094
1095 bool
1096 use_gpu(void) {
1097 return (is_gpu && use_gpu_);
1098 }
1099
1101
1103 alloc_mode(void) {
1104 cs_alloc_mode_t amode
1105 = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
1106 return (amode);
1107 }
1108
1110 alloc_mode([[maybe_unused]] bool readable_on_cpu) {
1111 cs_alloc_mode_t amode
1112 = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
1113 return (amode);
1114 }
1115
1116public:
1117
  //! Submit a SYCL parallel_for over [0, n) on this context's queue.
  //! Returns false (host fallback) when the queue's device is not a GPU
  //! or GPU use is disabled. No wait is issued here (unlike the
  //! reduction variant below): call wait() before reading results.
  template <class F, class... Args>
  bool
  parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
    if (is_gpu == false || use_gpu_ == false) {
      return false;
    }

    queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);

    return true;
  }
1130
1132 template <class M, class F, class... Args>
1133 bool
1134 parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
1135 const cs_lnum_t n = m->n_i_faces;
1136 if (is_gpu == false || use_gpu_ == false) {
1137 return false;
1138 }
1139
1140 queue_.parallel_for(n, static_cast<F&&>(f), static_cast<Args&&>(args)...);
1141
1142 return true;
1143 }
1144
1146 template <class T, class F, class... Args>
1147 bool
1149 T& sum_,
1150 F&& f,
1151 Args&&... args) {
1152 if (is_gpu == false || use_gpu_ == false) {
1153 return false;
1154 }
1155
1156 sum_ = 0;
1157
1158 // TODO: use persistent allocation as we do in CUDA BLAS to avoid
1159 // excess allocation/deallocation.
1160 T *sum_ptr = (T *)sycl::malloc_shared(sizeof(T), queue_);
1161
1162 queue_.parallel_for(n,
1163 sycl::reduction(sum_ptr, (T)0, sycl::plus<T>()),
1164 static_cast<F&&>(f),
1165 static_cast<Args&&>(args)...).wait();
1166
1167 sum_ = sum_ptr[0];
1168
1169 sycl::free((void *)sum_ptr, queue_);
1170
1171 return true;
1172 }
1173
1175 template <class T, class R, class F, class... Args>
1176 bool
1178 T& result,
1179 R& reducer,
1180 F&& f,
1181 Args&&... args) {
1182
1183 // TODO implement this
1184 return false;
1185 }
1186
1188 template <class... Args>
1189 bool
1190 wait(void) {
1191 if (is_gpu && use_gpu_) {
1192 queue_.wait();
1193 return true;
1194 }
1195 return false;
1196 }
1197
1198 // Get interior faces sum type associated with this context
1199 template <class M>
1200 bool
1203 if (is_gpu == false || use_gpu_ == false) {
1204 return false;
1205 }
1206
1208 return true;
1209 }
1210
1211 // Get interior faces sum type associated with this context
1212 template <class M>
1213 bool
1216 if (is_gpu == false || use_gpu_ == false) {
1217 return false;
1218 }
1219
1221 return true;
1222 }
1223
1224};
1225
1226#elif defined(HAVE_OPENMP_TARGET)
1227
1232class cs_device_context : public cs_dispatch_context_mixin<cs_device_context> {
1233
1234private:
1235
1236 bool is_gpu;
1238 bool use_gpu_;
1240public:
1241
1243
1244 cs_device_context(void)
1245 : is_gpu(false), use_gpu_(true)
1246 {
1247 // This should be improved for any actual use of this approach
1248 // beyond basic testing
1249 is_gpu = (omp_get_num_devices() > 1) ? true : false;
1250 }
1251
1253
1254 void
1255 set_use_gpu(bool use_gpu) {
1256 this->use_gpu_ = use_gpu;
1257 }
1258
1260
1261 bool
1262 use_gpu(void) {
1263 return (is_gpu && use_gpu_);
1264 }
1265
1267
1269 alloc_mode(void) {
1270 cs_alloc_mode_t amode
1271 = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
1272 return (amode);
1273 }
1274
1276 alloc_mode([[maybe_unused]] bool readable_on_cpu) {
1277 cs_alloc_mode_t amode
1278 = (is_gpu && use_gpu_) ? CS_ALLOC_HOST_DEVICE_SHARED : CS_ALLOC_HOST;
1279 return (amode);
1280 }
1281
1282public:
1283
1285 template <class F, class... Args>
1286 bool
1287 parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
1288 if (is_gpu == false || use_gpu_ == false) {
1289 return false;
1290 }
1291
1293# pragma omp target teams distribute parallel for
1294 for (cs_lnum_t i = 0; i < n; ++i) {
1295 f(i, args...);
1296 }
1297
1298 return true;
1299 }
1300
1302 template <class T, class F, class... Args>
1303 bool
1305 T& sum,
1306 F&& f,
1307 Args&&... args) {
1308 if (is_gpu == false || use_gpu_ == false) {
1309 return false;
1310 }
1311
1312 sum = 0;
1314# pragma omp target teams distribute parallel for reduction(+:sum)
1315 for (cs_lnum_t i = 0; i < n; ++i) {
1316 f(i, sum, args...);
1317 }
1318
1319 return true;
1320 }
1321
1323 template <class T, class R, class F, class... Args>
1324 bool
1326 T& result,
1327 R& reducer,
1328 F&& f,
1329 Args&&... args) {
1330
1331 // TODO implement this
1332 return false;
1333 }
1334
1336 template <class... Args>
1337 bool
1338 wait(void) {
1339 return true;
1340 }
1341
1342 // Get interior faces sum type associated with this context
1343 template <class M>
1344 bool
1347 if (is_gpu == false || use_gpu_ == false) {
1348 return false;
1349 }
1350
1352 return true;
1353 }
1354
1355 // Get interior faces sum type associated with this context
1356 template <class M>
1357 bool
1360 if (is_gpu == false || use_gpu_ == false) {
1361 return false;
1362 }
1363
1365 return true;
1366 }
1367
1368};
1369
1370#endif // __CUDACC__ or SYCL or defined(HAVE_OPENMP_TARGET)
1371
1376class cs_void_context : public cs_dispatch_context_mixin<cs_void_context> {
1377
1378public:
1379
1381
1383 {}
1384
1385#if !defined(__CUDACC__)
1386
1387 /* Fill-in for CUDA methods, so as to allow using these methods
1388 in final cs_dispatch_context even when CUDA is not available,
1389 and without requiring a static cast of the form
1390
1391 static_cast<cs_device_context&>(ctx).set_use_gpu(true);
1392 */
1393
1394 void
1395 set_cuda_grid([[maybe_unused]] long grid_size,
1396 [[maybe_unused]] long block_size) {
1397 }
1398
1399 void
1400 set_cuda_stream([[maybe_unused]] int stream_id) {
1401 }
1402
1403 void
1404 set_cuda_device([[maybe_unused]] int device_id) {
1405 }
1406
1407#endif // !defined(__CUDACC__)
1408
1409#if !defined(__CUDACC__) \
1410 && !defined(SYCL_LANGUAGE_VERSION) \
1411 && !defined(HAVE_OPENMP_TARGET)
1412
1413 /* Fill-in for device methods */
1414
1415 void
1416 set_use_gpu([[maybe_unused]] bool use_gpu) {
1417 }
1418
1420
1421 bool
1422 use_gpu(void) {
1423 return false;
1424 }
1425
1427
1430 return CS_ALLOC_HOST;
1431 }
1432
1434 alloc_mode([[maybe_unused]] bool readable_on_cpu) {
1435 return CS_ALLOC_HOST;
1436 }
1437
1438#endif // ! __CUDACC__ && ! SYCL_LANGUAGE_VERSION && ! defined(HAVE_OPENMP_TARGET)
1439
1440public:
1441
1442 // Abort execution if no execution method is available.
1443 template <class F, class... Args>
1444 bool parallel_for([[maybe_unused]] cs_lnum_t n,
1445 [[maybe_unused]] F&& f,
1446 [[maybe_unused]] Args&&... args) {
1447 cs_assert(0);
1448 return false;
1449 }
1450
1451 // Abort execution if no execution method is available.
1452 template <class T, class F, class... Args>
1453 bool parallel_for_reduce_sum([[maybe_unused]] cs_lnum_t n,
1454 [[maybe_unused]] T& sum,
1455 [[maybe_unused]] F&& f,
1456 [[maybe_unused]] Args&&... args) {
1457 cs_assert(0);
1458 return false;
1459 }
1460
1461 // Abort execution if no execution method is available.
1462 template <class T, class R, class F, class... Args>
1463 bool parallel_for_reduce([[maybe_unused]] cs_lnum_t n,
1464 [[maybe_unused]] T& result,
1465 [[maybe_unused]] R& reducer,
1466 [[maybe_unused]] F&& f,
1467 [[maybe_unused]] Args&&... args) {
1468 cs_assert(0);
1469 return false;
1470 }
1471
1472 // Abort execution if no synchronization method is available.
1473 template <class... Args>
1474 bool
1475 wait(void) {
1476 cs_assert(0);
1477 return false;
1478 }
1479
1480};
1481
1487template <class... Contexts>
1489 : public cs_dispatch_context_mixin<cs_combined_context<Contexts...>>,
1490 public Contexts... {
1491
1492private:
1494
1495public:
1497 cs_combined_context(Contexts... contexts)
1498 : Contexts(std::move(contexts))...
1499 {}
1500
1501public:
1502
1503 /*--------------------------------------------------------------------------*/
1504 /* \brief Parallel computation over interior faces.
1505 *
1506 * This method is intended for use when assembling cell values
1507 * with face-based computations, using the appropriate \ref cs_dispatch_sum
1508 * functions.
1509 *
1510 * On CPU, loops are scheduled based on the current face numbering, so
1511 * as to avoid thread races when summing values. On GPU, atomic sums are used.
1512 *
1513 * \tparam M mesh type structure (templated mostly to avoid
1514 * dependency to mesh definitions in lower level code)
1515 * \tparam F lambda function or functor
1516 *
1517 * \param[in] m pointer to mesh
1518 * \param[in] f lambda function or functor to execute
1519 */
1520 /*--------------------------------------------------------------------------*/
1521
1522 template <class M, class F, class... Args>
1523 auto parallel_for_i_faces(const M* m, F&& f, Args&&... args) {
1524 bool launched = false;
1525 [[maybe_unused]] decltype(nullptr) try_execute[] = {
1526 ( launched = launched
1527 || Contexts::parallel_for_i_faces(m, f, args...), nullptr)...
1528 };
1529 }
1530
1531 /*--------------------------------------------------------------------------*/
1532 /* \brief Parallel computation over boundary faces.
1533 *
1534 * This method is intended for use when assembling cell values
1535 * with face-based computations, using the appropriate \ref cs_dispatch_sum
1536 * functions.
1537 *
1538 * On CPU, loops are scheduled based on the current face numbering, so
1539 * as to avoid thread races when summing values. On GPU, atomic sums are used.
1540 *
1541 * \tparam M mesh type structure (templated mostly to avoid
1542 * dependency to mesh definitions in lower level code)
1543 * \tparam F lambda function or functor
1544 *
1545 * \param[in] m pointer to mesh
1546 * \param[in] f lambda function or functor to execute
1547 */
1548 /*--------------------------------------------------------------------------*/
1549
1550 template <class M, class F, class... Args>
1551 auto parallel_for_b_faces(const M* m, F&& f, Args&&... args) {
1552 bool launched = false;
1553 [[maybe_unused]] decltype(nullptr) try_execute[] = {
1554 ( launched = launched
1555 || Contexts::parallel_for_b_faces(m, f, args...), nullptr)...
1556 };
1557 }
1558
1559 /*--------------------------------------------------------------------------*/
1560 /* \brief General parallel computation over elements.
1561 *
1562 * \tparam F lambda function or functor
1563 *
1564 * \param[in] n number of elements to compute
1565 * \param[in] f lambda function or functor to execute
1566 */
1567 /*--------------------------------------------------------------------------*/
1568
1569 template <class F, class... Args>
1570 auto parallel_for(cs_lnum_t n, F&& f, Args&&... args) {
1571 bool launched = false;
1572 [[maybe_unused]] decltype(nullptr) try_execute[] = {
1573 ( launched = launched
1574 || Contexts::parallel_for(n, f, args...), nullptr)...
1575 };
1576 }
1577
1578 /*--------------------------------------------------------------------------*/
1579 /* \brief General parallel computation over elements, with a floating-point
1580 * sum reduction.
1581 *
1582 * \tparam T reduced element type
1583 * \tparam F lambda function or functor
1584 *
1585 * \param[in] n number of elements to compute
1586 * \param[in] sum resulting sum
1587 * \param[in] f lambda function or functor to execute
1588 */
1589 /*--------------------------------------------------------------------------*/
1590
1591 template <class T, class F, class... Args>
1593 (cs_lnum_t n, T& sum, F&& f, Args&&... args) {
1594 bool launched = false;
1595 [[maybe_unused]] decltype(nullptr) try_execute[] = {
1596 ( launched = launched
1597 || Contexts::parallel_for_reduce_sum(n, sum, f, args...),
1598 nullptr)...
1599 };
1600 }
1601
1602 /*--------------------------------------------------------------------------*/
1603 /* \brief General parallel computation over elements, with a
1604 * user-defined reduction.
1605 *
1606 * \tparam T reduced element type
1607 * \tparam R reducer class
1608 * \tparam F lambda function or functor
1609 *
1610 * \param[in] n number of elements to compute
1611 * \param[out] result resulting sum
1612 * \param[in] reducer reducer object
1613 * \param[in] f lambda function or functor to execute
1614 */
1615 /*--------------------------------------------------------------------------*/
1616
1617 template <class T, class R, class F, class... Args>
1619 (cs_lnum_t n, T& result, R& reducer, F&& f, Args&&... args) {
1620 bool launched = false;
1621 [[maybe_unused]] decltype(nullptr) try_execute[] = {
1622 ( launched = launched
1623 || Contexts::parallel_for_reduce(n, result, reducer, f, args...),
1624 nullptr)...
1625 };
1626 }
1627
1628 /*--------------------------------------------------------------------------*/
1632 /*--------------------------------------------------------------------------*/
1633
1634 void
1635 wait(void) {
1636 bool done = false;
1637 [[maybe_unused]] decltype(nullptr) try_execute[] = {
1638 ( done = done
1639 || Contexts::wait(), nullptr)...
1640 };
1641 }
1642
1643 /*--------------------------------------------------------------------------*/
1653 /*--------------------------------------------------------------------------*/
1654
1655 template <class M>
1659 bool known = false;
1660 [[maybe_unused]] decltype(nullptr) try_query[] = {
1661 ( known = known
1662 || Contexts::try_get_parallel_for_i_faces_sum_type(m, sum_type),
1663 nullptr)...
1664 };
1665 return sum_type;
1666 }
1667
1668 /*--------------------------------------------------------------------------*/
1678 /*--------------------------------------------------------------------------*/
1679
1680 template <class M>
1684 bool known = false;
1685 [[maybe_unused]] decltype(nullptr) try_query[] = {
1686 ( known = known
1687 || Contexts::try_get_parallel_for_b_faces_sum_type(m, sum_type),
1688 nullptr)...
1689 };
1690 return sum_type;
1691 }
1692
1693};
1694
1695/*----------------------------------------------------------------------------*/
1700/*----------------------------------------------------------------------------*/
1701
1703#if defined(__CUDACC__) \
1704 || defined(SYCL_LANGUAGE_VERSION) \
1705 || defined(HAVE_OPENMP_TARGET)
1706 cs_device_context,
1707#endif
1708 cs_host_context,
1709 cs_void_context
1710>
1711{
1712
1713private:
1715#if defined(__CUDACC__) \
1716 || defined(SYCL_LANGUAGE_VERSION) \
1717 || defined(HAVE_OPENMP_TARGET)
1718 cs_device_context,
1719#endif
1722>;
1723
1724public:
1725 using base_t::base_t;
1726 using base_t::operator=;
1727
1728};
1729
1730/*
1731 Remarks:
1732
1733 Instantiation can simply be done using:
1734
1735 `cs_dispatch_context ctx;`
1736
  Instantiation can also be done with specific construction options,
1738 for example:
1739
1740 `cs_dispatch_context ctx(cs_device_context(stream), {});`
1741
1742 or:
1743
1744 `cs_dispatch_context ctx(cs_device_context(), {});`
1745
1746*/
1747
1748/*=============================================================================
1749 * Global variable definitions
1750 *============================================================================*/
1751
1752/*=============================================================================
1753 * Public function prototypes
1754 *============================================================================*/
1755
1756/*----------------------------------------------------------------------------*/
1769/*----------------------------------------------------------------------------*/
1770
1771#ifdef __CUDA_ARCH__ // Test whether we are on GPU or CPU...
1772
//! \brief Sum a single value into *dest using the chosen dispatch sum
//!        mode (CUDA device version).
//!
//! \param[in, out]  dest      destination value
//! \param[in]       src       value to add
//! \param[in]       sum_type  sum mode (simple or atomic)
template <typename T>
__device__ static void __forceinline__
cs_dispatch_sum(T *dest,
                const T src,
                cs_dispatch_sum_type_t sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
#if 1
    // Add through assembled_value's conflict_free_add; the -1u mask
    // presumably selects all lanes of the warp — TODO confirm against
    // cs_cuda_reduce.h. Plain atomicAdd alternative kept below.
    using sum_v = assembled_value<T>;
    sum_v v;

    v.get() = src;
    sum_v::ref(*dest).conflict_free_add(-1u, v);
#else
    atomicAdd(dest, src);
#endif
  }
  else if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    // Simple (non-atomic) add: caller guarantees no write conflicts.
    *dest += src;
  }
}
1794
1795#elif defined(SYCL_LANGUAGE_VERSION)
1796
1797template <typename T>
1798inline void
1799cs_dispatch_sum(T *dest,
1800 const T src,
1801 cs_dispatch_sum_type_t sum_type)
1802{
1803 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1804 *dest += src;
1805 }
1806 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1807 sycl::atomic_ref<T,
1808 sycl::memory_order::relaxed,
1809 sycl::memory_scope::device> aref(*dest);
1810 aref.fetch_add(src);
1811 }
1812}
1813
1814#else // ! CUDA or SYCL
1815
1816template <typename T>
1817inline void
1819 const T src,
1820 cs_dispatch_sum_type_t sum_type)
1821{
1822 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1823 *dest += src;
1824 }
1825 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1826 #ifdef _OPENMP
1827 #pragma omp atomic
1828 #endif
1829 *dest += src;
1830 }
1831}
1832
1833#endif // __CUDA_ARCH__
1834
1835/*----------------------------------------------------------------------------*/
1849/*----------------------------------------------------------------------------*/
1850
1851#ifdef __CUDA_ARCH__ // Test whether we are on GPU or CPU...
1852
1853template <size_t dim, typename T>
1854__device__ static void __forceinline__
1855cs_dispatch_sum(T *dest,
1856 const T *src,
1857 cs_dispatch_sum_type_t sum_type)
1858{
1859 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1860 for (cs_lnum_t i = 0; i < dim; i++) {
1861 dest[i] += src[i];
1862 }
1863 }
1864 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1865#if __CUDA_ARCH__ >= 700
1866 using sum_v = assembled_value<T, dim>;
1867 sum_v v;
1868
1869 for (size_t i = 0; i < dim; i++) {
1870 v[i].get() = src[i];
1871 }
1872
1873 sum_v &vs = reinterpret_cast<sum_v &>(*dest);
1874 vs.conflict_free_add(-1u, v);
1875
1876 //sum_v::ref(dest).conflict_free_add(-1u, v);
1877#else
1878 for (size_t i = 0; i < dim; i++) {
1879 atomicAdd(&dest[i], src[i]);
1880 }
1881#endif
1882 }
1883}
1884
1885#elif defined(SYCL_LANGUAGE_VERSION)
1886
//! \brief Sum dim values into the dest array using the chosen dispatch
//!        sum mode (SYCL version).
//!
//! \tparam dim  number of components to sum
//! \tparam T    value type
//!
//! \param[in, out]  dest      destination values
//! \param[in]       src       values to add
//! \param[in]       sum_type  sum mode (simple or atomic)
template <size_t dim, typename T>
inline void
cs_dispatch_sum(T *dest,
                const T *src,
                cs_dispatch_sum_type_t sum_type)
{
  if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
    // Simple adds: caller guarantees no write conflicts.
    for (size_t i = 0; i < dim; i++) {
      dest[i] += src[i];
    }
  }
  else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
    // Per-component device-scope, relaxed-order atomic adds.
    for (size_t i = 0; i < dim; i++) {
      sycl::atomic_ref<T,
                       sycl::memory_order::relaxed,
                       sycl::memory_scope::device> aref(dest[i]);
      aref.fetch_add(src[i]);
    }
  }
}
1907
1908#else // ! CUDA or SYCL
1909
1910template <size_t dim, typename T>
1911inline void
1913 const T *src,
1914 cs_dispatch_sum_type_t sum_type)
1915{
1916 if (sum_type == CS_DISPATCH_SUM_SIMPLE) {
1917 for (size_t i = 0; i < dim; i++) {
1918 dest[i] += src[i];
1919 }
1920 }
1921 else if (sum_type == CS_DISPATCH_SUM_ATOMIC) {
1922 for (size_t i = 0; i < dim; i++) {
1923 #ifdef _OPENMP
1924 #pragma omp atomic
1925 #endif
1926 dest[i] += src[i];
1927 }
1928 }
1929}
1930
1931#endif // __CUDA_ARCH__
1932
1933/*----------------------------------------------------------------------------*/
1934
1935#endif /* __cplusplus */
1936
1937#endif /* CS_DISPATCH_H */
void bft_error(const char *const file_name, const int line_num, const int sys_error_code, const char *const format,...)
Calls the error handler (set by bft_error_handler_set() or default).
Definition: bft_error.cpp:193
Definition: cs_dispatch.h:1490
cs_combined_context()=default
auto parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&... args)
Definition: cs_dispatch.h:1593
cs_combined_context(Contexts... contexts)
Definition: cs_dispatch.h:1497
auto parallel_for_reduce(cs_lnum_t n, T &result, R &reducer, F &&f, Args &&... args)
Definition: cs_dispatch.h:1619
auto parallel_for_b_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:1551
auto parallel_for_i_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:1523
auto parallel_for(cs_lnum_t n, F &&f, Args &&... args)
Definition: cs_dispatch.h:1570
void wait(void)
Wait (synchronize) until launched computations have finished.
Definition: cs_dispatch.h:1635
cs_dispatch_sum_type_t get_parallel_for_b_faces_sum_type(const M *m)
Return sum type to be used with parallel_for_b_faces.
Definition: cs_dispatch.h:1682
cs_dispatch_sum_type_t get_parallel_for_i_faces_sum_type(const M *m)
Return sum type to be used with parallel_for_i_faces.
Definition: cs_dispatch.h:1657
Definition: cs_dispatch.h:105
decltype(auto) parallel_for(cs_lnum_t n, F &&f, Args &&... args)=delete
bool try_get_parallel_for_b_faces_sum_type(const M *m, cs_dispatch_sum_type_t &st)
Definition: cs_dispatch.h:200
decltype(auto) parallel_for_b_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:179
decltype(auto) parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&... args)=delete
bool try_get_parallel_for_i_faces_sum_type(const M *m, cs_dispatch_sum_type_t &st)
Definition: cs_dispatch.h:190
decltype(auto) wait(void)=delete
decltype(auto) parallel_for_i_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:168
decltype(auto) parallel_for_reduce(cs_lnum_t n, T &r, R &reducer, F &&f, Args &&... args)=delete
Definition: cs_dispatch.h:1711
Definition: cs_dispatch.h:210
cs_host_context()
Definition: cs_dispatch.h:219
cs_lnum_t n_min_per_cpu_thread(void)
Get minimum number of elements threshold for CPU multithread execution.
Definition: cs_dispatch.h:286
bool parallel_for(cs_lnum_t n, F &&f, Args &&... args)
Iterate using a plain omp parallel for.
Definition: cs_dispatch.h:305
bool parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&... args)
Plain OpenMP parallel reduction with simple sum.
Definition: cs_dispatch.h:364
void set_n_cpu_threads(int n)
Set number of threads for CPU multithread execution.
Definition: cs_dispatch.h:292
bool wait(void)
Wait upon completion.
Definition: cs_dispatch.h:483
bool try_get_parallel_for_b_faces_sum_type(const M *m, cs_dispatch_sum_type_t &st)
Definition: cs_dispatch.h:499
bool try_get_parallel_for_i_faces_sum_type(const M *m, cs_dispatch_sum_type_t &st)
Definition: cs_dispatch.h:490
bool parallel_for_b_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:342
int n_cpu_threads(void)
Get number of threads for CPU multithread execution (-1 if automatic)
Definition: cs_dispatch.h:298
bool parallel_for_i_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:319
bool parallel_for_reduce(cs_lnum_t n, T &result, R &reducer, F &&f, Args &&... args)
OpenMP parallel reduction with general reducer.
Definition: cs_dispatch.h:426
void set_n_min_per_cpu_thread(cs_lnum_t n)
Set minimum number of elements threshold for CPU multithread execution.
Definition: cs_dispatch.h:280
Definition: cs_dispatch.h:1376
bool parallel_for(cs_lnum_t n, F &&f, Args &&... args)
Definition: cs_dispatch.h:1444
void set_cuda_device(int device_id)
Definition: cs_dispatch.h:1404
cs_alloc_mode_t alloc_mode(bool readable_on_cpu)
Definition: cs_dispatch.h:1434
bool parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&... args)
Definition: cs_dispatch.h:1453
bool wait(void)
Definition: cs_dispatch.h:1475
void set_use_gpu(bool use_gpu)
Definition: cs_dispatch.h:1416
cs_void_context(void)
Constructor.
Definition: cs_dispatch.h:1382
cs_alloc_mode_t alloc_mode(void)
Check preferred allocation mode depending on execution policy.
Definition: cs_dispatch.h:1429
bool parallel_for_reduce(cs_lnum_t n, T &result, R &reducer, F &&f, Args &&... args)
Definition: cs_dispatch.h:1463
void set_cuda_stream(int stream_id)
Definition: cs_dispatch.h:1400
void set_cuda_grid(long grid_size, long block_size)
Definition: cs_dispatch.h:1395
bool use_gpu(void)
Check whether we are trying to run on GPU.
Definition: cs_dispatch.h:1422
#define cs_assert(expr)
Abort the program if the given assertion is false.
Definition: cs_assert.h:67
int cs_glob_n_threads
Definition: cs_defs.cpp:172
#define restrict
Definition: cs_defs.h:158
#define CS_THR_MIN
Definition: cs_defs.h:508
static cs_lnum_t cs_align(cs_lnum_t i, cs_lnum_t m)
Given a base index i, return the next index aligned with a size m.
Definition: cs_defs.h:669
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:350
#define CS_CL_SIZE
Definition: cs_defs.h:513
void cs_dispatch_sum(T *dest, const T src, cs_dispatch_sum_type_t sum_type)
sum values using a chosen dispatch sum type.
Definition: cs_dispatch.h:1818
cs_dispatch_sum_type_t
Definition: cs_dispatch.h:90
@ CS_DISPATCH_SUM_SIMPLE
Definition: cs_dispatch.h:92
@ CS_DISPATCH_SUM_ATOMIC
Definition: cs_dispatch.h:94
#define cs_alloc_mode_device
Definition: cs_mem.h:189
cs_alloc_mode_t
Definition: cs_mem.h:50
@ CS_ALLOC_HOST
Definition: cs_mem.h:52
@ CS_ALLOC_HOST_DEVICE_SHARED
Definition: cs_mem.h:57
static void sum(const cs_execution_context *ec, T &first, Vals &... values)
Sum values of a given datatype over a given communicator.
Definition: cs_parall.h:881