9.1
general documentation
cs_dispatch_queue.h
Go to the documentation of this file.
1#pragma once
2
3/*============================================================================
4 * Class to dispatch computation using various runtimes (OpenMP, CUDA, ...)
5 * with explicit dependencies between launched kernels (inspired by SYCL).
6 *============================================================================*/
7
8/*
9 This file is part of code_saturne, a general-purpose CFD tool.
10
11 Copyright (C) 1998-2025 EDF S.A.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; either version 2 of the License, or (at your option) any later
16 version.
17
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21 details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25 Street, Fifth Floor, Boston, MA 02110-1301, USA.
26*/
27
28// Valid only for C++
29
30#ifdef __cplusplus
31
32/*----------------------------------------------------------------------------*/
33
34#include "base/cs_defs.h"
35
36/*----------------------------------------------------------------------------
37 * Standard C++ library headers
38 *----------------------------------------------------------------------------*/
39
40#include <chrono>
41#include <initializer_list>
42#include <tuple>
43#include <type_traits>
44#include <utility>
45
46#if defined(__CUDACC__)
47#include <cuda.h>
48#include <cuda_runtime.h>
49#else
50#endif
51
52/*----------------------------------------------------------------------------
53 * Local headers
54 *----------------------------------------------------------------------------*/
55
56#include "base/cs_assert.h"
57#include "base/cs_mem.h"
58
59#include "base/cs_dispatch.h"
60
61/*=============================================================================
62 * Macro definitions
63 *============================================================================*/
64
66#ifndef CS_DISPATCH_QUEUE_FORCE_SYNC
67#define CS_DISPATCH_QUEUE_FORCE_SYNC 0
68#endif
69
70/*============================================================================
71 * Type definitions
72 *============================================================================*/
73
74struct cs_event;
75struct cs_task;
76
77/*----------------------------------------------------------------------------*/
81/*----------------------------------------------------------------------------*/
82
83struct cs_event {
85#if defined(__CUDACC__)
86 cudaEvent_t;
87#else
88 std::chrono::steady_clock::time_point;
89#endif
90
92
94
96 {
97#if defined(__CUDACC__)
98 cudaEventCreate(&event_impl);
99#endif
100 }
101
103
104 cs_event(cs_event const &other) = delete;
105 cs_event &
106 operator=(cs_event const &) = delete;
107
109#if defined(__CUDACC__)
110 {
111 event_impl = other.event_impl;
112 other.event_impl = nullptr;
113 }
114#else
115 = default;
116#endif
117
118 cs_event &
120#if defined(__CUDACC__)
121 {
122 if (event_impl != nullptr) {
123 cudaEventDestroy(event_impl);
124 }
125
126 event_impl = other.event_impl;
127 other.event_impl = nullptr;
128
129 return *this;
130 }
131#else
132 = default;
133#endif
134
138 {
139 return event_impl;
140 }
141
143#if defined(__CUDACC__)
144 {
145 if (event_impl != nullptr) {
146 cudaEventDestroy(event_impl);
147 }
148 }
149#else
150 = default;
151#endif
152
153 // Actions
154
156
157 void
159 {
160#if defined(__CUDACC__)
161 cudaEventSynchronize(event_impl);
162#endif
163 }
164};
165
166/*----------------------------------------------------------------------------*/
170/*----------------------------------------------------------------------------*/
171
173 cs_event *event_ptr;
174
175public:
177
178 cs_event_ref(cs_event &event) : event_ptr(&event) {}
179
180 cs_event_ref() = delete;
181
182 cs_event_ref(cs_event_ref &&other) = default;
183 cs_event_ref(cs_event_ref const &other) = default;
185 operator=(cs_event_ref &&) & = default;
187 operator=(cs_event_ref const &) & = default;
188
190 cs_event *
192 {
193 return event_ptr;
194 }
195
197 cs_event &
199 {
200 return *event_ptr;
201 }
202
207 {
208 return ~(*event_ptr);
209 }
210};
211
212/*----------------------------------------------------------------------------*/
220/*----------------------------------------------------------------------------*/
221
222class cs_task {
223 cs_dispatch_context context_;
224
226 cs_event start_event;
227
229 cs_event end_event;
230
231public:
232 cs_task(cs_task const &) = delete;
233 cs_task &
234 operator=(cs_task const &) = delete;
235
236 cs_task(cs_task &&) = default;
237 cs_task &
238 operator=(cs_task &&) = default;
239
241 cs_task(cs_dispatch_context context = {}) : context_(std::move(context))
242 {
243#if defined(__CUDACC__)
244 cudaStream_t new_stream;
245 cudaStreamCreate(&new_stream);
246 context_.set_cuda_stream(new_stream);
247 cudaEventRecord(~start_event, context_.cuda_stream());
248#else
249 ~start_event = std::chrono::steady_clock::now();
250#endif
251 }
252
254 void
256 {
257#if defined(__CUDACC__)
258 cudaStreamWaitEvent(context_.cuda_stream(), ~event);
259#endif
260 }
261
264 void
265 add_dependency(std::initializer_list<cs_event_ref> const &sync_events)
266 {
267#if defined(__CUDACC__)
268 for (auto const &event : sync_events) {
269 add_dependency(event);
270 }
271#endif
272 }
273
275 void
277 {
278 end_event.wait();
279 }
280
284 {
285#if defined(__CUDACC__)
286 cudaEventRecord(~end_event, context_.cuda_stream());
287#else
288 ~end_event = std::chrono::steady_clock::now();
289#endif
290 return { end_event };
291 }
292
294 operator cs_event_ref() { return end_event; }
295
299 {
300 return context_;
301 }
302
306 {
307 return start_event;
308 }
309
313 {
314 return start_event;
315 }
316
319 {
320 context_.wait();
321#if defined(__CUDACC__)
322 cudaStreamDestroy(context_.cuda_stream());
323#endif
324 }
325};
326
328template <class FunctionType, class... Args>
329class cs_host_task : public cs_task {
330public:
331 cs_host_task(cs_host_task const &) = delete;
333 operator=(cs_host_task const &) = delete;
334
337 operator=(cs_host_task &&) = default;
338
340 using args_tuple_t = std::tuple<Args...>;
341
345#if defined(__CUDACC__)
346 std::tuple<FunctionType, args_tuple_t>;
347#else
348 std::tuple<FunctionType>;
349#endif
350
351private:
354 data_tuple_t data_tuple_;
355
356public:
359 cs_host_task(FunctionType &&function, cs_dispatch_context context)
360 : cs_task(std::move(context)),
361
362#if defined(__CUDACC__)
363 data_tuple_(std::move(function), args_tuple_t{})
364#else
365 data_tuple_(std::move(function))
366#endif
367 {
368 }
369
372#if defined(__CUDACC__)
373 cudaError_t
374#else
375 void
376#endif
377 launch(Args... args)
378 {
379#if defined(__CUDACC__)
380 if (this->get_context().use_gpu()) {
381 // Setting the arguments
382 std::get<1>(data_tuple_) = args_tuple_t{ std::move(args)... };
383
384 // Async launch on the task's own stream
385 return cudaLaunchHostFunc
386 (get_context().cuda_stream(),
387 // Wrapper lambda: unwraps the parameter passed as a void* pointer
388 // to invoke the host function
389 [](void *data_tuple_ptr) -> void {
390 auto &[f, args_tuple] = *(data_tuple_t *)(data_tuple_ptr);
391 std::apply(f, args_tuple);
392 },
393 &data_tuple_);
394 }
395 else {
396 this->record_end_event();
397 this->wait();
398 std::get<0>(data_tuple_)(args...);
399 return cudaSuccess;
400 }
401#else
402 std::get<0>(data_tuple_)(args...);
403#endif
404 }
405
408 {
409 // We must wait for the host task to terminate, so that data_tuple_
410 // is not destroyed before the task has been executed
411 wait();
412 }
413};
414
415/*----------------------------------------------------------------------------*/
420/*----------------------------------------------------------------------------*/
421
423
424public:
427
428 template <class F, class... Args>
429 cs_task
430 parallel_for(cs_lnum_t n, F &&f, Args &&...args)
431 {
433 new_task.get_context().parallel_for(n,
434 std::forward<F>(f),
435 std::forward<Args>(args)...);
436 new_task.record_end_event();
437 return new_task;
438 }
439
440 template <class F, class... Args>
441 cs_task
443 std::initializer_list<cs_event_ref> const &sync_events,
444 F &&f,
445 Args &&...args)
446 {
448 new_task.add_dependency(sync_events);
449 new_task.get_context().parallel_for(n,
450 std::forward<F>(f),
451 std::forward<Args>(args)...);
452 new_task.record_end_event();
453 return new_task;
454 }
455
456 template <class M, class F, class... Args>
457 cs_task
458 parallel_for_i_faces(const M *m, F &&f, Args &&...args)
459 {
462 std::forward<F>(f),
463 std::forward<Args>(args)...);
464 new_task.record_end_event();
465 return new_task;
466 }
467
468 template <class M, class F, class... Args>
469 cs_task
471 std::initializer_list<cs_event_ref> const &sync_events,
472 F &&f,
473 Args &&...args)
474 {
476 new_task.add_dependency(sync_events);
478 std::forward<F>(f),
479 std::forward<Args>(args)...);
480 new_task.record_end_event();
481 return new_task;
482 }
483
484 template <class M, class F, class... Args>
485 cs_task
486 parallel_for_b_faces(const M *m, F &&f, Args &&...args)
487 {
490 std::forward<F>(f),
491 std::forward<Args>(args)...);
492 new_task.record_end_event();
493 return new_task;
494 }
495
496 template <class M, class F, class... Args>
497 cs_task
499 std::initializer_list<cs_event_ref> const &sync_events,
500 F &&f,
501 Args &&...args)
502 {
504 new_task.add_dependency(sync_events);
506 std::forward<F>(f),
507 std::forward<Args>(args)...);
508 new_task.record_end_event();
509 return new_task;
510 }
511
512 template <class T, class F, class... Args>
513 cs_task
514 parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&...args)
515 {
518 sum,
519 std::forward<F>(f),
520 std::forward<Args>(args)...);
521 new_task.record_end_event();
522 return new_task;
523 }
524
525 template <class T, class F, class... Args>
526 cs_task
528 cs_lnum_t n,
529 std::initializer_list<cs_event_ref> const &sync_events,
530 T &sum,
531 F &&f,
532 Args &&...args)
533 {
535 new_task.add_dependency(sync_events);
537 sum,
538 std::forward<F>(f),
539 std::forward<Args>(args)...);
540 new_task.record_end_event();
541 return new_task;
542 }
543
544 template <class T, class R, class F, class... Args>
545 cs_task
546 parallel_for_reduce(cs_lnum_t n, T &r, R &reducer, F &&f, Args &&...args)
547 {
549 new_task.get_context().parallel_for_reduce(n,
550 r,
551 reducer,
552 std::forward<F>(f),
553 std::forward<Args>(args)...);
554 new_task.record_end_event();
555 return new_task;
556 }
557
558 template <class T, class R, class F, class... Args>
559 cs_task
561 std::initializer_list<cs_event_ref> const &sync_events,
562 T &r,
563 R &reducer,
564 F &&f,
565 Args &&...args)
566 {
568 new_task.add_dependency(sync_events);
569 new_task.get_context().parallel_for_reduce(n,
570 r,
571 reducer,
572 std::forward<F>(f),
573 std::forward<Args>(args)...);
574 new_task.record_end_event();
575 return new_task;
576 }
577
580 template <class FunctionType, class... Args>
582 single_task(std::initializer_list<cs_event_ref> const &sync_events,
583 FunctionType &&host_function,
584 Args &&...args)
585 {
587 std::move(host_function),
589 new_task.add_dependency(sync_events);
590 new_task.launch(std::forward<Args>(args)...);
591 new_task.record_end_event();
592 return new_task;
593 }
594
596 template <class FunctionType, class... Args>
598 single_task(FunctionType &&host_function, Args &&...args)
599 {
601 std::move(host_function),
603 new_task.launch(std::forward<Args>(args)...);
604 new_task.record_end_event();
605 return new_task;
606 }
607};
608
609/*=============================================================================
610 * Global variable definitions
611 *============================================================================*/
612
613/*=============================================================================
614 * Public function prototypes
615 *============================================================================*/
616
619#if defined(__CUDACC__)
620 // cudaEventElapsedTime gives a time in milliseconds
621 // with a resolution of around 0.5 microseconds
622 std::chrono::microseconds;
623#else
624 std::chrono::steady_clock::duration;
625#endif
626
627/*----------------------------------------------------------------------------*/
636/*----------------------------------------------------------------------------*/
637
640 cs_event_ref end)
641{
642 start->wait();
643 end->wait();
644
645#if defined(__CUDACC__)
646 // cudaEventElapsedTime gives a time in milliseconds
647 // with a resolution of around 0.5 microseconds
648 float result_ms;
649 cudaEventElapsedTime(&result_ms, ~start, ~end);
650 return cs_event_duration{ long(result_ms * 1000.f) };
651#else
652 return ~end - ~start;
653#endif
654}
655
656/*----------------------------------------------------------------------------*/
666/*----------------------------------------------------------------------------*/
667
670{
671 return cs_elapsed_time(task.get_start_event(), task.get_end_event());
672}
673
674/*----------------------------------------------------------------------------*/
675
676#endif /* __cplusplus */
677
auto parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&... args)
Definition: cs_dispatch.h:1593
auto parallel_for_reduce(cs_lnum_t n, T &result, R &reducer, F &&f, Args &&... args)
Definition: cs_dispatch.h:1619
auto parallel_for_b_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:1551
auto parallel_for_i_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:1523
auto parallel_for(cs_lnum_t n, F &&f, Args &&... args)
Definition: cs_dispatch.h:1570
void wait(void)
Wait (synchronize) until launched computations have finished.
Definition: cs_dispatch.h:1635
Definition: cs_dispatch.h:1711
Definition: cs_dispatch_queue.h:422
cs_task parallel_for_i_faces(const M *m, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:458
cs_task parallel_for_i_faces(const M *m, std::initializer_list< cs_event_ref > const &sync_events, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:470
cs_host_task< FunctionType, std::remove_reference_t< Args >... > single_task(std::initializer_list< cs_event_ref > const &sync_events, FunctionType &&host_function, Args &&...args)
Definition: cs_dispatch_queue.h:582
cs_host_task< FunctionType, std::remove_reference_t< Args >... > single_task(FunctionType &&host_function, Args &&...args)
Initiates a single thread task that runs on the host.
Definition: cs_dispatch_queue.h:598
cs_task parallel_for_reduce(cs_lnum_t n, std::initializer_list< cs_event_ref > const &sync_events, T &r, R &reducer, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:560
cs_task parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:514
cs_task parallel_for(cs_lnum_t n, std::initializer_list< cs_event_ref > const &sync_events, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:442
cs_task parallel_for_reduce_sum(cs_lnum_t n, std::initializer_list< cs_event_ref > const &sync_events, T &sum, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:527
cs_task parallel_for_reduce(cs_lnum_t n, T &r, R &reducer, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:546
cs_task parallel_for_b_faces(const M *m, std::initializer_list< cs_event_ref > const &sync_events, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:498
cs_task parallel_for_b_faces(const M *m, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:486
cs_task parallel_for(cs_lnum_t n, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:430
cs_dispatch_context initializer_context
Context used to initialize tasks.
Definition: cs_dispatch_queue.h:426
Definition: cs_dispatch_queue.h:172
cs_event_ref & operator=(cs_event_ref const &) &=default
cs_event_ref & operator=(cs_event_ref &&) &=default
cs_event_ref(cs_event_ref const &other)=default
typename cs_event::underlying_type underlying_type
Definition: cs_dispatch_queue.h:176
cs_event * operator->()
Arrow operator to access members of the pointed event.
Definition: cs_dispatch_queue.h:191
underlying_type & operator~()
Definition: cs_dispatch_queue.h:206
cs_event_ref(cs_event &event)
Definition: cs_dispatch_queue.h:178
cs_event & operator*()
Dereference operator to access the pointed event.
Definition: cs_dispatch_queue.h:198
cs_event_ref(cs_event_ref &&other)=default
cs_event_ref()=delete
cs_host_task extends cs_task to add support for host function tasks.
Definition: cs_dispatch_queue.h:329
cs_host_task & operator=(cs_host_task &&)=default
~cs_host_task()
Wait for task termination.
Definition: cs_dispatch_queue.h:407
cs_host_task(cs_host_task const &)=delete
cs_host_task(FunctionType &&function, cs_dispatch_context context)
Definition: cs_dispatch_queue.h:359
std::tuple< Args... > args_tuple_t
Tuple type for argument storage.
Definition: cs_dispatch_queue.h:340
std::tuple< FunctionType > data_tuple_t
Definition: cs_dispatch_queue.h:348
cs_host_task & operator=(cs_host_task const &)=delete
cs_host_task(cs_host_task &&)=default
void launch(Args... args)
Definition: cs_dispatch_queue.h:377
Definition: cs_dispatch_queue.h:222
~cs_task()
Waits for task termination and destroys the associated CUDA stream.
Definition: cs_dispatch_queue.h:318
cs_task(cs_task &&)=default
cs_event_ref get_end_event()
Return a reference to the end event.
Definition: cs_dispatch_queue.h:312
cs_task(cs_dispatch_context context={})
Create a new task with a given context and initialize a new stream.
Definition: cs_dispatch_queue.h:241
cs_task(cs_task const &)=delete
cs_event_ref get_start_event()
Return a reference to the start event.
Definition: cs_dispatch_queue.h:305
cs_dispatch_context & get_context()
Return a reference to the context.
Definition: cs_dispatch_queue.h:298
cs_event_ref record_end_event()
Record an event from the task and return a cs_event_ref to it.
Definition: cs_dispatch_queue.h:283
void wait()
Wait for task completion.
Definition: cs_dispatch_queue.h:276
cs_task & operator=(cs_task &&)=default
void add_dependency(cs_event_ref event)
Add an event to wait for.
Definition: cs_dispatch_queue.h:255
cs_task & operator=(cs_task const &)=delete
void add_dependency(std::initializer_list< cs_event_ref > const &sync_events)
Definition: cs_dispatch_queue.h:265
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:350
std::chrono::steady_clock::duration cs_event_duration
Duration type for elapsed time between two events.
Definition: cs_dispatch_queue.h:624
cs_event_duration cs_elapsed_time(cs_event_ref start, cs_event_ref end)
Returns elapsed time between two events (in microseconds when built with CUDA; as a std::chrono::steady_clock::duration otherwise).
Definition: cs_dispatch_queue.h:639
static void sum(const cs_execution_context *ec, T &first, Vals &... values)
Sum values of a given datatype over a given communicator.
Definition: cs_parall.h:881
Definition: cs_dispatch_queue.h:83
cs_event(cs_event &&other)=default
~cs_event()=default
cs_event(cs_event const &other)=delete
Destructor.
underlying_type & operator~()
Return the underlying implementation.
Definition: cs_dispatch_queue.h:137
cs_event()
Constructor.
Definition: cs_dispatch_queue.h:95
cs_event & operator=(cs_event &&other)=default
void wait()
Wait upon completion.
Definition: cs_dispatch_queue.h:158
cs_event & operator=(cs_event const &)=delete
std::chrono::steady_clock::time_point underlying_type
Definition: cs_dispatch_queue.h:88
underlying_type event_impl
Definition: cs_dispatch_queue.h:91