41#include <initializer_list>
46#if defined(__CUDACC__)
48#include <cuda_runtime.h>
66#ifndef CS_DISPATCH_QUEUE_FORCE_SYNC
67#define CS_DISPATCH_QUEUE_FORCE_SYNC 0
85#if defined(__CUDACC__)
88 std::chrono::steady_clock::time_point;
97#if defined(__CUDACC__)
109#if defined(__CUDACC__)
112 other.event_impl =
nullptr;
120#if defined(__CUDACC__)
127 other.event_impl =
nullptr;
143#if defined(__CUDACC__)
160#if defined(__CUDACC__)
208 return ~(*event_ptr);
243#if defined(__CUDACC__)
244 cudaStream_t new_stream;
245 cudaStreamCreate(&new_stream);
246 context_.set_cuda_stream(new_stream);
247 cudaEventRecord(~start_event, context_.cuda_stream());
249 ~start_event = std::chrono::steady_clock::now();
257#if defined(__CUDACC__)
258 cudaStreamWaitEvent(context_.cuda_stream(), ~event);
267#if defined(__CUDACC__)
268 for (
auto const &event : sync_events) {
285#if defined(__CUDACC__)
286 cudaEventRecord(~end_event, context_.cuda_stream());
288 ~end_event = std::chrono::steady_clock::now();
290 return { end_event };
321#if defined(__CUDACC__)
322 cudaStreamDestroy(context_.cuda_stream());
328template <
class FunctionType,
class... Args>
345#if defined(__CUDACC__)
346 std::tuple<FunctionType, args_tuple_t>;
348 std::tuple<FunctionType>;
362#if defined(__CUDACC__)
365 data_tuple_(std::move(function))
372#if defined(__CUDACC__)
379#if defined(__CUDACC__)
382 std::get<1>(data_tuple_) =
args_tuple_t{ std::move(args)... };
385 return cudaLaunchHostFunc
389 [](
void *data_tuple_ptr) ->
void {
390 auto &[f, args_tuple] = *(
data_tuple_t *)(data_tuple_ptr);
391 std::apply(f, args_tuple);
398 std::get<0>(data_tuple_)(args...);
402 std::get<0>(data_tuple_)(args...);
428 template <
class F,
class... Args>
435 std::forward<Args>(args)...);
440 template <
class F,
class... Args>
443 std::initializer_list<cs_event_ref>
const &sync_events,
451 std::forward<Args>(args)...);
456 template <
class M,
class F,
class... Args>
463 std::forward<Args>(args)...);
468 template <
class M,
class F,
class... Args>
471 std::initializer_list<cs_event_ref>
const &sync_events,
479 std::forward<Args>(args)...);
484 template <
class M,
class F,
class... Args>
491 std::forward<Args>(args)...);
496 template <
class M,
class F,
class... Args>
499 std::initializer_list<cs_event_ref>
const &sync_events,
507 std::forward<Args>(args)...);
512 template <
class T,
class F,
class... Args>
520 std::forward<Args>(args)...);
525 template <
class T,
class F,
class... Args>
529 std::initializer_list<cs_event_ref>
const &sync_events,
539 std::forward<Args>(args)...);
544 template <
class T,
class R,
class F,
class... Args>
553 std::forward<Args>(args)...);
558 template <
class T,
class R,
class F,
class... Args>
561 std::initializer_list<cs_event_ref>
const &sync_events,
573 std::forward<Args>(args)...);
580 template <
class FunctionType,
class... Args>
582 single_task(std::initializer_list<cs_event_ref>
const &sync_events,
583 FunctionType &&host_function,
587 std::move(host_function),
589 new_task.add_dependency(sync_events);
590 new_task.launch(std::forward<Args>(args)...);
591 new_task.record_end_event();
596 template <
class FunctionType,
class... Args>
601 std::move(host_function),
603 new_task.launch(std::forward<Args>(args)...);
604 new_task.record_end_event();
619#if defined(__CUDACC__)
622 std::chrono::microseconds;
624 std::chrono::steady_clock::duration;
645#if defined(__CUDACC__)
649 cudaEventElapsedTime(&result_ms, ~start, ~end);
652 return ~end - ~start;
auto parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&... args)
Definition: cs_dispatch.h:1593
auto parallel_for_reduce(cs_lnum_t n, T &result, R &reducer, F &&f, Args &&... args)
Definition: cs_dispatch.h:1619
auto parallel_for_b_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:1551
auto parallel_for_i_faces(const M *m, F &&f, Args &&... args)
Definition: cs_dispatch.h:1523
auto parallel_for(cs_lnum_t n, F &&f, Args &&... args)
Definition: cs_dispatch.h:1570
void wait(void)
Wait (synchronize) until launched computations have finished.
Definition: cs_dispatch.h:1635
Definition: cs_dispatch.h:1711
Definition: cs_dispatch_queue.h:422
cs_task parallel_for_i_faces(const M *m, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:458
cs_task parallel_for_i_faces(const M *m, std::initializer_list< cs_event_ref > const &sync_events, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:470
cs_host_task< FunctionType, std::remove_reference_t< Args >... > single_task(std::initializer_list< cs_event_ref > const &sync_events, FunctionType &&host_function, Args &&...args)
Definition: cs_dispatch_queue.h:582
cs_host_task< FunctionType, std::remove_reference_t< Args >... > single_task(FunctionType &&host_function, Args &&...args)
Initiates a single thread task that runs on the host.
Definition: cs_dispatch_queue.h:598
cs_task parallel_for_reduce(cs_lnum_t n, std::initializer_list< cs_event_ref > const &sync_events, T &r, R &reducer, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:560
cs_task parallel_for_reduce_sum(cs_lnum_t n, T &sum, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:514
cs_task parallel_for(cs_lnum_t n, std::initializer_list< cs_event_ref > const &sync_events, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:442
cs_task parallel_for_reduce_sum(cs_lnum_t n, std::initializer_list< cs_event_ref > const &sync_events, T &sum, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:527
cs_task parallel_for_reduce(cs_lnum_t n, T &r, R &reducer, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:546
cs_task parallel_for_b_faces(const M *m, std::initializer_list< cs_event_ref > const &sync_events, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:498
cs_task parallel_for_b_faces(const M *m, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:486
cs_task parallel_for(cs_lnum_t n, F &&f, Args &&...args)
Definition: cs_dispatch_queue.h:430
cs_dispatch_context initializer_context
Context used to initialize tasks.
Definition: cs_dispatch_queue.h:426
Definition: cs_dispatch_queue.h:172
cs_event_ref & operator=(cs_event_ref const &) &=default
cs_event_ref & operator=(cs_event_ref &&) &=default
cs_event_ref(cs_event_ref const &other)=default
typename cs_event::underlying_type underlying_type
Definition: cs_dispatch_queue.h:176
cs_event * operator->()
Arrow operator to access members of the pointed event.
Definition: cs_dispatch_queue.h:191
underlying_type & operator~()
Definition: cs_dispatch_queue.h:206
cs_event_ref(cs_event &event)
Definition: cs_dispatch_queue.h:178
cs_event & operator*()
Dereference operator to access the pointed event.
Definition: cs_dispatch_queue.h:198
cs_event_ref(cs_event_ref &&other)=default
cs_host_task extends cs_task to add support for host function tasks.
Definition: cs_dispatch_queue.h:329
cs_host_task & operator=(cs_host_task &&)=default
~cs_host_task()
Wait for task termination.
Definition: cs_dispatch_queue.h:407
cs_host_task(cs_host_task const &)=delete
cs_host_task(FunctionType &&function, cs_dispatch_context context)
Definition: cs_dispatch_queue.h:359
std::tuple< Args... > args_tuple_t
Tuple type for argument storage.
Definition: cs_dispatch_queue.h:340
std::tuple< FunctionType > data_tuple_t
Definition: cs_dispatch_queue.h:348
cs_host_task & operator=(cs_host_task const &)=delete
cs_host_task(cs_host_task &&)=default
void launch(Args... args)
Definition: cs_dispatch_queue.h:377
Definition: cs_dispatch_queue.h:222
~cs_task()
Waits for task termination and destroys the associated CUDA stream.
Definition: cs_dispatch_queue.h:318
cs_task(cs_task &&)=default
cs_event_ref get_end_event()
Return a reference to the end event.
Definition: cs_dispatch_queue.h:312
cs_task(cs_dispatch_context context={})
Create a new task with a given context and initialize a new stream.
Definition: cs_dispatch_queue.h:241
cs_task(cs_task const &)=delete
cs_event_ref get_start_event()
Return a reference to the start event.
Definition: cs_dispatch_queue.h:305
cs_dispatch_context & get_context()
Return a reference to the context.
Definition: cs_dispatch_queue.h:298
cs_event_ref record_end_event()
Record an event from the task and return a cs_event_ref to it.
Definition: cs_dispatch_queue.h:283
void wait()
Wait for task completion.
Definition: cs_dispatch_queue.h:276
cs_task & operator=(cs_task &&)=default
void add_dependency(cs_event_ref event)
Add an event to wait for.
Definition: cs_dispatch_queue.h:255
cs_task & operator=(cs_task const &)=delete
void add_dependency(std::initializer_list< cs_event_ref > const &sync_events)
Definition: cs_dispatch_queue.h:265
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:350
std::chrono::steady_clock::duration cs_event_duration
Duration type for elapsed time between two events.
Definition: cs_dispatch_queue.h:624
cs_event_duration cs_elapsed_time(cs_event_ref start, cs_event_ref end)
Returns the elapsed time between two events as a cs_event_duration (std::chrono::microseconds in CUDA builds, std::chrono::steady_clock::duration otherwise).
Definition: cs_dispatch_queue.h:639
static void sum(const cs_execution_context *ec, T &first, Vals &... values)
Sum values of a given datatype over a given communicator.
Definition: cs_parall.h:881
Definition: cs_dispatch_queue.h:83
cs_event(cs_event &&other)=default
cs_event(cs_event const &other)=delete
Destructor.
underlying_type & operator~()
Return the underlying implementation.
Definition: cs_dispatch_queue.h:137
cs_event()
Constructor.
Definition: cs_dispatch_queue.h:95
cs_event & operator=(cs_event &&other)=default
void wait()
Wait upon completion.
Definition: cs_dispatch_queue.h:158
cs_event & operator=(cs_event const &)=delete
std::chrono::steady_clock::time_point underlying_type
Definition: cs_dispatch_queue.h:88
underlying_type event_impl
Definition: cs_dispatch_queue.h:91