68template <
typename T,
size_t str
ide>
76 for (
cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x;
id < n;
77 id += blockDim.x * gridDim.x) {
78 for (
int j = 0; j < size_arrs; j++) {
79 for (
size_t k = 0;
k < stride;
k++) {
80 array_ptrs[j][
id*stride +
k] = ref_val;
86template <
typename T,
size_t str
ide>
94 for (
cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x;
id < n;
95 id += blockDim.x * gridDim.x) {
96 for (
int j = 0; j < size_arrs; j++) {
97 for (
size_t k = 0;
k < stride;
k++) {
98 array_ptrs[j][
id*stride +
k] = ref_val[
k];
104template <
typename T,
size_t str
ide>
111 for (
cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x;
id < n;
112 id += blockDim.x * gridDim.x) {
113 for (
size_t k = 0;
k < stride;
k++) {
114 array[
id*stride +
k] = ref_val;
119template <
typename T,
size_t str
ide>
126 for (
cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x;
id < n;
127 id += blockDim.x * gridDim.x) {
128 for (
size_t k = 0;
k < stride;
k++) {
129 array[
id*stride +
k] = *ref_val;
134template <
typename T,
size_t str
ide>
139 T c =
static_cast<T
>(0);
142 for (
cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x;
id < n;
143 id += blockDim.x * gridDim.x) {
144 for (
size_t k = 0;
k < stride;
k++) {
145 array[
id*stride +
k] = c;
168template <
typename T,
size_t stride,
typename... Arrays>
176 const long block_size_ = 256;
177 const long grid_size_ = (n_elts % block_size_) ?
178 n_elts/block_size_ + 1 : n_elts/block_size_;
181 const int size_arrs =
sizeof...(arrays);
182 T* _array_ptrs[] = {arrays ...};
186 cudaGraphExec_t graph_exec = NULL;
188 cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
189 for (
int j = 0; j < size_arrs; j++) {
190 cuda_kernel_set_value<T, stride><<<grid_size_, block_size_, 0, stream>>>
191 (n_elts, ref_val, _array_ptrs[j]);
193 cudaStreamEndCapture(stream, &graph);
195 cudaError_t status = cudaGraphInstantiate(&graph_exec, graph,
196 nullptr,
nullptr, 0);
197 cudaGraphLaunch(graph_exec, stream);
203 CS_CUDA_CHECK(cudaStreamSynchronize(stream));
204 CS_CUDA_CHECK(cudaGetLastError());
206 cudaGraphDestroy(graph);
207 cudaGraphExecDestroy(graph_exec);
211 cuda_kernel_set_value<T, stride><<<grid_size_, block_size_, 0, stream>>>
212 (n_elts, ref_val, _array_ptrs[0]);
214 if (async ==
false) {
215 CS_CUDA_CHECK(cudaStreamSynchronize(stream));
216 CS_CUDA_CHECK(cudaGetLastError());
239template <
typename T,
size_t stride,
typename... Arrays>
247 const long block_size_ = 256;
248 const long grid_size_ = (n_elts % block_size_) ?
249 n_elts/block_size_ + 1 : n_elts/block_size_;
252 const int size_arrs =
sizeof...(arrays);
253 T* _array_ptrs[] = {arrays ...};
257 cudaGraphExec_t graph_exec = NULL;
259 cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
261 for (
int j = 0; j < size_arrs; j++) {
262 cuda_kernel_set_value<T, stride><<<grid_size_, block_size_, 0, stream>>>
263 (n_elts, ref_val, _array_ptrs[j]);
265 cudaStreamEndCapture(stream, &graph);
267 cudaError_t status = cudaGraphInstantiate(&graph_exec, graph,
268 nullptr,
nullptr, 0);
269 cudaGraphLaunch(graph_exec, stream);
275 CS_CUDA_CHECK(cudaStreamSynchronize(stream));
276 CS_CUDA_CHECK(cudaGetLastError());
278 cudaGraphDestroy(graph);
279 cudaGraphExecDestroy(graph_exec);
283 cuda_kernel_set_value<T, stride><<<grid_size_, block_size_, 0, stream>>>
284 (n_elts, ref_val, _array_ptrs[0]);
286 if (async ==
false) {
287 CS_CUDA_CHECK(cudaStreamSynchronize(stream));
288 CS_CUDA_CHECK(cudaGetLastError());
311template <
typename T,
size_t stride,
typename... Arrays>
318 const long block_size_ = 256;
319 const long grid_size_ = (n_elts % block_size_) ?
320 n_elts/block_size_ + 1 : n_elts/block_size_;
323 const int size_arrs =
sizeof...(arrays);
324 T* _array_ptrs[] = {arrays ...};
328 cudaGraphExec_t graph_exec = NULL;
330 cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
331 for (
int j = 0; j < size_arrs; j++) {
332 cs_cuda_kernel_set_zero<T, stride><<<grid_size_, block_size_, 0, stream>>>
333 (n_elts, _array_ptrs[j]);
335 cudaStreamEndCapture(stream, &graph);
337 cudaError_t status = cudaGraphInstantiate(&graph_exec, graph,
338 nullptr,
nullptr, 0);
339 cudaGraphLaunch(graph_exec, stream);
345 CS_CUDA_CHECK(cudaStreamSynchronize(stream));
346 CS_CUDA_CHECK(cudaGetLastError());
348 cudaGraphDestroy(graph);
349 cudaGraphExecDestroy(graph_exec);
353 cs_cuda_kernel_set_zero<T, stride><<<grid_size_, block_size_, 0, stream>>>
354 (n_elts, _array_ptrs[0]);
356 if (async ==
false) {
357 CS_CUDA_CHECK(cudaStreamSynchronize(stream));
358 CS_CUDA_CHECK(cudaGetLastError());
384 cudaMemcpyAsync(dest, src, size*
sizeof(T),
385 cudaMemcpyDeviceToDevice, stream);
void cs_array_copy(cudaStream_t stream, const cs_lnum_t size, const T *src, T *dest)
Copy values from an array to another of the same dimensions.
Definition: cs_array_cuda.h:379
__global__ void cs_cuda_kernel_set_zero(cs_lnum_t n, T *array)
Definition: cs_array_cuda.h:136
void cs_arrays_set_zero(cudaStream_t stream, bool async, cs_lnum_t n_elts, Arrays &&... arrays)
Assign zero to all elements of multiple arrays.
Definition: cs_array_cuda.h:313
__global__ void cuda_kernel_set_value(cs_lnum_t n, const T ref_val, const int size_arrs, T **array_ptrs)
Definition: cs_array_cuda.h:70
void cs_arrays_set_value(cudaStream_t stream, bool async, cs_lnum_t n_elts, T *ref_val, Arrays &&... arrays)
Assign values to all elements of multiple arrays. ref_val is input as a pointer or an array.
Definition: cs_array_cuda.h:170
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:350
@ k
Definition: cs_field_pointer.h:72