code_saturne 9.1 — general documentation: source listing of cs_array_cuda.h.
1/*============================================================================
2 * Array handling utilities.
3 *============================================================================*/
4
5/*
6 This file is part of code_saturne, a general-purpose CFD tool.
7
8 Copyright (C) 1998-2025 EDF S.A.
9
10 This program is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free Software
12 Foundation; either version 2 of the License, or (at your option) any later
13 version.
14
15 This program is distributed in the hope that it will be useful, but WITHOUT
16 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18 details.
19
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22 Street, Fifth Floor, Boston, MA 02110-1301, USA.
23*/
24
25/*----------------------------------------------------------------------------*/
26
27/*----------------------------------------------------------------------------
28 * Standard C and C++ library headers
29 *----------------------------------------------------------------------------*/
30
31#include <string.h>
32
33/*----------------------------------------------------------------------------
34 * Local headers
35 *----------------------------------------------------------------------------*/
36
37#include "base/cs_defs.h"
38#include "base/cs_base_cuda.h"
39
40/*----------------------------------------------------------------------------*/
41
42/*=============================================================================
43 * Macro definitions
44 *============================================================================*/
45
46/*============================================================================
47 * Type definitions
48 *============================================================================*/
49
50/*============================================================================
51 * Global variables
52 *============================================================================*/
53
54/*=============================================================================
55 * Public inline function prototypes
56 *============================================================================*/
57
58/*=============================================================================
59 * Public function prototypes
60 *============================================================================*/
61
/* Default kernel that loops over an integer range and assigns a scalar
   reference value to every element of several arrays.
   This kernel uses a grid-size-stride loop and thus guarantees that all
   integers are processed, even if the grid is smaller.
   All arguments *must* be passed by value to avoid passing CPU references
   to the GPU. */

template <typename T, size_t stride>
__global__ void
cuda_kernel_set_value(cs_lnum_t    n,
                      const T      ref_val,
                      const int    size_arrs,
                      T          **array_ptrs)
{
  /* Grid-size-stride loop: each thread advances by the total thread count
     so all n elements are covered regardless of launch configuration. */
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    for (int j = 0; j < size_arrs; j++) {    /* each target array */
      for (size_t k = 0; k < stride; k++) {  /* each interlaced component */
        array_ptrs[j][id*stride + k] = ref_val;
      }
    }
  }
}
85
/* Variant of the multi-array set-value kernel where the reference value is
   passed as a pointer to (at least) stride components: component k of each
   element is set to ref_val[k].
   NOTE(review): ref_val is dereferenced on the device, so it is assumed to
   point to device-accessible (device or unified) memory — confirm with
   callers. */

template <typename T, size_t stride>
__global__ void
cuda_kernel_set_value(cs_lnum_t    n,
                      const T     *ref_val,
                      const int    size_arrs,
                      T          **array_ptrs)
{
  /* Grid-size-stride loop: covers all n elements for any grid size. */
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    for (int j = 0; j < size_arrs; j++) {    /* each target array */
      for (size_t k = 0; k < stride; k++) {  /* each interlaced component */
        array_ptrs[j][id*stride + k] = ref_val[k];
      }
    }
  }
}
103
/* Single-array variant: assign a scalar reference value to every element
   (and interlaced component) of one array, using a grid-size-stride loop. */

template <typename T, size_t stride>
__global__ void
cuda_kernel_set_value(cs_lnum_t   n,
                      const T     ref_val,
                      T          *array)
{
  /* Grid-size-stride loop: covers all n elements for any grid size. */
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    for (size_t k = 0; k < stride; k++) {  /* each interlaced component */
      array[id*stride + k] = ref_val;
    }
  }
}
118
/* Single-array variant with the reference value passed as a pointer to
   (at least) stride components: component k of each element is set to
   ref_val[k], consistent with the multi-array pointer overload above.
   (Previous code wrote *ref_val for every component, which with stride > 1
   silently ignored all but the first reference component.)
   NOTE(review): ref_val is dereferenced on the device, so it is assumed to
   point to device-accessible memory — confirm with callers. */

template <typename T, size_t stride>
__global__ void
cuda_kernel_set_value(cs_lnum_t    n,
                      const T     *ref_val,
                      T           *array)
{
  /* Grid-size-stride loop: covers all n elements for any grid size. */
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    for (size_t k = 0; k < stride; k++) {  /* each interlaced component */
      array[id*stride + k] = ref_val[k];
    }
  }
}
133
/* Zero all elements (and interlaced components) of an array, using a
   grid-size-stride loop. */

template <typename T, size_t stride>
__global__ void
cs_cuda_kernel_set_zero(cs_lnum_t   n,
                        T          *array)
{
  const T c = static_cast<T>(0);

  /* Grid-size-stride loop: covers all n elements for any grid size. */
  for (cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += blockDim.x * gridDim.x) {
    for (size_t k = 0; k < stride; k++) {  /* each interlaced component */
      array[id*stride + k] = c;
    }
  }
}
149
/*----------------------------------------------------------------------------*/
/*!
 * \brief Assign values to all elements of multiple arrays.
 *        ref_val is input as a pointer to (at least) stride values.
 *
 * \param[in]       stream   CUDA stream used for the operation
 * \param[in]       async    if false, synchronize the stream before
 *                           returning; note that the multi-array (graph)
 *                           path always synchronizes, so the graph is not
 *                           destroyed before all kernels are scheduled
 * \param[in]       n_elts   total number of elements to set
 * \param[in]       ref_val  pointer to the reference value(s);
 *                           assumed device-accessible (see kernel note)
 * \param[in, out]  arrays   device array pointers to set
 */
/*----------------------------------------------------------------------------*/

template <typename T, size_t stride, typename... Arrays>
void
cs_arrays_set_value(cudaStream_t stream,
                    bool async,
                    cs_lnum_t n_elts,
                    T *ref_val,
                    Arrays&&... arrays)
{
  /* Guard: a zero-sized launch (grid size 0) is an invalid configuration. */
  if (n_elts < 1)
    return;

  const long block_size_ = 256;
  const long grid_size_ = (n_elts + block_size_ - 1) / block_size_;

  /* Explicit expansion of the parameter pack into an array of pointers. */
  const int size_arrs = sizeof...(arrays);
  T* _array_ptrs[] = {arrays ...};

  if (size_arrs > 1) {
    cudaGraph_t graph;
    cudaGraphExec_t graph_exec = nullptr;

    /* Capture one kernel launch per array into a graph, then launch the
       whole graph at once. */
    CS_CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    for (int j = 0; j < size_arrs; j++) {
      cuda_kernel_set_value<T, stride><<<grid_size_, block_size_, 0, stream>>>
        (n_elts, ref_val, _array_ptrs[j]);
    }
    CS_CUDA_CHECK(cudaStreamEndCapture(stream, &graph));

    CS_CUDA_CHECK(cudaGraphInstantiate(&graph_exec, graph,
                                       nullptr, nullptr, 0));
    CS_CUDA_CHECK(cudaGraphLaunch(graph_exec, stream));

    /* For multiple arrays, sync even if async is required,
       to ensure the graph is not destroyed before all kernels
       are scheduled */

    CS_CUDA_CHECK(cudaStreamSynchronize(stream));
    CS_CUDA_CHECK(cudaGetLastError());

    /* Destroy the executable graph before its parent graph. */
    cudaGraphExecDestroy(graph_exec);
    cudaGraphDestroy(graph);
  }

  else {
    cuda_kernel_set_value<T, stride><<<grid_size_, block_size_, 0, stream>>>
      (n_elts, ref_val, _array_ptrs[0]);

    if (async == false) {
      CS_CUDA_CHECK(cudaStreamSynchronize(stream));
      CS_CUDA_CHECK(cudaGetLastError());
    }
  }
}
220
/*----------------------------------------------------------------------------*/
/*!
 * \brief Assign a scalar value to all elements of multiple arrays.
 *        ref_val is input by value.
 *
 * \param[in]       stream   CUDA stream used for the operation
 * \param[in]       async    if false, synchronize the stream before
 *                           returning; note that the multi-array (graph)
 *                           path always synchronizes, so the graph is not
 *                           destroyed before all kernels are scheduled
 * \param[in]       n_elts   total number of elements to set
 * \param[in]       ref_val  reference value to assign to each component
 * \param[in, out]  arrays   device array pointers to set
 */
/*----------------------------------------------------------------------------*/

template <typename T, size_t stride, typename... Arrays>
void
cs_arrays_set_value(cudaStream_t stream,
                    bool async,
                    cs_lnum_t n_elts,
                    T ref_val,
                    Arrays&&... arrays)
{
  /* Guard: a zero-sized launch (grid size 0) is an invalid configuration. */
  if (n_elts < 1)
    return;

  const long block_size_ = 256;
  const long grid_size_ = (n_elts + block_size_ - 1) / block_size_;

  /* Explicit expansion of the parameter pack into an array of pointers. */
  const int size_arrs = sizeof...(arrays);
  T* _array_ptrs[] = {arrays ...};

  if (size_arrs > 1) {
    cudaGraph_t graph;
    cudaGraphExec_t graph_exec = nullptr;

    /* Capture one kernel launch per array into a graph, then launch the
       whole graph at once. */
    CS_CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

    for (int j = 0; j < size_arrs; j++) {
      cuda_kernel_set_value<T, stride><<<grid_size_, block_size_, 0, stream>>>
        (n_elts, ref_val, _array_ptrs[j]);
    }
    CS_CUDA_CHECK(cudaStreamEndCapture(stream, &graph));

    CS_CUDA_CHECK(cudaGraphInstantiate(&graph_exec, graph,
                                       nullptr, nullptr, 0));
    CS_CUDA_CHECK(cudaGraphLaunch(graph_exec, stream));

    /* For multiple arrays, sync even if async is required,
       to ensure the graph is not destroyed before all kernels
       are scheduled */

    CS_CUDA_CHECK(cudaStreamSynchronize(stream));
    CS_CUDA_CHECK(cudaGetLastError());

    /* Destroy the executable graph before its parent graph. */
    cudaGraphExecDestroy(graph_exec);
    cudaGraphDestroy(graph);
  }

  else {
    cuda_kernel_set_value<T, stride><<<grid_size_, block_size_, 0, stream>>>
      (n_elts, ref_val, _array_ptrs[0]);

    if (async == false) {
      CS_CUDA_CHECK(cudaStreamSynchronize(stream));
      CS_CUDA_CHECK(cudaGetLastError());
    }
  }
}
292
/*----------------------------------------------------------------------------*/
/*!
 * \brief Set all elements of multiple arrays to zero.
 *
 * \param[in]       stream   CUDA stream used for the operation
 * \param[in]       async    if false, synchronize the stream before
 *                           returning; note that the multi-array (graph)
 *                           path always synchronizes, so the graph is not
 *                           destroyed before all kernels are scheduled
 * \param[in]       n_elts   total number of elements to set
 * \param[in, out]  arrays   device array pointers to zero
 */
/*----------------------------------------------------------------------------*/

template <typename T, size_t stride, typename... Arrays>
void
cs_arrays_set_zero(cudaStream_t stream,
                   bool async,
                   cs_lnum_t n_elts,
                   Arrays&&... arrays)
{
  /* Guard: a zero-sized launch (grid size 0) is an invalid configuration. */
  if (n_elts < 1)
    return;

  const long block_size_ = 256;
  const long grid_size_ = (n_elts + block_size_ - 1) / block_size_;

  /* Explicit expansion of the parameter pack into an array of pointers. */
  const int size_arrs = sizeof...(arrays);
  T* _array_ptrs[] = {arrays ...};

  if (size_arrs > 1) {
    cudaGraph_t graph;
    cudaGraphExec_t graph_exec = nullptr;

    /* Capture one kernel launch per array into a graph, then launch the
       whole graph at once. */
    CS_CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    for (int j = 0; j < size_arrs; j++) {
      cs_cuda_kernel_set_zero<T, stride><<<grid_size_, block_size_, 0, stream>>>
        (n_elts, _array_ptrs[j]);
    }
    CS_CUDA_CHECK(cudaStreamEndCapture(stream, &graph));

    CS_CUDA_CHECK(cudaGraphInstantiate(&graph_exec, graph,
                                       nullptr, nullptr, 0));
    CS_CUDA_CHECK(cudaGraphLaunch(graph_exec, stream));

    /* For multiple arrays, sync even if async is required,
       to ensure the graph is not destroyed before all kernels
       are scheduled */

    CS_CUDA_CHECK(cudaStreamSynchronize(stream));
    CS_CUDA_CHECK(cudaGetLastError());

    /* Destroy the executable graph before its parent graph. */
    cudaGraphExecDestroy(graph_exec);
    cudaGraphDestroy(graph);
  }

  else {
    cs_cuda_kernel_set_zero<T, stride><<<grid_size_, block_size_, 0, stream>>>
      (n_elts, _array_ptrs[0]);

    if (async == false) {
      CS_CUDA_CHECK(cudaStreamSynchronize(stream));
      CS_CUDA_CHECK(cudaGetLastError());
    }
  }
}
362
/*----------------------------------------------------------------------------*/
/*!
 * \brief Copy values from one device array to another of the same size.
 *
 * The copy is asynchronous on the given stream; the caller is responsible
 * for synchronizing the stream before reading dest.
 *
 * \param[in]   stream  CUDA stream used for the operation
 * \param[in]   size    number of elements (of type T) to copy
 * \param[in]   src     source device array
 * \param[out]  dest    destination device array
 */
/*----------------------------------------------------------------------------*/

template <typename T>
void
cs_array_copy(cudaStream_t stream,
              const cs_lnum_t size,
              const T* src,
              T* dest)
{
  /* Check the return status: an unchecked failure here would surface only
     as a mysterious error in a later CUDA call. */
  CS_CUDA_CHECK(cudaMemcpyAsync(dest, src, size*sizeof(T),
                                cudaMemcpyDeviceToDevice, stream));
}
387
388/*----------------------------------------------------------------------------*/
void cs_array_copy(cudaStream_t stream, const cs_lnum_t size, const T *src, T *dest)
Copy values from an array to another of the same dimensions.
Definition: cs_array_cuda.h:379
__global__ void cs_cuda_kernel_set_zero(cs_lnum_t n, T *array)
Definition: cs_array_cuda.h:136
void cs_arrays_set_zero(cudaStream_t stream, bool async, cs_lnum_t n_elts, Arrays &&... arrays)
Assign values to all elements of multiple arrays. ref_val is input as a pointer or an array.
Definition: cs_array_cuda.h:313
__global__ void cuda_kernel_set_value(cs_lnum_t n, const T ref_val, const int size_arrs, T **array_ptrs)
Definition: cs_array_cuda.h:70
void cs_arrays_set_value(cudaStream_t stream, bool async, cs_lnum_t n_elts, T *ref_val, Arrays &&... arrays)
Assign values to all elements of multiple arrays. ref_val is input as a pointer or an array.
Definition: cs_array_cuda.h:170
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:350
k — enumerator; definition: cs_field_pointer.h:72