9.1
general documentation
cs_base_cuda.h
Go to the documentation of this file.
1#ifndef __CS_BASE_CUDA_H__
2#define __CS_BASE_CUDA_H__
3
4/*============================================================================
5 * Definitions, global variables, and base functions for CUDA
6 *============================================================================*/
7
8/*
9 This file is part of code_saturne, a general-purpose CFD tool.
10
11 Copyright (C) 1998-2025 EDF S.A.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; either version 2 of the License, or (at your option) any later
16 version.
17
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21 details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25 Street, Fifth Floor, Boston, MA 02110-1301, USA.
26*/
27
28/*----------------------------------------------------------------------------*/
29
30#include "base/cs_defs.h"
31
32#if defined(HAVE_CUDA)
33
34/*----------------------------------------------------------------------------
35 * Standard C library headers
36 *----------------------------------------------------------------------------*/
37
38#if defined(HAVE_NCCL)
39#include <nccl.h>
40#endif
41
42/*----------------------------------------------------------------------------
43 * Local headers
44 *----------------------------------------------------------------------------*/
45
46#include "base/cs_base_accel.h"
47#include "base/cs_log.h"
48
49/*=============================================================================
50 * Macro definitions
51 *============================================================================*/
52
/* Check the return code of a CUDA runtime call, aborting through
   bft_error() with the failing call's text, file and line on error.

   Wrapped in do { ... } while (0) so the macro expands to exactly one
   statement: the previous bare-block form left a stray empty statement
   after the trailing semicolon, which breaks unbraced usage such as
   "if (cond) CS_CUDA_CHECK(a); else ...". */

#define CS_CUDA_CHECK(a) do { \
    cudaError_t _l_ret_code = a; \
    if (cudaSuccess != _l_ret_code) { \
      bft_error(__FILE__, __LINE__, 0, "[CUDA error] %d: %s\n running: %s", \
                _l_ret_code, ::cudaGetErrorString(_l_ret_code), #a); \
    } \
  } while (0)
60
/* Variant of CS_CUDA_CHECK where the caller supplies the file name and
   line number to report (useful when the check is itself wrapped in a
   helper and the original call site should be reported).

   Wrapped in do { ... } while (0) so the macro expands to exactly one
   statement and remains safe in unbraced if/else branches. */

#define CS_CUDA_CHECK_CALL(a, file_name, line_num) do { \
    cudaError_t _l_ret_code = a; \
    if (cudaSuccess != _l_ret_code) { \
      bft_error(file_name, line_num, 0, "[CUDA error] %d: %s\n running: %s", \
                _l_ret_code, ::cudaGetErrorString(_l_ret_code), #a); \
    } \
  } while (0)
68
69/* For all current compute capabilities, the warp size is 32; If it ever
70 changes, it can be obtained through cudaDeviceProp, so we could then
71 replace this macro with a global variable */
72
73#define CS_CUDA_WARP_SIZE 32
74
75/*----------------------------------------------------------------------------*/
76
78
79/*============================================================================
80 * Type definitions
81 *============================================================================*/
82
83/*=============================================================================
84 * Global variable definitions
85 *============================================================================*/
86
87extern int cs_glob_cuda_device_id;
88
89/* Other device parameters */
90
91extern int cs_glob_cuda_shared_mem_per_block;
92extern int cs_glob_cuda_max_threads_per_block;
93extern int cs_glob_cuda_max_block_size;
94extern int cs_glob_cuda_max_blocks;
95extern int cs_glob_cuda_n_mp; /* Number of multiprocessors */
96
97#if defined(HAVE_NCCL)
98
99extern ncclComm_t cs_glob_nccl_comm;
100
101/* NCCL Datatypes associated with code_saturne datatypes */
102extern ncclDataType_t cs_datatype_to_nccl[];
103
104#endif
105
106/* Allow graphs for kernel launches ? May interfere with profiling (nsys),
107 so can be deactivated. */
108
109extern bool cs_glob_cuda_allow_graph;
110
111/*============================================================================
112 * Semi-private function prototypes
113 *
114 * The following functions are intended to be used by the common
115 * host-device memory management functions from cs_base_accel.c, and
116 * not directly by the user.
117 *============================================================================*/
118
119/*----------------------------------------------------------------------------*/
120/*
121 * \brief Copy data from host to device.
122 *
123 * This is simply a wrapper over cudaMemcpy.
124 *
125 * A safety check is added.
126 *
127 * \param [out] dst pointer to destination data
128 * \param [in] src pointer to source data
129 * \param [in] size size of data to copy
130 */
131/*----------------------------------------------------------------------------*/
132
133void
134cs_cuda_copy_h2d(void *dst,
135 const void *src,
136 size_t size);
137
138/*----------------------------------------------------------------------------*/
139/*
140 * \brief Copy data from host to device, possibly returning on the host
141 * before the copy is finished.
142 *
143 * This is simply a wrapper over cudaMemcpyAsync.
144 *
145 * A safety check is added.
146 *
147 * \param [out] dst pointer to destination data
148 * \param [in] src pointer to source data
 149 * \param [in] size size of data to copy
152 */
153/*----------------------------------------------------------------------------*/
154
155void
156cs_cuda_copy_h2d_async(void *dst,
157 const void *src,
158 size_t size);
159
160/*----------------------------------------------------------------------------*/
161/*
162 * \brief Copy data from device to host.
163 *
164 * This is simply a wrapper over cudaMemcpy.
165 *
166 * A safety check is added.
167 *
168 * \param [out] dst pointer to destination data
169 * \param [in] src pointer to source data
 170 * \param [in] size size of data to copy
173 */
174/*----------------------------------------------------------------------------*/
175
176void
177cs_cuda_copy_d2h(void *dst,
178 const void *src,
179 size_t size);
180
181/*----------------------------------------------------------------------------*/
182/*
 183 * \brief Copy data from device to host, possibly returning on the host
 184 *        before the copy is finished.
 185 *
 186 * This is simply a wrapper over cudaMemcpyAsync.
 187 *
 188 * A safety check is added.
 189 *
 190 * \param [out] dst pointer to destination data
 191 * \param [in] src pointer to source data
 192 * \param [in] size size of data to copy
194 */
195/*----------------------------------------------------------------------------*/
196
197void
198cs_cuda_copy_d2h_async(void *dst,
199 const void *src,
200 size_t size);
201
202/*----------------------------------------------------------------------------*/
203/*
204 * \brief Copy data from device to device.
205 *
206 * This is simply a wrapper over cudaMemcpy.
207 *
208 * A safety check is added.
209 *
210 * \param [out] dst pointer to destination data
211 * \param [in] src pointer to source data
212 * \param [in] size size of data to copy
213 */
214/*----------------------------------------------------------------------------*/
215
216void
217cs_cuda_copy_d2d(void *dst,
218 const void *src,
219 size_t size);
220
221/*----------------------------------------------------------------------------*/
222/*
223 * \brief Get host pointer for a managed or device pointer.
224 *
225 * This function can be called with a pointer inside an allocated block of
226 * memory, so is not restricted to values returned by CS_MALLOC_HD.
227 *
228 * This makes it possible to check whether a pointer to an array inside
229 * a larger array is shared or accessible from the device only
230 * (for example when grouping allocations).
231 *
232 * \param [in] ptr pointer to device data
233 *
234 * \return pointer to host data if shared or mapped at the CUDA level,
235 * NULL otherwise.
236 */
237/*----------------------------------------------------------------------------*/
238
239void *
240cs_cuda_get_host_ptr(const void *ptr);
241
242/*=============================================================================
243 * Inline function prototypes
244 *============================================================================*/
245
/*----------------------------------------------------------------------------*/
/*
 * \brief Compute 1-d grid size (number of blocks) so that a kernel launch
 *        with the given block size covers at least n elements.
 *
 * \param [in]  n           number of elements to handle
 * \param [in]  block_size  number of threads per block
 *
 * \returns number of blocks (n / block_size, rounded up)
 */
/*----------------------------------------------------------------------------*/

static inline unsigned int
cs_cuda_grid_size(cs_lnum_t     n,
                  unsigned int  block_size)
{
  unsigned int n_blocks = n/block_size;
  if (n % block_size)  /* partial trailing block needed */
    n_blocks += 1;
  return n_blocks;
}
267
269
270#if defined(__NVCC__)
271
/*----------------------------------------------------------------------------
 * Synchronize or copy an array of a given type from the host to a device.
 *
 * If the host array is in a shared or device-mapped allocation mode, only
 * a host-to-device synchronization is done when required; otherwise, a
 * temporary device buffer is allocated and the data copied into it.
 *
 * parameters:
 *   val_h     <-- pointer to host data
 *   n_vals    <-- number of data values
 *   device_id <-- associated device id (currently unused in this function)
 *   stream    <-- associated stream (for async prefetch only; currently
 *                 unused in this function)
 *   val_d     --> matching pointer on device
 *   buf_d     --> matching allocation pointer on device (should be freed
 *                 after use if non-null)
 *----------------------------------------------------------------------------*/

template <typename T>
void
cs_sync_or_copy_h2d(const T       *val_h,
                    cs_lnum_t      n_vals,
                    int            device_id,
                    cudaStream_t   stream,
                    const T      **val_d,
                    void         **buf_d)
{
  *val_d = NULL;
  *buf_d = NULL;

  if (val_h == NULL)
    return;

  const size_t n_bytes = n_vals * sizeof(T);

  cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h);

  if (alloc_mode == CS_ALLOC_HOST) {
    /* Host-only allocation: stage through a temporary device buffer,
       which the caller must free (returned through buf_d). */
    void *tmp_d = NULL;
    CS_CUDA_CHECK(cudaMalloc(&tmp_d, n_bytes));
    cs_cuda_copy_h2d(tmp_d, val_h, n_bytes);
    *buf_d = tmp_d;
    *val_d = (const T *)tmp_d;
  }
  else {
    *val_d = (const T *)cs_get_device_ptr_const((const void *)val_h);
    /* Shared (managed) allocations need no explicit synchronization. */
    if (alloc_mode < CS_ALLOC_HOST_DEVICE_SHARED)
      cs_sync_h2d(val_h);
  }
}
319
320/*=============================================================================
321 * Public function prototypes
322 *============================================================================*/
323
324/*----------------------------------------------------------------------------*/
325/*
326 * \brief Return stream handle from stream pool.
327 *
328 * If the requested stream id is higher than the current number of streams,
329 * one or more new streams will be created, so that size of the stream pool
330 * matches at least stream_id+1.
331 *
332 * By default, the first stream (with id 0) will be used for most operations,
333 * while stream id 1 will be used for operations which can be done
334 * concurrently, such as memory prefetching.
335 *
336 * Additional streams can be used for independent tasks, though opportunities
337 * for this are limited in the current code (this would probably also require
338 * associating different MPI communicators with each task).
339 *
340 * \param [in] stream_id id or requested stream
341 *
342 * \returns handle to requested stream
343 */
344/*----------------------------------------------------------------------------*/
345
346cudaStream_t
347cs_cuda_get_stream(int stream_id);
348
349/*----------------------------------------------------------------------------*/
350/*
351 * \brief Return stream handle used for prefetching.
352 *
353 * By default, a single stream is created specifically for prefetching.
354 *
355 * \returns handle to prefetching stream
356 */
357/*----------------------------------------------------------------------------*/
358
359cudaStream_t
360cs_cuda_get_stream_prefetch(void);
361
362/*----------------------------------------------------------------------------*/
372/*----------------------------------------------------------------------------*/
373
374int
375cs_cuda_get_stream_id(cudaStream_t stream);
376
377/*----------------------------------------------------------------------------*/
392/*----------------------------------------------------------------------------*/
393
394void
395cs_cuda_get_2_stage_reduce_buffers(int stream_id,
396 cs_lnum_t n_elts,
397 size_t elt_size,
398 unsigned int grid_size,
399 void* &r_grid,
400 void* &r_reduce,
401 void* &r_host);
402
403#endif /* defined(__NVCC__) */
404
406
407/*----------------------------------------------------------------------------*/
408/*
409 * \brief Log information on available CUDA devices.
410 *
411 * \param[in] log_id id of log file in which to print information
412 */
413/*----------------------------------------------------------------------------*/
414
415void
416cs_base_cuda_device_info(cs_log_t log_id);
417
418/*----------------------------------------------------------------------------*/
419/*
420 * \brief Log information on available CUDA version.
421 *
422 * \param[in] log_id id of log file in which to print information
423 */
424/*----------------------------------------------------------------------------*/
425
426void
427cs_base_cuda_version_info(cs_log_t log_id);
428
429/*----------------------------------------------------------------------------*/
430/*
431 * \brief Log information on CUDA compiler.
432 *
433 * \param[in] log_id id of log file in which to print information
434 */
435/*----------------------------------------------------------------------------*/
436
437void
438cs_base_cuda_compiler_info(cs_log_t log_id);
439
440/*----------------------------------------------------------------------------*/
446/*----------------------------------------------------------------------------*/
447
448extern "C" void
449cs_base_cuda_nccl_info(cs_log_t log_id);
450
451/*----------------------------------------------------------------------------*/
452/*
453 * \brief Set CUDA device based on MPI rank and number of devices.
 454 *
458 * \return selected device id, or -1 if no usable device is available
459 */
460/*----------------------------------------------------------------------------*/
461
462int
463cs_base_cuda_select_default_device(void);
464
465/*----------------------------------------------------------------------------*/
466/*
 467 * \brief Return currently selected CUDA device.
468 *
469 * \return selected device id, or -1 if no usable device is available
470 */
471/*----------------------------------------------------------------------------*/
472
473int
474cs_base_cuda_get_device(void);
475
476#endif /* HAVE_CUDA */
477
478/*----------------------------------------------------------------------------*/
479
481
482#endif /* __CS_BASE_CUDA_H__ */
#define BEGIN_C_DECLS
Definition: cs_defs.h:554
#define END_C_DECLS
Definition: cs_defs.h:555
int cs_lnum_t
local mesh entity id
Definition: cs_defs.h:350
cs_log_t
Definition: cs_log.h:48
static const void * cs_get_device_ptr_const(const void *ptr)
Return matching device pointer for a given constant pointer.
Definition: cs_mem.h:697
static cs_alloc_mode_t cs_check_device_ptr(const void *ptr)
Check if a pointer is associated with a device.
Definition: cs_mem.h:737
static void cs_sync_h2d(const void *ptr)
Synchronize data from host to device.
Definition: cs_mem.h:947
cs_alloc_mode_t
Definition: cs_mem.h:50
@ CS_ALLOC_HOST
Definition: cs_mem.h:52
@ CS_ALLOC_HOST_DEVICE_SHARED
Definition: cs_mem.h:57