9.1
general documentation
cs_cuda_reduce.h
Go to the documentation of this file.
1#ifndef CS_CUDA_REDUCE_H
2#define CS_CUDA_REDUCE_H
3
/*============================================================================
 * CUDA block- and warp-level reduction utilities
 *============================================================================*/
7
8/*
9 This file is part of code_saturne, a general-purpose CFD tool.
10
11 Copyright (C) 1998-2025 EDF S.A.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; either version 2 of the License, or (at your option) any later
16 version.
17
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
21 details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
25 Street, Fifth Floor, Boston, MA 02110-1301, USA.
26*/
27
28/*----------------------------------------------------------------------------*/
29
30#include "base/cs_defs.h"
31
32/*----------------------------------------------------------------------------
33 * External library headers
34 *----------------------------------------------------------------------------*/
35
36/*----------------------------------------------------------------------------
37 * Local headers
38 *----------------------------------------------------------------------------*/
39
40#include "base/cs_base.h"
41#include "base/cs_base_cuda.h"
42
43/*----------------------------------------------------------------------------*/
44
46
47/*============================================================================
48 * Macro definitions
49 *============================================================================*/
50
51/*============================================================================
52 * Type definitions
53 *============================================================================*/
54
56
57/*============================================================================
58 * Templated function definitions
59 *============================================================================*/
60
61#if defined(__CUDACC__)
62
63/*----------------------------------------------------------------------------*/
74/*----------------------------------------------------------------------------*/
75
/*!
 * \brief Warp-level final stage of a shared-memory sum reduction.
 *
 * Accumulates pairs at offsets 32, 16, 8, 4, 2, 1 so that, on return,
 * stmp[0..stride-1] holds the sum of the first min(blockSize, 64) slots.
 * Must be called only by threads with tid < 32.
 *
 * NOTE(review): this uses the traditional volatile-based, implicitly
 * warp-synchronous idiom (no __syncwarp between steps). With independent
 * thread scheduling (Volta and later) that idiom is no longer guaranteed
 * by the programming model — confirm the supported architectures, or
 * consider a __syncwarp()/__shfl_down_sync based variant.
 *
 * \tparam blockSize  number of threads per block (compile-time constant)
 * \tparam stride     number of interleaved components per entry
 * \tparam T          data type
 *
 * \param[in, out]  stmp  shared-memory work array (blockSize*stride entries)
 * \param[in]       tid   thread id inside the block (caller ensures < 32)
 */

template <size_t blockSize, size_t stride, typename T>
__device__ static void __forceinline__
cs_cuda_reduce_warp_reduce_sum(volatile T  *stmp,
                               size_t       tid)
{
  /* Offsets 32 down to 1; both the blockSize guard and the stride loop
     bound are compile-time constants, so unused steps are eliminated and
     the stride == 1 (scalar) case reduces to plain stmp[tid] += ... */

  #pragma unroll
  for (size_t offset = 32; offset > 0; offset /= 2) {
    if (blockSize >= 2*offset) {
      #pragma unroll
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + offset)*stride + i];
    }
  }
}
125
126/*----------------------------------------------------------------------------*/
144/*----------------------------------------------------------------------------*/
145
/*!
 * \brief Block-level shared-memory sum reduction.
 *
 * Reduces the blockSize*stride shared-memory entries of stmp to a single
 * stride-sized partial sum, written to sum_block at this block's index.
 * All threads of the block must call this function (it contains barriers).
 *
 * \tparam blockSize  number of threads per block (compile-time constant)
 * \tparam stride     number of interleaved components per entry
 * \tparam T          data type
 *
 * \param[in, out]  stmp       shared-memory work array, already filled by
 *                             the caller (blockSize*stride entries)
 * \param[in]       tid        thread id inside the block
 * \param[out]      sum_block  per-block partial sums (one stride-sized
 *                             slot per block, indexed by blockIdx.x)
 */

template <size_t blockSize, size_t stride, typename T>
__device__ static void __forceinline__
cs_cuda_reduce_block_reduce_sum(T       *stmp,
                                size_t   tid,
                                T       *sum_block)
{
  /* Make the caller's shared-memory writes visible to the whole block. */
  __syncthreads();

  /* Tree reduction: halve the active range at each step. The blockSize
     guard is a compile-time constant, so every thread takes the same path
     and each __syncthreads() is reached uniformly; steps that do not apply
     to this block size are removed by the compiler. For stride == 1 the
     inner loop degenerates to a single scalar add. */

  #pragma unroll
  for (size_t s = 512; s > 32; s /= 2) {
    if (blockSize >= 2*s) {
      if (tid < s) {
        #pragma unroll
        for (size_t i = 0; i < stride; i++)
          stmp[tid*stride + i] += stmp[(tid + s)*stride + i];
      }
      __syncthreads();
    }
  }

  /* Last 64 -> 1 stage handled inside the first warp. */

  if (tid < 32)
    cs_cuda_reduce_warp_reduce_sum<blockSize, stride>(stmp, tid);

  // Output: partial result for this block

  if (tid == 0) {
    #pragma unroll
    for (size_t i = 0; i < stride; i++)
      sum_block[(blockIdx.x)*stride + i] = stmp[i];
  }
}
237
238/*----------------------------------------------------------------------------*/
248/*----------------------------------------------------------------------------*/
249
/*!
 * \brief Kernel summing n stride-sized entries into a single result,
 *        using one block.
 *
 * Intended to be launched with a single block of blockSize threads
 * (second-stage reduction of per-block partial sums): each thread first
 * accumulates a strided slice of g_idata into registers, then a
 * shared-memory tree reduction produces the final sum in g_odata.
 *
 * \tparam blockSize  number of threads per block (compile-time constant)
 * \tparam stride     number of interleaved components per entry
 * \tparam T          data type
 *
 * \param[in]   n        number of stride-sized input entries
 * \param[in]   g_idata  input array (n*stride values, device memory)
 * \param[out]  g_odata  output (stride values, device memory)
 */

template <size_t blockSize, size_t stride, typename T>
__global__ static void
cs_cuda_reduce_sum_single_block(size_t  n,
                                T      *g_idata,
                                T      *g_odata)
{
  __shared__ T sdata[blockSize * stride];

  size_t tid = threadIdx.x;
  T r_s[stride];

  /* Serial per-thread accumulation in registers; shared slot zeroed
     up front as in the original code. For stride == 1 every inner loop
     below collapses to a single scalar statement. */

  #pragma unroll
  for (size_t k = 0; k < stride; k++) {
    r_s[k] = 0;
    sdata[tid*stride + k] = 0.;
  }

  for (size_t i = tid; i < n; i += blockSize) {
    #pragma unroll
    for (size_t k = 0; k < stride; k++)
      r_s[k] += g_idata[i*stride + k];
  }

  #pragma unroll
  for (size_t k = 0; k < stride; k++)
    sdata[tid*stride + k] = r_s[k];
  __syncthreads();

  /* Shared-memory tree reduction down to the last warp. */

  for (size_t j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
    if (tid < j) {
      #pragma unroll
      for (size_t k = 0; k < stride; k++)
        sdata[tid*stride + k] += sdata[(tid + j)*stride + k];
    }
    __syncthreads();
  }

  if (tid < 32)
    cs_cuda_reduce_warp_reduce_sum<blockSize, stride>(sdata, tid);

  if (tid == 0) {
    #pragma unroll
    for (size_t k = 0; k < stride; k++)
      g_odata[k] = sdata[k];
  }
}
321
322/*----------------------------------------------------------------------------*/
333/*----------------------------------------------------------------------------*/
334
/*!
 * \brief Warp-level final stage of a generic shared-memory reduction.
 *
 * Applies the reducer's combine operation at offsets 32, 16, 8, 4, 2, 1
 * so that, on return, stmp[0] holds the reduction of the first
 * min(blockSize, 64) slots. Must be called only by threads with tid < 32.
 *
 * NOTE(review): volatile-based, implicitly warp-synchronous idiom; see
 * the remark on cs_cuda_reduce_warp_reduce_sum regarding Volta+ behavior.
 *
 * \tparam blockSize  number of threads per block (compile-time constant)
 * \tparam R          reducer type providing combine(dest, src)
 * \tparam T          data type
 *
 * \param[in, out]  stmp  shared-memory work array
 * \param[in]       tid   thread id inside the block (caller ensures < 32)
 */

template <size_t blockSize, typename R, typename T>
__device__ static void __forceinline__
cs_cuda_reduce_warp_reduce(volatile T  *stmp,
                           size_t       tid)
{
  R reducer;

  /* blockSize guards are compile-time constants: unused steps vanish. */

  #pragma unroll
  for (size_t offset = 32; offset > 0; offset /= 2) {
    if (blockSize >= 2*offset)
      reducer.combine(stmp[tid], stmp[tid + offset]);
  }
}
349
350/*----------------------------------------------------------------------------*/
365/*----------------------------------------------------------------------------*/
366
/*!
 * \brief Block-level generic shared-memory reduction.
 *
 * Reduces the blockSize shared-memory entries of stmp, using the reducer's
 * combine operation, to a single value written to rd_block at this block's
 * index. All threads of the block must call this function (barriers inside).
 *
 * \tparam blockSize  number of threads per block (compile-time constant)
 * \tparam R          reducer type providing combine(dest, src)
 * \tparam T          data type
 *
 * \param[in, out]  stmp      shared-memory work array, already filled by
 *                            the caller (blockSize entries)
 * \param[in]       tid       thread id inside the block
 * \param[out]      rd_block  per-block results, indexed by blockIdx.x
 */

template <size_t blockSize, typename R, typename T>
__device__ static void __forceinline__
cs_cuda_reduce_block_reduce(T       *stmp,
                            size_t   tid,
                            T       *rd_block)
{
  R reducer;

  /* Make the caller's shared-memory writes visible to the whole block. */
  __syncthreads();

  /* Tree reduction; the blockSize guard is uniform across the block so
     each __syncthreads() is reached by all threads, and steps that do not
     apply to this block size are compiled out. */

  #pragma unroll
  for (size_t s = 512; s > 32; s /= 2) {
    if (blockSize >= 2*s) {
      if (tid < s)
        reducer.combine(stmp[tid], stmp[tid + s]);
      __syncthreads();
    }
  }

  /* Last 64 -> 1 stage handled inside the first warp. */

  if (tid < 32)
    cs_cuda_reduce_warp_reduce<blockSize, R>(stmp, tid);

  // Output: result for this block

  if (tid == 0)
    rd_block[blockIdx.x] = stmp[0];
}
409
410/*----------------------------------------------------------------------------*/
422/*----------------------------------------------------------------------------*/
423
/*!
 * \brief Kernel reducing n entries to a single result with a generic
 *        reducer, using one block.
 *
 * Intended to be launched with a single block of blockSize threads and
 * blockSize*sizeof(T) bytes of dynamic shared memory (second-stage
 * reduction of per-block partials): each thread first folds a strided
 * slice of g_idata into a register value, then a shared-memory tree
 * reduction produces the final result in g_odata.
 *
 * \tparam blockSize  number of threads per block (compile-time constant)
 * \tparam R          reducer type providing identity(dest) and
 *                    combine(dest, src)
 * \tparam T          data type
 *
 * \param[in]   n        number of input entries
 * \param[in]   g_idata  input array (device memory)
 * \param[out]  g_odata  output value (device memory)
 */

template <size_t blockSize, typename R, typename T>
__global__ static void
cs_cuda_reduce_single_block(size_t  n,
                            T      *g_idata,
                            T      *g_odata)
{
  /* Dynamic shared memory is declared as int and cast, as sizing is done
     at launch time by the caller. */
  extern __shared__ int p_stmp[];
  T *sdata = reinterpret_cast<T *>(p_stmp);
  R reducer;

  size_t tid = threadIdx.x;
  T r_loc;

  reducer.identity(r_loc);
  reducer.identity(sdata[tid]);

  /* Serial per-thread accumulation in a register. */

  for (size_t i = tid; i < n; i += blockSize)
    reducer.combine(r_loc, g_idata[i]);

  sdata[tid] = r_loc;
  __syncthreads();

  /* Shared-memory tree reduction down to the last warp. */

  for (size_t j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
    if (tid < j)
      reducer.combine(sdata[tid], sdata[tid + j]);
    __syncthreads();
  }

  if (tid < 32)
    cs_cuda_reduce_warp_reduce<blockSize, R>(sdata, tid);

  if (tid == 0)
    *g_odata = sdata[0];
}
456
457#endif /* defined(__CUDACC__) */
458
459/*============================================================================
460 * Public function prototypes
461 *============================================================================*/
462
463/*----------------------------------------------------------------------------*/
464
465#endif /* CS_CUDA_REDUCE_H */
#define BEGIN_C_DECLS
Definition: cs_defs.h:554
#define END_C_DECLS
Definition: cs_defs.h:555
@ k
Definition: cs_field_pointer.h:72