#ifndef CS_CUDA_REDUCE_H
#define CS_CUDA_REDUCE_H
#if defined(__CUDACC__)
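/* The warp size constant is normally provided by an associated base header
   whose include is elided from this excerpt; a guarded fallback is sketched
   here (assumption: 32, the warp size on all current NVIDIA GPUs) so the
   excerpt stays self-contained. */

#ifndef CS_CUDA_WARP_SIZE
#define CS_CUDA_WARP_SIZE 32
#endif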
/*----------------------------------------------------------------------------
 * Warp-level sum reduction of shared values (final stage, first warp only).
 *
 * The volatile qualifier and the absence of synchronization rely on the
 * classic warp-synchronous idiom; blockSize and stride are compile-time
 * constants, so unneeded comparisons are optimized away.
 *----------------------------------------------------------------------------*/

template <size_t blockSize, size_t stride, typename T>
__device__ static void __forceinline__
cs_cuda_reduce_warp_reduce_sum(volatile T  *stmp,
                               size_t       tid)
{
  if (stride == 1) { /* scalar data */
    if (blockSize >= 64) stmp[tid] += stmp[tid + 32];
    if (blockSize >= 32) stmp[tid] += stmp[tid + 16];
    if (blockSize >= 16) stmp[tid] += stmp[tid + 8];
    if (blockSize >= 8)  stmp[tid] += stmp[tid + 4];
    if (blockSize >= 4)  stmp[tid] += stmp[tid + 2];
    if (blockSize >= 2)  stmp[tid] += stmp[tid + 1];
  }
  else { /* interleaved (vector) data */
    if (blockSize >= 64) {
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 32)*stride + i];
    }
    if (blockSize >= 32) {
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 16)*stride + i];
    }
    if (blockSize >= 16) {
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 8)*stride + i];
    }
    if (blockSize >= 8) {
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 4)*stride + i];
    }
    if (blockSize >= 4) {
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 2)*stride + i];
    }
    if (blockSize >= 2) {
      for (size_t i = 0; i < stride; i++)
        stmp[tid*stride + i] += stmp[(tid + 1)*stride + i];
    }
  }
}
/*----------------------------------------------------------------------------
 * Block-level sum reduction of shared values; writes this block's partial
 * result to sum_block.
 *----------------------------------------------------------------------------*/

template <size_t blockSize, size_t stride, typename T>
__device__ static void __forceinline__
cs_cuda_reduce_block_reduce_sum(T       *stmp,
                                size_t   tid,
                                T       *sum_block)
{
  __syncthreads();

  if (stride == 1) { /* scalar data */
    if (blockSize >= 1024) {
      if (tid < 512) stmp[tid] += stmp[tid + 512];
      __syncthreads();
    }
    if (blockSize >= 512) {
      if (tid < 256) stmp[tid] += stmp[tid + 256];
      __syncthreads();
    }
    if (blockSize >= 256) {
      if (tid < 128) stmp[tid] += stmp[tid + 128];
      __syncthreads();
    }
    if (blockSize >= 128) {
      if (tid < 64) stmp[tid] += stmp[tid + 64];
      __syncthreads();
    }

    if (tid < 32)
      cs_cuda_reduce_warp_reduce_sum<blockSize, stride>(stmp, tid);
    if (tid == 0) sum_block[blockIdx.x] = stmp[0];
  }
  else { /* interleaved (vector) data */
    if (blockSize >= 1024) {
      if (tid < 512)
        for (size_t i = 0; i < stride; i++)
          stmp[tid*stride + i] += stmp[(tid + 512)*stride + i];
      __syncthreads();
    }
    if (blockSize >= 512) {
      if (tid < 256)
        for (size_t i = 0; i < stride; i++)
          stmp[tid*stride + i] += stmp[(tid + 256)*stride + i];
      __syncthreads();
    }
    if (blockSize >= 256) {
      if (tid < 128)
        for (size_t i = 0; i < stride; i++)
          stmp[tid*stride + i] += stmp[(tid + 128)*stride + i];
      __syncthreads();
    }
    if (blockSize >= 128) {
      if (tid < 64)
        for (size_t i = 0; i < stride; i++)
          stmp[tid*stride + i] += stmp[(tid + 64)*stride + i];
      __syncthreads();
    }

    if (tid < 32)
      cs_cuda_reduce_warp_reduce_sum<blockSize, stride>(stmp, tid);
    if (tid == 0) {
      for (size_t i = 0; i < stride; i++)
        sum_block[blockIdx.x*stride + i] = stmp[i];
    }
  }
}
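/* Sketch (illustrative only, not part of the original header) of the usual
   two-pass pattern driving the block-level helper above: each block folds a
   grid-stride slice of the input into shared memory, then
   cs_cuda_reduce_block_reduce_sum leaves one partial sum per block in
   sum_block; a second pass such as cs_cuda_reduce_sum_single_block below
   combines the per-block results. */

template <size_t blockSize, typename T>
__global__ static void
_example_reduce_sum_stage1(size_t n, const T *g_idata, T *sum_block)
{
  __shared__ T stmp[blockSize];

  size_t tid = threadIdx.x;
  T r_s = 0.;

  /* Grid-stride accumulation into a per-thread register. */
  for (size_t i = blockIdx.x*blockSize + tid; i < n;
       i += blockSize*gridDim.x)
    r_s += g_idata[i];

  stmp[tid] = r_s;

  /* The helper begins with __syncthreads(), so all threads must reach it. */
  cs_cuda_reduce_block_reduce_sum<blockSize, 1>(stmp, tid, sum_block);
}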
/*----------------------------------------------------------------------------
 * Kernel for sum reduction of an n-element array within a single block.
 *----------------------------------------------------------------------------*/

template <size_t blockSize, size_t stride, typename T>
__global__ static void
cs_cuda_reduce_sum_single_block(size_t    n,
                                const T  *g_idata,
                                T        *g_odata)
{
  __shared__ T sdata[blockSize * stride];
  size_t tid = threadIdx.x;
  T r_s[stride];

  if (stride == 1) { /* scalar data */

    r_s[0] = 0.;
    for (size_t i = threadIdx.x; i < n; i += blockSize)
      r_s[0] += g_idata[i];

    sdata[tid] = r_s[0];
    __syncthreads();

    for (size_t j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
      if (tid < j) sdata[tid] += sdata[tid + j];
      __syncthreads();
    }

    if (tid < 32) cs_cuda_reduce_warp_reduce_sum<blockSize, stride>(sdata, tid);
    if (tid == 0) *g_odata = sdata[0];
  }
  else { /* interleaved (vector) data */

    for (size_t k = 0; k < stride; k++) {
      sdata[tid*stride + k] = 0.;
      r_s[k] = 0.;
    }

    for (size_t i = threadIdx.x; i < n; i += blockSize) {
      for (size_t k = 0; k < stride; k++)
        r_s[k] += g_idata[i*stride + k];
    }

    for (size_t k = 0; k < stride; k++)
      sdata[tid*stride + k] = r_s[k];
    __syncthreads();

    for (size_t j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
      if (tid < j) {
        for (size_t k = 0; k < stride; k++)
          sdata[tid*stride + k] += sdata[(tid + j)*stride + k];
      }
      __syncthreads();
    }

    if (tid < 32)
      cs_cuda_reduce_warp_reduce_sum<blockSize, stride>(sdata, tid);

    if (tid == 0) {
      for (size_t k = 0; k < stride; k++)
        g_odata[k] = sdata[k];
    }
  }
}
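/* Usage sketch (illustrative only, not part of the original header):
   reducing a device array d_x of n doubles into d_sum with the kernel
   above. It must be launched with a single block whose size matches the
   blockSize template argument; shared memory is statically sized from the
   template parameters, so no launch-time shared memory is needed. */

static inline void
_example_sum_single_block(size_t n, const double *d_x, double *d_sum)
{
  const size_t block_size = 256;
  cs_cuda_reduce_sum_single_block<block_size, 1>
    <<<1, block_size>>>(n, d_x, d_sum);
}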
/*----------------------------------------------------------------------------
 * Warp-level reduction of shared values using a generic reducer R, which
 * must provide device-callable identity() and combine() operations;
 * combine() must also accept volatile references for this warp stage
 * (see the illustrative reducer after this function).
 *----------------------------------------------------------------------------*/

template <size_t blockSize, typename R, typename T>
__device__ static void __forceinline__
cs_cuda_reduce_warp_reduce(volatile T  *stmp,
                           size_t       tid)
{
  R reducer;

  if (blockSize >= 64) reducer.combine(stmp[tid], stmp[tid + 32]);
  if (blockSize >= 32) reducer.combine(stmp[tid], stmp[tid + 16]);
  if (blockSize >= 16) reducer.combine(stmp[tid], stmp[tid + 8]);
  if (blockSize >= 8)  reducer.combine(stmp[tid], stmp[tid + 4]);
  if (blockSize >= 4)  reducer.combine(stmp[tid], stmp[tid + 2]);
  if (blockSize >= 2)  reducer.combine(stmp[tid], stmp[tid + 1]);
}
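#include <math.h>  /* HUGE_VAL, for the example reducer below */

/* Example reducer (illustrative only, not part of the original header):
   a maximum reduction over doubles. Any type substituted for R needs
   device-callable identity() and combine() members; the volatile overload
   of combine() is what the warp-level stage above calls. */

struct cs_cuda_reduce_max_example {
  __device__ void
  identity(double &v) const { v = -HUGE_VAL; }

  __device__ void
  combine(double &u, double v) const { if (v > u) u = v; }

  __device__ void
  combine(volatile double &u, volatile double &v) const { if (v > u) u = v; }
};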
/*----------------------------------------------------------------------------
 * Block-level reduction of shared values using a generic reducer R; writes
 * this block's partial result to rd_block.
 *----------------------------------------------------------------------------*/

template <size_t blockSize, typename R, typename T>
__device__ static void __forceinline__
cs_cuda_reduce_block_reduce(T       *stmp,
                            size_t   tid,
                            T       *rd_block)
{
  R reducer;

  __syncthreads();
  if (blockSize >= 1024) {
    if (tid < 512) reducer.combine(stmp[tid], stmp[tid + 512]);
    __syncthreads();
  }
  if (blockSize >= 512) {
    if (tid < 256) reducer.combine(stmp[tid], stmp[tid + 256]);
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128) reducer.combine(stmp[tid], stmp[tid + 128]);
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64) reducer.combine(stmp[tid], stmp[tid + 64]);
    __syncthreads();
  }

  if (tid < 32)
    cs_cuda_reduce_warp_reduce<blockSize, R>(stmp, tid);
  if (tid == 0) rd_block[blockIdx.x] = stmp[0];
}
/*----------------------------------------------------------------------------
 * Kernel for reduction of an n-element array within a single block, using
 * a generic reducer R; requires blockSize*sizeof(T) bytes of dynamic
 * shared memory at launch.
 *----------------------------------------------------------------------------*/

template <size_t blockSize, typename R, typename T>
__global__ static void
cs_cuda_reduce_single_block(size_t    n,
                            const T  *g_idata,
                            T        *g_odata)
{
  extern __shared__ int p_stmp[];
  T *sdata = reinterpret_cast<T *>(p_stmp);

  size_t tid = threadIdx.x;
  R reducer;
  T r_s[1];
  reducer.identity(r_s[0]);
  reducer.identity(sdata[tid]);
  for (size_t i = threadIdx.x; i < n; i += blockSize)
    reducer.combine(r_s[0], g_idata[i]);

  sdata[tid] = r_s[0];
  __syncthreads();
  for (size_t j = blockSize/2; j > CS_CUDA_WARP_SIZE; j /= 2) {
    if (tid < j) reducer.combine(sdata[tid], sdata[tid + j]);
    __syncthreads();
  }
  if (tid < 32) cs_cuda_reduce_warp_reduce<blockSize, R>(sdata, tid);
  if (tid == 0) *g_odata = sdata[0];
}
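/* Usage sketch (illustrative only, not part of the original header): the
   generic kernel declares its shared memory with "extern __shared__", so
   the launch must supply blockSize*sizeof(T) bytes of dynamic shared
   memory as the third launch parameter. Shown with the example max
   reducer defined above. */

static inline void
_example_reduce_max_single_block(size_t n, const double *d_x, double *d_max)
{
  const size_t block_size = 256;
  cs_cuda_reduce_single_block<block_size, cs_cuda_reduce_max_example, double>
    <<<1, block_size, block_size*sizeof(double)>>>(n, d_x, d_max);
}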
#endif /* defined(__CUDACC__) */

#endif /* CS_CUDA_REDUCE_H */