I am trying to learn the usage of shared memory with a view to increasing performance. Here I am trying to copy global memory to shared memory. The code gives the correct result when I have a single block (256 threads), but with more than one block it gives random results.

#include <cuda.h>
#include <stdio.h>

__global__ void staticReverse(int *d, int n)
{
  __shared__ int s[400];

  int t = blockIdx.x * blockDim.x + threadIdx.x;
  d[t] = d[t]*d[t];
  s[t] =d[t];

  __syncthreads();

  d[t] = s[t];  
}


__global__ void dynamicReverse(int *d, int n)
{
  extern __shared__ int s[];
  int t = threadIdx.x;

  s[t] = d[t]*d[t];
  __syncthreads();
  d[t] = s[t];
}

int main(void)
{
  const int n = 400;
  int a[n], d[n];

  for (int i = 0; i < n; i++)
  {
    a[i] = i; 
  }

  int *d_d;
  cudaMalloc(&d_d, n * sizeof(int)); 

  // run version with static shared memory
  int block_size = 256;
  int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
  cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
  staticReverse<<<n_blocks,block_size>>>(d_d, n);
  cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; i++) 
  {
    printf("%d\n",d[i]);
  }
}

1) What does the third argument in the `dynamicReverse<<<n_blocks,block_size,n*sizeof(int)>>>(d_d, n);` kernel call do? Does it allocate shared memory for the entire block or per thread?

2) If I require more than 64 KB of shared memory per multiprocessor on compute capability 5.0, what do I need to do?


1 Answer


In your static shared memory allocation code you had three issues:

  1. The size of the statically allocated shared memory should match the block size, not the size of the input array;
  2. You should use the local thread index for indexing shared memory, instead of the global one;
  3. You had no out-of-bounds check on the array accesses.

The dynamic shared memory allocation code had the same issues #2 and #3, plus the fact that you were indexing global memory with the local thread index instead of the global one. The third kernel launch argument specifies the size of the shared memory to be allocated per block, not per thread. In particular, you should allocate 256 ints, i.e., an amount related to the block size, just as in the static shared memory allocation case.
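For example, the launch below (the same configuration used in the complete code further down, written here with `block_size` in place of the literal `256`) reserves `block_size * sizeof(int)` bytes of dynamic shared memory for each block:

// The third configuration parameter is the number of bytes of dynamic shared
// memory reserved PER BLOCK; inside the kernel, this storage is accessed
// through the declaration "extern __shared__ int s[];"
dynamicReverse<<<n_blocks, block_size, block_size * sizeof(int)>>>(d_d, n);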

Here is the complete working code:

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/

__global__ void staticReverse(int *d, int n)
{
    __shared__ int s[256];                      // sized to the block, not to the input array

    int t = blockIdx.x * blockDim.x + threadIdx.x;

    if (t < n) {                                // out-of-bounds guard
        d[t] = d[t]*d[t];
        s[threadIdx.x] = d[t];                  // shared memory indexed with the local thread index

        __syncthreads();

        d[t] = s[threadIdx.x];
    }
}


/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
__global__ void dynamicReverse(int *d, int n)
{
    extern __shared__ int s[];                  // size set by the third kernel launch argument
    int t = blockIdx.x * blockDim.x + threadIdx.x;      // global thread index

    if (t < n) {
        s[threadIdx.x] = d[t]*d[t];
        __syncthreads();
        d[t] = s[threadIdx.x];
    }
}

int main(void)
{
    const int n = 400;

    int* a = (int*) malloc(n*sizeof(int));
    int* d = (int*) malloc(n*sizeof(int));

    for (int i = 0; i < n; i++) { a[i] = i; }

    int *d_d; gpuErrchk(cudaMalloc(&d_d, n * sizeof(int))); 

    // run version with static shared memory
    int block_size = 256;
    int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);

    gpuErrchk(cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice));
    //staticReverse<<<n_blocks,block_size>>>(d_d, n);
    dynamicReverse<<<n_blocks,block_size,256*sizeof(int)>>>(d_d, n);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < n; i++) { printf("%d\n",d[i]); }

    gpuErrchk(cudaFree(d_d));
    free(a); free(d);

    return 0;
}
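As a side note on your second question: on compute capability 5.0 the hardware provides 64 KB of shared memory per multiprocessor but at most 48 KB per block, and these limits cannot be raised; if a single block needs more, the data must be processed in tiles through shared memory or kept in global memory. Below is a minimal sketch to query the actual limits of your device (note that the `sharedMemPerMultiprocessor` field requires CUDA 6.5 or newer):

#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);    // query device 0
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // On compute capability 5.0 this prints 49152 bytes per block
    // and 65536 bytes per multiprocessor.
    printf("Shared memory per block: %zu bytes\n", prop.sharedMemPerBlock);
    printf("Shared memory per SM:    %zu bytes\n", prop.sharedMemPerMultiprocessor);

    return 0;
}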
  • When I have a large value of n, around 100 thousand (n = 100,000), it gives negative values. What causes this? Is it due to limited shared memory? – Malacu Aug 27 '14 at 06:26
  • What if I use two kernels with shared memory? Is the shared memory usable by a block divided in that case? – Malacu Aug 27 '14 at 08:40
  • @user3929491 I have edited the code by adding [CUDA error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api). Please use this version and report any runtime error it issues. The code above seems to run fine for me with `n = 100000`. Concerning your second question, for each kernel you always have the same amount of shared memory available per block; it is not divided. – Vitality Aug 27 '14 at 21:02
  • @user3929491 The most probable reason why your code is not working for large values of `n` is that you are performing a static allocation of `a` and `d`. Try changing `int a[n], d[n];` to `int* a = (int*) malloc(n*sizeof(int));` and `int* d = (int*) malloc(n*sizeof(int));`. I have edited my post accordingly. – Vitality Aug 28 '14 at 07:06
  • For the second block, the value of d[256] is copied into s[0], which seems to override the value from the first block. Doesn't that change the output of d[0] and d[256] at the end? Don't they have the same value? – Malacu Aug 29 '14 at 07:15