I have the following problem (keep in mind that I am fairly new to programming with CUDA),
I have a class called vec3f that is just like the float3 data type but with overloaded operators, and other vector functions. These functions are prefixed with __ device __ __ host __ (i added spaces because it was making these words bolded). Then, in my kernel I do a nested for loop over block_x and block_y indicies and do something like,
//set up shared memory block
extern __shared__ vec3f share[];
vec3f *sh_pos = share;
vec3f *sh_velocity = &sh_pos[blockDim.x*blockDim.y];
sh_pos[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].position();
sh_velocity[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].velocity();
__syncthreads();
In the above code, oldParticles is a pointer to a class called particles that is being passed to the kernel. OldParticles is acutally an underlying pointer of a thrust::device_vector (im not sure if this has something to do with it). Everything compiles okay but when I run I get the error
libc++abi.dylib: terminate called throwing an exception
Abort trap: 6
Thanks for the replies. I think the error had to do with me not allocating room for the arguments being passed to my kernel. Doing the following in my host code fixed this error,
particle* particle_ptrs[2];
particle_ptrs[0] = thrust::raw_pointer_cast(&d_old_particles[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&d_new_particles[0]);
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) ) );
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) ) );
The kernel call is then,
force_kernel<<< grid,block,sharedMemSize >>>(particle_ptrs[0],particle_ptrs[1],time_step);
The issue that I am having now seems to be that I can't get data copied back to the host from the device. I think this has to do with me not being familiar with thrust.
Im doing a series of copies as follows,
//make a host vector assume this is initialized
thrust::host_vector<particle> h_particles;
thrust::device_vector<particle> d_old_particles, d_new_particles;
d_old_particles = h_particles;
//launch kernel as shown above
//with thrust vectors having been casted into their underlying pointers
//particle_ptrs[1] gets modified and so shouldnt d_new_particles?
//copy back
h_particles = d_new_particles;
So I guess my question is, can I modify a thrust device vector in a kernel (in this case particle_pters[0]) save the modification to to another thrust device vector in the kernel (in this case particle_pters[1]) and then once I exit from the kernel, copy that to a host vector?
I still can't get this to work. I made a shorter example where I am having the same problem,
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "vec3f.h"
const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;
using namespace std;
//particle class
class particle {
public:
particle() :
_velocity(vec3f(0,0,0)), _position(vec3f(0,0,0)), _density(0.0) {
};
particle(const vec3f& pos, const vec3f& vel) :
_position(pos), _velocity(vel), _density(0.0) {
};
vec3f _velocity;
vec3f _position;
float _density;
};
//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);
//global thrust vectors
thrust::host_vector<particle> h_parts;
thrust::device_vector<particle> old_parts, new_parts;
particle* particle_ptrs[2];
int main() {
//load host vector
for (int i =0; i<max_particles; i++) {
h_parts.push_back(particle(vec3f(0.5,0.5,0.5),vec3f(10,10,10)));
}
particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) );
cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) );
//copy host particles to old device particles...
old_parts = h_parts;
//kernel block and grid dimensions
dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
dim3 grid(int(sqrt(float(max_particles) / (float(block.x*block.y)))), int(sqrt(float(max_particles) / (float(block.x*block.y)))), 1);
kernel_func<<<block,grid>>>(particle_ptrs[0],particle_ptrs[1],dt);
//copy new device particles back to host particles
h_parts = new_parts;
for (int i =0; i<max_particles; i++) {
particle temp1 = h_parts[i];
cout << temp1._position << endl;
}
//delete thrust device vectors
old_parts.clear();
old_parts.shrink_to_fit();
new_parts.clear();
new_parts.shrink_to_fit();
return 0;
}
//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
//get array position for 2d grid...
unsigned int arr_pos = y*blockDim.x*gridDim.x + x;
new_parts[arr_pos]._velocity = old_parts[arr_pos]._velocity * 10.0 * dt;
new_parts[arr_pos]._position = old_parts[arr_pos]._position * 10.0 * dt;
new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
}
So the host vector has an initial position of (0.5,0.5,0.5) for all 64 particles. Then the kernel attempts to multiply that by 10 to give (5,5,5) as the position for all particles. But I dont see this when I "cout" the data. It is still just (0.5,0.5,0.5). Is there a problem with how I am allocating memory? Is there a problem with the lines:
//copy new device particles back to host particles
h_parts = new_parts;
What could be the issue? Thank you.