stl - Thrust: selectively move elements to another vector -
I'm trying to figure out the best way to do the following using Thrust: vector A has a million floats, and they have a particular order. I want to move to vector B every element x in A for which x > 7.0, such that the order of the elements is maintained in both vectors A and B. Importantly, only a tiny fraction of the elements need to be moved. Efficiency is more important than code elegance.
My idea was to use thrust::copy_if from A to B and then thrust::remove_if on A. But I don't know the exact number of elements to copy, and since apparently the memory for B must be allocated in advance, another counting operation is necessary. An inelegant way to skip the counting operation is to pre-allocate "enough" memory for vector B.
Using thrust::remove_copy_if has the same problems: you need to allocate the memory for B in advance, and it doesn't actually remove anything from A, so thrust::remove_if is required anyway.
Another idea I had was to use thrust::stable_sort with a custom-made comparison functor to push the elements I want out to the end of A, and then somehow figure out how many there are and thrust::copy them to B. This also looks pretty inelegant...
You're on the right track with thrust::copy_if. Just allocate two more buffers of the same size as the first one. Then copy_if the elements > 7.0f into the first one and copy_if the elements <= 7.0f into the second one. Allocating buffers of the same size as the original buffer is fine as long as you know there's room, and 1 million floats only takes up 4 MB.
edit:
I did a performance comparison of the copy_if and stable_partition approaches. On my card, a GTX 660, stable_partition took around 150% as long as copy_if for "split" values of 0.1f, 0.5f and 0.9f. I also added tests to ensure that both methods are stable (i.e. they maintain the order of the values).
#include <cuda.h>
#include <curand.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/partition.h>
#include <iostream>
#include <cassert>
#include <cstdio>
#include <cstdlib>

// Leave main() with EXIT_FAILURE and a file:line diagnostic when a CUDA
// runtime call does not return cudaSuccess. Wrapped in do { } while (0) so the
// macro behaves as a single statement.
#define CHECK_CUDA_CALL(x) do { if ((x) != cudaSuccess) { \
    printf("error @ %s:%d\n", __FILE__, __LINE__); \
    return EXIT_FAILURE; } } while (0)

// Same as CHECK_CUDA_CALL, but for cuRAND status codes.
#define CHECK_CURAND_CALL(x) do { if ((x) != CURAND_STATUS_SUCCESS) { \
    printf("error @ %s:%d\n", __FILE__, __LINE__); \
    return EXIT_FAILURE; } } while (0)

// Threshold that separates "low" (<= SPLIT) from "high" (> SPLIT) values.
#define SPLIT 0.1f

// Predicate: true for values that belong to the low partition.
struct is_low
{
    __host__ __device__ bool operator()(const float x) const
    {
        return x <= SPLIT;
    }
};

// Predicate: true for values that belong to the high partition.
struct is_high
{
    __host__ __device__ bool operator()(const float x) const
    {
        return x > SPLIT;
    }
};

// Small stopwatch built on a pair of CUDA events.
// Usage: start(), run the work, stop(), then elapsed() for milliseconds.
class EventTimer
{
public:
    EventTimer() : mStarted(false), mStopped(false)
    {
        cudaEventCreate(&mStart);
        cudaEventCreate(&mStop);
    }

    ~EventTimer()
    {
        cudaEventDestroy(mStart);
        cudaEventDestroy(mStop);
    }

    // Records the start event on stream s.
    void start(cudaStream_t s = 0)
    {
        cudaEventRecord(mStart, s);
        mStarted = true;
        mStopped = false;
    }

    // Records the stop event on stream s; start() must have been called.
    void stop(cudaStream_t s = 0)
    {
        assert(mStarted);
        cudaEventRecord(mStop, s);
        mStarted = false;
        mStopped = true;
    }

    // Waits for the stop event and returns the time between the two events in
    // milliseconds. Returns 0 if stop() has not been called.
    float elapsed()
    {
        assert(mStopped);
        if (!mStopped) return 0;
        cudaEventSynchronize(mStop);
        float elapsed = 0;
        cudaEventElapsedTime(&elapsed, mStart, mStop);
        return elapsed;
    }

private:
    bool mStarted, mStopped;
    cudaEvent_t mStart, mStop;
};

// Checks that a split of `src` is stable: walking `src` in order, every value
// <= SPLIT must be the next unread element of `lows` (starting at low_pos) and
// every other value must be the next unread element of `highs` (starting at
// high_pos). Uses explicit comparisons rather than assert() so the check is
// still performed when NDEBUG is defined.
static bool is_stable_split(const thrust::host_vector<float>& src,
                            const thrust::host_vector<float>& lows, size_t low_pos,
                            const thrust::host_vector<float>& highs, size_t high_pos)
{
    for (size_t i = 0; i < src.size(); ++i) {
        const float v = src[i];
        if (v <= SPLIT) {
            if (low_pos >= lows.size() || lows[low_pos] != v) return false;
            ++low_pos;
        } else {
            if (high_pos >= highs.size() || highs[high_pos] != v) return false;
            ++high_pos;
        }
    }
    return true;
}

int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const size_t N = 1024 * 1024 * 50;

    // Create PRNG
    curandGenerator_t gen;
    CHECK_CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));

    // Set seed
    CHECK_CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));

    // Generate N floats on device
    thrust::device_vector<float> vec_rnd_d(N);
    float* ptr_rnd_d = thrust::raw_pointer_cast(vec_rnd_d.data());
    CHECK_CURAND_CALL(curandGenerateUniform(gen, ptr_rnd_d, N));

    // Output buffers are sized like the input, so either one can hold every
    // element no matter how the values fall around SPLIT.
    thrust::device_vector<float> vec_low_d(N);
    thrust::device_vector<float> vec_high_d(N);

    // The input never changes, so one host copy serves every verification.
    const thrust::host_vector<float> vec_rnd_h = vec_rnd_d;

    // Approach 1: two copy_if passes into separate buffers.
    for (int i = 0; i < 5; ++i) {
        EventTimer timer;
        timer.start();
        thrust::device_vector<float>::iterator low_end =
            thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_low_d.begin(), is_low());
        thrust::device_vector<float>::iterator high_end =
            thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_high_d.begin(), is_high());
        timer.stop();
        std::cout << "copy_if: " << timer.elapsed() << "ms" << std::endl;

        // Check result
        const size_t n_low  = low_end - vec_low_d.begin();
        const size_t n_high = high_end - vec_high_d.begin();
        thrust::host_vector<float> vec_low_h  = vec_low_d;
        thrust::host_vector<float> vec_high_h = vec_high_d;
        if (n_low + n_high != N ||
            !is_stable_split(vec_rnd_h, vec_low_h, 0, vec_high_h, 0)) {
            printf("error @ %s:%d\n", __FILE__, __LINE__);
            return EXIT_FAILURE;
        }
    }

    // Approach 2: in-place stable_partition on a copy of the input.
    for (int i = 0; i < 5; ++i) {
        thrust::device_vector<float> vec_rnd_copy = vec_rnd_d;
        EventTimer timer;
        timer.start();
        thrust::device_vector<float>::iterator iter_split =
            thrust::stable_partition(vec_rnd_copy.begin(), vec_rnd_copy.end(), is_low());
        timer.stop();
        const size_t n_low = iter_split - vec_rnd_copy.begin();
        std::cout << "stable_partition: " << timer.elapsed() << "ms" << std::endl;

        // Check result: lows occupy [0, n_low), highs occupy [n_low, N).
        thrust::host_vector<float> vec_partitioned_h = vec_rnd_copy;
        if (!is_stable_split(vec_rnd_h, vec_partitioned_h, 0, vec_partitioned_h, n_low)) {
            printf("error @ %s:%d\n", __FILE__, __LINE__);
            return EXIT_FAILURE;
        }
    }

    CHECK_CURAND_CALL(curandDestroyGenerator(gen));
    return EXIT_SUCCESS;
}
// Output:
c:\rd\projects\cpp\test_cuda\release>test_cuda.exe
copy_if: 40.2919ms
copy_if: 38.0157ms
copy_if: 38.5036ms
copy_if: 37.6751ms
copy_if: 38.1054ms
stable_partition: 59.5473ms
stable_partition: 61.4016ms
stable_partition: 59.1854ms
stable_partition: 61.3195ms
stable_partition: 59.1205ms
Comments
Post a Comment