stl - Thrust: selectively move elements to another vector -


I'm trying to figure out the best way to do the following using Thrust: vector A has a million floats, and they have a particular order. I want to move to vector B every element x in A for which x > 7.0, such that the order of the elements is maintained in both vectors A and B. Importantly, only a tiny fraction of the elements needs to be moved. Efficiency is more important than code elegance.

My idea is to use thrust::copy_if from A to B and then thrust::remove_if on A. But I don't know the exact number of elements to copy, and since apparently the memory for B must be allocated in advance, another counting operation is necessary. An inelegant way to skip the counting operation is to pre-allocate "enough" memory for vector B.

Using thrust::remove_copy_if has pretty much the same problems: you need to allocate the memory for B in advance, and it doesn't actually remove anything from A, so another thrust::remove_if is required anyway.

Another idea I had was to use thrust::stable_sort with a custom-made comparison functor, to push all the elements I want out to the end of A, and then somehow figure out how many there are and thrust::copy them to B. This also looks pretty inelegant...

You're on the right track with thrust::copy_if. Just allocate two more buffers of the same size as the first one. Then copy_if the elements > 7.0f to the first one and copy_if the elements <= 7.0f to the second one. Allocating buffers of the same size as the original buffer is fine as long as you know there's room, and 1 million floats only take up 4 MB.

edit:

I did a performance comparison of the copy_if and stable_partition approaches. On my card, a GTX 660, stable_partition took around 150% as long as copy_if for "split" values of 0.1f, 0.5f and 0.9f. I also added tests to ensure that both methods are stable (i.e. maintain the order of the values).

#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/partition.h>

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// Leave the enclosing int-returning function with EXIT_FAILURE when a CUDA
// runtime call does not return cudaSuccess. Wrapped in do { } while (0) so the
// macro behaves as a single statement, including inside if/else.
#define CHECK_CUDA_CALL(x) do { if ((x) != cudaSuccess) { \
    printf("error @ %s:%d\n", __FILE__, __LINE__); \
    return EXIT_FAILURE; } } while (0)

// Same as CHECK_CUDA_CALL, for cuRAND calls (compares against
// CURAND_STATUS_SUCCESS).
#define CHECK_CURAND_CALL(x) do { if ((x) != CURAND_STATUS_SUCCESS) { \
    printf("error @ %s:%d\n", __FILE__, __LINE__); \
    return EXIT_FAILURE; } } while (0)

// Threshold used by both predicates: values <= SPLIT are "low", the rest "high".
#define SPLIT 0.1f

// Print a diagnostic and abort when a CUDA runtime call fails in a context
// that cannot `return EXIT_FAILURE` (e.g. void member functions).
static void abort_on_cuda_error(cudaError_t status, const char* file, int line)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "error @ %s:%d: %s\n", file, line, cudaGetErrorString(status));
    abort();
  }
}
#define CUDA_MUST(x) abort_on_cuda_error((x), __FILE__, __LINE__)

// Predicate: true for values at or below SPLIT.
struct is_low
{
  __host__ __device__ bool operator()(const float x) const
  {
    return x <= SPLIT;
  }
};

// Predicate: true for values strictly above SPLIT (the complement of is_low).
struct is_high
{
  __host__ __device__ bool operator()(const float x) const
  {
    return x > SPLIT;
  }
};

// Small stopwatch built on a pair of CUDA events.
// Usage: start(), run the work, stop(), then elapsed() for the time between
// the two recorded events (printed by main() as milliseconds).
class EventTimer
{
public:
  EventTimer() : mStarted(false), mStopped(false)
  {
    CUDA_MUST(cudaEventCreate(&mStart));
    CUDA_MUST(cudaEventCreate(&mStop));
  }

  ~EventTimer()
  {
    // Return codes are deliberately ignored here: aborting from a destructor
    // would hide whatever error is already propagating.
    cudaEventDestroy(mStart);
    cudaEventDestroy(mStop);
  }

  // Record the start event on stream `s` (default stream when omitted).
  void start(cudaStream_t s = 0)
  {
    CUDA_MUST(cudaEventRecord(mStart, s));
    mStarted = true;
    mStopped = false;
  }

  // Record the stop event on stream `s`; start() must have been called first.
  void stop(cudaStream_t s = 0)
  {
    assert(mStarted);
    CUDA_MUST(cudaEventRecord(mStop, s));
    mStarted = false;
    mStopped = true;
  }

  // Wait for the stop event and return the time between start and stop.
  // Returns 0 when stop() has not been called.
  float elapsed()
  {
    assert(mStopped);
    if (!mStopped) return 0;
    CUDA_MUST(cudaEventSynchronize(mStop));
    float elapsed = 0;
    CUDA_MUST(cudaEventElapsedTime(&elapsed, mStart, mStop));
    return elapsed;
  }

private:
  bool mStarted, mStopped;
  cudaEvent_t mStart, mStop;
};

typedef thrust::host_vector<float>::const_iterator host_citer;

// Check that splitting `original` by SPLIT, while keeping the relative order
// of the elements, yields exactly [low, low_end) followed by nothing else for
// the low values and exactly [high, high_end) for the high values.
// Unlike assert(), this check is also active when NDEBUG is defined, and it
// never reads past either output range.
static bool is_stable_split(const thrust::host_vector<float>& original,
                            host_citer low, host_citer low_end,
                            host_citer high, host_citer high_end)
{
  for (host_citer it = original.begin(); it != original.end(); ++it) {
    if (*it <= SPLIT) {
      if (low == low_end || *low != *it) return false;
      ++low;
    }
    else {
      if (high == high_end || *high != *it) return false;
      ++high;
    }
  }
  // Both outputs must be fully consumed: no extra elements on either side.
  return low == low_end && high == high_end;
}

// Benchmark: split a device vector of random floats into "low" and "high"
// parts, once with two copy_if passes and once with stable_partition, timing
// each approach over several runs and verifying that element order is kept.
int main(int argc, char *argv[])
{
  (void)argc;
  (void)argv;

  const size_t n = 1024 * 1024 * 50;
  const int n_runs = 5;

  // Create the PRNG.
  curandGenerator_t gen;
  CHECK_CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));

  // Fixed seed so every run works on the same data.
  CHECK_CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));

  // Generate n floats on the device.
  thrust::device_vector<float> vec_rnd_d(n);
  float* ptr_rnd_d = thrust::raw_pointer_cast(vec_rnd_d.data());
  CHECK_CURAND_CALL(curandGenerateUniform(gen, ptr_rnd_d, n));

  // Host copy of the input, used as the reference for every verification.
  // The input is never modified, so one copy is enough for all runs.
  const thrust::host_vector<float> vec_rnd_h = vec_rnd_d;

  // Output buffers sized like the input: the number of matches is not known
  // in advance, and n elements is always enough.
  thrust::device_vector<float> vec_low_d(n);
  thrust::device_vector<float> vec_high_d(n);

  // Approach 1: two copy_if passes with complementary predicates.
  for (int i = 0; i < n_runs; ++i) {
    EventTimer timer;
    timer.start();
    thrust::device_vector<float>::iterator low_end_d =
        thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_low_d.begin(), is_low());
    thrust::device_vector<float>::iterator high_end_d =
        thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_high_d.begin(), is_high());
    timer.stop();
    std::cout << "copy_if: " << timer.elapsed() << "ms" << std::endl;

    // Check the result; only the filled part of each buffer is copied back.
    const thrust::host_vector<float> vec_low_h(vec_low_d.begin(), low_end_d);
    const thrust::host_vector<float> vec_high_h(vec_high_d.begin(), high_end_d);
    if (!is_stable_split(vec_rnd_h,
                         vec_low_h.begin(), vec_low_h.end(),
                         vec_high_h.begin(), vec_high_h.end())) {
      std::cerr << "copy_if: result is not a stable split" << std::endl;
      curandDestroyGenerator(gen);
      return EXIT_FAILURE;
    }
  }

  // Approach 2: in-place stable_partition on a fresh copy of the input.
  for (int i = 0; i < n_runs; ++i) {
    // The copy is made before the timer starts so it is not part of the
    // measurement.
    thrust::device_vector<float> vec_rnd_copy = vec_rnd_d;
    EventTimer timer;
    timer.start();
    thrust::device_vector<float>::iterator iter_split =
        thrust::stable_partition(vec_rnd_copy.begin(), vec_rnd_copy.end(), is_low());
    timer.stop();
    const size_t n_low = iter_split - vec_rnd_copy.begin();
    std::cout << "stable_partition: " << timer.elapsed() << "ms" << std::endl;

    // Check the result: low values occupy [0, n_low), high values the rest.
    const thrust::host_vector<float> vec_partitioned_h = vec_rnd_copy;
    const host_citer split_h = vec_partitioned_h.begin() + n_low;
    if (!is_stable_split(vec_rnd_h,
                         vec_partitioned_h.begin(), split_h,
                         split_h, vec_partitioned_h.end())) {
      std::cerr << "stable_partition: result is not a stable split" << std::endl;
      curandDestroyGenerator(gen);
      return EXIT_FAILURE;
    }
  }

  CHECK_CURAND_CALL(curandDestroyGenerator(gen));

  return EXIT_SUCCESS;
}

output:

c:\rd\projects\cpp\test_cuda\release>test_cuda.exe
copy_if: 40.2919ms
copy_if: 38.0157ms
copy_if: 38.5036ms
copy_if: 37.6751ms
copy_if: 38.1054ms
stable_partition: 59.5473ms
stable_partition: 61.4016ms
stable_partition: 59.1854ms
stable_partition: 61.3195ms
stable_partition: 59.1205ms

Comments