PSkel
src/PSkelMap.hpp
//-------------------------------------------------------------------------------
// Copyright (c) 2015, ICEI - PUC Minas
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its contributors
// may be used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//-------------------------------------------------------------------------------

#ifndef PSKEL_MAP_HPP
#define PSKEL_MAP_HPP

#include <algorithm>
#include <cmath>
#include <omp.h> // omp_get_num_procs/omp_set_num_threads are called below

#ifdef PSKEL_TBB
  #include <tbb/blocked_range.h>
  #include <tbb/parallel_for.h>
  #include <tbb/task_scheduler_init.h>
#endif

namespace PSkel{

//********************************************************************************************
// CUDA kernels: declarations. Each kernel calls the map kernel implemented by the user.
//********************************************************************************************

template<typename T1, class Args>
__global__ void mapCU(Array<T1> input, Array<T1> output, Args args);

template<typename T1, class Args>
__global__ void mapCU2D(Array2D<T1> input, Array2D<T1> output, Args args);

template<typename T1, class Args>
__global__ void mapCU3D(Array3D<T1> input, Array3D<T1> output, Args args);

//********************************************************************************************
// CUDA kernels: definitions. Each kernel calls the map kernel implemented by the user.
//********************************************************************************************

template<typename T1, class Args>
__global__ void mapCU(Array<T1> input, Array<T1> output, Args args){
        size_t i = blockIdx.x*blockDim.x + threadIdx.x;

        if(i<input.getWidth()){
                mapKernel(input, output, args, i);
        }
}

template<typename T1, class Args>
__global__ void mapCU2D(Array2D<T1> input, Array2D<T1> output, Args args){
        size_t w = blockIdx.x*blockDim.x + threadIdx.x;
        size_t h = blockIdx.y*blockDim.y + threadIdx.y;

        if(w<input.getWidth() && h<input.getHeight()){
                mapKernel(input, output, args, h, w);
        }
}

template<typename T1, class Args>
__global__ void mapCU3D(Array3D<T1> input, Array3D<T1> output, Args args){
        size_t w = blockIdx.x*blockDim.x + threadIdx.x;
        size_t h = blockIdx.y*blockDim.y + threadIdx.y;
        size_t d = blockIdx.z*blockDim.z + threadIdx.z;

        if(w<input.getWidth() && h<input.getHeight() && d<input.getDepth()){
                mapKernel(input, output, args, h, w, d);
        }
}

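// Illustrative sketch (not part of the library): a user-supplied mapKernel for
// 2D arrays that scales every element. The signature mirrors the call sites
// above; the exact function qualifiers PSkel expects and the Array2D element
// accessor operator()(h,w) are assumptions here.
//
//   struct ScaleArgs{ float factor; };
//
//   __device__ __host__ void mapKernel(Array2D<float> input, Array2D<float> output,
//           ScaleArgs args, size_t h, size_t w){
//           output(h,w) = input(h,w) * args.factor;
//   }
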
//*******************************************************************************************
// Map base
//*******************************************************************************************

template<class Arrays, class Args>
void MapBase<Arrays, Args>::runSequential(){
        this->runSeq(this->input, this->output);
}

template<class Arrays, class Args>
void MapBase<Arrays, Args>::runCPU(size_t numThreads){
        numThreads = (numThreads==0)?omp_get_num_procs():numThreads;
        #ifdef PSKEL_TBB
                this->runTBB(this->input, this->output, numThreads);
        #else
                this->runOpenMP(this->input, this->output, numThreads);
        #endif
}

template<class Arrays, class Args>
void MapBase<Arrays, Args>::runGPU(size_t blockSize){
        if(blockSize==0){
                int device;
                cudaGetDevice(&device);
                cudaDeviceProp deviceProperties;
                cudaGetDeviceProperties(&deviceProperties, device);
                blockSize = deviceProperties.warpSize;
        }
        input.deviceAlloc();
        output.deviceAlloc();
        input.copyToDevice();
        this->runCUDA(this->input, this->output, blockSize);
        output.copyToHost();
        input.deviceFree();
        output.deviceFree();
}
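// Usage sketch (hypothetical names; assumes PSkel's Array2D constructor
// allocates host memory and that a user mapKernel like the one sketched
// above is in scope):
//
//   Array2D<float> in(1024, 1024), out(1024, 1024);
//   ScaleArgs args; args.factor = 2.0f;
//   Map2D<Array2D<float>, ScaleArgs> map(in, out, args);
//   map.runGPU(); // blockSize==0 defaults to the device warp size (see above)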
/*
template<class Arrays, class Args>
void MapBase<Arrays, Args>::runHybrid(float GPUPartition, size_t GPUBlockSize, size_t numThreads){
        if(GPUPartition==0.0){
                runCPU(numThreads);
        }else if(GPUPartition==1.0){
                runGPU(GPUBlockSize);
        }else{
                Arrays inputSliceGPU;
                Arrays outputSliceGPU;
                Arrays inputSliceCPU;
                Arrays outputSliceCPU;
                if(input.getHeight()>1){
                        size_t GPUHeight = size_t(this->input.getHeight()*GPUPartition);
                        inputSliceGPU.hostSlice(this->input, 0, 0, 0, this->input.getWidth(), GPUHeight, this->input.getDepth());
                        outputSliceGPU.hostSlice(this->output, 0, 0, 0, this->output.getWidth(), GPUHeight, this->output.getDepth());
                        inputSliceCPU.hostSlice(this->input, 0, GPUHeight, 0, this->input.getWidth(), this->input.getHeight()-GPUHeight, this->input.getDepth());
                        outputSliceCPU.hostSlice(this->output, 0, GPUHeight, 0, this->output.getWidth(), this->output.getHeight()-GPUHeight, this->output.getDepth());
                }else{
                        size_t GPUWidth = size_t(this->input.getWidth()*GPUPartition);
                        inputSliceGPU.hostSlice(this->input, 0, 0, 0, GPUWidth, this->input.getHeight(), this->input.getDepth());
                        outputSliceGPU.hostSlice(this->output, 0, 0, 0, GPUWidth, this->output.getHeight(), this->output.getDepth());
                        inputSliceCPU.hostSlice(this->input, GPUWidth, 0, 0, this->input.getWidth()-GPUWidth, this->input.getHeight(), this->input.getDepth());
                        outputSliceCPU.hostSlice(this->output, GPUWidth, 0, 0, this->output.getWidth()-GPUWidth, this->output.getHeight(), this->output.getDepth());
                }
                omp_set_num_threads(2);

                #pragma omp parallel sections
                {
                        #pragma omp section
                        {
                                inputSliceGPU.deviceAlloc();
                                inputSliceGPU.copyToDevice();
                                outputSliceGPU.deviceAlloc();
                                this->runCUDA(inputSliceGPU, outputSliceGPU, GPUBlockSize);
                                outputSliceGPU.copyToHost();

                                inputSliceGPU.deviceFree();
                                outputSliceGPU.deviceFree();
                        }
                        #pragma omp section
                        {
                                this->runTBB(inputSliceCPU, outputSliceCPU, numThreads);
                        }
                }
        }
}
*/
template<class Arrays, class Args>
void MapBase<Arrays, Args>::runIterativeSequential(size_t iterations){
        Arrays inputCopy;
        inputCopy.hostClone(input);
        for(size_t it = 0; it<iterations; it++){
                if(it%2==0) this->runSeq(inputCopy, this->output);
                else this->runSeq(this->output, inputCopy);
        }
        if((iterations%2)==0) output.hostMemCopy(inputCopy);
        inputCopy.hostFree();
}
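// The iterative variants ping-pong between two buffers: even iteration indices
// read inputCopy and write output, odd indices do the reverse. After an even
// total number of iterations the final result therefore sits in inputCopy, so
// it is copied back into output before the temporary is freed.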

template<class Arrays, class Args>
void MapBase<Arrays, Args>::runIterativeCPU(size_t iterations, size_t numThreads){
        numThreads = (numThreads==0)?omp_get_num_procs():numThreads;
        Arrays inputCopy;
        inputCopy.hostClone(input);
        for(size_t it = 0; it<iterations; it++){
                if(it%2==0){
                        #ifdef PSKEL_TBB
                                this->runTBB(inputCopy, this->output, numThreads);
                        #else
                                this->runOpenMP(inputCopy, this->output, numThreads);
                        #endif
                }else{
                        #ifdef PSKEL_TBB
                                this->runTBB(this->output, inputCopy, numThreads);
                        #else
                                this->runOpenMP(this->output, inputCopy, numThreads);
                        #endif
                }
        }
        if((iterations%2)==0) output.hostMemCopy(inputCopy);
        inputCopy.hostFree();
}

template<class Arrays, class Args>
void MapBase<Arrays, Args>::runIterativeGPU(size_t iterations, size_t blockSize){
        if(blockSize==0){
                int device;
                cudaGetDevice(&device);
                cudaDeviceProp deviceProperties;
                cudaGetDeviceProperties(&deviceProperties, device);
                blockSize = deviceProperties.warpSize;
        }
        input.deviceAlloc();
        input.copyToDevice();
        output.deviceAlloc();
        for(size_t it = 0; it<iterations; it++){
                if((it%2)==0)
                        this->runCUDA(this->input, this->output, blockSize);
                else this->runCUDA(this->output, this->input, blockSize);
        }
        if((iterations%2)==1)
                output.copyToHost();
        else output.copyFromDevice(input);
        input.deviceFree();
        output.deviceFree();
}
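// Same ping-pong scheme on the device: after an odd number of iterations the
// result is already in output's device buffer and is copied to the host; after
// an even number it sits in input's device buffer, so output copies from there.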
/*
template<class Arrays, class Args>
void MapBase<Arrays, Args>::runIterativeHybrid(size_t iterations, float GPUPartition, size_t GPUBlockSize, size_t numThreads){
        if(GPUPartition==0.0){
                runIterativeCPU(iterations, numThreads);
        }else if(GPUPartition==1.0){
                runIterativeGPU(iterations, GPUBlockSize);
        }else{
                Arrays inputSliceGPU;
                Arrays outputSliceGPU;
                Arrays inputSliceCPU;
                Arrays outputSliceCPU;
                if(input.getHeight()>1){
                        size_t GPUHeight = size_t(this->input.getHeight()*GPUPartition);
                        inputSliceGPU.hostSlice(this->input, 0, 0, 0, this->input.getWidth(), GPUHeight, this->input.getDepth());
                        outputSliceGPU.hostSlice(this->output, 0, 0, 0, this->output.getWidth(), GPUHeight, this->output.getDepth());
                        inputSliceCPU.hostSlice(this->input, 0, GPUHeight, 0, this->input.getWidth(), this->input.getHeight()-GPUHeight, this->input.getDepth());
                        outputSliceCPU.hostSlice(this->output, 0, GPUHeight, 0, this->output.getWidth(), this->output.getHeight()-GPUHeight, this->output.getDepth());
                }else{
                        size_t GPUWidth = size_t(this->input.getWidth()*GPUPartition);
                        inputSliceGPU.hostSlice(this->input, 0, 0, 0, GPUWidth, this->input.getHeight(), this->input.getDepth());
                        outputSliceGPU.hostSlice(this->output, 0, 0, 0, GPUWidth, this->output.getHeight(), this->output.getDepth());
                        inputSliceCPU.hostSlice(this->input, GPUWidth, 0, 0, this->input.getWidth()-GPUWidth, this->input.getHeight(), this->input.getDepth());
                        outputSliceCPU.hostSlice(this->output, GPUWidth, 0, 0, this->output.getWidth()-GPUWidth, this->output.getHeight(), this->output.getDepth());
                }

                omp_set_num_threads(2);

                #pragma omp parallel sections
                {
                        #pragma omp section
                        {
                                inputSliceGPU.deviceAlloc();
                                inputSliceGPU.copyToDevice();
                                outputSliceGPU.deviceAlloc();
                                for(size_t it = 0; it<iterations; it++){
                                        if((it%2)==0)
                                                this->runCUDA(inputSliceGPU, outputSliceGPU, GPUBlockSize);
                                        else this->runCUDA(outputSliceGPU, inputSliceGPU, GPUBlockSize);
                                }
                                //outputSliceGPU.copyToHost();

                                //outputSliceGPU.deviceFree();
                        }
                        #pragma omp section
                        {
                                Arrays inputCopy;
                                inputCopy.hostClone(inputSliceCPU);
                                for(size_t it = 0; it<iterations; it++){
                                        if(it%2==0) this->runTBB(inputCopy, outputSliceCPU, numThreads);
                                        else this->runTBB(outputSliceCPU, inputCopy, numThreads);
                                        //std::swap(input,output);
                                }
                                if((iterations%2)==0) outputSliceCPU.hostMemCopy(inputCopy);
                                inputCopy.hostFree();
                        }
                }
                if((iterations%2)==1)
                        outputSliceGPU.copyToHost();
                else outputSliceGPU.copyFromDevice(inputSliceGPU);
                inputSliceGPU.deviceFree();
                outputSliceGPU.deviceFree();
        }
}
*/
//*******************************************************************************************
// Map 3D
//*******************************************************************************************

template<class Arrays, class Args>
Map3D<Arrays,Args>::Map3D(){}

template<class Arrays, class Args>
Map3D<Arrays,Args>::Map3D(Arrays input, Arrays output, Args args){
        this->input = input;
        this->output = output;
        this->args = args;
}

template<class Arrays, class Args>
void Map3D<Arrays,Args>::runCUDA(Arrays in, Arrays out, int blockSize){
        dim3 DimBlock(blockSize, blockSize, 1);
        dim3 DimGrid((in.getWidth() - 1)/blockSize + 1, (in.getHeight() - 1)/blockSize + 1, in.getDepth());

        mapCU3D<<<DimGrid, DimBlock>>>(in, out, this->args);
        gpuErrchk( cudaPeekAtLastError() );
        gpuErrchk( cudaDeviceSynchronize() );
}
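// Grid sizing uses the ceiling-division identity ceil(n/b) == (n-1)/b + 1 for
// integer n > 0, so enough (possibly partially filled) blocks are launched and
// the bounds checks in the kernels above discard the out-of-range threads.
// For example, width 1000 with blockSize 32 gives (1000-1)/32 + 1 = 32 blocks.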

template<class Arrays, class Args>
void Map3D<Arrays,Args>::runSeq(Arrays in, Arrays out){
        for(int h = 0; h<in.getHeight(); ++h){
        for(int w = 0; w<in.getWidth(); ++w){
        for(int d = 0; d<in.getDepth(); ++d){
                mapKernel(in, out, this->args, h, w, d);
        }}}
}

template<class Arrays, class Args>
void Map3D<Arrays,Args>::runOpenMP(Arrays in, Arrays out, size_t numThreads){
        omp_set_num_threads(numThreads);
        #pragma omp parallel for
        for(int h = 0; h<in.getHeight(); ++h){
        for(int w = 0; w<in.getWidth(); ++w){
        for(int d = 0; d<in.getDepth(); ++d){
                mapKernel(in, out, this->args, h, w, d);
        }}}
}

#ifdef PSKEL_TBB
template<class Arrays, class Args>
struct TBBMap3D{
        Arrays input;
        Arrays output;
        Args args;
        TBBMap3D(Arrays input, Arrays output, Args args){
                this->input = input;
                this->output = output;
                this->args = args;
        }
        void operator()(const tbb::blocked_range<int>& r) const{
                for(int h = r.begin(); h!=r.end(); h++){
                for(int w = 0; w<this->input.getWidth(); ++w){
                for(int d = 0; d<this->input.getDepth(); ++d){
                        mapKernel(this->input, this->output, this->args, h, w, d);
                }}}
        }
};

template<class Arrays, class Args>
void Map3D<Arrays, Args>::runTBB(Arrays in, Arrays out, size_t numThreads){
        TBBMap3D<Arrays, Args> tbbmap(in, out, this->args);
        tbb::task_scheduler_init init(numThreads);
        tbb::parallel_for(tbb::blocked_range<int>(0, in.getHeight()), tbbmap);
}
#endif
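// Note: tbb::task_scheduler_init is deprecated and was removed in oneTBB. A
// sketch of the equivalent thread cap on oneTBB (an assumption, not part of
// this library) would be:
//
//   #include <tbb/global_control.h>
//   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, numThreads);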

//*******************************************************************************************
// Map 2D
//*******************************************************************************************

template<class Arrays, class Args>
Map2D<Arrays,Args>::Map2D(){}

template<class Arrays, class Args>
Map2D<Arrays,Args>::Map2D(Arrays input, Arrays output, Args args){
        this->input = input;
        this->output = output;
        this->args = args;
}

template<class Arrays, class Args>
void Map2D<Arrays,Args>::runCUDA(Arrays in, Arrays out, int blockSize){
        dim3 DimBlock(blockSize, blockSize, 1);
        dim3 DimGrid((in.getWidth() - 1)/blockSize + 1, (in.getHeight() - 1)/blockSize + 1, 1);

        mapCU2D<<<DimGrid, DimBlock>>>(in, out, this->args);
        gpuErrchk( cudaPeekAtLastError() );
        gpuErrchk( cudaDeviceSynchronize() );
        //gpuErrchk( cudaGetLastError() );
}

template<class Arrays, class Args>
void Map2D<Arrays,Args>::runSeq(Arrays in, Arrays out){
        for(int h = 0; h < in.getHeight(); h++){
        for(int w = 0; w < in.getWidth(); w++){
                mapKernel(in, out, this->args, h, w);
        }}
}

template<class Arrays, class Args>
void Map2D<Arrays,Args>::runOpenMP(Arrays in, Arrays out, size_t numThreads){
        omp_set_num_threads(numThreads);
        #pragma omp parallel for
        for(int h = 0; h < in.getHeight(); h++){
        for(int w = 0; w < in.getWidth(); w++){
                mapKernel(in, out, this->args, h, w);
        }}
}

#ifdef PSKEL_TBB
template<class Arrays, class Args>
struct TBBMap2D{
        Arrays input;
        Arrays output;
        Args args;
        TBBMap2D(Arrays input, Arrays output, Args args){
                this->input = input;
                this->output = output;
                this->args = args;
        }
        void operator()(const tbb::blocked_range<int>& r) const{
                for(int h = r.begin(); h != r.end(); h++){
                for(int w = 0; w < this->input.getWidth(); w++){
                        mapKernel(this->input, this->output, this->args, h, w);
                }}
        }
};

template<class Arrays, class Args>
void Map2D<Arrays, Args>::runTBB(Arrays in, Arrays out, size_t numThreads){
        TBBMap2D<Arrays, Args> tbbmap(in, out, this->args);
        tbb::task_scheduler_init init(numThreads);
        tbb::parallel_for(tbb::blocked_range<int>(0, in.getHeight()), tbbmap);
}
#endif

//*******************************************************************************************
// Map 1D
//*******************************************************************************************

template<class Arrays, class Args>
Map<Arrays,Args>::Map(){}

template<class Arrays, class Args>
Map<Arrays,Args>::Map(Arrays input, Arrays output, Args args){
        this->input = input;
        this->output = output;
        this->args = args;
}

template<class Arrays, class Args>
void Map<Arrays,Args>::runCUDA(Arrays in, Arrays out, int blockSize){
        dim3 DimBlock(blockSize, 1, 1);
        dim3 DimGrid((in.getWidth() - 1)/blockSize + 1, 1, 1);

        mapCU<<<DimGrid, DimBlock>>>(in, out, this->args);
        gpuErrchk( cudaPeekAtLastError() );
        gpuErrchk( cudaDeviceSynchronize() );
}

template<class Arrays, class Args>
void Map<Arrays,Args>::runSeq(Arrays in, Arrays out){
        for(int i = 0; i < in.getWidth(); i++){
                mapKernel(in, out, this->args, i);
        }
}

template<class Arrays, class Args>
void Map<Arrays,Args>::runOpenMP(Arrays in, Arrays out, size_t numThreads){
        omp_set_num_threads(numThreads);
        #pragma omp parallel for
        for(int i = 0; i < in.getWidth(); i++){
                mapKernel(in, out, this->args, i);
        }
}

#ifdef PSKEL_TBB
template<class Arrays, class Args>
struct TBBMap{
        Arrays input;
        Arrays output;
        Args args;
        TBBMap(Arrays input, Arrays output, Args args){
                this->input = input;
                this->output = output;
                this->args = args;
        }
        void operator()(const tbb::blocked_range<int>& r) const{
                for(int i = r.begin(); i != r.end(); i++){
                        mapKernel(this->input, this->output, this->args, i);
                }
        }
};

template<class Arrays, class Args>
void Map<Arrays, Args>::runTBB(Arrays in, Arrays out, size_t numThreads){
        TBBMap<Arrays, Args> tbbmap(in, out, this->args);
        tbb::task_scheduler_init init(numThreads);
        tbb::parallel_for(tbb::blocked_range<int>(0, in.getWidth()), tbbmap);
}
#endif
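// End-to-end sketch (illustrative; the 1D accessor in(i), the Array
// constructor, and a suitable user-defined mapKernel overload, as in the 2D
// example above, are assumptions):
//
//   Array<float> in(8), out(8);
//   for(int i = 0; i < 8; i++) in(i) = float(i);
//   ScaleArgs args; args.factor = 2.0f;
//   Map<Array<float>, ScaleArgs> m(in, out, args);
//   m.runSequential();        // single-threaded reference run
//   m.runCPU(4);              // TBB if PSKEL_TBB is defined, otherwise OpenMP
//   m.runIterativeGPU(10, 0); // 10 ping-pong iterations; warp-size blocks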

} // end namespace PSkel

#endif