PSkel
//-------------------------------------------------------------------------------
// Copyright (c) 2015, ICEI - PUC Minas
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its contributors
// may be used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//-------------------------------------------------------------------------------

#ifndef PSKEL_MAP_HPP
#define PSKEL_MAP_HPP

#include <algorithm>
#include <cmath>
#include <omp.h>

#ifdef PSKEL_TBB
#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <tbb/task_scheduler_init.h>
#endif

namespace PSkel{

//********************************************************************************************
// CUDA kernels. Each one calls the kernel implemented by the user.
//********************************************************************************************

template<typename T1, class Args>
__global__ void mapCU(Array<T1> input, Array<T1> output, Args args);

template<typename T1, class Args>
__global__ void mapCU2D(Array2D<T1> input, Array2D<T1> output, Args args);

template<typename T1, class Args>
__global__ void mapCU3D(Array3D<T1> input, Array3D<T1> output, Args args);
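// NOTE: mapKernel is not defined by PSkel itself; the user supplies it and it is
// resolved when these templates are instantiated. A minimal sketch of the expected
// overloads, inferred from the call sites in this file (the __device__ __host__
// qualifiers are an assumption about how the user code is compiled with nvcc, not
// something this header enforces; T and UserArgs stand for the concrete element and
// argument types chosen by the user):
//
//     __device__ __host__ void mapKernel(Array<T> input, Array<T> output,
//                                        UserArgs args, size_t i);
//     __device__ __host__ void mapKernel(Array2D<T> input, Array2D<T> output,
//                                        UserArgs args, size_t h, size_t w);
//     __device__ __host__ void mapKernel(Array3D<T> input, Array3D<T> output,
//                                        UserArgs args, size_t h, size_t w, size_t d);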
//********************************************************************************************
// CUDA kernels. Each one calls the kernel implemented by the user.
//********************************************************************************************

template<typename T1, class Args>
__global__ void mapCU(Array<T1> input, Array<T1> output, Args args){
    size_t i = blockIdx.x*blockDim.x+threadIdx.x;

    if(i<input.getWidth()){
        mapKernel(input, output, args, i);
    }
}

template<typename T1, class Args>
__global__ void mapCU2D(Array2D<T1> input, Array2D<T1> output, Args args){
    size_t w = blockIdx.x*blockDim.x+threadIdx.x;
    size_t h = blockIdx.y*blockDim.y+threadIdx.y;

    if(w<input.getWidth() && h<input.getHeight()){
        mapKernel(input, output, args, h, w);
    }
}

template<typename T1, class Args>
__global__ void mapCU3D(Array3D<T1> input, Array3D<T1> output, Args args){
    size_t w = blockIdx.x*blockDim.x+threadIdx.x;
    size_t h = blockIdx.y*blockDim.y+threadIdx.y;
    size_t d = blockIdx.z*blockDim.z+threadIdx.z;

    if(w<input.getWidth() && h<input.getHeight() && d<input.getDepth()){
        mapKernel(input, output, args, h, w, d);
    }
}

//*******************************************************************************************
// Map Base
//*******************************************************************************************

template<class Arrays, class Args>
void MapBase<Arrays, Args>::runSequential(){
    this->runSeq(this->input, this->output);
}

template<class Arrays, class Args>
void MapBase<Arrays, Args>::runCPU(size_t numThreads){
    numThreads = (numThreads==0)?omp_get_num_procs():numThreads;
    #ifdef PSKEL_TBB
    this->runTBB(this->input, this->output, numThreads);
    #else
    this->runOpenMP(this->input, this->output, numThreads);
    #endif
}

template<class Arrays, class Args>
void MapBase<Arrays, Args>::runGPU(size_t blockSize){
    if(blockSize==0){
        int device;
        cudaGetDevice(&device);
        cudaDeviceProp deviceProperties;
        cudaGetDeviceProperties(&deviceProperties, device);
        blockSize = deviceProperties.warpSize;
    }
    input.deviceAlloc();
    output.deviceAlloc();
    input.copyToDevice();
    this->runCUDA(this->input, this->output, blockSize);
    output.copyToHost();
    input.deviceFree();
    output.deviceFree();
}
/*
template<class Arrays, class Args>
void MapBase<Arrays, Args>::runHybrid(float GPUPartition, size_t GPUBlockSize, size_t numThreads){
    if(GPUPartition==0.0){
        runCPU(numThreads);
    }else if(GPUPartition==1.0){
        runGPU(GPUBlockSize);
    }else{
        Arrays inputSliceGPU;
        Arrays outputSliceGPU;
        Arrays inputSliceCPU;
        Arrays outputSliceCPU;
        if(input.getHeight()>1){
            size_t GPUHeight = size_t(this->input.getHeight()*GPUPartition);
            inputSliceGPU.hostSlice(this->input, 0, 0, 0, this->input.getWidth(), GPUHeight, this->input.getDepth());
            outputSliceGPU.hostSlice(this->output, 0, 0, 0, this->output.getWidth(), GPUHeight, this->output.getDepth());
            inputSliceCPU.hostSlice(this->input, 0, GPUHeight, 0, this->input.getWidth(), this->input.getHeight()-GPUHeight, this->input.getDepth());
            outputSliceCPU.hostSlice(this->output, 0, GPUHeight, 0, this->output.getWidth(), this->output.getHeight()-GPUHeight, this->output.getDepth());
        }else{
            size_t GPUWidth = size_t(this->input.getWidth()*GPUPartition);
            inputSliceGPU.hostSlice(this->input, 0, 0, 0, GPUWidth, this->input.getHeight(), this->input.getDepth());
            outputSliceGPU.hostSlice(this->output, 0, 0, 0, GPUWidth, this->output.getHeight(), this->output.getDepth());
            inputSliceCPU.hostSlice(this->input, GPUWidth, 0, 0, this->input.getWidth()-GPUWidth, this->input.getHeight(), this->input.getDepth());
            outputSliceCPU.hostSlice(this->output, GPUWidth, 0, 0, this->output.getWidth()-GPUWidth, this->output.getHeight(), this->output.getDepth());
        }
        omp_set_num_threads(2);

        #pragma omp parallel sections
        {
            #pragma omp section
            {
                inputSliceGPU.deviceAlloc();
                inputSliceGPU.copyToDevice();
                outputSliceGPU.deviceAlloc();
                this->runCUDA(inputSliceGPU, outputSliceGPU, GPUBlockSize);
                outputSliceGPU.copyToHost();

                inputSliceGPU.deviceFree();
                outputSliceGPU.deviceFree();
            }
            #pragma omp section
            {
                this->runTBB(inputSliceCPU, outputSliceCPU, numThreads);
            }
        }
    }
}
*/
template<class Arrays, class Args>
void MapBase<Arrays, Args>::runIterativeSequential(size_t iterations){
    Arrays inputCopy;
    inputCopy.hostClone(input);
    for(size_t it = 0; it<iterations; it++){
        if(it%2==0) this->runSeq(inputCopy, this->output);
        else this->runSeq(this->output, inputCopy);
    }
    if((iterations%2)==0) output.hostMemCopy(inputCopy);
    inputCopy.hostFree();
}

template<class Arrays, class Args>
void MapBase<Arrays, Args>::runIterativeCPU(size_t iterations, size_t numThreads){
    numThreads = (numThreads==0)?omp_get_num_procs():numThreads;
    Arrays inputCopy;
    inputCopy.hostClone(input);
    for(size_t it = 0; it<iterations; it++){
        if(it%2==0){
            #ifdef PSKEL_TBB
            this->runTBB(inputCopy, this->output, numThreads);
            #else
            this->runOpenMP(inputCopy, this->output, numThreads);
            #endif
        }else{
            #ifdef PSKEL_TBB
            this->runTBB(this->output, inputCopy, numThreads);
            #else
            this->runOpenMP(this->output, inputCopy, numThreads);
            #endif
        }
    }
    if((iterations%2)==0) output.hostMemCopy(inputCopy);
    inputCopy.hostFree();
}

template<class Arrays, class Args>
void MapBase<Arrays, Args>::runIterativeGPU(size_t iterations, size_t blockSize){
    if(blockSize==0){
        int device;
        cudaGetDevice(&device);
        cudaDeviceProp deviceProperties;
        cudaGetDeviceProperties(&deviceProperties, device);
        blockSize = deviceProperties.warpSize;
    }
    input.deviceAlloc();
    input.copyToDevice();
    output.deviceAlloc();
    for(size_t it = 0; it<iterations; it++){
        if((it%2)==0)
            this->runCUDA(this->input, this->output, blockSize);
        else this->runCUDA(this->output, this->input, blockSize);
    }
    if((iterations%2)==1)
        output.copyToHost();
    else output.copyFromDevice(input);
    input.deviceFree();
    output.deviceFree();
}
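// Note on the runIterative* methods above: they ping-pong between the two buffers,
// mapping input into output on even iterations and back on odd ones; the final copy
// (hostMemCopy / copyToHost / copyFromDevice) guarantees that the result always ends
// up in the output array regardless of the iteration count. A hypothetical call, with
// names that are illustrative only, assuming a Map2D instance built from user-allocated
// arrays:
//
//     myMap.runIterativeGPU(50, 0);  // 50 applications of mapKernel, block size = warp size
//     // the output array now holds the result; no manual buffer swapping is needed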
/*
template<class Arrays, class Args>
void MapBase<Arrays, Args>::runIterativeHybrid(size_t iterations, float GPUPartition, size_t GPUBlockSize, size_t numThreads){
    if(GPUPartition==0.0){
        runIterativeCPU(iterations, numThreads);
    }else if(GPUPartition==1.0){
        runIterativeGPU(iterations, GPUBlockSize);
    }else{
        Arrays inputSliceGPU;
        Arrays outputSliceGPU;
        Arrays inputSliceCPU;
        Arrays outputSliceCPU;
        if(input.getHeight()>1){
            size_t GPUHeight = size_t(this->input.getHeight()*GPUPartition);
            inputSliceGPU.hostSlice(this->input, 0, 0, 0, this->input.getWidth(), GPUHeight, this->input.getDepth());
            outputSliceGPU.hostSlice(this->output, 0, 0, 0, this->output.getWidth(), GPUHeight, this->output.getDepth());
            inputSliceCPU.hostSlice(this->input, 0, GPUHeight, 0, this->input.getWidth(), this->input.getHeight()-GPUHeight, this->input.getDepth());
            outputSliceCPU.hostSlice(this->output, 0, GPUHeight, 0, this->output.getWidth(), this->output.getHeight()-GPUHeight, this->output.getDepth());
        }else{
            size_t GPUWidth = size_t(this->input.getWidth()*GPUPartition);
            inputSliceGPU.hostSlice(this->input, 0, 0, 0, GPUWidth, this->input.getHeight(), this->input.getDepth());
            outputSliceGPU.hostSlice(this->output, 0, 0, 0, GPUWidth, this->output.getHeight(), this->output.getDepth());
            inputSliceCPU.hostSlice(this->input, GPUWidth, 0, 0, this->input.getWidth()-GPUWidth, this->input.getHeight(), this->input.getDepth());
            outputSliceCPU.hostSlice(this->output, GPUWidth, 0, 0, this->output.getWidth()-GPUWidth, this->output.getHeight(), this->output.getDepth());
        }

        omp_set_num_threads(2);

        #pragma omp parallel sections
        {
            #pragma omp section
            {
                inputSliceGPU.deviceAlloc();
                inputSliceGPU.copyToDevice();
                outputSliceGPU.deviceAlloc();
                for(size_t it = 0; it<iterations; it++){
                    if((it%2)==0)
                        this->runCUDA(inputSliceGPU, outputSliceGPU, GPUBlockSize);
                    else this->runCUDA(outputSliceGPU, inputSliceGPU, GPUBlockSize);
                }
                //outputSliceGPU.copyToHost();

                //outputSliceGPU.deviceFree();
            }
            #pragma omp section
            {
                Arrays inputCopy;
                inputCopy.hostClone(inputSliceCPU);
                for(size_t it = 0; it<iterations; it++){
                    if(it%2==0) this->runTBB(inputCopy, outputSliceCPU, numThreads);
                    else this->runTBB(outputSliceCPU, inputCopy, numThreads);
                    //std::swap(input,output);
                }
                if((iterations%2)==0) outputSliceCPU.hostMemCopy(inputCopy);
                inputCopy.hostFree();
            }
        }
        if((iterations%2)==1)
            outputSliceGPU.copyToHost();
        else outputSliceGPU.copyFromDevice(inputSliceGPU);
        inputSliceGPU.deviceFree();
        outputSliceGPU.deviceFree();
    }
}
*/
//*******************************************************************************************
// Map 3D
//*******************************************************************************************

template<class Arrays, class Args>
Map3D<Arrays,Args>::Map3D(){}

template<class Arrays, class Args>
Map3D<Arrays,Args>::Map3D(Arrays input, Arrays output, Args args){
    this->input = input;
    this->output = output;
    this->args = args;
}

template<class Arrays, class Args>
void Map3D<Arrays,Args>::runCUDA(Arrays in, Arrays out, int blockSize){
    dim3 DimBlock(blockSize, blockSize, 1);
    dim3 DimGrid((in.getWidth() - 1)/blockSize + 1, (in.getHeight() - 1)/blockSize + 1, in.getDepth());

    mapCU3D<<<DimGrid, DimBlock>>>(in, out, this->args);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}
template<class Arrays, class Args>
void Map3D<Arrays,Args>::runSeq(Arrays in, Arrays out){
    for(int h = 0; h<in.getHeight(); ++h){
        for(int w = 0; w<in.getWidth(); ++w){
            for(int d = 0; d<in.getDepth(); ++d){
                mapKernel(in, out, this->args, h, w, d);
    }}}
}

template<class Arrays, class Args>
void Map3D<Arrays,Args>::runOpenMP(Arrays in, Arrays out, size_t numThreads){
    omp_set_num_threads(numThreads);
    #pragma omp parallel for
    for(int h = 0; h<in.getHeight(); ++h){
        for(int w = 0; w<in.getWidth(); ++w){
            for(int d = 0; d<in.getDepth(); ++d){
                mapKernel(in, out, this->args, h, w, d);
    }}}
}

#ifdef PSKEL_TBB
template<class Arrays, class Args>
struct TBBMap3D{
    Arrays input;
    Arrays output;
    Args args;
    TBBMap3D(Arrays input, Arrays output, Args args){
        this->input = input;
        this->output = output;
        this->args = args;
    }
    void operator()(tbb::blocked_range<int> r) const{
        for(int h = r.begin(); h!=r.end(); h++){
            for(int w = 0; w<this->input.getWidth(); ++w){
                for(int d = 0; d<this->input.getDepth(); ++d){
                    mapKernel(this->input, this->output, this->args, h, w, d);
        }}}
    }
};

template<class Arrays, class Args>
void Map3D<Arrays, Args>::runTBB(Arrays in, Arrays out, size_t numThreads){
    TBBMap3D<Arrays, Args> tbbmap(in, out, this->args);
    tbb::task_scheduler_init init(numThreads);
    tbb::parallel_for(tbb::blocked_range<int>(0, in.getHeight()), tbbmap);
}
#endif

//*******************************************************************************************
// Map 2D
//*******************************************************************************************

template<class Arrays, class Args>
Map2D<Arrays,Args>::Map2D(){}

template<class Arrays, class Args>
Map2D<Arrays,Args>::Map2D(Arrays input, Arrays output, Args args){
    this->input = input;
    this->output = output;
    this->args = args;
}

template<class Arrays, class Args>
void Map2D<Arrays,Args>::runCUDA(Arrays in, Arrays out, int blockSize){
    dim3 DimBlock(blockSize, blockSize, 1);
    dim3 DimGrid((in.getWidth() - 1)/blockSize + 1, (in.getHeight() - 1)/blockSize + 1, 1);

    mapCU2D<<<DimGrid, DimBlock>>>(in, out, this->args);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    //gpuErrchk( cudaGetLastError() );
}

template<class Arrays, class Args>
void Map2D<Arrays,Args>::runSeq(Arrays in, Arrays out){
    for(int h = 0; h < in.getHeight(); h++){
        for(int w = 0; w < in.getWidth(); w++){
            mapKernel(in, out, this->args, h, w);
    }}
}

template<class Arrays, class Args>
void Map2D<Arrays,Args>::runOpenMP(Arrays in, Arrays out, size_t numThreads){
    omp_set_num_threads(numThreads);
    #pragma omp parallel for
    for(int h = 0; h < in.getHeight(); h++){
        for(int w = 0; w < in.getWidth(); w++){
            mapKernel(in, out, this->args, h, w);
    }}
}
#ifdef PSKEL_TBB
template<class Arrays, class Args>
struct TBBMap2D{
    Arrays input;
    Arrays output;
    Args args;
    TBBMap2D(Arrays input, Arrays output, Args args){
        this->input = input;
        this->output = output;
        this->args = args;
    }
    void operator()(tbb::blocked_range<int> r) const{
        for(int h = r.begin(); h != r.end(); h++){
            for(int w = 0; w < this->input.getWidth(); w++){
                mapKernel(this->input, this->output, this->args, h, w);
        }}
    }
};

template<class Arrays, class Args>
void Map2D<Arrays, Args>::runTBB(Arrays in, Arrays out, size_t numThreads){
    TBBMap2D<Arrays, Args> tbbmap(in, out, this->args);
    tbb::task_scheduler_init init(numThreads);
    tbb::parallel_for(tbb::blocked_range<int>(0, in.getHeight()), tbbmap);
}
#endif

//*******************************************************************************************
// Map 1D
//*******************************************************************************************

template<class Arrays, class Args>
Map<Arrays,Args>::Map(){}

template<class Arrays, class Args>
Map<Arrays,Args>::Map(Arrays input, Arrays output, Args args){
    this->input = input;
    this->output = output;
    this->args = args;
}

template<class Arrays, class Args>
void Map<Arrays,Args>::runCUDA(Arrays in, Arrays out, int blockSize){
    dim3 DimBlock(blockSize, 1, 1);
    dim3 DimGrid((in.getWidth() - 1)/blockSize + 1, 1, 1);

    mapCU<<<DimGrid, DimBlock>>>(in, out, this->args);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
}

template<class Arrays, class Args>
void Map<Arrays,Args>::runSeq(Arrays in, Arrays out){
    for(int i = 0; i < in.getWidth(); i++){
        mapKernel(in, out, this->args, i);
    }
}

template<class Arrays, class Args>
void Map<Arrays,Args>::runOpenMP(Arrays in, Arrays out, size_t numThreads){
    omp_set_num_threads(numThreads);
    #pragma omp parallel for
    for(int i = 0; i < in.getWidth(); i++){
        mapKernel(in, out, this->args, i);
    }
}

#ifdef PSKEL_TBB
template<class Arrays, class Args>
struct TBBMap{
    Arrays input;
    Arrays output;
    Args args;
    TBBMap(Arrays input, Arrays output, Args args){
        this->input = input;
        this->output = output;
        this->args = args;
    }
    void operator()(tbb::blocked_range<int> r) const{
        for(int i = r.begin(); i != r.end(); i++){
            mapKernel(this->input, this->output, this->args, i);
        }
    }
};

template<class Arrays, class Args>
void Map<Arrays, Args>::runTBB(Arrays in, Arrays out, size_t numThreads){
    TBBMap<Arrays, Args> tbbmap(in, out, this->args);
    tbb::task_scheduler_init init(numThreads);
    tbb::parallel_for(tbb::blocked_range<int>(0, in.getWidth()), tbbmap);
}
#endif

}//end namespace

#endif
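Usage sketch (not part of the header above). The snippet below shows how a 2D map might be instantiated and run. Only the mapKernel signature and the Map2D, runCPU, and runGPU calls are grounded in this file; the master header name ("pskel.h"), the PSKEL_CUDA macro, the Array2D (width, height) constructor, and its operator()(h, w) element access are assumptions about the wider PSkel API and may need adjusting to the actual headers in your checkout. Compile with nvcc so the kernel is usable on both the host and device paths.

#define PSKEL_CUDA                  // assumption: back-end selection macro checked by the master header
#include "pskel.h"                  // assumption: umbrella header that pulls in pskel_map.hpp and the Array types

struct ScaleArgs{ float factor; };  // argument object forwarded unchanged to every kernel call

namespace PSkel{
// User-supplied map kernel; this header calls it as mapKernel(in, out, args, h, w)
// from mapCU2D, runSeq, runOpenMP, and the TBB functor.
__device__ __host__
void mapKernel(Array2D<float> input, Array2D<float> output, ScaleArgs args,
               size_t h, size_t w){
    output(h, w) = args.factor * input(h, w);   // assumption: operator()(h, w) element access
}
}

int main(){
    size_t width = 1024, height = 1024;
    PSkel::Array2D<float> inputGrid(width, height);    // assumption: (width, height) constructor allocates host memory
    PSkel::Array2D<float> outputGrid(width, height);
    for(size_t h = 0; h < height; h++)
        for(size_t w = 0; w < width; w++)
            inputGrid(h, w) = 1.0f;

    ScaleArgs args; args.factor = 2.0f;
    PSkel::Map2D<PSkel::Array2D<float>, ScaleArgs> scaleMap(inputGrid, outputGrid, args);

    scaleMap.runCPU(0);   // 0 => one thread per core (omp_get_num_procs())
    scaleMap.runGPU(0);   // 0 => block size defaults to the device warp size
    return 0;
}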