PSkel
|
00001 //------------------------------------------------------------------------------- 00002 // Copyright (c) 2015, ICEI - PUC Minas 00003 // All rights reserved. 00004 // 00005 // Redistribution and use in source and binary forms, with or without 00006 // modification, are permitted provided that the following conditions are met: 00007 // 00008 // 1. Redistributions of source code must retain the above copyright notice, this 00009 // list of conditions and the following disclaimer. 00010 // 00011 // 2. Redistributions in binary form must reproduce the above copyright notice, 00012 // this list of conditions and the following disclaimer in the documentation 00013 // and/or other materials provided with the distribution. 00014 // 00015 // 3. Neither the name of the copyright holder nor the names of its contributors 00016 // may be used to endorse or promote products derived from this software without 00017 // specific prior written permission. 00018 // 00019 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 00020 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00021 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 00022 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 00023 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00024 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 00025 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00026 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 00027 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00028 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//-------------------------------------------------------------------------------

#ifndef PSKEL_STENCIL_HPP
#define PSKEL_STENCIL_HPP

#include <cmath>
#include <algorithm>
#include <iostream>

using namespace std;

#include <ga/ga.h>
#include <ga/std_stream.h>

namespace PSkel{

//********************************************************************************************
// CUDA kernels. Each one is a thin wrapper that invokes the user-implemented
// stencilKernel() on the elements of the current tile.
//********************************************************************************************

template<typename T1, typename T2, class Args>
__global__ void stencilTilingCU(Array<T1> input,Array<T1> output,Mask<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth);

template<typename T1, typename T2, class Args>
__global__ void stencilTilingCU(Array2D<T1> input,Array2D<T1> output,Mask2D<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth);

template<typename T1, typename T2, class Args>
__global__ void stencilTilingCU(Array3D<T1> input,Array3D<T1> output,Mask3D<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth);


//********************************************************************************************
// CUDA kernel definitions.
//********************************************************************************************

/// 1D tiled stencil kernel: one thread per element; only threads whose global
/// index falls inside [widthOffset, widthOffset+tilingWidth) compute.
template<typename T1, typename T2, class Args>
__global__ void stencilTilingCU(Array<T1> input,Array<T1> output,Mask<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth){
	size_t i = blockIdx.x*blockDim.x+threadIdx.x;
	#ifdef PSKEL_SHARED_MASK
	// Stage the mask in shared memory (one copy per block) before computing.
	extern __shared__ int shared[];
	if(threadIdx.x<(mask.size*mask.dimension))
		shared[threadIdx.x] = mask.deviceMask[threadIdx.x];
	__syncthreads();
	mask.deviceMask = shared;
	#endif
	if(i>=widthOffset && i<(widthOffset+tilingWidth)){
		stencilKernel(input, output, mask, args, i);
	}
}

/// 2D tiled stencil kernel: a 2D thread grid; each in-tile (h,w) position is
/// handed to the user's stencilKernel.
template<typename T1, typename T2, class Args>
__global__ void stencilTilingCU(Array2D<T1> input,Array2D<T1> output,Mask2D<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth){
	size_t w = blockIdx.x*blockDim.x+threadIdx.x;
	size_t h = blockIdx.y*blockDim.y+threadIdx.y;
	#ifdef PSKEL_SHARED_MASK
	// Stage the mask in shared memory (one copy per block) before computing.
	extern __shared__ int shared[];
	if(threadIdx.x<(mask.size*mask.dimension))
		shared[threadIdx.x] = mask.deviceMask[threadIdx.x];
	__syncthreads();
	mask.deviceMask = shared;
	#endif
	if(w>=widthOffset && w<(widthOffset+tilingWidth) && h>=heightOffset && h<(heightOffset+tilingHeight) ){
		stencilKernel(input, output, mask, args, h, w);
	}
}

/// 3D tiled stencil kernel: a 3D thread grid; each in-tile (h,w,d) position is
/// handed to the user's stencilKernel.
template<typename T1, typename T2, class Args>
__global__ void stencilTilingCU(Array3D<T1> input,Array3D<T1> output,Mask3D<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth){
	size_t w = blockIdx.x*blockDim.x+threadIdx.x;
	size_t h = blockIdx.y*blockDim.y+threadIdx.y;
	size_t d = blockIdx.z*blockDim.z+threadIdx.z;
	#ifdef PSKEL_SHARED_MASK
	// Stage the mask in shared memory (one copy per block) before computing.
	extern __shared__ int shared[];
	if(threadIdx.x<(mask.size*mask.dimension))
		shared[threadIdx.x] = mask.deviceMask[threadIdx.x];
	__syncthreads();
	mask.deviceMask = shared;
	#endif

	if(w>=widthOffset && w<(widthOffset+tilingWidth) && h>=heightOffset && h<(heightOffset+tilingHeight) && d>=depthOffset && d<(depthOffset+tilingDepth) ){
		stencilKernel(input, output, mask, args, h, w, d);
	}
}

//*******************************************************************************************
// Stencil Base
//*******************************************************************************************

/// Runs the stencil once, sequentially on the host.
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runSequential(){
	this->runSeq(this->input, this->output);
}

/// Runs the stencil once on the CPU with `numThreads` threads
/// (0 = one thread per processor). Uses TBB when PSKEL_TBB is defined,
/// otherwise OpenMP.
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runCPU(size_t numThreads){
	numThreads = (numThreads==0)?omp_get_num_procs():numThreads;
	#ifdef PSKEL_TBB
		this->runTBB(this->input, this->output, numThreads);
	#else
		this->runOpenMP(this->input, this->output, numThreads);
	#endif
}

/// Runs the stencil once on the GPU, copying input/mask in and output back.
/// GPUBlockSize==0 selects the device warp size as the block edge.
/// (Occupancy-based block-size selection via cudaOccupancyMaxPotentialBlockSize
/// was prototyped here and left as future work.)
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runGPU(size_t GPUBlockSize){
	if(GPUBlockSize==0){
		int device;
		cudaGetDevice(&device);
		cudaDeviceProp deviceProperties;
		cudaGetDeviceProperties(&deviceProperties, device);
		GPUBlockSize = deviceProperties.warpSize;
	}
	input.deviceAlloc();
	output.deviceAlloc();
	mask.deviceAlloc();
	mask.copyToDevice();
	input.copyToDevice();
	this->runCUDA(this->input, this->output, GPUBlockSize);
	output.copyToHost();
	input.deviceFree();
	output.deviceFree();
	mask.deviceFree();
}

/// Runs the stencil once on the GPU, partitioning the domain into tiles of
/// size tilingWidth x tilingHeight x tilingDepth so arrays larger than GPU
/// memory can be processed tile by tile.
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runTilingGPU(size_t tilingWidth, size_t tilingHeight, size_t tilingDepth, size_t GPUBlockSize){
	if(GPUBlockSize==0){
		int device;
		cudaGetDevice(&device);
		cudaDeviceProp deviceProperties;
		cudaGetDeviceProperties(&deviceProperties, device);
		GPUBlockSize = deviceProperties.warpSize;
	}
	// Number of tiles along each dimension (last tile may be partial).
	size_t wTiling = ceil(float(this->input.getWidth())/float(tilingWidth));
	size_t hTiling = ceil(float(this->input.getHeight())/float(tilingHeight));
	size_t dTiling = ceil(float(this->input.getDepth())/float(tilingDepth));
	mask.deviceAlloc();
	mask.copyToDevice();
	StencilTiling<Array, Mask> tiling(input, output, mask);
	Array inputTile;
	Array outputTile;
	Array tmp;
	for(size_t ht=0; ht<hTiling; ht++){
	for(size_t wt=0; wt<wTiling; wt++){
	for(size_t dt=0; dt<dTiling; dt++){
		size_t heightOffset = ht*tilingHeight;
		size_t widthOffset = wt*tilingWidth;
		size_t depthOffset = dt*tilingDepth;
		// Compute the halo-extended tile and slice host views of it.
		tiling.tile(1, widthOffset, heightOffset, depthOffset, tilingWidth, tilingHeight, tilingDepth);
		inputTile.hostSlice(tiling.input, tiling.widthOffset, tiling.heightOffset, tiling.depthOffset, tiling.width, tiling.height, tiling.depth);
		outputTile.hostSlice(tiling.output, tiling.widthOffset, tiling.heightOffset, tiling.depthOffset, tiling.width, tiling.height, tiling.depth);
		inputTile.deviceAlloc();
		outputTile.deviceAlloc();
		inputTile.copyToDevice();
		tmp.hostAlloc(tiling.width, tiling.height, tiling.depth);
		// CUDA kernel execution on this tile.
		this->runIterativeTilingCUDA(inputTile, outputTile, tiling, GPUBlockSize);
		tmp.copyFromDevice(outputTile);
		// Copy only the tile core (excluding the halo) back into the output.
		Array coreTmp;
		Array coreOutput;
		coreTmp.hostSlice(tmp, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
		coreOutput.hostSlice(outputTile, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
		coreOutput.hostMemCopy(coreTmp);
		tmp.hostFree();
	}}}
	inputTile.deviceFree();
	outputTile.deviceFree();
	mask.deviceFree();
	cudaDeviceSynchronize();
}

/// Runs the stencil once on the GPU, choosing automatically between a direct
/// run and a tiled run based on free GPU memory; when tiling is needed the
/// tile extent along one dimension is sized to (approximately) fill memory.
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runAutoGPU(size_t GPUBlockSize){
	size_t gpuMemFree, gpuMemTotal;
	cudaMemGetInfo(&gpuMemFree, &gpuMemTotal);
	if((this->input.memSize()+this->output.memSize()+this->mask.memSize())<(0.998*gpuMemFree)){
		// Everything fits on the device: run directly.
		runGPU(GPUBlockSize);
	}else{
		size_t typeSize = this->input.memSize()/this->input.size();
		float div = float(this->input.memSize()+this->output.memSize())/((gpuMemFree-this->mask.memSize())*0.97);
		if(this->input.getHeight()==1){
			// Height-1 data: tile along the width dimension.
			size_t width = floor(float(this->input.getWidth())/div);
			width = (width>0)?width:1;
			// Grow the tile while the memory estimate still fits, then shrink
			// until it fits again.
			// BUGFIX: the original grow loop used '>' (grow while *exceeding*
			// free memory), which never terminates once the estimate is too
			// large; the height branch below uses '<', mirrored here.
			while( (((this->input.getHeight()*this->input.getDepth()+this->output.getHeight()*this->output.getDepth())*(2*this->mask.getRange() + width))*typeSize + this->mask.memSize()) < gpuMemFree*0.998 ){
				width+=2;
			}
			while( (((this->input.getHeight()*this->input.getDepth()+this->output.getHeight()*this->output.getDepth())*(2*this->mask.getRange() + width))*typeSize + this->mask.memSize()) > gpuMemFree*0.998 ){
				width--;
			}
			runTilingGPU(width, this->input.getHeight(), this->input.getDepth(), GPUBlockSize);
		}else{
			// General case: tile along the height dimension.
			size_t height = floor(float(this->input.getHeight())/div);
			height = (height>0)?height:1;
			while( (((this->input.getWidth()*this->input.getDepth()+this->output.getWidth()*this->output.getDepth())*(2*this->mask.getRange() + height))*typeSize + this->mask.memSize()) < gpuMemFree*0.998 ){
				height+=2;
			}
			while( (((this->input.getWidth()*this->input.getDepth()+this->output.getWidth()*this->output.getDepth())*(2*this->mask.getRange() + height))*typeSize + this->mask.memSize()) > gpuMemFree*0.998 ){
				height--;
			}
			runTilingGPU(this->input.getWidth(), height, this->input.getDepth(), GPUBlockSize);
		}
	}
}


/// Runs the stencil iteratively (ping-ponging input/output) on the host,
/// sequentially. After an even iteration count the result lands in the copy,
/// so it is copied back into `output`.
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runIterativeSequential(size_t iterations){
	Array inputCopy;
	inputCopy.hostClone(input);
	for(size_t it = 0; it<iterations; it++){
		if(it%2==0) this->runSeq(inputCopy, this->output);
		else this->runSeq(this->output, inputCopy);
	}
	if((iterations%2)==0) output.hostMemCopy(inputCopy);
	inputCopy.hostFree();
}

/// Runs the stencil iteratively on the CPU with `numThreads` threads
/// (0 = one thread per processor), ping-ponging between the input copy and
/// the output buffer.
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runIterativeCPU(size_t iterations, size_t numThreads){
	numThreads = (numThreads==0)?omp_get_num_procs():numThreads;
	Array inputCopy;
	inputCopy.hostClone(input);
	for(size_t it = 0; it<iterations; it++){
		if(it%2==0){
			#ifdef PSKEL_TBB
				this->runTBB(inputCopy, this->output, numThreads);
			#else
				this->runOpenMP(inputCopy, this->output, numThreads);
			#endif
		}else {
			#ifdef PSKEL_TBB
				this->runTBB(this->output, inputCopy, numThreads);
			#else
				this->runOpenMP(this->output, inputCopy, numThreads);
			#endif
		}
	}
	if((iterations%2)==0) output.hostMemCopy(inputCopy);
	inputCopy.hostFree();
}

/// Runs the stencil iteratively on the GPU, ping-ponging device buffers and
/// transferring data only once in each direction.
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runIterativeGPU(size_t iterations, size_t GPUBlockSize){
	if(GPUBlockSize==0){
		int device;
		cudaGetDevice(&device);
		cudaDeviceProp deviceProperties;
		cudaGetDeviceProperties(&deviceProperties, device);
		GPUBlockSize = deviceProperties.warpSize;
	}
	input.deviceAlloc();
	input.copyToDevice();
	mask.deviceAlloc();
	mask.copyToDevice();
	output.deviceAlloc();
	for(size_t it = 0; it<iterations; it++){
		if((it%2)==0)
			this->runCUDA(this->input, this->output, GPUBlockSize);
		else this->runCUDA(this->output, this->input, GPUBlockSize);
	}
	// After an even iteration count the latest result sits in the input buffer.
	if((iterations%2)==1)
		output.copyToHost();
	else output.copyFromDevice(input);
	input.deviceFree();
	mask.deviceFree();
	output.deviceFree();
}

/// Runs the stencil iteratively on the GPU with tiling: the iteration space is
/// split into outer rounds of at most `innerIterations` inner iterations, and
/// each round processes every (halo-extended) tile on the device.
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runIterativeTilingGPU(size_t iterations, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth, size_t innerIterations, size_t GPUBlockSize){
	if(GPUBlockSize==0){
		int device;
		cudaGetDevice(&device);
		cudaDeviceProp deviceProperties;
		cudaGetDeviceProperties(&deviceProperties, device);
		GPUBlockSize = deviceProperties.warpSize;
	}
	Array inputCopy;
	inputCopy.hostClone(this->input);
	size_t wTiling = ceil(float(this->input.getWidth())/float(tilingWidth));
	size_t hTiling = ceil(float(this->input.getHeight())/float(tilingHeight));
	size_t dTiling = ceil(float(this->input.getDepth())/float(tilingDepth));
	mask.deviceAlloc();
	mask.copyToDevice();
	StencilTiling<Array, Mask> tiling(inputCopy, this->output, this->mask);
	Array inputTile;
	Array outputTile;
	Array tmp;
	size_t outterIterations = ceil(float(iterations)/innerIterations);
	for(size_t it = 0; it<outterIterations; it++){
		// The last round may execute fewer than innerIterations iterations.
		size_t subIterations = innerIterations;
		if(((it+1)*innerIterations)>iterations){
			subIterations = iterations-(it*innerIterations);
		}
		for(size_t ht=0; ht<hTiling; ht++){
		for(size_t wt=0; wt<wTiling; wt++){
		for(size_t dt=0; dt<dTiling; dt++){
			size_t heightOffset = ht*tilingHeight;
			size_t widthOffset = wt*tilingWidth;
			size_t depthOffset = dt*tilingDepth;

			// Compute the halo needed for subIterations iterations and slice views.
			tiling.tile(subIterations, widthOffset, heightOffset, depthOffset, tilingWidth, tilingHeight, tilingDepth);
			inputTile.hostSlice(tiling.input, tiling.widthOffset, tiling.heightOffset, tiling.depthOffset, tiling.width, tiling.height, tiling.depth);
			outputTile.hostSlice(tiling.output, tiling.widthOffset, tiling.heightOffset, tiling.depthOffset, tiling.width, tiling.height, tiling.depth);
			inputTile.deviceAlloc();
			outputTile.deviceAlloc();
			tmp.hostAlloc(tiling.width, tiling.height, tiling.depth);
			// Ping-pong at round granularity: even rounds read from the input
			// copy, odd rounds read from the output.
			if(it%2==0){
				inputTile.copyToDevice();
				this->runIterativeTilingCUDA(inputTile, outputTile, tiling, GPUBlockSize);
				// The device-side result ends up in the buffer determined by
				// the parity of subIterations.
				if(subIterations%2==0){
					tmp.copyFromDevice(inputTile);
				}else{
					tmp.copyFromDevice(outputTile);
				}
				Array coreTmp;
				Array coreOutput;
				coreTmp.hostSlice(tmp, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
				coreOutput.hostSlice(outputTile, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
				coreOutput.hostMemCopy(coreTmp);
				tmp.hostFree();
			}else{
				outputTile.copyToDevice();
				this->runIterativeTilingCUDA(outputTile, inputTile, tiling, GPUBlockSize);
				if(subIterations%2==0){
					tmp.copyFromDevice(outputTile);
				}else{
					tmp.copyFromDevice(inputTile);
				}
				Array coreTmp;
				Array coreInput;
				coreTmp.hostSlice(tmp, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
				coreInput.hostSlice(inputTile, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
				coreInput.hostMemCopy(coreTmp);
				tmp.hostFree();
			}
		}}}
	}
	inputTile.deviceFree();
	outputTile.deviceFree();
	mask.deviceFree();
	cudaDeviceSynchronize();

	// After an even number of rounds the final result is in the input clone.
	if((outterIterations%2)==0) tiling.output.hostMemCopy(tiling.input);
	inputCopy.hostFree();
}

/// Launches the stencil kernel once over the whole of `in`, synchronously.
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runCUDA(Array in, Array out, int GPUBlockSize){
	dim3 DimBlock(GPUBlockSize, GPUBlockSize, 1);
	dim3 DimGrid((in.getWidth() - 1)/GPUBlockSize + 1, (in.getHeight() - 1)/GPUBlockSize + 1, in.getDepth());

	#ifdef PSKEL_SHARED_MASK
	stencilTilingCU<<<DimGrid, DimBlock, (this->mask.size*this->mask.dimension)>>>(in, out, this->mask, this->args, 0,0,0,in.getWidth(),in.getHeight(),in.getDepth());
	#else
	stencilTilingCU<<<DimGrid, DimBlock>>>(in, out, this->mask, this->args, 0,0,0,in.getWidth(),in.getHeight(),in.getDepth());
	#endif
	gpuErrchk( cudaPeekAtLastError() );
	gpuErrchk( cudaDeviceSynchronize() );
}

/// Executes tiling.iterations kernel launches over one tile, ping-ponging
/// in/out, and shrinking the computed region each iteration: iteration `it`
/// only needs the core plus a margin of maskRange*(iterations-it-1) cells.
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runIterativeTilingCUDA(Array in, Array out, StencilTiling<Array, Mask> tiling, size_t GPUBlockSize){
	dim3 DimBlock(GPUBlockSize,GPUBlockSize, 1);
	dim3 DimGrid((in.getWidth() - 1)/GPUBlockSize + 1, (in.getHeight() - 1)/GPUBlockSize + 1, 1);
	size_t maskRange = this->mask.getRange();
	for(size_t it=0; it<tiling.iterations; it++){
		size_t margin = (maskRange*(tiling.iterations-(it+1)));
		// Clamp the margin-extended region to the tile bounds, per dimension.
		size_t widthOffset = 0;
		size_t extra = 0;
		if(tiling.coreWidthOffset>margin){
			widthOffset = tiling.coreWidthOffset-margin;
		}else extra = margin-widthOffset;
		size_t width = tiling.coreWidth+margin*2 - extra;
		if((widthOffset+width)>=tiling.width){
			width = tiling.width-widthOffset;
		}
		size_t heightOffset = 0;
		extra = 0;
		if(tiling.coreHeightOffset>margin){
			heightOffset = tiling.coreHeightOffset-margin;
		}else extra = margin-heightOffset;
		size_t height = tiling.coreHeight+margin*2-extra;
		if((heightOffset+height)>=tiling.height){
			height = tiling.height-heightOffset;
		}
		size_t depthOffset = 0;
		extra = 0;
		if(tiling.coreDepthOffset>margin){
			depthOffset = tiling.coreDepthOffset-margin;
		}else extra = margin-depthOffset;
		size_t depth = tiling.coreDepth+margin*2-extra;
		if((depthOffset+depth)>=tiling.depth){
			depth = tiling.depth-depthOffset;
		}

		if(it%2==0){
			#ifdef PSKEL_SHARED_MASK
			stencilTilingCU<<<DimGrid, DimBlock, (this->mask.size*this->mask.dimension)>>>(in, out, this->mask, this->args, widthOffset, heightOffset, depthOffset, width, height, depth);
			#else
			stencilTilingCU<<<DimGrid, DimBlock>>>(in, out, this->mask, this->args, widthOffset, heightOffset, depthOffset, width, height, depth);
			#endif
		}else{
			#ifdef PSKEL_SHARED_MASK
			stencilTilingCU<<<DimGrid, DimBlock, (this->mask.size*this->mask.dimension)>>>(out, in, this->mask, this->args, widthOffset, heightOffset, depthOffset, width, height, depth);
			#else
			stencilTilingCU<<<DimGrid, DimBlock>>>(out, in, this->mask, this->args, widthOffset, heightOffset, depthOffset, width, height, depth);
			#endif
		}
		gpuErrchk( cudaPeekAtLastError() );
		gpuErrchk( cudaDeviceSynchronize() );
	}
}


/// Problem description shared between the genetic-algorithm objective
/// functions and the solvers below. dw/dh/dt receive the solution
/// (tile width, tile height, inner iterations) and `score` its fitness.
struct TilingGPUGeneticEvaluationFunction{
	size_t iterations;
	size_t height;
	size_t width;
	size_t depth;
	size_t range;
	size_t typeSize;
	size_t memFree;
	size_t popsize;
	size_t ngen;
	size_t dw;
	size_t dt;
	size_t dh;
	float score;
};
// NOTE(review): non-inline global in a header — including this header from
// multiple translation units would violate the ODR; confirm single-TU usage.
TilingGPUGeneticEvaluationFunction tilingGPUEvaluator;

/// GA fitness for 2D tiling: phenotype(0)=tile height, phenotype(1)=inner
/// iterations. Returns 0 for configurations that exceed free GPU memory,
/// otherwise favors tall tiles with small halo overhead.
float objective2D(GAGenome &c){
	GABin2DecGenome &genome = (GABin2DecGenome &)c;

	float h = genome.phenotype(0);
	float it = genome.phenotype(1);
	// Effective tile height includes the halo of it*range on each side.
	size_t tileHeight = ((tilingGPUEvaluator.height<=(2*it*tilingGPUEvaluator.range + h))?tilingGPUEvaluator.height:(2*it*tilingGPUEvaluator.range + h));

	if(2*(tilingGPUEvaluator.width*tilingGPUEvaluator.depth*tileHeight*tilingGPUEvaluator.typeSize) > tilingGPUEvaluator.memFree)return 0;
	else {
		float val = h/tileHeight;
		return val*((it*h)/(tilingGPUEvaluator.height*tilingGPUEvaluator.iterations));
	}
}

/// Runs a simple GA (sigma-truncation scaling) to pick tile height and inner
/// iterations for 2D tiling; the solution is stored in tilingGPUEvaluator.
void solve2D(unsigned int seed){
	int popsize = tilingGPUEvaluator.popsize;
	int ngen = tilingGPUEvaluator.ngen;
	float pmut = 0.01;
	float pcross = 0.6;

	// Upper bound for the tile height such that input+output still fit.
	float div = (2.0*(tilingGPUEvaluator.width*tilingGPUEvaluator.depth*tilingGPUEvaluator.height*tilingGPUEvaluator.typeSize))/(tilingGPUEvaluator.memFree*1.1);
	size_t maxHeight = ceil(float(tilingGPUEvaluator.height)/div);

	// 16-bit binary-to-decimal phenotypes; min/max boundaries are inclusive.
	GABin2DecPhenotype map;
	map.add(16, 1, maxHeight);
	map.add(16, 1, tilingGPUEvaluator.iterations);

	// Template genome built from the phenotype map, evaluated by objective2D.
	GABin2DecGenome genome(map, objective2D);

	// Sigma truncation scaling allows negative objective scores.
	GASimpleGA ga(genome);
	GASigmaTruncationScaling scaling;
	ga.populationSize(popsize);
	ga.nGenerations(ngen);
	ga.pMutation(pmut);
	ga.pCrossover(pcross);
	ga.scaling(scaling);
	ga.scoreFrequency(0);
	ga.flushFrequency(0); //stop flushing the record of the score of given generations
	ga.evolve(seed);

	// Take the best individual of the evolved population as the solution.
	genome = ga.statistics().bestIndividual();

	tilingGPUEvaluator.dw = tilingGPUEvaluator.width;
	tilingGPUEvaluator.dh = genome.phenotype(0); //height
	tilingGPUEvaluator.dt = genome.phenotype(1); //subIterations
	tilingGPUEvaluator.score = objective2D(genome);
}

/// GA fitness for 3D tiling: phenotype(0)=tile width, phenotype(1)=tile
/// height, phenotype(2)=inner iterations. Returns 0 when the tile exceeds
/// free GPU memory.
float objective3D(GAGenome &c){
	GABin2DecGenome &genome = (GABin2DecGenome &)c;

	float w = genome.phenotype(0);
	float h = genome.phenotype(1);
	float t = genome.phenotype(2);
	// Effective tile extents include the halo of t*range on each side.
	float tileWidth = ((tilingGPUEvaluator.width<=(2*t*tilingGPUEvaluator.range + w))?tilingGPUEvaluator.width:(2*t*tilingGPUEvaluator.range + w));
	float tileHeight = ((tilingGPUEvaluator.height<=(2*t*tilingGPUEvaluator.range + h))?tilingGPUEvaluator.height:(2*t*tilingGPUEvaluator.range + h));

	if(2*(tileWidth*tileHeight*tilingGPUEvaluator.depth*tilingGPUEvaluator.typeSize) > tilingGPUEvaluator.memFree) return 0;
	else {
		float val = (w*h)/(tileWidth*tileHeight);
		return val*((w*h*t)/(tilingGPUEvaluator.width*tilingGPUEvaluator.height*tilingGPUEvaluator.iterations));
	}
}

/// Runs a simple GA to pick tile width, tile height and inner iterations for
/// 3D tiling; the solution is stored in tilingGPUEvaluator.
void solve3D(unsigned int seed){
	int popsize = tilingGPUEvaluator.popsize;
	int ngen = tilingGPUEvaluator.ngen;
	float pmut = 0.01;
	float pcross = 0.6;

	// 16-bit binary-to-decimal phenotypes; min/max boundaries are inclusive.
	GABin2DecPhenotype map;
	map.add(16, 1, tilingGPUEvaluator.width);
	map.add(16, 1, tilingGPUEvaluator.height);
	map.add(16, 1, tilingGPUEvaluator.iterations);

	// Template genome built from the phenotype map, evaluated by objective3D.
	GABin2DecGenome genome(map, objective3D);

	// Sigma truncation scaling allows negative objective scores.
	GASimpleGA ga(genome);
	GASigmaTruncationScaling scaling;
	ga.populationSize(popsize);
	ga.nGenerations(ngen);
	ga.pMutation(pmut);
	ga.pCrossover(pcross);
	ga.scaling(scaling);
	ga.scoreFrequency(0);
	ga.flushFrequency(0); //stop flushing the record of the score of given generations
	ga.evolve(seed);

	// Take the best individual of the evolved population as the solution.
	genome = ga.statistics().bestIndividual();

	tilingGPUEvaluator.dw = genome.phenotype(0); //width
	tilingGPUEvaluator.dh = genome.phenotype(1); //height
	tilingGPUEvaluator.dt = genome.phenotype(2); //subIterations
	tilingGPUEvaluator.score = objective3D(genome);
}

/// Runs the stencil iteratively on the GPU, choosing automatically between a
/// direct run and a tiled run whose tile shape and inner-iteration count are
/// picked by a genetic algorithm (solve2D/solve3D).
template<class Array, class Mask, class Args>
void StencilBase<Array, Mask,Args>::runIterativeAutoGPU(size_t iterations, size_t GPUBlockSize){
	size_t gpuMemFree, gpuMemTotal;
	cudaMemGetInfo(&gpuMemFree, &gpuMemTotal);
	if((this->input.memSize()+this->output.memSize()+this->mask.memSize())<(0.999*gpuMemFree)){
		// Everything fits on the device: run directly.
		runIterativeGPU(iterations, GPUBlockSize);
	}else if(this->input.getHeight()==1){
		// Height-1 data: solve the 2D problem on a 'transposed matrix'
		// (depth plays the role of width, width the role of height).
		tilingGPUEvaluator.typeSize = this->input.memSize()/this->input.size();
		tilingGPUEvaluator.iterations = iterations;
		tilingGPUEvaluator.width = this->input.getDepth(); //'transposed matrix'
		tilingGPUEvaluator.height = this->input.getWidth(); //'transposed matrix'
		tilingGPUEvaluator.depth = 1;
		tilingGPUEvaluator.range = this->mask.getRange();
		tilingGPUEvaluator.memFree = (gpuMemFree-this->mask.memSize())*0.999;

		tilingGPUEvaluator.popsize = 100;
		tilingGPUEvaluator.ngen = 2500;

		unsigned int seed = time(NULL);
		solve2D(seed);

		size_t subIterations = tilingGPUEvaluator.dt;
		size_t width = tilingGPUEvaluator.dh;
		runIterativeTilingGPU(iterations, width, 1, this->input.getDepth(), subIterations, GPUBlockSize);

	}else {
		size_t typeSize = this->input.memSize()/this->input.size();
		tilingGPUEvaluator.typeSize = typeSize;
		tilingGPUEvaluator.iterations = iterations;
		tilingGPUEvaluator.width = this->input.getWidth();
		tilingGPUEvaluator.height = this->input.getHeight();
		tilingGPUEvaluator.depth = this->input.getDepth();
		tilingGPUEvaluator.range = this->mask.getRange();
		tilingGPUEvaluator.memFree = (gpuMemFree-this->mask.memSize())*0.999;
		// If even a minimal height-tile would not fit, tile in both width and
		// height (3D solver); otherwise tile in height only (2D solver).
		if( (2*(1+2*this->mask.getRange())*(this->input.getWidth()*this->input.getDepth())*typeSize+this->mask.memSize()) > (0.98*gpuMemFree) ){
			tilingGPUEvaluator.popsize = 100;
			tilingGPUEvaluator.ngen = 2500;
			unsigned int seed = time(NULL);
			solve3D(seed);

			size_t width = tilingGPUEvaluator.dw;
			size_t height = tilingGPUEvaluator.dh;
			size_t subIterations = tilingGPUEvaluator.dt;
			runIterativeTilingGPU(iterations, width, height, this->input.getDepth(), subIterations, GPUBlockSize);
		}else{
			tilingGPUEvaluator.popsize = 100;
			tilingGPUEvaluator.ngen = 2500;
			unsigned int seed = time(NULL);
			solve2D(seed);

			size_t subIterations = tilingGPUEvaluator.dt;
			size_t height = tilingGPUEvaluator.dh;
			runIterativeTilingGPU(iterations, this->input.getWidth(), height, this->input.getDepth(), subIterations, GPUBlockSize);
		}
	}
}

//*******************************************************************************************
// Stencil 3D
//*******************************************************************************************


template<class Array, class Mask, class Args>
Stencil3D<Array,Mask,Args>::Stencil3D(){}

/// Constructs a 3D stencil from user-supplied input/output arrays, mask and
/// extra kernel arguments.
template<class Array, class Mask, class Args>
Stencil3D<Array,Mask,Args>::Stencil3D(Array _input, Array _output, Mask _mask, Args _args){
	this->input = _input;
	this->output = _output;
	this->args = _args;
	this->mask = _mask;
}

/// Sequential host execution: applies stencilKernel to every (h,w,d) point.
template<class Array, class Mask, class Args>
void Stencil3D<Array,Mask,Args>::runSeq(Array in, Array out){
	for (int h = 0; h < in.getHeight(); h++){
	for (int w = 0; w < in.getWidth(); w++){
	for (int d = 0; d < in.getDepth(); d++){
		stencilKernel(in,out,this->mask,this->args,h,w,d);
	}}}
}

/// OpenMP host execution: parallelizes the outermost (height) loop.
template<class Array, class Mask, class Args>
void Stencil3D<Array,Mask,Args>::runOpenMP(Array in, Array out, size_t numThreads){
	omp_set_num_threads(numThreads);
	#pragma omp parallel for
	for (int h = 0; h < in.getHeight(); h++){
	for (int w = 0; w < in.getWidth(); w++){
	for (int d = 0; d < in.getDepth(); d++){
		stencilKernel(in,out,this->mask,this->args,h,w,d);
	}}}
}

#ifdef PSKEL_TBB
template<class Array, class Mask, class Args>
struct
TBBStencil3D{ 00773 Array input; 00774 Array output; 00775 Mask mask; 00776 Args args; 00777 TBBStencil3D(Array input, Array output, Mask mask, Args args){ 00778 this->input = input; 00779 this->output = output; 00780 this->mask = mask; 00781 this->args = args; 00782 } 00783 void operator()(tbb::blocked_range<int> r)const{ 00784 for (int h = r.begin(); h != r.end(); h++){ 00785 for (int w = 0; w < this->input.getWidth(); w++){ 00786 for (int d = 0; d < this->input.getDepth(); d++){ 00787 stencilKernel(this->input,this->output,this->mask,this->args,h,w,d); 00788 }}} 00789 } 00790 }; 00791 00792 template<class Array, class Mask, class Args> 00793 void Stencil3D<Array,Mask,Args>::runTBB(Array in, Array out, size_t numThreads){ 00794 TBBStencil3D<Array, Mask, Args> tbbstencil(in, out, this->mask, this->args); 00795 tbb::task_scheduler_init init(numThreads); 00796 tbb::parallel_for(tbb::blocked_range<int>(0, in.getHeight()), tbbstencil); 00797 } 00798 #endif 00799 00800 //******************************************************************************************* 00801 // Stencil 2D 00802 //******************************************************************************************* 00803 00804 template<class Array, class Mask, class Args> 00805 Stencil2D<Array,Mask,Args>::Stencil2D(){} 00806 00807 template<class Array, class Mask, class Args> 00808 Stencil2D<Array,Mask,Args>::Stencil2D(Array _input, Array _output, Mask _mask, Args _args){ 00809 this->input = _input; 00810 this->output = _output; 00811 this->args = _args; 00812 this->mask = _mask; 00813 } 00814 /* 00815 template<class Array, class Mask, class Args> 00816 Stencil2D<Array,Mask,Args>::~Stencil2D(){ 00817 this->cudaMemFree(); 00818 this->cpuMemFree(); 00819 } 00820 */ 00821 00822 template<class Array, class Mask, class Args> 00823 void Stencil2D<Array,Mask,Args>::runSeq(Array in, Array out){ 00824 for (int h = 0; h < in.getHeight(); h++){ 00825 for (int w = 0; w < in.getWidth(); w++){ 00826 
stencilKernel(in,out,this->mask, this->args,h,w); 00827 }} 00828 } 00829 00830 template<class Array, class Mask, class Args> 00831 void Stencil2D<Array,Mask,Args>::runOpenMP(Array in, Array out, size_t numThreads){ 00832 omp_set_num_threads(numThreads); 00833 #pragma omp parallel for 00834 for (int h = 0; h < in.getHeight(); h++){ 00835 for (int w = 0; w < in.getWidth(); w++){ 00836 stencilKernel(in,out,this->mask, this->args,h,w); 00837 }} 00838 } 00839 00840 #ifdef PSKEL_TBB 00841 template<class Array, class Mask, class Args> 00842 struct TBBStencil2D{ 00843 Array input; 00844 Array output; 00845 Mask mask; 00846 Args args; 00847 TBBStencil2D(Array input, Array output, Mask mask, Args args){ 00848 this->input = input; 00849 this->output = output; 00850 this->mask = mask; 00851 this->args = args; 00852 } 00853 void operator()(tbb::blocked_range<int> r)const{ 00854 for (int h = r.begin(); h != r.end(); h++){ 00855 for (int w = 0; w < this->input.getWidth(); w++){ 00856 stencilKernel(this->input,this->output,this->mask, this->args,h,w); 00857 }} 00858 } 00859 }; 00860 00861 template<class Array, class Mask, class Args> 00862 void Stencil2D<Array,Mask,Args>::runTBB(Array in, Array out, size_t numThreads){ 00863 TBBStencil2D<Array, Mask, Args> tbbstencil(in, out, this->mask, this->args); 00864 tbb::task_scheduler_init init(numThreads); 00865 tbb::parallel_for(tbb::blocked_range<int>(0, in.getHeight()), tbbstencil); 00866 } 00867 #endif 00868 00869 //******************************************************************************************* 00870 // Stencil 1D 00871 //******************************************************************************************* 00872 00873 00874 template<class Array, class Mask, class Args> 00875 Stencil<Array,Mask,Args>::Stencil(){} 00876 00877 template<class Array, class Mask, class Args> 00878 Stencil<Array,Mask,Args>::Stencil(Array _input, Array _output, Mask _mask, Args _args){ 00879 this->input = _input; 00880 this->output = 
_output; 00881 this->args = _args; 00882 this->mask = _mask; 00883 } 00884 00885 template<class Array, class Mask, class Args> 00886 void Stencil<Array,Mask,Args>::runSeq(Array in, Array out){ 00887 for (int i = 0; i < in.getWidth(); i++){ 00888 stencilKernel(in,out,this->mask, this->args,i); 00889 } 00890 } 00891 00892 template<class Array, class Mask, class Args> 00893 void Stencil<Array,Mask,Args>::runOpenMP(Array in, Array out, size_t numThreads){ 00894 omp_set_num_threads(numThreads); 00895 #pragma omp parallel for 00896 for (int i = 0; i < in.getWidth(); i++){ 00897 stencilKernel(in,out,this->mask, this->args,i); 00898 } 00899 } 00900 00901 #ifdef PSKEL_TBB 00902 template<class Array, class Mask, class Args> 00903 struct TBBStencil{ 00904 Array input; 00905 Array output; 00906 Mask mask; 00907 Args args; 00908 TBBStencil(Array input, Array output, Mask mask, Args args){ 00909 this->input = input; 00910 this->output = output; 00911 this->mask = mask; 00912 this->args = args; 00913 } 00914 void operator()(tbb::blocked_range<int> r)const{ 00915 for (int i = r.begin(); i != r.end(); i++){ 00916 stencilKernel(this->input,this->output,this->mask, this->args,i); 00917 } 00918 } 00919 }; 00920 00921 template<class Array, class Mask, class Args> 00922 void Stencil<Array,Mask,Args>::runTBB(Array in, Array out, size_t numThreads){ 00923 TBBStencil<Array, Mask, Args> tbbstencil(in, out, this->mask, this->args); 00924 tbb::task_scheduler_init init(numThreads); 00925 tbb::parallel_for(tbb::blocked_range<int>(0, in.getWidth()), tbbstencil); 00926 } 00927 #endif 00928 00929 }//end namespace 00930 00931 00932 #endif