PSkel
src/PSkelStencil.hpp
00001 //-------------------------------------------------------------------------------
00002 // Copyright (c) 2015, ICEI - PUC Minas
00003 // All rights reserved.
00004 // 
00005 // Redistribution and use in source and binary forms, with or without
00006 // modification, are permitted provided that the following conditions are met:
00007 // 
00008 // 1. Redistributions of source code must retain the above copyright notice, this
00009 // list of conditions and the following disclaimer.
00010 // 
00011 // 2. Redistributions in binary form must reproduce the above copyright notice,
00012 // this list of conditions and the following disclaimer in the documentation
00013 // and/or other materials provided with the distribution.
00014 // 
00015 // 3. Neither the name of the copyright holder nor the names of its contributors
00016 // may be used to endorse or promote products derived from this software without
00017 // specific prior written permission.
00018 // 
00019 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00020 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00021 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00022 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
00023 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00024 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
00025 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00026 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
00027 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00028 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029 //-------------------------------------------------------------------------------
00030 
00031 #ifndef PSKEL_STENCIL_HPP
00032 #define PSKEL_STENCIL_HPP
00033 
00034 #include <cmath>
00035 #include <algorithm>
00036 #include <iostream>
00037 
00038 #include <iostream>
00039 using namespace std;
00040 
00041 #include <ga/ga.h>
00042 #include <ga/std_stream.h>
00043 
00044 namespace PSkel{
00045 
00046 //********************************************************************************************
00047 // Kernels CUDA. Chama o kernel implementado pelo usuario
00048 //********************************************************************************************
00049 
00050 template<typename T1, typename T2, class Args>
00051 __global__ void stencilTilingCU(Array<T1> input,Array<T1> output,Mask<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth);
00052 
00053 template<typename T1, typename T2, class Args>
00054 __global__ void stencilTilingCU(Array2D<T1> input,Array2D<T1> output,Mask2D<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth);
00055 
00056 template<typename T1, typename T2, class Args>
00057 __global__ void stencilTilingCU(Array3D<T1> input,Array3D<T1> output,Mask3D<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth);
00058 
00059 
00060 //********************************************************************************************
00061 // Kernels CUDA. Chama o kernel implementado pelo usuario
00062 //********************************************************************************************
00063 
00064 template<typename T1, typename T2, class Args>
00065 __global__ void stencilTilingCU(Array<T1> input,Array<T1> output,Mask<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth){
00066         size_t i = blockIdx.x*blockDim.x+threadIdx.x;
00067         #ifdef PSKEL_SHARED_MASK
00068         extern __shared__ int shared[];
00069         if(threadIdx.x<(mask.size*mask.dimension))
00070                 shared[threadIdx.x] = mask.deviceMask[threadIdx.x];
00071         __syncthreads();
00072         mask.deviceMask = shared;
00073         #endif
00074         if(i>=widthOffset && i<(widthOffset+tilingWidth)){
00075                 stencilKernel(input, output, mask, args, i);
00076         }
00077 }
00078 
00079 template<typename T1, typename T2, class Args>
00080 __global__ void stencilTilingCU(Array2D<T1> input,Array2D<T1> output,Mask2D<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth){
00081         size_t w = blockIdx.x*blockDim.x+threadIdx.x;
00082         size_t h = blockIdx.y*blockDim.y+threadIdx.y;
00083         #ifdef PSKEL_SHARED_MASK
00084         extern __shared__ int shared[];
00085         if(threadIdx.x<(mask.size*mask.dimension))
00086                 shared[threadIdx.x] = mask.deviceMask[threadIdx.x];
00087         __syncthreads();
00088         mask.deviceMask = shared;
00089         #endif
00090         if(w>=widthOffset && w<(widthOffset+tilingWidth) && h>=heightOffset && h<(heightOffset+tilingHeight) ){
00091                 stencilKernel(input, output, mask, args, h, w);
00092         }
00093 }
00094 
00095 template<typename T1, typename T2, class Args>
00096 __global__ void stencilTilingCU(Array3D<T1> input,Array3D<T1> output,Mask3D<T2> mask,Args args, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth){
00097         size_t w = blockIdx.x*blockDim.x+threadIdx.x;
00098         size_t h = blockIdx.y*blockDim.y+threadIdx.y;
00099         size_t d = blockIdx.z*blockDim.z+threadIdx.z;
00100         #ifdef PSKEL_SHARED_MASK
00101         extern __shared__ int shared[];
00102         if(threadIdx.x<(mask.size*mask.dimension))
00103                 shared[threadIdx.x] = mask.deviceMask[threadIdx.x];
00104         __syncthreads();
00105         mask.deviceMask = shared;
00106         #endif
00107   
00108         if(w>=widthOffset && w<(widthOffset+tilingWidth) && h>=heightOffset && h<(heightOffset+tilingHeight) && d>=depthOffset && d<(depthOffset+tilingDepth) ){
00109                 stencilKernel(input, output, mask, args, h, w, d);
00110         }
00111 }
00112 
00113 //*******************************************************************************************
00114 // Stencil Base
00115 //*******************************************************************************************
00116 
00117 template<class Array, class Mask, class Args>
00118 void StencilBase<Array, Mask,Args>::runSequential(){
00119         this->runSeq(this->input, this->output);
00120 }
00121 
00122 template<class Array, class Mask, class Args>
00123 void StencilBase<Array, Mask,Args>::runCPU(size_t numThreads){
00124         numThreads = (numThreads==0)?omp_get_num_procs():numThreads;
00125         #ifdef PSKEL_TBB
00126                 this->runTBB(this->input, this->output, numThreads);
00127         #else
00128                 this->runOpenMP(this->input, this->output, numThreads);
00129         #endif
00130 }
00131 
00132 template<class Array, class Mask, class Args>
00133 void StencilBase<Array, Mask,Args>::runGPU(size_t GPUBlockSize){
00134         if(GPUBlockSize==0){
00135                 int device;
00136                 cudaGetDevice(&device);
00137                 cudaDeviceProp deviceProperties;
00138                 cudaGetDeviceProperties(&deviceProperties, device);
00139                 //GPUBlockSize = deviceProperties.maxThreadsPerBlock/2;
00140                 GPUBlockSize = deviceProperties.warpSize;
00141                 //int minGridSize, blockSize;
00142                 //cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, stencilTilingCU, 0, in.size());
00143                 //GPUBlockSize = blockSize;
00144                 //cout << "GPUBlockSize: "<<GPUBlockSize<<endl;
00145                 //int maxActiveBlocks;
00146                 //cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, stencilTilingCU, GPUBlockSize, 0);
00147                 //float occupancy = (maxActiveBlocks * GPUBlockSize / deviceProperties.warpSize) / 
00148                 //    (float)(deviceProperties.maxThreadsPerMultiProcessor / 
00149                 //            deviceProperties.warpSize);
00150                 //printf("Launched blocks of size %d. Theoretical occupancy: %f\n", GPUBlockSize, occupancy);
00151         }
00152         input.deviceAlloc();
00153         output.deviceAlloc();
00154         mask.deviceAlloc();
00155         mask.copyToDevice();
00156         input.copyToDevice();
00157         //this->setGPUInputData();
00158         this->runCUDA(this->input, this->output, GPUBlockSize);
00159         //this->getGPUOutputData();
00160         output.copyToHost();
00161         input.deviceFree();
00162         output.deviceFree();
00163         mask.deviceFree();
00164 }
00165 
00166 template<class Array, class Mask, class Args>
00167 void StencilBase<Array, Mask,Args>::runTilingGPU(size_t tilingWidth, size_t tilingHeight, size_t tilingDepth, size_t GPUBlockSize){
00168         if(GPUBlockSize==0){
00169                 int device;
00170                 cudaGetDevice(&device);
00171                 cudaDeviceProp deviceProperties;
00172                 cudaGetDeviceProperties(&deviceProperties, device);
00173                 //GPUBlockSize = deviceProperties.maxThreadsPerBlock/2;
00174                 GPUBlockSize = deviceProperties.warpSize;
00175                 //int minGridSize, blockSize;
00176                 //cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, stencilTilingCU, 0, in.size());
00177                 //GPUBlockSize = blockSize;
00178                 //cout << "GPUBlockSize: "<<GPUBlockSize<<endl;
00179                 //int maxActiveBlocks;
00180                 //cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, stencilTilingCU, GPUBlockSize, 0);
00181                 //float occupancy = (maxActiveBlocks * GPUBlockSize / deviceProperties.warpSize) / 
00182                 //    (float)(deviceProperties.maxThreadsPerMultiProcessor / 
00183                 //            deviceProperties.warpSize);
00184                 //printf("Launched blocks of size %d. Theoretical occupancy: %f\n", GPUBlockSize, occupancy);
00185         }
00186         size_t wTiling = ceil(float(this->input.getWidth())/float(tilingWidth));
00187         size_t hTiling = ceil(float(this->input.getHeight())/float(tilingHeight));
00188         size_t dTiling = ceil(float(this->input.getDepth())/float(tilingDepth));
00189         mask.deviceAlloc();
00190         mask.copyToDevice();
00191         //setGPUMask();
00192         StencilTiling<Array, Mask> tiling(input, output, mask);
00193         Array inputTile;
00194         Array outputTile;
00195         Array tmp;
00196         for(size_t ht=0; ht<hTiling; ht++){
00197          for(size_t wt=0; wt<wTiling; wt++){
00198           for(size_t dt=0; dt<dTiling; dt++){
00199                 size_t heightOffset = ht*tilingHeight;
00200                 size_t widthOffset = wt*tilingWidth;
00201                 size_t depthOffset = dt*tilingDepth;
00202                 //CUDA input memory copy
00203                 tiling.tile(1, widthOffset, heightOffset, depthOffset, tilingWidth, tilingHeight, tilingDepth);
00204                 inputTile.hostSlice(tiling.input, tiling.widthOffset, tiling.heightOffset, tiling.depthOffset, tiling.width, tiling.height, tiling.depth);
00205                 outputTile.hostSlice(tiling.output, tiling.widthOffset, tiling.heightOffset, tiling.depthOffset, tiling.width, tiling.height, tiling.depth);
00206                 inputTile.deviceAlloc();
00207                 outputTile.deviceAlloc();
00208                 inputTile.copyToDevice();
00209                 tmp.hostAlloc(tiling.width, tiling.height, tiling.depth);
00210                 //this->setGPUInputDataIterative(inputCopy, output, innerIterations, widthOffset, heightOffset, depthOffset, tilingWidth, tilingHeight, tilingDepth);
00211                 //CUDA kernel execution
00212                 this->runIterativeTilingCUDA(inputTile, outputTile, tiling, GPUBlockSize);
00213                 tmp.copyFromDevice(outputTile);
00214                 Array coreTmp;
00215                 Array coreOutput;
00216                 coreTmp.hostSlice(tmp, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
00217                 coreOutput.hostSlice(outputTile, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
00218                 coreOutput.hostMemCopy(coreTmp);
00219                 tmp.hostFree();
00220         }}}
00221         inputTile.deviceFree();
00222         outputTile.deviceFree();
00223         mask.deviceFree();
00224         cudaDeviceSynchronize();
00225 }
00226 
00227 template<class Array, class Mask, class Args>
00228 void StencilBase<Array, Mask,Args>::runAutoGPU(size_t GPUBlockSize){
00229         size_t gpuMemFree, gpuMemTotal;
00230         //gpuErrchk( cudaDeviceSynchronize() );
00231         cudaMemGetInfo(&gpuMemFree, &gpuMemTotal);
00232         if((this->input.memSize()+this->output.memSize()+this->mask.memSize())<(0.998*gpuMemFree)){
00233                 runGPU(GPUBlockSize);
00234         }else{
00235                 size_t typeSize = this->input.memSize()/this->input.size();
00236                 float div = float(this->input.memSize()+this->output.memSize())/((gpuMemFree-this->mask.memSize())*0.97);
00237                 if(this->input.getHeight()==1){
00238                         size_t width = floor(float(this->input.getWidth())/div);
00239                         width = (width>0)?width:1;
00240                         while( (((this->input.getHeight()*this->input.getDepth()+this->output.getHeight()*this->output.getDepth())*(2*this->mask.getRange() + width))*typeSize + this->mask.memSize()) > gpuMemFree*0.998 ){
00241                                 width+=2;
00242                         }
00243                         while( (((this->input.getHeight()*this->input.getDepth()+this->output.getHeight()*this->output.getDepth())*(2*this->mask.getRange() + width))*typeSize + this->mask.memSize()) > gpuMemFree*0.998 ){
00244                                 width--;
00245                         }
00246                         runTilingGPU(width, this->input.getHeight(), this->input.getDepth(), GPUBlockSize);
00247                 }else{
00248                         size_t height = floor(float(this->input.getHeight())/div);
00249                         height = (height>0)?height:1;
00250                         while( (((this->input.getWidth()*this->input.getDepth()+this->output.getWidth()*this->output.getDepth())*(2*this->mask.getRange() + height))*typeSize + this->mask.memSize()) < gpuMemFree*0.998 ){
00251                                 height+=2;
00252                         }
00253                         while( (((this->input.getWidth()*this->input.getDepth()+this->output.getWidth()*this->output.getDepth())*(2*this->mask.getRange() + height))*typeSize + this->mask.memSize()) > gpuMemFree*0.998 ){
00254                                 height--;
00255                         }
00256                         runTilingGPU(this->input.getWidth(), height, this->input.getDepth(), GPUBlockSize);
00257                 }
00258         }
00259 }
00260 
00261 
00262 template<class Array, class Mask, class Args>
00263 void StencilBase<Array, Mask,Args>::runIterativeSequential(size_t iterations){
00264         Array inputCopy;
00265         inputCopy.hostClone(input);
00266         for(size_t it = 0; it<iterations; it++){
00267                 if(it%2==0) this->runSeq(inputCopy, this->output);
00268                 else this->runSeq(this->output, inputCopy);
00269         }
00270         if((iterations%2)==0) output.hostMemCopy(inputCopy);
00271         inputCopy.hostFree();
00272 }
00273 
00274 template<class Array, class Mask, class Args>
00275 void StencilBase<Array, Mask,Args>::runIterativeCPU(size_t iterations, size_t numThreads){
00276         numThreads = (numThreads==0)?omp_get_num_procs():numThreads;
00277         //cout << "numThreads: " << numThreads << endl;
00278         Array inputCopy;
00279         inputCopy.hostClone(input);
00280         for(size_t it = 0; it<iterations; it++){
00281                 if(it%2==0){
00282                         #ifdef PSKEL_TBB
00283                                 this->runTBB(inputCopy, this->output, numThreads);
00284                         #else
00285                                 this->runOpenMP(inputCopy, this->output, numThreads);
00286                         #endif
00287                 }else {
00288                         #ifdef PSKEL_TBB
00289                                 this->runTBB(this->output, inputCopy, numThreads);
00290                         #else
00291                                 this->runOpenMP(this->output, inputCopy, numThreads);
00292                         #endif
00293                 }
00294         }
00295         if((iterations%2)==0) output.hostMemCopy(inputCopy);
00296         inputCopy.hostFree();
00297 }
00298 
00299 template<class Array, class Mask, class Args>
00300 void StencilBase<Array, Mask,Args>::runIterativeGPU(size_t iterations, size_t GPUBlockSize){
00301         if(GPUBlockSize==0){
00302                 int device;
00303                 cudaGetDevice(&device);
00304                 cudaDeviceProp deviceProperties;
00305                 cudaGetDeviceProperties(&deviceProperties, device);
00306                 //GPUBlockSize = deviceProperties.maxThreadsPerBlock/2;
00307                 GPUBlockSize = deviceProperties.warpSize;
00308                 //int minGridSize, blockSize;
00309                 //cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, stencilTilingCU, 0, in.size());
00310                 //GPUBlockSize = blockSize;
00311                 //cout << "GPUBlockSize: "<<GPUBlockSize<<endl;
00312                 //int maxActiveBlocks;
00313                 //cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, stencilTilingCU, GPUBlockSize, 0);
00314                 //float occupancy = (maxActiveBlocks * GPUBlockSize / deviceProperties.warpSize) / 
00315                 //    (float)(deviceProperties.maxThreadsPerMultiProcessor / 
00316                 //            deviceProperties.warpSize);
00317                 //printf("Launched blocks of size %d. Theoretical occupancy: %f\n", GPUBlockSize, occupancy);
00318         }
00319         input.deviceAlloc();
00320         input.copyToDevice();
00321         mask.deviceAlloc();
00322         mask.copyToDevice();
00323         output.deviceAlloc();
00324         //output.copyToDevice();
00325         //this->setGPUInputData();
00326         for(size_t it = 0; it<iterations; it++){
00327                 if((it%2)==0)
00328                         this->runCUDA(this->input, this->output, GPUBlockSize);
00329                 else this->runCUDA(this->output, this->input, GPUBlockSize);
00330         }
00331         if((iterations%2)==1)
00332                 output.copyToHost();
00333         else output.copyFromDevice(input);
00334         input.deviceFree();
00335         mask.deviceFree();
00336         output.deviceFree();
00337         //this->getGPUOutputData();
00338 }
00339 
00340 template<class Array, class Mask, class Args>
00341 void StencilBase<Array, Mask,Args>::runIterativeTilingGPU(size_t iterations, size_t tilingWidth, size_t tilingHeight, size_t tilingDepth, size_t innerIterations, size_t GPUBlockSize){
00342         if(GPUBlockSize==0){
00343                 int device;
00344                 cudaGetDevice(&device);
00345                 cudaDeviceProp deviceProperties;
00346                 cudaGetDeviceProperties(&deviceProperties, device);
00347                 //GPUBlockSize = deviceProperties.maxThreadsPerBlock/2;
00348                 GPUBlockSize = deviceProperties.warpSize;
00349                 //int minGridSize, blockSize;
00350                 //cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, stencilTilingCU, 0, in.size());
00351                 //GPUBlockSize = blockSize;
00352                 //cout << "GPUBlockSize: "<<GPUBlockSize<<endl;
00353                 //int maxActiveBlocks;
00354                 //cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, stencilTilingCU, GPUBlockSize, 0);
00355                 //float occupancy = (maxActiveBlocks * GPUBlockSize / deviceProperties.warpSize) / 
00356                 //    (float)(deviceProperties.maxThreadsPerMultiProcessor / 
00357                 //            deviceProperties.warpSize);
00358                 //printf("Launched blocks of size %d. Theoretical occupancy: %f\n", GPUBlockSize, occupancy);
00359         }
00360         Array inputCopy;
00361         inputCopy.hostClone(this->input);
00362         size_t wTiling = ceil(float(this->input.getWidth())/float(tilingWidth));
00363         size_t hTiling = ceil(float(this->input.getHeight())/float(tilingHeight));
00364         size_t dTiling = ceil(float(this->input.getDepth())/float(tilingDepth));
00365         mask.deviceAlloc();
00366         mask.copyToDevice();
00367         //setGPUMask();
00368         StencilTiling<Array, Mask> tiling(inputCopy, this->output, this->mask);
00369         Array inputTile;
00370         Array outputTile;
00371         Array tmp;
00372         size_t outterIterations = ceil(float(iterations)/innerIterations);
00373         for(size_t it = 0; it<outterIterations; it++){
00374                 size_t subIterations = innerIterations;
00375                 if(((it+1)*innerIterations)>iterations){
00376                         subIterations = iterations-(it*innerIterations);
00377                 }
00378                 //cout << "Iteration: " << it << endl;
00379                 //cout << "#SubIterations: " << subIterations << endl;
00380                 for(size_t ht=0; ht<hTiling; ht++){
00381                  for(size_t wt=0; wt<wTiling; wt++){
00382                   for(size_t dt=0; dt<dTiling; dt++){
00383                         size_t heightOffset = ht*tilingHeight;
00384                         size_t widthOffset = wt*tilingWidth;
00385                         size_t depthOffset = dt*tilingDepth;
00386 
00387                         //CUDA input memory copy
00388                         tiling.tile(subIterations, widthOffset, heightOffset, depthOffset, tilingWidth, tilingHeight, tilingDepth);
00389                         inputTile.hostSlice(tiling.input, tiling.widthOffset, tiling.heightOffset, tiling.depthOffset, tiling.width, tiling.height, tiling.depth);
00390                         outputTile.hostSlice(tiling.output, tiling.widthOffset, tiling.heightOffset, tiling.depthOffset, tiling.width, tiling.height, tiling.depth);
00391                         inputTile.deviceAlloc();
00392                         outputTile.deviceAlloc();
00393                         tmp.hostAlloc(tiling.width, tiling.height, tiling.depth);
00394                         //this->setGPUInputDataIterative(inputCopy, output, innerIterations, widthOffset, heightOffset, depthOffset, tilingWidth, tilingHeight, tilingDepth);
00395                         if(it%2==0){
00396                                 inputTile.copyToDevice();
00397                                 //CUDA kernel execution
00398                                 this->runIterativeTilingCUDA(inputTile, outputTile, tiling, GPUBlockSize);
00399                                 if(subIterations%2==0){
00400                                         tmp.copyFromDevice(inputTile);
00401                                 }else{
00402                                         tmp.copyFromDevice(outputTile);
00403                                 }
00404                                 Array coreTmp;
00405                                 Array coreOutput;
00406                                 coreTmp.hostSlice(tmp, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
00407                                 coreOutput.hostSlice(outputTile, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
00408                                 coreOutput.hostMemCopy(coreTmp);
00409                                 //this->copyTilingOutput(output, innerIterations, widthOffset, heightOffset, depthOffset, tilingWidth, tilingHeight, tilingDepth);
00410                                 tmp.hostFree();
00411                         }else{
00412                                 outputTile.copyToDevice();
00413                                 //CUDA kernel execution
00414                                 this->runIterativeTilingCUDA(outputTile, inputTile, tiling, GPUBlockSize);
00415                                 if(subIterations%2==0){
00416                                         tmp.copyFromDevice(outputTile);
00417                                 }else{
00418                                         tmp.copyFromDevice(inputTile);
00419                                 }
00420                                 Array coreTmp;
00421                                 Array coreInput;
00422                                 //cout << "[Computing iteration: " << it << "]" << endl;
00423                                 coreTmp.hostSlice(tmp, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
00424                                 coreInput.hostSlice(inputTile, tiling.coreWidthOffset, tiling.coreHeightOffset, tiling.coreDepthOffset, tiling.coreWidth, tiling.coreHeight, tiling.coreDepth);
00425                                 coreInput.hostMemCopy(coreTmp);
00426                                 tmp.hostFree();
00427                         }
00428                 }}}
00429         }
00430         inputTile.deviceFree();
00431         outputTile.deviceFree();
00432         mask.deviceFree();
00433         cudaDeviceSynchronize();
00434 
00435         if((outterIterations%2)==0) tiling.output.hostMemCopy(tiling.input);
00436         inputCopy.hostFree();
00437 }
00438 
00439 template<class Array, class Mask, class Args>
00440 void StencilBase<Array, Mask,Args>::runCUDA(Array in, Array out, int GPUBlockSize){
00441         dim3 DimBlock(GPUBlockSize, GPUBlockSize, 1);
00442         dim3 DimGrid((in.getWidth() - 1)/GPUBlockSize + 1, (in.getHeight() - 1)/GPUBlockSize + 1, in.getDepth());
00443 
00444         #ifdef PSKEL_SHARED_MASK
00445         stencilTilingCU<<<DimGrid, DimBlock, (this->mask.size*this->mask.dimension)>>>(in, out, this->mask, this->args, 0,0,0,in.getWidth(),in.getHeight(),in.getDepth());
00446         #else
00447         stencilTilingCU<<<DimGrid, DimBlock>>>(in, out, this->mask, this->args, 0,0,0,in.getWidth(),in.getHeight(),in.getDepth());
00448         #endif
00449         gpuErrchk( cudaPeekAtLastError() );
00450         gpuErrchk( cudaDeviceSynchronize() );
00451 }
00452 
00453 template<class Array, class Mask, class Args>
00454 void StencilBase<Array, Mask,Args>::runIterativeTilingCUDA(Array in, Array out, StencilTiling<Array, Mask> tiling, size_t GPUBlockSize){
00455         dim3 DimBlock(GPUBlockSize,GPUBlockSize, 1);
00456         dim3 DimGrid((in.getWidth() - 1)/GPUBlockSize + 1, (in.getHeight() - 1)/GPUBlockSize + 1, 1);
00457         //dim3 DimBlock(GPUBlockSize,GPUBlockSize, 1);
00458         //dim3 DimGrid((in.getWidth() - 1)/GPUBlockSize + 1, (in.getHeight() - 1)/GPUBlockSize + 1, 1);
00459         size_t maskRange = this->mask.getRange();
00460         for(size_t it=0; it<tiling.iterations; it++){
00461                 //cout << "[Computing iteration: " << it << "]" << endl;
00462                 //cout << "mask range: " <<maskRange << endl;
00463                 //cout << "mask margin: " <<(maskRange*(tiling.iterations-(it+1))) << endl;
00464                 size_t margin = (maskRange*(tiling.iterations-(it+1)));
00465                 size_t widthOffset = 0;
00466                 size_t extra = 0;
00467                 if(tiling.coreWidthOffset>margin){
00468                         widthOffset = tiling.coreWidthOffset-margin;
00469                 }else extra = margin-widthOffset;
00470                 //cout << "width extra: " << extra << endl;
00471                 size_t width = tiling.coreWidth+margin*2 - extra;
00472                 if((widthOffset+width)>=tiling.width){
00473                         width = tiling.width-widthOffset;
00474                 }
00475                 size_t heightOffset = 0;
00476                 extra = 0;
00477                 if(tiling.coreHeightOffset>margin){
00478                         heightOffset = tiling.coreHeightOffset-margin;
00479                 }else extra = margin-heightOffset;
00480                 //cout << "height extra: " << extra << endl;
00481                 size_t height = tiling.coreHeight+margin*2-extra;
00482                 if((heightOffset+height)>=tiling.height){
00483                         height = tiling.height-heightOffset;
00484                 }
00485                 size_t depthOffset = 0;
00486                 extra = 0;
00487                 if(tiling.coreDepthOffset>margin){
00488                         depthOffset = tiling.coreDepthOffset-margin;
00489                 }else extra = margin-depthOffset;
00490                 //cout << "depth extra: " << extra << endl;
00491                 size_t depth = tiling.coreDepth+margin*2-extra;
00492                 if((depthOffset+depth)>=tiling.depth){
00493                         depth = tiling.depth-depthOffset;
00494                 }
00495                 
00496                 //cout << "width-offset: " <<widthOffset << endl;
00497                 //cout << "height-offset: " <<heightOffset << endl;
00498                 //cout << "depth-offset: " <<depthOffset << endl;
00499                 
00500                 //cout << "width: " <<width << endl;
00501                 //cout << "height: " <<height << endl;
00502                 //cout << "depth: " <<depth << endl;
00503                 if(it%2==0){
00504                         #ifdef PSKEL_SHARED_MASK
00505                         stencilTilingCU<<<DimGrid, DimBlock, (this->mask.size*this->mask.dimension)>>>(in, out, this->mask, this->args, widthOffset, heightOffset, depthOffset, width, height, depth);
00506                         #else
00507                         stencilTilingCU<<<DimGrid, DimBlock>>>(in, out, this->mask, this->args, widthOffset, heightOffset, depthOffset, width, height, depth);
00508                         #endif
00509                 }else{
00510                         #ifdef PSKEL_SHARED_MASK
00511                         stencilTilingCU<<<DimGrid, DimBlock, (this->mask.size*this->mask.dimension)>>>(out, in, this->mask, this->args, widthOffset, heightOffset, depthOffset, width, height, depth);
00512                         #else
00513                         stencilTilingCU<<<DimGrid, DimBlock>>>(out, in, this->mask, this->args, widthOffset, heightOffset, depthOffset, width, height, depth);
00514                         #endif
00515                 }
00516                 gpuErrchk( cudaPeekAtLastError() );
00517                 gpuErrchk( cudaDeviceSynchronize() );
00518         }
00519 }
00520 
00521 
00522 struct TilingGPUGeneticEvaluationFunction{
00523     size_t iterations;
00524     size_t height;
00525     size_t width;
00526     size_t depth;
00527     size_t range;
00528     size_t typeSize;
00529     size_t memFree;
00530     size_t popsize;
00531     size_t ngen;
00532     size_t dw;
00533     size_t dt;
00534     size_t dh;
00535     float score;
00536 };
00537 TilingGPUGeneticEvaluationFunction tilingGPUEvaluator;
00538 
00539 float objective2D(GAGenome &c){
00540         GABin2DecGenome &genome = (GABin2DecGenome &)c;
00541         
00542         float h = genome.phenotype(0);
00543         float it = genome.phenotype(1);
00544         size_t tileHeight = ((tilingGPUEvaluator.height<=(2*it*tilingGPUEvaluator.range + h))?tilingGPUEvaluator.height:(2*it*tilingGPUEvaluator.range + h));
00545  
00546         if(2*(tilingGPUEvaluator.width*tilingGPUEvaluator.depth*tileHeight*tilingGPUEvaluator.typeSize) > tilingGPUEvaluator.memFree)return 0;
00547         else {
00548                 float val = h/tileHeight;
00549                 return val*((it*h)/(tilingGPUEvaluator.height*tilingGPUEvaluator.iterations));
00550         }
00551 }
00552 
00553 void solve2D(unsigned int seed){
00554         int popsize = tilingGPUEvaluator.popsize;
00555         int ngen = tilingGPUEvaluator.ngen;
00556         float pmut = 0.01;
00557         float pcross = 0.6;
00558         
00559         float div = (2.0*(tilingGPUEvaluator.width*tilingGPUEvaluator.depth*tilingGPUEvaluator.height*tilingGPUEvaluator.typeSize))/(tilingGPUEvaluator.memFree*1.1);
00560         size_t maxHeight = ceil(float(tilingGPUEvaluator.height)/div);
00561         //Create a phenotype for two variables.  The number of bits you can use to
00562         //represent any number is limited by the type of computer you are using.  In
00563         //this case, we use 16 bits to represent a floating point number whose value
00564         //can range from -5 to 5, inclusive.  The bounds on x1 and x2 can be applied
00565         //here and/or in the objective function.
00566         GABin2DecPhenotype map;
00567         map.add(16, 1, maxHeight); //min/max boundaries, inclusive
00568         map.add(16, 1, tilingGPUEvaluator.iterations);
00569 
00570         //Create the template genome using the phenotype map we just made.
00571         GABin2DecGenome genome(map, objective2D);
00572 
00573         //Now create the GA using the genome and run it.  We'll use sigma truncation
00574         //scaling so that we can handle negative objective scores.
00575         GASimpleGA ga(genome);
00576         GASigmaTruncationScaling scaling;
00577         ga.populationSize(popsize);
00578         ga.nGenerations(ngen);
00579         ga.pMutation(pmut);
00580         ga.pCrossover(pcross);
00581         ga.scaling(scaling);
00582         ga.scoreFrequency(0);
00583         ga.flushFrequency(0); //stop flushing the record of the score of given generations
00584         //ga.scoreFilename(0); //stop recording the score of given generations
00585         ga.evolve(seed);
00586 
00587         //Obtains the best individual from the best population evolved
00588         genome = ga.statistics().bestIndividual();
00589 
00590         //cout << "the ga found an optimum at the point (";
00591         //cout << genome.phenotype(0) << ", " << genome.phenotype(1) << ")\n\n";
00592         //cout << "best of generation data are in '" << ga.scoreFilename() << "'\n";
00593         tilingGPUEvaluator.dw = tilingGPUEvaluator.width;
00594         tilingGPUEvaluator.dh = genome.phenotype(0);//height;
00595         tilingGPUEvaluator.dt = genome.phenotype(1);//subIterations;
00596         tilingGPUEvaluator.score = objective2D(genome);
00597 }
00598 
00599 float objective3D(GAGenome &c){
00600         GABin2DecGenome &genome = (GABin2DecGenome &)c;
00601         
00602         float w = genome.phenotype(0);
00603         float h = genome.phenotype(1);
00604         float t = genome.phenotype(2);
00605         float tileWidth = ((tilingGPUEvaluator.width<=(2*t*tilingGPUEvaluator.range + w))?tilingGPUEvaluator.width:(2*t*tilingGPUEvaluator.range + w));
00606         float tileHeight = ((tilingGPUEvaluator.height<=(2*t*tilingGPUEvaluator.range + h))?tilingGPUEvaluator.height:(2*t*tilingGPUEvaluator.range + h));
00607  
00608         if(2*(tileWidth*tileHeight*tilingGPUEvaluator.depth*tilingGPUEvaluator.typeSize) > tilingGPUEvaluator.memFree) return 0;
00609         else {
00610                 float val = (w*h)/(tileWidth*tileHeight);
00611                 return val*((w*h*t)/(tilingGPUEvaluator.width*tilingGPUEvaluator.height*tilingGPUEvaluator.iterations));
00612         }
00613 }
00614 
00615 void solve3D(unsigned int seed){
00616         int popsize = tilingGPUEvaluator.popsize;
00617         int ngen = tilingGPUEvaluator.ngen;
00618         float pmut = 0.01;
00619         float pcross = 0.6;
00620         
00621         //float div = (2.0*(tilingGPUEvaluator.width*tilingGPUEvaluator.depth*tilingGPUEvaluator.height*tilingGPUEvaluator.typeSize))/(tilingGPUEvaluator.memFree*1.1);
00622         //size_t maxHeight = ceil(float(tilingGPUEvaluator.height)/div);
00623         //Create a phenotype for two variables.  The number of bits you can use to
00624         //represent any number is limited by the type of computer you are using.  In
00625         //this case, we use 16 bits to represent a floating point number whose value
00626         //can range from -5 to 5, inclusive.  The bounds on x1 and x2 can be applied
00627         //here and/or in the objective function.
00628         GABin2DecPhenotype map;
00629         //map.add(16, 1, maxHeight); //min/max boundaries, inclusive
00630         map.add(16, 1, tilingGPUEvaluator.width);
00631         map.add(16, 1, tilingGPUEvaluator.height);
00632         map.add(16, 1, tilingGPUEvaluator.iterations);
00633 
00634         //Create the template genome using the phenotype map we just made.
00635         GABin2DecGenome genome(map, objective3D);
00636 
00637         //Now create the GA using the genome and run it.  We'll use sigma truncation
00638         //scaling so that we can handle negative objective scores.
00639         GASimpleGA ga(genome);
00640         GASigmaTruncationScaling scaling;
00641         ga.populationSize(popsize);
00642         ga.nGenerations(ngen);
00643         ga.pMutation(pmut);
00644         ga.pCrossover(pcross);
00645         ga.scaling(scaling);
00646         ga.scoreFrequency(0);
00647         ga.flushFrequency(0); //stop flushing the record of the score of given generations
00648         //ga.scoreFilename(0); //stop recording the score of given generations
00649         ga.evolve(seed);
00650 
00651         //Obtains the best individual from the best population evolved
00652         genome = ga.statistics().bestIndividual();
00653 
00654         //cout << "the ga found an optimum at the point (";
00655         //cout << genome.phenotype(0) << ", " << genome.phenotype(1) << ")\n\n";
00656         //cout << "best of generation data are in '" << ga.scoreFilename() << "'\n";
00657         tilingGPUEvaluator.dw = genome.phenotype(0);//width;
00658         tilingGPUEvaluator.dh = genome.phenotype(1);//height;
00659         tilingGPUEvaluator.dt = genome.phenotype(2);//subIterations;
00660         tilingGPUEvaluator.score = objective3D(genome);
00661 }
00662 
00663 template<class Array, class Mask, class Args>
00664 void StencilBase<Array, Mask,Args>::runIterativeAutoGPU(size_t iterations, size_t GPUBlockSize){
00665         size_t gpuMemFree, gpuMemTotal;
00666         //gpuErrchk( cudaDeviceSynchronize() );
00667         cudaMemGetInfo(&gpuMemFree, &gpuMemTotal);
00668         if((this->input.memSize()+this->output.memSize()+this->mask.memSize())<(0.999*gpuMemFree)){
00669                 runIterativeGPU(iterations, GPUBlockSize);
00670         }else if(this->input.getHeight()==1){
00671                 //solving for a 'transposed matrix'
00672                 tilingGPUEvaluator.typeSize = this->input.memSize()/this->input.size();
00673                 tilingGPUEvaluator.iterations = iterations;
00674                 tilingGPUEvaluator.width = this->input.getDepth(); //'transposed matrix'
00675                 tilingGPUEvaluator.height = this->input.getWidth(); //'transposed matrix'
00676                 tilingGPUEvaluator.depth = 1;
00677                 tilingGPUEvaluator.range = this->mask.getRange();
00678                 tilingGPUEvaluator.memFree = (gpuMemFree-this->mask.memSize())*0.999;//gpuMemFree*0.998;
00679 
00680                 tilingGPUEvaluator.popsize = 100;
00681                 tilingGPUEvaluator.ngen = 2500;
00682 
00683                 unsigned int seed = time(NULL);
00684                 solve2D(seed);
00685 
00686                 size_t subIterations = tilingGPUEvaluator.dt;
00687                 size_t width = tilingGPUEvaluator.dh;
00688                 //cout << "GPU Mem Total: "<< gpuMemTotal <<endl;
00689                 //cout << "GPU Mem Free: "<< gpuMemFree <<endl;
00690                 //cout << "sub iterations: "<< subIterations <<endl;
00691                 //cout << "tiling height: "<<height<<endl;
00692                 runIterativeTilingGPU(iterations, width, 1, this->input.getDepth(), subIterations, GPUBlockSize);
00693                 
00694         }else {
00695                 size_t typeSize = this->input.memSize()/this->input.size();
00696                 tilingGPUEvaluator.typeSize = typeSize;
00697                 tilingGPUEvaluator.iterations = iterations;
00698                 tilingGPUEvaluator.width = this->input.getWidth();
00699                 tilingGPUEvaluator.height = this->input.getHeight();
00700                 tilingGPUEvaluator.depth = this->input.getDepth();
00701                 tilingGPUEvaluator.range = this->mask.getRange();
00702                 tilingGPUEvaluator.memFree = (gpuMemFree-this->mask.memSize())*0.999;//gpuMemFree*0.998;
00703                 if( (2*(1+2*this->mask.getRange())*(this->input.getWidth()*this->input.getDepth())*typeSize+this->mask.memSize()) > (0.98*gpuMemFree) ){
00704                         tilingGPUEvaluator.popsize = 100;
00705                         tilingGPUEvaluator.ngen = 2500;
00706                         unsigned int seed = time(NULL);
00707                         solve3D(seed);
00708 
00709                         size_t width = tilingGPUEvaluator.dw;
00710                         size_t height = tilingGPUEvaluator.dh;
00711                         size_t subIterations = tilingGPUEvaluator.dt;
00712                         //cout << "GPU Mem Total: "<< gpuMemTotal <<endl;
00713                         //cout << "GPU Mem Free: "<< gpuMemFree <<endl;
00714                         //cout << "sub iterations: "<< subIterations <<endl;
00715                         //cout << "tiling height: "<<height<<endl;
00716                         runIterativeTilingGPU(iterations, width, height, this->input.getDepth(), subIterations, GPUBlockSize);
00717                 }else{
00718                         tilingGPUEvaluator.popsize = 100;
00719                         tilingGPUEvaluator.ngen = 2500;
00720                         unsigned int seed = time(NULL);
00721                         solve2D(seed);
00722 
00723                         size_t subIterations = tilingGPUEvaluator.dt;
00724                         size_t height = tilingGPUEvaluator.dh;
00725                         //cout << "GPU Mem Total: "<< gpuMemTotal <<endl;
00726                         //cout << "GPU Mem Free: "<< gpuMemFree <<endl;
00727                         //cout << "sub iterations: "<< subIterations <<endl;
00728                         //cout << "tiling height: "<<height<<endl;
00729                         runIterativeTilingGPU(iterations, this->input.getWidth(), height, this->input.getDepth(), subIterations, GPUBlockSize);
00730                 }
00731         }
00732 }
00733 
00734 //*******************************************************************************************
00735 // Stencil 3D
00736 //*******************************************************************************************
00737 
00738 
00739 template<class Array, class Mask, class Args>
00740 Stencil3D<Array,Mask,Args>::Stencil3D(){}
00741         
00742 template<class Array, class Mask, class Args>
00743 Stencil3D<Array,Mask,Args>::Stencil3D(Array _input, Array _output, Mask _mask, Args _args){
00744         this->input = _input;
00745         this->output = _output;
00746         this->args = _args;
00747         this->mask = _mask;
00748 }
00749 
00750 template<class Array, class Mask, class Args>
00751 void Stencil3D<Array,Mask,Args>::runSeq(Array in, Array out){
00752         for (int h = 0; h < in.getHeight(); h++){
00753         for (int w = 0; w < in.getWidth(); w++){
00754         for (int d = 0; d < in.getDepth(); d++){
00755                 stencilKernel(in,out,this->mask,this->args,h,w,d);
00756         }}}
00757 }
00758 
00759 template<class Array, class Mask, class Args>
00760 void Stencil3D<Array,Mask,Args>::runOpenMP(Array in, Array out, size_t numThreads){
00761         omp_set_num_threads(numThreads);
00762         #pragma omp parallel for
00763         for (int h = 0; h < in.getHeight(); h++){
00764         for (int w = 0; w < in.getWidth(); w++){
00765         for (int d = 0; d < in.getDepth(); d++){
00766                 stencilKernel(in,out,this->mask,this->args,h,w,d);
00767         }}}
00768 }
00769 
00770 #ifdef PSKEL_TBB
00771 template<class Array, class Mask, class Args>
00772 struct TBBStencil3D{
00773         Array input;
00774         Array output;
00775         Mask mask;
00776         Args args;
00777         TBBStencil3D(Array input, Array output, Mask mask, Args args){
00778                 this->input = input;
00779                 this->output = output;
00780                 this->mask = mask;
00781                 this->args = args;
00782         }
00783         void operator()(tbb::blocked_range<int> r)const{
00784                 for (int h = r.begin(); h != r.end(); h++){
00785                 for (int w = 0; w < this->input.getWidth(); w++){
00786                 for (int d = 0; d < this->input.getDepth(); d++){
00787                         stencilKernel(this->input,this->output,this->mask,this->args,h,w,d);
00788                 }}}
00789         }
00790 };
00791 
00792 template<class Array, class Mask, class Args>
00793 void Stencil3D<Array,Mask,Args>::runTBB(Array in, Array out, size_t numThreads){
00794         TBBStencil3D<Array, Mask, Args> tbbstencil(in, out, this->mask, this->args);
00795         tbb::task_scheduler_init init(numThreads);
00796         tbb::parallel_for(tbb::blocked_range<int>(0, in.getHeight()), tbbstencil);
00797 }
00798 #endif
00799 
00800 //*******************************************************************************************
00801 // Stencil 2D
00802 //*******************************************************************************************
00803 
00804 template<class Array, class Mask, class Args>
00805 Stencil2D<Array,Mask,Args>::Stencil2D(){}
00806 
00807 template<class Array, class Mask, class Args>
00808 Stencil2D<Array,Mask,Args>::Stencil2D(Array _input, Array _output, Mask _mask, Args _args){
00809         this->input = _input;
00810         this->output = _output;
00811         this->args = _args;
00812         this->mask = _mask;
00813 }
00814 /*
00815 template<class Array, class Mask, class Args>
00816 Stencil2D<Array,Mask,Args>::~Stencil2D(){
00817         this->cudaMemFree();
00818         this->cpuMemFree();
00819 }
00820 */
00821 
00822 template<class Array, class Mask, class Args>
00823 void Stencil2D<Array,Mask,Args>::runSeq(Array in, Array out){
00824         for (int h = 0; h < in.getHeight(); h++){
00825         for (int w = 0; w < in.getWidth(); w++){
00826                 stencilKernel(in,out,this->mask, this->args,h,w);
00827         }}
00828 }
00829 
00830 template<class Array, class Mask, class Args>
00831 void Stencil2D<Array,Mask,Args>::runOpenMP(Array in, Array out, size_t numThreads){
00832         omp_set_num_threads(numThreads);
00833         #pragma omp parallel for
00834         for (int h = 0; h < in.getHeight(); h++){
00835         for (int w = 0; w < in.getWidth(); w++){
00836                 stencilKernel(in,out,this->mask, this->args,h,w);
00837         }}
00838 }
00839 
00840 #ifdef PSKEL_TBB
00841 template<class Array, class Mask, class Args>
00842 struct TBBStencil2D{
00843         Array input;
00844         Array output;
00845         Mask mask;
00846         Args args;
00847         TBBStencil2D(Array input, Array output, Mask mask, Args args){
00848                 this->input = input;
00849                 this->output = output;
00850                 this->mask = mask;
00851                 this->args = args;
00852         }
00853         void operator()(tbb::blocked_range<int> r)const{
00854                 for (int h = r.begin(); h != r.end(); h++){
00855                 for (int w = 0; w < this->input.getWidth(); w++){
00856                         stencilKernel(this->input,this->output,this->mask, this->args,h,w);
00857                 }}
00858         }
00859 };
00860 
00861 template<class Array, class Mask, class Args>
00862 void Stencil2D<Array,Mask,Args>::runTBB(Array in, Array out, size_t numThreads){
00863         TBBStencil2D<Array, Mask, Args> tbbstencil(in, out, this->mask, this->args);
00864         tbb::task_scheduler_init init(numThreads);
00865         tbb::parallel_for(tbb::blocked_range<int>(0, in.getHeight()), tbbstencil);
00866 }
00867 #endif
00868 
00869 //*******************************************************************************************
00870 // Stencil 1D
00871 //*******************************************************************************************
00872 
00873 
00874 template<class Array, class Mask, class Args>
00875 Stencil<Array,Mask,Args>::Stencil(){}
00876         
00877 template<class Array, class Mask, class Args>
00878 Stencil<Array,Mask,Args>::Stencil(Array _input, Array _output, Mask _mask, Args _args){
00879         this->input = _input;
00880         this->output = _output;
00881         this->args = _args;
00882         this->mask = _mask;
00883 }
00884 
00885 template<class Array, class Mask, class Args>
00886 void Stencil<Array,Mask,Args>::runSeq(Array in, Array out){
00887         for (int i = 0; i < in.getWidth(); i++){
00888                 stencilKernel(in,out,this->mask, this->args,i);
00889         }
00890 }
00891 
00892 template<class Array, class Mask, class Args>
00893 void Stencil<Array,Mask,Args>::runOpenMP(Array in, Array out, size_t numThreads){
00894         omp_set_num_threads(numThreads);
00895         #pragma omp parallel for
00896         for (int i = 0; i < in.getWidth(); i++){
00897                 stencilKernel(in,out,this->mask, this->args,i);
00898         }
00899 }
00900 
00901 #ifdef PSKEL_TBB
00902 template<class Array, class Mask, class Args>
00903 struct TBBStencil{
00904         Array input;
00905         Array output;
00906         Mask mask;
00907         Args args;
00908         TBBStencil(Array input, Array output, Mask mask, Args args){
00909                 this->input = input;
00910                 this->output = output;
00911                 this->mask = mask;
00912                 this->args = args;
00913         }
00914         void operator()(tbb::blocked_range<int> r)const{
00915                 for (int i = r.begin(); i != r.end(); i++){
00916                         stencilKernel(this->input,this->output,this->mask, this->args,i);
00917                 }
00918         }
00919 };
00920 
00921 template<class Array, class Mask, class Args>
00922 void Stencil<Array,Mask,Args>::runTBB(Array in, Array out, size_t numThreads){
00923         TBBStencil<Array, Mask, Args> tbbstencil(in, out, this->mask, this->args);
00924         tbb::task_scheduler_init init(numThreads);
00925         tbb::parallel_for(tbb::blocked_range<int>(0, in.getWidth()), tbbstencil);
00926 }
00927 #endif
00928 
00929 }//end namespace
00930 
00931 
00932 #endif
 All Classes Files Functions