PSkel: src/PSkelArray.hpp Source File

00001 //-------------------------------------------------------------------------------
00002 // Copyright (c) 2015, ICEI - PUC Minas
00003 // All rights reserved.
00004 // 
00005 // Redistribution and use in source and binary forms, with or without
00006 // modification, are permitted provided that the following conditions are met:
00007 // 
00008 // 1. Redistributions of source code must retain the above copyright notice, this
00009 // list of conditions and the following disclaimer.
00010 // 
00011 // 2. Redistributions in binary form must reproduce the above copyright notice,
00012 // this list of conditions and the following disclaimer in the documentation
00013 // and/or other materials provided with the distribution.
00014 // 
00015 // 3. Neither the name of the copyright holder nor the names of its contributors
00016 // may be used to endorse or promote products derived from this software without
00017 // specific prior written permission.
00018 // 
00019 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00020 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00021 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00022 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
00023 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00024 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
00025 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00026 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
00027 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00028 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029 //-------------------------------------------------------------------------------
00030 
00031 #ifndef PSKEL_ARRAY_HPP
00032 #define PSKEL_ARRAY_HPP
00033 
00034 #include <cstring>
00035 #include <omp.h>
00036 
00037 namespace PSkel{
00038 
00039 template<typename T>
00040 ArrayBase<T>::ArrayBase(size_t width, size_t height, size_t depth){
00041         this->width = width;
00042         this->height = height;
00043         this->depth = depth;
00044         this->realWidth = width;
00045         this->realHeight = height;
00046         this->realDepth = depth;
00047         this->widthOffset = 0;
00048         this->heightOffset = 0;
00049         this->depthOffset = 0;
00050         this->hostArray = 0;
00051         this->deviceArray = 0;
00052         if(size()>0) this->hostAlloc();
00053 }
00054 
00055 template<typename T>
00056 void ArrayBase<T>::deviceAlloc(){
00057         if(this->deviceArray==NULL){
00058                 gpuErrchk( cudaMalloc((void **) &deviceArray, size()*sizeof(T)) );
00059                 cudaMemset(this->deviceArray, 0, size()*sizeof(T));
00060         }
00061 }
00062 
00063 template<typename T>
00064 void ArrayBase<T>::deviceFree(){
00065         //if(this->deviceArray!=NULL){
00066                 cudaFree(this->deviceArray);
00067                 this->deviceArray = NULL;
00068         //}
00069 }
00070 
00071 template<typename T>
00072 void ArrayBase<T>::hostAlloc(size_t width, size_t height, size_t depth){
00073         this->width = width;
00074         this->height = height;
00075         this->depth = depth;
00076         this->realWidth = width;
00077         this->realHeight = height;
00078         this->realDepth = depth;
00079         this->widthOffset = 0;
00080         this->heightOffset = 0;
00081         this->depthOffset = 0;
00082         this->hostArray = NULL;
00083         this->deviceArray = NULL;
00084 
00085         this->hostAlloc();
00086 }
00087 
00088 template<typename T>
00089 void ArrayBase<T>::hostAlloc(){
00090         if(this->hostArray==NULL){
00091                 this->hostArray = (T*) calloc(size(), sizeof(T));
00092                 //gpuErrchk( cudaMallocHost((void**)&hostArray, size()*sizeof(T)) );
00093                 //memset(this->hostArray, 0, size()*sizeof(T));
00094         }
00095 }
00096         
00097 template<typename T>
00098 void ArrayBase<T>::hostFree(){
00099         //if(this->hostArray!=NULL){
00100                 free(this->hostArray);
00101                 //cudaFreeHost(this->hostArray);
00102                 this->hostArray = NULL;
00103         //}
00104 }
00105 
00106 template<typename T>
00107 size_t ArrayBase<T>::getWidth() const{
00108         return width;
00109 }
00110         
00111 template<typename T>
00112 size_t ArrayBase<T>::getHeight() const{
00113         return height;
00114 }
00115 
00116 template<typename T>
00117 size_t ArrayBase<T>::getDepth() const{
00118         return depth;
00119 }
00120         
00121 template<typename T>
00122 size_t ArrayBase<T>::memSize() const{
00123         return size()*sizeof(T);
00124 }
00125 
00126 template<typename T>
00127 size_t ArrayBase<T>::size() const{
00128         return height*width*depth;
00129 }
00130 
00131 template<typename T>
00132 size_t ArrayBase<T>::realSize() const{
00133         return realHeight*realWidth*realDepth;
00134 }
00135 
00136 template<typename T>
00137 __device__ __forceinline__ T & ArrayBase<T>::deviceGet(size_t h, size_t w, size_t d) const {
00138         return this->deviceArray[(h*width+w)*depth+d];
00139 }
00140 
00141 template<typename T>
00142 T & ArrayBase<T>::hostGet(size_t h, size_t w, size_t d) const {
00143         return this->hostArray[ ((h+heightOffset)*realWidth + (w+widthOffset))*realDepth + (d+depthOffset) ];
00144 }
00145 
00146 template<typename T> template<typename Arrays>
00147 void ArrayBase<T>::hostSlice(Arrays array, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t width, size_t height, size_t depth){
00148         //maintain previous allocated area
00149         if(this->deviceArray!=NULL){
00150                 if(this->size()!=(width*height*depth)){
00151                         this->deviceFree();
00152                         this->deviceArray = NULL;
00153                 }
00154         }
00155         //Copy dimensions
00156         this->width = width;
00157         this->height = height;
00158         this->depth = depth;
00159         this->widthOffset = array.widthOffset+widthOffset;
00160         this->heightOffset = array.heightOffset+heightOffset;
00161         this->depthOffset = array.depthOffset+depthOffset;
00162         this->realWidth = array.realWidth;
00163         this->realHeight = array.realHeight;
00164         this->realDepth = array.realDepth;
00165         this->hostArray = array.hostArray;
00166 }
00167 
00168 //TODO: Alterar para retornar um Array ao invÃ©s de receber por parametro
00169 template<typename T> template<typename Arrays>
00170 void ArrayBase<T>::hostClone(Arrays array){
00171         //Copy dimensions
00172         this->width = array.width;
00173         this->height = array.height;
00174         this->depth = array.depth;
00175         this->widthOffset = 0;
00176         this->heightOffset = 0;
00177         this->depthOffset = 0;
00178         this->realWidth = array.width;
00179         this->realHeight = array.height;
00180         this->realDepth = array.depth;
00181         //Alloc clone memory
00182         this->hostArray = NULL;
00183         this->hostAlloc();
00184         //Copy clone memory
00185         this->hostMemCopy(array);
00186 }
00187         
00188 template<typename T> template<typename Arrays>
00189 void ArrayBase<T>::hostMemCopy(Arrays array){
00190         if(array.size()==array.realSize() && this->size()==this->realSize()){
00191                 memcpy(this->hostArray, array.hostArray, size()*sizeof(T));
00192         }else{
00193                 #pragma omp parallel for
00194                 for(size_t i = 0; i<height; ++i){
00195                 for(size_t j = 0; j<width; ++j){
00196                 for(size_t k = 0; k<depth; ++k){
00197                         this->hostGet(i,j,k)=array.hostGet(i,j,k);
00198                 }}}
00199         }
00200 }
00201 
00202 template<typename T>
00203 void ArrayBase<T>::copyToDevice(){
00204         if(size()==realSize()){
00205                 gpuErrchk ( cudaMemcpy(deviceArray, hostArray, size()*sizeof(T), cudaMemcpyHostToDevice) );
00206         }else if(depth==realDepth && width==realWidth){
00207                 T *hostPtr = (T*)(hostArray) + size_t(heightOffset*realWidth*realDepth);
00208                 gpuErrchk ( cudaMemcpy(deviceArray, hostPtr, size()*sizeof(T),cudaMemcpyHostToDevice) );
00209         }else if(realDepth==1 && realHeight==1){
00210                 T *hostPtr = (T*)(hostArray) + size_t(widthOffset);
00211                 gpuErrchk ( cudaMemcpy(deviceArray, hostPtr, size()*sizeof(T),cudaMemcpyHostToDevice) );
00212         }else{ 
00213                 //if "virtual" array is non-continuously allocated,
00214                 //create a copy in pinned memory before transfering.
00215                 T *copyPtr;
00216                 gpuErrchk( cudaMallocHost((void**)&copyPtr, size()*sizeof(T)) );
00217                 #pragma omp parallel for
00218                 for(size_t h = 0; h<height; ++h){
00219                 for(size_t w = 0; w<width; ++w){
00220                 for(size_t d = 0; d<depth; ++d){
00221                         copyPtr[(h*width+w)*depth+d] = this->hostGet(h,w,d);
00222                 }}}
00223                 gpuErrchk ( cudaMemcpy(deviceArray, copyPtr, size()*sizeof(T), cudaMemcpyHostToDevice) );
00224                 cudaFreeHost(copyPtr);
00225         }
00226 }
00227 
00228 template<typename T> template<typename Arrays>
00229 void ArrayBase<T>::copyFromDevice(Arrays array){
00230         if(array.size()==realSize()){
00231                 gpuErrchk ( cudaMemcpy(hostArray, array.deviceArray, array.size()*sizeof(T),cudaMemcpyDeviceToHost) );
00232         }else if(array.depth==realDepth && array.width==realWidth){
00233                 T *hostPtr = (T*)(hostArray) + size_t(heightOffset*realWidth*realDepth);
00234                 gpuErrchk ( cudaMemcpy(hostPtr, array.deviceArray, array.size()*sizeof(T), cudaMemcpyDeviceToHost) );
00235         }else if(realDepth==1 && realHeight==1){
00236                 T *hostPtr = (T*)(hostArray) + size_t(widthOffset);
00237                 gpuErrchk ( cudaMemcpy(hostPtr, array.deviceArray, array.size()*sizeof(T), cudaMemcpyDeviceToHost) );
00238         }else{
00239                 //if "virtual" array is non-continuously allocated,
00240                 //create a copy in pinned memory before transfering.
00241                 T *copyPtr;
00242                 gpuErrchk( cudaMallocHost((void**)&copyPtr, size()*sizeof(T)) );
00243                 gpuErrchk ( cudaMemcpy(copyPtr, array.deviceArray, size()*sizeof(T), cudaMemcpyDeviceToHost) );
00244                 #pragma omp parallel for
00245                 for(size_t h = 0; h<height; ++h){
00246                 for(size_t w = 0; w<width; ++w){
00247                 for(size_t d = 0; d<depth; ++d){
00248                         this->hostGet(h,w,d) = copyPtr[(h*width+w)*depth+d];
00249                 }}}
00250                 cudaFreeHost(copyPtr);
00251         }
00252 }
00253 
00254 template<typename T>
00255 void ArrayBase<T>::copyToHost(){
00256         if(size()==realSize()){
00257                 gpuErrchk ( cudaMemcpy(hostArray, deviceArray, size()*sizeof(T),cudaMemcpyDeviceToHost) );
00258         }else if(depth==realDepth && width==realWidth){
00259                 T *hostPtr = (T*)(hostArray) + size_t(heightOffset*realWidth*realDepth);
00260                 gpuErrchk ( cudaMemcpy(hostPtr, deviceArray, size()*sizeof(T), cudaMemcpyDeviceToHost) );
00261         }else if(realDepth==1 && realHeight==1){
00262                 T *hostPtr = (T*)(hostArray) + size_t(widthOffset);
00263                 gpuErrchk ( cudaMemcpy(hostPtr, deviceArray, size()*sizeof(T), cudaMemcpyDeviceToHost) );
00264         }else{
00265                 //if "virtual" array is non-continuously allocated,
00266                 //create a copy in pinned memory before transfering.
00267                 T *copyPtr;
00268                 gpuErrchk( cudaMallocHost((void**)&copyPtr, size()*sizeof(T)) );
00269                 gpuErrchk ( cudaMemcpy(copyPtr, deviceArray, size()*sizeof(T), cudaMemcpyDeviceToHost) );
00270                 #pragma omp parallel for
00271                 for(size_t h = 0; h<height; ++h){
00272                 for(size_t w = 0; w<width; ++w){
00273                 for(size_t d = 0; d<depth; ++d){
00274                         this->hostGet(h,w,d) = copyPtr[(h*width+w)*depth+d];
00275                 }}}
00276                 cudaFreeHost(copyPtr);
00277         }
00278 }
00279 
00280 template<typename T>
00281 ArrayBase<T>::operator bool() const {
00282         #ifdef __CUDA_ARCH__
00283         return(this->deviceArray!=NULL);
00284         #else
00285         return(this->hostArray!=NULL);
00286         #endif
00287 }
00288 
00289 //*******************************************************************************************
00290 // Array 3D
00291 //*******************************************************************************************
00292 
00293 template<typename T>
00294 Array3D<T>::Array3D() : ArrayBase<T>(0,0,0) {}
00295         
00296 /*
00297 //TODO O kernel cuda nÃ£o aceita structs com destrutores. Corrigir nas prÃ³ximas versÃµes
00298 ~Array3D(){
00299 free(hostArray);
00300 cudaFree(deviceArray);
00301 }*/
00302 
00303 template<typename T>
00304 Array3D<T>::Array3D(size_t width, size_t height, size_t depth) : ArrayBase<T>(width,height,depth){}
00305 
00306 template<typename T>
00307 T & Array3D<T>::operator()(size_t h,size_t w,size_t d) const {
00308         #ifdef __CUDA_ARCH__
00309                 return this->deviceGet(h,w,d);
00310         #else
00311                 return this->hostGet(h,w,d);
00312         #endif
00313 }
00314 
00315 //*******************************************************************************************
00316 // Array 2D
00317 //*******************************************************************************************
00318 
00319 template<typename T>
00320 Array2D<T>::Array2D() : ArrayBase<T>(0,0,0) {}
00321 
00322 template<typename T>
00323 Array2D<T>::Array2D(size_t width, size_t height) : ArrayBase<T>(width,height,1){}
00324 
00325 template<typename T>
00326 T & Array2D<T>::operator()(size_t h, size_t w) const {
00327         #ifdef __CUDA_ARCH__
00328                 return this->deviceGet(h,w,0);
00329         #else
00330                 return this->hostGet(h,w,0);
00331         #endif
00332 }
00333 
00334 //*******************************************************************************************
00335 // Array 1D
00336 //*******************************************************************************************
00337 
00338 template<typename T>
00339 Array<T>::Array() : ArrayBase<T>(0,0,0){}
00340 
00341 template<typename T>
00342 Array<T>::Array(size_t size) : ArrayBase<T>(size,1,1){}
00343 
00344 template<typename T>
00345 T & Array<T>::operator()(size_t w) const {
00346         #ifdef __CUDA_ARCH__
00347                 return this->deviceGet(0,w,0);
00348         #else
00349                 return this->hostGet(0,w,0);
00350         #endif
00351 }
00352 
00353 }//end namespace
00354 #endif