PSkel
|
00001 //------------------------------------------------------------------------------- 00002 // Copyright (c) 2015, ICEI - PUC Minas 00003 // All rights reserved. 00004 // 00005 // Redistribution and use in source and binary forms, with or without 00006 // modification, are permitted provided that the following conditions are met: 00007 // 00008 // 1. Redistributions of source code must retain the above copyright notice, this 00009 // list of conditions and the following disclaimer. 00010 // 00011 // 2. Redistributions in binary form must reproduce the above copyright notice, 00012 // this list of conditions and the following disclaimer in the documentation 00013 // and/or other materials provided with the distribution. 00014 // 00015 // 3. Neither the name of the copyright holder nor the names of its contributors 00016 // may be used to endorse or promote products derived from this software without 00017 // specific prior written permission. 00018 // 00019 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 00020 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00021 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 00022 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 00023 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00024 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 00025 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00026 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 00027 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00028 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00029 //------------------------------------------------------------------------------- 00030 00031 #ifndef PSKEL_ARRAY_HPP 00032 #define PSKEL_ARRAY_HPP 00033 00034 #include <cstring> 00035 #include <omp.h> 00036 00037 namespace PSkel{ 00038 00039 template<typename T> 00040 ArrayBase<T>::ArrayBase(size_t width, size_t height, size_t depth){ 00041 this->width = width; 00042 this->height = height; 00043 this->depth = depth; 00044 this->realWidth = width; 00045 this->realHeight = height; 00046 this->realDepth = depth; 00047 this->widthOffset = 0; 00048 this->heightOffset = 0; 00049 this->depthOffset = 0; 00050 this->hostArray = 0; 00051 this->deviceArray = 0; 00052 if(size()>0) this->hostAlloc(); 00053 } 00054 00055 template<typename T> 00056 void ArrayBase<T>::deviceAlloc(){ 00057 if(this->deviceArray==NULL){ 00058 gpuErrchk( cudaMalloc((void **) &deviceArray, size()*sizeof(T)) ); 00059 cudaMemset(this->deviceArray, 0, size()*sizeof(T)); 00060 } 00061 } 00062 00063 template<typename T> 00064 void ArrayBase<T>::deviceFree(){ 00065 //if(this->deviceArray!=NULL){ 00066 cudaFree(this->deviceArray); 00067 this->deviceArray = NULL; 00068 //} 00069 } 00070 00071 template<typename T> 00072 void ArrayBase<T>::hostAlloc(size_t width, size_t height, size_t depth){ 00073 this->width = width; 00074 this->height = height; 00075 this->depth = depth; 00076 this->realWidth = width; 00077 this->realHeight = height; 00078 this->realDepth = depth; 00079 this->widthOffset = 0; 00080 this->heightOffset = 0; 00081 this->depthOffset = 0; 00082 this->hostArray = NULL; 00083 this->deviceArray = NULL; 00084 00085 this->hostAlloc(); 00086 } 00087 00088 template<typename T> 00089 void ArrayBase<T>::hostAlloc(){ 00090 if(this->hostArray==NULL){ 00091 this->hostArray = (T*) calloc(size(), sizeof(T)); 00092 //gpuErrchk( cudaMallocHost((void**)&hostArray, size()*sizeof(T)) ); 00093 //memset(this->hostArray, 0, size()*sizeof(T)); 00094 } 00095 } 00096 00097 template<typename T> 00098 void ArrayBase<T>::hostFree(){ 00099 //if(this->hostArray!=NULL){ 00100 free(this->hostArray); 00101 //cudaFreeHost(this->hostArray); 00102 this->hostArray = NULL; 00103 //} 00104 } 00105 00106 template<typename T> 00107 size_t ArrayBase<T>::getWidth() const{ 00108 return width; 00109 } 00110 00111 template<typename T> 00112 size_t ArrayBase<T>::getHeight() const{ 00113 return height; 00114 } 00115 00116 template<typename T> 00117 size_t ArrayBase<T>::getDepth() const{ 00118 return depth; 00119 } 00120 00121 template<typename T> 00122 size_t ArrayBase<T>::memSize() const{ 00123 return size()*sizeof(T); 00124 } 00125 00126 template<typename T> 00127 size_t ArrayBase<T>::size() const{ 00128 return height*width*depth; 00129 } 00130 00131 template<typename T> 00132 size_t ArrayBase<T>::realSize() const{ 00133 return realHeight*realWidth*realDepth; 00134 } 00135 00136 template<typename T> 00137 __device__ __forceinline__ T & ArrayBase<T>::deviceGet(size_t h, size_t w, size_t d) const { 00138 return this->deviceArray[(h*width+w)*depth+d]; 00139 } 00140 00141 template<typename T> 00142 T & ArrayBase<T>::hostGet(size_t h, size_t w, size_t d) const { 00143 return this->hostArray[ ((h+heightOffset)*realWidth + (w+widthOffset))*realDepth + (d+depthOffset) ]; 00144 } 00145 00146 template<typename T> template<typename Arrays> 00147 void ArrayBase<T>::hostSlice(Arrays array, size_t widthOffset, size_t heightOffset, size_t depthOffset, size_t width, size_t height, size_t depth){ 00148 //maintain previous allocated area 00149 if(this->deviceArray!=NULL){ 00150 if(this->size()!=(width*height*depth)){ 00151 this->deviceFree(); 00152 this->deviceArray = NULL; 00153 } 00154 } 00155 //Copy dimensions 00156 this->width = width; 00157 this->height = height; 00158 this->depth = depth; 00159 this->widthOffset = array.widthOffset+widthOffset; 00160 this->heightOffset = array.heightOffset+heightOffset; 00161 this->depthOffset = array.depthOffset+depthOffset; 00162 this->realWidth = array.realWidth; 00163 this->realHeight = array.realHeight; 00164 this->realDepth = array.realDepth; 00165 this->hostArray = array.hostArray; 00166 } 00167 00168 //TODO: Alterar para retornar um Array ao invés de receber por parametro 00169 template<typename T> template<typename Arrays> 00170 void ArrayBase<T>::hostClone(Arrays array){ 00171 //Copy dimensions 00172 this->width = array.width; 00173 this->height = array.height; 00174 this->depth = array.depth; 00175 this->widthOffset = 0; 00176 this->heightOffset = 0; 00177 this->depthOffset = 0; 00178 this->realWidth = array.width; 00179 this->realHeight = array.height; 00180 this->realDepth = array.depth; 00181 //Alloc clone memory 00182 this->hostArray = NULL; 00183 this->hostAlloc(); 00184 //Copy clone memory 00185 this->hostMemCopy(array); 00186 } 00187 00188 template<typename T> template<typename Arrays> 00189 void ArrayBase<T>::hostMemCopy(Arrays array){ 00190 if(array.size()==array.realSize() && this->size()==this->realSize()){ 00191 memcpy(this->hostArray, array.hostArray, size()*sizeof(T)); 00192 }else{ 00193 #pragma omp parallel for 00194 for(size_t i = 0; i<height; ++i){ 00195 for(size_t j = 0; j<width; ++j){ 00196 for(size_t k = 0; k<depth; ++k){ 00197 this->hostGet(i,j,k)=array.hostGet(i,j,k); 00198 }}} 00199 } 00200 } 00201 00202 template<typename T> 00203 void ArrayBase<T>::copyToDevice(){ 00204 if(size()==realSize()){ 00205 gpuErrchk ( cudaMemcpy(deviceArray, hostArray, size()*sizeof(T), cudaMemcpyHostToDevice) ); 00206 }else if(depth==realDepth && width==realWidth){ 00207 T *hostPtr = (T*)(hostArray) + size_t(heightOffset*realWidth*realDepth); 00208 gpuErrchk ( cudaMemcpy(deviceArray, hostPtr, size()*sizeof(T),cudaMemcpyHostToDevice) ); 00209 }else if(realDepth==1 && realHeight==1){ 00210 T *hostPtr = (T*)(hostArray) + size_t(widthOffset); 00211 gpuErrchk ( cudaMemcpy(deviceArray, hostPtr, size()*sizeof(T),cudaMemcpyHostToDevice) ); 00212 }else{ 00213 //if "virtual" array is non-continuously allocated, 00214 //create a copy in pinned memory before transfering. 00215 T *copyPtr; 00216 gpuErrchk( cudaMallocHost((void**)©Ptr, size()*sizeof(T)) ); 00217 #pragma omp parallel for 00218 for(size_t h = 0; h<height; ++h){ 00219 for(size_t w = 0; w<width; ++w){ 00220 for(size_t d = 0; d<depth; ++d){ 00221 copyPtr[(h*width+w)*depth+d] = this->hostGet(h,w,d); 00222 }}} 00223 gpuErrchk ( cudaMemcpy(deviceArray, copyPtr, size()*sizeof(T), cudaMemcpyHostToDevice) ); 00224 cudaFreeHost(copyPtr); 00225 } 00226 } 00227 00228 template<typename T> template<typename Arrays> 00229 void ArrayBase<T>::copyFromDevice(Arrays array){ 00230 if(array.size()==realSize()){ 00231 gpuErrchk ( cudaMemcpy(hostArray, array.deviceArray, array.size()*sizeof(T),cudaMemcpyDeviceToHost) ); 00232 }else if(array.depth==realDepth && array.width==realWidth){ 00233 T *hostPtr = (T*)(hostArray) + size_t(heightOffset*realWidth*realDepth); 00234 gpuErrchk ( cudaMemcpy(hostPtr, array.deviceArray, array.size()*sizeof(T), cudaMemcpyDeviceToHost) ); 00235 }else if(realDepth==1 && realHeight==1){ 00236 T *hostPtr = (T*)(hostArray) + size_t(widthOffset); 00237 gpuErrchk ( cudaMemcpy(hostPtr, array.deviceArray, array.size()*sizeof(T), cudaMemcpyDeviceToHost) ); 00238 }else{ 00239 //if "virtual" array is non-continuously allocated, 00240 //create a copy in pinned memory before transfering. 00241 T *copyPtr; 00242 gpuErrchk( cudaMallocHost((void**)©Ptr, size()*sizeof(T)) ); 00243 gpuErrchk ( cudaMemcpy(copyPtr, array.deviceArray, size()*sizeof(T), cudaMemcpyDeviceToHost) ); 00244 #pragma omp parallel for 00245 for(size_t h = 0; h<height; ++h){ 00246 for(size_t w = 0; w<width; ++w){ 00247 for(size_t d = 0; d<depth; ++d){ 00248 this->hostGet(h,w,d) = copyPtr[(h*width+w)*depth+d]; 00249 }}} 00250 cudaFreeHost(copyPtr); 00251 } 00252 } 00253 00254 template<typename T> 00255 void ArrayBase<T>::copyToHost(){ 00256 if(size()==realSize()){ 00257 gpuErrchk ( cudaMemcpy(hostArray, deviceArray, size()*sizeof(T),cudaMemcpyDeviceToHost) ); 00258 }else if(depth==realDepth && width==realWidth){ 00259 T *hostPtr = (T*)(hostArray) + size_t(heightOffset*realWidth*realDepth); 00260 gpuErrchk ( cudaMemcpy(hostPtr, deviceArray, size()*sizeof(T), cudaMemcpyDeviceToHost) ); 00261 }else if(realDepth==1 && realHeight==1){ 00262 T *hostPtr = (T*)(hostArray) + size_t(widthOffset); 00263 gpuErrchk ( cudaMemcpy(hostPtr, deviceArray, size()*sizeof(T), cudaMemcpyDeviceToHost) ); 00264 }else{ 00265 //if "virtual" array is non-continuously allocated, 00266 //create a copy in pinned memory before transfering. 00267 T *copyPtr; 00268 gpuErrchk( cudaMallocHost((void**)©Ptr, size()*sizeof(T)) ); 00269 gpuErrchk ( cudaMemcpy(copyPtr, deviceArray, size()*sizeof(T), cudaMemcpyDeviceToHost) ); 00270 #pragma omp parallel for 00271 for(size_t h = 0; h<height; ++h){ 00272 for(size_t w = 0; w<width; ++w){ 00273 for(size_t d = 0; d<depth; ++d){ 00274 this->hostGet(h,w,d) = copyPtr[(h*width+w)*depth+d]; 00275 }}} 00276 cudaFreeHost(copyPtr); 00277 } 00278 } 00279 00280 template<typename T> 00281 ArrayBase<T>::operator bool() const { 00282 #ifdef __CUDA_ARCH__ 00283 return(this->deviceArray!=NULL); 00284 #else 00285 return(this->hostArray!=NULL); 00286 #endif 00287 } 00288 00289 //******************************************************************************************* 00290 // Array 3D 00291 //******************************************************************************************* 00292 00293 template<typename T> 00294 Array3D<T>::Array3D() : ArrayBase<T>(0,0,0) {} 00295 00296 /* 00297 //TODO O kernel cuda não aceita structs com destrutores. Corrigir nas próximas versões 00298 ~Array3D(){ 00299 free(hostArray); 00300 cudaFree(deviceArray); 00301 }*/ 00302 00303 template<typename T> 00304 Array3D<T>::Array3D(size_t width, size_t height, size_t depth) : ArrayBase<T>(width,height,depth){} 00305 00306 template<typename T> 00307 T & Array3D<T>::operator()(size_t h,size_t w,size_t d) const { 00308 #ifdef __CUDA_ARCH__ 00309 return this->deviceGet(h,w,d); 00310 #else 00311 return this->hostGet(h,w,d); 00312 #endif 00313 } 00314 00315 //******************************************************************************************* 00316 // Array 2D 00317 //******************************************************************************************* 00318 00319 template<typename T> 00320 Array2D<T>::Array2D() : ArrayBase<T>(0,0,0) {} 00321 00322 template<typename T> 00323 Array2D<T>::Array2D(size_t width, size_t height) : ArrayBase<T>(width,height,1){} 00324 00325 template<typename T> 00326 T & Array2D<T>::operator()(size_t h, size_t w) const { 00327 #ifdef __CUDA_ARCH__ 00328 return this->deviceGet(h,w,0); 00329 #else 00330 return this->hostGet(h,w,0); 00331 #endif 00332 } 00333 00334 //******************************************************************************************* 00335 // Array 1D 00336 //******************************************************************************************* 00337 00338 template<typename T> 00339 Array<T>::Array() : ArrayBase<T>(0,0,0){} 00340 00341 template<typename T> 00342 Array<T>::Array(size_t size) : ArrayBase<T>(size,1,1){} 00343 00344 template<typename T> 00345 T & Array<T>::operator()(size_t w) const { 00346 #ifdef __CUDA_ARCH__ 00347 return this->deviceGet(0,w,0); 00348 #else 00349 return this->hostGet(0,w,0); 00350 #endif 00351 } 00352 00353 }//end namespace 00354 #endif