/*
-----------------------------------------------------------------------
Copyright 2012 iMinds-Vision Lab, University of Antwerp
Contact: astra@ua.ac.be
Website: http://astra.ua.ac.be
This file is part of the
All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
The ASTRA Toolbox is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
The ASTRA Toolbox is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with the ASTRA Toolbox. If not, see .
-----------------------------------------------------------------------
$Id$
*/
#include
#include
#include "util3d.h"
#include "../2d/util.h"
namespace astraCUDA3d {
cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims)
{
cudaExtent extentV;
extentV.width = dims.iVolX*sizeof(float);
extentV.height = dims.iVolY;
extentV.depth = dims.iVolZ;
cudaPitchedPtr volData;
cudaError err = cudaMalloc3D(&volData, extentV);
if (err != cudaSuccess) {
astraCUDA::reportCudaError(err);
fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iVolX, dims.iVolY, dims.iVolZ);
volData.ptr = 0;
// TODO: return 0 somehow?
}
return volData;
}
cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims)
{
cudaExtent extentP;
extentP.width = dims.iProjU*sizeof(float);
extentP.height = dims.iProjAngles;
extentP.depth = dims.iProjV;
cudaPitchedPtr projData;
cudaError err = cudaMalloc3D(&projData, extentP);
if (err != cudaSuccess) {
astraCUDA::reportCudaError(err);
fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iProjU, dims.iProjAngles, dims.iProjV);
projData.ptr = 0;
// TODO: return 0 somehow?
}
return projData;
}
bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
{
char* t = (char*)D_data.ptr;
cudaError err;
for (unsigned int z = 0; z < dims.iVolZ; ++z) {
err = cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY);
ASTRA_CUDA_ASSERT(err);
t += D_data.pitch * dims.iVolY;
}
return true;
}
bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
{
char* t = (char*)D_data.ptr;
cudaError err;
for (unsigned int z = 0; z < dims.iProjV; ++z) {
err = cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles);
ASTRA_CUDA_ASSERT(err);
t += D_data.pitch * dims.iProjAngles;
}
return true;
}
bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
{
if (!pitch)
pitch = dims.iVolX;
cudaPitchedPtr ptr;
ptr.ptr = (void*)data; // const cast away
ptr.pitch = pitch*sizeof(float);
ptr.xsize = dims.iVolX*sizeof(float);
ptr.ysize = dims.iVolY;
cudaExtent extentV;
extentV.width = dims.iVolX*sizeof(float);
extentV.height = dims.iVolY;
extentV.depth = dims.iVolZ;
cudaPos zp = { 0, 0, 0 };
cudaMemcpy3DParms p;
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = ptr;
p.dstArray = 0;
p.dstPos = zp;
p.dstPtr = D_data;
p.extent = extentV;
p.kind = cudaMemcpyHostToDevice;
cudaError err;
err = cudaMemcpy3D(&p);
ASTRA_CUDA_ASSERT(err);
return err == cudaSuccess;
}
bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
{
if (!pitch)
pitch = dims.iProjU;
cudaPitchedPtr ptr;
ptr.ptr = (void*)data; // const cast away
ptr.pitch = pitch*sizeof(float);
ptr.xsize = dims.iProjU*sizeof(float);
ptr.ysize = dims.iProjAngles;
cudaExtent extentV;
extentV.width = dims.iProjU*sizeof(float);
extentV.height = dims.iProjAngles;
extentV.depth = dims.iProjV;
cudaPos zp = { 0, 0, 0 };
cudaMemcpy3DParms p;
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = ptr;
p.dstArray = 0;
p.dstPos = zp;
p.dstPtr = D_data;
p.extent = extentV;
p.kind = cudaMemcpyHostToDevice;
cudaError err;
err = cudaMemcpy3D(&p);
ASTRA_CUDA_ASSERT(err);
return err == cudaSuccess;
}
bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
{
if (!pitch)
pitch = dims.iVolX;
cudaPitchedPtr ptr;
ptr.ptr = data;
ptr.pitch = pitch*sizeof(float);
ptr.xsize = dims.iVolX*sizeof(float);
ptr.ysize = dims.iVolY;
cudaExtent extentV;
extentV.width = dims.iVolX*sizeof(float);
extentV.height = dims.iVolY;
extentV.depth = dims.iVolZ;
cudaPos zp = { 0, 0, 0 };
cudaMemcpy3DParms p;
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = D_data;
p.dstArray = 0;
p.dstPos = zp;
p.dstPtr = ptr;
p.extent = extentV;
p.kind = cudaMemcpyDeviceToHost;
cudaError err;
err = cudaMemcpy3D(&p);
ASTRA_CUDA_ASSERT(err);
return err == cudaSuccess;
}
bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
{
if (!pitch)
pitch = dims.iProjU;
cudaPitchedPtr ptr;
ptr.ptr = data;
ptr.pitch = pitch*sizeof(float);
ptr.xsize = dims.iProjU*sizeof(float);
ptr.ysize = dims.iProjAngles;
cudaExtent extentV;
extentV.width = dims.iProjU*sizeof(float);
extentV.height = dims.iProjAngles;
extentV.depth = dims.iProjV;
cudaPos zp = { 0, 0, 0 };
cudaMemcpy3DParms p;
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = D_data;
p.dstArray = 0;
p.dstPos = zp;
p.dstPtr = ptr;
p.extent = extentV;
p.kind = cudaMemcpyDeviceToHost;
cudaError err;
err = cudaMemcpy3D(&p);
ASTRA_CUDA_ASSERT(err);
return err == cudaSuccess;
}
bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
{
cudaExtent extentV;
extentV.width = dims.iVolX*sizeof(float);
extentV.height = dims.iVolY;
extentV.depth = dims.iVolZ;
cudaPos zp = { 0, 0, 0 };
cudaMemcpy3DParms p;
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = D_src;
p.dstArray = 0;
p.dstPos = zp;
p.dstPtr = D_dst;
p.extent = extentV;
p.kind = cudaMemcpyDeviceToDevice;
cudaError err;
err = cudaMemcpy3D(&p);
ASTRA_CUDA_ASSERT(err);
return err == cudaSuccess;
}
bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
{
cudaExtent extentV;
extentV.width = dims.iProjU*sizeof(float);
extentV.height = dims.iProjAngles;
extentV.depth = dims.iProjV;
cudaPos zp = { 0, 0, 0 };
cudaMemcpy3DParms p;
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = D_src;
p.dstArray = 0;
p.dstPos = zp;
p.dstPtr = D_dst;
p.extent = extentV;
p.kind = cudaMemcpyDeviceToDevice;
cudaError err;
err = cudaMemcpy3D(&p);
ASTRA_CUDA_ASSERT(err);
return err == cudaSuccess;
}
// TODO: Consider using a single array of size max(proj,volume) (per dim)
// instead of allocating a new one each time
// TODO: Figure out a faster way of zeroing the padding?
cudaArray* allocateVolumeArray(const SDimensions3D& dims)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc();
cudaArray* cuArray;
cudaExtent extentA;
extentA.width = dims.iVolX+2;
extentA.height = dims.iVolY+2;
extentA.depth = dims.iVolZ+2;
cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
if (err != cudaSuccess) {
astraCUDA::reportCudaError(err);
fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iVolX, dims.iVolY, dims.iVolZ);
return 0;
}
zeroVolumeArray(cuArray, dims);
return cuArray;
}
cudaArray* allocateProjectionArray(const SDimensions3D& dims)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc();
cudaArray* cuArray;
cudaExtent extentA;
extentA.width = dims.iProjU+2;
extentA.height = dims.iProjAngles;
extentA.depth = dims.iProjV+2;
cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
if (err != cudaSuccess) {
astraCUDA::reportCudaError(err);
fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iProjU, dims.iProjAngles, dims.iProjV);
return 0;
}
zeroProjectionArray(cuArray, dims);
return cuArray;
}
bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims)
{
cudaPitchedPtr zeroBuf;
cudaExtent extentS;
extentS.width = sizeof(float)*(dims.iVolX+2);
extentS.height = dims.iVolY+2;
extentS.depth = 1;
cudaExtent extentA;
extentA.width = dims.iVolX+2;
extentA.height = dims.iVolY+2;
extentA.depth = 1;
cudaError err;
err = cudaMalloc3D(&zeroBuf, extentS);
ASTRA_CUDA_ASSERT(err);
err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iVolX+2), dims.iVolY+2);
ASTRA_CUDA_ASSERT(err);
// zero array
for (unsigned int i = 0; i < dims.iVolZ+2; ++i) {
cudaMemcpy3DParms p;
cudaPos zp = {0, 0, 0};
cudaPos dp = {0, 0, i};
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = zeroBuf;
p.dstArray = array;
p.dstPtr.ptr = 0;
p.dstPtr.pitch = 0;
p.dstPtr.xsize = 0;
p.dstPtr.ysize = 0;
p.dstPos = dp;
p.extent = extentA;
p.kind = cudaMemcpyDeviceToDevice;
err = cudaMemcpy3D(&p);
ASTRA_CUDA_ASSERT(err);
}
cudaFree(zeroBuf.ptr);
// TODO: check errors
return true;
}
bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims)
{
cudaPitchedPtr zeroBuf;
cudaExtent extentS;
extentS.width = sizeof(float)*(dims.iProjU+2);
extentS.height = dims.iProjAngles;
extentS.depth = 1;
cudaExtent extentA;
extentA.width = dims.iProjU+2;
extentA.height = dims.iProjAngles;
extentA.depth = 1;
cudaError err;
err = cudaMalloc3D(&zeroBuf, extentS);
ASTRA_CUDA_ASSERT(err);
err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iProjU+2), dims.iProjAngles);
ASTRA_CUDA_ASSERT(err);
for (unsigned int i = 0; i < dims.iProjV+2; ++i) {
cudaMemcpy3DParms p;
cudaPos zp = {0, 0, 0};
cudaPos dp = {0, 0, i};
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = zeroBuf;
p.dstArray = array;
p.dstPtr.ptr = 0;
p.dstPtr.pitch = 0;
p.dstPtr.xsize = 0;
p.dstPtr.ysize = 0;
p.dstPos = dp;
p.extent = extentA;
p.kind = cudaMemcpyDeviceToDevice;
err = cudaMemcpy3D(&p);
ASTRA_CUDA_ASSERT(err);
}
cudaFree(zeroBuf.ptr);
// TODO: check errors
return true;
}
bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims)
{
cudaExtent extentA;
extentA.width = dims.iVolX;
extentA.height = dims.iVolY;
extentA.depth = dims.iVolZ;
cudaMemcpy3DParms p;
cudaPos zp = {0, 0, 0};
cudaPos dp = {1, 1, 1};
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = D_volumeData;
p.dstArray = array;
p.dstPtr.ptr = 0;
p.dstPtr.pitch = 0;
p.dstPtr.xsize = 0;
p.dstPtr.ysize = 0;
p.dstPos = dp;
p.extent = extentA;
p.kind = cudaMemcpyDeviceToDevice;
cudaError err = cudaMemcpy3D(&p);
ASTRA_CUDA_ASSERT(err);
// TODO: check errors
return true;
}
bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims)
{
cudaExtent extentA;
extentA.width = dims.iProjU;
extentA.height = dims.iProjAngles;
extentA.depth = dims.iProjV;
cudaMemcpy3DParms p;
cudaPos zp = {0, 0, 0};
cudaPos dp = {1, 0, 1};
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = D_projData;
p.dstArray = array;
p.dstPtr.ptr = 0;
p.dstPtr.pitch = 0;
p.dstPtr.xsize = 0;
p.dstPtr.ysize = 0;
p.dstPos = dp;
p.extent = extentA;
p.kind = cudaMemcpyDeviceToDevice;
cudaError err = cudaMemcpy3D(&p);
ASTRA_CUDA_ASSERT(err);
// TODO: check errors
return true;
}
float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y,
unsigned int z)
{
return astraCUDA::dotProduct2D((float*)data.ptr, data.pitch/sizeof(float), x, y*z, 0, 0);
}
bool cudaTextForceKernelsCompletion()
{
cudaError_t returnedCudaError = cudaThreadSynchronize();
if(returnedCudaError != cudaSuccess) {
fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", returnedCudaError, cudaGetErrorString(returnedCudaError));
return false;
}
return true;
}
int calcNextPowerOfTwo(int _iValue)
{
int iOutput = 1;
while(iOutput < _iValue)
iOutput *= 2;
return iOutput;
}
}