summaryrefslogtreecommitdiffstats
path: root/cuda/2d
diff options
context:
space:
mode:
authorWillem Jan Palenstijn <WillemJan.Palenstijn@uantwerpen.be>2014-04-16 11:12:55 +0000
committerwpalenst <WillemJan.Palenstijn@uantwerpen.be>2014-04-16 11:12:55 +0000
commit3a6769465bee7d56d0ddff36613b886446421e07 (patch)
tree624e85c5d6a4ab19c958a388e3436219693a6296 /cuda/2d
parent4dfb881ceb82b07630437e952dec62323977ab56 (diff)
downloadastra-3a6769465bee7d56d0ddff36613b886446421e07.tar.gz
astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.bz2
astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.xz
astra-3a6769465bee7d56d0ddff36613b886446421e07.zip
Remove padding in 2D cuda in favour of Border mode
Diffstat (limited to 'cuda/2d')
-rw-r--r--cuda/2d/algo.cu10
-rw-r--r--cuda/2d/arith.cu182
-rw-r--r--cuda/2d/arith.h37
-rw-r--r--cuda/2d/astra.cu44
-rw-r--r--cuda/2d/cgls.cu52
-rw-r--r--cuda/2d/darthelper.cu62
-rw-r--r--cuda/2d/dataop.cu16
-rw-r--r--cuda/2d/em.cu44
-rw-r--r--cuda/2d/fan_bp.cu28
-rw-r--r--cuda/2d/fan_fp.cu26
-rw-r--r--cuda/2d/fft.cu8
-rw-r--r--cuda/2d/fft.h4
-rw-r--r--cuda/2d/par_bp.cu28
-rw-r--r--cuda/2d/par_fp.cu38
-rw-r--r--cuda/2d/sart.cu60
-rw-r--r--cuda/2d/sirt.cu72
-rw-r--r--cuda/2d/util.cu46
-rw-r--r--cuda/2d/util.h4
18 files changed, 365 insertions, 396 deletions
diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu
index f04607f..71cbfb3 100644
--- a/cuda/2d/algo.cu
+++ b/cuda/2d/algo.cu
@@ -214,11 +214,11 @@ bool ReconAlgo::setMaxConstraint(float fMax)
bool ReconAlgo::allocateBuffers()
{
bool ok;
- ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
if (!ok)
return false;
- ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+ ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
if (!ok) {
cudaFree(D_volumeData);
D_volumeData = 0;
@@ -226,7 +226,7 @@ bool ReconAlgo::allocateBuffers()
}
if (useVolumeMask) {
- ok = allocateVolume(D_maskData, dims.iVolWidth+2, dims.iVolHeight+2, maskPitch);
+ ok = allocateVolume(D_maskData, dims.iVolWidth, dims.iVolHeight, maskPitch);
if (!ok) {
cudaFree(D_volumeData);
cudaFree(D_sinoData);
@@ -237,7 +237,7 @@ bool ReconAlgo::allocateBuffers()
}
if (useSinogramMask) {
- ok = allocateVolume(D_smaskData, dims.iProjDets+2, dims.iProjAngles, smaskPitch);
+ ok = allocateVolume(D_smaskData, dims.iProjDets, dims.iProjAngles, smaskPitch);
if (!ok) {
cudaFree(D_volumeData);
cudaFree(D_sinoData);
@@ -271,7 +271,7 @@ bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPit
return false;
// rescale sinogram to adjust for pixel size
- processVol<opMul,SINO>(D_sinoData, fSinogramScale,
+ processVol<opMul>(D_sinoData, fSinogramScale,
//1.0f/(fPixelSize*fPixelSize),
sinoPitch,
dims.iProjDets, dims.iProjAngles);
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu
index 1ee02ca..42c2c98 100644
--- a/cuda/2d/arith.cu
+++ b/cuda/2d/arith.cu
@@ -144,14 +144,14 @@ struct opMulMask {
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -161,14 +161,14 @@ __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, uns
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -178,14 +178,14 @@ __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -197,14 +197,14 @@ __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fPa
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -214,14 +214,14 @@ __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, uns
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -231,14 +231,14 @@ __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -248,14 +248,14 @@ __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, u
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -280,51 +280,51 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2,
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, unsigned int width, unsigned int height)
{
float* D_out;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- processVol<op, t>(D_out, pitch, width, height);
+ processVol<op>(D_out, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
cudaFree(D_out);
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, float param, unsigned int width, unsigned int height)
{
float* D_out;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- processVol<op, t>(D_out, param, pitch, width, height);
+ processVol<op>(D_out, param, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
cudaFree(D_out);
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height)
{
float* D_out1;
float* D_out2;
unsigned int pitch;
- allocateVolume(D_out1, width+2, height+2, pitch);
+ allocateVolume(D_out1, width, height, pitch);
copyVolumeToDevice(out1, width, width, height, D_out1, pitch);
- allocateVolume(D_out2, width+2, height+2, pitch);
+ allocateVolume(D_out2, width, height, pitch);
copyVolumeToDevice(out2, width, width, height, D_out2, pitch);
- processVol<op, t>(D_out1, D_out2, param1, param2, pitch, width, height);
+ processVol<op>(D_out1, D_out2, param1, param2, pitch, width, height);
copyVolumeFromDevice(out1, width, width, height, D_out1, pitch);
copyVolumeFromDevice(out2, width, width, height, D_out2, pitch);
@@ -334,19 +334,19 @@ void processVolCopy(float* out1, float* out2, float param1, float param2, unsign
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height)
{
float* D_out;
float* D_in;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- allocateVolume(D_in, width+2, height+2, pitch);
+ allocateVolume(D_in, width, height, pitch);
copyVolumeToDevice(in, width, width, height, D_in, pitch);
- processVol<op, t>(D_out, D_in, pitch, width, height);
+ processVol<op>(D_out, D_in, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
@@ -354,19 +354,19 @@ void processVolCopy(float* out, const float* in, unsigned int width, unsigned in
cudaFree(D_in);
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height)
{
float* D_out;
float* D_in;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- allocateVolume(D_in, width+2, height+2, pitch);
+ allocateVolume(D_in, width, height, pitch);
copyVolumeToDevice(in, width, width, height, D_in, pitch);
- processVol<op, t>(D_out, D_in, param, pitch, width, height);
+ processVol<op>(D_out, D_in, param, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
@@ -374,7 +374,7 @@ void processVolCopy(float* out, const float* in, float param, unsigned int width
cudaFree(D_in);
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height)
{
float* D_out;
@@ -382,14 +382,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int
float* D_in2;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- allocateVolume(D_in1, width+2, height+2, pitch);
+ allocateVolume(D_in1, width, height, pitch);
copyVolumeToDevice(in1, width, width, height, D_in1, pitch);
- allocateVolume(D_in2, width+2, height+2, pitch);
+ allocateVolume(D_in2, width, height, pitch);
copyVolumeToDevice(in2, width, width, height, D_in2, pitch);
- processVol<op, t>(D_out, D_in1, D_in2, pitch, width, height);
+ processVol<op>(D_out, D_in1, D_in2, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
@@ -398,7 +398,7 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int
cudaFree(D_in2);
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height)
{
float* D_out;
@@ -406,14 +406,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param,
float* D_in2;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- allocateVolume(D_in1, width+2, height+2, pitch);
+ allocateVolume(D_in1, width, height, pitch);
copyVolumeToDevice(in1, width, width, height, D_in1, pitch);
- allocateVolume(D_in2, width+2, height+2, pitch);
+ allocateVolume(D_in2, width, height, pitch);
copyVolumeToDevice(in2, width, width, height, D_in2, pitch);
- processVol<op, t>(D_out, D_in1, D_in2, param, pitch, width, height);
+ processVol<op>(D_out, D_in1, D_in2, param, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
@@ -430,80 +430,80 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param,
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+511)/512);
- devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
+ devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
+ devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devFFtoDD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
+ devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
+ devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
+ devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
+ devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
+ devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
cudaTextForceKernelsCompletion();
}
@@ -533,7 +533,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
}
@@ -549,7 +549,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
}
@@ -566,7 +566,7 @@ void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, flo
unsigned int step = out1.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut1 += step;
pfOut2 += step;
}
@@ -585,7 +585,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn += step;
}
@@ -603,7 +603,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn += step;
}
@@ -622,7 +622,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -642,7 +642,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -672,7 +672,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
}
@@ -688,7 +688,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
}
@@ -705,7 +705,7 @@ void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, fl
unsigned int step = out1.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut1 += step;
pfOut2 += step;
}
@@ -724,7 +724,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn += step;
}
@@ -742,7 +742,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn += step;
}
@@ -761,7 +761,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -781,7 +781,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -808,59 +808,45 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
#define INST_DFtoD(name) \
- template void processVolCopy<name, VOL>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
#define INST_DtoD(name) \
- template void processVolCopy<name, VOL>(float* out, const float* in, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, const float* in, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, const float* in, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
#define INST_DDtoD(name) \
- template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
#define INST_DDFtoD(name) \
- template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
#define INST_toD(name) \
- template void processVolCopy<name, VOL>(float* out, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims);
#define INST_FtoD(name) \
- template void processVolCopy<name, VOL>(float* out, float param, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, float param, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, float param, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims);
#define INST_FFtoDD(name) \
- template void processVolCopy<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims);
diff --git a/cuda/2d/arith.h b/cuda/2d/arith.h
index c8c7b41..d745aef 100644
--- a/cuda/2d/arith.h
+++ b/cuda/2d/arith.h
@@ -55,28 +55,21 @@ struct opSetMaskedValues;
struct opMulMask;
-
-enum VolType {
- SINO = 0,
- VOL = 1
-};
-
-
-template<typename op, VolType t> void processVolCopy(float* out, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out, float param, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height);
-
-template<typename op, VolType t> void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, float param, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height);
+
+template<typename op> void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height);
template<typename op> void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims);
template<typename op> void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu
index 2240629..1c2e623 100644
--- a/cuda/2d/astra.cu
+++ b/cuda/2d/astra.cu
@@ -206,13 +206,13 @@ bool AstraFBP::init(int iGPUIndex)
}
}
- bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2, pData->volumePitch);
+ bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth, pData->dims.iVolHeight, pData->volumePitch);
if (!ok)
{
return false;
}
- ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets+2, pData->dims.iProjAngles, pData->sinoPitch);
+ ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets, pData->dims.iProjAngles, pData->sinoPitch);
if (!ok)
{
cudaFree(pData->D_volumeData);
@@ -241,7 +241,7 @@ bool AstraFBP::setSinogram(const float* pfSinogram,
return false;
// rescale sinogram to adjust for pixel size
- processVol<opMul,SINO>(pData->D_sinoData,
+ processVol<opMul>(pData->D_sinoData,
1.0f/(pData->fPixelSize*pData->fPixelSize),
pData->sinoPitch,
pData->dims.iProjDets, pData->dims.iProjAngles);
@@ -270,7 +270,7 @@ bool AstraFBP::run()
return false;
}
- zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2);
+ zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth, pData->dims.iVolHeight);
bool ok = false;
@@ -283,11 +283,11 @@ bool AstraFBP::run()
allocateComplexOnDevice(pData->dims.iProjAngles, iFFTFourDetCount, &pDevComplexSinogram);
- runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram);
+ runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram);
applyFilter(pData->dims.iProjAngles, iFFTFourDetCount, pDevComplexSinogram, pData->m_pDevFilter);
- runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount);
+ runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount);
freeComplexOnDevice(pDevComplexSinogram);
@@ -299,7 +299,7 @@ bool AstraFBP::run()
return false;
}
- processVol<opMul,VOL>(pData->D_volumeData,
+ processVol<opMul>(pData->D_volumeData,
(M_PI / 2.0f) / (float)pData->dims.iProjAngles,
pData->volumePitch,
pData->dims.iVolWidth, pData->dims.iVolHeight);
@@ -443,7 +443,7 @@ bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* =
cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);
delete[] pfHostRealFilter;
- runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
+ runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
cudaFree(pfDevRealFilter);
@@ -478,7 +478,7 @@ bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* =
cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);
delete[] pfHostRealFilter;
- runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
+ runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
cudaFree(pfDevRealFilter);
@@ -515,7 +515,7 @@ bool BPalgo::init()
bool BPalgo::iterate(unsigned int)
{
// TODO: This zeroVolume makes an earlier memcpy of D_volumeData redundant
- zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);
callBP(D_volumeData, volumePitch, D_sinoData, sinoPitch);
return true;
}
@@ -525,12 +525,12 @@ float BPalgo::computeDiffNorm()
float *D_projData;
unsigned int projPitch;
- allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
- cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+ cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*dims.iProjDets, dims.iProjAngles, cudaMemcpyDeviceToDevice);
callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
- float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
cudaFree(D_projData);
@@ -579,14 +579,14 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram,
float* D_volumeData;
unsigned int volumePitch;
- ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
if (!ok)
return false;
float* D_sinoData;
unsigned int sinoPitch;
- ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+ ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
if (!ok) {
cudaFree(D_volumeData);
return false;
@@ -601,7 +601,7 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram,
return false;
}
- zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pfAngles, pfOffsets, 1.0f);
if (!ok) {
cudaFree(D_volumeData);
@@ -666,14 +666,14 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
float* D_volumeData;
unsigned int volumePitch;
- ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
if (!ok)
return false;
float* D_sinoData;
unsigned int sinoPitch;
- ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+ ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
if (!ok) {
cudaFree(D_volumeData);
return false;
@@ -688,7 +688,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
return false;
}
- zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
// TODO: Turn this geometry conversion into a util function
SFanProjection* projs = new SFanProjection[dims.iProjAngles];
@@ -777,14 +777,14 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
float* D_volumeData;
unsigned int volumePitch;
- ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
if (!ok)
return false;
float* D_sinoData;
unsigned int sinoPitch;
- ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+ ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
if (!ok) {
cudaFree(D_volumeData);
return false;
@@ -799,7 +799,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
return false;
}
- zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pAngles, 1.0f);
diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu
index 5b1cf46..df8db0b 100644
--- a/cuda/2d/cgls.cu
+++ b/cuda/2d/cgls.cu
@@ -73,16 +73,16 @@ void CGLS::reset()
bool CGLS::init()
{
// Lifetime of z: within an iteration
- allocateVolume(D_z, dims.iVolWidth+2, dims.iVolHeight+2, zPitch);
+ allocateVolume(D_z, dims.iVolWidth, dims.iVolHeight, zPitch);
// Lifetime of p: full algorithm
- allocateVolume(D_p, dims.iVolWidth+2, dims.iVolHeight+2, pPitch);
+ allocateVolume(D_p, dims.iVolWidth, dims.iVolHeight, pPitch);
// Lifetime of r: full algorithm
- allocateVolume(D_r, dims.iProjDets+2, dims.iProjAngles, rPitch);
+ allocateVolume(D_r, dims.iProjDets, dims.iProjAngles, rPitch);
// Lifetime of w: within an iteration
- allocateVolume(D_w, dims.iProjDets+2, dims.iProjAngles, wPitch);
+ allocateVolume(D_w, dims.iProjDets, dims.iProjAngles, wPitch);
// TODO: check if allocations succeeded
return true;
@@ -120,13 +120,13 @@ bool CGLS::iterate(unsigned int iterations)
if (!sliceInitialized) {
// copy sinogram
- cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+ cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
// r = sino - A*x
if (useVolumeMask) {
// Use z as temporary storage here since it is unused
- cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
- processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+ cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+ processVol<opMul>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
callFP(D_z, zPitch, D_r, rPitch, -1.0f);
} else {
callFP(D_volumeData, volumePitch, D_r, rPitch, -1.0f);
@@ -134,13 +134,13 @@ bool CGLS::iterate(unsigned int iterations)
// p = A'*r
- zeroVolume(D_p, pPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ zeroVolume(D_p, pPitch, dims.iVolWidth, dims.iVolHeight);
callBP(D_p, pPitch, D_r, rPitch);
if (useVolumeMask)
- processVol<opMul, VOL>(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opMul>(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight);
- gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight, 1, 1);
+ gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight);
sliceInitialized = true;
}
@@ -150,32 +150,32 @@ bool CGLS::iterate(unsigned int iterations)
for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
// w = A*p
- zeroVolume(D_w, wPitch, dims.iProjDets+2, dims.iProjAngles);
+ zeroVolume(D_w, wPitch, dims.iProjDets, dims.iProjAngles);
callFP(D_p, pPitch, D_w, wPitch, 1.0f);
// alpha = gamma / <w,w>
- float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles);
float alpha = gamma / ww;
// x += alpha*p
- processVol<opAddScaled, VOL>(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opAddScaled>(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight);
// r -= alpha*w
- processVol<opAddScaled, SINO>(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles);
+ processVol<opAddScaled>(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles);
// z = A'*r
- zeroVolume(D_z, zPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ zeroVolume(D_z, zPitch, dims.iVolWidth, dims.iVolHeight);
callBP(D_z, zPitch, D_r, rPitch);
if (useVolumeMask)
- processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opMul>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
float beta = 1.0f / gamma;
- gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight, 1, 1);
+ gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight);
beta *= gamma;
// p = z + beta*p
- processVol<opScaleAndAdd, VOL>(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opScaleAndAdd>(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight);
}
@@ -189,12 +189,12 @@ float CGLS::computeDiffNorm()
// used outside of iterations.
// copy sinogram to w
- cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+ cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
// do FP, subtracting projection from sinogram
if (useVolumeMask) {
- cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
- processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+ cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+ processVol<opMul>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
callFP(D_z, zPitch, D_w, wPitch, -1.0f);
} else {
callFP(D_volumeData, volumePitch, D_w, wPitch, -1.0f);
@@ -202,7 +202,7 @@ float CGLS::computeDiffNorm()
// compute norm of D_w
- float s = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ float s = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles);
return sqrt(s);
}
@@ -264,12 +264,12 @@ int main()
dims.iRaysPerDet = 1;
unsigned int volumePitch, sinoPitch;
- allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
- zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
+ zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);
printf("pitch: %u\n", volumePitch);
- allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
- zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+ allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
printf("pitch: %u\n", sinoPitch);
unsigned int y, x;
diff --git a/cuda/2d/darthelper.cu b/cuda/2d/darthelper.cu
index 768d14e..b61eaae 100644
--- a/cuda/2d/darthelper.cu
+++ b/cuda/2d/darthelper.cu
@@ -33,7 +33,7 @@ $Id$
namespace astraCUDA {
// CUDA function for the selection of ROI
-__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height)
{
float x = (float)(threadIdx.x + 16*blockIdx.x);
float y = (float)(threadIdx.y + 16*blockIdx.y);
@@ -44,7 +44,7 @@ __global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsign
if ((x-w)*(x-w) + (y-h)*(y-h) > radius * radius * 0.25f)
{
float* d = (float*)in;
- unsigned int o = (y+padY)*pitch+x+padX;
+ unsigned int o = y*pitch+x;
d[o] = 0.0f;
}
}
@@ -54,12 +54,12 @@ void roiSelect(float* out, float radius, unsigned int width, unsigned int height
float* D_data;
unsigned int pitch;
- allocateVolume(D_data, width+2, height+2, pitch);
+ allocateVolume(D_data, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_data, pitch);
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devRoiSelect<<<gridSize, blockSize>>>(D_data, radius, pitch, width, height, 1, 1);
+ devRoiSelect<<<gridSize, blockSize>>>(D_data, radius, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_data, pitch);
@@ -70,7 +70,7 @@ void roiSelect(float* out, float radius, unsigned int width, unsigned int height
// CUDA function for the masking of DART with a radius == 1
-__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -80,7 +80,7 @@ __global__ void devDartMask(float* mask, const float* in, unsigned int conn, uns
float* d = (float*)in;
float* m = (float*)mask;
- unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+ unsigned int o2 = y*pitch+x; // On this row.
unsigned int o1 = o2 - pitch; // On previous row.
unsigned int o3 = o2 + pitch; // On next row.
@@ -101,7 +101,7 @@ __global__ void devDartMask(float* mask, const float* in, unsigned int conn, uns
// CUDA function for the masking of DART with a radius > 1
-__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -115,13 +115,13 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con
int r = radius;
// o2: index of the current center pixel
- int o2 = (y+padY)*pitch+x+padX;
+ int o2 = y*pitch+x;
if (conn == 8) // 8-connected
{
for (int row = -r; row <= r; row++)
{
- int o1 = (y+padY+row)*pitch+x+padX;
+ int o1 = (y+row)*pitch+x;
for (int col = -r; col <= r; col++)
{
if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;}
@@ -131,7 +131,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con
else if (conn == 4) // 4-connected
{
// horizontal
- unsigned int o1 = (y+padY)*pitch+x+padX;
+ unsigned int o1 = y*pitch+x;
for (int col = -r; col <= r; col++)
{
if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;}
@@ -140,7 +140,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con
// vertical
for (int row = -r; row <= r; row++)
{
- unsigned int o1 = (y+padY+row)*pitch+x+padX;
+ unsigned int o1 = (y+row)*pitch+x;
if (d[o1] != d[o2]) {m[o2] = 1.0f; return;}
}
}
@@ -149,7 +149,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con
// CUDA function for the masking of ADART with a radius == 1
-__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -159,7 +159,7 @@ __global__ void devADartMask(float* mask, const float* in, unsigned int conn, un
float* d = (float*)in;
float* m = (float*)mask;
- unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+ unsigned int o2 = y*pitch+x; // On this row.
unsigned int o1 = o2 - pitch; // On previous row.
unsigned int o3 = o2 + pitch; // On next row.
@@ -186,7 +186,7 @@ __global__ void devADartMask(float* mask, const float* in, unsigned int conn, un
// CUDA function for the masking of ADART with a radius > 1
-__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -199,13 +199,13 @@ __global__ void devADartMaskRadius(float* mask, const float* in, unsigned int co
int r = radius;
- unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+ unsigned int o2 = y*pitch+x; // On this row.
if (conn == 8)
{
for (int row = -r; row <= r; row++)
{
- unsigned int o1 = (y+padY+row)*pitch+x+padX;
+ unsigned int o1 = (y+row)*pitch+x;
for (int col = -r; col <= r; col++)
{
if (d[o1+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
@@ -223,7 +223,7 @@ __global__ void devADartMaskRadius(float* mask, const float* in, unsigned int co
// vertical
for (int row = -r; row <= r; row++)
{
- unsigned int o1 = (y+padY+row)*pitch+x+padX;
+ unsigned int o1 = (y+row)*pitch+x;
if (d[o1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
}
}
@@ -237,23 +237,23 @@ void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigne
float* D_maskData;
unsigned int pitch;
- allocateVolume(D_segmentationData, width+2, height+2, pitch);
+ allocateVolume(D_segmentationData, width, height, pitch);
copyVolumeToDevice(segmentation, width, width, height, D_segmentationData, pitch);
- allocateVolume(D_maskData, width+2, height+2, pitch);
- zeroVolume(D_maskData, pitch, width+2, height+2);
+ allocateVolume(D_maskData, width, height, pitch);
+ zeroVolume(D_maskData, pitch, width, height);
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
if (threshold == 1 && radius == 1)
- devDartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, pitch, width, height, 1, 1);
+ devDartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, pitch, width, height);
else if (threshold > 1 && radius == 1)
- devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1);
+ devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height);
else if (threshold == 1 && radius > 1)
- devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1);
+ devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height);
else
- devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1);
+ devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height);
copyVolumeFromDevice(mask, width, width, height, D_maskData, pitch);
@@ -263,7 +263,7 @@ void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigne
}
-__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -274,13 +274,13 @@ __global__ void devDartSmoothingRadius(float* out, const float* in, float b, uns
float* d = (float*)in;
float* m = (float*)out;
- unsigned int o2 = (y+padY)*pitch+x+padX;
+ unsigned int o2 = y*pitch+x;
int r = radius;
float res = -d[o2];
for (int row = -r; row < r; row++)
{
- unsigned int o1 = (y+padY+row)*pitch+x+padX;
+ unsigned int o1 = (y+row)*pitch+x;
for (int col = -r; col <= r; col++)
{
res += d[o1+col];
@@ -295,7 +295,7 @@ __global__ void devDartSmoothingRadius(float* out, const float* in, float b, uns
}
-__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -305,7 +305,7 @@ __global__ void devDartSmoothing(float* out, const float* in, float b, unsigned
float* d = (float*)in;
float* m = (float*)out;
- unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+ unsigned int o2 = y*pitch+x; // On this row.
unsigned int o1 = o2 - pitch; // On previous row.
unsigned int o3 = o2 + pitch; // On next row.
@@ -329,9 +329,9 @@ void dartSmoothing(float* out, const float* in, float b, unsigned int radius, un
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
if (radius == 1)
- devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, pitch, width, height, 1, 1);
+ devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, pitch, width, height);
else
- devDartSmoothingRadius<<<gridSize, blockSize>>>(D_outData, D_inData, b, radius, pitch, width, height, 1, 1);
+ devDartSmoothingRadius<<<gridSize, blockSize>>>(D_outData, D_inData, b, radius, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_outData, pitch);
diff --git a/cuda/2d/dataop.cu b/cuda/2d/dataop.cu
index 68573b2..9dc193e 100644
--- a/cuda/2d/dataop.cu
+++ b/cuda/2d/dataop.cu
@@ -39,10 +39,10 @@ void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigne
float* D_data2;
unsigned int pitch;
- allocateVolume(D_data1, width+2, height+2, pitch);
+ allocateVolume(D_data1, width, height, pitch);
copyVolumeToDevice(data1, width, width, height, D_data1, pitch);
- allocateVolume(D_data2, width+2, height+2, pitch);
+ allocateVolume(D_data2, width, height, pitch);
copyVolumeToDevice(data2, width, width, height, D_data2, pitch);
processVol<opMul, VOL>(D_data1, D_data2, pitch, width, height);
@@ -59,10 +59,10 @@ void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsig
float* D_mask;
unsigned int pitch;
- allocateVolume(D_data, width+2, height+2, pitch);
+ allocateVolume(D_data, width, height, pitch);
copyVolumeToDevice(data, width, width, height, D_data, pitch);
- allocateVolume(D_mask, width+2, height+2, pitch);
+ allocateVolume(D_mask, width, height, pitch);
copyVolumeToDevice(mask, width, width, height, D_mask, pitch);
processVol<opMulMask, VOL>(D_data, D_mask, scalar, pitch, width, height);
@@ -79,7 +79,7 @@ void operationVolumeMultScalar(float* data, float scalar, unsigned int width, un
float* D_data;
unsigned int pitch;
- allocateVolume(D_data, width+2, height+2, pitch);
+ allocateVolume(D_data, width, height, pitch);
copyVolumeToDevice(data, width, width, height, D_data, pitch);
processVol<opMul, VOL>(D_data, scalar, pitch, width, height);
@@ -96,10 +96,10 @@ void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned
float* D_data2;
unsigned int pitch;
- allocateVolume(D_data1, width+2, height+2, pitch);
+ allocateVolume(D_data1, width, height, pitch);
copyVolumeToDevice(data1, width, width, height, D_data1, pitch);
- allocateVolume(D_data2, width+2, height+2, pitch);
+ allocateVolume(D_data2, width, height, pitch);
copyVolumeToDevice(data2, width, width, height, D_data2, pitch);
processVol<opAdd, VOL>(D_data1, D_data2, pitch, width, height);
@@ -116,7 +116,7 @@ void operationVolumeAddScalar(float* data, float scalar, unsigned int width, uns
float* D_data;
unsigned int pitch;
- allocateVolume(D_data, width+2, height+2, pitch);
+ allocateVolume(D_data, width, height, pitch);
copyVolumeToDevice(data, width, width, height, D_data, pitch);
processVol<opAdd, VOL>(D_data, scalar, pitch, width, height);
diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu
index 74d1bbf..cf3f10c 100644
--- a/cuda/2d/em.cu
+++ b/cuda/2d/em.cu
@@ -73,14 +73,14 @@ void EM::reset()
bool EM::init()
{
- allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch);
- zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ allocateVolume(D_pixelWeight, dims.iVolWidth, dims.iVolHeight, pixelPitch);
+ zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
- allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
- zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch);
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
- allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
- zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+ allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
+ zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
// We can't precompute pixelWeights when using a volume mask
#if 0
@@ -94,22 +94,22 @@ bool EM::init()
bool EM::precomputeWeights()
{
- zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
#if 0
if (useSinogramMask) {
callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch);
} else
#endif
{
- processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);
+ processVol<opSet>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);
callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);
}
- processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opInvert>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
#if 0
if (useVolumeMask) {
// scale pixel weights with mask to zero out masked pixels
- processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opMul>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
}
#endif
@@ -129,18 +129,18 @@ bool EM::iterate(unsigned int iterations)
for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
// Do FP of volumeData
- zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+ zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
callFP(D_volumeData, volumePitch, D_projData, projPitch, 1.0f);
// Divide sinogram by FP (into projData)
- processVol<opDividedBy, SINO>(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles);
+ processVol<opDividedBy>(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles);
// Do BP of projData into tmpData
- zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
callBP(D_tmpData, tmpPitch, D_projData, projPitch);
// Multiply volumeData with tmpData divided by pixel weights
- processVol<opMul2, VOL>(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opMul2>(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
}
@@ -150,12 +150,12 @@ bool EM::iterate(unsigned int iterations)
float EM::computeDiffNorm()
{
// copy sinogram to projection data
- cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+ cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
// do FP, subtracting projection from sinogram
if (useVolumeMask) {
- cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
- processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+ processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
} else {
callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
@@ -164,7 +164,7 @@ float EM::computeDiffNorm()
// compute norm of D_projData
- float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
return sqrt(s);
}
@@ -218,12 +218,12 @@ int main()
dims.iRaysPerDet = 1;
unsigned int volumePitch, sinoPitch;
- allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
- zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
+ zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);
printf("pitch: %u\n", volumePitch);
- allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
- zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+ allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
printf("pitch: %u\n", sinoPitch);
unsigned int y, x;
diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu
index 12086c0..9bec25d 100644
--- a/cuda/2d/fan_bp.cu
+++ b/cuda/2d/fan_bp.cu
@@ -61,12 +61,12 @@ __constant__ float gC_DetUX[g_MaxAngles];
__constant__ float gC_DetUY[g_MaxAngles];
-static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- gT_FanProjTexture.addressMode[0] = cudaAddressModeClamp;
- gT_FanProjTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_FanProjTexture.addressMode[0] = mode;
+ gT_FanProjTexture.addressMode[1] = mode;
gT_FanProjTexture.filterMode = cudaFilterModeLinear;
gT_FanProjTexture.normalized = false;
@@ -116,12 +116,12 @@ __global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int s
const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
const float fDen = fDetUX * fYD - fDetUY * fXD;
- const float fT = fNum / fDen + 1.0f;
+ const float fT = fNum / fDen;
fVal += tex2D(gT_FanProjTexture, fT, fA);
fA += 1.0f;
}
- volData[(Y+1)*volPitch+X+1] += fVal;
+ volData[Y*volPitch+X] += fVal;
}
// supersampling version
@@ -171,7 +171,7 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in
const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
const float fDen = fDetUX * fYD - fDetUY * fXD;
- const float fT = fNum / fDen + 1.0f;
+ const float fT = fNum / fDen;
fVal += tex2D(gT_FanProjTexture, fT, fA);
fY -= fSubStep;
}
@@ -180,7 +180,7 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in
fA += 1.0f;
}
- volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+ volData[Y*volPitch+X] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
}
@@ -222,7 +222,7 @@ __global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDi
const float fT = fNum / fDen;
const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f);
- volData[(Y+1)*volPitch+X+1] += fVal;
+ volData[Y*volPitch+X] += fVal;
}
// Weighted BP for use in fan beam FBP
@@ -268,12 +268,12 @@ __global__ void devFanBP_FBPWeighted(float* D_volData, unsigned int volPitch, un
const float fWeight = fXD*fXD + fYD*fYD;
- const float fT = fNum / fDen + 1.0f;
+ const float fT = fNum / fDen;
fVal += tex2D(gT_FanProjTexture, fT, fA) / fWeight;
fA += 1.0f;
}
- volData[(Y+1)*volPitch+X+1] += fVal;
+ volData[Y*volPitch+X] += fVal;
}
@@ -331,7 +331,7 @@ bool FanBP_FBPWeighted(float* D_volumeData, unsigned int volumePitch,
// TODO: process angles block by block
assert(dims.iProjAngles <= g_MaxAngles);
- bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+ bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
// transfer angles to constant memory
float* tmp = new float[dims.iProjAngles];
@@ -376,7 +376,7 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
const SDimensions& dims, const SFanProjection* angles)
{
// only one angle
- bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1);
+ bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp);
// transfer angle to constant memory
#define TRANSFER_TO_CONSTANT(name) do { cudaMemcpyToSymbol(gC_##name, &(angles[angle].f##name), sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
@@ -442,10 +442,10 @@ int main()
#undef ROTATE0
- allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
printf("pitch: %u\n", volumePitch);
- allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
printf("pitch: %u\n", projPitch);
unsigned int y, x;
diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu
index cf9f352..b24029c 100644
--- a/cuda/2d/fan_fp.cu
+++ b/cuda/2d/fan_fp.cu
@@ -67,8 +67,8 @@ static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned i
cudaMallocArray(&dataArray, &channelDesc, width, height);
cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice);
- gT_FanVolumeTexture.addressMode[0] = cudaAddressModeClamp;
- gT_FanVolumeTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_FanVolumeTexture.addressMode[0] = cudaAddressModeBorder;
+ gT_FanVolumeTexture.addressMode[1] = cudaAddressModeBorder;
gT_FanVolumeTexture.filterMode = cudaFilterModeLinear;
gT_FanVolumeTexture.normalized = false;
@@ -129,8 +129,8 @@ __global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsig
// intersect ray with first slice
- float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
- float fX = startSlice + 1.5f;
+ float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 0.5f;
+ float fX = startSlice + 0.5f;
int endSlice = startSlice + g_blockSlices;
if (endSlice > dims.iVolWidth)
@@ -148,7 +148,7 @@ __global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsig
}
- projData[angle*projPitch+detector+1] += fVal;
+ projData[angle*projPitch+detector] += fVal;
}
@@ -201,8 +201,8 @@ __global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigne
// intersect ray with first slice
- float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
- float fY = startSlice + 1.5f;
+ float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 0.5f;
+ float fY = startSlice + 0.5f;
int endSlice = startSlice + g_blockSlices;
if (endSlice > dims.iVolHeight)
@@ -221,7 +221,7 @@ __global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigne
}
- projData[angle*projPitch+detector+1] += fVal;
+ projData[angle*projPitch+detector] += fVal;
}
bool FanFP(float* D_volumeData, unsigned int volumePitch,
@@ -233,7 +233,7 @@ bool FanFP(float* D_volumeData, unsigned int volumePitch,
assert(dims.iProjAngles <= g_MaxAngles);
cudaArray* D_dataArray;
- bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);
// transfer angles to constant memory
float* tmp = new float[dims.iProjAngles];
@@ -325,10 +325,10 @@ int main()
#undef ROTATE0
- allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
printf("pitch: %u\n", volumePitch);
- allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
printf("pitch: %u\n", projPitch);
unsigned int y, x;
@@ -358,8 +358,8 @@ int main()
s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
printf("cpu norm: %f\n", s);
- //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
- s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ //zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
+ s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
printf("gpu norm: %f\n", s);
saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino);
diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu
index 79e4be7..75bd92c 100644
--- a/cuda/2d/fft.cu
+++ b/cuda/2d/fft.cu
@@ -209,7 +209,7 @@ bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
}
bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
- int _iSourcePitch, int _iSourcePadX, int _iProjDets,
+ int _iSourcePitch, int _iProjDets,
int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount,
cufftComplex * _pDevTargetComplex)
{
@@ -221,7 +221,7 @@ bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
{
- const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch + _iSourcePadX;
+ const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch;
float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount;
SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
@@ -241,7 +241,7 @@ bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
float * _pfRealTarget,
- int _iTargetPitch, int _iTargetPadX, int _iProjDets,
+ int _iTargetPitch, int _iProjDets,
int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount)
{
float * pfDevRealFFTTarget = NULL;
@@ -264,7 +264,7 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
{
const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount;
- float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch + _iTargetPadX;
+ float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch;
SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
}
diff --git a/cuda/2d/fft.h b/cuda/2d/fft.h
index 55324e5..d4428a5 100644
--- a/cuda/2d/fft.h
+++ b/cuda/2d/fft.h
@@ -45,13 +45,13 @@ bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
cufftComplex * _pDevComplexTarget);
bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
- int _iSourcePitch, int _iSourcePadX, int _iProjDets,
+ int _iSourcePitch, int _iProjDets,
int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount,
cufftComplex * _pDevTargetComplex);
bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
float * _pfRealTarget,
- int _iTargetPitch, int _iTargetPadX, int _iProjDets,
+ int _iTargetPitch, int _iProjDets,
int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount);
void applyFilter(int _iProjectionCount, int _iFreqBinCount,
diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu
index d202341..fc99102 100644
--- a/cuda/2d/par_bp.cu
+++ b/cuda/2d/par_bp.cu
@@ -57,12 +57,12 @@ __constant__ float gC_angle_sin[g_MaxAngles];
__constant__ float gC_angle_cos[g_MaxAngles];
__constant__ float gC_angle_offset[g_MaxAngles];
-static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- gT_projTexture.addressMode[0] = cudaAddressModeClamp;
- gT_projTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_projTexture.addressMode[0] = mode;
+ gT_projTexture.addressMode[1] = mode;
gT_projTexture.filterMode = cudaFilterModeLinear;
gT_projTexture.normalized = false;
@@ -94,7 +94,7 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star
float fVal = 0.0f;
float fA = startAngle + 0.5f;
- const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f;
+ const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;
if (offsets) {
@@ -123,7 +123,7 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star
}
- volData[(Y+1)*volPitch+X+1] += fVal;
+ volData[Y*volPitch+X] += fVal;
}
// supersampling version
@@ -150,7 +150,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s
float fVal = 0.0f;
float fA = startAngle + 0.5f;
- const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f;
+ const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;
if (offsets) {
@@ -196,7 +196,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s
}
- volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+ volData[Y*volPitch+X] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
}
__global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset, float angle_sin, float angle_cos, const SDimensions dims)
@@ -218,7 +218,7 @@ __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset
const float fT = fT_base + fX * angle_cos - fY * angle_sin + offset;
const float fVal = tex2D(gT_projTexture, fT, 0.5f);
- D_volData[(Y+1)*volPitch+X+1] += fVal;
+ D_volData[Y*volPitch+X] += fVal;
}
@@ -232,7 +232,7 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,
float* angle_sin = new float[dims.iProjAngles];
float* angle_cos = new float[dims.iProjAngles];
- bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+ bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
for (unsigned int i = 0; i < dims.iProjAngles; ++i) {
angle_sin[i] = sinf(angles[i]);
@@ -302,8 +302,10 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch,
unsigned int angle, const SDimensions& dims,
const float* angles, const float* TOffsets)
{
- // only one angle
- bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1);
+ // Only one angle.
+ // We need to Clamp to the border pixels instead of to zero, because
+ // SART weights with ray length.
+ bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp);
float angle_sin = sinf(angles[angle]);
float angle_cos = cosf(angles[angle]);
@@ -346,10 +348,10 @@ int main()
unsigned int volumePitch, projPitch;
- allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
printf("pitch: %u\n", volumePitch);
- allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
printf("pitch: %u\n", projPitch);
unsigned int y, x;
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
index f811f98..097122b 100644
--- a/cuda/2d/par_fp.cu
+++ b/cuda/2d/par_fp.cu
@@ -74,8 +74,8 @@ static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned i
cudaMallocArray(&dataArray, &channelDesc, width, height);
cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice);
- gT_volumeTexture.addressMode[0] = cudaAddressModeClamp;
- gT_volumeTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_volumeTexture.addressMode[0] = cudaAddressModeBorder;
+ gT_volumeTexture.addressMode[1] = cudaAddressModeBorder;
gT_volumeTexture.filterMode = cudaFilterModeLinear;
gT_volumeTexture.normalized = false;
@@ -153,8 +153,8 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned
float fVal = 0.0f;
// project detector on slice
- float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
- float fS = startSlice + 1.5f;
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f;
+ float fS = startSlice + 0.5f;
int endSlice = startSlice + g_blockSlices;
if (endSlice > dims.iVolWidth)
endSlice = dims.iVolWidth;
@@ -189,7 +189,7 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned
}
- D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+ D_projData[angle*projPitch+detector] += fVal * fDistCorr;
}
// projection for angles that are roughly vertical
@@ -255,8 +255,8 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i
fDistCorr *= outputScale;
float fVal = 0.0f;
- float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
- float fS = startSlice+1.5f;
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f;
+ float fS = startSlice+0.5f;
int endSlice = startSlice + g_blockSlices;
if (endSlice > dims.iVolHeight)
endSlice = dims.iVolHeight;
@@ -290,7 +290,7 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i
}
- D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+ D_projData[angle*projPitch+detector] += fVal * fDistCorr;
}
// projection for angles that are roughly horizontal
@@ -331,8 +331,8 @@ __global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, u
float fVal = 0.0f;
// project detector on slice
- float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
- float fS = startSlice + 1.5f;
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f;
+ float fS = startSlice + 0.5f;
int endSlice = startSlice + g_blockSlices;
if (endSlice > dims.iVolWidth)
endSlice = dims.iVolWidth;
@@ -367,7 +367,7 @@ __global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, u
}
- D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+ D_projData[angle*projPitch+detector] += fVal * fDistCorr;
}
@@ -408,8 +408,8 @@ __global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, uns
fDistCorr *= outputScale;
float fVal = 0.0f;
- float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
- float fS = startSlice+1.5f;
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f;
+ float fS = startSlice+0.5f;
int endSlice = startSlice + g_blockSlices;
if (endSlice > dims.iVolHeight)
endSlice = dims.iVolHeight;
@@ -443,7 +443,7 @@ __global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, uns
}
- D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+ D_projData[angle*projPitch+detector] += fVal * fDistCorr;
}
@@ -457,7 +457,7 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
assert(dims.iProjAngles <= g_MaxAngles);
cudaArray* D_dataArray;
- bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);
cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
@@ -579,7 +579,7 @@ bool FP(float* D_volumeData, unsigned int volumePitch,
assert(dims.fDetScale > 0.9999f);
cudaArray* D_dataArray;
- bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);
cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
@@ -682,10 +682,10 @@ int main()
dims.iRaysPerDet = 1;
unsigned int volumePitch, projPitch;
- allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
printf("pitch: %u\n", volumePitch);
- allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
printf("pitch: %u\n", projPitch);
unsigned int y, x;
@@ -715,7 +715,7 @@ int main()
s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
printf("cpu norm: %f\n", s);
- //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+ //zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
printf("gpu norm: %f\n", s);
diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu
index a40176d..7f499ce 100644
--- a/cuda/2d/sart.cu
+++ b/cuda/2d/sart.cu
@@ -39,14 +39,13 @@ $Id$
namespace astraCUDA {
-
+// FIXME: Remove these functions. (Outdated)
__global__ void devMUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
- // Copy result down and left one pixel.
- pfOut[x + pitch] = pfOut[x + 1] * pfIn[x + 1];
+ pfOut[x] *= pfIn[x];
}
void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width)
@@ -106,18 +105,15 @@ void SART::reset()
bool SART::init()
{
if (useVolumeMask) {
- allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
- zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch);
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
}
- // HACK: D_projData consists of two lines. The first is used padded,
- // the second unpadded. This is to satisfy the alignment requirements
- // of resp. FP and BP_SART.
- allocateVolume(D_projData, dims.iProjDets+2, 2, projPitch);
- zeroVolume(D_projData, projPitch, dims.iProjDets+2, 1);
+ allocateVolume(D_projData, dims.iProjDets, 1, projPitch);
+ zeroVolume(D_projData, projPitch, dims.iProjDets, 1);
- allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch);
- zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+ allocateVolume(D_lineWeight, dims.iProjDets, dims.iProjAngles, linePitch);
+ zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
// We can't precompute lineWeights when using a mask
if (!useVolumeMask)
@@ -142,23 +138,23 @@ bool SART::setProjectionOrder(int* _projectionOrder, int _projectionCount)
bool SART::precomputeWeights()
{
- zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+ zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
if (useVolumeMask) {
callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f);
} else {
// Allocate tmpData temporarily
- allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
- zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch);
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
- processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opSet>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);
callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f);
cudaFree(D_tmpData);
D_tmpData = 0;
}
- processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
+ processVol<opInvert>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
return true;
}
@@ -181,12 +177,12 @@ bool SART::iterate(unsigned int iterations)
}
// copy one line of sinogram to projection data
- cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), 1, cudaMemcpyDeviceToDevice);
+ cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), 1, cudaMemcpyDeviceToDevice);
// do FP, subtracting projection from sinogram
if (useVolumeMask) {
- cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
- processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+ processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
callFP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle, -1.0f);
} else {
callFP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle, -1.0f);
@@ -197,17 +193,17 @@ bool SART::iterate(unsigned int iterations)
if (useVolumeMask) {
// BP, mask, and add back
// TODO: Try putting the masking directly in the BP
- zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
callBP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle);
- processVol<opAddMul, VOL>(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opAddMul>(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
} else {
callBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle);
}
if (useMinConstraint)
- processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opClampMin>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
if (useMaxConstraint)
- processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opClampMax>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
iteration++;
@@ -220,16 +216,16 @@ float SART::computeDiffNorm()
{
unsigned int pPitch;
float *D_p;
- allocateVolume(D_p, dims.iProjDets+2, dims.iProjAngles, pPitch);
- zeroVolume(D_p, pPitch, dims.iProjDets+2, dims.iProjAngles);
+ allocateVolume(D_p, dims.iProjDets, dims.iProjAngles, pPitch);
+ zeroVolume(D_p, pPitch, dims.iProjDets, dims.iProjAngles);
// copy sinogram to D_p
- cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+ cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
// do FP, subtracting projection from sinogram
if (useVolumeMask) {
- cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
- processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+ processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
} else {
callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
@@ -237,7 +233,7 @@ float SART::computeDiffNorm()
// compute norm of D_p
- float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles);
cudaFree(D_p);
@@ -267,11 +263,11 @@ bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch,
{
if (angles) {
assert(!fanProjs);
- return BP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch,
+ return BP_SART(D_volumeData, volumePitch, D_projData, projPitch,
angle, dims, angles, TOffsets);
} else {
assert(fanProjs);
- return FanBP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch,
+ return FanBP_SART(D_volumeData, volumePitch, D_projData, projPitch,
angle, dims, fanProjs);
}
diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu
index 31954e4..eb65962 100644
--- a/cuda/2d/sirt.cu
+++ b/cuda/2d/sirt.cu
@@ -88,17 +88,17 @@ void SIRT::reset()
bool SIRT::init()
{
- allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch);
- zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ allocateVolume(D_pixelWeight, dims.iVolWidth, dims.iVolHeight, pixelPitch);
+ zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
- allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
- zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch);
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
- allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
- zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+ allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
+ zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
- allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch);
- zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+ allocateVolume(D_lineWeight, dims.iProjDets, dims.iProjAngles, linePitch);
+ zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
// We can't precompute lineWeights and pixelWeights when using a mask
if (!useVolumeMask && !useSinogramMask)
@@ -110,33 +110,33 @@ bool SIRT::init()
bool SIRT::precomputeWeights()
{
- zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+ zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
if (useVolumeMask) {
callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f);
} else {
- processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opSet>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);
callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f);
}
- processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
+ processVol<opInvert>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
if (useSinogramMask) {
// scale line weights with sinogram mask to zero out masked sinogram pixels
- processVol<opMul, SINO>(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles);
+ processVol<opMul>(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles);
}
- zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
if (useSinogramMask) {
callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch);
} else {
- processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);
+ processVol<opSet>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);
callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);
}
- processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opInvert>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
if (useVolumeMask) {
// scale pixel weights with mask to zero out masked pixels
- processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opMul>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
}
return true;
@@ -160,7 +160,7 @@ bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskD
freeMinMaxMasks = true;
bool ok = true;
if (pfMinMaskData) {
- allocateVolume(D_minMaskData, dims.iVolWidth+2, dims.iVolHeight+2, minMaskPitch);
+ allocateVolume(D_minMaskData, dims.iVolWidth, dims.iVolHeight, minMaskPitch);
ok = copyVolumeToDevice(pfMinMaskData, iPitch,
dims.iVolWidth, dims.iVolHeight,
D_minMaskData, minMaskPitch);
@@ -169,7 +169,7 @@ bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskD
return false;
if (pfMaxMaskData) {
- allocateVolume(D_maxMaskData, dims.iVolWidth+2, dims.iVolHeight+2, maxMaskPitch);
+ allocateVolume(D_maxMaskData, dims.iVolWidth, dims.iVolHeight, maxMaskPitch);
ok = copyVolumeToDevice(pfMaxMaskData, iPitch,
dims.iVolWidth, dims.iVolHeight,
D_maxMaskData, maxMaskPitch);
@@ -191,33 +191,33 @@ bool SIRT::iterate(unsigned int iterations)
for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
// copy sinogram to projection data
- cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+ cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
// do FP, subtracting projection from sinogram
if (useVolumeMask) {
- cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
- processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+ processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
} else {
callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
}
- processVol<opMul, SINO>(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles);
+ processVol<opMul>(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles);
- zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
callBP(D_tmpData, tmpPitch, D_projData, projPitch);
- processVol<opAddMul, VOL>(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opAddMul>(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
if (useMinConstraint)
- processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opClampMin>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
if (useMaxConstraint)
- processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opClampMax>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
if (D_minMaskData)
- processVol<opClampMinMask, VOL>(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opClampMinMask>(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
if (D_maxMaskData)
- processVol<opClampMaxMask, VOL>(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+ processVol<opClampMaxMask>(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
}
return true;
@@ -226,12 +226,12 @@ bool SIRT::iterate(unsigned int iterations)
float SIRT::computeDiffNorm()
{
// copy sinogram to projection data
- cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+ cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
// do FP, subtracting projection from sinogram
if (useVolumeMask) {
- cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
- processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+ cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+ processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
} else {
callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
@@ -240,7 +240,7 @@ float SIRT::computeDiffNorm()
// compute norm of D_projData
- float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
return sqrt(s);
}
@@ -300,12 +300,12 @@ int main()
dims.iRaysPerDet = 1;
unsigned int volumePitch, sinoPitch;
- allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
- zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+ allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
+ zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);
printf("pitch: %u\n", volumePitch);
- allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
- zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+ allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
+ zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
printf("pitch: %u\n", sinoPitch);
unsigned int y, x;
diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu
index 06f6714..8bb2f2f 100644
--- a/cuda/2d/util.cu
+++ b/cuda/2d/util.cu
@@ -36,11 +36,8 @@ bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch,
unsigned int width, unsigned int height,
float* outD_data, unsigned int out_pitch)
{
- // TODO: a full memset isn't necessary. Only the edges.
cudaError_t err;
- err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, sizeof(float)*(width+2), height+2);
- ASTRA_CUDA_ASSERT(err);
- err = cudaMemcpy2D(outD_data + out_pitch + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
+ err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
ASTRA_CUDA_ASSERT(err);
assert(err == cudaSuccess);
return true;
@@ -50,7 +47,7 @@ bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,
unsigned int width, unsigned int height,
float* inD_data, unsigned int in_pitch)
{
- cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + (in_pitch + 1), sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
+ cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
ASTRA_CUDA_ASSERT(err);
return true;
}
@@ -60,7 +57,7 @@ bool copySinogramFromDevice(float* out_data, unsigned int out_pitch,
unsigned int width, unsigned int height,
float* inD_data, unsigned int in_pitch)
{
- cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + 1, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
+ cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
ASTRA_CUDA_ASSERT(err);
return true;
}
@@ -69,11 +66,8 @@ bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,
unsigned int width, unsigned int height,
float* outD_data, unsigned int out_pitch)
{
- // TODO: a full memset isn't necessary. Only the edges.
cudaError_t err;
- err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, (width+2)*sizeof(float), height);
- ASTRA_CUDA_ASSERT(err);
- err = cudaMemcpy2D(outD_data + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
+ err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
ASTRA_CUDA_ASSERT(err);
return true;
}
@@ -132,8 +126,7 @@ __global__ void reduce1D(float *g_idata, float *g_odata, unsigned int n)
__global__ void reduce2D(float *g_idata, float *g_odata,
unsigned int pitch,
- unsigned int nx, unsigned int ny,
- unsigned int padX, unsigned int padY)
+ unsigned int nx, unsigned int ny)
{
extern __shared__ float sdata[];
const unsigned int tidx = threadIdx.x;
@@ -145,11 +138,10 @@ __global__ void reduce2D(float *g_idata, float *g_odata,
sdata[tid] = 0;
- if (x >= padX && x < padX + nx) {
+ if (x < nx) {
- while (y < padY + ny) {
- if (y >= padY)
- sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]);
+ while (y < ny) {
+ sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]);
y += 16 * gridDim.y;
}
@@ -180,11 +172,10 @@ __global__ void reduce2D(float *g_idata, float *g_odata,
}
float dotProduct2D(float* D_data, unsigned int pitch,
- unsigned int width, unsigned int height,
- unsigned int padX, unsigned int padY)
+ unsigned int width, unsigned int height)
{
- unsigned int bx = ((width+padX) + 15) / 16;
- unsigned int by = ((height+padY) + 127) / 128;
+ unsigned int bx = (width + 15) / 16;
+ unsigned int by = (height + 127) / 128;
unsigned int shared_mem2 = sizeof(float) * 16 * 16;
dim3 dimBlock2(16, 16);
@@ -192,26 +183,27 @@ float dotProduct2D(float* D_data, unsigned int pitch,
float* D_buf;
cudaMalloc(&D_buf, sizeof(float) * (bx * by + 1) );
+ float* D_res = D_buf + (bx*by);
// Step 1: reduce 2D from image to a single vector, taking sum of squares
- reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height, padX, padY);
+ reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height);
cudaTextForceKernelsCompletion();
// Step 2: reduce 1D: add up elements in vector
if (bx * by > 512)
- reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_buf+(bx*by), bx*by);
+ reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_res, bx*by);
else if (bx * by > 128)
- reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_buf+(bx*by), bx*by);
+ reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_res, bx*by);
else if (bx * by > 32)
- reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_buf+(bx*by), bx*by);
+ reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_res, bx*by);
else if (bx * by > 8)
- reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_buf+(bx*by), bx*by);
+ reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_res, bx*by);
else
- reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_buf+(bx*by), bx*by);
+ reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_res, bx*by);
float x;
- cudaMemcpy(&x, D_buf+(bx*by), 4, cudaMemcpyDeviceToHost);
+ cudaMemcpy(&x, D_res, 4, cudaMemcpyDeviceToHost);
cudaTextForceKernelsCompletion();
diff --git a/cuda/2d/util.h b/cuda/2d/util.h
index d31e2eb..ed4a45c 100644
--- a/cuda/2d/util.h
+++ b/cuda/2d/util.h
@@ -82,8 +82,8 @@ void reportCudaError(cudaError_t err);
float dotProduct2D(float* D_data, unsigned int pitch,
- unsigned int width, unsigned int height,
- unsigned int padX, unsigned int padY);
+ unsigned int width, unsigned int height);
+
}