summaryrefslogtreecommitdiffstats
path: root/cuda/2d/par_fp.cu
diff options
context:
space:
mode:
authorWillem Jan Palenstijn <WillemJan.Palenstijn@uantwerpen.be>2013-07-01 22:34:11 +0000
committerwpalenst <WillemJan.Palenstijn@uantwerpen.be>2013-07-01 22:34:11 +0000
commitb2fc6c70434674d74551c3a6c01ffb3233499312 (patch)
treeb17f080ebc504ab85ebb7c3d89f917fd87ce9e00 /cuda/2d/par_fp.cu
downloadastra-b2fc6c70434674d74551c3a6c01ffb3233499312.tar.gz
astra-b2fc6c70434674d74551c3a6c01ffb3233499312.tar.bz2
astra-b2fc6c70434674d74551c3a6c01ffb3233499312.tar.xz
astra-b2fc6c70434674d74551c3a6c01ffb3233499312.zip
Update version to 1.3
Diffstat (limited to 'cuda/2d/par_fp.cu')
-rw-r--r--cuda/2d/par_fp.cu704
1 files changed, 704 insertions, 0 deletions
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
new file mode 100644
index 0000000..585cb06
--- /dev/null
+++ b/cuda/2d/par_fp.cu
@@ -0,0 +1,704 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#define PIXELTRACE
+
+
+typedef texture<float, 2, cudaReadModeElementType> texture2D;
+
+static texture2D gT_volumeTexture;
+
+
+namespace astraCUDA {
+
+static const unsigned g_MaxAngles = 2560;
+__constant__ float gC_angle[g_MaxAngles];
+__constant__ float gC_angle_offset[g_MaxAngles];
+
+
+// optimization parameters
+static const unsigned int g_anglesPerBlock = 16;
+static const unsigned int g_detBlockSize = 32;
+static const unsigned int g_blockSlices = 64;
+
+// fixed point scaling factor
+#define fPREC_FACTOR 16.0f
+#define iPREC_FACTOR 16
+
+
+// if necessary, a buffer of zeroes of size g_MaxAngles
+static float* g_pfZeroes = 0;
+
+
+static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ dataArray = 0;
+ cudaMallocArray(&dataArray, &channelDesc, width, height);
+ cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice);
+
+ gT_volumeTexture.addressMode[0] = cudaAddressModeClamp;
+ gT_volumeTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_volumeTexture.filterMode = cudaFilterModeLinear;
+ gT_volumeTexture.normalized = false;
+
+ // TODO: For very small sizes (roughly <=512x128) with few angles (<=180)
+ // not using an array is more efficient.
+// cudaBindTexture2D(0, gT_volumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch);
+ cudaBindTextureToArray(gT_volumeTexture, dataArray, channelDesc);
+
+ // TODO: error value?
+
+ return true;
+}
+
+// projection for angles that are roughly horizontal
+// theta between 45 and 135 degrees
+__global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale)
+{
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+ if (angle >= endAngle)
+ return;
+
+ const float theta = gC_angle[angle];
+ const float cos_theta = __cosf(theta);
+ const float sin_theta = __sinf(theta);
+
+ // compute start detector for this block/angle:
+ // (The same for all threadIdx.x)
+ // -------------------------------------
+
+ const int midSlice = startSlice + g_blockSlices / 2;
+
+ // ASSUMPTION: fDetScale >= 1.0f
+ // problem: detector regions get skipped because slice blocks aren't large
+ // enough
+ const unsigned int g_blockSliceSize = g_detBlockSize;
+
+ // project (midSlice,midRegion) on this thread's detector
+
+ const float fBase = 0.5f*dims.iProjDets - 0.5f +
+ (
+ (midSlice - 0.5f*dims.iVolWidth + 0.5f) * cos_theta
+ - (g_blockSliceSize/2 - 0.5f*dims.iVolHeight + 0.5f) * sin_theta
+ + gC_angle_offset[angle]
+ ) / dims.fDetScale;
+ int iBase = (int)floorf(fBase * fPREC_FACTOR);
+ int iInc = (int)floorf(g_blockSliceSize * sin_theta / dims.fDetScale * -fPREC_FACTOR);
+
+ // ASSUMPTION: 16 > regionOffset / fDetScale
+ const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+ const int detPrevRegion = (iBase + (blockIdx.y - regionOffset - 1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+
+ if (blockIdx.y > 0 && detRegion == detPrevRegion)
+ return;
+
+ const int detector = detRegion * g_detBlockSize + relDet;
+
+ // Now project the part of the ray to angle,detector through
+ // slices startSlice to startSlice+g_blockSlices-1
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ const float fDetStep = -dims.fDetScale / sin_theta;
+ float fSliceStep = cos_theta / sin_theta;
+ float fDistCorr;
+ if (sin_theta > 0.0f)
+ fDistCorr = -fDetStep;
+ else
+ fDistCorr = fDetStep;
+ fDistCorr *= outputScale;
+
+ float fVal = 0.0f;
+ // project detector on slice
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
+ float fS = startSlice + 1.5f;
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolWidth)
+ endSlice = dims.iVolWidth;
+
+ if (dims.iRaysPerDet > 1) {
+
+ fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+ const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+ fDistCorr /= dims.iRaysPerDet;
+
+ fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ fVal += tex2D(gT_volumeTexture, fS, fP);
+ fP += fSubDetStep;
+ }
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ } else {
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fVal += tex2D(gT_volumeTexture, fS, fP);
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+
+ }
+
+ D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+// projection for angles that are roughly vertical
+// theta between 0 and 45, or 135 and 180 degrees
+__global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale)
+{
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+ if (angle >= endAngle)
+ return;
+
+ const float theta = gC_angle[angle];
+ const float cos_theta = __cosf(theta);
+ const float sin_theta = __sinf(theta);
+
+ // compute start detector for this block/angle:
+ // (The same for all threadIdx.x)
+ // -------------------------------------
+
+ const int midSlice = startSlice + g_blockSlices / 2;
+
+ // project (midSlice,midRegion) on this thread's detector
+
+ // ASSUMPTION: fDetScale >= 1.0f
+ // problem: detector regions get skipped because slice blocks aren't large
+ // enough
+ const unsigned int g_blockSliceSize = g_detBlockSize;
+
+ const float fBase = 0.5f*dims.iProjDets - 0.5f +
+ (
+ (g_blockSliceSize/2 - 0.5f*dims.iVolWidth + 0.5f) * cos_theta
+ - (midSlice - 0.5f*dims.iVolHeight + 0.5f) * sin_theta
+ + gC_angle_offset[angle]
+ ) / dims.fDetScale;
+ int iBase = (int)floorf(fBase * fPREC_FACTOR);
+ int iInc = (int)floorf(g_blockSliceSize * cos_theta / dims.fDetScale * fPREC_FACTOR);
+
+ // ASSUMPTION: 16 > regionOffset / fDetScale
+ const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+ const int detPrevRegion = (iBase + (blockIdx.y - regionOffset-1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+
+ if (blockIdx.y > 0 && detRegion == detPrevRegion)
+ return;
+
+ const int detector = detRegion * g_detBlockSize + relDet;
+
+ // Now project the part of the ray to angle,detector through
+ // slices startSlice to startSlice+g_blockSlices-1
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ const float fDetStep = dims.fDetScale / cos_theta;
+ float fSliceStep = sin_theta / cos_theta;
+ float fDistCorr;
+ if (cos_theta < 0.0f)
+ fDistCorr = -fDetStep;
+ else
+ fDistCorr = fDetStep;
+ fDistCorr *= outputScale;
+
+ float fVal = 0.0f;
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
+ float fS = startSlice+1.5f;
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolHeight)
+ endSlice = dims.iVolHeight;
+
+ if (dims.iRaysPerDet > 1) {
+
+ fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+ const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+ fDistCorr /= dims.iRaysPerDet;
+
+ fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ fVal += tex2D(gT_volumeTexture, fP, fS);
+ fP += fSubDetStep;
+ }
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ } else {
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fVal += tex2D(gT_volumeTexture, fP, fS);
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ }
+
+ D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+// projection for angles that are roughly horizontal
+// (detector roughly vertical)
+__global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+ if (angle >= endAngle)
+ return;
+
+ const float theta = gC_angle[angle];
+ const float cos_theta = __cosf(theta);
+ const float sin_theta = __sinf(theta);
+
+ // compute start detector for this block/angle:
+ const int detRegion = blockIdx.y;
+
+ const int detector = detRegion * g_detBlockSize + relDet;
+
+ // Now project the part of the ray to angle,detector through
+ // slices startSlice to startSlice+g_blockSlices-1
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ const float fDetStep = -dims.fDetScale / sin_theta;
+ float fSliceStep = cos_theta / sin_theta;
+ float fDistCorr;
+ if (sin_theta > 0.0f)
+ fDistCorr = -fDetStep;
+ else
+ fDistCorr = fDetStep;
+ fDistCorr *= outputScale;
+
+ float fVal = 0.0f;
+ // project detector on slice
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
+ float fS = startSlice + 1.5f;
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolWidth)
+ endSlice = dims.iVolWidth;
+
+ if (dims.iRaysPerDet > 1) {
+
+ fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+ const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+ fDistCorr /= dims.iRaysPerDet;
+
+ fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ fVal += tex2D(gT_volumeTexture, fS, fP);
+ fP += fSubDetStep;
+ }
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ } else {
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fVal += tex2D(gT_volumeTexture, fS, fP);
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+
+ }
+
+ D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+
+// projection for angles that are roughly vertical
+// (detector roughly horizontal)
+__global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+ const int relDet = threadIdx.x;
+ const int relAngle = threadIdx.y;
+
+ int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+ if (angle >= endAngle)
+ return;
+
+ const float theta = gC_angle[angle];
+ const float cos_theta = __cosf(theta);
+ const float sin_theta = __sinf(theta);
+
+ // compute start detector for this block/angle:
+ const int detRegion = blockIdx.y;
+
+ const int detector = detRegion * g_detBlockSize + relDet;
+
+ // Now project the part of the ray to angle,detector through
+ // slices startSlice to startSlice+g_blockSlices-1
+
+ if (detector < 0 || detector >= dims.iProjDets)
+ return;
+
+ const float fDetStep = dims.fDetScale / cos_theta;
+ float fSliceStep = sin_theta / cos_theta;
+ float fDistCorr;
+ if (cos_theta < 0.0f)
+ fDistCorr = -fDetStep;
+ else
+ fDistCorr = fDetStep;
+ fDistCorr *= outputScale;
+
+ float fVal = 0.0f;
+ float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
+ float fS = startSlice+1.5f;
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > dims.iVolHeight)
+ endSlice = dims.iVolHeight;
+
+ if (dims.iRaysPerDet > 1) {
+
+ fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+ const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+ fDistCorr /= dims.iRaysPerDet;
+
+ fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+ fVal += tex2D(gT_volumeTexture, fP, fS);
+ fP += fSubDetStep;
+ }
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ } else {
+
+ for (int slice = startSlice; slice < endSlice; ++slice)
+ {
+ fVal += tex2D(gT_volumeTexture, fP, fS);
+ fP += fSliceStep;
+ fS += 1.0f;
+ }
+
+ }
+
+ D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+
+
+bool FP_simple(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const float* angles,
+ const float* TOffsets, float outputScale)
+{
+ // TODO: load angles into constant memory in smaller blocks
+ assert(dims.iProjAngles <= g_MaxAngles);
+
+ cudaArray* D_dataArray;
+ bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+ cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+ if (TOffsets) {
+ cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ } else {
+ if (!g_pfZeroes) {
+ g_pfZeroes = new float[g_MaxAngles];
+ memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float));
+ }
+ cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ }
+
+ dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // detector block size, angles
+
+ std::list<cudaStream_t> streams;
+
+
+ // Run over all angles, grouping them into groups of the same
+ // orientation (roughly horizontal vs. roughly vertical).
+ // Start a stream of grids for each such group.
+
+ // TODO: Check if it's worth it to store this info instead
+ // of recomputing it every FP.
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = 0;
+ bool blockVertical = false;
+ for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+ bool vertical;
+ // TODO: Having <= instead of < below causes a 5% speedup.
+ // Maybe we should detect corner cases and put them in the optimal
+ // group of angles.
+ if (a != dims.iProjAngles)
+ vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a])));
+ if (a == dims.iProjAngles || vertical != blockVertical) {
+ // block done
+
+ blockEnd = a;
+ if (blockStart != blockEnd) {
+ dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
+ (dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks
+
+ // TODO: check if we can't immediately
+ // destroy the stream after use
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ streams.push_back(stream);
+ //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical);
+ if (!blockVertical)
+ for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
+ FPhorizontal_simple<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+ else
+ for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
+ FPvertical_simple<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+ }
+ blockVertical = vertical;
+ blockStart = a;
+ }
+ }
+
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+ cudaStreamDestroy(*iter);
+
+ streams.clear();
+
+ cudaThreadSynchronize();
+
+ cudaTextForceKernelsCompletion();
+
+ cudaFreeArray(D_dataArray);
+
+
+ return true;
+}
+
+bool FP(float* D_volumeData, unsigned int volumePitch,
+ float* D_projData, unsigned int projPitch,
+ const SDimensions& dims, const float* angles,
+ const float* TOffsets, float outputScale)
+{
+ return FP_simple(D_volumeData, volumePitch, D_projData, projPitch,
+ dims, angles, TOffsets, outputScale);
+
+ // TODO: Fix bug in this non-simple FP with large detscale and TOffsets
+#if 0
+
+ // TODO: load angles into constant memory in smaller blocks
+ assert(dims.iProjAngles <= g_MaxAngles);
+
+ // TODO: compute region size dynamically to resolve these two assumptions
+ // ASSUMPTION: 16 > regionOffset / fDetScale
+ const unsigned int g_blockSliceSize = g_detBlockSize;
+ assert(16 > (g_blockSlices / g_blockSliceSize) / dims.fDetScale);
+ // ASSUMPTION: fDetScale >= 1.0f
+ assert(dims.fDetScale > 0.9999f);
+
+ cudaArray* D_dataArray;
+ bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+ cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+ if (TOffsets) {
+ cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ } else {
+ if (!g_pfZeroes) {
+ g_pfZeroes = new float[g_MaxAngles];
+ memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float));
+ }
+ cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ }
+
+ int regionOffset = g_blockSlices / g_blockSliceSize;
+
+ dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles
+
+ std::list<cudaStream_t> streams;
+
+
+ // Run over all angles, grouping them into groups of the same
+ // orientation (roughly horizontal vs. roughly vertical).
+ // Start a stream of grids for each such group.
+
+ // TODO: Check if it's worth it to store this info instead
+ // of recomputing it every FP.
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = 0;
+ bool blockVertical = false;
+ for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+ bool vertical;
+ // TODO: Having <= instead of < below causes a 5% speedup.
+ // Maybe we should detect corner cases and put them in the optimal
+ // group of angles.
+ if (a != dims.iProjAngles)
+ vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a])));
+ if (a == dims.iProjAngles || vertical != blockVertical) {
+ // block done
+
+ blockEnd = a;
+ if (blockStart != blockEnd) {
+ unsigned int length = dims.iVolHeight;
+ if (blockVertical)
+ length = dims.iVolWidth;
+ dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
+ (length+g_blockSliceSize-1)/g_blockSliceSize+2*regionOffset); // angle blocks, regions
+ // TODO: check if we can't immediately
+ // destroy the stream after use
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ streams.push_back(stream);
+ //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical);
+ if (!blockVertical)
+ for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
+ FPhorizontal<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale);
+ else
+ for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
+ FPvertical<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale);
+ }
+ blockVertical = vertical;
+ blockStart = a;
+ }
+ }
+
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+ cudaStreamDestroy(*iter);
+
+ streams.clear();
+
+ cudaThreadSynchronize();
+
+ cudaTextForceKernelsCompletion();
+
+ cudaFreeArray(D_dataArray);
+
+
+ return true;
+#endif
+}
+
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+int main()
+{
+ float* D_volumeData;
+ float* D_projData;
+
+ SDimensions dims;
+ dims.iVolWidth = 1024;
+ dims.iVolHeight = 1024;
+ dims.iProjAngles = 512;
+ dims.iProjDets = 1536;
+ dims.fDetScale = 1.0f;
+ dims.iRaysPerDet = 1;
+ unsigned int volumePitch, projPitch;
+
+ allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+ printf("pitch: %u\n", volumePitch);
+
+ allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+ printf("pitch: %u\n", projPitch);
+
+ unsigned int y, x;
+ float* img = loadImage("phantom.png", y, x);
+
+ float* sino = new float[dims.iProjAngles * dims.iProjDets];
+
+ memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float));
+
+ copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+ copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+ float* angle = new float[dims.iProjAngles];
+
+ for (unsigned int i = 0; i < dims.iProjAngles; ++i)
+ angle[i] = i*(M_PI/dims.iProjAngles);
+
+ FP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0, 1.0f);
+
+ delete[] angle;
+
+ copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+ float s = 0.0f;
+ for (unsigned int y = 0; y < dims.iProjAngles; ++y)
+ for (unsigned int x = 0; x < dims.iProjDets; ++x)
+ s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
+ printf("cpu norm: %f\n", s);
+
+ //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+ s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+ printf("gpu norm: %f\n", s);
+
+ saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino);
+
+
+ return 0;
+}
+#endif