summaryrefslogtreecommitdiffstats
path: root/cuda/2d/par_fp.cu
diff options
context:
space:
mode:
authorWillem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl>2016-07-29 12:03:38 +0200
committerWillem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl>2016-07-29 12:11:52 +0200
commitb1ffc11d930c19bd73af9837a08bc8dde9fe8e72 (patch)
tree40ce622ffdc8d75c885135db1ce4773c9670e2c3 /cuda/2d/par_fp.cu
parentb2611a03577c285ddf48edab0d05dafa09ab362c (diff)
downloadastra-b1ffc11d930c19bd73af9837a08bc8dde9fe8e72.tar.gz
astra-b1ffc11d930c19bd73af9837a08bc8dde9fe8e72.tar.bz2
astra-b1ffc11d930c19bd73af9837a08bc8dde9fe8e72.tar.xz
astra-b1ffc11d930c19bd73af9837a08bc8dde9fe8e72.zip
Add CUDA parvec support
Diffstat (limited to 'cuda/2d/par_fp.cu')
-rw-r--r--cuda/2d/par_fp.cu360
1 files changed, 49 insertions, 311 deletions
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
index bb8b909..3c010b3 100644
--- a/cuda/2d/par_fp.cu
+++ b/cuda/2d/par_fp.cu
@@ -30,6 +30,7 @@ $Id$
#include <cassert>
#include <iostream>
#include <list>
+#include <cmath>
#include "util.h"
#include "arith.h"
@@ -51,6 +52,7 @@ namespace astraCUDA {
static const unsigned g_MaxAngles = 2560;
__constant__ float gC_angle[g_MaxAngles];
__constant__ float gC_angle_offset[g_MaxAngles];
+__constant__ float gC_angle_detsize[g_MaxAngles];
// optimization parameters
@@ -63,10 +65,6 @@ static const unsigned int g_blockSlices = 64;
#define iPREC_FACTOR 16
-// if necessary, a buffer of zeroes of size g_MaxAngles
-static float* g_pfZeroes = 0;
-
-
static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
@@ -89,9 +87,10 @@ static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned i
return true;
}
+
// projection for angles that are roughly horizontal
-// theta between 45 and 135 degrees
-__global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale)
+// (detector roughly vertical)
+__global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
{
const int relDet = threadIdx.x;
const int relAngle = threadIdx.y;
@@ -106,33 +105,7 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned
const float sin_theta = __sinf(theta);
// compute start detector for this block/angle:
- // (The same for all threadIdx.x)
- // -------------------------------------
-
- const int midSlice = startSlice + g_blockSlices / 2;
-
- // ASSUMPTION: fDetScale >= 1.0f
- // problem: detector regions get skipped because slice blocks aren't large
- // enough
- const unsigned int g_blockSliceSize = g_detBlockSize;
-
- // project (midSlice,midRegion) on this thread's detector
-
- const float fBase = 0.5f*dims.iProjDets - 0.5f +
- (
- (midSlice - 0.5f*dims.iVolWidth + 0.5f) * cos_theta
- - (g_blockSliceSize/2 - 0.5f*dims.iVolHeight + 0.5f) * sin_theta
- + gC_angle_offset[angle]
- ) / dims.fDetScale;
- int iBase = (int)floorf(fBase * fPREC_FACTOR);
- int iInc = (int)floorf(g_blockSliceSize * sin_theta / dims.fDetScale * -fPREC_FACTOR);
-
- // ASSUMPTION: 16 > regionOffset / fDetScale
- const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
- const int detPrevRegion = (iBase + (blockIdx.y - regionOffset - 1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
-
- if (blockIdx.y > 0 && detRegion == detPrevRegion)
- return;
+ const int detRegion = blockIdx.y;
const int detector = detRegion * g_detBlockSize + relDet;
@@ -142,7 +115,7 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned
if (detector < 0 || detector >= dims.iProjDets)
return;
- const float fDetStep = -dims.fDetScale / sin_theta;
+ const float fDetStep = -gC_angle_detsize[angle] / sin_theta;
float fSliceStep = cos_theta / sin_theta;
float fDistCorr;
if (sin_theta > 0.0f)
@@ -192,9 +165,10 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned
D_projData[angle*projPitch+detector] += fVal * fDistCorr;
}
+
// projection for angles that are roughly vertical
-// theta between 0 and 45, or 135 and 180 degrees
-__global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale)
+// (detector roughly horizontal)
+__global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
{
const int relDet = threadIdx.x;
const int relAngle = threadIdx.y;
@@ -209,33 +183,7 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i
const float sin_theta = __sinf(theta);
// compute start detector for this block/angle:
- // (The same for all threadIdx.x)
- // -------------------------------------
-
- const int midSlice = startSlice + g_blockSlices / 2;
-
- // project (midSlice,midRegion) on this thread's detector
-
- // ASSUMPTION: fDetScale >= 1.0f
- // problem: detector regions get skipped because slice blocks aren't large
- // enough
- const unsigned int g_blockSliceSize = g_detBlockSize;
-
- const float fBase = 0.5f*dims.iProjDets - 0.5f +
- (
- (g_blockSliceSize/2 - 0.5f*dims.iVolWidth + 0.5f) * cos_theta
- - (midSlice - 0.5f*dims.iVolHeight + 0.5f) * sin_theta
- + gC_angle_offset[angle]
- ) / dims.fDetScale;
- int iBase = (int)floorf(fBase * fPREC_FACTOR);
- int iInc = (int)floorf(g_blockSliceSize * cos_theta / dims.fDetScale * fPREC_FACTOR);
-
- // ASSUMPTION: 16 > regionOffset / fDetScale
- const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
- const int detPrevRegion = (iBase + (blockIdx.y - regionOffset-1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
-
- if (blockIdx.y > 0 && detRegion == detPrevRegion)
- return;
+ const int detRegion = blockIdx.y;
const int detector = detRegion * g_detBlockSize + relDet;
@@ -245,7 +193,7 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i
if (detector < 0 || detector >= dims.iProjDets)
return;
- const float fDetStep = dims.fDetScale / cos_theta;
+ const float fDetStep = gC_angle_detsize[angle] / cos_theta;
float fSliceStep = sin_theta / cos_theta;
float fDistCorr;
if (cos_theta < 0.0f)
@@ -293,183 +241,67 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i
D_projData[angle*projPitch+detector] += fVal * fDistCorr;
}
-// projection for angles that are roughly horizontal
-// (detector roughly vertical)
-__global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
-{
- const int relDet = threadIdx.x;
- const int relAngle = threadIdx.y;
-
- int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
-
- if (angle >= endAngle)
- return;
-
- const float theta = gC_angle[angle];
- const float cos_theta = __cosf(theta);
- const float sin_theta = __sinf(theta);
-
- // compute start detector for this block/angle:
- const int detRegion = blockIdx.y;
-
- const int detector = detRegion * g_detBlockSize + relDet;
-
- // Now project the part of the ray to angle,detector through
- // slices startSlice to startSlice+g_blockSlices-1
-
- if (detector < 0 || detector >= dims.iProjDets)
- return;
-
- const float fDetStep = -dims.fDetScale / sin_theta;
- float fSliceStep = cos_theta / sin_theta;
- float fDistCorr;
- if (sin_theta > 0.0f)
- fDistCorr = -fDetStep;
- else
- fDistCorr = fDetStep;
- fDistCorr *= outputScale;
-
- float fVal = 0.0f;
- // project detector on slice
- float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f;
- float fS = startSlice + 0.5f;
- int endSlice = startSlice + g_blockSlices;
- if (endSlice > dims.iVolWidth)
- endSlice = dims.iVolWidth;
- if (dims.iRaysPerDet > 1) {
- fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
- const float fSubDetStep = fDetStep / dims.iRaysPerDet;
- fDistCorr /= dims.iRaysPerDet;
- fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+// Coordinates of center of detector pixel number t:
+// x = (t - 0.5*nDets + 0.5 - fOffset) * fSize * cos(fAngle)
+// y = - (t - 0.5*nDets + 0.5 - fOffset) * fSize * sin(fAngle)
- for (int slice = startSlice; slice < endSlice; ++slice)
- {
- for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
- fVal += tex2D(gT_volumeTexture, fS, fP);
- fP += fSubDetStep;
- }
- fP += fSliceStep;
- fS += 1.0f;
- }
-
- } else {
- for (int slice = startSlice; slice < endSlice; ++slice)
- {
- fVal += tex2D(gT_volumeTexture, fS, fP);
- fP += fSliceStep;
- fS += 1.0f;
- }
-
-
- }
-
- D_projData[angle*projPitch+detector] += fVal * fDistCorr;
-}
-
-
-// projection for angles that are roughly vertical
-// (detector roughly horizontal)
-__global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+static void convertAndUploadAngles(const SParProjection *projs, unsigned int nth, unsigned int ndets)
{
- const int relDet = threadIdx.x;
- const int relAngle = threadIdx.y;
-
- int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
-
- if (angle >= endAngle)
- return;
-
- const float theta = gC_angle[angle];
- const float cos_theta = __cosf(theta);
- const float sin_theta = __sinf(theta);
-
- // compute start detector for this block/angle:
- const int detRegion = blockIdx.y;
-
- const int detector = detRegion * g_detBlockSize + relDet;
+ float *angles = new float[nth];
+ float *offset = new float[nth];
+ float *detsizes = new float[nth];
- // Now project the part of the ray to angle,detector through
- // slices startSlice to startSlice+g_blockSlices-1
+ for (int i = 0; i < nth; ++i) {
- if (detector < 0 || detector >= dims.iProjDets)
- return;
+#warning TODO: Replace by getParParamaters
- const float fDetStep = dims.fDetScale / cos_theta;
- float fSliceStep = sin_theta / cos_theta;
- float fDistCorr;
- if (cos_theta < 0.0f)
- fDistCorr = -fDetStep;
- else
- fDistCorr = fDetStep;
- fDistCorr *= outputScale;
+ // Take part of DetU orthogonal to Ray
+ double ux = projs[i].fDetUX;
+ double uy = projs[i].fDetUY;
- float fVal = 0.0f;
- float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f;
- float fS = startSlice+0.5f;
- int endSlice = startSlice + g_blockSlices;
- if (endSlice > dims.iVolHeight)
- endSlice = dims.iVolHeight;
+ double t = (ux * projs[i].fRayX + uy * projs[i].fRayY) / (projs[i].fRayX * projs[i].fRayX + projs[i].fRayY * projs[i].fRayY);
- if (dims.iRaysPerDet > 1) {
+ ux -= t * projs[i].fRayX;
+ uy -= t * projs[i].fRayY;
- fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
- const float fSubDetStep = fDetStep / dims.iRaysPerDet;
- fDistCorr /= dims.iRaysPerDet;
+ double angle = atan2(uy, ux);
- fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+ angles[i] = angle;
- for (int slice = startSlice; slice < endSlice; ++slice)
- {
- for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
- fVal += tex2D(gT_volumeTexture, fP, fS);
- fP += fSubDetStep;
- }
- fP += fSliceStep;
- fS += 1.0f;
- }
+ double norm2 = uy * uy + ux * ux;
- } else {
-
- for (int slice = startSlice; slice < endSlice; ++slice)
- {
- fVal += tex2D(gT_volumeTexture, fP, fS);
- fP += fSliceStep;
- fS += 1.0f;
- }
+ detsizes[i] = sqrt(norm2);
+ // CHECKME: SIGNS?
+ offset[i] = -0.5*ndets - (projs[i].fDetSY*uy + projs[i].fDetSX*ux) / norm2;
}
- D_projData[angle*projPitch+detector] += fVal * fDistCorr;
+ cudaMemcpyToSymbol(gC_angle, angles, nth*sizeof(float), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(gC_angle_offset, offset, nth*sizeof(float), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(gC_angle_detsize, detsizes, nth*sizeof(float), 0, cudaMemcpyHostToDevice);
}
bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
float* D_projData, unsigned int projPitch,
- const SDimensions& dims, const float* angles,
- const float* TOffsets, float outputScale)
+ const SDimensions& dims, const SParProjection* angles,
+ float outputScale)
{
- // TODO: load angles into constant memory in smaller blocks
assert(dims.iProjAngles <= g_MaxAngles);
+ assert(angles);
+
cudaArray* D_dataArray;
bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);
- cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
- if (TOffsets) {
- cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
- } else {
- if (!g_pfZeroes) {
- g_pfZeroes = new float[g_MaxAngles];
- memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float));
- }
- cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
- }
+ convertAndUploadAngles(angles, dims.iProjAngles, dims.iProjDets);
+
dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // detector block size, angles
@@ -492,7 +324,7 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
// Maybe we should detect corner cases and put them in the optimal
// group of angles.
if (a != dims.iProjAngles)
- vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a])));
+ vertical = (fabsf(angles[a].fRayX) <= fabsf(angles[a].fRayY));
if (a == dims.iProjAngles || vertical != blockVertical) {
// block done
@@ -536,8 +368,8 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
bool FP_simple(float* D_volumeData, unsigned int volumePitch,
float* D_projData, unsigned int projPitch,
- const SDimensions& dims, const float* angles,
- const float* TOffsets, float outputScale)
+ const SDimensions& dims, const SParProjection* angles,
+ float outputScale)
{
for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) {
SDimensions subdims = dims;
@@ -550,7 +382,7 @@ bool FP_simple(float* D_volumeData, unsigned int volumePitch,
ret = FP_simple_internal(D_volumeData, volumePitch,
D_projData + iAngle * projPitch, projPitch,
subdims, angles + iAngle,
- TOffsets ? TOffsets + iAngle : 0, outputScale);
+ outputScale);
if (!ret)
return false;
}
@@ -559,106 +391,12 @@ bool FP_simple(float* D_volumeData, unsigned int volumePitch,
bool FP(float* D_volumeData, unsigned int volumePitch,
float* D_projData, unsigned int projPitch,
- const SDimensions& dims, const float* angles,
- const float* TOffsets, float outputScale)
+ const SDimensions& dims, const SParProjection* angles,
+ float outputScale)
{
return FP_simple(D_volumeData, volumePitch, D_projData, projPitch,
- dims, angles, TOffsets, outputScale);
-
- // TODO: Fix bug in this non-simple FP with large detscale and TOffsets
-#if 0
-
- // TODO: load angles into constant memory in smaller blocks
- assert(dims.iProjAngles <= g_MaxAngles);
-
- // TODO: compute region size dynamically to resolve these two assumptions
- // ASSUMPTION: 16 > regionOffset / fDetScale
- const unsigned int g_blockSliceSize = g_detBlockSize;
- assert(16 > (g_blockSlices / g_blockSliceSize) / dims.fDetScale);
- // ASSUMPTION: fDetScale >= 1.0f
- assert(dims.fDetScale > 0.9999f);
-
- cudaArray* D_dataArray;
- bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);
-
- cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
-
- if (TOffsets) {
- cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
- } else {
- if (!g_pfZeroes) {
- g_pfZeroes = new float[g_MaxAngles];
- memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float));
- }
- cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
- }
-
- int regionOffset = g_blockSlices / g_blockSliceSize;
-
- dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles
-
- std::list<cudaStream_t> streams;
-
+ dims, angles, outputScale);
- // Run over all angles, grouping them into groups of the same
- // orientation (roughly horizontal vs. roughly vertical).
- // Start a stream of grids for each such group.
-
- // TODO: Check if it's worth it to store this info instead
- // of recomputing it every FP.
-
- unsigned int blockStart = 0;
- unsigned int blockEnd = 0;
- bool blockVertical = false;
- for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
- bool vertical;
- // TODO: Having <= instead of < below causes a 5% speedup.
- // Maybe we should detect corner cases and put them in the optimal
- // group of angles.
- if (a != dims.iProjAngles)
- vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a])));
- if (a == dims.iProjAngles || vertical != blockVertical) {
- // block done
-
- blockEnd = a;
- if (blockStart != blockEnd) {
- unsigned int length = dims.iVolHeight;
- if (blockVertical)
- length = dims.iVolWidth;
- dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
- (length+g_blockSliceSize-1)/g_blockSliceSize+2*regionOffset); // angle blocks, regions
- // TODO: check if we can't immediately
- // destroy the stream after use
- cudaStream_t stream;
- cudaStreamCreate(&stream);
- streams.push_back(stream);
- //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical);
- if (!blockVertical)
- for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
- FPhorizontal<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale);
- else
- for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
- FPvertical<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale);
- }
- blockVertical = vertical;
- blockStart = a;
- }
- }
-
- for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
- cudaStreamDestroy(*iter);
-
- streams.clear();
-
- cudaThreadSynchronize();
-
- cudaTextForceKernelsCompletion();
-
- cudaFreeArray(D_dataArray);
-
-
- return true;
-#endif
}