diff options
Diffstat (limited to 'cuda/3d/fdk.cu')
-rw-r--r-- | cuda/3d/fdk.cu | 289 |
1 files changed, 103 insertions, 186 deletions
diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu index 4899ad1..bbac0be 100644 --- a/cuda/3d/fdk.cu +++ b/cuda/3d/fdk.cu @@ -41,174 +41,26 @@ $Id$ #include "dims3d.h" #include "arith3d.h" +#include "cone_bp.h" #include "../2d/fft.h" -typedef texture<float, 3, cudaReadModeElementType> texture3D; - -static texture3D gT_coneProjTexture; +#include "../../include/astra/Logging.h" namespace astraCUDA3d { -static const unsigned int g_volBlockZ = 16; - -static const unsigned int g_anglesPerBlock = 64; -static const unsigned int g_volBlockX = 32; -static const unsigned int g_volBlockY = 16; - static const unsigned int g_anglesPerWeightBlock = 16; static const unsigned int g_detBlockU = 32; static const unsigned int g_detBlockV = 32; -static const unsigned g_MaxAngles = 2048; +static const unsigned g_MaxAngles = 12000; -__constant__ float gC_angle_sin[g_MaxAngles]; -__constant__ float gC_angle_cos[g_MaxAngles]; __constant__ float gC_angle[g_MaxAngles]; // per-detector u/v shifts? -static bool bindProjDataTexture(const cudaArray* array) -{ - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); - - gT_coneProjTexture.addressMode[0] = cudaAddressModeBorder; - gT_coneProjTexture.addressMode[1] = cudaAddressModeBorder; - gT_coneProjTexture.addressMode[2] = cudaAddressModeBorder; - gT_coneProjTexture.filterMode = cudaFilterModeLinear; - gT_coneProjTexture.normalized = false; - - cudaBindTextureToArray(gT_coneProjTexture, array, channelDesc); - // TODO: error value? - - return true; -} - - -__global__ void devBP_FDK(void* D_volData, unsigned int volPitch, int startAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fInvDetUSize, float fInvDetVSize, const SDimensions3D dims) -{ - float* volData = (float*)D_volData; - - int endAngle = startAngle + g_anglesPerBlock; - if (endAngle > dims.iProjAngles) - endAngle = dims.iProjAngles; - - // threadIdx: x = rel x - // y = rel y - - // blockIdx: x = x + y - // y = z - - - // TO TRY: precompute part of detector intersection formulas in shared mem? - // TO TRY: inner loop over z, gather ray values in shared mem - - const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; - const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; - - if (X > dims.iVolX) - return; - if (Y > dims.iVolY) - return; - - const int startZ = blockIdx.y * g_volBlockZ; - int endZ = startZ + g_volBlockZ; - if (endZ > dims.iVolZ) - endZ = dims.iVolZ; - - float fX = X - 0.5f*dims.iVolX + 0.5f; - float fY = Y - 0.5f*dims.iVolY + 0.5f; - float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - fSrcZ; - - const float fU_base = 0.5f*dims.iProjU - 0.5f + 0.5f; - const float fV_base = 0.5f*dims.iProjV - 0.5f + 0.5f + (fDetZ-fSrcZ); - - // Note re. fZ/rV_base: the computations below are all relative to the - // optical axis, so we do the Z-adjustments beforehand. - - for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) - { - - float fVal = 0.0f; - float fAngle = startAngle + 0.5f; - - for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) - { - - const float cos_theta = gC_angle_cos[angle]; - const float sin_theta = gC_angle_sin[angle]; - - const float fR = fSrcOrigin; - const float fD = fR - fX * sin_theta + fY * cos_theta; - float fWeight = fR / fD; - fWeight *= fWeight; - - const float fScaleFactor = (fR + fDetOrigin) / fD; - const float fU = fU_base + (fX*cos_theta+fY*sin_theta) * fScaleFactor * fInvDetUSize; - const float fV = fV_base + fZ * fScaleFactor * fInvDetVSize; - - fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV); - - } - - volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal; -// projData[(angle*dims.iProjV+detectorV)*projPitch+detectorU] = 10.0f; -// if (threadIdx.x == 0 && threadIdx.y == 0) { printf("%d,%d,%d [%d / %d] -> %f\n", angle, detectorU, detectorV, (angle*dims.iProjV+detectorV)*projPitch+detectorU, projPitch, projData[(angle*dims.iProjV+detectorV)*projPitch+detectorU]); } - } - -} - - -bool FDK_BP(cudaPitchedPtr D_volumeData, - cudaPitchedPtr D_projData, - float fSrcOrigin, float fDetOrigin, - float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, - const SDimensions3D& dims, const float* angles) -{ - // transfer projections to array - - cudaArray* cuArray = allocateProjectionArray(dims); - transferProjectionsToArray(D_projData, cuArray, dims); - - bindProjDataTexture(cuArray); - - float* angle_sin = new float[dims.iProjAngles]; - float* angle_cos = new float[dims.iProjAngles]; - - for (unsigned int i = 0; i < dims.iProjAngles; ++i) { - angle_sin[i] = sinf(angles[i]); - angle_cos[i] = cosf(angles[i]); - } - cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_sin, angle_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); - cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_cos, angle_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); - assert(e1 == cudaSuccess); - assert(e2 == cudaSuccess); - - delete[] angle_sin; - delete[] angle_cos; - - dim3 dimBlock(g_volBlockX, g_volBlockY); - - dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ); - - // timeval t; - // tic(t); - - for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { - devBP_FDK<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, 1.0f / fDetUSize, 1.0f / fDetVSize, dims); - } - - cudaTextForceKernelsCompletion(); - - cudaFreeArray(cuArray); - - // printf("%f\n", toc(t)); - - return true; -} - -__global__ void devFDK_preweight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, const SDimensions3D dims) +__global__ void devFDK_preweight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fZShift, float fDetUSize, float fDetVSize, const SDimensions3D dims) { float* projData = (float*)D_projData; int angle = startAngle + blockIdx.y * g_anglesPerWeightBlock + threadIdx.y; @@ -230,21 +82,26 @@ __global__ void devFDK_preweight(void* D_projData, unsigned int projPitch, unsig const float fT = fCentralRayLength * fCentralRayLength + fU * fU; - float fV = (startDetectorV - 0.5f*dims.iProjV + 0.5f) * fDetVSize + fDetZ - fSrcZ; + float fV = (startDetectorV - 0.5f*dims.iProjV + 0.5f) * fDetVSize + fZShift; + + //const float fW = fCentralRayLength; + //const float fW = fCentralRayLength * (M_PI / 2.0f) / (float)dims.iProjAngles; + const float fW1 = fSrcOrigin * fDetUSize * fDetVSize; + const float fW = fCentralRayLength * fW1 * fW1 * (M_PI / 2.0f) / (float)dims.iProjAngles; for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) { const float fRayLength = sqrtf(fT + fV * fV); - const float fWeight = fCentralRayLength / fRayLength; + const float fWeight = fW / fRayLength; projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] *= fWeight; - fV += 1.0f; + fV += fDetVSize; } } -__global__ void devFDK_ParkerWeight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fDetUSize, float fCentralFanAngle, const SDimensions3D dims) +__global__ void devFDK_ParkerWeight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fDetUSize, float fCentralFanAngle, const SDimensions3D dims) { float* projData = (float*)D_projData; int angle = startAngle + blockIdx.y * g_anglesPerWeightBlock + threadIdx.y; @@ -305,7 +162,7 @@ __global__ void devFDK_ParkerWeight(void* D_projData, unsigned int projPitch, un // Perform the FDK pre-weighting and filtering bool FDK_PreWeight(cudaPitchedPtr D_projData, float fSrcOrigin, float fDetOrigin, - float fSrcZ, float fDetZ, + float fZShift, float fDetUSize, float fDetVSize, bool bShortScan, const SDimensions3D& dims, const float* angles) { @@ -318,23 +175,57 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData, int projPitch = D_projData.pitch/sizeof(float); - devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, fDetUSize, fDetVSize, dims); + devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fZShift, fDetUSize, fDetVSize, dims); cudaTextForceKernelsCompletion(); - if (bShortScan) { + if (bShortScan && dims.iProjAngles > 1) { + ASTRA_DEBUG("Doing Parker weighting"); // We do short-scan Parker weighting - cudaError_t e1 = cudaMemcpyToSymbol(gC_angle, angles, + // First, determine (in a very basic way) the interval that's + // been scanned. We assume angles[0] is one of the endpoints of the + // range. + float fdA = angles[1] - angles[0]; + + while (fdA < -M_PI) + fdA += 2*M_PI; + while (fdA >= M_PI) + fdA -= 2*M_PI; + + float fAngleBase; + if (fdA >= 0.0f) { + // going up from angles[0] + fAngleBase = angles[dims.iProjAngles - 1]; + } else { + // going down from angles[0] + fAngleBase = angles[0]; + } + + // We pick the highest end of the range, and then + // move all angles so they fall in (-2pi,0] + + float *fRelAngles = new float[dims.iProjAngles]; + for (unsigned int i = 0; i < dims.iProjAngles; ++i) { + float f = angles[i] - fAngleBase; + while (f > 0) + f -= 2*M_PI; + while (f <= -2*M_PI) + f += 2*M_PI; + fRelAngles[i] = f; + + } + + cudaError_t e1 = cudaMemcpyToSymbol(gC_angle, fRelAngles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); assert(!e1); + delete[] fRelAngles; - // TODO: detector pixel size! - float fCentralFanAngle = atanf((dims.iProjU*0.5f) / + float fCentralFanAngle = atanf(fDetUSize * (dims.iProjU*0.5f) / (fSrcOrigin + fDetOrigin)); - devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, fDetUSize, fCentralFanAngle, dims); + devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fDetUSize, fCentralFanAngle, dims); } @@ -344,10 +235,7 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData, bool FDK_Filter(cudaPitchedPtr D_projData, cufftComplex * D_filter, - float fSrcOrigin, float fDetOrigin, - float fSrcZ, float fDetZ, - float fDetUSize, float fDetVSize, bool bShortScan, - const SDimensions3D& dims, const float* angles) + const SDimensions3D& dims) { // The filtering is a regular ramp filter per detector line. @@ -392,10 +280,9 @@ bool FDK_Filter(cudaPitchedPtr D_projData, bool FDK(cudaPitchedPtr D_volumeData, cudaPitchedPtr D_projData, - float fSrcOrigin, float fDetOrigin, - float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, - const SDimensions3D& dims, const float* angles, bool bShortScan, - const float* filter) + const SConeProjection* angles, + const SDimensions3D& dims, SProjectorParams3D params, bool bShortScan, + const float* pfFilter) { bool ok; // Generate filter @@ -404,20 +291,53 @@ bool FDK(cudaPitchedPtr D_volumeData, int iPaddedDetCount = calcNextPowerOfTwo(2 * dims.iProjU); int iHalfFFTSize = calcFFTFourSize(iPaddedDetCount); + + // NB: We don't support arbitrary cone_vec geometries here. + // Only those that are vertical sub-geometries + // (cf. CompositeGeometryManager) of regular cone geometries. + assert(dims.iProjAngles > 0); + const SConeProjection& p0 = angles[0]; + + // assuming U is in the XY plane, V is parallel to Z axis + float fDetCX = p0.fDetSX + 0.5*dims.iProjU*p0.fDetUX; + float fDetCY = p0.fDetSY + 0.5*dims.iProjU*p0.fDetUY; + float fDetCZ = p0.fDetSZ + 0.5*dims.iProjV*p0.fDetVZ; + + float fSrcOrigin = sqrt(p0.fSrcX*p0.fSrcX + p0.fSrcY*p0.fSrcY); + float fDetOrigin = sqrt(fDetCX*fDetCX + fDetCY*fDetCY); + float fDetUSize = sqrt(p0.fDetUX*p0.fDetUX + p0.fDetUY*p0.fDetUY); + float fDetVSize = abs(p0.fDetVZ); + + float fZShift = fDetCZ - p0.fSrcZ; + + float *pfAngles = new float[dims.iProjAngles]; + for (unsigned int i = 0; i < dims.iProjAngles; ++i) { + // FIXME: Sign/order + pfAngles[i] = -atan2(angles[i].fSrcX, angles[i].fSrcY) + M_PI; + } + + +#if 1 ok = FDK_PreWeight(D_projData, fSrcOrigin, fDetOrigin, - fSrcZ, fDetZ, fDetUSize, fDetVSize, - bShortScan, dims, angles); + fZShift, fDetUSize, fDetVSize, + bShortScan, dims, pfAngles); +#else + ok = true; +#endif + delete[] pfAngles; + if (!ok) return false; +#if 1 cufftComplex *pHostFilter = new cufftComplex[dims.iProjAngles * iHalfFFTSize]; memset(pHostFilter, 0, sizeof(cufftComplex) * dims.iProjAngles * iHalfFFTSize); - if (filter==NULL){ + if (pfFilter == 0){ genFilter(FILTER_RAMLAK, 1.0f, dims.iProjAngles, pHostFilter, iPaddedDetCount, iHalfFFTSize); - }else{ - for(int i=0;i<dims.iProjAngles * iHalfFFTSize;i++){ - pHostFilter[i].x = filter[i]; + } else { + for (int i = 0; i < dims.iProjAngles * iHalfFFTSize; i++) { + pHostFilter[i].x = pfFilter[i]; pHostFilter[i].y = 0; } } @@ -433,28 +353,25 @@ bool FDK(cudaPitchedPtr D_volumeData, - ok = FDK_Filter(D_projData, D_filter, fSrcOrigin, fDetOrigin, - fSrcZ, fDetZ, fDetUSize, fDetVSize, - bShortScan, dims, angles); + ok = FDK_Filter(D_projData, D_filter, dims); // Clean up filter freeComplexOnDevice(D_filter); - +#endif if (!ok) return false; // Perform BP - ok = FDK_BP(D_volumeData, D_projData, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, - fDetUSize, fDetVSize, dims, angles); + params.bFDKWeighting = true; + + //ok = FDK_BP(D_volumeData, D_projData, fSrcOrigin, fDetOrigin, 0.0f, 0.0f, fDetUSize, fDetVSize, dims, pfAngles); + ok = ConeBP(D_volumeData, D_projData, dims, angles, params); if (!ok) return false; - processVol3D<opMul>(D_volumeData, - (M_PI / 2.0f) / (float)dims.iProjAngles, dims); - return true; } |