From 7ce0b7cca179e903e8011cd96c9910cbdf62ae00 Mon Sep 17 00:00:00 2001 From: Willem Jan Palenstijn Date: Wed, 16 Apr 2014 11:13:01 +0000 Subject: Remove padding in 3D cuda in favour of Border mode --- cuda/3d/arith3d.cu | 90 +++++++++++++++++++---------------------- cuda/3d/cone_bp.cu | 14 +++---- cuda/3d/cone_fp.cu | 18 ++++----- cuda/3d/fdk.cu | 10 ++--- cuda/3d/par3d_bp.cu | 14 +++---- cuda/3d/par3d_fp.cu | 38 +++++++++--------- cuda/3d/util3d.cu | 113 ++++------------------------------------------------ 7 files changed, 96 insertions(+), 201 deletions(-) (limited to 'cuda') diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu index 9a19be0..7cb56f6 100644 --- a/cuda/3d/arith3d.cu +++ b/cuda/3d/arith3d.cu @@ -99,14 +99,14 @@ struct opClampMax { -template +template __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -116,14 +116,14 @@ __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, uns } } -template +template __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -134,14 +134,14 @@ __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned } -template +template __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -151,14 +151,14 @@ __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, uns } } -template +template __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -168,14 +168,14 @@ __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned } } -template +template __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -185,14 +185,14 @@ __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, u } } -template +template __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -210,7 +210,7 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, -template +template void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -218,12 +218,12 @@ void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsign float *pfOut = (float*)out; - devtoD<<>>(pfOut, pitch, width, height); + devtoD<<>>(pfOut, pitch, width, height); cudaTextForceKernelsCompletion(); } -template +template void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -231,12 +231,12 @@ void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int float *pfOut = (float*)out; - devFtoD<<>>(pfOut, fParam, pitch, width, height); + devFtoD<<>>(pfOut, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template +template void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -245,12 +245,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, uns float *pfOut = (float*)out; const float *pfIn = (const float*)in; - devDtoD<<>>(pfOut, pfIn, pitch, width, height); + devDtoD<<>>(pfOut, pfIn, pitch, width, height); cudaTextForceKernelsCompletion(); } -template +template void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -259,12 +259,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned float *pfOut = (float*)out; const float *pfIn = (const float*)in; - devDFtoD<<>>(pfOut, pfIn, fParam, pitch, width, height); + devDFtoD<<>>(pfOut, pfIn, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template +template void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -274,12 +274,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2 const float *pfIn1 = (const float*)in1; const float *pfIn2 = (const float*)in2; - devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); + devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template +template void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -289,7 +289,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2 const float *pfIn1 = (const float*)in1; const float *pfIn2 = (const float*)in2; - devDDtoD<<>>(pfOut, pfIn1, pfIn2, pitch, width, height); + devDDtoD<<>>(pfOut, pfIn1, pfIn2, pitch, width, height); cudaTextForceKernelsCompletion(); } @@ -319,7 +319,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devtoD<<>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devtoD<<>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; } @@ -335,7 +335,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devFtoD<<>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devFtoD<<>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; } @@ -352,7 +352,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDtoD<<>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDtoD<<>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn += step; } @@ -370,7 +370,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDFtoD<<>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDFtoD<<>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn += step; } @@ -389,7 +389,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -409,7 +409,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDDtoD<<>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDDtoD<<>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -439,7 +439,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devtoD<<>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devtoD<<>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; } @@ -455,7 +455,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devFtoD<<>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devFtoD<<>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; } @@ -472,7 +472,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDtoD<<>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDtoD<<>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn += step; } @@ -490,7 +490,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDFtoD<<>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDFtoD<<>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn += step; } @@ -509,7 +509,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -529,7 +529,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDDtoD<<>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDDtoD<<>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -556,39 +556,33 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit #define INST_DFtoD(name) \ - template void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); #define INST_DtoD(name) \ - template void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); #define INST_DDtoD(name) \ - template void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); #define INST_DDFtoD(name) \ - template void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); #define INST_toD(name) \ - template void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims); #define INST_FtoD(name) \ - template void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu index 7f8e320..1b0154d 100644 --- a/cuda/3d/cone_bp.cu +++ b/cuda/3d/cone_bp.cu @@ -73,9 +73,9 @@ bool bindProjDataTexture(const cudaArray* array) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp; - gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp; - gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp; + gT_coneProjTexture.addressMode[0] = cudaAddressModeBorder; + gT_coneProjTexture.addressMode[1] = cudaAddressModeBorder; + gT_coneProjTexture.addressMode[2] = cudaAddressModeBorder; gT_coneProjTexture.filterMode = cudaFilterModeLinear; gT_coneProjTexture.normalized = false; @@ -148,8 +148,8 @@ __global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAng const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz; const float fDen = fCdc + fX * fCdx + fY * fCdy + fZ * fCdz; - const float fU = fUNum / fDen + 1.0f; - const float fV = fVNum / fDen + 1.0f; + const float fU = fUNum / fDen; + const float fV = fVNum / fDen; fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV); @@ -230,8 +230,8 @@ __global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, int start const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz; const float fDen = fCdc + fXs * fCdx + fYs * fCdy + fZs * fCdz; - const float fU = fUNum / fDen + 1.0f; - const float fV = fVNum / fDen + 1.0f; + const float fU = fUNum / fDen; + const float fV = fVNum / fDen; fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV); diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu index 40dca4f..fa308b9 100644 --- a/cuda/3d/cone_fp.cu +++ b/cuda/3d/cone_fp.cu @@ -72,9 +72,9 @@ bool bindVolumeDataTexture(const cudaArray* array) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - gT_coneVolumeTexture.addressMode[0] = cudaAddressModeClamp; - gT_coneVolumeTexture.addressMode[1] = cudaAddressModeClamp; - gT_coneVolumeTexture.addressMode[2] = cudaAddressModeClamp; + gT_coneVolumeTexture.addressMode[0] = cudaAddressModeBorder; + gT_coneVolumeTexture.addressMode[1] = cudaAddressModeBorder; + gT_coneVolumeTexture.addressMode[2] = cudaAddressModeBorder; gT_coneVolumeTexture.filterMode = cudaFilterModeLinear; gT_coneVolumeTexture.normalized = false; @@ -142,9 +142,9 @@ bool bindVolumeDataTexture(const cudaArray* array) \ float fVal = 0.0f; \ \ - float f##c0 = startSlice + 1.5f; \ - float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \ - float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \ + float f##c0 = startSlice + 0.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f; \ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f; \ \ for (int s = startSlice; s < endSlice; ++s) \ { \ @@ -218,9 +218,9 @@ bool bindVolumeDataTexture(const cudaArray* array) \ float fVal = 0.0f; \ \ - float f##c0 = startSlice + 1.5f; \ - float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \ - float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \ + float f##c0 = startSlice + 0.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f; \ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f; \ \ for (int s = startSlice; s < endSlice; ++s) \ { \ diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu index d439d9b..883187d 100644 --- a/cuda/3d/fdk.cu +++ b/cuda/3d/fdk.cu @@ -71,9 +71,9 @@ static bool bindProjDataTexture(const cudaArray* array) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp; - gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp; - gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp; + gT_coneProjTexture.addressMode[0] = cudaAddressModeBorder; + gT_coneProjTexture.addressMode[1] = cudaAddressModeBorder; + gT_coneProjTexture.addressMode[2] = cudaAddressModeBorder; gT_coneProjTexture.filterMode = cudaFilterModeLinear; gT_coneProjTexture.normalized = false; @@ -120,8 +120,8 @@ __global__ void devBP_FDK(void* D_volData, unsigned int volPitch, int startAngle float fY = Y - 0.5f*dims.iVolY + 0.5f; float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - fSrcZ; - const float fU_base = 0.5f*dims.iProjU - 0.5f + 1.5f; - const float fV_base = 0.5f*dims.iProjV - 0.5f + 1.5f + (fDetZ-fSrcZ); + const float fU_base = 0.5f*dims.iProjU - 0.5f + 0.5f; + const float fV_base = 0.5f*dims.iProjV - 0.5f + 0.5f + (fDetZ-fSrcZ); // Note re. fZ/rV_base: the computations below are all relative to the // optical axis, so we do the Z-adjustments beforehand. diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu index 872b1eb..9bf9a9f 100644 --- a/cuda/3d/par3d_bp.cu +++ b/cuda/3d/par3d_bp.cu @@ -69,9 +69,9 @@ static bool bindProjDataTexture(const cudaArray* array) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - gT_par3DProjTexture.addressMode[0] = cudaAddressModeClamp; - gT_par3DProjTexture.addressMode[1] = cudaAddressModeClamp; - gT_par3DProjTexture.addressMode[2] = cudaAddressModeClamp; + gT_par3DProjTexture.addressMode[0] = cudaAddressModeBorder; + gT_par3DProjTexture.addressMode[1] = cudaAddressModeBorder; + gT_par3DProjTexture.addressMode[2] = cudaAddressModeBorder; gT_par3DProjTexture.filterMode = cudaFilterModeLinear; gT_par3DProjTexture.normalized = false; @@ -139,8 +139,8 @@ __global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAn const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz; const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz; - const float fU = fUNum + 1.0f; - const float fV = fVNum + 1.0f; + const float fU = fUNum; + const float fV = fVNum; fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order @@ -216,8 +216,8 @@ __global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int star const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz; const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz; - const float fU = fUNum + 1.0f; - const float fV = fVNum + 1.0f; + const float fU = fUNum; + const float fV = fVNum; fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order fZs += fSubStep; diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu index 6bf9037..e0f743c 100644 --- a/cuda/3d/par3d_fp.cu +++ b/cuda/3d/par3d_fp.cu @@ -72,9 +72,9 @@ static bool bindVolumeDataTexture(const cudaArray* array) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeClamp; - gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeClamp; - gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeClamp; + gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeBorder; + gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeBorder; + gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeBorder; gT_par3DVolumeTexture.filterMode = cudaFilterModeLinear; gT_par3DVolumeTexture.normalized = false; @@ -144,9 +144,9 @@ static bool bindVolumeDataTexture(const cudaArray* array) \ float fVal = 0.0f; \ \ - float f##c0 = startSlice + 1.5f; \ - float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ - float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + float f##c0 = startSlice + 0.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f;\ \ for (int s = startSlice; s < endSlice; ++s) \ { \ @@ -226,9 +226,9 @@ static bool bindVolumeDataTexture(const cudaArray* array) \ float fVal = 0.0f; \ \ - float f##c0 = startSlice + 1.5f; \ - float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ - float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + float f##c0 = startSlice + 0.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f;\ \ for (int s = startSlice; s < endSlice; ++s) \ { \ @@ -281,16 +281,16 @@ PAR3D_FP_SS_BODY(Z,X,Y) __device__ float dirWeights(float fX, float fN) { - if (fX <= 0.5f) // outside image on left + if (fX <= -0.5f) // outside image on left return 0.0f; - if (fX <= 1.5f) // half outside image on left - return (fX - 0.5f) * (fX - 0.5f); - if (fX <= fN + 0.5f) { // inside image - float t = fX - 0.5f - floorf(fX - 0.5f); + if (fX <= 0.5f) // half outside image on left + return (fX + 0.5f) * (fX + 0.5f); + if (fX <= fN - 0.5f) { // inside image + float t = fX + 0.5f - floorf(fX + 0.5f); return t*t + (1-t)*(1-t); } - if (fX <= fN + 1.5f) // half outside image on right - return (fN + 1.5f - fX) * (fN + 1.5f - fX); + if (fX <= fN + 0.5f) // half outside image on right + return (fN + 0.5f - fX) * (fN + 0.5f - fX); return 0.0f; // outside image on right } @@ -346,9 +346,9 @@ __device__ float dirWeights(float fX, float fN) { \ float fVal = 0.0f; \ \ - float f##c0 = startSlice + 1.5f; \ - float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ - float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + float f##c0 = startSlice + 0.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f;\ \ for (int s = startSlice; s < endSlice; ++s) \ { \ diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu index 6dc79c7..cf40fdc 100644 --- a/cuda/3d/util3d.cu +++ b/cuda/3d/util3d.cu @@ -292,16 +292,14 @@ bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, // TODO: Consider using a single array of size max(proj,volume) (per dim) // instead of allocating a new one each time -// TODO: Figure out a faster way of zeroing the padding? - cudaArray* allocateVolumeArray(const SDimensions3D& dims) { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); cudaArray* cuArray; cudaExtent extentA; - extentA.width = dims.iVolX+2; - extentA.height = dims.iVolY+2; - extentA.depth = dims.iVolZ+2; + extentA.width = dims.iVolX; + extentA.height = dims.iVolY; + extentA.depth = dims.iVolZ; cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA); if (err != cudaSuccess) { astraCUDA::reportCudaError(err); @@ -309,8 +307,6 @@ cudaArray* allocateVolumeArray(const SDimensions3D& dims) return 0; } - zeroVolumeArray(cuArray, dims); - return cuArray; } cudaArray* allocateProjectionArray(const SDimensions3D& dims) @@ -318,9 +314,9 @@ cudaArray* allocateProjectionArray(const SDimensions3D& dims) cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); cudaArray* cuArray; cudaExtent extentA; - extentA.width = dims.iProjU+2; + extentA.width = dims.iProjU; extentA.height = dims.iProjAngles; - extentA.depth = dims.iProjV+2; + extentA.depth = dims.iProjV; cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA); if (err != cudaSuccess) { @@ -329,101 +325,8 @@ cudaArray* allocateProjectionArray(const SDimensions3D& dims) return 0; } - zeroProjectionArray(cuArray, dims); - return cuArray; } -bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims) -{ - cudaPitchedPtr zeroBuf; - cudaExtent extentS; - extentS.width = sizeof(float)*(dims.iVolX+2); - extentS.height = dims.iVolY+2; - extentS.depth = 1; - - cudaExtent extentA; - extentA.width = dims.iVolX+2; - extentA.height = dims.iVolY+2; - extentA.depth = 1; - - - - cudaError err; - err = cudaMalloc3D(&zeroBuf, extentS); - ASTRA_CUDA_ASSERT(err); - err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iVolX+2), dims.iVolY+2); - ASTRA_CUDA_ASSERT(err); - - // zero array - for (unsigned int i = 0; i < dims.iVolZ+2; ++i) { - cudaMemcpy3DParms p; - cudaPos zp = {0, 0, 0}; - cudaPos dp = {0, 0, i}; - p.srcArray = 0; - p.srcPos = zp; - p.srcPtr = zeroBuf; - p.dstArray = array; - p.dstPtr.ptr = 0; - p.dstPtr.pitch = 0; - p.dstPtr.xsize = 0; - p.dstPtr.ysize = 0; - p.dstPos = dp; - p.extent = extentA; - p.kind = cudaMemcpyDeviceToDevice; - - err = cudaMemcpy3D(&p); - ASTRA_CUDA_ASSERT(err); - } - cudaFree(zeroBuf.ptr); - - // TODO: check errors - - return true; -} -bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims) -{ - cudaPitchedPtr zeroBuf; - cudaExtent extentS; - extentS.width = sizeof(float)*(dims.iProjU+2); - extentS.height = dims.iProjAngles; - extentS.depth = 1; - cudaExtent extentA; - extentA.width = dims.iProjU+2; - extentA.height = dims.iProjAngles; - extentA.depth = 1; - - - cudaError err; - err = cudaMalloc3D(&zeroBuf, extentS); - ASTRA_CUDA_ASSERT(err); - err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iProjU+2), dims.iProjAngles); - ASTRA_CUDA_ASSERT(err); - - for (unsigned int i = 0; i < dims.iProjV+2; ++i) { - cudaMemcpy3DParms p; - cudaPos zp = {0, 0, 0}; - cudaPos dp = {0, 0, i}; - p.srcArray = 0; - p.srcPos = zp; - p.srcPtr = zeroBuf; - p.dstArray = array; - p.dstPtr.ptr = 0; - p.dstPtr.pitch = 0; - p.dstPtr.xsize = 0; - p.dstPtr.ysize = 0; - p.dstPos = dp; - p.extent = extentA; - p.kind = cudaMemcpyDeviceToDevice; - - err = cudaMemcpy3D(&p); - ASTRA_CUDA_ASSERT(err); - } - cudaFree(zeroBuf.ptr); - - // TODO: check errors - return true; -} - bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims) { @@ -434,7 +337,6 @@ bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const cudaMemcpy3DParms p; cudaPos zp = {0, 0, 0}; - cudaPos dp = {1, 1, 1}; p.srcArray = 0; p.srcPos = zp; p.srcPtr = D_volumeData; @@ -443,7 +345,7 @@ bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const p.dstPtr.pitch = 0; p.dstPtr.xsize = 0; p.dstPtr.ysize = 0; - p.dstPos = dp; + p.dstPos = zp; p.extent = extentA; p.kind = cudaMemcpyDeviceToDevice; @@ -462,7 +364,6 @@ bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, con cudaMemcpy3DParms p; cudaPos zp = {0, 0, 0}; - cudaPos dp = {1, 0, 1}; p.srcArray = 0; p.srcPos = zp; p.srcPtr = D_projData; @@ -471,7 +372,7 @@ bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, con p.dstPtr.pitch = 0; p.dstPtr.xsize = 0; p.dstPtr.ysize = 0; - p.dstPos = dp; + p.dstPos = zp; p.extent = extentA; p.kind = cudaMemcpyDeviceToDevice; -- cgit v1.2.3