summary refs log tree commit diff stats
path: root/cuda/2d/arith.cu
diff options
context:
space:
mode:
author Willem Jan Palenstijn <WillemJan.Palenstijn@uantwerpen.be> 2014-04-16 11:12:55 +0000
committer wpalenst <WillemJan.Palenstijn@uantwerpen.be> 2014-04-16 11:12:55 +0000
commit 3a6769465bee7d56d0ddff36613b886446421e07 (patch)
tree 624e85c5d6a4ab19c958a388e3436219693a6296 /cuda/2d/arith.cu
parent 4dfb881ceb82b07630437e952dec62323977ab56 (diff)
download astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.gz
astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.bz2
astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.xz
astra-3a6769465bee7d56d0ddff36613b886446421e07.zip
Remove padding in 2D cuda in favour of Border mode
Diffstat (limited to 'cuda/2d/arith.cu')
-rw-r--r-- cuda/2d/arith.cu 182
1 files changed, 84 insertions, 98 deletions
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu
index 1ee02ca..42c2c98 100644
--- a/cuda/2d/arith.cu
+++ b/cuda/2d/arith.cu
@@ -144,14 +144,14 @@ struct opMulMask {
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -161,14 +161,14 @@ __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, uns
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -178,14 +178,14 @@ __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -197,14 +197,14 @@ __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fPa
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -214,14 +214,14 @@ __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, uns
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -231,14 +231,14 @@ __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -248,14 +248,14 @@ __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, u
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -280,51 +280,51 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2,
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, unsigned int width, unsigned int height)
{
float* D_out;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- processVol<op, t>(D_out, pitch, width, height);
+ processVol<op>(D_out, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
cudaFree(D_out);
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, float param, unsigned int width, unsigned int height)
{
float* D_out;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- processVol<op, t>(D_out, param, pitch, width, height);
+ processVol<op>(D_out, param, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
cudaFree(D_out);
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height)
{
float* D_out1;
float* D_out2;
unsigned int pitch;
- allocateVolume(D_out1, width+2, height+2, pitch);
+ allocateVolume(D_out1, width, height, pitch);
copyVolumeToDevice(out1, width, width, height, D_out1, pitch);
- allocateVolume(D_out2, width+2, height+2, pitch);
+ allocateVolume(D_out2, width, height, pitch);
copyVolumeToDevice(out2, width, width, height, D_out2, pitch);
- processVol<op, t>(D_out1, D_out2, param1, param2, pitch, width, height);
+ processVol<op>(D_out1, D_out2, param1, param2, pitch, width, height);
copyVolumeFromDevice(out1, width, width, height, D_out1, pitch);
copyVolumeFromDevice(out2, width, width, height, D_out2, pitch);
@@ -334,19 +334,19 @@ void processVolCopy(float* out1, float* out2, float param1, float param2, unsign
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height)
{
float* D_out;
float* D_in;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- allocateVolume(D_in, width+2, height+2, pitch);
+ allocateVolume(D_in, width, height, pitch);
copyVolumeToDevice(in, width, width, height, D_in, pitch);
- processVol<op, t>(D_out, D_in, pitch, width, height);
+ processVol<op>(D_out, D_in, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
@@ -354,19 +354,19 @@ void processVolCopy(float* out, const float* in, unsigned int width, unsigned in
cudaFree(D_in);
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height)
{
float* D_out;
float* D_in;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- allocateVolume(D_in, width+2, height+2, pitch);
+ allocateVolume(D_in, width, height, pitch);
copyVolumeToDevice(in, width, width, height, D_in, pitch);
- processVol<op, t>(D_out, D_in, param, pitch, width, height);
+ processVol<op>(D_out, D_in, param, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
@@ -374,7 +374,7 @@ void processVolCopy(float* out, const float* in, float param, unsigned int width
cudaFree(D_in);
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height)
{
float* D_out;
@@ -382,14 +382,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int
float* D_in2;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- allocateVolume(D_in1, width+2, height+2, pitch);
+ allocateVolume(D_in1, width, height, pitch);
copyVolumeToDevice(in1, width, width, height, D_in1, pitch);
- allocateVolume(D_in2, width+2, height+2, pitch);
+ allocateVolume(D_in2, width, height, pitch);
copyVolumeToDevice(in2, width, width, height, D_in2, pitch);
- processVol<op, t>(D_out, D_in1, D_in2, pitch, width, height);
+ processVol<op>(D_out, D_in1, D_in2, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
@@ -398,7 +398,7 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int
cudaFree(D_in2);
}
-template<typename op, VolType t>
+template<typename op>
void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height)
{
float* D_out;
@@ -406,14 +406,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param,
float* D_in2;
unsigned int pitch;
- allocateVolume(D_out, width+2, height+2, pitch);
+ allocateVolume(D_out, width, height, pitch);
copyVolumeToDevice(out, width, width, height, D_out, pitch);
- allocateVolume(D_in1, width+2, height+2, pitch);
+ allocateVolume(D_in1, width, height, pitch);
copyVolumeToDevice(in1, width, width, height, D_in1, pitch);
- allocateVolume(D_in2, width+2, height+2, pitch);
+ allocateVolume(D_in2, width, height, pitch);
copyVolumeToDevice(in2, width, width, height, D_in2, pitch);
- processVol<op, t>(D_out, D_in1, D_in2, param, pitch, width, height);
+ processVol<op>(D_out, D_in1, D_in2, param, pitch, width, height);
copyVolumeFromDevice(out, width, width, height, D_out, pitch);
@@ -430,80 +430,80 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param,
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+511)/512);
- devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
+ devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
+ devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devFFtoDD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
+ devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
+ devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
+ devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
+ devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
dim3 gridSize((width+15)/16, (height+15)/16);
- devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
+ devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
cudaTextForceKernelsCompletion();
}
@@ -533,7 +533,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
}
@@ -549,7 +549,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
}
@@ -566,7 +566,7 @@ void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, flo
unsigned int step = out1.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut1 += step;
pfOut2 += step;
}
@@ -585,7 +585,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn += step;
}
@@ -603,7 +603,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn += step;
}
@@ -622,7 +622,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -642,7 +642,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -672,7 +672,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
}
@@ -688,7 +688,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
}
@@ -705,7 +705,7 @@ void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, fl
unsigned int step = out1.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut1 += step;
pfOut2 += step;
}
@@ -724,7 +724,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn += step;
}
@@ -742,7 +742,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn += step;
}
@@ -761,7 +761,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -781,7 +781,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -808,59 +808,45 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
#define INST_DFtoD(name) \
- template void processVolCopy<name, VOL>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
#define INST_DtoD(name) \
- template void processVolCopy<name, VOL>(float* out, const float* in, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, const float* in, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, const float* in, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
#define INST_DDtoD(name) \
- template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
#define INST_DDFtoD(name) \
- template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
#define INST_toD(name) \
- template void processVolCopy<name, VOL>(float* out, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims);
#define INST_FtoD(name) \
- template void processVolCopy<name, VOL>(float* out, float param, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out, float param, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out, float param, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims);
#define INST_FFtoDD(name) \
- template void processVolCopy<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
- template void processVolCopy<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
- template void processVol<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVolCopy<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
+ template void processVol<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims);