diff options
author | Willem Jan Palenstijn <WillemJan.Palenstijn@uantwerpen.be> | 2013-07-01 22:34:11 +0000 |
---|---|---|
committer | wpalenst <WillemJan.Palenstijn@uantwerpen.be> | 2013-07-01 22:34:11 +0000 |
commit | b2fc6c70434674d74551c3a6c01ffb3233499312 (patch) | |
tree | b17f080ebc504ab85ebb7c3d89f917fd87ce9e00 /cuda/2d/arith.cu | |
download | astra-b2fc6c70434674d74551c3a6c01ffb3233499312.tar.gz astra-b2fc6c70434674d74551c3a6c01ffb3233499312.tar.bz2 astra-b2fc6c70434674d74551c3a6c01ffb3233499312.tar.xz astra-b2fc6c70434674d74551c3a6c01ffb3233499312.zip |
Update version to 1.3
Diffstat (limited to 'cuda/2d/arith.cu')
-rw-r--r-- | cuda/2d/arith.cu | 893 |
1 files changed, 893 insertions, 0 deletions
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu new file mode 100644 index 0000000..1ee02ca --- /dev/null +++ b/cuda/2d/arith.cu @@ -0,0 +1,893 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "util.h" +#include "arith.h" +#include <cassert> + +namespace astraCUDA { + + +struct opAddScaled { + __device__ void operator()(float& out, const float in, const float inp) { + out += in * inp; + } +}; +struct opScaleAndAdd { + __device__ void operator()(float& out, const float in, const float inp) { + out = in + out * inp; + } +}; +struct opAddMulScaled { + __device__ void operator()(float& out, const float in1, const float in2, const float inp) { + out += in1 * in2 * inp; + } +}; +struct opAddMul { + __device__ void operator()(float& out, const float in1, const float in2) { + out += in1 * in2; + } +}; +struct opAdd { + __device__ void operator()(float& out, const float in) { + out += in; + } +}; +struct opAdd2 { + __device__ void operator()(float& out, const float in1, const float in2) { + out += in1 + in2; + } +}; +struct opMul { + __device__ void operator()(float& out, const float in) { + out *= in; + } +}; +struct opMul2 { + __device__ void operator()(float& out, const float in1, const float in2) { + out *= in1 * in2; + } +}; +struct opDividedBy { + __device__ void operator()(float& out, const float in) { + if (out > 0.000001f) // out is assumed to be positive + out = in / out; + else + out = 0.0f; + } +}; +struct opInvert { + __device__ void operator()(float& out) { + if (out > 0.000001f) // out is assumed to be positive + out = 1 / out; + else + out = 0.0f; + } +}; +struct opSet { + __device__ void operator()(float& out, const float inp) { + out = inp; + } +}; +struct opClampMin { + __device__ void operator()(float& out, const float inp) { + if (out < inp) + out = inp; + } +}; +struct opClampMax { + __device__ void operator()(float& out, const float inp) { + if (out > inp) + out = inp; + } +}; +struct opClampMinMask { + __device__ void operator()(float& out, const float in) { + if (out < in) + out = in; + } +}; +struct opClampMaxMask { + __device__ void operator()(float& out, const float in) { + if (out > in) + out = in; + } +}; +struct opSetMaskedValues { + __device__ void operator()(float& out, const float in, const float inp) { + if (!in) + out = inp; + } +}; +struct opSegmentAndMask { + __device__ void operator()(float& out1, float& out2, const float inp1, const float inp2) { + if (out1 >= inp1) { + out1 = inp2; + out2 = 0.0f; + } + + } + +}; +struct opMulMask { + __device__ void operator()(float& out, const float mask, const float in) { + if (mask > 0.0f) { + out *= in; + } + } +}; + + + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], fParam); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut1[off], pfOut2[off], fParam1, fParam2); + off += pitch; + y++; + } +} + + + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn[off], fParam); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn1[off], pfIn2[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn1[off], pfIn2[off], fParam); + off += pitch; + y++; + } +} + + + + + + + + + + + + + + + + +template<typename op, VolType t> +void processVolCopy(float* out, unsigned int width, unsigned int height) +{ + float* D_out; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + + processVol<op, t>(D_out, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); +} + +template<typename op, VolType t> +void processVolCopy(float* out, float param, unsigned int width, unsigned int height) +{ + float* D_out; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + + processVol<op, t>(D_out, param, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); +} + +template<typename op, VolType t> +void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height) +{ + float* D_out1; + float* D_out2; + + unsigned int pitch; + allocateVolume(D_out1, width+2, height+2, pitch); + copyVolumeToDevice(out1, width, width, height, D_out1, pitch); + allocateVolume(D_out2, width+2, height+2, pitch); + copyVolumeToDevice(out2, width, width, height, D_out2, pitch); + + processVol<op, t>(D_out1, D_out2, param1, param2, pitch, width, height); + + copyVolumeFromDevice(out1, width, width, height, D_out1, pitch); + copyVolumeFromDevice(out2, width, width, height, D_out2, pitch); + + cudaFree(D_out1); + cudaFree(D_out2); +} + + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in, width+2, height+2, pitch); + copyVolumeToDevice(in, width, width, height, D_in, pitch); + + processVol<op, t>(D_out, D_in, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in); +} + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in, width+2, height+2, pitch); + copyVolumeToDevice(in, width, width, height, D_in, pitch); + + processVol<op, t>(D_out, D_in, param, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in); +} + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in1; + float* D_in2; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in1, width+2, height+2, pitch); + copyVolumeToDevice(in1, width, width, height, D_in1, pitch); + allocateVolume(D_in2, width+2, height+2, pitch); + copyVolumeToDevice(in2, width, width, height, D_in2, pitch); + + processVol<op, t>(D_out, D_in1, D_in2, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in1); + cudaFree(D_in2); +} + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in1; + float* D_in2; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in1, width+2, height+2, pitch); + copyVolumeToDevice(in1, width, width, height, D_in1, pitch); + allocateVolume(D_in2, width+2, height+2, pitch); + copyVolumeToDevice(in2, width, width, height, D_in2, pitch); + + processVol<op, t>(D_out, D_in1, D_in2, param, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in1); + cudaFree(D_in2); +} + + + + + + + + + +template<typename op, VolType t> +void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+511)/512); + + devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devFFtoDD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + + + + + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut1 = (float*)out1.ptr; + float *pfOut2 = (float*)out2.ptr; + unsigned int step = out1.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut1 += step; + pfOut2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut1 = (float*)out1.ptr; + float *pfOut2 = (float*)out2.ptr; + unsigned int step = out1.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut1 += step; + pfOut2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + + + + + + +#define INST_DFtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); + +#define INST_DtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); + +#define INST_DDtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + +#define INST_DDFtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); + + +#define INST_toD(name) \ + template void processVolCopy<name, VOL>(float* out, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); + +#define INST_FtoD(name) \ + template void processVolCopy<name, VOL>(float* out, float param, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, float param, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); + +#define INST_FFtoDD(name) \ + template void processVolCopy<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); + + + +INST_DFtoD(opAddScaled) +INST_DFtoD(opScaleAndAdd) +INST_DDFtoD(opAddMulScaled) +INST_DDtoD(opAddMul) +INST_DDtoD(opMul2) +INST_DDtoD(opAdd2) +INST_DtoD(opMul) +INST_DDtoD(opMulMask) +INST_DtoD(opAdd) +INST_DtoD(opDividedBy) +INST_toD(opInvert) +INST_FtoD(opSet) +INST_FtoD(opMul) +INST_DFtoD(opMulMask) +INST_FtoD(opAdd) +INST_FtoD(opClampMin) +INST_FtoD(opClampMax) +INST_DtoD(opClampMinMask) +INST_DtoD(opClampMaxMask) + +// PDART-specific: +INST_DFtoD(opSetMaskedValues) +INST_FFtoDD(opSegmentAndMask) + +} |