From bcdf186ccdca54a3df383512ad5a117004500a60 Mon Sep 17 00:00:00 2001 From: Daniil Kazantsev Date: Mon, 26 Feb 2018 12:50:58 +0000 Subject: CPU-GPU naming consistency --- Core/regularizers_CPU/FGP_TV_core.c | 2 +- Core/regularizers_CPU/FGP_TV_core.h | 2 +- Core/regularizers_CPU/ROF_TV_core.c | 2 +- Core/regularizers_CPU/ROF_TV_core.h | 2 +- Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu | 561 --------------------- Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h | 10 - Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu | 561 +++++++++++++++++++++ Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h | 10 + Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu | 369 -------------- Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h | 8 - Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu | 369 ++++++++++++++ Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h | 8 + Wrappers/Python/src/cpu_regularizers.cpp | 546 ++++++++++---------- Wrappers/Python/src/gpu_regularizers.pyx | 69 ++- Wrappers/Python/test/test_cpu_vs_gpu.py | 10 + .../Python/test/test_cpu_vs_gpu_regularizers.py | 8 +- 16 files changed, 1305 insertions(+), 1232 deletions(-) delete mode 100755 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu delete mode 100755 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h create mode 100755 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu create mode 100755 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h delete mode 100755 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu delete mode 100755 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h create mode 100755 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu create mode 100755 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h create mode 100644 Wrappers/Python/test/test_cpu_vs_gpu.py diff --git a/Core/regularizers_CPU/FGP_TV_core.c b/Core/regularizers_CPU/FGP_TV_core.c index 304848d..2f1439d 100644 --- a/Core/regularizers_CPU/FGP_TV_core.c +++ b/Core/regularizers_CPU/FGP_TV_core.c @@ -37,7 +37,7 @@ limitations under the License. 
* [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" */ -float FGP_TV_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ) +float TV_FGP_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ) { int ll, j, DimTotal; float re, re1; diff --git a/Core/regularizers_CPU/FGP_TV_core.h b/Core/regularizers_CPU/FGP_TV_core.h index b591819..98ceaec 100644 --- a/Core/regularizers_CPU/FGP_TV_core.h +++ b/Core/regularizers_CPU/FGP_TV_core.h @@ -47,7 +47,7 @@ limitations under the License. #ifdef __cplusplus extern "C" { #endif -CCPI_EXPORT float FGP_TV_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); +CCPI_EXPORT float TV_FGP_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); CCPI_EXPORT float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dimX, int dimY); CCPI_EXPORT float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, int dimX, int dimY); diff --git a/Core/regularizers_CPU/ROF_TV_core.c b/Core/regularizers_CPU/ROF_TV_core.c index fd47c3f..b2c6f00 100644 --- a/Core/regularizers_CPU/ROF_TV_core.c +++ b/Core/regularizers_CPU/ROF_TV_core.c @@ -46,7 +46,7 @@ int sign(float x) { */ /* Running iterations of TV-ROF function */ -float TV_ROF(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda) +float TV_ROF_CPU(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda) { float *D1, *D2, *D3; int i, DimTotal; diff --git a/Core/regularizers_CPU/ROF_TV_core.h b/Core/regularizers_CPU/ROF_TV_core.h index 
5d69d27..b32d0d5 100644 --- a/Core/regularizers_CPU/ROF_TV_core.h +++ b/Core/regularizers_CPU/ROF_TV_core.h @@ -47,7 +47,7 @@ limitations under the License. extern "C" { #endif CCPI_EXPORT float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambda, float tau, int dimY, int dimX, int dimZ); -CCPI_EXPORT float TV_ROF(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda); +CCPI_EXPORT float TV_ROF_CPU(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda); CCPI_EXPORT float D1_func(float *A, float *D1, int dimY, int dimX, int dimZ); CCPI_EXPORT float D2_func(float *A, float *D2, int dimY, int dimX, int dimZ); CCPI_EXPORT float D3_func(float *A, float *D3, int dimY, int dimX, int dimZ); diff --git a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu deleted file mode 100755 index 21a95c9..0000000 --- a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu +++ /dev/null @@ -1,561 +0,0 @@ - /* -This work is part of the Core Imaging Library developed by -Visual Analytics and Imaging System Group of the Science Technology -Facilities Council, STFC - -Copyright 2017 Daniil Kazantsev -Copyright 2017 Srikanth Nagella, Edoardo Pasca - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#include "FGP_TV_GPU_core.h" -#include -#include - -/* CUDA implementation of FGP-TV [1] denoising/regularization model (2D/3D case) - * - * Input Parameters: - * 1. Noisy image/volume - * 2. lambda - regularization parameter - * 3. Number of iterations - * 4. eplsilon: tolerance constant - * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) - * 6. nonneg: 'nonnegativity (0 is OFF by default) - * 7. print information: 0 (off) or 1 (on) - * - * Output: - * [1] Filtered/regularized image - * - * This function is based on the Matlab's code and paper by - * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" - */ - -// This will output the proper CUDA error strings in the event that a CUDA host call returns an error -#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) - -inline void __checkCudaErrors(cudaError err, const char *file, const int line) -{ - if (cudaSuccess != err) - { - fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n", - file, line, (int)err, cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } -} - -#define BLKXSIZE2D 16 -#define BLKYSIZE2D 16 - -#define BLKXSIZE 8 -#define BLKYSIZE 8 -#define BLKZSIZE 8 - -#define idivup(a, b) ( ((a)%(b) != 0) ? 
(a)/(b)+1 : (a)/(b) ) -struct square { __host__ __device__ float operator()(float x) { return x * x; } }; - -/************************************************/ -/*****************2D modules*********************/ -/************************************************/ -__global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda) -{ - - float val1,val2; - - //calculate each thread global index - const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; - const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - - int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { - if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];} - if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];} - //Write final result to global memory - D[index] = Ad[index] - lambda*(R1[index] + R2[index] - val1 - val2); - } - return; -} - -__global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, int N, int M, int ImSize, float multip) -{ - - float val1,val2; - - //calculate each thread global index - const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; - const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - - int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { - - /* boundary conditions */ - if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex]; - if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)]; - - //Write final result to global memory - P1[index] = R1[index] + multip*val1; - P2[index] = R2[index] + multip*val2; - } - return; -} - -__global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize) -{ - - float denom; - //calculate each thread global index - const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; - const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - - int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { - denom = pow(P1[index],2) + 
pow(P2[index],2); - if (denom > 1.0f) { - P1[index] = P1[index]/sqrt(denom); - P2[index] = P2[index]/sqrt(denom); - } - } - return; -} -__global__ void Proj_func2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize) -{ - - float val1, val2; - //calculate each thread global index - const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; - const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - - int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { - val1 = abs(P1[index]); - val2 = abs(P2[index]); - if (val1 < 1.0f) {val1 = 1.0f;} - if (val2 < 1.0f) {val2 = 1.0f;} - P1[index] = P1[index]/val1; - P2[index] = P2[index]/val2; - } - return; -} -__global__ void Rupd_func2D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, float multip2, int N, int M, int ImSize) -{ - //calculate each thread global index - const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; - const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - - int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { - R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]); - R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); - } - return; -} -__global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total) -{ - int xIndex = blockDim.x * blockIdx.x + threadIdx.x; - int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - - int index = xIndex + N*yIndex; - - if (index < num_total) { - if (Output[index] < 0.0f) Output[index] = 0.0f; - } -} -__global__ void copy_kernel2D(float *Input, float* Output, int N, int M, int num_total) -{ - int xIndex = blockDim.x * blockIdx.x + threadIdx.x; - int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - - int index = xIndex + N*yIndex; - - if (index < num_total) { - Output[index] = Input[index]; - } -} -__global__ void ResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total) -{ - int xIndex = blockDim.x * blockIdx.x + threadIdx.x; - int yIndex 
= blockDim.y * blockIdx.y + threadIdx.y; - - int index = xIndex + N*yIndex; - - if (index < num_total) { - Output[index] = Input1[index] - Input2[index]; - } -} -/************************************************/ -/*****************3D modules*********************/ -/************************************************/ -__global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda) -{ - - float val1,val2,val3; - - //calculate each thread global index - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { - if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];} - if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];} - if (k <= 0) {val3 = 0.0f;} else {val3 = R3[(N*M)*(k-1) + i + N*j];} - //Write final result to global memory - D[index] = Ad[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3); - } - return; -} - -__global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float multip) -{ - - float val1,val2,val3; - - //calculate each thread global index - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { - /* boundary conditions */ - if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j]; - if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)]; - if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j]; - - //Write final result to global memory - P1[index] = R1[index] + multip*val1; - P2[index] = R2[index] + multip*val2; - P3[index] = R3[index] + multip*val3; - } - return; 
-} - -__global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) -{ - - float denom,sq_denom; - //calculate each thread global index - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { - denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2); - - if (denom > 1.0f) { - sq_denom = 1.0f/sqrt(denom); - P1[index] = P1[index]*sq_denom; - P2[index] = P2[index]*sq_denom; - P3[index] = P3[index]*sq_denom; - } - } - return; -} - -__global__ void Proj_func3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) -{ - - float val1, val2, val3; - //calculate each thread global index - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { - val1 = abs(P1[index]); - val2 = abs(P2[index]); - val3 = abs(P3[index]); - if (val1 < 1.0f) {val1 = 1.0f;} - if (val2 < 1.0f) {val2 = 1.0f;} - if (val3 < 1.0f) {val3 = 1.0f;} - P1[index] = P1[index]/val1; - P2[index] = P2[index]/val2; - P3[index] = P3[index]/val3; - } - return; -} - - -__global__ void Rupd_func3D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, float multip2, int N, int M, int Z, int ImSize) -{ - //calculate each thread global index - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { - R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]); - R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); - R3[index] = P3[index] + multip2*(P3[index] - 
P3_old[index]); - } - return; -} - -__global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_total) -{ - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (N*M)*k + i + N*j; - - if (index < num_total) { - if (Output[index] < 0.0f) Output[index] = 0.0f; - } -} - -__global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) -{ - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (N*M)*k + i + N*j; - - if (index < num_total) { - Output[index] = Input[index]; - } -} -/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ - -////////////MAIN HOST FUNCTION /////////////// -extern "C" void FGP_TV_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ) -{ - int deviceCount = -1; // number of devices - cudaGetDeviceCount(&deviceCount); - if (deviceCount == 0) { - fprintf(stderr, "No CUDA devices found\n"); - return; - } - - int count = 0, i; - float re, multip,multip2; - float tk = 1.0f; - float tkp1=1.0f; - - if (dimZ <= 1) { - /*2D verson*/ - int ImSize = dimX*dimY; - float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL; - - dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); - dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); - - /*allocate space for images on device*/ - checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); - if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); - 
checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) ); - - checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); - cudaMemset(P1, 0, ImSize*sizeof(float)); - cudaMemset(P2, 0, ImSize*sizeof(float)); - cudaMemset(P1_prev, 0, ImSize*sizeof(float)); - cudaMemset(P2_prev, 0, ImSize*sizeof(float)); - cudaMemset(R1, 0, ImSize*sizeof(float)); - cudaMemset(R2, 0, ImSize*sizeof(float)); - - /********************** Run CUDA 2D kernel here ********************/ - multip = (1.0f/(8.0f*lambda)); - - /* The main kernel */ - for (i = 0; i < iter; i++) { - - /* computing the gradient of the objective function */ - Obj_func2D_kernel<<>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambda); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - if (nonneg != 0) { - nonneg2D_kernel<<>>(d_update, dimX, dimY, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); } - - /*Taking a step towards minus of the gradient*/ - Grad_func2D_kernel<<>>(P1, P2, d_update, R1, R2, dimX, dimY, ImSize, multip); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - /* projection step */ - if (methodTV == 0) Proj_func2D_iso_kernel<<>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/ - else Proj_func2D_aniso_kernel<<>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; - multip2 = ((tk-1.0f)/tkp1); - - Rupd_func2D_kernel<<>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - if (epsil != 
0.0f) { - /* calculate norm - stopping rules using the Thrust library */ - ResidCalc2D_kernel<<>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - thrust::device_vector d_vec(P1_prev, P1_prev + ImSize); - float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus())); - thrust::device_vector d_vec2(d_update, d_update + ImSize); - float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus())); - - re = (reduction/reduction2); - if (re < epsil) count++; - if (count > 4) break; - - copy_kernel2D<<>>(d_update, d_update_prev, dimX, dimY, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - } - - copy_kernel2D<<>>(P1, P1_prev, dimX, dimY, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - copy_kernel2D<<>>(P2, P2_prev, dimX, dimY, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - tk = tkp1; - } - if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i); - /***************************************************************/ - //copy result matrix from device to host memory - cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); - - cudaFree(d_input); - cudaFree(d_update); - if (epsil != 0.0f) cudaFree(d_update_prev); - cudaFree(P1); - cudaFree(P2); - cudaFree(P1_prev); - cudaFree(P2_prev); - cudaFree(R1); - cudaFree(R2); - } - else { - /*3D verson*/ - int ImSize = dimX*dimY*dimZ; - float *d_input, *d_update=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; - - dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); - dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); - - /*allocate space for images on device*/ - 
checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&P3_prev,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&R3,ImSize*sizeof(float)) ); - - checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); - cudaMemset(P1, 0, ImSize*sizeof(float)); - cudaMemset(P2, 0, ImSize*sizeof(float)); - cudaMemset(P3, 0, ImSize*sizeof(float)); - cudaMemset(P1_prev, 0, ImSize*sizeof(float)); - cudaMemset(P2_prev, 0, ImSize*sizeof(float)); - cudaMemset(P3_prev, 0, ImSize*sizeof(float)); - cudaMemset(R1, 0, ImSize*sizeof(float)); - cudaMemset(R2, 0, ImSize*sizeof(float)); - cudaMemset(R3, 0, ImSize*sizeof(float)); - /********************** Run CUDA 3D kernel here ********************/ - multip = (1.0f/(8.0f*lambda)); - - /* The main kernel */ - for (i = 0; i < iter; i++) { - - /* computing the gradient of the objective function */ - Obj_func3D_kernel<<>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambda); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - if (nonneg != 0) { - nonneg3D_kernel<<>>(d_update, dimX, dimY, dimZ, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); } - - /*Taking a step towards minus of the gradient*/ - Grad_func3D_kernel<<>>(P1, P2, P3, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, multip); - checkCudaErrors( 
cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - /* projection step */ - if (methodTV == 0) Proj_func3D_iso_kernel<<>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */ - else Proj_func3D_aniso_kernel<<>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */ - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; - multip2 = ((tk-1.0f)/tkp1); - - Rupd_func3D_kernel<<>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - copy_kernel3D<<>>(P1, P1_prev, dimX, dimY, dimZ, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - copy_kernel3D<<>>(P2, P2_prev, dimX, dimY, dimZ, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - copy_kernel3D<<>>(P3, P3_prev, dimX, dimY, dimZ, ImSize); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - tk = tkp1; - } - if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i); - /***************************************************************/ - //copy result matrix from device to host memory - cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); - - cudaFree(d_input); - cudaFree(d_update); - cudaFree(P1); - cudaFree(P2); - cudaFree(P3); - cudaFree(P1_prev); - cudaFree(P2_prev); - cudaFree(P3_prev); - cudaFree(R1); - cudaFree(R2); - cudaFree(R3); - } - cudaDeviceReset(); -} diff --git a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h deleted file mode 100755 index a5d3f73..0000000 --- a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h +++ /dev/null @@ -1,10 +0,0 @@ -#include -#include -#include - -#ifndef _FGP_TV_GPU_ -#define _FGP_TV_GPU_ - -extern "C" void 
FGP_TV_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); - -#endif diff --git a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu new file mode 100755 index 0000000..0533a85 --- /dev/null +++ b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu @@ -0,0 +1,561 @@ + /* +This work is part of the Core Imaging Library developed by +Visual Analytics and Imaging System Group of the Science Technology +Facilities Council, STFC + +Copyright 2017 Daniil Kazantsev +Copyright 2017 Srikanth Nagella, Edoardo Pasca + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "TV_FGP_GPU_core.h" +#include +#include + +/* CUDA implementation of FGP-TV [1] denoising/regularization model (2D/3D case) + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambda - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 6. nonneg: 'nonnegativity (0 is OFF by default) + * 7. 
print information: 0 (off) or 1 (on) + * + * Output: + * [1] Filtered/regularized image + * + * This function is based on the Matlab's code and paper by + * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" + */ + +// This will output the proper CUDA error strings in the event that a CUDA host call returns an error +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) + +inline void __checkCudaErrors(cudaError err, const char *file, const int line) +{ + if (cudaSuccess != err) + { + fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n", + file, line, (int)err, cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +#define BLKXSIZE2D 16 +#define BLKYSIZE2D 16 + +#define BLKXSIZE 8 +#define BLKYSIZE 8 +#define BLKZSIZE 8 + +#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) +struct square { __host__ __device__ float operator()(float x) { return x * x; } }; + +/************************************************/ +/*****************2D modules*********************/ +/************************************************/ +__global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda) +{ + + float val1,val2; + + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];} + if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];} + //Write final result to global memory + D[index] = Ad[index] - lambda*(R1[index] + R2[index] - val1 - val2); + } + return; +} + +__global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, int N, int M, int ImSize, float multip) +{ + + float val1,val2; + + //calculate each thread global index + 
const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + + /* boundary conditions */ + if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex]; + if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)]; + + //Write final result to global memory + P1[index] = R1[index] + multip*val1; + P2[index] = R2[index] + multip*val2; + } + return; +} + +__global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize) +{ + + float denom; + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + denom = pow(P1[index],2) + pow(P2[index],2); + if (denom > 1.0f) { + P1[index] = P1[index]/sqrt(denom); + P2[index] = P2[index]/sqrt(denom); + } + } + return; +} +__global__ void Proj_func2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize) +{ + + float val1, val2; + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + val1 = abs(P1[index]); + val2 = abs(P2[index]); + if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + P1[index] = P1[index]/val1; + P2[index] = P2[index]/val2; + } + return; +} +__global__ void Rupd_func2D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, float multip2, int N, int M, int ImSize) +{ + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + R1[index] = P1[index] + multip2*(P1[index] - 
P1_old[index]); + R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); + } + return; +} +__global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + if (Output[index] < 0.0f) Output[index] = 0.0f; + } +} +__global__ void copy_kernel2D(float *Input, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input[index]; + } +} +__global__ void ResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} +/************************************************/ +/*****************3D modules*********************/ +/************************************************/ +__global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda) +{ + + float val1,val2,val3; + + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];} + if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];} + if (k <= 0) {val3 = 0.0f;} else {val3 = R3[(N*M)*(k-1) + i + N*j];} + //Write final result to global memory + D[index] = Ad[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3); + } + return; +} + 
+__global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float multip) +{ + + float val1,val2,val3; + + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + /* boundary conditions */ + if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j]; + if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)]; + if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j]; + + //Write final result to global memory + P1[index] = R1[index] + multip*val1; + P2[index] = R2[index] + multip*val2; + P3[index] = R3[index] + multip*val3; + } + return; +} + +__global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) +{ + + float denom,sq_denom; + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2); + + if (denom > 1.0f) { + sq_denom = 1.0f/sqrt(denom); + P1[index] = P1[index]*sq_denom; + P2[index] = P2[index]*sq_denom; + P3[index] = P3[index]*sq_denom; + } + } + return; +} + +__global__ void Proj_func3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) +{ + + float val1, val2, val3; + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + val1 = abs(P1[index]); + val2 = abs(P2[index]); + val3 = 
abs(P3[index]); + if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + if (val3 < 1.0f) {val3 = 1.0f;} + P1[index] = P1[index]/val1; + P2[index] = P2[index]/val2; + P3[index] = P3[index]/val3; + } + return; +} + + +__global__ void Rupd_func3D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, float multip2, int N, int M, int Z, int ImSize) +{ + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]); + R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); + R3[index] = P3[index] + multip2*(P3[index] - P3_old[index]); + } + return; +} + +__global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + if (Output[index] < 0.0f) Output[index] = 0.0f; + } +} + +__global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input[index]; + } +} +/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ + +////////////MAIN HOST FUNCTION /////////////// +extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ) +{ + int deviceCount = -1; // number of devices + cudaGetDeviceCount(&deviceCount); + if 
(deviceCount == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return; + } + + int count = 0, i; + float re, multip,multip2; + float tk = 1.0f; + float tkp1=1.0f; + + if (dimZ <= 1) { + /*2D verson*/ + int ImSize = dimX*dimY; + float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL; + + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); + dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); + + /*allocate space for images on device*/ + checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(P1, 0, ImSize*sizeof(float)); + cudaMemset(P2, 0, ImSize*sizeof(float)); + cudaMemset(P1_prev, 0, ImSize*sizeof(float)); + cudaMemset(P2_prev, 0, ImSize*sizeof(float)); + cudaMemset(R1, 0, ImSize*sizeof(float)); + cudaMemset(R2, 0, ImSize*sizeof(float)); + + /********************** Run CUDA 2D kernel here ********************/ + multip = (1.0f/(8.0f*lambda)); + + /* The main kernel */ + for (i = 0; i < iter; i++) { + + /* computing the gradient of the objective function */ + Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambda); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (nonneg != 0) { + nonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize); + 
checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); } + + /*Taking a step towards minus of the gradient*/ + Grad_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, d_update, R1, R2, dimX, dimY, ImSize, multip); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* projection step */ + if (methodTV == 0) Proj_func2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/ + else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; + multip2 = ((tk-1.0f)/tkp1); + + Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (epsil != 0.0f) { + /* calculate norm - stopping rules using the Thrust library */ + ResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize); + float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>())); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>())); + + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 4) break; + + copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + + copy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + copy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + 
checkCudaErrors(cudaPeekAtLastError() ); + + tk = tkp1; + } + if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i); + /***************************************************************/ + //copy result matrix from device to host memory + cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + if (epsil != 0.0f) cudaFree(d_update_prev); + cudaFree(P1); + cudaFree(P2); + cudaFree(P1_prev); + cudaFree(P2_prev); + cudaFree(R1); + cudaFree(R2); + } + else { + /*3D verson*/ + int ImSize = dimX*dimY*dimZ; + float *d_input, *d_update=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; + + dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); + dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); + + /*allocate space for images on device*/ + checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P3_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&R3,ImSize*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(P1, 0, ImSize*sizeof(float)); + cudaMemset(P2, 0, ImSize*sizeof(float)); + cudaMemset(P3, 0, ImSize*sizeof(float)); + cudaMemset(P1_prev, 0, ImSize*sizeof(float)); + cudaMemset(P2_prev, 0, ImSize*sizeof(float)); + 
cudaMemset(P3_prev, 0, ImSize*sizeof(float)); + cudaMemset(R1, 0, ImSize*sizeof(float)); + cudaMemset(R2, 0, ImSize*sizeof(float)); + cudaMemset(R3, 0, ImSize*sizeof(float)); + /********************** Run CUDA 3D kernel here ********************/ + multip = (1.0f/(8.0f*lambda)); + + /* The main kernel */ + for (i = 0; i < iter; i++) { + + /* computing the gradient of the objective function */ + Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambda); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (nonneg != 0) { + nonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); } + + /*Taking a step towards minus of the gradient*/ + Grad_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, multip); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* projection step */ + if (methodTV == 0) Proj_func3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */ + else Proj_func3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */ + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; + multip2 = ((tk-1.0f)/tkp1); + + Rupd_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + copy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + copy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + copy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize); + checkCudaErrors( 
cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + tk = tkp1; + } + if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i); + /***************************************************************/ + //copy result matrix from device to host memory + cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + cudaFree(P1); + cudaFree(P2); + cudaFree(P3); + cudaFree(P1_prev); + cudaFree(P2_prev); + cudaFree(P3_prev); + cudaFree(R1); + cudaFree(R2); + cudaFree(R3); + } + cudaDeviceReset(); +} diff --git a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h new file mode 100755 index 0000000..15c7120 --- /dev/null +++ b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h @@ -0,0 +1,10 @@ +#include <cuda.h> +#include <stdio.h> +#include <stdlib.h> + +#ifndef _TV_FGP_GPU_ +#define _TV_FGP_GPU_ + +extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); + +#endif diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu deleted file mode 100755 index b67b53b..0000000 --- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu +++ /dev/null @@ -1,369 +0,0 @@ - /* -This work is part of the Core Imaging Library developed by -Visual Analytics and Imaging System Group of the Science Technology -Facilities Council, STFC - -Copyright 2017 Daniil Kazantsev -Copyright 2017 Srikanth Nagella, Edoardo Pasca - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "TV_ROF_GPU.h" - -/* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case) -* -* Input Parameters: -* 1. Noisy image/volume [REQUIRED] -* 2. lambda - regularization parameter [REQUIRED] -* 3. tau - marching step for explicit scheme, ~0.1 is recommended [REQUIRED] -* 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED] -* -* Output: -* [1] Regularized image/volume - - * This function is based on the paper by -* [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" -* -* D. Kazantsev, 2016-18 -*/ - -#define CHECK(call) \ -{ \ - const cudaError_t error = call; \ - if (error != cudaSuccess) \ - { \ - fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \ - fprintf(stderr, "code: %d, reason: %s\n", error, \ - cudaGetErrorString(error)); \ - exit(1); \ - } \ -} - -#define BLKXSIZE 8 -#define BLKYSIZE 8 -#define BLKZSIZE 8 - -#define BLKXSIZE2D 16 -#define BLKYSIZE2D 16 -#define EPS 1.0e-5 - -#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) - -#define MAX(x, y) (((x) > (y)) ? (x) : (y)) -#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) - -__host__ __device__ int sign (float x) -{ - return (x > 0) - (x < 0); -} - -/*********************2D case****************************/ - - /* differences 1 */ - __global__ void D1_func2D(float* Input, float* D1, int N, int M) - { - int i1, j1, i2; - float NOMx_1,NOMy_1,NOMy_0,denom1,denom2,T1; - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - - int index = i + N*j; - - if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) { - - /* boundary conditions (Neumann reflections) */ - i1 = i + 1; if (i1 >= N) i1 = i-1; - i2 = i - 1; if (i2 < 0) i2 = i+1; - j1 = j + 1; if (j1 >= M) j1 = j-1; - - /* Forward-backward differences */ - NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ - NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ - NOMy_0 = Input[index] - Input[j*N + i2]; /* y- */ - - denom1 = NOMx_1*NOMx_1; - denom2 = 0.5f*(sign((float)NOMy_1) + sign((float)NOMy_0))*(MIN(abs((float)NOMy_1),abs((float)NOMy_0))); - denom2 = denom2*denom2; - T1 = sqrt(denom1 + denom2 + EPS); - D1[index] = NOMx_1/T1; - } - } - - /* differences 2 */ - __global__ void D2_func2D(float* Input, float* D2, int N, int M) - { - int i1, j1, j2; - float NOMx_1,NOMy_1,NOMx_0,denom1,denom2,T2; - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - - int index = i + N*j; - - if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) { - - /* boundary conditions (Neumann reflections) */ - i1 = i + 1; if (i1 >= N) i1 = i-1; - j1 = j + 1; if (j1 >= M) j1 = j-1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - - /* Forward-backward differences */ - NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ - NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ - NOMx_0 = Input[index] - Input[j2*N + i]; /* x- */ - - denom1 = NOMy_1*NOMy_1; - denom2 = 0.5f*(sign((float)NOMx_1) + sign((float)NOMx_0))*(MIN(abs((float)NOMx_1),abs((float)NOMx_0))); - denom2 = denom2*denom2; - T2 = sqrt(denom1 + denom2 + EPS); - D2[index] = NOMy_1/T2; - } - } 
- - __global__ void TV_kernel2D(float *D1, float *D2, float *Update, float *Input, float lambda, float tau, int N, int M) - { - int i2, j2; - float dv1,dv2; - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - - int index = i + N*j; - - if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) { - - /* boundary conditions (Neumann reflections) */ - i2 = i - 1; if (i2 < 0) i2 = i+1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - - /* divergence components */ - dv1 = D1[index] - D1[j2*N + i]; - dv2 = D2[index] - D2[j*N + i2]; - - Update[index] = Update[index] + tau*(2.0f*lambda*(dv1 + dv2) - (Update[index] - Input[index])); - - } - } -/*********************3D case****************************/ - - /* differences 1 */ - __global__ void D1_func3D(float* Input, float* D1, int dimX, int dimY, int dimZ) - { - float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1; - int i1,i2,k1,j1,j2,k2; - - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - - if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - - /* symmetric boundary conditions (Neuman) */ - i1 = i + 1; if (i1 >= dimX) i1 = i-1; - i2 = i - 1; if (i2 < 0) i2 = i+1; - j1 = j + 1; if (j1 >= dimY) j1 = j-1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - k1 = k + 1; if (k1 >= dimZ) k1 = k-1; - k2 = k - 1; if (k2 < 0) k2 = k+1; - - /* Forward-backward differences */ - NOMx_1 = Input[(dimX*dimY)*k + j1*dimX + i] - Input[index]; /* x+ */ - NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */ - NOMy_0 = Input[index] - Input[(dimX*dimY)*k + j*dimX + i2]; /* y- */ - - NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ - NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + j*dimX + i]; /* z- */ - - - denom1 = NOMx_1*NOMx_1; - denom2 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0))); 
- denom2 = denom2*denom2; - denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0))); - denom3 = denom3*denom3; - T1 = sqrt(denom1 + denom2 + denom3 + EPS); - D1[index] = NOMx_1/T1; - } - } - - /* differences 2 */ - __global__ void D2_func3D(float* Input, float* D2, int dimX, int dimY, int dimZ) - { - float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2; - int i1,i2,k1,j1,j2,k2; - - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - - if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - /* symmetric boundary conditions (Neuman) */ - i1 = i + 1; if (i1 >= dimX) i1 = i-1; - i2 = i - 1; if (i2 < 0) i2 = i+1; - j1 = j + 1; if (j1 >= dimY) j1 = j-1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - k1 = k + 1; if (k1 >= dimZ) k1 = k-1; - k2 = k - 1; if (k2 < 0) k2 = k+1; - - - /* Forward-backward differences */ - NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ - NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ - NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ - NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ - NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */ - - - denom1 = NOMy_1*NOMy_1; - denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); - denom2 = denom2*denom2; - denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0))); - denom3 = denom3*denom3; - T2 = sqrt(denom1 + denom2 + denom3 + EPS); - D2[index] = NOMy_1/T2; - } - } - - /* differences 3 */ - __global__ void D3_func3D(float* Input, float* D3, int dimX, int dimY, int dimZ) - { - float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3; - int i1,i2,k1,j1,j2,k2; - - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + 
threadIdx.y; - int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - - if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - - i1 = i + 1; if (i1 >= dimX) i1 = i-1; - i2 = i - 1; if (i2 < 0) i2 = i+1; - j1 = j + 1; if (j1 >= dimY) j1 = j-1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - k1 = k + 1; if (k1 >= dimZ) k1 = k-1; - k2 = k - 1; if (k2 < 0) k2 = k+1; - - /* Forward-backward differences */ - NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ - NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ - NOMy_0 = Input[index] - Input[(dimX*dimY)*k + (j)*dimX + i2]; /* y- */ - NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ - NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ - - denom1 = NOMz_1*NOMz_1; - denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); - denom2 = denom2*denom2; - denom3 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0))); - denom3 = denom3*denom3; - T3 = sqrt(denom1 + denom2 + denom3 + EPS); - D3[index] = NOMz_1/T3; - } - } - - __global__ void TV_kernel3D(float *D1, float *D2, float *D3, float *Update, float *Input, float lambda, float tau, int dimX, int dimY, int dimZ) - { - float dv1, dv2, dv3; - int i1,i2,k1,j1,j2,k2; - int i = blockDim.x * blockIdx.x + threadIdx.x; - int j = blockDim.y * blockIdx.y + threadIdx.y; - int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - - if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - - /* symmetric boundary conditions (Neuman) */ - i1 = i + 1; if (i1 >= dimX) i1 = i-1; - i2 = i - 1; if (i2 < 0) i2 = i+1; - j1 = j + 1; if (j1 >= dimY) j1 = j-1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - k1 = k + 1; if (k1 >= dimZ) k1 = k-1; - k2 = k - 1; if (k2 < 0) k2 = k+1; - - /*divergence components */ - dv1 = D1[index] - D1[(dimX*dimY)*k + j2*dimX+i]; - dv2 = D2[index] - 
D2[(dimX*dimY)*k + j*dimX+i2]; - dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i]; - - Update[index] = Update[index] + tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index])); - - } - } - -///////////////////////////////////////////////// -// HOST FUNCTION -extern "C" void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda) -{ - // set up device - int dev = 0; - CHECK(cudaSetDevice(dev)); - - float *d_input, *d_update, *d_D1, *d_D2; - - if (Z == 0) Z = 1; - CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float))); - CHECK(cudaMalloc((void**)&d_update,N*M*Z*sizeof(float))); - CHECK(cudaMalloc((void**)&d_D1,N*M*Z*sizeof(float))); - CHECK(cudaMalloc((void**)&d_D2,N*M*Z*sizeof(float))); - - CHECK(cudaMemcpy(d_input,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice)); - CHECK(cudaMemcpy(d_update,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice)); - - if (Z > 1) { - // TV - 3D case - dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); - dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE)); - - float *d_D3; - CHECK(cudaMalloc((void**)&d_D3,N*M*Z*sizeof(float))); - - for(int n=0; n < iter; n++) { - /* calculate differences */ - D1_func3D<<>>(d_update, d_D1, N, M, Z); - CHECK(cudaDeviceSynchronize()); - D2_func3D<<>>(d_update, d_D2, N, M, Z); - CHECK(cudaDeviceSynchronize()); - D3_func3D<<>>(d_update, d_D3, N, M, Z); - CHECK(cudaDeviceSynchronize()); - /*running main kernel*/ - TV_kernel3D<<>>(d_D1, d_D2, d_D3, d_update, d_input, lambda, tau, N, M, Z); - CHECK(cudaDeviceSynchronize()); - } - - CHECK(cudaFree(d_D3)); - } - else { - // TV - 2D case - dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); - dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D)); - - for(int n=0; n < iter; n++) { - /* calculate differences */ - D1_func2D<<>>(d_update, d_D1, N, M); - CHECK(cudaDeviceSynchronize()); - D2_func2D<<>>(d_update, d_D2, N, M); - CHECK(cudaDeviceSynchronize()); - /*running main kernel*/ - TV_kernel2D<<>>(d_D1, d_D2, 
d_update, d_input, lambda, tau, N, M); - CHECK(cudaDeviceSynchronize()); - } - } - CHECK(cudaMemcpy(Output,d_update,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost)); - CHECK(cudaFree(d_input)); - CHECK(cudaFree(d_update)); - CHECK(cudaFree(d_D1)); - CHECK(cudaFree(d_D2)); - cudaDeviceReset(); -} diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h deleted file mode 100755 index 2938d2f..0000000 --- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __TVGPU_H__ -#define __TVGPU_H__ -#include "CCPiDefines.h" -#include - -extern "C" CCPI_EXPORT void TV_ROF_GPU_kernel(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda); - -#endif diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu new file mode 100755 index 0000000..480855f --- /dev/null +++ b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu @@ -0,0 +1,369 @@ + /* +This work is part of the Core Imaging Library developed by +Visual Analytics and Imaging System Group of the Science Technology +Facilities Council, STFC + +Copyright 2017 Daniil Kazantsev +Copyright 2017 Srikanth Nagella, Edoardo Pasca + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "TV_ROF_GPU_core.h" + +/* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case) +* +* Input Parameters: +* 1. Noisy image/volume [REQUIRED] +* 2. lambda - regularization parameter [REQUIRED] +* 3. 
tau - marching step for explicit scheme, ~0.1 is recommended [REQUIRED] +* 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED] +* +* Output: +* [1] Regularized image/volume + + * This function is based on the paper by +* [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" +* +* D. Kazantsev, 2016-18 +*/ + +#define CHECK(call) \ +{ \ + const cudaError_t error = call; \ + if (error != cudaSuccess) \ + { \ + fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \ + fprintf(stderr, "code: %d, reason: %s\n", error, \ + cudaGetErrorString(error)); \ + exit(1); \ + } \ +} + +#define BLKXSIZE 8 +#define BLKYSIZE 8 +#define BLKZSIZE 8 + +#define BLKXSIZE2D 16 +#define BLKYSIZE2D 16 +#define EPS 1.0e-5 + +#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) + +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) + +__host__ __device__ int sign (float x) +{ + return (x > 0) - (x < 0); +} + +/*********************2D case****************************/ + + /* differences 1 */ + __global__ void D1_func2D(float* Input, float* D1, int N, int M) + { + int i1, j1, i2; + float NOMx_1,NOMy_1,NOMy_0,denom1,denom2,T1; + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + int index = i + N*j; + + if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) { + + /* boundary conditions (Neumann reflections) */ + i1 = i + 1; if (i1 >= N) i1 = i-1; + i2 = i - 1; if (i2 < 0) i2 = i+1; + j1 = j + 1; if (j1 >= M) j1 = j-1; + + /* Forward-backward differences */ + NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ + NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ + NOMy_0 = Input[index] - Input[j*N + i2]; /* y- */ + + denom1 = NOMx_1*NOMx_1; + denom2 = 0.5f*(sign((float)NOMy_1) + sign((float)NOMy_0))*(MIN(abs((float)NOMy_1),abs((float)NOMy_0))); + denom2 = denom2*denom2; + T1 = sqrt(denom1 + denom2 + EPS); + D1[index] = NOMx_1/T1; + } + } + + /* differences 
2 */ + __global__ void D2_func2D(float* Input, float* D2, int N, int M) + { + int i1, j1, j2; + float NOMx_1,NOMy_1,NOMx_0,denom1,denom2,T2; + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + int index = i + N*j; + + if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) { + + /* boundary conditions (Neumann reflections) */ + i1 = i + 1; if (i1 >= N) i1 = i-1; + j1 = j + 1; if (j1 >= M) j1 = j-1; + j2 = j - 1; if (j2 < 0) j2 = j+1; + + /* Forward-backward differences */ + NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ + NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ + NOMx_0 = Input[index] - Input[j2*N + i]; /* x- */ + + denom1 = NOMy_1*NOMy_1; + denom2 = 0.5f*(sign((float)NOMx_1) + sign((float)NOMx_0))*(MIN(abs((float)NOMx_1),abs((float)NOMx_0))); + denom2 = denom2*denom2; + T2 = sqrt(denom1 + denom2 + EPS); + D2[index] = NOMy_1/T2; + } + } + + __global__ void TV_kernel2D(float *D1, float *D2, float *Update, float *Input, float lambda, float tau, int N, int M) + { + int i2, j2; + float dv1,dv2; + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + int index = i + N*j; + + if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) { + + /* boundary conditions (Neumann reflections) */ + i2 = i - 1; if (i2 < 0) i2 = i+1; + j2 = j - 1; if (j2 < 0) j2 = j+1; + + /* divergence components */ + dv1 = D1[index] - D1[j2*N + i]; + dv2 = D2[index] - D2[j*N + i2]; + + Update[index] = Update[index] + tau*(2.0f*lambda*(dv1 + dv2) - (Update[index] - Input[index])); + + } + } +/*********************3D case****************************/ + + /* differences 1 */ + __global__ void D1_func3D(float* Input, float* D1, int dimX, int dimY, int dimZ) + { + float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1; + int i1,i2,k1,j1,j2,k2; + + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; 
+ + int index = (dimX*dimY)*k + j*dimX+i; + + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { + + /* symmetric boundary conditions (Neuman) */ + i1 = i + 1; if (i1 >= dimX) i1 = i-1; + i2 = i - 1; if (i2 < 0) i2 = i+1; + j1 = j + 1; if (j1 >= dimY) j1 = j-1; + j2 = j - 1; if (j2 < 0) j2 = j+1; + k1 = k + 1; if (k1 >= dimZ) k1 = k-1; + k2 = k - 1; if (k2 < 0) k2 = k+1; + + /* Forward-backward differences */ + NOMx_1 = Input[(dimX*dimY)*k + j1*dimX + i] - Input[index]; /* x+ */ + NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */ + NOMy_0 = Input[index] - Input[(dimX*dimY)*k + j*dimX + i2]; /* y- */ + + NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ + NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + j*dimX + i]; /* z- */ + + + denom1 = NOMx_1*NOMx_1; + denom2 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0))); + denom2 = denom2*denom2; + denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0))); + denom3 = denom3*denom3; + T1 = sqrt(denom1 + denom2 + denom3 + EPS); + D1[index] = NOMx_1/T1; + } + } + + /* differences 2 */ + __global__ void D2_func3D(float* Input, float* D2, int dimX, int dimY, int dimZ) + { + float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2; + int i1,i2,k1,j1,j2,k2; + + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (dimX*dimY)*k + j*dimX+i; + + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { + /* symmetric boundary conditions (Neuman) */ + i1 = i + 1; if (i1 >= dimX) i1 = i-1; + i2 = i - 1; if (i2 < 0) i2 = i+1; + j1 = j + 1; if (j1 >= dimY) j1 = j-1; + j2 = j - 1; if (j2 < 0) j2 = j+1; + k1 = k + 1; if (k1 >= dimZ) k1 = k-1; + k2 = k - 1; if (k2 < 0) k2 = k+1; + + + /* Forward-backward differences */ + NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ + 
NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ + NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ + NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ + NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */ + + + denom1 = NOMy_1*NOMy_1; + denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); + denom2 = denom2*denom2; + denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0))); + denom3 = denom3*denom3; + T2 = sqrt(denom1 + denom2 + denom3 + EPS); + D2[index] = NOMy_1/T2; + } + } + + /* differences 3 */ + __global__ void D3_func3D(float* Input, float* D3, int dimX, int dimY, int dimZ) + { + float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3; + int i1,i2,k1,j1,j2,k2; + + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (dimX*dimY)*k + j*dimX+i; + + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { + + i1 = i + 1; if (i1 >= dimX) i1 = i-1; + i2 = i - 1; if (i2 < 0) i2 = i+1; + j1 = j + 1; if (j1 >= dimY) j1 = j-1; + j2 = j - 1; if (j2 < 0) j2 = j+1; + k1 = k + 1; if (k1 >= dimZ) k1 = k-1; + k2 = k - 1; if (k2 < 0) k2 = k+1; + + /* Forward-backward differences */ + NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ + NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ + NOMy_0 = Input[index] - Input[(dimX*dimY)*k + (j)*dimX + i2]; /* y- */ + NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ + NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ + + denom1 = NOMz_1*NOMz_1; + denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); + denom2 = denom2*denom2; + denom3 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0))); + denom3 = denom3*denom3; + T3 = sqrt(denom1 + denom2 + denom3 + 
EPS); + D3[index] = NOMz_1/T3; + } + } + + __global__ void TV_kernel3D(float *D1, float *D2, float *D3, float *Update, float *Input, float lambda, float tau, int dimX, int dimY, int dimZ) + { + float dv1, dv2, dv3; + int i1,i2,k1,j1,j2,k2; + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (dimX*dimY)*k + j*dimX+i; + + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { + + /* symmetric boundary conditions (Neuman) */ + i1 = i + 1; if (i1 >= dimX) i1 = i-1; + i2 = i - 1; if (i2 < 0) i2 = i+1; + j1 = j + 1; if (j1 >= dimY) j1 = j-1; + j2 = j - 1; if (j2 < 0) j2 = j+1; + k1 = k + 1; if (k1 >= dimZ) k1 = k-1; + k2 = k - 1; if (k2 < 0) k2 = k+1; + + /*divergence components */ + dv1 = D1[index] - D1[(dimX*dimY)*k + j2*dimX+i]; + dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2]; + dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i]; + + Update[index] = Update[index] + tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index])); + + } + } + +///////////////////////////////////////////////// +// HOST FUNCTION +extern "C" void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda) +{ + // set up device + int dev = 0; + CHECK(cudaSetDevice(dev)); + + float *d_input, *d_update, *d_D1, *d_D2; + + if (Z == 0) Z = 1; + CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float))); + CHECK(cudaMalloc((void**)&d_update,N*M*Z*sizeof(float))); + CHECK(cudaMalloc((void**)&d_D1,N*M*Z*sizeof(float))); + CHECK(cudaMalloc((void**)&d_D2,N*M*Z*sizeof(float))); + + CHECK(cudaMemcpy(d_input,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice)); + CHECK(cudaMemcpy(d_update,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice)); + + if (Z > 1) { + // TV - 3D case + dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); + dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE)); + + float *d_D3; + 
CHECK(cudaMalloc((void**)&d_D3,N*M*Z*sizeof(float))); + + for(int n=0; n < iter; n++) { + /* calculate differences */ + D1_func3D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M, Z); + CHECK(cudaDeviceSynchronize()); + D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z); + CHECK(cudaDeviceSynchronize()); + D3_func3D<<<dimGrid,dimBlock>>>(d_update, d_D3, N, M, Z); + CHECK(cudaDeviceSynchronize()); + /*running main kernel*/ + TV_kernel3D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_D3, d_update, d_input, lambda, tau, N, M, Z); + CHECK(cudaDeviceSynchronize()); + } + + CHECK(cudaFree(d_D3)); + } + else { + // TV - 2D case + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); + dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D)); + + for(int n=0; n < iter; n++) { + /* calculate differences */ + D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M); + CHECK(cudaDeviceSynchronize()); + D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M); + CHECK(cudaDeviceSynchronize()); + /*running main kernel*/ + TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambda, tau, N, M); + CHECK(cudaDeviceSynchronize()); + } + } + CHECK(cudaMemcpy(Output,d_update,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost)); + CHECK(cudaFree(d_input)); + CHECK(cudaFree(d_update)); + CHECK(cudaFree(d_D1)); + CHECK(cudaFree(d_D2)); + cudaDeviceReset(); +} diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h new file mode 100755 index 0000000..8b64d99 --- /dev/null +++ b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h @@ -0,0 +1,8 @@ +#ifndef __TVGPU_H__ +#define __TVGPU_H__ +#include "CCPiDefines.h" +#include <stdio.h> + +extern "C" CCPI_EXPORT void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda); + +#endif diff --git a/Wrappers/Python/src/cpu_regularizers.cpp b/Wrappers/Python/src/cpu_regularizers.cpp index e311570..43d5d11 100644 --- a/Wrappers/Python/src/cpu_regularizers.cpp +++ b/Wrappers/Python/src/cpu_regularizers.cpp @@ -27,7 +27,7 @@ limitations under the License. 
#include "boost/tuple/tuple.hpp" #include "SplitBregman_TV_core.h" -#include "FGP_TV_core.h" +//#include "FGP_TV_core.h" #include "LLT_model_core.h" #include "PatchBased_Regul_core.h" #include "TGV_PD_core.h" @@ -305,289 +305,289 @@ bp::list SplitBregman_TV(np::ndarray input, double d_mu, int iter, double d_epsi -bp::list FGP_TV(np::ndarray input, double d_mu, int iter, double d_epsil, int methTV) { +//bp::list FGP_TV(np::ndarray input, double d_mu, int iter, double d_epsil, int methTV) { - // the result is in the following list - bp::list result; + //// the result is in the following list + //bp::list result; - int number_of_dims, dimX, dimY, dimZ, ll, j, count; - float *A, *D = NULL, *D_old = NULL, *P1 = NULL, *P2 = NULL, *P3 = NULL, *P1_old = NULL, *P2_old = NULL, *P3_old = NULL, *R1 = NULL, *R2 = NULL, *R3 = NULL; - float lambda, tk, tkp1, re, re1, re_old, epsil, funcval; + //int number_of_dims, dimX, dimY, dimZ, ll, j, count; + //float *A, *D = NULL, *D_old = NULL, *P1 = NULL, *P2 = NULL, *P3 = NULL, *P1_old = NULL, *P2_old = NULL, *P3_old = NULL, *R1 = NULL, *R2 = NULL, *R3 = NULL; + //float lambda, tk, tkp1, re, re1, re_old, epsil, funcval; - //number_of_dims = mxGetNumberOfDimensions(prhs[0]); - //dim_array = mxGetDimensions(prhs[0]); - - number_of_dims = input.get_nd(); - int dim_array[3]; + ////number_of_dims = mxGetNumberOfDimensions(prhs[0]); + ////dim_array = mxGetDimensions(prhs[0]); - dim_array[0] = input.shape(0); - dim_array[1] = input.shape(1); - if (number_of_dims == 2) { - dim_array[2] = -1; - } - else { - dim_array[2] = input.shape(2); - } - // Parameter handling is be done in Python - ///*Handling Matlab input data*/ - //if ((nrhs < 2) || (nrhs > 5)) mexErrMsgTxt("At least 2 parameters is required: Image(2D/3D), Regularization parameter. 
The full list of parameters: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1')"); - - ///*Handling Matlab input data*/ - //A = (float *)mxGetData(prhs[0]); /*noisy image (2D/3D) */ - A = reinterpret_cast(input.get_data()); - - //mu = (float)mxGetScalar(prhs[1]); /* regularization parameter */ - lambda = (float)d_mu; - - //iter = 35; /* default iterations number */ + //number_of_dims = input.get_nd(); + //int dim_array[3]; - //epsil = 0.0001; /* default tolerance constant */ - epsil = (float)d_epsil; - //methTV = 0; /* default isotropic TV penalty */ - //if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5)) iter = (int)mxGetScalar(prhs[2]); /* iterations number */ - //if ((nrhs == 4) || (nrhs == 5)) epsil = (float)mxGetScalar(prhs[3]); /* tolerance constant */ - //if (nrhs == 5) { - // char *penalty_type; - // penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ - // if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); - // if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ - // mxFree(penalty_type); + //dim_array[0] = input.shape(0); + //dim_array[1] = input.shape(1); + //if (number_of_dims == 2) { + //dim_array[2] = -1; //} - //if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) { mexErrMsgTxt("The input image must be in a single precision"); } - - //plhs[1] = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL); - bp::tuple shape1 = bp::make_tuple(dim_array[0], dim_array[1]); - np::dtype dtype = np::dtype::get_builtin(); - np::ndarray out1 = np::zeros(shape1, dtype); + //else { + //dim_array[2] = input.shape(2); + //} + //// Parameter handling is be done in Python + /////*Handling Matlab input data*/ + ////if ((nrhs < 2) || (nrhs > 5)) mexErrMsgTxt("At least 2 parameters is required: Image(2D/3D), Regularization parameter. 
The full list of parameters: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1')"); + + /////*Handling Matlab input data*/ + ////A = (float *)mxGetData(prhs[0]); /*noisy image (2D/3D) */ + //A = reinterpret_cast(input.get_data()); + + ////mu = (float)mxGetScalar(prhs[1]); /* regularization parameter */ + //lambda = (float)d_mu; + + ////iter = 35; /* default iterations number */ + + ////epsil = 0.0001; /* default tolerance constant */ + //epsil = (float)d_epsil; + ////methTV = 0; /* default isotropic TV penalty */ + ////if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5)) iter = (int)mxGetScalar(prhs[2]); /* iterations number */ + ////if ((nrhs == 4) || (nrhs == 5)) epsil = (float)mxGetScalar(prhs[3]); /* tolerance constant */ + ////if (nrhs == 5) { + //// char *penalty_type; + //// penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ + //// if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); + //// if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ + //// mxFree(penalty_type); + ////} + ////if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) { mexErrMsgTxt("The input image must be in a single precision"); } + + ////plhs[1] = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL); + //bp::tuple shape1 = bp::make_tuple(dim_array[0], dim_array[1]); + //np::dtype dtype = np::dtype::get_builtin(); + //np::ndarray out1 = np::zeros(shape1, dtype); - //float *funcvalA = (float *)mxGetData(plhs[1]); - float * funcvalA = reinterpret_cast(out1.get_data()); - //if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) { mexErrMsgTxt("The input image must be in a single precision"); } - - /*Handling Matlab output data*/ - dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; - - tk = 1.0f; - tkp1 = 1.0f; - count = 1; - re_old = 0.0f; - - if (number_of_dims == 2) { - dimZ = 1; /*2D case*/ - /*D = 
(float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); - D_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); - P1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); - P2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); - P1_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); - P2_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); - R1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); - R2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));*/ - - bp::tuple shape = bp::make_tuple(dim_array[0], dim_array[1]); - np::dtype dtype = np::dtype::get_builtin(); - - - np::ndarray npD = np::zeros(shape, dtype); - np::ndarray npD_old = np::zeros(shape, dtype); - np::ndarray npP1 = np::zeros(shape, dtype); - np::ndarray npP2 = np::zeros(shape, dtype); - np::ndarray npP1_old = np::zeros(shape, dtype); - np::ndarray npP2_old = np::zeros(shape, dtype); - np::ndarray npR1 = np::zeros(shape, dtype); - np::ndarray npR2 = np::zeros(shape, dtype); - - D = reinterpret_cast(npD.get_data()); - D_old = reinterpret_cast(npD_old.get_data()); - P1 = reinterpret_cast(npP1.get_data()); - P2 = reinterpret_cast(npP2.get_data()); - P1_old = reinterpret_cast(npP1_old.get_data()); - P2_old = reinterpret_cast(npP2_old.get_data()); - R1 = reinterpret_cast(npR1.get_data()); - R2 = reinterpret_cast(npR2.get_data()); - - /* begin iterations */ - for (ll = 0; ll(out1.get_data()); + ////if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) { mexErrMsgTxt("The input image must be in a single precision"); } + + ///*Handling Matlab output data*/ + //dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; + + //tk = 1.0f; + //tkp1 = 1.0f; + //count = 1; + //re_old = 0.0f; + + //if (number_of_dims == 2) { + //dimZ = 1; /*2D case*/ + ///*D = (float*)mxGetPr(plhs[0] = 
mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + //D_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + //P1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + //P2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + //P1_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + //P2_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + //R1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + //R2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));*/ + + //bp::tuple shape = bp::make_tuple(dim_array[0], dim_array[1]); + //np::dtype dtype = np::dtype::get_builtin(); + + + //np::ndarray npD = np::zeros(shape, dtype); + //np::ndarray npD_old = np::zeros(shape, dtype); + //np::ndarray npP1 = np::zeros(shape, dtype); + //np::ndarray npP2 = np::zeros(shape, dtype); + //np::ndarray npP1_old = np::zeros(shape, dtype); + //np::ndarray npP2_old = np::zeros(shape, dtype); + //np::ndarray npR1 = np::zeros(shape, dtype); + //np::ndarray npR2 = np::zeros(shape, dtype); + + //D = reinterpret_cast(npD.get_data()); + //D_old = reinterpret_cast(npD_old.get_data()); + //P1 = reinterpret_cast(npP1.get_data()); + //P2 = reinterpret_cast(npP2.get_data()); + //P1_old = reinterpret_cast(npP1_old.get_data()); + //P2_old = reinterpret_cast(npP2_old.get_data()); + //R1 = reinterpret_cast(npR1.get_data()); + //R2 = reinterpret_cast(npR2.get_data()); + + ///* begin iterations */ + //for (ll = 0; ll 4) { - Obj_func2D(A, D, P1, P2, lambda, dimX, dimY); - funcval = 0.0f; - for (j = 0; j 2) { - if (re > re_old) { - Obj_func2D(A, D, P1, P2, lambda, dimX, dimY); - funcval = 0.0f; - for (j = 0; j(npD); - result.append(out1); - result.append(ll); - } - if (number_of_dims == 3) { - /*D = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - D_old = 
(float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - P1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - P2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - P3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - P1_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - P2_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - P3_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - R1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - R2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); - R3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));*/ - bp::tuple shape = bp::make_tuple(dim_array[0], dim_array[1], dim_array[2]); - np::dtype dtype = np::dtype::get_builtin(); + ///*updating R and t*/ + //tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; + //Rupd_func2D(P1, P1_old, P2, P2_old, R1, R2, tkp1, tk, dimX, dimY); + + ///* calculate norm */ + //re = 0.0f; re1 = 0.0f; + //for (j = 0; j 4) { + //Obj_func2D(A, D, P1, P2, lambda, dimX, dimY); + //funcval = 0.0f; + //for (j = 0; j 2) { + //if (re > re_old) { + //Obj_func2D(A, D, P1, P2, lambda, dimX, dimY); + //funcval = 0.0f; + //for (j = 0; j(npD); + //result.append(out1); + //result.append(ll); + //} + //if (number_of_dims == 3) { + ///*D = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + //D_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + //P1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + //P2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + //P3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + //P1_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, 
mxSINGLE_CLASS, mxREAL)); + //P2_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + //P3_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + //R1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + //R2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + //R3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));*/ + //bp::tuple shape = bp::make_tuple(dim_array[0], dim_array[1], dim_array[2]); + //np::dtype dtype = np::dtype::get_builtin(); - np::ndarray npD = np::zeros(shape, dtype); - np::ndarray npD_old = np::zeros(shape, dtype); - np::ndarray npP1 = np::zeros(shape, dtype); - np::ndarray npP2 = np::zeros(shape, dtype); - np::ndarray npP3 = np::zeros(shape, dtype); - np::ndarray npP1_old = np::zeros(shape, dtype); - np::ndarray npP2_old = np::zeros(shape, dtype); - np::ndarray npP3_old = np::zeros(shape, dtype); - np::ndarray npR1 = np::zeros(shape, dtype); - np::ndarray npR2 = np::zeros(shape, dtype); - np::ndarray npR3 = np::zeros(shape, dtype); - - D = reinterpret_cast(npD.get_data()); - D_old = reinterpret_cast(npD_old.get_data()); - P1 = reinterpret_cast(npP1.get_data()); - P2 = reinterpret_cast(npP2.get_data()); - P3 = reinterpret_cast(npP3.get_data()); - P1_old = reinterpret_cast(npP1_old.get_data()); - P2_old = reinterpret_cast(npP2_old.get_data()); - P3_old = reinterpret_cast(npP3_old.get_data()); - R1 = reinterpret_cast(npR1.get_data()); - R2 = reinterpret_cast(npR2.get_data()); - R3 = reinterpret_cast(npR3.get_data()); - /* begin iterations */ - for (ll = 0; ll 3) { - Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ); - funcval = 0.0f; - for (j = 0; j 2) { - if (re > re_old) { - Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ); - funcval = 0.0f; - for (j = 0; j(npD); - result.append(out1); - result.append(ll); - } + //np::ndarray npD = np::zeros(shape, dtype); + //np::ndarray npD_old = 
np::zeros(shape, dtype); + //np::ndarray npP1 = np::zeros(shape, dtype); + //np::ndarray npP2 = np::zeros(shape, dtype); + //np::ndarray npP3 = np::zeros(shape, dtype); + //np::ndarray npP1_old = np::zeros(shape, dtype); + //np::ndarray npP2_old = np::zeros(shape, dtype); + //np::ndarray npP3_old = np::zeros(shape, dtype); + //np::ndarray npR1 = np::zeros(shape, dtype); + //np::ndarray npR2 = np::zeros(shape, dtype); + //np::ndarray npR3 = np::zeros(shape, dtype); + + //D = reinterpret_cast(npD.get_data()); + //D_old = reinterpret_cast(npD_old.get_data()); + //P1 = reinterpret_cast(npP1.get_data()); + //P2 = reinterpret_cast(npP2.get_data()); + //P3 = reinterpret_cast(npP3.get_data()); + //P1_old = reinterpret_cast(npP1_old.get_data()); + //P2_old = reinterpret_cast(npP2_old.get_data()); + //P3_old = reinterpret_cast(npP3_old.get_data()); + //R1 = reinterpret_cast(npR1.get_data()); + //R2 = reinterpret_cast(npR2.get_data()); + //R3 = reinterpret_cast(npR3.get_data()); + ///* begin iterations */ + //for (ll = 0; ll 3) { + //Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ); + //funcval = 0.0f; + //for (j = 0; j 2) { + //if (re > re_old) { + //Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ); + //funcval = 0.0f; + //for (j = 0; j(npD); + //result.append(out1); + //result.append(ll); + //} - return result; -} + //return result; +//} bp::list LLT_model(np::ndarray input, double d_lambda, double d_tau, int iter, double d_epsil, int switcher) { // the result is in the following list diff --git a/Wrappers/Python/src/gpu_regularizers.pyx b/Wrappers/Python/src/gpu_regularizers.pyx index c724471..263fa4a 100644 --- a/Wrappers/Python/src/gpu_regularizers.pyx +++ b/Wrappers/Python/src/gpu_regularizers.pyx @@ -25,7 +25,9 @@ cdef extern void NLM_GPU_kernel(float *A, float* B, float *Eucl_Vec, int N, int M, int Z, int dimension, int SearchW, int SimilW, int SearchW_real, float denh2, float lambdaf); -cdef extern void TV_ROF_GPU_kernel(float* A, float* B, int N, int M, 
int Z, int iter, float tau, float lambdaf); +cdef extern void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambdaf); +cdef extern void TV_FGP_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z); + cdef extern float pad_crop(float *A, float *Ap, int OldSizeX, int OldSizeY, int OldSizeZ, int NewSizeX, int NewSizeY, int NewSizeZ, @@ -343,7 +345,7 @@ def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, np.zeros([dims[0],dims[1]], dtype='float32') # Running CUDA code here - TV_ROF_GPU_kernel( + TV_ROF_GPU( &inputData[0,0], &B[0,0], dims[0], dims[1], 1, iterations , @@ -366,7 +368,7 @@ def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, np.zeros([dims[0],dims[1],dims[2]], dtype='float32') # Running CUDA code here - TV_ROF_GPU_kernel( + TV_ROF_GPU( &inputData[0,0,0], &B[0,0,0], dims[0], dims[1], dims[2], iterations , @@ -374,3 +376,64 @@ def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, regularization_parameter); return B + + +def TVFGP2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, + float regularization_parameter, + int iterations, + float tolerance_param, + int methodTV, + int nonneg, + int printM): + + cdef long dims[2] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] B = \ + np.zeros([dims[0],dims[1]], dtype='float32') + + # Running CUDA code here + TV_FGP_GPU( + &inputData[0,0], &B[0,0], + regularization_parameter , + iterations, + tolerance_param, + methodTV, + nonneg, + printM, + dims[0], dims[1], 1); + + return B + +def TVFGP3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + float regularization_parameter, + int iterations, + float tolerance_param, + int methodTV, + int nonneg, + int printM): + + cdef long dims[3] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + dims[2] = inputData.shape[2] + + cdef 
np.ndarray[np.float32_t, ndim=3, mode="c"] B = \ + np.zeros([dims[0],dims[1],dims[2]], dtype='float32') + + # Running CUDA code here + TV_FGP_GPU( + &inputData[0,0,0], &B[0,0,0], + regularization_parameter , + iterations, + tolerance_param, + methodTV, + nonneg, + printM, + dims[0], dims[1], dims[2]); + + return B + + + diff --git a/Wrappers/Python/test/test_cpu_vs_gpu.py b/Wrappers/Python/test/test_cpu_vs_gpu.py new file mode 100644 index 0000000..74d65dd --- /dev/null +++ b/Wrappers/Python/test/test_cpu_vs_gpu.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Feb 21 12:12:22 2018 + +# CPU vs GPU comparison tests + +@author: algol +""" + diff --git a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py index d742a7f..6344021 100644 --- a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py +++ b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py @@ -12,8 +12,8 @@ import matplotlib.pyplot as plt import numpy as np import os import timeit -from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, GPU_ROF_TV -from ccpi.filters.cpu_regularizers_cython import ROF_TV +from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, TV_ROF_GPU +from ccpi.filters.cpu_regularizers_cython import TV_ROF_CPU ############################################################################### def printParametersToString(pars): txt = r'' @@ -64,7 +64,7 @@ pars = {'algorithm': ROF_TV , \ } print ("#################ROF TV CPU#####################") start_time = timeit.default_timer() -rof_cpu = ROF_TV(pars['input'], +rof_cpu = TV_ROF_CPU(pars['input'], pars['number_of_iterations'], pars['regularization_parameter'], pars['time_marching_parameter'] @@ -89,7 +89,7 @@ plt.title('{}'.format('CPU results')) print ("#################ROF TV GPU#####################") start_time = timeit.default_timer() -rof_gpu = GPU_ROF_TV(pars['input'], +rof_gpu = TV_ROF_GPU(pars['input'], 
pars['number_of_iterations'], pars['time_marching_parameter'], pars['regularization_parameter']) -- cgit v1.2.3