author    | Daniil Kazantsev <dkazanc@hotmail.com> | 2019-11-28 23:01:03 +0000
committer | Daniil Kazantsev <dkazanc@hotmail.com> | 2019-11-28 23:01:03 +0000
commit    | c65291e6b987283e4767a8ad2bd2d2433ca3782e (patch)
tree      | c3b660c9b2151f2ff1a12352daf73dfc90d1c3a3 /src
parent    | cdef6a981f1772ed04fe44bbe2b8251983a4ba7a (diff)
download  | regularization-c65291e6b987283e4767a8ad2bd2d2433ca3782e.tar.gz
          | regularization-c65291e6b987283e4767a8ad2bd2d2433ca3782e.tar.bz2
          | regularization-c65291e6b987283e4767a8ad2bd2d2433ca3782e.tar.xz
          | regularization-c65291e6b987283e4767a8ad2bd2d2433ca3782e.zip
all work completed on gpu version of pdtv
Diffstat (limited to 'src')
-rw-r--r-- | src/CMakeLists.txt                                    |  10
-rw-r--r-- | src/Core/CMakeLists.txt                               |  12
-rw-r--r-- | src/Core/regularisers_CPU/PD_TV_core.c                |   1
-rw-r--r-- | src/Core/regularisers_GPU/TV_PD_GPU_core.cu           | 522
-rw-r--r-- | src/Core/regularisers_GPU/TV_PD_GPU_core.h            |   9
-rw-r--r-- | src/Matlab/mex_compile/compileGPU_mex.m               |   6
-rw-r--r-- | src/Matlab/mex_compile/regularisers_CPU/PD_TV.c       |  22
-rw-r--r-- | src/Matlab/mex_compile/regularisers_GPU/PD_TV_GPU.cpp | 101
-rw-r--r-- | src/Python/ccpi/filters/regularisers.py               |   4
-rw-r--r-- | src/Python/setup-regularisers.py.in                   |   1
-rw-r--r-- | src/Python/src/gpu_regularisers.pyx                   |  72
11 files changed, 735 insertions, 25 deletions
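The new GPU path is exposed through the existing Python wrapper `ccpi.filters.regularisers.PD_TV`, which now dispatches to `TV_PD_GPU` when `device='gpu'` (see the `regularisers.py` and `gpu_regularisers.pyx` hunks below). The following is a minimal usage sketch, not part of the commit: the wrapper's full signature is only partially visible in the diff, so the argument order is assumed to mirror the Cython `TV_PD_GPU` call with a trailing device flag, and the default values are taken from the MATLAB mex defaults added in the same commit.

```python
# Hypothetical usage sketch; argument names/order are assumptions, check the
# PD_TV wrapper in ccpi/filters/regularisers.py before relying on them.
import numpy as np
from ccpi.filters.regularisers import PD_TV

noisy = np.random.rand(256, 256).astype(np.float32)   # input must be single precision

denoised, info = PD_TV(noisy,
                       0.05,      # regularisation parameter (lambda)
                       500,       # iterations
                       1e-6,      # tolerance
                       0,         # methodTV: 0 = 'iso', 1 = 'l1' (assumed encoding)
                       0,         # nonnegativity switch off
                       8.0,       # Lipschitz constant
                       0.0025,    # tau (primal step size)
                       'gpu')     # route to the new TV_PD_GPU implementation
# info[0] = number of iterations performed, info[1] = reached tolerance
```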
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5fe1a57..f93c19a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,4 +17,12 @@ if (BUILD_MATLAB_WRAPPER) endif() if (BUILD_PYTHON_WRAPPER) add_subdirectory(Python) -endif()
\ No newline at end of file +endif() +find_package(OpenMP REQUIRED) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index c1bd7bb..07aae3c 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -12,17 +12,6 @@ set (CIL_VERSION $ENV{CIL_VERSION} CACHE INTERNAL "Core Imaging Library version" message("CIL_VERSION ${CIL_VERSION}") #include (GenerateExportHeader) - -find_package(OpenMP) -if (OPENMP_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") - -endif() - ## Build the regularisers package as a library message("Creating Regularisers as a shared library") @@ -115,6 +104,7 @@ if (BUILD_CUDA) CUDA_ADD_LIBRARY(cilregcuda SHARED ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_ROF_GPU_core.cu ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_FGP_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_PD_GPU_core.cu ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_SB_GPU_core.cu ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/LLT_ROF_GPU_core.cu ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TGV_GPU_core.cu diff --git a/src/Core/regularisers_CPU/PD_TV_core.c b/src/Core/regularisers_CPU/PD_TV_core.c index 65b8711..534091b 100644 --- a/src/Core/regularisers_CPU/PD_TV_core.c +++ b/src/Core/regularisers_CPU/PD_TV_core.c @@ -81,6 +81,7 @@ float PDTV_CPU_main(float *Input, float *U, float *infovector, float lambdaPar, /* copy U to U_old */ copyIm(U, U_old, (long)(dimX), (long)(dimY), 1l); + /* calculate divergence */ DivProj2D(U, Input, P1, P2,(long)(dimX), (long)(dimY), lt, tau); /* check early stopping criteria */ diff --git a/src/Core/regularisers_GPU/TV_PD_GPU_core.cu b/src/Core/regularisers_GPU/TV_PD_GPU_core.cu new file mode 100644 index 0000000..59fda3b --- /dev/null +++ b/src/Core/regularisers_GPU/TV_PD_GPU_core.cu @@ -0,0 +1,522 @@ +/* +This work is part of the Core Imaging Library developed by +Visual Analytics and Imaging System Group of the Science Technology +Facilities Council, STFC + +Copyright 2017 Daniil Kazantsev +Copyright 2017 Srikanth Nagella, Edoardo Pasca + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "TV_PD_GPU_core.h" +#include "shared.h" +#include <thrust/functional.h> +#include <thrust/device_vector.h> +#include <thrust/transform_reduce.h> + +/* CUDA implementation of Primal-Dual TV [1] by Chambolle Pock denoising/regularization model (2D/3D case) + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambdaPar - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. lipschitz_const: convergence related parameter + * 6. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 7. nonneg: 'nonnegativity (0 is OFF by default, 1 is ON) + * 8. tau: time marching parameter + + * Output: + * [1] TV - Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * + * [1] Antonin Chambolle, Thomas Pock. "A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging", 2010 + */ + +#define BLKXSIZE2D 16 +#define BLKYSIZE2D 16 + +#define BLKXSIZE 8 +#define BLKYSIZE 8 +#define BLKZSIZE 8 + +#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) +// struct square { __host__ __device__ float operator()(float x) { return x * x; } }; + +/************************************************/ +/*****************2D modules*********************/ +/************************************************/ + +__global__ void dualPD_kernel(float *U, float *P1, float *P2, float sigma, int N, int M) +{ + + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + if (xIndex == N-1) P1[index] += sigma*(U[(xIndex-1) + N*yIndex] - U[index]); + else P1[index] += sigma*(U[(xIndex+1) + N*yIndex] - U[index]); + if (yIndex == M-1) P2[index] += sigma*(U[xIndex + N*(yIndex-1)] - U[index]); + else P2[index] += sigma*(U[xIndex + N*(yIndex+1)] - U[index]); + } + return; +} +__global__ void Proj_funcPD2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize) +{ + + float denom; + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + denom = pow(P1[index],2) + pow(P2[index],2); + if (denom > 1.0f) { + P1[index] = P1[index]/sqrt(denom); + P2[index] = P2[index]/sqrt(denom); + } + } + return; +} +__global__ void Proj_funcPD2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize) +{ + + float val1, val2; + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + val1 = abs(P1[index]); + val2 = abs(P2[index]); + if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + P1[index] = P1[index]/val1; + P2[index] = P2[index]/val2; + } + return; +} +__global__ void DivProj2D_kernel(float *U, float *Input, float *P1, float *P2, float lt, float tau, int N, int M) +{ + float P_v1, P_v2, div_var; + + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + if (xIndex == 0) P_v1 = -P1[index]; + else P_v1 = -(P1[index] - P1[(xIndex-1) + N*yIndex]); + if (yIndex == 0) P_v2 = -P2[index]; + else P_v2 = -(P2[index] - P2[xIndex + N*(yIndex-1)]); + div_var 
= P_v1 + P_v2; + U[index] = (U[index] - tau*div_var + lt*Input[index])/(1.0 + lt); + } + return; +} +__global__ void PDnonneg2D_kernel(float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + if (Output[index] < 0.0f) Output[index] = 0.0f; + } +} +/************************************************/ +/*****************3D modules*********************/ +/************************************************/ +__global__ void dualPD3D_kernel(float *U, float *P1, float *P2, float *P3, float sigma, int N, int M, int Z) +{ + + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + if (i == N-1) P1[index] += sigma*(U[(N*M)*k + (i-1) + N*j] - U[index]); + else P1[index] += sigma*(U[(N*M)*k + (i+1) + N*j] - U[index]); + if (j == M-1) P2[index] += sigma*(U[(N*M)*k + i + N*(j-1)] - U[index]); + else P2[index] += sigma*(U[(N*M)*k + i + N*(j+1)] - U[index]); + if (k == Z-1) P3[index] += sigma*(U[(N*M)*(k-1) + i + N*j] - U[index]); + else P3[index] += sigma*(U[(N*M)*(k+1) + i + N*j] - U[index]); + } + return; +} +__global__ void Proj_funcPD3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) +{ + + float denom,sq_denom; + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2); + if (denom > 1.0f) { + sq_denom = 1.0f/sqrt(denom); + P1[index] *= sq_denom; + P2[index] *= sq_denom; + P3[index] *= sq_denom; + } + } + return; +} +__global__ void Proj_funcPD3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) +{ + + float val1, val2, val3; + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + val1 = abs(P1[index]); + val2 = abs(P2[index]); + val3 = abs(P3[index]); + if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + if (val3 < 1.0f) {val3 = 1.0f;} + P1[index] /= val1; + P2[index] /= val2; + P3[index] /= val3; + } + return; +} +__global__ void DivProj3D_kernel(float *U, float *Input, float *P1, float *P2, float *P3, float lt, float tau, int N, int M, int Z) +{ + float P_v1, P_v2, P_v3, div_var; + + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + if (i == 0) P_v1 = -P1[index]; + else P_v1 = -(P1[index] - P1[(N*M)*k + (i-1) + N*j]); + if (j == 0) P_v2 = -P2[index]; + else P_v2 = -(P2[index] - P2[(N*M)*k + i + N*(j-1)]); + if (k == 0) P_v3 = -P3[index]; + else P_v3 = -(P3[index] - P3[(N*M)*(k-1) + i + N*j]); + div_var = P_v1 + P_v2 + P_v3; + U[index] = (U[index] - tau*div_var + lt*Input[index])/(1.0 + lt); + } + return; +} + +__global__ void PDnonneg3D_kernel(float* Output, int N, int M, int Z, int 
num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + if (Output[index] < 0.0f) Output[index] = 0.0f; + } +} +__global__ void PDcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + +__global__ void PDcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + +__global__ void getU2D_kernel(float *Input, float *Input_old, float theta, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Input[index] += theta*(Input[index] - Input_old[index]); + } +} + +__global__ void getU3D_kernel(float *Input, float *Input_old, float theta, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Input[index] += theta*(Input[index] - Input_old[index]); + } +} + +__global__ void PDResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + +__global__ void PDResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + + +/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ + +////////////MAIN HOST FUNCTION /////////////// +extern "C" int TV_PD_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, float lipschitz_const, int methodTV, int nonneg, float tau, int dimX, int dimY, int dimZ) +{ + int deviceCount = -1; // number of devices + cudaGetDeviceCount(&deviceCount); + if (deviceCount == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return -1; + } + int count = 0, i; + float re, sigma, theta, lt; + re = 0.0f; + + sigma = 1.0/(lipschitz_const*tau); + theta = 1.0f; + lt = tau/lambdaPar; + + if (dimZ <= 1) { + /*2D verson*/ + int ImSize = dimX*dimY; + float *d_input, *d_update, *d_old=NULL, *P1=NULL, *P2=NULL; + + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); + dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); + + /*allocate space for images on device*/ + checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_old,ImSize*sizeof(float)) 
); + checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + checkCudaErrors( cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(P1, 0, ImSize*sizeof(float)); + cudaMemset(P2, 0, ImSize*sizeof(float)); + + /********************** Run CUDA 2D kernel here ********************/ + /* The main kernel */ + for (i = 0; i < iter; i++) { + + /* computing the the dual P variable */ + dualPD_kernel<<<dimGrid,dimBlock>>>(d_update, P1, P2, sigma, dimX, dimY); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (nonneg != 0) { + PDnonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); } + + /* projection step */ + if (methodTV == 0) Proj_funcPD2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/ + else Proj_funcPD2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* copy U to U_old */ + PDcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_old, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* calculate divergence */ + DivProj2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, P1, P2, lt, tau, dimX, dimY); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if ((epsil != 0.0f) && (i % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + PDResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, P1, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + + getU2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, theta, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + //copy result matrix from device to host memory + cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + cudaFree(d_old); + cudaFree(P1); + cudaFree(P2); + + } + else { + /*3D verson*/ + int ImSize = dimX*dimY*dimZ; + float *d_input, *d_update, *d_old=NULL, *P1=NULL, *P2=NULL, *P3=NULL; + + dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); + dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); + + /*allocate space for images on device*/ + checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_old,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); + 
checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + checkCudaErrors( cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(P1, 0, ImSize*sizeof(float)); + cudaMemset(P2, 0, ImSize*sizeof(float)); + cudaMemset(P3, 0, ImSize*sizeof(float)); + /********************** Run CUDA 3D kernel here ********************/ + for (i = 0; i < iter; i++) { + + /* computing the the dual P variable */ + dualPD3D_kernel<<<dimGrid,dimBlock>>>(d_update, P1, P2, P3, sigma, dimX, dimY, dimZ); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (nonneg != 0) { + PDnonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); } + + /* projection step */ + if (methodTV == 0) Proj_funcPD3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /*isotropic TV*/ + else Proj_funcPD3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /*anisotropic TV*/ + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* copy U to U_old */ + PDcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_old, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* calculate divergence */ + DivProj3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, P1, P2, P3, lt, tau, dimX, dimY, dimZ); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if ((epsil != 0.0f) && (i % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + PDResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, P1, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + + /* get U*/ + getU3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, theta, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + /***************************************************************/ + //copy result matrix from device to host memory + cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + cudaFree(d_old); + cudaFree(P1); + cudaFree(P2); + cudaFree(P3); + + } + //cudaDeviceReset(); + /*adding info into info_vector */ + infovector[0] = (float)(i); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; +} diff --git a/src/Core/regularisers_GPU/TV_PD_GPU_core.h b/src/Core/regularisers_GPU/TV_PD_GPU_core.h new file mode 100644 index 0000000..2b123d9 --- /dev/null +++ b/src/Core/regularisers_GPU/TV_PD_GPU_core.h @@ -0,0 +1,9 @@ +#ifndef _TV_PD_GPU_ +#define _TV_PD_GPU_ + +#include "CCPiDefines.h" +#include <memory.h> + +extern "C" 
CCPI_EXPORT int TV_PD_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, float lipschitz_const, int methodTV, int nonneg, float tau, int dimX, int dimY, int dimZ); + +#endif diff --git a/src/Matlab/mex_compile/compileGPU_mex.m b/src/Matlab/mex_compile/compileGPU_mex.m index 56fcd38..577630f 100644 --- a/src/Matlab/mex_compile/compileGPU_mex.m +++ b/src/Matlab/mex_compile/compileGPU_mex.m @@ -41,6 +41,11 @@ fprintf('%s \n', 'Compiling SB-TV...'); mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu SB_TV_GPU.cpp TV_SB_GPU_core.o movefile('SB_TV_GPU.mex*',Pathmove); +fprintf('%s \n', 'Compiling PD-TV...'); +!/usr/local/cuda/bin/nvcc -O0 -c TV_PD_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/ +mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu PD_TV_GPU.cpp TV_PD_GPU_core.o +movefile('PD_TV_GPU.mex*',Pathmove); + fprintf('%s \n', 'Compiling TGV...'); !/usr/local/cuda/bin/nvcc -O0 -c TGV_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/ mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcufft -lmwgpu TGV_GPU.cpp TGV_GPU_core.o @@ -72,6 +77,7 @@ mex -g -I/usr/local/cuda-10.0/include -L/usr/local/cuda-10.0/lib64 -lcudart -lcu movefile('PatchSelect_GPU.mex*',Pathmove); delete TV_ROF_GPU_core* TV_FGP_GPU_core* TV_SB_GPU_core* dTV_FGP_GPU_core* NonlDiff_GPU_core* Diffus_4thO_GPU_core* TGV_GPU_core* LLT_ROF_GPU_core* CCPiDefines.h +delete TV_PD_GPU_core* delete PatchSelect_GPU_core* Nonlocal_TV_core* shared.h fprintf('%s \n', 'All successfully compiled!'); diff --git a/src/Matlab/mex_compile/regularisers_CPU/PD_TV.c b/src/Matlab/mex_compile/regularisers_CPU/PD_TV.c index eac2d18..e5ab1e4 100644 --- a/src/Matlab/mex_compile/regularisers_CPU/PD_TV.c +++ b/src/Matlab/mex_compile/regularisers_CPU/PD_TV.c @@ -30,6 +30,7 @@ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) * 6. nonneg: 'nonnegativity (0 is OFF by default, 1 is ON) * 7. lipschitz_const: convergence related parameter + * 8. 
tau: convergence related parameter * Output: * [1] TV - Filtered/regularized image/volume @@ -45,7 +46,7 @@ void mexFunction( int number_of_dims, iter, methTV, nonneg; mwSize dimX, dimY, dimZ; const mwSize *dim_array; - float *Input, *infovec=NULL, *Output=NULL, lambda, epsil, lipschitz_const; + float *Input, *infovec=NULL, *Output=NULL, lambda, epsil, lipschitz_const, tau; number_of_dims = mxGetNumberOfDimensions(prhs[0]); dim_array = mxGetDimensions(prhs[0]); @@ -55,29 +56,30 @@ void mexFunction( Input = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */ lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ - iter = 400; /* default iterations number */ + iter = 500; /* default iterations number */ epsil = 1.0e-06; /* default tolerance constant */ methTV = 0; /* default isotropic TV penalty */ nonneg = 0; /* default nonnegativity switch, off - 0 */ - lipschitz_const = 12.0; /* lipschitz_const */ + lipschitz_const = 8.0; /* lipschitz_const */ + tau = 0.0025; /* tau convergence const */ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } - if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ - if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ - if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7)) { + if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ + if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) { char *penalty_type; penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ mxFree(penalty_type); } - if ((nrhs == 6) || (nrhs == 7)) { + if ((nrhs == 6) || (nrhs == 7) || (nrhs == 8)) { nonneg = (int) mxGetScalar(prhs[5]); if ((nonneg != 0) && (nonneg != 1)) mexErrMsgTxt("Nonnegativity constraint can be enabled by choosing 1 or off - 0"); } - if (nrhs == 7) lipschitz_const = (float) mxGetScalar(prhs[6]); - + if ((nrhs == 7) || (nrhs == 8)) lipschitz_const = (float) mxGetScalar(prhs[6]); + if (nrhs == 8) tau = (float) mxGetScalar(prhs[7]); /*Handling Matlab output data*/ dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; @@ -94,5 +96,5 @@ void mexFunction( infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); /* running the function */ - PDTV_CPU_main(Input, Output, infovec, lambda, iter, epsil, lipschitz_const, methTV, nonneg, dimX, dimY, dimZ); + PDTV_CPU_main(Input, Output, infovec, lambda, iter, epsil, lipschitz_const, methTV, nonneg, tau, dimX, dimY, dimZ); } diff --git a/src/Matlab/mex_compile/regularisers_GPU/PD_TV_GPU.cpp b/src/Matlab/mex_compile/regularisers_GPU/PD_TV_GPU.cpp new file mode 100644 index 0000000..e853dd3 --- /dev/null +++ b/src/Matlab/mex_compile/regularisers_GPU/PD_TV_GPU.cpp @@ -0,0 +1,101 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2019 Daniil Kazantsev 
+ * Copyright 2019 Srikanth Nagella, Edoardo Pasca + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "matrix.h" +#include "mex.h" +#include "TV_PD_GPU_core.h" + +/* GPU implementation of Primal-Dual TV [1] by Chambolle Pock denoising/regularization model (2D/3D case) + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambdaPar - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 6. nonneg: 'nonnegativity (0 is OFF by default, 1 is ON) + * 7. lipschitz_const: convergence related parameter + * 8. tau: convergence related parameter + + * Output: + * [1] TV - Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * + * [1] Antonin Chambolle, Thomas Pock. "A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging", 2010 + */ + +void mexFunction( + int nlhs, mxArray *plhs[], + int nrhs, const mxArray *prhs[]) + +{ + int number_of_dims, iter, methTV, nonneg; + mwSize dimX, dimY, dimZ; + const mwSize *dim_array; + float *Input, *infovec=NULL, *Output=NULL, lambda, epsil, lipschitz_const, tau; + + number_of_dims = mxGetNumberOfDimensions(prhs[0]); + dim_array = mxGetDimensions(prhs[0]); + + /*Handling Matlab input data*/ + if ((nrhs < 2) || (nrhs > 7)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1'), nonnegativity switch, lipschitz_const"); + + Input = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */ + lambda = (float) mxGetScalar(prhs[1]); /* regularization parameter */ + iter = 500; /* default iterations number */ + epsil = 1.0e-06; /* default tolerance constant */ + methTV = 0; /* default isotropic TV penalty */ + nonneg = 0; /* default nonnegativity switch, off - 0 */ + lipschitz_const = 8.0; /* lipschitz_const */ + tau = 0.0025; /* tau convergence const */ + + if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); } + + if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) iter = (int) mxGetScalar(prhs[2]); /* iterations number */ + if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) epsil = (float) mxGetScalar(prhs[3]); /* tolerance constant */ + if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7) || (nrhs == 8)) { + char *penalty_type; + penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */ + if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',"); + if (strcmp(penalty_type, "l1") == 0) methTV = 1; /* enable 'l1' penalty */ + mxFree(penalty_type); + } + if ((nrhs == 6) || (nrhs == 7) || (nrhs == 8)) { + nonneg = (int) mxGetScalar(prhs[5]); + if ((nonneg != 0) && (nonneg != 1)) mexErrMsgTxt("Nonnegativity constraint can be enabled by choosing 1 or off 
- 0"); + } + if ((nrhs == 7) || (nrhs == 8)) lipschitz_const = (float) mxGetScalar(prhs[6]); + if (nrhs == 8) tau = (float) mxGetScalar(prhs[7]); + + /*Handling Matlab output data*/ + dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2]; + + if (number_of_dims == 2) { + dimZ = 1; /*2D case*/ + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); + } + if (number_of_dims == 3) { + Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL)); + } + mwSize vecdim[1]; + vecdim[0] = 2; + infovec = (float*)mxGetPr(plhs[1] = mxCreateNumericArray(1, vecdim, mxSINGLE_CLASS, mxREAL)); + + /* running the function */ + TV_PD_GPU_main(Input, Output, infovec, lambda, iter, epsil, lipschitz_const, methTV, nonneg, tau, dimX, dimY, dimZ); +} diff --git a/src/Python/ccpi/filters/regularisers.py b/src/Python/ccpi/filters/regularisers.py index bc745fe..5f4001a 100644 --- a/src/Python/ccpi/filters/regularisers.py +++ b/src/Python/ccpi/filters/regularisers.py @@ -4,7 +4,7 @@ script which assigns a proper device core function based on a flag ('cpu' or 'gp from ccpi.filters.cpu_regularisers import TV_ROF_CPU, TV_FGP_CPU, TV_PD_CPU, TV_SB_CPU, dTV_FGP_CPU, TNV_CPU, NDF_CPU, Diff4th_CPU, TGV_CPU, LLT_ROF_CPU, PATCHSEL_CPU, NLTV_CPU try: - from ccpi.filters.gpu_regularisers import TV_ROF_GPU, TV_FGP_GPU, TV_SB_GPU, dTV_FGP_GPU, NDF_GPU, Diff4th_GPU, TGV_GPU, LLT_ROF_GPU, PATCHSEL_GPU + from ccpi.filters.gpu_regularisers import TV_ROF_GPU, TV_FGP_GPU, TV_PD_GPU, TV_SB_GPU, dTV_FGP_GPU, NDF_GPU, Diff4th_GPU, TGV_GPU, LLT_ROF_GPU, PATCHSEL_GPU gpu_enabled = True except ImportError: gpu_enabled = False @@ -64,7 +64,7 @@ def PD_TV(inputData, regularisation_parameter, iterations, lipschitz_const, tau) elif device == 'gpu' and gpu_enabled: - return TV_PD_CPU(inputData, + return TV_PD_GPU(inputData, regularisation_parameter, iterations, tolerance_param, diff --git a/src/Python/setup-regularisers.py.in b/src/Python/setup-regularisers.py.in index 9a5b693..c4ad143 100644 --- a/src/Python/setup-regularisers.py.in +++ b/src/Python/setup-regularisers.py.in @@ -39,6 +39,7 @@ extra_include_dirs += [os.path.join(".." , "Core"), os.path.join(".." , "Core", "inpainters_CPU"), os.path.join(".." , "Core", "regularisers_GPU" , "TV_FGP" ) , os.path.join(".." , "Core", "regularisers_GPU" , "TV_ROF" ) , + os.path.join(".." , "Core", "regularisers_GPU" , "TV_PD" ) , os.path.join(".." , "Core", "regularisers_GPU" , "TV_SB" ) , os.path.join(".." , "Core", "regularisers_GPU" , "TGV" ) , os.path.join(".." 
, "Core", "regularisers_GPU" , "LLTROF" ) , diff --git a/src/Python/src/gpu_regularisers.pyx b/src/Python/src/gpu_regularisers.pyx index 8cd8c93..b22d15e 100644 --- a/src/Python/src/gpu_regularisers.pyx +++ b/src/Python/src/gpu_regularisers.pyx @@ -22,6 +22,7 @@ CUDAErrorMessage = 'CUDA error' cdef extern int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float *lambdaPar, int lambda_is_arr, int iter, float tau, float epsil, int N, int M, int Z); cdef extern int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int N, int M, int Z); +cdef extern int TV_PD_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, float lipschitz_const, int methodTV, int nonneg, float tau, int dimX, int dimY, int dimZ); cdef extern int TV_SB_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int N, int M, int Z); cdef extern int LLT_ROF_GPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int N, int M, int Z); cdef extern int TGV_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, float epsil, int dimX, int dimY, int dimZ); @@ -70,6 +71,75 @@ def TV_FGP_GPU(inputData, tolerance_param, methodTV, nonneg) +# Total-variation Primal-Dual (PD) +def TV_PD_GPU(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, lipschitz_const, tau): + if inputData.ndim == 2: + return TVPD2D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, lipschitz_const, tau) + elif inputData.ndim == 3: + return TVPD3D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, lipschitz_const, tau) + +def TVPD2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, + float regularisation_parameter, + int iterationsNumb, + float tolerance_param, + int methodTV, + int nonneg, + float lipschitz_const, + float tau): + + cdef long dims[2] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ + np.zeros([dims[0],dims[1]], dtype='float32') + + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + + if (TV_PD_GPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], regularisation_parameter, + iterationsNumb, + tolerance_param, + lipschitz_const, + methodTV, + nonneg, + tau, + dims[1],dims[0], 1) ==0): + return (outputData,infovec) + else: + raise ValueError(CUDAErrorMessage); + +def TVPD3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + float regularisation_parameter, + int iterationsNumb, + float tolerance_param, + int methodTV, + int nonneg, + float lipschitz_const, + float tau): + + cdef long dims[3] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + dims[2] = inputData.shape[2] + + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ + np.zeros([dims[0], dims[1], dims[2]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + + if (TV_PD_GPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], regularisation_parameter, + iterationsNumb, + tolerance_param, + lipschitz_const, + methodTV, + nonneg, + tau, + dims[2], dims[1], dims[0]) ==0): + return (outputData,infovec) + else: + raise 
ValueError(CUDAErrorMessage); + # Total-variation Split Bregman (SB) def TV_SB_GPU(inputData, regularisation_parameter, @@ -195,7 +265,7 @@ def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, if isinstance (regularisation_parameter, np.ndarray): reg = regularisation_parameter.copy() # Running CUDA code here - if (TV_ROF_GPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], + if (TV_ROF_GPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], ®[0,0], 1, iterations, time_marching_parameter, |
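For reference, the per-iteration kernel sequence added in `TV_PD_GPU_core.cu` (dual ascent, projection of the dual variables, saving the previous primal iterate, the divergence/data proximal step, and the theta over-relaxation) corresponds to the NumPy sketch below for the 2D isotropic case. This is an illustrative re-implementation, not part of the commit; the nonnegativity option and the Thrust-based stopping test are omitted.

```python
# NumPy sketch of one PD-TV iteration as performed by the 2D CUDA kernels
# dualPD_kernel -> Proj_funcPD2D_iso_kernel -> PDcopy_kernel2D ->
# DivProj2D_kernel -> getU2D_kernel.
import numpy as np

def pd_tv_2d_reference(noisy, lam, iterations=500, lipschitz_const=8.0,
                       tau=0.0025, theta=1.0):
    f = np.asarray(noisy, dtype=np.float32)
    u = f.copy()
    p1 = np.zeros_like(u)          # dual variable, differences along the fast (x) axis
    p2 = np.zeros_like(u)          # dual variable, differences along the slow (y) axis
    sigma = 1.0 / (lipschitz_const * tau)
    lt = tau / lam
    for _ in range(iterations):
        # dual ascent: forward differences, backward difference at the far edge
        dx = np.empty_like(u); dy = np.empty_like(u)
        dx[:, :-1] = u[:, 1:] - u[:, :-1]; dx[:, -1] = u[:, -2] - u[:, -1]
        dy[:-1, :] = u[1:, :] - u[:-1, :]; dy[-1, :] = u[-2, :] - u[-1, :]
        p1 += sigma * dx
        p2 += sigma * dy
        # isotropic projection of (p1, p2) onto the unit ball
        denom = p1 * p1 + p2 * p2
        scale = np.where(denom > 1.0, np.sqrt(denom), 1.0)
        p1 /= scale
        p2 /= scale
        # keep the previous primal iterate
        u_old = u.copy()
        # negative divergence of the dual field, then the proximal data step
        div1 = np.empty_like(p1); div2 = np.empty_like(p2)
        div1[:, 0] = -p1[:, 0]; div1[:, 1:] = -(p1[:, 1:] - p1[:, :-1])
        div2[0, :] = -p2[0, :]; div2[1:, :] = -(p2[1:, :] - p2[:-1, :])
        u = (u - tau * (div1 + div2) + lt * f) / (1.0 + lt)
        # over-relaxation of the primal variable
        u = u + theta * (u - u_old)
    return u
```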