From bb86cf3cb44fa66a2def258d346ebb68fe14ed61 Mon Sep 17 00:00:00 2001
From: Daniil Kazantsev <dkazanc@hotmail.com>
Date: Mon, 9 Apr 2018 09:38:35 +0100
Subject: fixes a memory leak in FGP-TV(CPU)#43, matlab CPU/GPU wrappers and
 demos

---
 Core/CMakeLists.txt                                |   4 +-
 Core/regularizers_CPU/FGP_TV_core.c                |  38 +-
 Core/regularizers_CPU/ROF_TV_core.c                |  14 +-
 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu    | 561 ---------------------
 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h     |  10 -
 Core/regularizers_GPU/TV_FGP_GPU_core.cu           | 561 +++++++++++++++++++++
 Core/regularizers_GPU/TV_FGP_GPU_core.h            |  10 +
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu    | 368 --------------
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h     |   8 -
 Core/regularizers_GPU/TV_ROF_GPU_core.cu           | 368 ++++++++++++++
 Core/regularizers_GPU/TV_ROF_GPU_core.h            |   8 +
 Readme.md                                          |  30 +-
 Wrappers/Matlab/demos/demoMatlab_denoise.m         |  38 ++
 Wrappers/Matlab/demos/demoMatlab_denoise.m~        |  31 ++
 Wrappers/Matlab/mex_compile/compileCPU_mex.m       |  19 +
 Wrappers/Matlab/mex_compile/compileGPU_mex.m       |  30 ++
 Wrappers/Matlab/mex_compile/compile_mex.m          |  19 -
 .../Matlab/mex_compile/regularizers_CPU/FGP_TV.c   |  96 ++--
 .../mex_compile/regularizers_CPU/LLT_model.c       | 169 -------
 .../regularizers_CPU/PatchBased_Regul.c            | 140 -----
 .../Matlab/mex_compile/regularizers_CPU/ROF_TV.c   |  73 +--
 .../mex_compile/regularizers_CPU/SplitBregman_TV.c | 179 -------
 .../Matlab/mex_compile/regularizers_CPU/TGV_PD.c   | 144 ------
 .../Diffus_HO/Diff4thHajiaboli_GPU.cpp             | 125 -----
 .../mex_compile/regularizers_GPU/FGP_TV_GPU.cpp    |  95 ++++
 .../regularizers_GPU/NL_Regul/NLM_GPU.cpp          | 171 -------
 .../mex_compile/regularizers_GPU/ROF_TV_GPU.cpp    |  73 +++
 27 files changed, 1348 insertions(+), 2034 deletions(-)
 delete mode 100755 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
 delete mode 100755 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
 create mode 100755 Core/regularizers_GPU/TV_FGP_GPU_core.cu
 create mode 100755 Core/regularizers_GPU/TV_FGP_GPU_core.h
 delete mode 100755 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu
 delete mode 100755 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h
 create mode 100755 Core/regularizers_GPU/TV_ROF_GPU_core.cu
 create mode 100755 Core/regularizers_GPU/TV_ROF_GPU_core.h
 create mode 100644 Wrappers/Matlab/demos/demoMatlab_denoise.m
 create mode 100644 Wrappers/Matlab/demos/demoMatlab_denoise.m~
 create mode 100644 Wrappers/Matlab/mex_compile/compileCPU_mex.m
 create mode 100644 Wrappers/Matlab/mex_compile/compileGPU_mex.m
 delete mode 100644 Wrappers/Matlab/mex_compile/compile_mex.m
 delete mode 100644 Wrappers/Matlab/mex_compile/regularizers_CPU/LLT_model.c
 delete mode 100644 Wrappers/Matlab/mex_compile/regularizers_CPU/PatchBased_Regul.c
 delete mode 100644 Wrappers/Matlab/mex_compile/regularizers_CPU/SplitBregman_TV.c
 delete mode 100644 Wrappers/Matlab/mex_compile/regularizers_CPU/TGV_PD.c
 delete mode 100644 Wrappers/Matlab/mex_compile/regularizers_GPU/Diffus_HO/Diff4thHajiaboli_GPU.cpp
 create mode 100644 Wrappers/Matlab/mex_compile/regularizers_GPU/FGP_TV_GPU.cpp
 delete mode 100644 Wrappers/Matlab/mex_compile/regularizers_GPU/NL_Regul/NLM_GPU.cpp
 create mode 100644 Wrappers/Matlab/mex_compile/regularizers_GPU/ROF_TV_GPU.cpp

diff --git a/Core/CMakeLists.txt b/Core/CMakeLists.txt
index f31454d..f53c538 100644
--- a/Core/CMakeLists.txt
+++ b/Core/CMakeLists.txt
@@ -127,8 +127,8 @@ if (CUDA_FOUND)
   set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES")
   message("CUDA FLAGS ${CUDA_NVCC_FLAGS}")
   CUDA_ADD_LIBRARY(cilregcuda SHARED
-	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu
-	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
+	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/TV_ROF_GPU_core.cu
+	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/TV_FGP_GPU_core.cu
   )
   if (UNIX)
     message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib")
diff --git a/Core/regularizers_CPU/FGP_TV_core.c b/Core/regularizers_CPU/FGP_TV_core.c
index 7a8ce48..462c3f7 100644
--- a/Core/regularizers_CPU/FGP_TV_core.c
+++ b/Core/regularizers_CPU/FGP_TV_core.c
@@ -50,13 +50,13 @@ float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iteratio
 		float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
 		DimTotal = dimX*dimY;
 		
-		Output_prev = (float *) malloc( DimTotal * sizeof(float) );
-		P1 = (float *) malloc( DimTotal * sizeof(float) );
-		P2 = (float *) malloc( DimTotal * sizeof(float) );
-		P1_prev = (float *) malloc( DimTotal * sizeof(float) );
-		P2_prev = (float *) malloc( DimTotal * sizeof(float) );
-		R1 = (float *) malloc( DimTotal * sizeof(float) );
-		R2 = (float *) malloc( DimTotal * sizeof(float) );
+        Output_prev = calloc(DimTotal, sizeof(float));
+        P1 = calloc(DimTotal, sizeof(float));
+        P2 = calloc(DimTotal, sizeof(float));
+        P1_prev = calloc(DimTotal, sizeof(float));
+        P2_prev = calloc(DimTotal, sizeof(float));        
+        R1 = calloc(DimTotal, sizeof(float));
+        R2 = calloc(DimTotal, sizeof(float)); 
 		
 		/* begin iterations */
         for(ll=0; ll<iterationsNumb; ll++) {
@@ -100,18 +100,18 @@ float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iteratio
 	else {
 		/*3D case*/
 		float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL;		
-		DimTotal = dimX*dimY*dimZ;
-		
-		Output_prev = (float *) malloc( DimTotal * sizeof(float) );
-		P1 = (float *) malloc( DimTotal * sizeof(float) );
-		P2 = (float *) malloc( DimTotal * sizeof(float) );
-		P3 = (float *) malloc( DimTotal * sizeof(float) );
-		P1_prev = (float *) malloc( DimTotal * sizeof(float) );
-		P2_prev = (float *) malloc( DimTotal * sizeof(float) );
-		P3_prev = (float *) malloc( DimTotal * sizeof(float) );
-		R1 = (float *) malloc( DimTotal * sizeof(float) );
-		R2 = (float *) malloc( DimTotal * sizeof(float) );
-		R3 = (float *) malloc( DimTotal * sizeof(float) );
+		DimTotal = dimX*dimY*dimZ;        
+        
+        Output_prev = calloc(DimTotal, sizeof(float));
+        P1 = calloc(DimTotal, sizeof(float));
+        P2 = calloc(DimTotal, sizeof(float));
+        P3 = calloc(DimTotal, sizeof(float));
+        P1_prev = calloc(DimTotal, sizeof(float));
+        P2_prev = calloc(DimTotal, sizeof(float));        
+        P3_prev = calloc(DimTotal, sizeof(float));        
+        R1 = calloc(DimTotal, sizeof(float));
+        R2 = calloc(DimTotal, sizeof(float)); 
+        R3 = calloc(DimTotal, sizeof(float)); 
 		
 		    /* begin iterations */
         for(ll=0; ll<iterationsNumb; ll++) {
diff --git a/Core/regularizers_CPU/ROF_TV_core.c b/Core/regularizers_CPU/ROF_TV_core.c
index a4f82a6..9ffb905 100644
--- a/Core/regularizers_CPU/ROF_TV_core.c
+++ b/Core/regularizers_CPU/ROF_TV_core.c
@@ -50,13 +50,11 @@ float TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar, int iteratio
 {
     float *D1, *D2, *D3;
     int i, DimTotal;
-    DimTotal = dimX*dimY*dimZ;
+    DimTotal = dimX*dimY*dimZ;    
     
-      
-    D1 = (float *) malloc( DimTotal * sizeof(float) );
-    D2 = (float *) malloc( DimTotal * sizeof(float) );   
-    D3 = (float *) malloc( DimTotal * sizeof(float) );   
-	
+    D1 = calloc(DimTotal, sizeof(float));
+    D2 = calloc(DimTotal, sizeof(float));
+    D3 = calloc(DimTotal, sizeof(float));
 	   
     /* copy into output */
     copyIm(Input, Output, dimX, dimY, dimZ);
@@ -268,7 +266,7 @@ float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambd
                     dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2];
                     dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i];
                     
-                    B[index] = B[index] + tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (B[index] - A[index]));   
+                    B[index] = B[index] + tau*(lambda*(dv1 + dv2 + dv3) - (B[index] - A[index]));   
                 }}}		
     }
     else {
@@ -286,7 +284,7 @@ float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambd
                 dv1 = D1[index] - D1[j2*dimX + i];
                 dv2 = D2[index] - D2[j*dimX + i2];                
 
-                B[index] =  B[index] + tau*(2.0f*lambda*(dv1 + dv2) - (B[index] - A[index]));                
+                B[index] =  B[index] + tau*(lambda*(dv1 + dv2) - (B[index] - A[index]));                
             }}
     }
     return *B;
diff --git a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
deleted file mode 100755
index 61097fb..0000000
--- a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
+++ /dev/null
@@ -1,561 +0,0 @@
- /*
-This work is part of the Core Imaging Library developed by
-Visual Analytics and Imaging System Group of the Science Technology
-Facilities Council, STFC
-
-Copyright 2017 Daniil Kazantsev
-Copyright 2017 Srikanth Nagella, Edoardo Pasca
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ 
-
-#include "TV_FGP_GPU_core.h"
-#include <thrust/device_vector.h>
-#include <thrust/transform_reduce.h>
-
-/* CUDA implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
- *
- * Input Parameters:
- * 1. Noisy image/volume 
- * 2. lambdaPar - regularization parameter 
- * 3. Number of iterations
- * 4. eplsilon: tolerance constant 
- * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
- * 6. nonneg: 'nonnegativity (0 is OFF by default) 
- * 7. print information: 0 (off) or 1 (on) 
- *
- * Output:
- * [1] Filtered/regularized image
- *
- * This function is based on the Matlab's code and paper by
- * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
- */
-
-// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
-#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)
-
-inline void __checkCudaErrors(cudaError err, const char *file, const int line)
-{
-    if (cudaSuccess != err)
-    {
-        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
-                file, line, (int)err, cudaGetErrorString(err));
-        exit(EXIT_FAILURE);
-    }
-}
-
-#define BLKXSIZE2D 16
-#define BLKYSIZE2D 16
-
-#define BLKXSIZE 8
-#define BLKYSIZE 8
-#define BLKZSIZE 8
-
-#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
-struct square { __host__ __device__ float operator()(float x) { return x * x; } };
-
-/************************************************/
-/*****************2D modules*********************/
-/************************************************/
-__global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda)
-{
-    
-    float val1,val2;
-    
-    //calculate each thread global index
-    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
-    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
-    
-    int index = xIndex + N*yIndex; 
-    
-    if ((xIndex < N) && (yIndex < M)) {        
-        if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];}
-        if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];}
-        //Write final result to global memory
-        D[index] = Ad[index] - lambda*(R1[index] + R2[index] - val1 - val2);
-    }
-    return;
-}
-
-__global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, int N, int M, int ImSize, float multip)
-{
-    
-    float val1,val2;
-    
-    //calculate each thread global index
-    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
-    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if ((xIndex < N) && (yIndex < M)) {        
-        
-        /* boundary conditions */
-        if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex];
-        if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)];
-        
-        //Write final result to global memory
-        P1[index] = R1[index] + multip*val1;
-        P2[index] = R2[index] + multip*val2;
-    }
-    return;
-}
-
-__global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize)
-{
-    
-    float denom;    
-    //calculate each thread global index
-    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
-    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if ((xIndex < N) && (yIndex < M)) { 
-        denom = pow(P1[index],2) +  pow(P2[index],2);        
-        if (denom > 1.0f) {
-            P1[index] = P1[index]/sqrt(denom);
-            P2[index] = P2[index]/sqrt(denom);
-        }
-    }
-    return;
-}
-__global__ void Proj_func2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize)
-{
-    
-    float val1, val2;    
-    //calculate each thread global index
-    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
-    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if ((xIndex < N) && (yIndex < M)) { 
-                val1 = abs(P1[index]);
-                val2 = abs(P2[index]);
-                if (val1 < 1.0f) {val1 = 1.0f;}
-                if (val2 < 1.0f) {val2 = 1.0f;}
-                P1[index] = P1[index]/val1;
-                P2[index] = P2[index]/val2;
-    }
-    return;
-}
-__global__ void Rupd_func2D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, float multip2, int N, int M, int ImSize)
-{
-    //calculate each thread global index
-    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
-    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if ((xIndex < N) && (yIndex < M)) { 
-        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
-        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
-    }
-    return;
-}
-__global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total)
-{
-    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
-    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if (index < num_total)	{
-        if (Output[index] < 0.0f) Output[index] = 0.0f;
-    }
-}
-__global__ void copy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
-{
-    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
-    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if (index < num_total)	{
-        Output[index] = Input[index];
-    }
-}
-__global__ void ResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
-{
-    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
-    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if (index < num_total)	{
-        Output[index] = Input1[index] - Input2[index];
-    }
-}   
-/************************************************/
-/*****************3D modules*********************/
-/************************************************/
-__global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda)
-{
-    
-    float val1,val2,val3;
-    
-    //calculate each thread global index
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if ((i < N) && (j < M) && (k < Z)) {      
-        if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];}
-        if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];}
-        if (k <= 0) {val3 = 0.0f;} else {val3 = R3[(N*M)*(k-1) + i + N*j];}
-        //Write final result to global memory
-        D[index] = Ad[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3);
-    }
-    return;
-}
-
-__global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float multip)
-{
-    
-    float val1,val2,val3;
-    
-    //calculate each thread global index
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if ((i < N) && (j < M) && (k <  Z)) {       
-        /* boundary conditions */
-        if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j];
-        if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)];
-        if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j];
-        
-        //Write final result to global memory
-        P1[index] = R1[index] + multip*val1;
-        P2[index] = R2[index] + multip*val2;
-        P3[index] = R3[index] + multip*val3;
-    }
-    return;
-}
-
-__global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
-{
-    
-    float denom,sq_denom;    
-    //calculate each thread global index
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if ((i < N) && (j < M) && (k <  Z)) {
-        denom = pow(P1[index],2) +  pow(P2[index],2) + pow(P3[index],2);
-        
-        if (denom > 1.0f) {
-            sq_denom = 1.0f/sqrt(denom);
-            P1[index] = P1[index]*sq_denom;
-            P2[index] = P2[index]*sq_denom;
-            P3[index] = P3[index]*sq_denom;
-        }
-    }
-    return;
-}
-
-__global__ void Proj_func3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
-{
-    
-    float val1, val2, val3;    
-    //calculate each thread global index
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if ((i < N) && (j < M) && (k <  Z)) {
-                val1 = abs(P1[index]);
-                val2 = abs(P2[index]);
-                val3 = abs(P3[index]);
-                if (val1 < 1.0f) {val1 = 1.0f;}
-                if (val2 < 1.0f) {val2 = 1.0f;}
-                if (val3 < 1.0f) {val3 = 1.0f;}
-                P1[index] = P1[index]/val1;
-                P2[index] = P2[index]/val2;
-                P3[index] = P3[index]/val3;
-    }
-    return;
-}
-
-
-__global__ void Rupd_func3D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, float multip2, int N, int M, int Z, int ImSize)
-{
-    //calculate each thread global index
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if ((i < N) && (j < M) && (k <  Z)) { 
-        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
-        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
-        R3[index] = P3[index] + multip2*(P3[index] - P3_old[index]);
-    }
-    return;
-}
-
-__global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_total)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if (index < num_total)	{
-        if (Output[index] < 0.0f) Output[index] = 0.0f;
-    }
-}
-
-__global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total)
-{
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if (index < num_total)	{
-        Output[index] = Input[index];
-    }
-}
-/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
-
-////////////MAIN HOST FUNCTION ///////////////
-extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
-{
-    int deviceCount = -1; // number of devices
-    cudaGetDeviceCount(&deviceCount);
-    if (deviceCount == 0) {
-        fprintf(stderr, "No CUDA devices found\n");
-        return;
-    }
-    
-    int count = 0, i;
-    float re, multip,multip2;    
-	float tk = 1.0f;
-    float tkp1=1.0f;
-        
-    if (dimZ <= 1) {
-		/*2D verson*/
-		int ImSize = dimX*dimY;    
-		float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
-   
-		dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
-		dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
-    
-		/*allocate space for images on device*/
-		checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
-		if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
-    
-        checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
-        cudaMemset(P1, 0, ImSize*sizeof(float));
-        cudaMemset(P2, 0, ImSize*sizeof(float));
-        cudaMemset(P1_prev, 0, ImSize*sizeof(float));
-        cudaMemset(P2_prev, 0, ImSize*sizeof(float));
-        cudaMemset(R1, 0, ImSize*sizeof(float));
-        cudaMemset(R2, 0, ImSize*sizeof(float));
-
-        /********************** Run CUDA 2D kernel here ********************/    
-        multip = (1.0f/(8.0f*lambdaPar));
-    
-        /* The main kernel */
-        for (i = 0; i < iter; i++) {
-        
-            /* computing the gradient of the objective function */
-            Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambdaPar);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-            
-            if (nonneg != 0) {
-            nonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() ); }
-                    
-            /*Taking a step towards minus of the gradient*/
-            Grad_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, d_update, R1, R2, dimX, dimY, ImSize, multip);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            /* projection step */
-            if (methodTV == 0) Proj_func2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/
-            else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/            
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-            multip2 = ((tk-1.0f)/tkp1);
-        
-            Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            if (epsil != 0.0f) {
-                /* calculate norm - stopping rules using the Thrust library */					
-                ResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
-                checkCudaErrors( cudaDeviceSynchronize() );
-                checkCudaErrors(cudaPeekAtLastError() );               
-                
-                thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize);  		
-                float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));		
-                thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);  		
-                float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
-                    
-                re = (reduction/reduction2);      
-                if (re < epsil)  count++;
-                    if (count > 4) break;       
-             
-                copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
-                checkCudaErrors( cudaDeviceSynchronize() );
-                checkCudaErrors(cudaPeekAtLastError() );                                              
-            }                  
-        
-            copy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            copy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );       
- 
-            tk = tkp1;
-        }
-        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
-            /***************************************************************/    
-            //copy result matrix from device to host memory
-            cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
-    
-            cudaFree(d_input);
-            cudaFree(d_update);
-            if (epsil != 0.0f) cudaFree(d_update_prev);
-            cudaFree(P1);
-            cudaFree(P2);
-            cudaFree(P1_prev);
-            cudaFree(P2_prev);
-            cudaFree(R1);
-            cudaFree(R2);
-    }
-    else {
-            /*3D verson*/
-            int ImSize = dimX*dimY*dimZ;    
-            float *d_input, *d_update=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL;
-   
-            dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
-            dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE));
-    
-            /*allocate space for images on device*/
-            checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );            
-            checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&P3_prev,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&R3,ImSize*sizeof(float)) );
-    
-            checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
-            cudaMemset(P1, 0, ImSize*sizeof(float));
-            cudaMemset(P2, 0, ImSize*sizeof(float));
-            cudaMemset(P3, 0, ImSize*sizeof(float));
-            cudaMemset(P1_prev, 0, ImSize*sizeof(float));
-            cudaMemset(P2_prev, 0, ImSize*sizeof(float));
-            cudaMemset(P3_prev, 0, ImSize*sizeof(float));
-            cudaMemset(R1, 0, ImSize*sizeof(float));
-            cudaMemset(R2, 0, ImSize*sizeof(float));
-            cudaMemset(R3, 0, ImSize*sizeof(float));
-            /********************** Run CUDA 3D kernel here ********************/    
-            multip = (1.0f/(8.0f*lambdaPar));
-    
-            /* The main kernel */
-        for (i = 0; i < iter; i++) {
-        
-            /* computing the gradient of the objective function */
-            Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambdaPar);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            if (nonneg != 0) {
-            nonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() ); }
-            
-            /*Taking a step towards minus of the gradient*/
-            Grad_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, multip);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            /* projection step */
-            if (methodTV == 0) Proj_func3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */
-            else Proj_func3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-            multip2 = ((tk-1.0f)/tkp1);
-        
-            Rupd_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );           
-        
-            copy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            copy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );   
-            
-            copy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );      
- 
-            tk = tkp1;
-        }
-        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
-            /***************************************************************/    
-            //copy result matrix from device to host memory
-            cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
-    
-            cudaFree(d_input);
-            cudaFree(d_update);            
-            cudaFree(P1);
-            cudaFree(P2);
-            cudaFree(P3);
-            cudaFree(P1_prev);
-            cudaFree(P2_prev);
-            cudaFree(P3_prev);
-            cudaFree(R1);
-            cudaFree(R2);        
-            cudaFree(R3);        
-    } 
-    cudaDeviceReset(); 
-}
diff --git a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
deleted file mode 100755
index 107d243..0000000
--- a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <memory.h>
-
-#ifndef _TV_FGP_GPU_
-#define _TV_FGP_GPU_
-
-extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); 
-
-#endif 
diff --git a/Core/regularizers_GPU/TV_FGP_GPU_core.cu b/Core/regularizers_GPU/TV_FGP_GPU_core.cu
new file mode 100755
index 0000000..61097fb
--- /dev/null
+++ b/Core/regularizers_GPU/TV_FGP_GPU_core.cu
@@ -0,0 +1,561 @@
+ /*
+This work is part of the Core Imaging Library developed by
+Visual Analytics and Imaging System Group of the Science Technology
+Facilities Council, STFC
+
+Copyright 2017 Daniil Kazantsev
+Copyright 2017 Srikanth Nagella, Edoardo Pasca
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ 
+
+#include "TV_FGP_GPU_core.h"
+#include <thrust/device_vector.h>
+#include <thrust/transform_reduce.h>
+
+/* CUDA implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume 
+ * 2. lambdaPar - regularization parameter 
+ * 3. Number of iterations
+ * 4. eplsilon: tolerance constant 
+ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
+ * 6. nonneg: 'nonnegativity (0 is OFF by default) 
+ * 7. print information: 0 (off) or 1 (on) 
+ *
+ * Output:
+ * [1] Filtered/regularized image
+ *
+ * This function is based on the Matlab's code and paper by
+ * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
+ */
+
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)
+
+inline void __checkCudaErrors(cudaError err, const char *file, const int line)
+{
+    if (cudaSuccess != err)
+    {
+        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
+                file, line, (int)err, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define BLKXSIZE2D 16
+#define BLKYSIZE2D 16
+
+#define BLKXSIZE 8
+#define BLKYSIZE 8
+#define BLKZSIZE 8
+
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+struct square { __host__ __device__ float operator()(float x) { return x * x; } };
+
+/************************************************/
+/*****************2D modules*********************/
+/************************************************/
+__global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda)
+{
+    
+    float val1,val2;
+    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex; 
+    
+    if ((xIndex < N) && (yIndex < M)) {        
+        if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];}
+        if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];}
+        //Write final result to global memory
+        D[index] = Ad[index] - lambda*(R1[index] + R2[index] - val1 - val2);
+    }
+    return;
+}
+
+__global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, int N, int M, int ImSize, float multip)
+{
+    
+    float val1,val2;
+    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) {        
+        
+        /* boundary conditions */
+        if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex];
+        if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)];
+        
+        //Write final result to global memory
+        P1[index] = R1[index] + multip*val1;
+        P2[index] = R2[index] + multip*val2;
+    }
+    return;
+}
+
+__global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize)
+{
+    
+    float denom;    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) { 
+        denom = pow(P1[index],2) +  pow(P2[index],2);        
+        if (denom > 1.0f) {
+            P1[index] = P1[index]/sqrt(denom);
+            P2[index] = P2[index]/sqrt(denom);
+        }
+    }
+    return;
+}
+__global__ void Proj_func2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize)
+{
+    
+    float val1, val2;    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) { 
+                val1 = abs(P1[index]);
+                val2 = abs(P2[index]);
+                if (val1 < 1.0f) {val1 = 1.0f;}
+                if (val2 < 1.0f) {val2 = 1.0f;}
+                P1[index] = P1[index]/val1;
+                P2[index] = P2[index]/val2;
+    }
+    return;
+}
+__global__ void Rupd_func2D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, float multip2, int N, int M, int ImSize)
+{
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) { 
+        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
+        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
+    }
+    return;
+}
+__global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        if (Output[index] < 0.0f) Output[index] = 0.0f;
+    }
+}
+__global__ void copy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        Output[index] = Input[index];
+    }
+}
+__global__ void ResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        Output[index] = Input1[index] - Input2[index];
+    }
+}   
+/************************************************/
+/*****************3D modules*********************/
+/************************************************/
+__global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda)
+{
+    
+    float val1,val2,val3;
+    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k < Z)) {      
+        if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];}
+        if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];}
+        if (k <= 0) {val3 = 0.0f;} else {val3 = R3[(N*M)*(k-1) + i + N*j];}
+        //Write final result to global memory
+        D[index] = Ad[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3);
+    }
+    return;
+}
+
+__global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float multip)
+{
+    
+    float val1,val2,val3;
+    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) {       
+        /* boundary conditions */
+        if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j];
+        if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)];
+        if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j];
+        
+        //Write final result to global memory
+        P1[index] = R1[index] + multip*val1;
+        P2[index] = R2[index] + multip*val2;
+        P3[index] = R3[index] + multip*val3;
+    }
+    return;
+}
+
+__global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
+{
+    
+    float denom,sq_denom;    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) {
+        denom = pow(P1[index],2) +  pow(P2[index],2) + pow(P3[index],2);
+        
+        if (denom > 1.0f) {
+            sq_denom = 1.0f/sqrt(denom);
+            P1[index] = P1[index]*sq_denom;
+            P2[index] = P2[index]*sq_denom;
+            P3[index] = P3[index]*sq_denom;
+        }
+    }
+    return;
+}
+
+__global__ void Proj_func3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
+{
+    
+    float val1, val2, val3;    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) {
+                val1 = abs(P1[index]);
+                val2 = abs(P2[index]);
+                val3 = abs(P3[index]);
+                if (val1 < 1.0f) {val1 = 1.0f;}
+                if (val2 < 1.0f) {val2 = 1.0f;}
+                if (val3 < 1.0f) {val3 = 1.0f;}
+                P1[index] = P1[index]/val1;
+                P2[index] = P2[index]/val2;
+                P3[index] = P3[index]/val3;
+    }
+    return;
+}
+
+
+__global__ void Rupd_func3D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, float multip2, int N, int M, int Z, int ImSize)
+{
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) { 
+        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
+        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
+        R3[index] = P3[index] + multip2*(P3[index] - P3_old[index]);
+    }
+    return;
+}
+
+__global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_total)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if (index < num_total)	{
+        if (Output[index] < 0.0f) Output[index] = 0.0f;
+    }
+}
+
+__global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total)
+{
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if (index < num_total)	{
+        Output[index] = Input[index];
+    }
+}
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
+
+////////////MAIN HOST FUNCTION ///////////////
+extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+{
+    int deviceCount = -1; // number of devices
+    cudaGetDeviceCount(&deviceCount);
+    if (deviceCount == 0) {
+        fprintf(stderr, "No CUDA devices found\n");
+        return;
+    }
+    
+    int count = 0, i;
+    float re, multip,multip2;    
+	float tk = 1.0f;
+    float tkp1=1.0f;
+        
+    if (dimZ <= 1) {
+		/*2D verson*/
+		int ImSize = dimX*dimY;    
+		float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
+   
+		dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+		dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
+    
+		/*allocate space for images on device*/
+		checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
+		if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
+    
+        checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+        cudaMemset(P1, 0, ImSize*sizeof(float));
+        cudaMemset(P2, 0, ImSize*sizeof(float));
+        cudaMemset(P1_prev, 0, ImSize*sizeof(float));
+        cudaMemset(P2_prev, 0, ImSize*sizeof(float));
+        cudaMemset(R1, 0, ImSize*sizeof(float));
+        cudaMemset(R2, 0, ImSize*sizeof(float));
+
+        /********************** Run CUDA 2D kernel here ********************/    
+        multip = (1.0f/(8.0f*lambdaPar));
+    
+        /* The main kernel */
+        for (i = 0; i < iter; i++) {
+        
+            /* computing the gradient of the objective function */
+            Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambdaPar);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+            
+            if (nonneg != 0) {
+            nonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() ); }
+                    
+            /*Taking a step towards minus of the gradient*/
+            Grad_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, d_update, R1, R2, dimX, dimY, ImSize, multip);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            /* projection step */
+            if (methodTV == 0) Proj_func2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/
+            else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/            
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            multip2 = ((tk-1.0f)/tkp1);
+        
+            Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            if (epsil != 0.0f) {
+                /* calculate norm - stopping rules using the Thrust library */					
+                ResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors(cudaPeekAtLastError() );               
+                
+                thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize);  		
+                float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));		
+                thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);  		
+                float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+                    
+                re = (reduction/reduction2);      
+                if (re < epsil)  count++;
+                    if (count > 4) break;       
+             
+                copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors(cudaPeekAtLastError() );                                              
+            }                  
+        
+            copy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            copy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );       
+ 
+            tk = tkp1;
+        }
+        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
+            /***************************************************************/    
+            //copy result matrix from device to host memory
+            cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
+    
+            cudaFree(d_input);
+            cudaFree(d_update);
+            if (epsil != 0.0f) cudaFree(d_update_prev);
+            cudaFree(P1);
+            cudaFree(P2);
+            cudaFree(P1_prev);
+            cudaFree(P2_prev);
+            cudaFree(R1);
+            cudaFree(R2);
+    }
+    else {
+            /*3D verson*/
+            int ImSize = dimX*dimY*dimZ;    
+            float *d_input, *d_update=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL;
+   
+            dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
+            dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE));
+    
+            /*allocate space for images on device*/
+            checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );            
+            checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P3_prev,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&R3,ImSize*sizeof(float)) );
+    
+            checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+            cudaMemset(P1, 0, ImSize*sizeof(float));
+            cudaMemset(P2, 0, ImSize*sizeof(float));
+            cudaMemset(P3, 0, ImSize*sizeof(float));
+            cudaMemset(P1_prev, 0, ImSize*sizeof(float));
+            cudaMemset(P2_prev, 0, ImSize*sizeof(float));
+            cudaMemset(P3_prev, 0, ImSize*sizeof(float));
+            cudaMemset(R1, 0, ImSize*sizeof(float));
+            cudaMemset(R2, 0, ImSize*sizeof(float));
+            cudaMemset(R3, 0, ImSize*sizeof(float));
+            /********************** Run CUDA 3D kernel here ********************/    
+            multip = (1.0f/(8.0f*lambdaPar));
+    
+            /* The main kernel */
+        for (i = 0; i < iter; i++) {
+        
+            /* computing the gradient of the objective function */
+            Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambdaPar);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            if (nonneg != 0) {
+            nonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() ); }
+            
+            /*Taking a step towards minus of the gradient*/
+            Grad_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, multip);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            /* projection step */
+            if (methodTV == 0) Proj_func3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */
+            else Proj_func3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            multip2 = ((tk-1.0f)/tkp1);
+        
+            Rupd_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );           
+        
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );   
+            
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );      
+ 
+            tk = tkp1;
+        }
+        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
+            /***************************************************************/    
+            //copy result matrix from device to host memory
+            cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
+    
+            cudaFree(d_input);
+            cudaFree(d_update);            
+            cudaFree(P1);
+            cudaFree(P2);
+            cudaFree(P3);
+            cudaFree(P1_prev);
+            cudaFree(P2_prev);
+            cudaFree(P3_prev);
+            cudaFree(R1);
+            cudaFree(R2);        
+            cudaFree(R3);        
+    } 
+    cudaDeviceReset(); 
+}
diff --git a/Core/regularizers_GPU/TV_FGP_GPU_core.h b/Core/regularizers_GPU/TV_FGP_GPU_core.h
new file mode 100755
index 0000000..107d243
--- /dev/null
+++ b/Core/regularizers_GPU/TV_FGP_GPU_core.h
@@ -0,0 +1,10 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+#ifndef _TV_FGP_GPU_
+#define _TV_FGP_GPU_
+
+extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); 
+
+#endif 
diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu
deleted file mode 100755
index a30a89b..0000000
--- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu
+++ /dev/null
@@ -1,368 +0,0 @@
- /*
-This work is part of the Core Imaging Library developed by
-Visual Analytics and Imaging System Group of the Science Technology
-Facilities Council, STFC
-
-Copyright 2017 Daniil Kazantsev
-Copyright 2017 Srikanth Nagella, Edoardo Pasca
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ 
-
-#include "TV_ROF_GPU_core.h"
-
-/* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case)
-*
-* Input Parameters:
-* 1. Noisy image/volume [REQUIRED]
-* 2. lambda - regularization parameter [REQUIRED]
-* 3. tau - marching step for explicit scheme, ~0.1 is recommended [REQUIRED]
-* 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED]
-*
-* Output:
-* [1] Regularized image/volume
-
- * This function is based on the paper by
-* [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms"
-*
-* D. Kazantsev, 2016-18
-*/
-
-#define CHECK(call)                                                            \
-{                                                                              \
-    const cudaError_t error = call;                                            \
-    if (error != cudaSuccess)                                                  \
-    {                                                                          \
-        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
-        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
-                cudaGetErrorString(error));                                    \
-        exit(1);                                                               \
-    }                                                                          \
-}
-    
-#define BLKXSIZE 8
-#define BLKYSIZE 8
-#define BLKZSIZE 8
-    
-#define BLKXSIZE2D 16
-#define BLKYSIZE2D 16
-#define EPS 1.0e-5
-    
-#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
-
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-
-__host__ __device__ int sign (float x)
-{
-        return (x > 0) - (x < 0);
-}        
-   
-/*********************2D case****************************/    
-    
-    /* differences 1 */
-    __global__ void D1_func2D(float* Input, float* D1, int N, int M)      
-    {
-		int i1, j1, i2;
-		float NOMx_1,NOMy_1,NOMy_0,denom1,denom2,T1;
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        
-        int index = i + N*j;        
-        
-        if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) {
-            
-            /* boundary conditions (Neumann reflections) */
-                i1 = i + 1; if (i1 >= N) i1 = i-1;
-                i2 = i - 1; if (i2 < 0) i2 = i+1;
-                j1 = j + 1; if (j1 >= M) j1 = j-1;
-		
-		     /* Forward-backward differences */
-                NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */
-                NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */                
-                NOMy_0 = Input[index] - Input[j*N + i2]; /* y- */
-                
-                denom1 = NOMx_1*NOMx_1;
-                denom2 = 0.5f*(sign((float)NOMy_1) + sign((float)NOMy_0))*(MIN(abs((float)NOMy_1),abs((float)NOMy_0)));
-                denom2 = denom2*denom2;
-                T1 = sqrt(denom1 + denom2 + EPS);
-                D1[index] = NOMx_1/T1;	
-		}		
-	}       
-    
-    /* differences 2 */
-    __global__ void D2_func2D(float* Input, float* D2, int N, int M)      
-    {
-		int i1, j1, j2;
-		float NOMx_1,NOMy_1,NOMx_0,denom1,denom2,T2;
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        
-        int index = i + N*j;        
-        
-        if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) {
-            
-            /* boundary conditions (Neumann reflections) */
-                i1 = i + 1; if (i1 >= N) i1 = i-1;
-                j1 = j + 1; if (j1 >= M) j1 = j-1;
-                j2 = j - 1; if (j2 < 0) j2 = j+1; 
-		
-                /* Forward-backward differences */
-                NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */
-                NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */
-                NOMx_0 = Input[index] - Input[j2*N + i]; /* x- */
-                
-                denom1 = NOMy_1*NOMy_1;
-                denom2 = 0.5f*(sign((float)NOMx_1) + sign((float)NOMx_0))*(MIN(abs((float)NOMx_1),abs((float)NOMx_0)));
-                denom2 = denom2*denom2;
-                T2 = sqrt(denom1 + denom2 + EPS);
-                D2[index] = NOMy_1/T2;	
-		}		
-	}
-    
-    __global__ void TV_kernel2D(float *D1, float *D2, float *Update, float *Input, float lambda, float tau, int N, int M)    
-    {
-		int i2, j2;
-		float dv1,dv2;
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        
-        int index = i + N*j;        
-        
-        if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) {
-            
-				/* boundary conditions (Neumann reflections) */                
-                i2 = i - 1; if (i2 < 0) i2 = i+1;                
-                j2 = j - 1; if (j2 < 0) j2 = j+1; 
-                
-				/* divergence components  */
-                dv1 = D1[index] - D1[j2*N + i];
-                dv2 = D2[index] - D2[j*N + i2];                                
-                
-                Update[index] =  Update[index] + tau*(2.0f*lambda*(dv1 + dv2) - (Update[index] - Input[index]));      
-		
-		}  
-	}   
-/*********************3D case****************************/    
- 
-    /* differences 1 */
-    __global__ void D1_func3D(float* Input, float* D1, int dimX, int dimY, int dimZ)      
-    {
-		float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1;
-		int i1,i2,k1,j1,j2,k2;
-		
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;
-        
-      	int index = (dimX*dimY)*k + j*dimX+i;     
-        
-        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
-            
-                    /* symmetric boundary conditions (Neuman) */
-                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
-                    i2 = i - 1; if (i2 < 0) i2 = i+1;
-                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
-                    j2 = j - 1; if (j2 < 0) j2 = j+1;
-                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
-                    k2 = k - 1; if (k2 < 0) k2 = k+1;                    
-                    
-                    /* Forward-backward differences */
-                    NOMx_1 = Input[(dimX*dimY)*k + j1*dimX + i] - Input[index]; /* x+ */
-                    NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */                    
-                    NOMy_0 = Input[index] - Input[(dimX*dimY)*k + j*dimX + i2]; /* y- */
-                    
-                    NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
-                    NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + j*dimX + i]; /* z- */
-                    
-                    
-                    denom1 = NOMx_1*NOMx_1;
-                    denom2 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0)));
-                    denom2 = denom2*denom2;
-                    denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0)));
-                    denom3 = denom3*denom3;
-                    T1 = sqrt(denom1 + denom2 + denom3 + EPS);
-                    D1[index] = NOMx_1/T1;	
-		}		
-	}      
-
-    /* differences 2 */
-    __global__ void D2_func3D(float* Input, float* D2, int dimX, int dimY, int dimZ)      
-    {
-		float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2;
-		int i1,i2,k1,j1,j2,k2;
-		
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;
-        
-      	int index = (dimX*dimY)*k + j*dimX+i;     
-        
-        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
-                    /* symmetric boundary conditions (Neuman) */
-                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
-                    i2 = i - 1; if (i2 < 0) i2 = i+1;
-                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
-                    j2 = j - 1; if (j2 < 0) j2 = j+1;
-                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
-                    k2 = k - 1; if (k2 < 0) k2 = k+1;
-                    
-                    
-                    /* Forward-backward differences */
-                    NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */
-                    NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */
-                    NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */
-                    NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
-                    NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */
-                    
-                    
-                    denom1 = NOMy_1*NOMy_1;
-                    denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0)));
-                    denom2 = denom2*denom2;
-                    denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0)));
-                    denom3 = denom3*denom3;
-                    T2 = sqrt(denom1 + denom2 + denom3 + EPS);
-                    D2[index] = NOMy_1/T2;
-		}
-	}
-	
-	  /* differences 3 */
-    __global__ void D3_func3D(float* Input, float* D3, int dimX, int dimY, int dimZ)      
-    {
-		float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3;
-		int i1,i2,k1,j1,j2,k2;
-		
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;
-        
-      	int index = (dimX*dimY)*k + j*dimX+i;     
-        
-        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
-
-				i1 = i + 1; if (i1 >= dimX) i1 = i-1;
-                i2 = i - 1; if (i2 < 0) i2 = i+1;
-                j1 = j + 1; if (j1 >= dimY) j1 = j-1;
-                j2 = j - 1; if (j2 < 0) j2 = j+1;
-                k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
-                k2 = k - 1; if (k2 < 0) k2 = k+1;
-                
-                /* Forward-backward differences */
-                NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */
-                NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */
-                NOMy_0 = Input[index] - Input[(dimX*dimY)*k + (j)*dimX + i2]; /* y- */
-                NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */
-                NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
-               
-                denom1 = NOMz_1*NOMz_1;
-                denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0)));
-                denom2 = denom2*denom2;
-                denom3 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0)));
-                denom3 = denom3*denom3;
-                T3 = sqrt(denom1 + denom2 + denom3 + EPS);
-                D3[index] = NOMz_1/T3;		
-		}
-	}
-
-    __global__ void TV_kernel3D(float *D1, float *D2, float *D3, float *Update, float *Input, float lambda, float tau, int dimX, int dimY, int dimZ)    
-    {
-		float dv1, dv2, dv3;
-		int i1,i2,k1,j1,j2,k2;
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;
-        
-        int index = (dimX*dimY)*k + j*dimX+i;       
-        
-        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
-            
-					/* symmetric boundary conditions (Neuman) */
-                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
-                    i2 = i - 1; if (i2 < 0) i2 = i+1;
-                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
-                    j2 = j - 1; if (j2 < 0) j2 = j+1;
-                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
-                    k2 = k - 1; if (k2 < 0) k2 = k+1;
-                    
-                    /*divergence components */
-                    dv1 = D1[index] - D1[(dimX*dimY)*k + j2*dimX+i];
-                    dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2];
-                    dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i];
-                    
-                    Update[index] = Update[index] + tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index]));
-		
-		}  
-	}
-
-/////////////////////////////////////////////////
-// HOST FUNCTION
-extern "C" void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z)
-{
-	    // set up device
-		int dev = 0;
-		CHECK(cudaSetDevice(dev));
-        float *d_input, *d_update, *d_D1, *d_D2;
-        
-	if (Z == 0) Z = 1;
-        CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float)));
-        CHECK(cudaMalloc((void**)&d_update,N*M*Z*sizeof(float)));
-        CHECK(cudaMalloc((void**)&d_D1,N*M*Z*sizeof(float)));
-        CHECK(cudaMalloc((void**)&d_D2,N*M*Z*sizeof(float)));
-        
-        CHECK(cudaMemcpy(d_input,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));
-        CHECK(cudaMemcpy(d_update,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));      
-        
-        if (Z > 1) {
-			// TV - 3D case                 
-            dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
-            dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE));            
-            
-            float *d_D3;
-            CHECK(cudaMalloc((void**)&d_D3,N*M*Z*sizeof(float)));
-            
-            for(int n=0; n < iter; n++) {
-                /* calculate differences */
-                D1_func3D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M, Z);
-                CHECK(cudaDeviceSynchronize());
-				D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z);
-                CHECK(cudaDeviceSynchronize());        
-                D3_func3D<<<dimGrid,dimBlock>>>(d_update, d_D3, N, M, Z);
-                CHECK(cudaDeviceSynchronize());        
-                /*running main kernel*/
-                TV_kernel3D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_D3, d_update, d_input, lambdaPar, tau, N, M, Z);
-                CHECK(cudaDeviceSynchronize());
-            }
-            
-            CHECK(cudaFree(d_D3));         
-        }
-        else {
-	    // TV - 2D case
-            dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
-            dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D));
-             
-            for(int n=0; n < iter; n++) {
-                /* calculate differences */
-                D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M);
-                CHECK(cudaDeviceSynchronize());
-				D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M);
-                CHECK(cudaDeviceSynchronize());        
-                /*running main kernel*/
-                TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambdaPar, tau, N, M);
-                CHECK(cudaDeviceSynchronize());
-            }
-        }        
-        CHECK(cudaMemcpy(Output,d_update,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost));
-        CHECK(cudaFree(d_input));
-        CHECK(cudaFree(d_update));
-        CHECK(cudaFree(d_D1));
-        CHECK(cudaFree(d_D2));        
-        cudaDeviceReset(); 
-}
diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h
deleted file mode 100755
index d772aba..0000000
--- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __TVGPU_H__
-#define __TVGPU_H__
-#include "CCPiDefines.h"
-#include <stdio.h>
-
-extern "C" CCPI_EXPORT void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
-
-#endif 
diff --git a/Core/regularizers_GPU/TV_ROF_GPU_core.cu b/Core/regularizers_GPU/TV_ROF_GPU_core.cu
new file mode 100755
index 0000000..1a54d02
--- /dev/null
+++ b/Core/regularizers_GPU/TV_ROF_GPU_core.cu
@@ -0,0 +1,368 @@
+ /*
+This work is part of the Core Imaging Library developed by
+Visual Analytics and Imaging System Group of the Science Technology
+Facilities Council, STFC
+
+Copyright 2017 Daniil Kazantsev
+Copyright 2017 Srikanth Nagella, Edoardo Pasca
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ 
+
+#include "TV_ROF_GPU_core.h"
+
+/* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case)
+*
+* Input Parameters:
+* 1. Noisy image/volume [REQUIRED]
+* 2. lambda - regularization parameter [REQUIRED]
+* 3. tau - marching step for explicit scheme, ~0.1 is recommended [REQUIRED]
+* 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED]
+*
+* Output:
+* [1] Regularized image/volume
+
+ * This function is based on the paper by
+* [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms"
+*
+* D. Kazantsev, 2016-18
+*/
+
+#define CHECK(call)                                                            \
+{                                                                              \
+    const cudaError_t error = call;                                            \
+    if (error != cudaSuccess)                                                  \
+    {                                                                          \
+        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
+        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
+                cudaGetErrorString(error));                                    \
+        exit(1);                                                               \
+    }                                                                          \
+}
+    
+#define BLKXSIZE 8
+#define BLKYSIZE 8
+#define BLKZSIZE 8
+    
+#define BLKXSIZE2D 16
+#define BLKYSIZE2D 16
+#define EPS 1.0e-5
+    
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+__host__ __device__ int sign (float x)
+{
+        return (x > 0) - (x < 0);
+}        
+   
+/*********************2D case****************************/    
+    
+    /* differences 1 */
+    __global__ void D1_func2D(float* Input, float* D1, int N, int M)      
+    {
+		int i1, j1, i2;
+		float NOMx_1,NOMy_1,NOMy_0,denom1,denom2,T1;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        
+        int index = i + N*j;        
+        
+        if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) {
+            
+            /* boundary conditions (Neumann reflections) */
+                i1 = i + 1; if (i1 >= N) i1 = i-1;
+                i2 = i - 1; if (i2 < 0) i2 = i+1;
+                j1 = j + 1; if (j1 >= M) j1 = j-1;
+		
+		     /* Forward-backward differences */
+                NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */
+                NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */                
+                NOMy_0 = Input[index] - Input[j*N + i2]; /* y- */
+                
+                denom1 = NOMx_1*NOMx_1;
+                denom2 = 0.5f*(sign((float)NOMy_1) + sign((float)NOMy_0))*(MIN(abs((float)NOMy_1),abs((float)NOMy_0)));
+                denom2 = denom2*denom2;
+                T1 = sqrt(denom1 + denom2 + EPS);
+                D1[index] = NOMx_1/T1;	
+		}		
+	}       
+    
+    /* differences 2 */
+    __global__ void D2_func2D(float* Input, float* D2, int N, int M)      
+    {
+		int i1, j1, j2;
+		float NOMx_1,NOMy_1,NOMx_0,denom1,denom2,T2;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        
+        int index = i + N*j;        
+        
+        if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) {
+            
+            /* boundary conditions (Neumann reflections) */
+                i1 = i + 1; if (i1 >= N) i1 = i-1;
+                j1 = j + 1; if (j1 >= M) j1 = j-1;
+                j2 = j - 1; if (j2 < 0) j2 = j+1; 
+		
+                /* Forward-backward differences */
+                NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */
+                NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */
+                NOMx_0 = Input[index] - Input[j2*N + i]; /* x- */
+                
+                denom1 = NOMy_1*NOMy_1;
+                denom2 = 0.5f*(sign((float)NOMx_1) + sign((float)NOMx_0))*(MIN(abs((float)NOMx_1),abs((float)NOMx_0)));
+                denom2 = denom2*denom2;
+                T2 = sqrt(denom1 + denom2 + EPS);
+                D2[index] = NOMy_1/T2;	
+		}		
+	}
+    
+    __global__ void TV_kernel2D(float *D1, float *D2, float *Update, float *Input, float lambda, float tau, int N, int M)    
+    {
+		int i2, j2;
+		float dv1,dv2;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        
+        int index = i + N*j;        
+        
+        if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) {
+            
+				/* boundary conditions (Neumann reflections) */                
+                i2 = i - 1; if (i2 < 0) i2 = i+1;                
+                j2 = j - 1; if (j2 < 0) j2 = j+1; 
+                
+				/* divergence components  */
+                dv1 = D1[index] - D1[j2*N + i];
+                dv2 = D2[index] - D2[j*N + i2];                                
+                
+                Update[index] =  Update[index] + tau*(lambda*(dv1 + dv2) - (Update[index] - Input[index]));      
+		
+		}  
+	}   
+/*********************3D case****************************/    
+ 
+    /* differences 1 */
+    __global__ void D1_func3D(float* Input, float* D1, int dimX, int dimY, int dimZ)      
+    {
+		float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1;
+		int i1,i2,k1,j1,j2,k2;
+		
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+      	int index = (dimX*dimY)*k + j*dimX+i;     
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
+            
+                    /* symmetric boundary conditions (Neuman) */
+                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
+                    i2 = i - 1; if (i2 < 0) i2 = i+1;
+                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
+                    j2 = j - 1; if (j2 < 0) j2 = j+1;
+                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
+                    k2 = k - 1; if (k2 < 0) k2 = k+1;                    
+                    
+                    /* Forward-backward differences */
+                    NOMx_1 = Input[(dimX*dimY)*k + j1*dimX + i] - Input[index]; /* x+ */
+                    NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */                    
+                    NOMy_0 = Input[index] - Input[(dimX*dimY)*k + j*dimX + i2]; /* y- */
+                    
+                    NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
+                    NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + j*dimX + i]; /* z- */
+                    
+                    
+                    denom1 = NOMx_1*NOMx_1;
+                    denom2 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0)));
+                    denom2 = denom2*denom2;
+                    denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0)));
+                    denom3 = denom3*denom3;
+                    T1 = sqrt(denom1 + denom2 + denom3 + EPS);
+                    D1[index] = NOMx_1/T1;	
+		}		
+	}      
+
+    /* differences 2 */
+    __global__ void D2_func3D(float* Input, float* D2, int dimX, int dimY, int dimZ)      
+    {
+		float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2;
+		int i1,i2,k1,j1,j2,k2;
+		
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+      	int index = (dimX*dimY)*k + j*dimX+i;     
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
+                    /* symmetric boundary conditions (Neuman) */
+                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
+                    i2 = i - 1; if (i2 < 0) i2 = i+1;
+                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
+                    j2 = j - 1; if (j2 < 0) j2 = j+1;
+                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
+                    k2 = k - 1; if (k2 < 0) k2 = k+1;
+                    
+                    
+                    /* Forward-backward differences */
+                    NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */
+                    NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */
+                    NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */
+                    NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
+                    NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */
+                    
+                    
+                    denom1 = NOMy_1*NOMy_1;
+                    denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0)));
+                    denom2 = denom2*denom2;
+                    denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0)));
+                    denom3 = denom3*denom3;
+                    T2 = sqrt(denom1 + denom2 + denom3 + EPS);
+                    D2[index] = NOMy_1/T2;
+		}
+	}
+	
+	  /* differences 3 */
+    __global__ void D3_func3D(float* Input, float* D3, int dimX, int dimY, int dimZ)      
+    {
+		float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3;
+		int i1,i2,k1,j1,j2,k2;
+		
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+      	int index = (dimX*dimY)*k + j*dimX+i;     
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
+
+				i1 = i + 1; if (i1 >= dimX) i1 = i-1;
+                i2 = i - 1; if (i2 < 0) i2 = i+1;
+                j1 = j + 1; if (j1 >= dimY) j1 = j-1;
+                j2 = j - 1; if (j2 < 0) j2 = j+1;
+                k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
+                k2 = k - 1; if (k2 < 0) k2 = k+1;
+                
+                /* Forward-backward differences */
+                NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */
+                NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */
+                NOMy_0 = Input[index] - Input[(dimX*dimY)*k + (j)*dimX + i2]; /* y- */
+                NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */
+                NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
+               
+                denom1 = NOMz_1*NOMz_1;
+                denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0)));
+                denom2 = denom2*denom2;
+                denom3 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0)));
+                denom3 = denom3*denom3;
+                T3 = sqrt(denom1 + denom2 + denom3 + EPS);
+                D3[index] = NOMz_1/T3;		
+		}
+	}
+
+    __global__ void TV_kernel3D(float *D1, float *D2, float *D3, float *Update, float *Input, float lambda, float tau, int dimX, int dimY, int dimZ)    
+    {
+		float dv1, dv2, dv3;
+		int i1,i2,k1,j1,j2,k2;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+        int index = (dimX*dimY)*k + j*dimX+i;       
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
+            
+					/* symmetric boundary conditions (Neuman) */
+                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
+                    i2 = i - 1; if (i2 < 0) i2 = i+1;
+                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
+                    j2 = j - 1; if (j2 < 0) j2 = j+1;
+                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
+                    k2 = k - 1; if (k2 < 0) k2 = k+1;
+                    
+                    /*divergence components */
+                    dv1 = D1[index] - D1[(dimX*dimY)*k + j2*dimX+i];
+                    dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2];
+                    dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i];
+                    
+                    Update[index] = Update[index] + tau*(lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index]));
+		
+		}  
+	}
+
+/////////////////////////////////////////////////
+// HOST FUNCTION
+extern "C" void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z)
+{
+	    // set up device
+		int dev = 0;
+		CHECK(cudaSetDevice(dev));
+        float *d_input, *d_update, *d_D1, *d_D2;
+        
+	if (Z == 0) Z = 1;
+        CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float)));
+        CHECK(cudaMalloc((void**)&d_update,N*M*Z*sizeof(float)));
+        CHECK(cudaMalloc((void**)&d_D1,N*M*Z*sizeof(float)));
+        CHECK(cudaMalloc((void**)&d_D2,N*M*Z*sizeof(float)));
+        
+        CHECK(cudaMemcpy(d_input,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));
+        CHECK(cudaMemcpy(d_update,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));      
+        
+        if (Z > 1) {
+			// TV - 3D case                 
+            dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
+            dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE));            
+            
+            float *d_D3;
+            CHECK(cudaMalloc((void**)&d_D3,N*M*Z*sizeof(float)));
+            
+            for(int n=0; n < iter; n++) {
+                /* calculate differences */
+                D1_func3D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M, Z);
+                CHECK(cudaDeviceSynchronize());
+				D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z);
+                CHECK(cudaDeviceSynchronize());        
+                D3_func3D<<<dimGrid,dimBlock>>>(d_update, d_D3, N, M, Z);
+                CHECK(cudaDeviceSynchronize());        
+                /*running main kernel*/
+                TV_kernel3D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_D3, d_update, d_input, lambdaPar, tau, N, M, Z);
+                CHECK(cudaDeviceSynchronize());
+            }
+            
+            CHECK(cudaFree(d_D3));         
+        }
+        else {
+	    // TV - 2D case
+            dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+            dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D));
+             
+            for(int n=0; n < iter; n++) {
+                /* calculate differences */
+                D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M);
+                CHECK(cudaDeviceSynchronize());
+				D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M);
+                CHECK(cudaDeviceSynchronize());        
+                /*running main kernel*/
+                TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambdaPar, tau, N, M);
+                CHECK(cudaDeviceSynchronize());
+            }
+        }        
+        CHECK(cudaMemcpy(Output,d_update,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost));
+        CHECK(cudaFree(d_input));
+        CHECK(cudaFree(d_update));
+        CHECK(cudaFree(d_D1));
+        CHECK(cudaFree(d_D2));        
+        cudaDeviceReset(); 
+}
diff --git a/Core/regularizers_GPU/TV_ROF_GPU_core.h b/Core/regularizers_GPU/TV_ROF_GPU_core.h
new file mode 100755
index 0000000..d772aba
--- /dev/null
+++ b/Core/regularizers_GPU/TV_ROF_GPU_core.h
@@ -0,0 +1,8 @@
+#ifndef __TVGPU_H__
+#define __TVGPU_H__
+#include "CCPiDefines.h"
+#include <stdio.h>
+
+extern "C" CCPI_EXPORT void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
+
+#endif 
diff --git a/Readme.md b/Readme.md
index d91e420..3ec20dc 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,24 +1,27 @@
-# CCPi-Regularisation Toolkit (CCPi-RGL)
+# CCPi-Regularization Toolkit (CCPi-RGL)
 
-**Iterative image reconstruction (IIR) methods normally require regularisation to stabilise convergence and make the reconstruction problem more well-posed. 
-CCPi-RGL software consist of 2D/3D regularisation modules which frequently used for IIR. 
-The core modules are written in C-OMP and CUDA languages and wrappers for Matlab and Python are provided.** 
+**Iterative image reconstruction (IIR) methods normally require regularization to stabilize the convergence and make the reconstruction problem more well-posed. 
+CCPi-RGL software consist of 2D/3D regularization modules for single-channel and multi-channel reconstruction problems. The modules especially suited for IIR, however,
+can also be used as image denoising iterative filters. The core modules are written in C-OMP and CUDA languages and wrappers for Matlab and Python are provided.** 
 
 ## Prerequisites: 
 
- * MATLAB (www.mathworks.com/products/matlab/)
- * Python (ver. 3.5); Cython
+ * MATLAB (www.mathworks.com/products/matlab/) OR
+ * Python (tested ver. 3.5); Cython
  * C compilers
  * nvcc (CUDA SDK) compilers
 
 ## Package modules (regularisers):
 
-1. Rudin-Osher-Fatemi Total Variation (explicit PDE minimisation scheme) [2D/3D GPU/CPU]
-2. Fast-Gradient-Projection Total Variation [2D/3D GPU/CPU]
+### Single-channel
+1. Rudin-Osher-Fatemi (ROF) Total Variation (explicit PDE minimisation scheme) [2D/3D GPU/CPU]; (Ref. 1)
+2. Fast-Gradient-Projection (FGP) Total Variation [2D/3D GPU/CPU]; (Ref. 2)
 
-### Installation:
+### Multi-channel
 
-#### Python (conda-build)
+## Installation:
+
+### Python (conda-build)
 ```
 	export CIL_VERSION=0.9.2
 	conda build recipes/regularizers --numpy 1.12 --python 3.5 
@@ -29,7 +32,12 @@ The core modules are written in C-OMP and CUDA languages and wrappers for Matlab
 	cd test/
 	python test_cpu_vs_gpu_regularizers.py
 ```
-#### Matlab 
+### Matlab
+```
+	cd /Wrappers/Matlab/mex_compile
+	compileCPU_mex.m % to compile CPU modules
+	compileGPU_mex.m % to compile GPU modules (see instructions in the file)
+```
 
 ### References:
 1. Rudin, L.I., Osher, S. and Fatemi, E., 1992. Nonlinear total variation based noise removal algorithms. Physica D: nonlinear phenomena, 60(1-4), pp.259-268.
diff --git a/Wrappers/Matlab/demos/demoMatlab_denoise.m b/Wrappers/Matlab/demos/demoMatlab_denoise.m
new file mode 100644
index 0000000..7258e5e
--- /dev/null
+++ b/Wrappers/Matlab/demos/demoMatlab_denoise.m
@@ -0,0 +1,38 @@
+% Image (2D) denoising demo using CCPi-RGL
+
+addpath('../mex_compile/installed');
+addpath('../../../data/');
+
+Im = double(imread('lena_gray_256.tif'))/255;  % loading image
+u0 = Im + .05*randn(size(Im)); u0(u0 < 0) = 0;
+figure; imshow(u0, [0 1]); title('Noisy image');
+
+%%
+fprintf('Denoise using ROF-TV model (CPU) \n');
+lambda_rof = 0.03; % regularization parameter
+tau_rof = 0.0025; % time-marching constant 
+iter_rof = 2000; % number of ROF iterations
+tic; u_rof = ROF_TV(single(u0), lambda_rof, iter_rof, tau_rof); toc; 
+figure; imshow(u_rof, [0 1]); title('ROF-TV denoised image (CPU)');
+%%
+% fprintf('Denoise using ROF-TV model (GPU) \n');
+% lambda_rof = 0.03; % regularization parameter
+% tau_rof = 0.0025; % time-marching constant 
+% iter_rof = 2000; % number of ROF iterations
+% tic; u_rofG = ROF_TV_GPU(single(u0), lambda_rof, iter_rof, tau_rof); toc;
+% figure; imshow(u_rofG, [0 1]); title('ROF-TV denoised image (GPU)');
+%%
+fprintf('Denoise using FGP-TV model (CPU) \n');
+lambda_fgp = 0.03; % regularization parameter
+iter_fgp = 1000; % number of FGP iterations
+epsil_tol =  1.0e-05; % tolerance
+tic; u_fgp = FGP_TV(single(u0), lambda_fgp, iter_fgp, epsil_tol); toc; 
+figure; imshow(u_fgp, [0 1]); title('FGP-TV denoised image (CPU)');
+%%
+% fprintf('Denoise using FGP-TV model (GPU) \n');
+% lambda_fgp = 0.03; % regularization parameter
+% iter_fgp = 1000; % number of FGP iterations
+% epsil_tol =  1.0e-05; % tolerance
+% tic; u_fgpG = FGP_TV_GPU(single(u0), lambda_fgp, iter_fgp, epsil_tol); toc; 
+% figure; imshow(u_fgpG, [0 1]); title('FGP-TV denoised image (GPU)');
+%%
\ No newline at end of file
diff --git a/Wrappers/Matlab/demos/demoMatlab_denoise.m~ b/Wrappers/Matlab/demos/demoMatlab_denoise.m~
new file mode 100644
index 0000000..3f4403e
--- /dev/null
+++ b/Wrappers/Matlab/demos/demoMatlab_denoise.m~
@@ -0,0 +1,31 @@
+% Image (2D) denoising demo using CCPi-RGL
+
+addpath('../mex_compile/installed');
+addpath('../../../data/');
+
+Im = double(imread('lena_gray_256.tif'))/255;  % loading image
+u0 = Im + .05*randn(size(Im)); u0(u0 < 0) = 0;
+figure; imshow(u0, [0 1]); title('Noisy image');
+
+%%
+fprintf('Denoise using ROF-TV model (CPU) \n');
+lambda_rof = 0.02; % regularization parameter
+tau_rof = 0.0025; % time-marching constant 
+iter_rof = 2000; % number of ROF iterations
+tic; u_rof = ROF_TV(single(u0), lambda_rof, iter_rof, tau_rof); toc; 
+figure; imshow(u_rof, [0 1]); title('ROF-TV denoised image (CPU)');
+%%
+% fprintf('Denoise using ROF-TV model (GPU) \n');
+% lambda_rof = 0.02; % regularization parameter
+% tau_rof = 0.0025; % time-marching constant 
+% iter_rof = 2000; % number of ROF iterations
+% tic; u_rof = ROF_TV_GPU(single(u0), lambda_rof, iter_rof, tau_rof); toc;
+% figure; imshow(u_rof, [0 1]); title('ROF-TV denoised image (GPU)');
+%%
+fprintf('Denoise using FGP-TV model (CPU) \n');
+lambda_fgp = 0.02; % regularization parameter
+iter_fgp = 2000; % number of FGP iterations
+epsil_tol =  1.0e-05; % tolerance
+tic; u_fgp = FGP_TV(single(u0), lambda_fgp, iter_fgp, epsil_tol); toc; 
+figure; imshow(u_rof, [0 1]); title('ROF-TV denoised image (CPU)');
+%%
\ No newline at end of file
diff --git a/Wrappers/Matlab/mex_compile/compileCPU_mex.m b/Wrappers/Matlab/mex_compile/compileCPU_mex.m
new file mode 100644
index 0000000..fcee53a
--- /dev/null
+++ b/Wrappers/Matlab/mex_compile/compileCPU_mex.m
@@ -0,0 +1,19 @@
+% execute this mex file in Matlab once
+copyfile ../../../Core/regularizers_CPU/ regularizers_CPU/
+copyfile ../../../Core/CCPiDefines.h regularizers_CPU/
+
+cd regularizers_CPU/
+
+fprintf('%s \n', 'Compiling CPU regularizers...');
+mex ROF_TV.c ROF_TV_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
+movefile ROF_TV.mex* ../installed/
+
+mex FGP_TV.c FGP_TV_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
+movefile FGP_TV.mex* ../installed/
+
+delete ROF_TV_core* FGP_TV_core* utils.c utils.h CCPiDefines.h
+
+fprintf('%s \n', 'All successfully compiled!');
+
+cd ../../
+cd demos
diff --git a/Wrappers/Matlab/mex_compile/compileGPU_mex.m b/Wrappers/Matlab/mex_compile/compileGPU_mex.m
new file mode 100644
index 0000000..df29a3e
--- /dev/null
+++ b/Wrappers/Matlab/mex_compile/compileGPU_mex.m
@@ -0,0 +1,30 @@
+% execute this mex file in Matlab once
+
+%>>>>>>>>>>>>>>Important<<<<<<<<<<<<<<<<<<<
+% In order to compile CUDA modules one needs to have nvcc-compiler
+% installed (see CUDA SDK)
+% check it under MATLAB with !nvcc --version
+% In the code bellow we provide a full path to nvcc compiler 
+% ! paths to matlab and CUDA sdk can be different, modify accordingly !
+
+% tested on Ubuntu 16.04/MATLAB 2016b
+
+copyfile ../../../Core/regularizers_GPU/ regularizers_GPU/
+copyfile ../../../Core/CCPiDefines.h regularizers_GPU/
+
+cd regularizers_GPU/
+
+fprintf('%s \n', 'Compiling GPU regularizers (CUDA)...');
+!/usr/local/cuda/bin/nvcc -O0 -c TV_ROF_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/
+mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu ROF_TV_GPU.cpp TV_ROF_GPU_core.o
+movefile ROF_TV_GPU.mex* ../installed/
+
+!/usr/local/cuda/bin/nvcc -O0 -c TV_FGP_GPU_core.cu -Xcompiler -fPIC -I~/SOFT/MATLAB9/extern/include/
+mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu FGP_TV_GPU.cpp TV_FGP_GPU_core.o
+movefile FGP_TV_GPU.mex* ../installed/
+
+delete TV_ROF_GPU_core* TV_FGP_GPU_core* CCPiDefines.h
+fprintf('%s \n', 'All successfully compiled!');
+
+cd ../../
+cd demos
diff --git a/Wrappers/Matlab/mex_compile/compile_mex.m b/Wrappers/Matlab/mex_compile/compile_mex.m
deleted file mode 100644
index d45ac04..0000000
--- a/Wrappers/Matlab/mex_compile/compile_mex.m
+++ /dev/null
@@ -1,19 +0,0 @@
-% compile mex's in Matlab once
-copyfile ../../../Core/regularizers_CPU/ regularizers_CPU/
-copyfile ../../../Core/regularizers_GPU/ regularizers_GPU/
-copyfile ../../../Core/CCPiDefines.h regularizers_CPU/
-
-cd regularizers_CPU/
-
-% compile C regularizers
-
-mex ROF_TV.c ROF_TV_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
-mex FGP_TV.c FGP_TV_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
-
-delete ROF_TV_core.c ROF_TV_core.h FGP_TV_core.c FGP_TV_core.h utils.c utils.h CCPiDefines.h
-
-% compile CUDA-based regularizers
-%cd regularizers_GPU/
-
-cd ../../
-cd demos
diff --git a/Wrappers/Matlab/mex_compile/regularizers_CPU/FGP_TV.c b/Wrappers/Matlab/mex_compile/regularizers_CPU/FGP_TV.c
index 7cc861a..ba06cc7 100644
--- a/Wrappers/Matlab/mex_compile/regularizers_CPU/FGP_TV.c
+++ b/Wrappers/Matlab/mex_compile/regularizers_CPU/FGP_TV.c
@@ -1,21 +1,21 @@
 /*
-This work is part of the Core Imaging Library developed by
-Visual Analytics and Imaging System Group of the Science Technology
-Facilities Council, STFC
-
-Copyright 2017 Daniil Kazantsev
-Copyright 2017 Srikanth Nagella, Edoardo Pasca
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 #include "matrix.h"
 #include "mex.h"
 #include "FGP_TV_core.h"
@@ -23,28 +23,19 @@ limitations under the License.
 /* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
  *
  * Input Parameters:
- * 1. Noisy image/volume [REQUIRED]
- * 2. lambda - regularization parameter [REQUIRED]
- * 3. Number of iterations [OPTIONAL parameter]
- * 4. eplsilon: tolerance constant [OPTIONAL parameter]
- * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter]
+ * 1. Noisy image/volume
+ * 2. lambdaPar - regularization parameter
+ * 3. Number of iterations
+ * 4. eplsilon: tolerance constant
+ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
+ * 6. nonneg: 'nonnegativity (0 is OFF by default)
+ * 7. print information: 0 (off) or 1 (on)
  *
  * Output:
  * [1] Filtered/regularized image
- * [2] last function value 
- *
- * Example of image denoising:
- * figure;
- * Im = double(imread('lena_gray_256.tif'))/255;  % loading image
- * u0 = Im + .05*randn(size(Im)); % adding noise
- * u = FGP_TV(single(u0), 0.05, 100, 1e-04);
  *
- * to compile with OMP support: mex FGP_TV.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
  * This function is based on the Matlab's code and paper by
  * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
- *
- * D. Kazantsev, 2016-17
- *
  */
 
 
@@ -53,42 +44,53 @@ void mexFunction(
         int nrhs, const mxArray *prhs[])
         
 {
-    int number_of_dims, iter, dimX, dimY, dimZ, methTV;
+    int number_of_dims, iter, dimX, dimY, dimZ, methTV, printswitch, nonneg;
     const int  *dim_array;
-    float *Input, *Output, lambda, epsil;
+    float *Input, *Output=NULL, lambda, epsil;
     
     number_of_dims = mxGetNumberOfDimensions(prhs[0]);
     dim_array = mxGetDimensions(prhs[0]);
     
     /*Handling Matlab input data*/
-    if ((nrhs < 2) || (nrhs > 5)) mexErrMsgTxt("At least 2 parameters is required: Image(2D/3D), Regularization parameter. The full list of parameters: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1')");
+    if ((nrhs < 2) || (nrhs > 7)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularization parameter. The full list of parameters: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1'), nonnegativity switch, print switch");
     
     Input  = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */
     lambda =  (float) mxGetScalar(prhs[1]); /* regularization parameter */
     iter = 300; /* default iterations number */
     epsil = 0.0001; /* default tolerance constant */
     methTV = 0;  /* default isotropic TV penalty */
+    nonneg = 0; /* default nonnegativity switch, off - 0 */
+    printswitch = 0; /*default print is switched, off - 0 */
     
-    if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5))  iter = (int) mxGetScalar(prhs[2]); /* iterations number */
-    if ((nrhs == 4) || (nrhs == 5))  epsil =  (float) mxGetScalar(prhs[3]); /* tolerance constant */
-    if (nrhs == 5)  {
+    if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); }
+    
+    if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7))  iter = (int) mxGetScalar(prhs[2]); /* iterations number */
+    if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7))  epsil =  (float) mxGetScalar(prhs[3]); /* tolerance constant */
+    if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7))  {
         char *penalty_type;
         penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */
         if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',");
         if (strcmp(penalty_type, "l1") == 0)  methTV = 1;  /* enable 'l1' penalty */
         mxFree(penalty_type);
     }
-        
-    if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); }
+    if ((nrhs == 6) || (nrhs == 7))  {
+        nonneg = (int) mxGetScalar(prhs[5]);
+        if ((nonneg != 0) && (nonneg != 1)) mexErrMsgTxt("Nonnegativity constraint can be enabled by choosing 1 or off - 0");
+    }
+    if (nrhs == 7)  {
+        printswitch = (int) mxGetScalar(prhs[6]);
+        if ((printswitch != 0) && (printswitch != 1)) mexErrMsgTxt("Print can be enabled by choosing 1 or off - 0");
+    }
     
     /*Handling Matlab output data*/
-    dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];       
+    dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];
     
     if (number_of_dims == 2) {
         dimZ = 1; /*2D case*/
         Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        TV_FGP_CPU_main(Input, Output, lambda, iter, epsil, methTV, 0, 0, dimX, dimY, dimZ);
-    } 
-    if (number_of_dims == 3) {          
-    }   
-}
+    }
+    if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+    
+    /* running the function */
+    TV_FGP_CPU_main(Input, Output, lambda, iter, epsil, methTV, nonneg, printswitch, dimX, dimY, dimZ);
+}
\ No newline at end of file
diff --git a/Wrappers/Matlab/mex_compile/regularizers_CPU/LLT_model.c b/Wrappers/Matlab/mex_compile/regularizers_CPU/LLT_model.c
deleted file mode 100644
index 0b07b47..0000000
--- a/Wrappers/Matlab/mex_compile/regularizers_CPU/LLT_model.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
-This work is part of the Core Imaging Library developed by
-Visual Analytics and Imaging System Group of the Science Technology
-Facilities Council, STFC
-
-Copyright 2017 Daniil Kazantsev
-Copyright 2017 Srikanth Nagella, Edoardo Pasca
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-#include "mex.h"
-#include "matrix.h"
-#include "LLT_model_core.h"
-
-/* C-OMP implementation of Lysaker, Lundervold and Tai (LLT) model of higher order regularization penalty
-*
-* Input Parameters:
-* 1. U0 - original noise image/volume
-* 2. lambda - regularization parameter
-* 3. tau - time-step  for explicit scheme
-* 4. iter - iterations number
-* 5. epsil  - tolerance constant (to terminate earlier)
-* 6. switcher - default is 0, switch to (1) to restrictive smoothing in Z dimension (in test)
-*
-* Output:
-* Filtered/regularized image
-*
-* Example:
-* figure;
-* Im = double(imread('lena_gray_256.tif'))/255;  % loading image
-* u0 = Im + .03*randn(size(Im)); % adding noise
-* [Den] = LLT_model(single(u0), 10, 0.1, 1);
-*
-*
-* to compile with OMP support: mex LLT_model.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
-* References: Lysaker, Lundervold and Tai (LLT) 2003, IEEE
-*
-* 28.11.16/Harwell
-*/
-
-void mexFunction(
-        int nlhs, mxArray *plhs[],
-        int nrhs, const mxArray *prhs[])
-        
-{
-    int number_of_dims, iter, dimX, dimY, dimZ, ll, j, count, switcher;
-    const int  *dim_array;
-    float *U0, *U=NULL, *U_old=NULL, *D1=NULL, *D2=NULL, *D3=NULL, lambda, tau, re, re1, epsil, re_old;
-    unsigned short *Map=NULL;
-    
-    number_of_dims = mxGetNumberOfDimensions(prhs[0]);
-    dim_array = mxGetDimensions(prhs[0]);
-    
-    /*Handling Matlab input data*/
-    U0  = (float *) mxGetData(prhs[0]); /*origanal noise image/volume*/
-    if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input in single precision is required"); }
-    lambda =  (float) mxGetScalar(prhs[1]); /*regularization parameter*/
-    tau =  (float) mxGetScalar(prhs[2]); /* time-step */
-    iter =  (int) mxGetScalar(prhs[3]); /*iterations number*/
-    epsil =  (float) mxGetScalar(prhs[4]); /* tolerance constant */
-    switcher =  (int) mxGetScalar(prhs[5]); /*switch on (1) restrictive smoothing in Z dimension*/
-     
-    /*Handling Matlab output data*/
-    dimX = dim_array[0]; dimY = dim_array[1];  dimZ = 1;
-    
-    if (number_of_dims == 2) {
-        /*2D case*/       
-        U = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        U_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        D1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        D2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-    }
-    else if (number_of_dims == 3) {
-        /*3D case*/
-        dimZ = dim_array[2];
-        U = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        U_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        D1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        D2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        D3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        if (switcher != 0) {
-        Map = (unsigned short*)mxGetPr(plhs[1] = mxCreateNumericArray(3, dim_array, mxUINT16_CLASS, mxREAL));       
-        }
-    }
-    else {mexErrMsgTxt("The input data should be 2D or 3D");}
-    
-    /*Copy U0 to U*/
-    copyIm(U0, U, dimX, dimY, dimZ);
-    
-    count = 1;
-    re_old = 0.0f; 
-    if (number_of_dims == 2) {
-        for(ll = 0; ll < iter; ll++) {
-            
-            copyIm(U, U_old, dimX, dimY, dimZ);
-            
-            /*estimate inner derrivatives */
-            der2D(U, D1, D2, dimX, dimY, dimZ);
-            /* calculate div^2 and update */
-            div_upd2D(U0, U, D1, D2, dimX, dimY, dimZ, lambda, tau);
-            
-            /* calculate norm to terminate earlier */
-            re = 0.0f; re1 = 0.0f;
-            for(j=0; j<dimX*dimY*dimZ; j++)
-            {
-                re += pow(U_old[j] - U[j],2);
-                re1 += pow(U_old[j],2);
-            }
-            re = sqrt(re)/sqrt(re1);           
-            if (re < epsil)  count++;
-            if (count > 4) break;            
-            
-            /* check that the residual norm is decreasing */
-            if (ll > 2) {
-                if (re > re_old) break; 
-            }
-            re_old = re;         
-        
-        } /*end of iterations*/
-        printf("HO iterations stopped at iteration: %i\n", ll);          
-    }
-    /*3D version*/
-    if (number_of_dims == 3) {
-        
-        if (switcher == 1) {
-            /* apply restrictive smoothing */            
-            calcMap(U, Map, dimX, dimY, dimZ);
-            /*clear outliers */
-            cleanMap(Map, dimX, dimY, dimZ);           
-        }
-        for(ll = 0; ll < iter; ll++) {
-            
-            copyIm(U, U_old, dimX, dimY, dimZ);
-            
-            /*estimate inner derrivatives */
-            der3D(U, D1, D2, D3, dimX, dimY, dimZ);          
-            /* calculate div^2 and update */
-            div_upd3D(U0, U, D1, D2, D3, Map, switcher, dimX, dimY, dimZ, lambda, tau);                 
-            
-            /* calculate norm to terminate earlier */
-            re = 0.0f; re1 = 0.0f;
-            for(j=0; j<dimX*dimY*dimZ; j++)
-            {
-                re += pow(U_old[j] - U[j],2);
-                re1 += pow(U_old[j],2);
-            }
-            re = sqrt(re)/sqrt(re1);           
-            if (re < epsil)  count++;
-            if (count > 4) break;            
-            
-            /* check that the residual norm is decreasing */
-            if (ll > 2) {
-                if (re > re_old) break; 
-            }
-            re_old = re;     
-            
-        } /*end of iterations*/
-        printf("HO iterations stopped at iteration: %i\n", ll);       
-    }
-}
diff --git a/Wrappers/Matlab/mex_compile/regularizers_CPU/PatchBased_Regul.c b/Wrappers/Matlab/mex_compile/regularizers_CPU/PatchBased_Regul.c
deleted file mode 100644
index 9c925df..0000000
--- a/Wrappers/Matlab/mex_compile/regularizers_CPU/PatchBased_Regul.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
-This work is part of the Core Imaging Library developed by
-Visual Analytics and Imaging System Group of the Science Technology
-Facilities Council, STFC
-
-Copyright 2017 Daniil Kazantsev
-Copyright 2017 Srikanth Nagella, Edoardo Pasca
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-#include "mex.h"
-#include "matrix.h"
-#include "PatchBased_Regul_core.h"
-
-
-/* C-OMP implementation of  patch-based (PB) regularization (2D and 3D cases). 
- * This method finds self-similar patches in data and performs one fixed point iteration to mimimize the PB penalty function
- * 
- * References: 1. Yang Z. & Jacob M. "Nonlocal Regularization of Inverse Problems"
- *             2. Kazantsev D. et al. "4D-CT reconstruction with unified spatial-temporal patch-based regularization"
- *
- * Input Parameters:
- * 1. Image (2D or 3D) [required]
- * 2. ratio of the searching window (e.g. 3 = (2*3+1) = 7 pixels window) [optional]
- * 3. ratio of the similarity window (e.g. 1 = (2*1+1) = 3 pixels window) [optional]
- * 4. h - parameter for the PB penalty function [optional]
- * 5. lambda - regularization parameter  [optional]
-
- * Output:
- * 1. regularized (denoised) Image (N x N)/volume (N x N x N)
- *
- * 2D denoising example in Matlab:   
-   Im = double(imread('lena_gray_256.tif'))/255;  % loading image
-   u0 = Im + .03*randn(size(Im)); u0(u0<0) = 0; % adding noise
-   ImDen = PatchBased_Regul(single(u0), 3, 1, 0.08, 0.05); 
- *
- * Matlab + C/mex compilers needed
- * to compile with OMP support: mex PatchBased_Regul.c CFLAGS="\$CFLAGS -fopenmp -Wall" LDFLAGS="\$LDFLAGS -fopenmp"
- *
- * D. Kazantsev *
- * 02/07/2014
- * Harwell, UK
- */
-
-
-void mexFunction(
-        int nlhs, mxArray *plhs[],
-        int nrhs, const mxArray *prhs[]) 
-{    
-    int N, M, Z, numdims, SearchW, SimilW, SearchW_real, padXY, newsizeX, newsizeY, newsizeZ, switchpad_crop;
-    const int  *dims;
-    float *A, *B=NULL, *Ap=NULL, *Bp=NULL, h, lambda;
-    
-    numdims = mxGetNumberOfDimensions(prhs[0]);
-    dims = mxGetDimensions(prhs[0]);
-    
-    N = dims[0];
-    M = dims[1];
-    Z = dims[2];
-    
-    if ((numdims < 2) || (numdims > 3)) {mexErrMsgTxt("The input is 2D image or 3D volume");}
-    if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input in single precision is required"); }
-    
-    if(nrhs != 5) mexErrMsgTxt("Five inputs reqired: Image(2D,3D), SearchW, SimilW, Threshold, Regularization parameter");
-    
-    /*Handling inputs*/
-    A  = (float *) mxGetData(prhs[0]);    /* the image/volume to regularize/filter */
-    SearchW_real = 3; /*default value*/
-    SimilW = 1; /*default value*/
-    h = 0.1; 
-    lambda = 0.1;
-    
-    if ((nrhs == 2) || (nrhs == 3) || (nrhs == 4) || (nrhs == 5))   SearchW_real  = (int) mxGetScalar(prhs[1]); /* the searching window ratio */
-    if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5))   SimilW =  (int) mxGetScalar(prhs[2]);  /* the similarity window ratio */
-    if ((nrhs == 4) || (nrhs == 5))  h =  (float) mxGetScalar(prhs[3]);  /* parameter for the PB filtering function */
-    if ((nrhs == 5))  lambda = (float) mxGetScalar(prhs[4]); /* regularization parameter */   
-
-
-    if (h <= 0) mexErrMsgTxt("Parmeter for the PB penalty function should be > 0");
-    if (lambda <= 0) mexErrMsgTxt(" Regularization parmeter should be > 0");
-       
-    SearchW = SearchW_real + 2*SimilW;
-    
-    /* SearchW_full = 2*SearchW + 1; */ /* the full searching window  size */
-    /* SimilW_full = 2*SimilW + 1;  */  /* the full similarity window  size */
-    
-    padXY = SearchW + 2*SimilW; /* padding sizes */
-    newsizeX = N + 2*(padXY); /* the X size of the padded array */
-    newsizeY = M + 2*(padXY); /* the Y size of the padded array */
-    newsizeZ = Z + 2*(padXY); /* the Z size of the padded array */
-    int N_dims[] = {newsizeX, newsizeY, newsizeZ};
-    
-    /******************************2D case ****************************/
-    if (numdims == 2) {
-        /*Handling output*/
-        B = (float*)mxGetData(plhs[0] = mxCreateNumericMatrix(N, M, mxSINGLE_CLASS, mxREAL));
-        /*allocating memory for the padded arrays */
-        Ap = (float*)mxGetData(mxCreateNumericMatrix(newsizeX, newsizeY, mxSINGLE_CLASS, mxREAL));
-        Bp = (float*)mxGetData(mxCreateNumericMatrix(newsizeX, newsizeY, mxSINGLE_CLASS, mxREAL));
-        /**************************************************************************/
-        /*Perform padding of image A to the size of [newsizeX * newsizeY] */
-        switchpad_crop = 0; /*padding*/
-        pad_crop(A, Ap, M, N, 0, newsizeY, newsizeX, 0, padXY, switchpad_crop);
-        
-        /* Do PB regularization with the padded array  */
-        PB_FUNC2D(Ap, Bp, newsizeY, newsizeX, padXY, SearchW, SimilW, (float)h, (float)lambda);
-        
-        switchpad_crop = 1; /*cropping*/
-        pad_crop(Bp, B, M, N, 0, newsizeY, newsizeX, 0, padXY, switchpad_crop);
-    }
-    else
-    {
-        /******************************3D case ****************************/
-        /*Handling output*/
-        B = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dims, mxSINGLE_CLASS, mxREAL));
-        /*allocating memory for the padded arrays */
-        Ap = (float*)mxGetPr(mxCreateNumericArray(3, N_dims, mxSINGLE_CLASS, mxREAL));
-        Bp = (float*)mxGetPr(mxCreateNumericArray(3, N_dims, mxSINGLE_CLASS, mxREAL));
-        /**************************************************************************/
-        
-        /*Perform padding of image A to the size of [newsizeX * newsizeY * newsizeZ] */
-        switchpad_crop = 0; /*padding*/
-        pad_crop(A, Ap, M, N, Z, newsizeY, newsizeX, newsizeZ, padXY, switchpad_crop);
-        
-        /* Do PB regularization with the padded array  */
-        PB_FUNC3D(Ap, Bp, newsizeY, newsizeX, newsizeZ, padXY, SearchW, SimilW, (float)h, (float)lambda);
-        
-        switchpad_crop = 1; /*cropping*/
-        pad_crop(Bp, B, M, N, Z, newsizeY, newsizeX, newsizeZ, padXY, switchpad_crop);
-    } /*end else ndims*/ 
-}    
diff --git a/Wrappers/Matlab/mex_compile/regularizers_CPU/ROF_TV.c b/Wrappers/Matlab/mex_compile/regularizers_CPU/ROF_TV.c
index a800add..6b9e1ea 100644
--- a/Wrappers/Matlab/mex_compile/regularizers_CPU/ROF_TV.c
+++ b/Wrappers/Matlab/mex_compile/regularizers_CPU/ROF_TV.c
@@ -20,20 +20,21 @@
 #include "mex.h"
 #include "ROF_TV_core.h"
 
-/* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case)
- *
+/* ROF-TV denoising/regularization model [1] (2D/3D case)
+ * (MEX wrapper for MATLAB)
+ * 
  * Input Parameters:
  * 1. Noisy image/volume [REQUIRED]
  * 2. lambda - regularization parameter [REQUIRED]
- * 3. tau - marching step for explicit scheme, ~0.001 is recommended [REQUIRED]
- * 4. Number of iterations, for explicit scheme >= 150 is recommended  [REQUIRED]
+ * 3. Number of iterations, for explicit scheme >= 150 is recommended  [REQUIRED]
+ * 4. tau - marching step for explicit scheme, ~1 is recommended [REQUIRED]
  *
  * Output:
- * [1] Regularized image/volume
+ * [1] Regularized image/volume 
  *
  * This function is based on the paper by
  * [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms"
- * compile: mex ROF_TV.c ROF_TV_core.c utils.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
+ *
  * D. Kazantsev, 2016-18
  */
 
@@ -42,65 +43,31 @@ void mexFunction(
         int nrhs, const mxArray *prhs[])
         
 {
-    int i, number_of_dims, iter_numb, dimX, dimY, dimZ;
+    int number_of_dims, iter_numb, dimX, dimY, dimZ;
     const int  *dim_array;
-    float *A, *B, *D1, *D2, *D3, lambda, tau;
+    float *Input, *Output=NULL, lambda, tau;
     
     dim_array = mxGetDimensions(prhs[0]);
     number_of_dims = mxGetNumberOfDimensions(prhs[0]);
     
     /*Handling Matlab input data*/
-    A  = (float *) mxGetData(prhs[0]);
+    Input  = (float *) mxGetData(prhs[0]);
     lambda =  (float) mxGetScalar(prhs[1]); /* regularization parameter */
-    tau =  (float) mxGetScalar(prhs[2]); /* marching step parameter */
-    iter_numb =  (int) mxGetScalar(prhs[3]); /* iterations number */
+    iter_numb =  (int) mxGetScalar(prhs[2]); /* iterations number */
+    tau =  (float) mxGetScalar(prhs[3]); /* marching step parameter */  
     
     if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); }
+    if(nrhs != 4) mexErrMsgTxt("Four inputs reqired: Image(2D,3D), regularization parameter, iterations number,  marching step constant");
     /*Handling Matlab output data*/
     dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];
     
     /* output arrays*/
     if (number_of_dims == 2) {
-        dimZ = 0; /*2D case*/
-        B = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        D1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        D2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        
-        /* copy into B */
-        copyIm(A, B, dimX, dimY, 1);
-        
-        /* start TV iterations */
-        for(i=0; i < iter_numb; i++) {
-            
-            /* calculate differences */
-            D1_func(B, D1, dimX, dimY, dimZ);
-            D2_func(B, D2, dimX, dimY, dimZ);
-            
-            /* calculate divergence and image update*/
-            TV_main(D1, D2, D2, B, A, lambda, tau, dimX, dimY, dimZ);
-        }
-    }
-    
-    if (number_of_dims == 3) {
-        B = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        D1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        D2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        D3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        
-        /* copy into B */
-        copyIm(A, B, dimX, dimY, dimZ);
-        
-        /* start TV iterations */
-        for(i=0; i < iter_numb; i++) {
-            
-        /* calculate differences */
-        D1_func(B, D1, dimX, dimY, dimZ);
-        D2_func(B, D2, dimX, dimY, dimZ);
-        D3_func(B, D3, dimX, dimY, dimZ);
-        
-        /* calculate divergence and image update*/
-        TV_main(D1, D2, D3, B, A, lambda, tau, dimX, dimY, dimZ);
-        }
-    }
+        dimZ = 1; /*2D case*/
+        /* output image/volume */
+        Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));                        
+    }    
+    if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
     
-}
+    TV_ROF_CPU_main(Input, Output, lambda, iter_numb, tau, dimX, dimY, dimZ);    
+}
\ No newline at end of file
diff --git a/Wrappers/Matlab/mex_compile/regularizers_CPU/SplitBregman_TV.c b/Wrappers/Matlab/mex_compile/regularizers_CPU/SplitBregman_TV.c
deleted file mode 100644
index 38f6a9d..0000000
--- a/Wrappers/Matlab/mex_compile/regularizers_CPU/SplitBregman_TV.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
-This work is part of the Core Imaging Library developed by
-Visual Analytics and Imaging System Group of the Science Technology
-Facilities Council, STFC
-
-Copyright 2017 Daniil Kazantsev
-Copyright 2017 Srikanth Nagella, Edoardo Pasca
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-#include "mex.h"
-#include <matrix.h>
-#include "SplitBregman_TV_core.h"
-
-/* C-OMP implementation of Split Bregman - TV denoising-regularization model (2D/3D)
- *
- * Input Parameters:
- * 1. Noisy image/volume
- * 2. lambda - regularization parameter
- * 3. Number of iterations [OPTIONAL parameter]
- * 4. eplsilon - tolerance constant [OPTIONAL parameter]
- * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter]
- *
- * Output:
- * Filtered/regularized image
- *
- * Example:
- * figure;
- * Im = double(imread('lena_gray_256.tif'))/255;  % loading image
- * u0 = Im + .05*randn(size(Im)); u0(u0 < 0) = 0;
- * u = SplitBregman_TV(single(u0), 10, 30, 1e-04);
- *
- * to compile with OMP support: mex SplitBregman_TV.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
- * References:
- * The Split Bregman Method for L1 Regularized Problems, by Tom Goldstein and Stanley Osher.
- * D. Kazantsev, 2016*
- */
-
-
-void mexFunction(
-        int nlhs, mxArray *plhs[],
-        int nrhs, const mxArray *prhs[])
-        
-{
-    int number_of_dims, iter, dimX, dimY, dimZ, ll, j, count, methTV;
-    const int  *dim_array;
-    float *A, *U=NULL, *U_old=NULL, *Dx=NULL, *Dy=NULL, *Dz=NULL, *Bx=NULL, *By=NULL, *Bz=NULL, lambda, mu, epsil, re, re1, re_old;
-    
-    number_of_dims = mxGetNumberOfDimensions(prhs[0]);
-    dim_array = mxGetDimensions(prhs[0]);
-    
-    /*Handling Matlab input data*/
-    if ((nrhs < 2) || (nrhs > 5)) mexErrMsgTxt("At least 2 parameters is required: Image(2D/3D), Regularization parameter. The full list of parameters: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1')");
-    
-    /*Handling Matlab input data*/
-    A  = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */
-    mu =  (float) mxGetScalar(prhs[1]); /* regularization parameter */
-    iter = 35; /* default iterations number */
-    epsil = 0.0001; /* default tolerance constant */
-    methTV = 0;  /* default isotropic TV penalty */
-    if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5))  iter = (int) mxGetScalar(prhs[2]); /* iterations number */
-    if ((nrhs == 4) || (nrhs == 5))  epsil =  (float) mxGetScalar(prhs[3]); /* tolerance constant */
-    if (nrhs == 5)  {
-        char *penalty_type;
-        penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */
-        if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',");
-        if (strcmp(penalty_type, "l1") == 0)  methTV = 1;  /* enable 'l1' penalty */
-        mxFree(penalty_type);
-    }
-    if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); }
-    
-    lambda = 2.0f*mu;
-    count = 1;
-    re_old = 0.0f;
-    /*Handling Matlab output data*/
-    dimY = dim_array[0]; dimX = dim_array[1]; dimZ = dim_array[2];
-    
-    if (number_of_dims == 2) {
-        dimZ = 1; /*2D case*/
-        U = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        U_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        Dx = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        Dy = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        Bx = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        By = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        
-        copyIm(A, U, dimX, dimY, dimZ); /*initialize */
-        
-        /* begin outer SB iterations */
-        for(ll=0; ll<iter; ll++) {
-            
-            /*storing old values*/
-            copyIm(U, U_old, dimX, dimY, dimZ);
-            
-            /*GS iteration */
-            gauss_seidel2D(U, A, Dx, Dy, Bx, By, dimX, dimY, lambda, mu);
-            
-            if (methTV == 1)  updDxDy_shrinkAniso2D(U, Dx, Dy, Bx, By, dimX, dimY, lambda);
-            else updDxDy_shrinkIso2D(U, Dx, Dy, Bx, By, dimX, dimY, lambda);
-            
-            updBxBy2D(U, Dx, Dy, Bx, By, dimX, dimY);
-            
-            /* calculate norm to terminate earlier */
-            re = 0.0f; re1 = 0.0f;
-            for(j=0; j<dimX*dimY*dimZ; j++)
-            {
-                re += pow(U_old[j] - U[j],2);
-                re1 += pow(U_old[j],2);
-            }
-            re = sqrt(re)/sqrt(re1);
-            if (re < epsil)  count++;
-            if (count > 4) break;
-            
-            /* check that the residual norm is decreasing */
-            if (ll > 2) {
-                if (re > re_old) break;
-            }
-            re_old = re;
-            /*printf("%f %i %i \n", re, ll, count); */
-            
-            /*copyIm(U_old, U, dimX, dimY, dimZ); */
-        }
-        printf("SB iterations stopped at iteration: %i\n", ll);
-    }
-    if (number_of_dims == 3) {
-        U = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        U_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        Dx = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        Dy = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        Dz = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        Bx = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        By = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        Bz = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        
-        copyIm(A, U, dimX, dimY, dimZ); /*initialize */
-        
-        /* begin outer SB iterations */
-        for(ll=0; ll<iter; ll++) {
-            
-            /*storing old values*/
-            copyIm(U, U_old, dimX, dimY, dimZ);
-            
-            /*GS iteration */
-            gauss_seidel3D(U, A, Dx, Dy, Dz, Bx, By, Bz, dimX, dimY, dimZ, lambda, mu);
-            
-            if (methTV == 1) updDxDyDz_shrinkAniso3D(U, Dx, Dy, Dz, Bx, By, Bz, dimX, dimY, dimZ, lambda);
-            else updDxDyDz_shrinkIso3D(U, Dx, Dy, Dz, Bx, By, Bz, dimX, dimY, dimZ, lambda);
-            
-            updBxByBz3D(U, Dx, Dy, Dz, Bx, By, Bz, dimX, dimY, dimZ);
-            
-            /* calculate norm to terminate earlier */
-            re = 0.0f; re1 = 0.0f;
-            for(j=0; j<dimX*dimY*dimZ; j++)
-            {
-                re += pow(U[j] - U_old[j],2);
-                re1 += pow(U[j],2);
-            }
-            re = sqrt(re)/sqrt(re1);
-            if (re < epsil)  count++;
-            if (count > 4) break;
-            
-            /* check that the residual norm is decreasing */
-            if (ll > 2) {
-                if (re > re_old) break; }
-            /*printf("%f %i %i \n", re, ll, count); */
-            re_old = re;
-        }
-        printf("SB iterations stopped at iteration: %i\n", ll);
-    }
-}
\ No newline at end of file
diff --git a/Wrappers/Matlab/mex_compile/regularizers_CPU/TGV_PD.c b/Wrappers/Matlab/mex_compile/regularizers_CPU/TGV_PD.c
deleted file mode 100644
index c9cb440..0000000
--- a/Wrappers/Matlab/mex_compile/regularizers_CPU/TGV_PD.c
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
-This work is part of the Core Imaging Library developed by
-Visual Analytics and Imaging System Group of the Science Technology
-Facilities Council, STFC
-
-Copyright 2017 Daniil Kazantsev
-Copyright 2017 Srikanth Nagella, Edoardo Pasca
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-#include "TGV_PD_core.h"
-#include "mex.h"
-
-/* C-OMP implementation of Primal-Dual denoising method for 
- * Total Generilized Variation (TGV)-L2 model (2D case only)
- *
- * Input Parameters:
- * 1. Noisy image/volume (2D)
- * 2. lambda - regularization parameter
- * 3. parameter to control first-order term (alpha1)
- * 4. parameter to control the second-order term (alpha0)
- * 5. Number of CP iterations
- *
- * Output:
- * Filtered/regularized image 
- *
- * Example:
- * figure;
- * Im = double(imread('lena_gray_256.tif'))/255;  % loading image
- * u0 = Im + .03*randn(size(Im)); % adding noise
- * tic; u = TGV_PD(single(u0), 0.02, 1.3, 1, 550); toc;
- *
- * to compile with OMP support: mex TGV_PD.c TGV_PD_core.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
- * References:
- * K. Bredies "Total Generalized Variation"
- *
- * 28.11.16/Harwell
- */
- 
-void mexFunction(
-        int nlhs, mxArray *plhs[],
-        int nrhs, const mxArray *prhs[])
-        
-{
-    int number_of_dims, iter, dimX, dimY, dimZ, ll;
-    const int  *dim_array;
-    float *A, *U, *U_old, *P1, *P2, *Q1, *Q2, *Q3, *V1, *V1_old, *V2, *V2_old, lambda, L2, tau, sigma,  alpha1, alpha0;
-    
-    number_of_dims = mxGetNumberOfDimensions(prhs[0]);
-    dim_array = mxGetDimensions(prhs[0]);
-    
-    /*Handling Matlab input data*/
-    A  = (float *) mxGetData(prhs[0]); /*origanal noise image/volume*/
-    if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input in single precision is required"); }
-    lambda =  (float) mxGetScalar(prhs[1]); /*regularization parameter*/
-    alpha1 =  (float) mxGetScalar(prhs[2]); /*first-order term*/
-    alpha0 =  (float) mxGetScalar(prhs[3]); /*second-order term*/
-    iter =  (int) mxGetScalar(prhs[4]); /*iterations number*/
-    if(nrhs != 5) mexErrMsgTxt("Five input parameters is reqired: Image(2D/3D), Regularization parameter, alpha1, alpha0, Iterations");
-    
-    /*Handling Matlab output data*/
-    dimX = dim_array[0]; dimY = dim_array[1];
-    
-    if (number_of_dims == 2) {
-        /*2D case*/
-        dimZ = 1;
-        U = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        
-        /*dual variables*/
-        P1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        P2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        
-        Q1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        Q2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        Q3 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        
-        U_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        
-        V1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        V1_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        V2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        V2_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));   
-        
-        
-         /*printf("%i \n", i);*/
-    L2 = 12.0f; /*Lipshitz constant*/
-    tau = 1.0/pow(L2,0.5);
-    sigma = 1.0/pow(L2,0.5);
-    
-    /*Copy A to U*/
-    copyIm(A, U, dimX, dimY, dimZ);
-    
-        /* Here primal-dual iterations begin for 2D */
-        for(ll = 0; ll < iter; ll++) {
-            
-            /* Calculate Dual Variable P */
-            DualP_2D(U, V1, V2, P1, P2, dimX, dimY, dimZ, sigma);
-            
-            /*Projection onto convex set for P*/
-            ProjP_2D(P1, P2, dimX, dimY, dimZ, alpha1);
-            
-            /* Calculate Dual Variable Q */
-            DualQ_2D(V1, V2, Q1, Q2, Q3, dimX, dimY, dimZ, sigma);
-            
-            /*Projection onto convex set for Q*/
-            ProjQ_2D(Q1, Q2, Q3, dimX, dimY, dimZ, alpha0);
-            
-            /*saving U into U_old*/
-            copyIm(U, U_old, dimX, dimY, dimZ);
-            
-            /*adjoint operation  -> divergence and projection of P*/
-            DivProjP_2D(U, A, P1, P2, dimX, dimY, dimZ, lambda, tau);
-            
-            /*get updated solution U*/
-            newU(U, U_old, dimX, dimY, dimZ);
-            
-            /*saving V into V_old*/
-            copyIm(V1, V1_old, dimX, dimY, dimZ);
-            copyIm(V2, V2_old, dimX, dimY, dimZ);
-            
-            /* upd V*/
-            UpdV_2D(V1, V2, P1, P2, Q1, Q2, Q3, dimX, dimY, dimZ, tau);
-            
-            /*get new V*/
-            newU(V1, V1_old, dimX, dimY, dimZ);
-            newU(V2, V2_old, dimX, dimY, dimZ);
-        } /*end of iterations*/        
-    }
-    else if (number_of_dims == 3) {
-        mexErrMsgTxt("The input data should be a 2D array");
-        /*3D case*/
-    }
-    else {mexErrMsgTxt("The input data should be a 2D array");}    
-   
-}
diff --git a/Wrappers/Matlab/mex_compile/regularizers_GPU/Diffus_HO/Diff4thHajiaboli_GPU.cpp b/Wrappers/Matlab/mex_compile/regularizers_GPU/Diffus_HO/Diff4thHajiaboli_GPU.cpp
deleted file mode 100644
index 7fd6077..0000000
--- a/Wrappers/Matlab/mex_compile/regularizers_GPU/Diffus_HO/Diff4thHajiaboli_GPU.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-#include "mex.h"
-#include <matrix.h>
-#include <math.h>
-#include <stdlib.h>
-#include <memory.h>
-#include <stdio.h>
-#include <iostream>
-#include "Diff4th_GPU_kernel.h"
-
-/*
- * 2D and 3D CUDA implementation of the 4th order PDE denoising model by Hajiaboli
- *
- * Reference :
- * "An anisotropic fourth-order diffusion filter for image noise removal" by M. Hajiaboli
- * 
- * Example
- * figure;
- * Im = double(imread('lena_gray_256.tif'))/255;  % loading image
- * u0 = Im + .05*randn(size(Im)); % adding noise
- * u = Diff4thHajiaboli_GPU(single(u0), 0.02, 150);
- * subplot (1,2,1); imshow(u0,[ ]); title('Noisy Image')
- * subplot (1,2,2); imshow(u,[ ]); title('Denoised Image')
- *
- *
- * Linux/Matlab compilation:
- * compile in terminal: nvcc -Xcompiler -fPIC -shared -o Diff4th_GPU_kernel.o Diff4th_GPU_kernel.cu
- * then compile in Matlab: mex -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart Diff4thHajiaboli_GPU.cpp Diff4th_GPU_kernel.o
- 
- void mexFunction(
-int nlhs, mxArray *plhs[],
-int nrhs, const mxArray *prhs[])
-where:
-prhs Array of pointers to the INPUT mxArrays
-nrhs int number of INPUT mxArrays
-
-nlhs Array of pointers to the OUTPUT mxArrays
-plhs int number of OUTPUT mxArrays
-
- */
-
-void mexFunction(
-        int nlhs, mxArray *plhs[],
-        int nrhs, const mxArray *prhs[])
-{
-    int numdims, dimZ, size;
-    float *A, *B, *A_L, *B_L;
-    const int *dims;    
-    
-    numdims = mxGetNumberOfDimensions(prhs[0]);
-    dims = mxGetDimensions(prhs[0]);       
-    
-    float sigma = (float)mxGetScalar(prhs[1]); /* edge-preserving parameter */
-    float lambda = (float)mxGetScalar(prhs[2]); /* regularization parameter */
-    int iter = (int)mxGetScalar(prhs[3]); /* iterations number */
-    
-    if (numdims == 2)  {
-        
-        int N, M, Z, i, j;
-        Z = 0; // for the 2D case
-        float tau = 0.01; // time step is sufficiently small for an explicit methods
-        
-        /*Input data*/
-        A = (float*)mxGetData(prhs[0]);        
-        N = dims[0] + 2;
-        M = dims[1] + 2;
-        A_L = (float*)mxGetData(mxCreateNumericMatrix(N, M, mxSINGLE_CLASS, mxREAL));
-        B_L = (float*)mxGetData(mxCreateNumericMatrix(N, M, mxSINGLE_CLASS, mxREAL));
-        
-        /*Output data*/
-        B = (float*)mxGetData(plhs[0] = mxCreateNumericMatrix(dims[0], dims[1], mxSINGLE_CLASS, mxREAL));        
-        
-        // copy A to the bigger A_L with boundaries
-        #pragma omp parallel for shared(A_L, A) private(i,j)
-        for (i=0; i < N; i++) {
-            for (j=0; j < M; j++) {
-                if (((i > 0) && (i < N-1)) &&  ((j > 0) && (j < M-1)))  A_L[i*M+j] = A[(i-1)*(dims[1])+(j-1)];
-            }}
-        
-        // Running CUDA code here
-        Diff4th_GPU_kernel(A_L, B_L, N, M, Z, (float)sigma, iter, (float)tau, lambda);
-        
-        // copy the processed B_L to a smaller B
-        #pragma omp parallel for shared(B_L, B) private(i,j)
-        for (i=0; i < N; i++) {
-            for (j=0; j < M; j++) {
-                if (((i > 0) && (i < N-1)) &&  ((j > 0) && (j < M-1)))   B[(i-1)*(dims[1])+(j-1)] = B_L[i*M+j];
-            }}
-    }
-    if (numdims == 3)  {
-        //  3D image denoising / regularization
-        int N, M, Z, i, j, k;
-        float tau = 0.0007; // Time Step is small for an explicit methods
-        A = (float*)mxGetData(prhs[0]);
-        N = dims[0] + 2;
-        M = dims[1] + 2;
-        Z = dims[2] + 2;
-        int N_dims[] = {N, M, Z};
-        A_L = (float*)mxGetPr(mxCreateNumericArray(3, N_dims, mxSINGLE_CLASS, mxREAL));
-        B_L = (float*)mxGetPr(mxCreateNumericArray(3, N_dims, mxSINGLE_CLASS, mxREAL));
-        
-        /* output data */
-        B = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dims, mxSINGLE_CLASS, mxREAL));
-        
-        // copy A to the bigger A_L with boundaries
-        #pragma omp parallel for shared(A_L, A) private(i,j,k)
-        for (i=0; i < N; i++) {
-            for (j=0; j < M; j++) {
-                for (k=0; k < Z; k++) {
-                    if (((i > 0) && (i < N-1)) &&  ((j > 0) && (j < M-1)) &&  ((k > 0) && (k < Z-1))) {
-                        A_L[(N*M)*(k)+(i)*M+(j)] = A[(dims[0]*dims[1])*(k-1)+(i-1)*dims[1]+(j-1)];
-                    }}}}
-        
-        // Running CUDA kernel here for diffusivity
-        Diff4th_GPU_kernel(A_L, B_L, N, M, Z, (float)sigma, iter, (float)tau, lambda);
-        
-        // copy the processed B_L to a smaller B
-        #pragma omp parallel for shared(B_L, B) private(i,j,k)
-        for (i=0; i < N; i++) {
-            for (j=0; j < M; j++) {
-                for (k=0; k < Z; k++) {
-                    if (((i > 0) && (i < N-1)) &&  ((j > 0) && (j < M-1)) &&  ((k > 0) && (k < Z-1))) {
-                        B[(dims[0]*dims[1])*(k-1)+(i-1)*dims[1]+(j-1)] = B_L[(N*M)*(k)+(i)*M+(j)];
-                    }}}}
-    }
-}
\ No newline at end of file
diff --git a/Wrappers/Matlab/mex_compile/regularizers_GPU/FGP_TV_GPU.cpp b/Wrappers/Matlab/mex_compile/regularizers_GPU/FGP_TV_GPU.cpp
new file mode 100644
index 0000000..9ed9ae0
--- /dev/null
+++ b/Wrappers/Matlab/mex_compile/regularizers_GPU/FGP_TV_GPU.cpp
@@ -0,0 +1,95 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "matrix.h"
+#include "mex.h"
+#include "TV_FGP_GPU_core.h"
+
+/* GPU (CUDA) implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume
+ * 2. lambdaPar - regularization parameter
+ * 3. Number of iterations
+ * 4. eplsilon: tolerance constant
+ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
+ * 6. nonneg: 'nonnegativity (0 is OFF by default)
+ * 7. print information: 0 (off) or 1 (on)
+ *
+ * Output:
+ * [1] Filtered/regularized image
+ *
+ * This function is based on the Matlab's code and paper by
+ * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
+ */
+
+void mexFunction(
+        int nlhs, mxArray *plhs[],
+        int nrhs, const mxArray *prhs[])
+        
+{
+    int number_of_dims, iter, dimX, dimY, dimZ, methTV, printswitch, nonneg;
+    const int  *dim_array;
+    float *Input, *Output=NULL, lambda, epsil;
+    
+    number_of_dims = mxGetNumberOfDimensions(prhs[0]);
+    dim_array = mxGetDimensions(prhs[0]);
+    
+    /*Handling Matlab input data*/
+    if ((nrhs < 2) || (nrhs > 7)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D/3D), Regularization parameter. The full list of parameters: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1'), nonnegativity switch, print switch");
+    
+    Input  = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */
+    lambda =  (float) mxGetScalar(prhs[1]); /* regularization parameter */
+    iter = 300; /* default iterations number */
+    epsil = 0.0001; /* default tolerance constant */
+    methTV = 0;  /* default isotropic TV penalty */
+    nonneg = 0; /* default nonnegativity switch, off - 0 */
+    printswitch = 0; /*default print is switched, off - 0 */
+    
+    if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); }
+    
+    if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7))  iter = (int) mxGetScalar(prhs[2]); /* iterations number */
+    if ((nrhs == 4) || (nrhs == 5) || (nrhs == 6) || (nrhs == 7))  epsil =  (float) mxGetScalar(prhs[3]); /* tolerance constant */
+    if ((nrhs == 5) || (nrhs == 6) || (nrhs == 7))  {
+        char *penalty_type;
+        penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */
+        if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',");
+        if (strcmp(penalty_type, "l1") == 0)  methTV = 1;  /* enable 'l1' penalty */
+        mxFree(penalty_type);
+    }
+    if ((nrhs == 6) || (nrhs == 7))  {
+        nonneg = (int) mxGetScalar(prhs[5]);
+        if ((nonneg != 0) && (nonneg != 1)) mexErrMsgTxt("Nonnegativity constraint can be enabled by choosing 1 or off - 0");
+    }
+    if (nrhs == 7)  {
+        printswitch = (int) mxGetScalar(prhs[6]);
+        if ((printswitch != 0) && (printswitch != 1)) mexErrMsgTxt("Print can be enabled by choosing 1 or off - 0");
+    }
+    
+    /*Handling Matlab output data*/
+    dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];
+    
+    if (number_of_dims == 2) {
+        dimZ = 1; /*2D case*/
+        Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
+    }
+    if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+    
+    /* running the function */
+    TV_FGP_GPU_main(Input, Output, lambda, iter, epsil, methTV, nonneg, printswitch, dimX, dimY, dimZ);    
+}
\ No newline at end of file
diff --git a/Wrappers/Matlab/mex_compile/regularizers_GPU/NL_Regul/NLM_GPU.cpp b/Wrappers/Matlab/mex_compile/regularizers_GPU/NL_Regul/NLM_GPU.cpp
deleted file mode 100644
index ff0cc90..0000000
--- a/Wrappers/Matlab/mex_compile/regularizers_GPU/NL_Regul/NLM_GPU.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-#include "mex.h"
-#include <matrix.h>
-#include <math.h>
-#include <stdlib.h>
-#include <memory.h>
-#include <stdio.h>
-#include <iostream>
-#include "NLM_GPU_kernel.h"
-
-/* CUDA implementation of the patch-based (PB) regularization  for 2D and 3D images/volumes  
- * This method finds self-similar patches in data and performs one fixed point iteration to mimimize the PB penalty function
- * 
- * References: 1. Yang Z. & Jacob M. "Nonlocal Regularization of Inverse Problems"
- *             2. Kazantsev D. at. all "4D-CT reconstruction with unified spatial-temporal patch-based regularization"
- *
- * Input Parameters (mandatory):
- * 1. Image/volume (2D/3D)
- * 2. ratio of the searching window (e.g. 3 = (2*3+1) = 7 pixels window)
- * 3. ratio of the similarity window (e.g. 1 = (2*1+1) = 3 pixels window)
- * 4. h - parameter for the PB penalty function
- * 5. lambda - regularization parameter 
-
- * Output:
- * 1. regularized (denoised) Image/volume (N x N x N)
- *
- * In matlab check what kind of GPU you have with "gpuDevice" command,
- * then set your ComputeCapability, here I use -arch compute_35
- *
- * Quick 2D denoising example in Matlab:   
-   Im = double(imread('lena_gray_256.tif'))/255;  % loading image
-   u0 = Im + .03*randn(size(Im)); u0(u0<0) = 0; % adding noise
-   ImDen = NLM_GPU(single(u0), 3, 2, 0.15, 1);
- 
- * Linux/Matlab compilation:
- * compile in terminal: nvcc -Xcompiler -fPIC -shared -o NLM_GPU_kernel.o NLM_GPU_kernel.cu
- * then compile in Matlab: mex -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart NLM_GPU.cpp NLM_GPU_kernel.o
- *
- * D. Kazantsev 
- * 2014-17
- * Harwell/Manchester UK
- */
-
-float pad_crop(float *A, float *Ap, int OldSizeX, int OldSizeY,  int OldSizeZ, int NewSizeX, int NewSizeY, int NewSizeZ, int padXY, int switchpad_crop);
-
-void mexFunction(
-        int nlhs, mxArray *plhs[],
-        int nrhs, const mxArray *prhs[])
-{
-    int N, M, Z, i_n, j_n, k_n, numdims, SearchW, SimilW, SearchW_real, padXY, newsizeX, newsizeY, newsizeZ, switchpad_crop, count, SearchW_full, SimilW_full;
-    const int  *dims;
-    float *A, *B=NULL, *Ap=NULL, *Bp=NULL, *Eucl_Vec, h, h2, lambda, val, denh2;
-    
-    numdims = mxGetNumberOfDimensions(prhs[0]);
-    dims = mxGetDimensions(prhs[0]);
-    
-    N = dims[0];
-    M = dims[1];
-    Z = dims[2];
-    
-    if ((numdims < 2) || (numdims > 3)) {mexErrMsgTxt("The input should be 2D image or 3D volume");}
-    if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input in single precision is required"); }
-    
-    if(nrhs != 5) mexErrMsgTxt("Five inputs reqired: Image(2D,3D), SearchW, SimilW, Threshold, Regularization parameter");
-    
-    /*Handling inputs*/
-    A  = (float *) mxGetData(prhs[0]);    /* the image to regularize/filter */
-    SearchW_real  = (int) mxGetScalar(prhs[1]); /* the searching window ratio */
-    SimilW =  (int) mxGetScalar(prhs[2]);  /* the similarity window ratio */
-    h =  (float) mxGetScalar(prhs[3]);  /* parameter for the PB filtering function */
-    lambda = (float) mxGetScalar(prhs[4]);
-    
-    if (h <= 0) mexErrMsgTxt("Parmeter for the PB penalty function should be > 0");
-      
-    SearchW = SearchW_real + 2*SimilW;
-    
-    SearchW_full = 2*SearchW + 1; /* the full searching window  size */
-    SimilW_full = 2*SimilW + 1;   /* the full similarity window  size */
-    h2 = h*h;
-    
-    padXY = SearchW + 2*SimilW; /* padding sizes */
-    newsizeX = N + 2*(padXY); /* the X size of the padded array */
-    newsizeY = M + 2*(padXY); /* the Y size of the padded array */
-    newsizeZ = Z + 2*(padXY); /* the Z size of the padded array */
-    int N_dims[] = {newsizeX, newsizeY, newsizeZ};
-    
-    /******************************2D case ****************************/
-    if (numdims == 2) {
-        /*Handling output*/
-        B = (float*)mxGetData(plhs[0] = mxCreateNumericMatrix(N, M, mxSINGLE_CLASS, mxREAL));
-        /*allocating memory for the padded arrays */
-        Ap = (float*)mxGetData(mxCreateNumericMatrix(newsizeX, newsizeY, mxSINGLE_CLASS, mxREAL));
-        Bp = (float*)mxGetData(mxCreateNumericMatrix(newsizeX, newsizeY, mxSINGLE_CLASS, mxREAL));
-        Eucl_Vec = (float*)mxGetData(mxCreateNumericMatrix(SimilW_full*SimilW_full, 1, mxSINGLE_CLASS, mxREAL));
-        
-        /*Gaussian kernel */
-        count = 0;
-        for(i_n=-SimilW; i_n<=SimilW; i_n++) {
-            for(j_n=-SimilW; j_n<=SimilW; j_n++) {
-                val = (float)(i_n*i_n + j_n*j_n)/(2*SimilW*SimilW);
-                Eucl_Vec[count] = exp(-val);
-                count = count + 1;
-            }} /*main neighb loop */
-        
-        /**************************************************************************/
-        /*Perform padding of image A to the size of [newsizeX * newsizeY] */
-        switchpad_crop = 0; /*padding*/
-        pad_crop(A, Ap, M, N, 0, newsizeY, newsizeX, 0, padXY, switchpad_crop);
-        
-        /* Do PB regularization with the padded array  */
-        NLM_GPU_kernel(Ap, Bp, Eucl_Vec, newsizeY, newsizeX, 0, numdims, SearchW, SimilW, SearchW_real, (float)h2, (float)lambda);
-        
-        switchpad_crop = 1; /*cropping*/
-        pad_crop(Bp, B, M, N, 0, newsizeY, newsizeX, 0, padXY, switchpad_crop);
-    }
-    else
-    {
-        /******************************3D case ****************************/
-        /*Handling output*/
-        B = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dims, mxSINGLE_CLASS, mxREAL));
-        /*allocating memory for the padded arrays */
-        Ap = (float*)mxGetPr(mxCreateNumericArray(3, N_dims, mxSINGLE_CLASS, mxREAL));
-        Bp = (float*)mxGetPr(mxCreateNumericArray(3, N_dims, mxSINGLE_CLASS, mxREAL));
-        Eucl_Vec = (float*)mxGetData(mxCreateNumericMatrix(SimilW_full*SimilW_full*SimilW_full, 1, mxSINGLE_CLASS, mxREAL));
-        
-        /*Gaussian kernel */
-        count = 0;
-        for(i_n=-SimilW; i_n<=SimilW; i_n++) {
-            for(j_n=-SimilW; j_n<=SimilW; j_n++) {
-                for(k_n=-SimilW; k_n<=SimilW; k_n++) {
-                    val = (float)(i_n*i_n + j_n*j_n + k_n*k_n)/(2*SimilW*SimilW*SimilW);
-                    Eucl_Vec[count] = exp(-val);
-                    count = count + 1;
-                }}} /*main neighb loop */
-        /**************************************************************************/
-        /*Perform padding of image A to the size of [newsizeX * newsizeY * newsizeZ] */
-        switchpad_crop = 0; /*padding*/
-        pad_crop(A, Ap, M, N, Z, newsizeY, newsizeX, newsizeZ, padXY, switchpad_crop);
-        
-        /* Do PB regularization with the padded array  */
-        NLM_GPU_kernel(Ap, Bp, Eucl_Vec, newsizeY, newsizeX, newsizeZ, numdims, SearchW, SimilW, SearchW_real, (float)h2, (float)lambda);
-        
-        switchpad_crop = 1; /*cropping*/
-        pad_crop(Bp, B, M, N, Z, newsizeY, newsizeX, newsizeZ, padXY, switchpad_crop);
-    } /*end else ndims*/
-}
-
-float pad_crop(float *A, float *Ap, int OldSizeX, int OldSizeY, int OldSizeZ, int NewSizeX, int NewSizeY, int NewSizeZ, int padXY, int switchpad_crop)
-{
-    /* padding-cropping function */
-    int i,j,k;    
-    if (NewSizeZ > 1) {    
-           for (i=0; i < NewSizeX; i++) {
-            for (j=0; j < NewSizeY; j++) {
-              for (k=0; k < NewSizeZ; k++) {
-                if (((i >= padXY) && (i < NewSizeX-padXY)) &&  ((j >= padXY) && (j < NewSizeY-padXY)) &&  ((k >= padXY) && (k < NewSizeZ-padXY))) {
-                    if (switchpad_crop == 0)  Ap[NewSizeX*NewSizeY*k + i*NewSizeY+j] = A[OldSizeX*OldSizeY*(k - padXY) + (i-padXY)*(OldSizeY)+(j-padXY)];
-                    else  Ap[OldSizeX*OldSizeY*(k - padXY) + (i-padXY)*(OldSizeY)+(j-padXY)] = A[NewSizeX*NewSizeY*k + i*NewSizeY+j];
-                }
-            }}}   
-    }
-    else {
-        for (i=0; i < NewSizeX; i++) {
-            for (j=0; j < NewSizeY; j++) {
-                if (((i >= padXY) && (i < NewSizeX-padXY)) &&  ((j >= padXY) && (j < NewSizeY-padXY))) {
-                    if (switchpad_crop == 0)  Ap[i*NewSizeY+j] = A[(i-padXY)*(OldSizeY)+(j-padXY)];
-                    else  Ap[(i-padXY)*(OldSizeY)+(j-padXY)] = A[i*NewSizeY+j];
-                }
-            }}
-    }
-    return *Ap;
-}
\ No newline at end of file
diff --git a/Wrappers/Matlab/mex_compile/regularizers_GPU/ROF_TV_GPU.cpp b/Wrappers/Matlab/mex_compile/regularizers_GPU/ROF_TV_GPU.cpp
new file mode 100644
index 0000000..7bbe3af
--- /dev/null
+++ b/Wrappers/Matlab/mex_compile/regularizers_GPU/ROF_TV_GPU.cpp
@@ -0,0 +1,73 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "matrix.h"
+#include "mex.h"
+#include "TV_ROF_GPU_core.h"
+
+/* ROF-TV denoising/regularization model [1] (2D/3D case)
+ * (MEX wrapper for MATLAB)
+ * 
+ * Input Parameters:
+ * 1. Noisy image/volume [REQUIRED]
+ * 2. lambda - regularization parameter [REQUIRED]
+ * 3. Number of iterations, for explicit scheme >= 150 is recommended  [REQUIRED]
+ * 4. tau - marching step for explicit scheme, ~1 is recommended [REQUIRED]
+ *
+ * Output:
+ * [1] Regularized image/volume 
+ *
+ * This function is based on the paper by
+ * [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms"
+ *
+ * D. Kazantsev, 2016-18
+ */
+
+void mexFunction(
+        int nlhs, mxArray *plhs[],
+        int nrhs, const mxArray *prhs[])
+        
+{
+    int number_of_dims, iter_numb, dimX, dimY, dimZ;
+    const int  *dim_array;
+    float *Input, *Output=NULL, lambda, tau;
+    
+    dim_array = mxGetDimensions(prhs[0]);
+    number_of_dims = mxGetNumberOfDimensions(prhs[0]);
+    
+    /*Handling Matlab input data*/
+    Input  = (float *) mxGetData(prhs[0]);
+    lambda =  (float) mxGetScalar(prhs[1]); /* regularization parameter */
+    iter_numb =  (int) mxGetScalar(prhs[2]); /* iterations number */
+    tau =  (float) mxGetScalar(prhs[3]); /* marching step parameter */  
+    
+    if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); }
+    if(nrhs != 4) mexErrMsgTxt("Four inputs reqired: Image(2D,3D), regularization parameter, iterations number,  marching step constant");
+    /*Handling Matlab output data*/
+    dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];
+    
+    /* output arrays*/
+    if (number_of_dims == 2) {
+        dimZ = 1; /*2D case*/
+        /* output image/volume */
+        Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));                        
+    }    
+    if (number_of_dims == 3) Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+    
+    TV_ROF_GPU_main(Input, Output, lambda, iter_numb, tau, dimX, dimY, dimZ);    
+}
\ No newline at end of file
-- 
cgit v1.2.3