From b9562bbfdf6ca2cc8775e242ae13cbfaeeb2479b Mon Sep 17 00:00:00 2001
From: Daniil Kazantsev
Date: Mon, 26 Feb 2018 11:16:25 +0000
Subject: FGP CPU parameters passing simplified, FGP GPU added

---
 Core/regularizers_CPU/FGP_TV_core.c             | 481 +++++++++++---------
 Core/regularizers_CPU/FGP_TV_core.h             |  51 +--
 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu | 561 ++++++++++++++++++++++++
 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h  |  10 +
 4 files changed, 867 insertions(+), 236 deletions(-)
 create mode 100755 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu
 create mode 100755 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h

diff --git a/Core/regularizers_CPU/FGP_TV_core.c b/Core/regularizers_CPU/FGP_TV_core.c
index 03cd445..304848d 100644
--- a/Core/regularizers_CPU/FGP_TV_core.c
+++ b/Core/regularizers_CPU/FGP_TV_core.c
@@ -22,245 +22,314 @@ limitations under the License.
 /* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
  *
  * Input Parameters:
- * 1. Noisy image/volume [REQUIRED]
- * 2. lambda - regularization parameter [REQUIRED]
- * 3. Number of iterations [OPTIONAL parameter]
- * 4. eplsilon: tolerance constant [OPTIONAL parameter]
- * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter]
+ * 1. Noisy image/volume
+ * 2. lambda - regularization parameter
+ * 3. Number of iterations
+ * 4. epsilon: tolerance constant
+ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
+ * 6. nonneg: nonnegativity constraint (0 is OFF by default)
+ * 7. print information: 0 (off) or 1 (on)
  *
  * Output:
  * [1] Filtered/regularized image
- * [2] last function value
- *
- * Example of image denoising:
- * figure;
- * Im = double(imread('lena_gray_256.tif'))/255; % loading image
- * u0 = Im + .05*randn(size(Im)); % adding noise
- * u = FGP_TV(single(u0), 0.05, 100, 1e-04);
  *
  * This function is based on the Matlab code and paper by
  * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
- *
- * D. Kazantsev, 2016-17
  */
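The model being minimized, for noisy input b and regularization weight lambda, is the standard formulation from reference [1]; the discrete-gradient notation below is supplied here for orientation and is not taken from the patch:

    \min_{u}\; \tfrac{1}{2}\|u-b\|_2^2 + \lambda\,\mathrm{TV}(u), \qquad
    \mathrm{TV}_{\mathrm{iso}}(u) = \sum_{i,j} \sqrt{(\nabla_x u)_{i,j}^2 + (\nabla_y u)_{i,j}^2}, \qquad
    \mathrm{TV}_{\ell_1}(u) = \sum_{i,j} \bigl( |(\nabla_x u)_{i,j}| + |(\nabla_y u)_{i,j}| \bigr)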
-
-/* 2D-case related Functions */
-/*****************************************************************/
-float Obj_func_CALC2D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY)
-{
-    int i,j;
-    float f1, f2, val1, val2;
-
-    /*data-related term */
-    f1 = 0.0f;
-    for(i=0; i<dimX; i++) {
+float FGP_TV_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+{
+    int ll, j, DimTotal;
+    float re, re1;
+    float tk = 1.0f;
+    float tkp1 = 1.0f;
+    int count = 0;
+
+    if (dimZ <= 1) {
+        /*2D case */
+        float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
+        DimTotal = dimX*dimY;
+
+        Output_prev = (float *) malloc( DimTotal * sizeof(float) );
+        P1 = (float *) malloc( DimTotal * sizeof(float) );
+        P2 = (float *) malloc( DimTotal * sizeof(float) );
+        P1_prev = (float *) malloc( DimTotal * sizeof(float) );
+        P2_prev = (float *) malloc( DimTotal * sizeof(float) );
+        R1 = (float *) malloc( DimTotal * sizeof(float) );
+        R2 = (float *) malloc( DimTotal * sizeof(float) );
+
+        /* begin iterations */
+        for(ll=0; ll<iter; ll++) {
+
+            /* computing the gradient of the objective function */
+            Obj_func2D(Input, Output, R1, R2, lambda, dimX, dimY);
+
+            /* apply nonnegativity */
+            if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;}
+
+            /* taking a step towards minus of the gradient */
+            Grad_func2D(P1, P2, Output, R1, R2, lambda, dimX, dimY);
+
+            /* projection step */
+            Proj_func2D(P1, P2, methodTV, dimX, dimY);
+
+            /* updating R and t variables */
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            Rupd_func2D(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, dimX, dimY);
+
+            /* check early stopping criteria */
+            re = 0.0f; re1 = 0.0f;
+            for(j=0; j<DimTotal; j++) {
+                re += pow(Output[j] - Output_prev[j], 2);
+                re1 += pow(Output[j], 2);
+            }
+            re = sqrt(re)/sqrt(re1);
+            if (re < epsil) count++;
+            if (count > 4) break;
+
+            /*storing old values*/
+            copyIm(Output, Output_prev, dimX, dimY, 1);
+            copyIm(P1, P1_prev, dimX, dimY, 1);
+            copyIm(P2, P2_prev, dimX, dimY, 1);
+            tk = tkp1;
+        }
+        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", ll);
+        free(Output_prev); free(P1); free(P2); free(P1_prev); free(P2_prev); free(R1); free(R2);
+    }
+    else {
+        /*3D case*/
+        float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL;
+        DimTotal = dimX*dimY*dimZ;
+
+        Output_prev = (float *) malloc( DimTotal * sizeof(float) );
+        P1 = (float *) malloc( DimTotal * sizeof(float) );
+        P2 = (float *) malloc( DimTotal * sizeof(float) );
+        P3 = (float *) malloc( DimTotal * sizeof(float) );
+        P1_prev = (float *) malloc( DimTotal * sizeof(float) );
+        P2_prev = (float *) malloc( DimTotal * sizeof(float) );
+        P3_prev = (float *) malloc( DimTotal * sizeof(float) );
+        R1 = (float *) malloc( DimTotal * sizeof(float) );
+        R2 = (float *) malloc( DimTotal * sizeof(float) );
+        R3 = (float *) malloc( DimTotal * sizeof(float) );
+
+        /* begin iterations */
+        for(ll=0; ll<iter; ll++) {
+
+            /* computing the gradient of the objective function */
+            Obj_func3D(Input, Output, R1, R2, R3, lambda, dimX, dimY, dimZ);
+
+            /* apply nonnegativity */
+            if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;}
+
+            /* taking a step towards minus of the gradient */
+            Grad_func3D(P1, P2, P3, Output, R1, R2, R3, lambda, dimX, dimY, dimZ);
+
+            /* projection step */
+            Proj_func3D(P1, P2, P3, methodTV, dimX, dimY, dimZ);
+
+            /* updating R and t variables */
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            Rupd_func3D(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, dimX, dimY, dimZ);
+
+            /* check early stopping criteria */
+            re = 0.0f; re1 = 0.0f;
+            for(j=0; j<DimTotal; j++) {
+                re += pow(Output[j] - Output_prev[j], 2);
+                re1 += pow(Output[j], 2);
+            }
+            re = sqrt(re)/sqrt(re1);
+            if (re < epsil) count++;
+            if (count > 4) break;
+
+            /*storing old values*/
+            copyIm(Output, Output_prev, dimX, dimY, dimZ);
+            copyIm(P1, P1_prev, dimX, dimY, dimZ);
+            copyIm(P2, P2_prev, dimX, dimY, dimZ);
+            copyIm(P3, P3_prev, dimX, dimY, dimZ);
+            tk = tkp1;
+        }
+        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", ll);
+        free(Output_prev); free(P1); free(P2); free(P3); free(P1_prev); free(P2_prev); free(P3_prev); free(R1); free(R2); free(R3);
+    }
+    return *Output;
+}
 float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dimX, int dimY)
 {
-    float val1, val2;
-    int i, j;
-#pragma omp parallel for shared(A,D,R1,R2) private(i,j,val1,val2)
-    for (i = 0; i < dimX; i++) {
+    float val1, val2;
+    int i, j, index;
+#pragma omp parallel for shared(A,D,R1,R2) private(index,i,j,val1,val2)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+            index = j*dimX+i;
+            /* boundary conditions */
+            if (i == 0) {val1 = 0.0f;} else {val1 = R1[j*dimX + (i-1)];}
+            if (j == 0) {val2 = 0.0f;} else {val2 = R2[(j-1)*dimX + i];}
+            D[index] = A[index] - lambda*(R1[index] + R2[index] - val1 - val2);
+        }}
+    return *D;
+}
+float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, int dimX, int dimY)
+{
+    float val1, val2, multip;
+    int i, j, index;
+    multip = (1.0f/(8.0f*lambda));
+#pragma omp parallel for shared(P1,P2,D,R1,R2,multip) private(index,i,j,val1,val2)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+            index = j*dimX+i;
+            /* boundary conditions */
+            if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[j*dimX + (i+1)];
+            if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(j+1)*dimX + i];
+            P1[index] = R1[index] + multip*val1;
+            P2[index] = R2[index] + multip*val2;
+        }}
+    return 1;
+}
 float Proj_func2D(float *P1, float *P2, int methTV, int dimX, int dimY)
 {
-                if (denom > 1) {
-                    P1[(i)*dimY + (j)] = P1[(i)*dimY + (j)] / sqrt(denom);
-                    P2[(i)*dimY + (j)] = P2[(i)*dimY + (j)] / sqrt(denom);
-                }
-            }
-        }
-    }
-    else {
-        /* anisotropic TV*/
-#pragma omp parallel for shared(P1,P2) private(i,j,val1,val2)
-        for (i = 0; i < dimX; i++) {
+    float val1, val2, denom, sq_denom;
+    int i, j, index;
+    if (methTV == 0) {
+        /* isotropic TV*/
+#pragma omp parallel for shared(P1,P2) private(index,i,j,denom,sq_denom)
+        for(i=0; i<dimX; i++) {
+            for(j=0; j<dimY; j++) {
+                index = j*dimX+i;
+                denom = pow(P1[index],2) + pow(P2[index],2);
+                if (denom > 1.0f) {
+                    sq_denom = 1.0f/sqrt(denom);
+                    P1[index] = P1[index]*sq_denom;
+                    P2[index] = P2[index]*sq_denom;
+                }
+            }}
+    }
+    else {
+        /* anisotropic TV*/
+#pragma omp parallel for shared(P1,P2) private(index,i,j,val1,val2)
+        for(i=0; i<dimX; i++) {
+            for(j=0; j<dimY; j++) {
+                index = j*dimX+i;
+                val1 = fabs(P1[index]);
+                val2 = fabs(P2[index]);
+                if (val1 < 1.0f) {val1 = 1.0f;}
+                if (val2 < 1.0f) {val2 = 1.0f;}
+                P1[index] = P1[index]/val1;
+                P2[index] = P2[index]/val2;
+            }}
+    }
+    return 1;
+}
+float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, int dimX, int dimY)
+{
+    float multip;
+    int i, j, index;
+    multip = ((tk-1.0f)/tkp1);
+#pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(index,i,j)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+            index = j*dimX+i;
+            R1[index] = P1[index] + multip*(P1[index] - P1_old[index]);
+            R2[index] = P2[index] + multip*(P2[index] - P2_old[index]);
+        }}
+    return 1;
+}
+/* 3D-case related Functions */
+/*****************************************************************/
+float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ)
+{
+    float val1, val2, val3;
+    int i, j, k, index;
+#pragma omp parallel for shared(A,D,R1,R2,R3) private(index,i,j,k,val1,val2,val3)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+            for(k=0; k<dimZ; k++) {
+                index = (dimX*dimY)*k + j*dimX+i;
+                /* boundary conditions */
+                if (i == 0) {val1 = 0.0f;} else {val1 = R1[(dimX*dimY)*k + j*dimX + (i-1)];}
+                if (j == 0) {val2 = 0.0f;} else {val2 = R2[(dimX*dimY)*k + (j-1)*dimX + i];}
+                if (k == 0) {val3 = 0.0f;} else {val3 = R3[(dimX*dimY)*(k-1) + j*dimX + i];}
+                D[index] = A[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3);
+            }}}
+    return *D;
+}
+float Grad_func3D(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ)
+{
+    float val1, val2, val3, multip;
+    int i, j, k, index;
+    multip = (1.0f/(8.0f*lambda));
+#pragma omp parallel for shared(P1,P2,P3,D,R1,R2,R3,multip) private(index,i,j,k,val1,val2,val3)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+            for(k=0; k<dimZ; k++) {
+                index = (dimX*dimY)*k + j*dimX+i;
+                /* boundary conditions */
+                if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[(dimX*dimY)*k + j*dimX + (i+1)];
+                if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(dimX*dimY)*k + (j+1)*dimX + i];
+                if (k == dimZ-1) val3 = 0.0f; else val3 = D[index] - D[(dimX*dimY)*(k+1) + j*dimX + i];
+                P1[index] = R1[index] + multip*val1;
+                P2[index] = R2[index] + multip*val2;
+                P3[index] = R3[index] + multip*val3;
+            }}}
+    return 1;
+}
+float Proj_func3D(float *P1, float *P2, float *P3, int methTV, int dimX, int dimY, int dimZ)
+{
+    float val1, val2, val3, denom, sq_denom;
+    int i, j, k, index;
+    if (methTV == 0) {
+        /* isotropic TV*/
+#pragma omp parallel for shared(P1,P2,P3) private(index,i,j,k,denom,sq_denom)
+        for(i=0; i<dimX; i++) {
+            for(j=0; j<dimY; j++) {
+                for(k=0; k<dimZ; k++) {
+                    index = (dimX*dimY)*k + j*dimX+i;
+                    denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2);
+                    if (denom > 1.0f) {
+                        sq_denom = 1.0f/sqrt(denom);
+                        P1[index] = P1[index]*sq_denom;
+                        P2[index] = P2[index]*sq_denom;
+                        P3[index] = P3[index]*sq_denom;
+                    }
+                }}}
+    }
+    else {
+        /* anisotropic TV*/
+#pragma omp parallel for shared(P1,P2,P3) private(index,i,j,k,val1,val2,val3)
+        for(i=0; i<dimX; i++) {
+            for(j=0; j<dimY; j++) {
+                for(k=0; k<dimZ; k++) {
+                    index = (dimX*dimY)*k + j*dimX+i;
+                    val1 = fabs(P1[index]);
+                    val2 = fabs(P2[index]);
+                    val3 = fabs(P3[index]);
+                    if (val1 < 1.0f) {val1 = 1.0f;}
+                    if (val2 < 1.0f) {val2 = 1.0f;}
+                    if (val3 < 1.0f) {val3 = 1.0f;}
+                    P1[index] = P1[index]/val1;
+                    P2[index] = P2[index]/val2;
+                    P3[index] = P3[index]/val3;
+                }}}
+    }
+    return 1;
+}
+float Rupd_func3D(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, int dimX, int dimY, int dimZ)
+{
+    float multip;
+    int i, j, k, index;
+    multip = ((tk-1.0f)/tkp1);
+#pragma omp parallel for shared(P1,P2,P3,P1_old,P2_old,P3_old,R1,R2,R3,multip) private(index,i,j,k)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+            for(k=0; k<dimZ; k++) {
+                index = (dimX*dimY)*k + j*dimX+i;
+                R1[index] = P1[index] + multip*(P1[index] - P1_old[index]);
+                R2[index] = P2[index] + multip*(P2[index] - P2_old[index]);
+                R3[index] = P3[index] + multip*(P3[index] - P3_old[index]);
+            }}}
+    return 1;
+}
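The t and R updates in both branches above (and in the GPU host code below) are the FISTA extrapolation step of Beck and Teboulle, with the gradient step multip = 1/(8*lambda) applied before the projection. Note the CPU entry-point name FGP_TV_CPU used above is an assumption mirroring the FGP_TV_GPU declaration at the end of this patch. In formulas:

    t_{k+1} = \frac{1 + \sqrt{1 + 4t_k^2}}{2},
    \qquad
    R_{k+1} = P_k + \frac{t_k - 1}{t_{k+1}}\,\bigl(P_k - P_{k-1}\bigr)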
diff --git a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu
new file mode 100755
--- /dev/null
+++ b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu
@@ -0,0 +1,561 @@
+#include <thrust/device_vector.h>
+#include <thrust/transform_reduce.h>
+
+/* CUDA implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume
+ * 2. lambda - regularization parameter
+ * 3. Number of iterations
+ * 4. epsilon: tolerance constant
+ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
+ * 6. nonneg: nonnegativity constraint (0 is OFF by default)
+ * 7. print information: 0 (off) or 1 (on)
+ *
+ * Output:
+ * [1] Filtered/regularized image
+ *
+ * This function is based on the Matlab code and paper by
+ * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
+ */
+
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
+
+inline void __checkCudaErrors(cudaError err, const char *file, const int line)
+{
+    if (cudaSuccess != err)
+    {
+        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
+                file, line, (int)err, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define BLKXSIZE2D 16
+#define BLKYSIZE2D 16
+
+#define BLKXSIZE 8
+#define BLKYSIZE 8
+#define BLKZSIZE 8
+
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+struct square { __host__ __device__ float operator()(float x) { return x * x; } };
+
+/************************************************/
+/*****************2D modules*********************/
+/************************************************/
+__global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda)
+{
+    float val1,val2;
+
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+
+    int index = xIndex + N*yIndex;
+
+    if ((xIndex < N) && (yIndex < M)) {
+        if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];}
+        if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];}
+        //Write final result to global memory
+        D[index] = Ad[index] - lambda*(R1[index] + R2[index] - val1 - val2);
+    }
+    return;
+}
+
+__global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, int N, int M, int ImSize, float multip)
+{
+    float val1,val2;
+
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+
+    int index = xIndex + N*yIndex;
+
+    if ((xIndex < N) && (yIndex < M)) {
+
+        /* boundary conditions */
+        if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex];
+        if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)];
+
+        //Write final result to global memory
+        P1[index] = R1[index] + multip*val1;
+        P2[index] = R2[index] + multip*val2;
+    }
+    return;
+}
+
+__global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize)
+{
+    float denom;
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+
+    int index = xIndex + N*yIndex;
+
+    if ((xIndex < N) && (yIndex < M)) {
+        denom = pow(P1[index],2) + pow(P2[index],2);
+        if (denom > 1.0f) {
+            P1[index] = P1[index]/sqrt(denom);
+            P2[index] = P2[index]/sqrt(denom);
+        }
+    }
+    return;
+}
+__global__ void Proj_func2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize)
+{
+    float val1, val2;
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+
+    int index = xIndex + N*yIndex;
+
+    if ((xIndex < N) && (yIndex < M)) {
+        val1 = abs(P1[index]);
+        val2 = abs(P2[index]);
+        if (val1 < 1.0f) {val1 = 1.0f;}
+        if (val2 < 1.0f) {val2 = 1.0f;}
+        P1[index] = P1[index]/val1;
+        P2[index] = P2[index]/val2;
+    }
+    return;
+}
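/* A compact statement of what the two projection kernels above compute
   (notation supplied here, not taken from the patch): pointwise,

       iso:    (P1,P2) <- (P1,P2) / max(1, sqrt(P1^2 + P2^2))
       aniso:   Pi     <-  Pi     / max(1, |Pi|),   i = 1,2

   i.e. projection of the dual variables onto the unit Euclidean ball
   (isotropic TV) or the unit box (anisotropic/l1 TV). */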
+__global__ void Rupd_func2D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, float multip2, int N, int M, int ImSize)
+{
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+
+    int index = xIndex + N*yIndex;
+
+    if ((xIndex < N) && (yIndex < M)) {
+        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
+        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
+    }
+    return;
+}
+__global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+    int index = xIndex + N*yIndex;
+
+    if (index < num_total) {
+        if (Output[index] < 0.0f) Output[index] = 0.0f;
+    }
+}
+__global__ void copy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+    int index = xIndex + N*yIndex;
+
+    if (index < num_total) {
+        Output[index] = Input[index];
+    }
+}
+__global__ void ResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+    int index = xIndex + N*yIndex;
+
+    if (index < num_total) {
+        Output[index] = Input1[index] - Input2[index];
+    }
+}
+/************************************************/
+/*****************3D modules*********************/
+/************************************************/
+__global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda)
+{
+    float val1,val2,val3;
+
+    //calculate each thread global index
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+    int index = (N*M)*k + i + N*j;
+
+    if ((i < N) && (j < M) && (k < Z)) {
+        if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];}
+        if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];}
+        if (k <= 0) {val3 = 0.0f;} else {val3 = R3[(N*M)*(k-1) + i + N*j];}
+        //Write final result to global memory
+        D[index] = Ad[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3);
+    }
+    return;
+}
+
+__global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float multip)
+{
+    float val1,val2,val3;
+
+    //calculate each thread global index
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+    int index = (N*M)*k + i + N*j;
+
+    if ((i < N) && (j < M) && (k < Z)) {
+        /* boundary conditions */
+        if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j];
+        if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)];
+        if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j];
+
+        //Write final result to global memory
+        P1[index] = R1[index] + multip*val1;
+        P2[index] = R2[index] + multip*val2;
+        P3[index] = R3[index] + multip*val3;
+    }
+    return;
+}
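/* The 3D kernels flatten (i,j,k) with x fastest, then y, then z; an
   equivalent one-line helper for the in-kernel arithmetic (illustrative
   only, with the assumed name idx3d; not a line of the patch): */
static inline int idx3d(int i, int j, int k, int N, int M)
{
    /* same linearization as index = (N*M)*k + i + N*j in the kernels above */
    return (N * M) * k + N * j + i;
}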
+__global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
+{
+    float denom,sq_denom;
+    //calculate each thread global index
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+    int index = (N*M)*k + i + N*j;
+
+    if ((i < N) && (j < M) && (k < Z)) {
+        denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2);
+
+        if (denom > 1.0f) {
+            sq_denom = 1.0f/sqrt(denom);
+            P1[index] = P1[index]*sq_denom;
+            P2[index] = P2[index]*sq_denom;
+            P3[index] = P3[index]*sq_denom;
+        }
+    }
+    return;
+}
+
+__global__ void Proj_func3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
+{
+    float val1, val2, val3;
+    //calculate each thread global index
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+    int index = (N*M)*k + i + N*j;
+
+    if ((i < N) && (j < M) && (k < Z)) {
+        val1 = abs(P1[index]);
+        val2 = abs(P2[index]);
+        val3 = abs(P3[index]);
+        if (val1 < 1.0f) {val1 = 1.0f;}
+        if (val2 < 1.0f) {val2 = 1.0f;}
+        if (val3 < 1.0f) {val3 = 1.0f;}
+        P1[index] = P1[index]/val1;
+        P2[index] = P2[index]/val2;
+        P3[index] = P3[index]/val3;
+    }
+    return;
+}
+
+__global__ void Rupd_func3D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, float multip2, int N, int M, int Z, int ImSize)
+{
+    //calculate each thread global index
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+    int index = (N*M)*k + i + N*j;
+
+    if ((i < N) && (j < M) && (k < Z)) {
+        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
+        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
+        R3[index] = P3[index] + multip2*(P3[index] - P3_old[index]);
+    }
+    return;
+}
+
+__global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_total)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+    int index = (N*M)*k + i + N*j;
+
+    if (index < num_total) {
+        if (Output[index] < 0.0f) Output[index] = 0.0f;
+    }
+}
+
+__global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+    int index = (N*M)*k + i + N*j;
+
+    if (index < num_total) {
+        Output[index] = Input[index];
+    }
+}
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
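/* A minimal host-side usage sketch for the entry point defined below
   (standalone illustration, not part of the patch; it assumes the
   FGP_TV_GPU_core.h header from the end of this patch is on the include
   path): */

#include <stdlib.h>
#include "FGP_TV_GPU_core.h"

int main(void)
{
    int dimX = 256, dimY = 256, dimZ = 1;  /* dimZ <= 1 selects the 2D path */
    float *u0 = (float *)calloc(dimX * dimY, sizeof(float)); /* noisy input */
    float *u  = (float *)calloc(dimX * dimY, sizeof(float)); /* denoised output */
    /* lambda = 0.05, 100 iterations, tolerance 1e-4, isotropic TV (0),
       nonnegativity off (0), iteration printing on (1) */
    FGP_TV_GPU(u0, u, 0.05f, 100, 1e-4f, 0, 0, 1, dimX, dimY, dimZ);
    free(u0); free(u);
    return 0;
}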
+
+////////////MAIN HOST FUNCTION ///////////////
+extern "C" void FGP_TV_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+{
+    int deviceCount = -1; // number of devices
+    cudaGetDeviceCount(&deviceCount);
+    if (deviceCount == 0) {
+        fprintf(stderr, "No CUDA devices found\n");
+        return;
+    }
+
+    int count = 0, i;
+    float re, multip, multip2;
+    float tk = 1.0f;
+    float tkp1 = 1.0f;
+
+    if (dimZ <= 1) {
+        /*2D version*/
+        int ImSize = dimX*dimY;
+        float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
+
+        dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+        dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
+
+        /*allocate space for images on device*/
+        checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
+        if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
+
+        checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+        cudaMemset(P1, 0, ImSize*sizeof(float));
+        cudaMemset(P2, 0, ImSize*sizeof(float));
+        cudaMemset(P1_prev, 0, ImSize*sizeof(float));
+        cudaMemset(P2_prev, 0, ImSize*sizeof(float));
+        cudaMemset(R1, 0, ImSize*sizeof(float));
+        cudaMemset(R2, 0, ImSize*sizeof(float));
+
+        /********************** Run CUDA 2D kernel here ********************/
+        multip = (1.0f/(8.0f*lambda));
+
+        /* The main kernel */
+        for (i = 0; i < iter; i++) {
+
+            /* computing the gradient of the objective function */
+            Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambda);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            if (nonneg != 0) {
+                nonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors(cudaPeekAtLastError() );
+            }
+
+            /*Taking a step towards minus of the gradient*/
+            Grad_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, d_update, R1, R2, dimX, dimY, ImSize, multip);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            /* projection step */
+            if (methodTV == 0) Proj_func2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/
+            else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            multip2 = ((tk-1.0f)/tkp1);
+
+            Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            if (epsil != 0.0f) {
+                /* calculate norm - stopping rules using the Thrust library */
+                ResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors(cudaPeekAtLastError() );
+
+                thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize);
+                float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
+                thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
+                float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+
+                re = (reduction/reduction2);
+                if (re < epsil) count++;
+                if (count > 4) break;
+
+                copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors(cudaPeekAtLastError() );
+            }
+
+            copy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            copy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            tk = tkp1;
+        }
+        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);
+        /***************************************************************/
+        //copy result matrix from device to host memory
+        cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
+
+        cudaFree(d_input);
+        cudaFree(d_update);
+        if (epsil != 0.0f) cudaFree(d_update_prev);
+        cudaFree(P1);
+        cudaFree(P2);
+        cudaFree(P1_prev);
+        cudaFree(P2_prev);
+        cudaFree(R1);
+        cudaFree(R2);
+    }
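/* The Thrust reduction in the epsil block above computes the relative
   change used as the stopping rule; in formulas (notation supplied here):

       re_k = ||u_k - u_{k-1}||_2 / ||u_k||_2

   and the loop breaks once re_k < epsil has occurred more than four
   times. */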
+    else {
+        /*3D version*/
+        int ImSize = dimX*dimY*dimZ;
+        float *d_input, *d_update=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL;
+
+        dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
+        dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE), idivup(dimZ,BLKZSIZE));
+
+        /*allocate space for images on device*/
+        checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&P3_prev,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
+        checkCudaErrors( cudaMalloc((void**)&R3,ImSize*sizeof(float)) );
+
+        checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+        cudaMemset(P1, 0, ImSize*sizeof(float));
+        cudaMemset(P2, 0, ImSize*sizeof(float));
+        cudaMemset(P3, 0, ImSize*sizeof(float));
+        cudaMemset(P1_prev, 0, ImSize*sizeof(float));
+        cudaMemset(P2_prev, 0, ImSize*sizeof(float));
+        cudaMemset(P3_prev, 0, ImSize*sizeof(float));
+        cudaMemset(R1, 0, ImSize*sizeof(float));
+        cudaMemset(R2, 0, ImSize*sizeof(float));
+        cudaMemset(R3, 0, ImSize*sizeof(float));
+        /********************** Run CUDA 3D kernel here ********************/
+        multip = (1.0f/(8.0f*lambda));
+
+        /* The main kernel */
+        for (i = 0; i < iter; i++) {
+
+            /* computing the gradient of the objective function */
+            Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambda);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            if (nonneg != 0) {
+                nonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors(cudaPeekAtLastError() );
+            }
+
+            /*Taking a step towards minus of the gradient*/
+            Grad_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, multip);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            /* projection step */
+            if (methodTV == 0) Proj_func3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */
+            else Proj_func3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            multip2 = ((tk-1.0f)/tkp1);
+
+            Rupd_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+
+            tk = tkp1;
+        }
+        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);
+        /***************************************************************/
+        //copy result matrix from device to host memory
+        cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
+
+        cudaFree(d_input);
+        cudaFree(d_update);
+        cudaFree(P1);
+        cudaFree(P2);
+        cudaFree(P3);
+        cudaFree(P1_prev);
+        cudaFree(P2_prev);
+        cudaFree(P3_prev);
+        cudaFree(R1);
+        cudaFree(R2);
+        cudaFree(R3);
+    }
+    cudaDeviceReset();
+}
diff --git a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h
new file mode 100755
index 0000000..a5d3f73
--- /dev/null
+++ b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h
@@ -0,0 +1,10 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+#ifndef _FGP_TV_GPU_
+#define _FGP_TV_GPU_
+
+extern "C" void FGP_TV_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
+
+#endif
-- 
cgit v1.2.3
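As a worked check of the launch geometry used by both host branches above: idivup rounds the grid size up so that grid x block covers the whole image, and the in-kernel bounds tests (e.g. (xIndex < N)) mask the overshoot. A small illustration, not part of the patch, reusing the idivup macro verbatim:

    #include <stdio.h>
    #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )

    int main(void)
    {
        int dimX = 300, BLK = 16;  /* image width not a multiple of the block */
        /* prints: blocks = 19, threads = 304 (image width 300) */
        printf("blocks = %d, threads = %d (image width %d)\n",
               idivup(dimX, BLK), idivup(dimX, BLK) * BLK, dimX);
        return 0;
    }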