From ed8b0d3f19045cd08b954220c0407d3d4c3b79db Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 13 Feb 2019 17:27:37 +0000
Subject: first add of TGV3D_CPU

---
 Core/regularisers_CPU/TGV_core.c | 179 +++++++++++++++++++++++++++++++++++----
 Core/regularisers_CPU/TGV_core.h |  19 +++--
 2 files changed, 178 insertions(+), 20 deletions(-)

diff --git a/Core/regularisers_CPU/TGV_core.c b/Core/regularisers_CPU/TGV_core.c
index 477e909..964c8a1 100644
--- a/Core/regularisers_CPU/TGV_core.c
+++ b/Core/regularisers_CPU/TGV_core.c
@@ -20,31 +20,35 @@ limitations under the License.
 #include "TGV_core.h"
 
 /* C-OMP implementation of Primal-Dual denoising method for 
- * Total Generilized Variation (TGV)-L2 model [1] (2D case only)
+ * Total Generilized Variation (TGV)-L2 model [1] (2D/3D case)
  *
  * Input Parameters:
- * 1. Noisy image (2D)
+ * 1. Noisy image/volume (2D/3D)
  * 2. lambda - regularisation parameter
  * 3. parameter to control the first-order term (alpha1)
  * 4. parameter to control the second-order term (alpha0)
  * 5. Number of Chambolle-Pock (Primal-Dual) iterations
  * 6. Lipshitz constant (default is 12)
- *
+ * 
  * Output:
- * Filtered/regulariaed image 
+ * Filtered/regularised image/volume
  *
  * References:
  * [1] K. Bredies "Total Generalized Variation"
+ * 
  */
  
-float TGV_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iter, float L2, int dimX, int dimY)
+float TGV_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iter, float L2, int dimX, int dimY, int dimZ)
 {
 	long DimTotal;
-    int ll;
+	int ll;
 	float *U_old, *P1, *P2, *Q1, *Q2, *Q3, *V1, *V1_old, *V2, *V2_old, tau, sigma;
-		
-		DimTotal = (long)(dimX*dimY);
-        
+
+	DimTotal = (long)(dimX*dimY*dimZ);
+	copyIm(U0, U, (long)(dimX), (long)(dimY), (long)(dimZ)); /* initialize */
+        tau = pow(L2,-0.5);
+        sigma = pow(L2,-0.5);
+
         /* dual variables */
         P1 = calloc(DimTotal, sizeof(float));
         P2 = calloc(DimTotal, sizeof(float));
@@ -59,12 +63,10 @@ float TGV_main(float *U0, float *U, float lambda, float alpha1, float alpha0, in
         V1_old = calloc(DimTotal, sizeof(float));
         V2 = calloc(DimTotal, sizeof(float));
         V2_old = calloc(DimTotal, sizeof(float));
-        
-        copyIm(U0, U, (long)(dimX), (long)(dimY), 1l); /* initialize  */
-       
-        tau = pow(L2,-0.5);
-        sigma = pow(L2,-0.5);
-    
+	
+	if (dimZ == 1) {
+	/*2D case*/
+	
         /* Primal-dual iterations begin here */
         for(ll = 0; ll < iter; ll++) {
             
@@ -100,6 +102,59 @@ float TGV_main(float *U0, float *U, float lambda, float alpha1, float alpha0, in
             newU(V1, V1_old, (long)(dimX), (long)(dimY));
             newU(V2, V2_old, (long)(dimX), (long)(dimY));
         } /*end of iterations*/
+        	}
+        else {
+        /*3D case*/
+        float *P3, *Q4, *Q5, *Q6, *V3, *V3_old;
+        
+        P3 = calloc(DimTotal, sizeof(float));
+        Q4 = calloc(DimTotal, sizeof(float));
+        Q5 = calloc(DimTotal, sizeof(float));
+        Q6 = calloc(DimTotal, sizeof(float));
+        V3 = calloc(DimTotal, sizeof(float));
+        V3_old = calloc(DimTotal, sizeof(float));
+        
+         /* Primal-dual iterations begin here */
+        for(ll = 0; ll < iter; ll++) {
+            
+            /* Calculate Dual Variable P */
+            DualP_3D(U, V1, V2, V3, P1, P2, P3, (long)(dimX), (long)(dimY), (long)(dimZ), sigma);
+            
+            /*Projection onto convex set for P*/
+            ProjP_3D(P1, P2, P3, (long)(dimX), (long)(dimY), alpha1);
+            
+            /* Calculate Dual Variable Q */
+            DualQ_3D(V1, V2, V3, Q1, Q2, Q3, Q4, Q5, Q6, (long)(dimX), (long)(dimY), (long)(dimZ), sigma);
+            
+            /*Projection onto convex set for Q*/
+            ProjQ_3D(Q1, Q2, Q3, Q4, Q5, Q6, (long)(dimX), (long)(dimY), (long)(dimZ), alpha0);
+            
+            /*saving U into U_old*/
+            copyIm(U, U_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            
+            /*adjoint operation  -> divergence and projection of P*/
+            DivProjP_3D(U, U0, P1, P2, P3, (long)(dimX), (long)(dimY), (long)(dimZ), lambda, tau);
+            
+            /*get updated solution U*/
+            newU3D(U, U_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            
+            /*saving V into V_old*/
+            copyIm(V1, V1_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            copyIm(V2, V2_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            copyIm(V3, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            
+            /* upd V*/
+            UpdV_3D(V1, V2, V3, P1, P2, P3, Q1, Q2, Q3, Q4, Q5, Q6, (long)(dimX), (long)(dimY), (long)(dimZ), tau);
+            
+            /*get new V*/
+            newU3D(V1, V1_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            newU3D(V2, V2_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            newU3D(V3, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            
+	        } /*end of iterations*/
+        free(P3);free(Q4);free(Q5);free(Q6);free(V3);free(V3_old);
+        }     
+
     /*freeing*/
     free(P1);free(P2);free(Q1);free(Q2);free(Q3);free(U_old);
     free(V1);free(V2);free(V1_old);free(V2_old);
@@ -253,3 +308,97 @@ float UpdV_2D(float *V1, float *V2, float *P1, float *P2, float *Q1, float *Q2,
         }}
     return 1;
 }
+
+
+/********************************************************************/
+/***************************3D Functions*****************************/
+/********************************************************************/
+/*Calculating dual variable P (using forward differences)*/
+float DualP_3D(float *U, float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float sigma)
+{
+    long i,j,k, index;
+#pragma omp parallel for shared(U,V1,V2,V3,P1,P2,P3) private(i,j,k,index)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+          for(k=0; k<dimZ; k++) {
+             	   
+    	   index = (dimX*dimY)*k + j*dimX+i;
+    	   
+            /* symmetric boundary conditions (Neuman) */
+            if (i == dimX-1) P1[index] += sigma*((U[(dimX*dimY)*k + j*dimX+(i-1)] - U[index]) - V1[index]); 
+            else P1[index] += sigma*((U[(dimX*dimY)*k + j*dimX+(i+1)] - U[index])  - V1[index]); 
+            if (j == dimY-1) P2[index] += sigma*((U[(dimX*dimY)*k + (j-1)*dimX+i] - U[index])  - V2[index]);
+            else  P2[index] += sigma*((U[(dimX*dimY)*k + (j+1)*dimX+i] - U[index])  - V2[index]);
+            if (j == dimZ-1) P3[index] += sigma*((U[(dimX*dimY)*(k-1) + j*dimX+i] - U[index])  - V3[index]);
+            else  P3[index] += sigma*((U[(dimX*dimY)*(k+1) + j*dimX+i] - U[index])  - V3[index]);
+        }}}
+    return 1;
+}
+/*Projection onto convex set for P*/
+float ProjP_3D(float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float alpha1)
+{
+    float grad_magn;
+    long i,j,k,index;
+#pragma omp parallel for shared(P1,P2,P3) private(i,j,k,index,grad_magn)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+	  for(k=0; k<dimZ; k++) {   	
+   	    index = (dimX*dimY)*k + j*dimX+i;
+            grad_magn = sqrt(pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2));
+            grad_magn = grad_magn/alpha1;
+            if (grad_magn > 1.0f) {
+                P1[index] /= grad_magn;
+                P2[index] /= grad_magn;
+                P3[index] /= grad_magn;
+            }
+        }}}
+    return 1;
+}
+/*Calculating dual variable Q (using forward differences)*/
+float DualQ_3D(float *V1, float *V2, float *V3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float sigma)
+{
+    long i,j,k, index;
+    float q1, q2, q3, q4, q5, q6, q11, q22, q33, q44, q55, q66;
+#pragma omp parallel for shared(Q1,Q2,Q3,Q4,Q5,Q6,V1,V2,V3) private(i,j,k,index,q1,q2,q3,q4,q5,q6,q11,q22,q33,q44,q55,q66)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+       	  for(k=0; k<dimZ; k++) {   	
+	    index = (dimX*dimY)*k + j*dimX+i;
+            /* symmetric boundary conditions (Neuman) */
+            if (i == dimX-1){ 
+              q1 = V1[(dimX*dimY)*k + j*dimX+(i-1)] - V1[index];
+              q11 = V2[(dimX*dimY)*k + j*dimX+(i-1)] - V2[index];
+              q33 = V3[(dimX*dimY)*k + j*dimX+(i-1)] - V3[index];
+            }
+            else {
+                q1 = (V1[(dimX*dimY)*k + j*dimX+(i+1)] - V1[index]);
+                q11 = (V2[(dimX*dimY)*k + j*dimX+(i+1)] - V2[index]);
+                q33 = V3[(dimX*dimY)*k + j*dimX+(i+1)] - V3[index];
+            }
+            if (j == dimY-1) {
+                q2 = (V2[(dimX*dimY)*k + (j-1)*dimX+i] - V2[index]);
+                q22 = (V1[(dimX*dimY)*k + (j-1)*dimX+i] - V1[index]);
+            }
+            else {
+                q2 = V2[(dimX*dimY)*k + (j+1)*dimX+i] - V2[index];
+                q22 = V1[(dimX*dimY)*k + (j+1)*dimX+i] - V1[index];
+            }
+            if (k == dimZ-1) {
+                q3 = V3[(dimX*dimY)*(k-1) + j*dimX+i] - V3[index];
+                q44 = V1[(dimX*dimY)*(k-1) + j*dimX+i] - V1[index];
+            }
+            else {
+                q3 = V3[(dimX*dimY)*(k+1) + j*dimX+i] - V3[index];
+                q44 = V1[(dimX*dimY)*(k+1) + j*dimX+i] - V1[index];
+            }
+            
+            Q1[index] += sigma*(q1);
+            Q2[index] += sigma*(q2);
+            Q3[index] += sigma*(q3);
+            Q4[index] += sigma*(0.5f*(q11 + q22));
+            Q5[index] += sigma*(0.5f*(q33 + q44));
+        }}}
+    return 1;
+}
+
+
diff --git a/Core/regularisers_CPU/TGV_core.h b/Core/regularisers_CPU/TGV_core.h
index 29482ad..8bd8cf6 100644
--- a/Core/regularisers_CPU/TGV_core.h
+++ b/Core/regularisers_CPU/TGV_core.h
@@ -26,10 +26,10 @@ limitations under the License.
 #include "CCPiDefines.h"
 
 /* C-OMP implementation of Primal-Dual denoising method for 
- * Total Generilized Variation (TGV)-L2 model [1] (2D case only)
+ * Total Generilized Variation (TGV)-L2 model [1] (2D/3D)
  *
  * Input Parameters:
- * 1. Noisy image (2D)
+ * 1. Noisy image/volume (2D/3D)
  * 2. lambda - regularisation parameter
  * 3. parameter to control the first-order term (alpha1)
  * 4. parameter to control the second-order term (alpha0)
@@ -37,7 +37,7 @@ limitations under the License.
  * 6. Lipshitz constant (default is 12)
  * 
  * Output:
- * Filtered/regulariaed image 
+ * Filtered/regularised image/volume
  *
  * References:
  * [1] K. Bredies "Total Generalized Variation"
@@ -47,9 +47,10 @@ limitations under the License.
 #ifdef __cplusplus
 extern "C" {
 #endif
-/* 2D functions */
-CCPI_EXPORT float TGV_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iter, float L2, int dimX, int dimY);
 
+CCPI_EXPORT float TGV_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iter, float L2, int dimX, int dimY, int dimZ);
+
+/* 2D functions */
 CCPI_EXPORT float DualP_2D(float *U, float *V1, float *V2, float *P1, float *P2, long dimX, long dimY, float sigma);
 CCPI_EXPORT float ProjP_2D(float *P1, float *P2, long dimX, long dimY, float alpha1);
 CCPI_EXPORT float DualQ_2D(float *V1, float *V2, float *Q1, float *Q2, float *Q3, long dimX, long dimY, float sigma);
@@ -57,6 +58,14 @@ CCPI_EXPORT float ProjQ_2D(float *Q1, float *Q2, float *Q3, long dimX, long dimY
 CCPI_EXPORT float DivProjP_2D(float *U, float *U0, float *P1, float *P2, long dimX, long dimY, float lambda, float tau);
 CCPI_EXPORT float UpdV_2D(float *V1, float *V2, float *P1, float *P2, float *Q1, float *Q2, float *Q3, long dimX, long dimY, float tau);
 CCPI_EXPORT float newU(float *U, float *U_old, long dimX, long dimY);
+/* 3D functions */
+CCPI_EXPORT float DualP_3D(float *U, float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float sigma);
+CCPI_EXPORT float ProjP_3D(float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float alpha1);
+CCPI_EXPORT float DualQ_3D(float *V1, float *V2, float *V3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float sigma);
+CCPI_EXPORT float ProjQ_3D(float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float alpha0);
+CCPI_EXPORT float DivProjP_3D(float *U, float *U0, float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float lambda, float tau);
+CCPI_EXPORT float UpdV_3D(float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float tau);
+CCPI_EXPORT float newU3D(float *U, float *U_old, long dimX, long dimY long dimZ);
 #ifdef __cplusplus
 }
 #endif
-- 
cgit v1.2.3


From 4bcfee09d4b8fc23ea231521b4ceb7aaeecf2811 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Thu, 14 Feb 2019 17:41:00 +0000
Subject: first TGVD version

---
 Core/regularisers_CPU/TGV_core.c                   | 202 ++++++++++++++-------
 Core/regularisers_CPU/TGV_core.h                   |   2 +-
 Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m       |  15 +-
 Wrappers/Matlab/demos/demoMatlab_denoise.m         |  16 +-
 Wrappers/Matlab/mex_compile/regularisers_CPU/TGV.c |  29 +--
 5 files changed, 170 insertions(+), 94 deletions(-)

diff --git a/Core/regularisers_CPU/TGV_core.c b/Core/regularisers_CPU/TGV_core.c
index 964c8a1..01306f3 100644
--- a/Core/regularisers_CPU/TGV_core.c
+++ b/Core/regularisers_CPU/TGV_core.c
@@ -121,7 +121,7 @@ float TGV_main(float *U0, float *U, float lambda, float alpha1, float alpha0, in
             DualP_3D(U, V1, V2, V3, P1, P2, P3, (long)(dimX), (long)(dimY), (long)(dimZ), sigma);
             
             /*Projection onto convex set for P*/
-            ProjP_3D(P1, P2, P3, (long)(dimX), (long)(dimY), alpha1);
+            ProjP_3D(P1, P2, P3, (long)(dimX), (long)(dimY), (long)(dimZ), alpha1);
             
             /* Calculate Dual Variable Q */
             DualQ_3D(V1, V2, V3, Q1, Q2, Q3, Q4, Q5, Q6, (long)(dimX), (long)(dimY), (long)(dimZ), sigma);
@@ -189,10 +189,9 @@ float ProjP_2D(float *P1, float *P2, long dimX, long dimY, float alpha1)
 #pragma omp parallel for shared(P1,P2) private(i,j,index,grad_magn)
     for(i=0; i<dimX; i++) {
         for(j=0; j<dimY; j++) {
-			index = j*dimX+i;
-            grad_magn = sqrt(pow(P1[index],2) + pow(P2[index],2));
-            grad_magn = grad_magn/alpha1;
-            if (grad_magn > 1.0) {
+	    index = j*dimX+i;
+            grad_magn = (sqrtf(pow(P1[index],2) + pow(P2[index],2)))/alpha1;
+            if (grad_magn > 1.0f) {
                 P1[index] /= grad_magn;
                 P2[index] /= grad_magn;
             }
@@ -207,21 +206,14 @@ float DualQ_2D(float *V1, float *V2, float *Q1, float *Q2, float *Q3, long dimX,
 #pragma omp parallel for shared(Q1,Q2,Q3,V1,V2) private(i,j,index,q1,q2,q11,q22)
     for(i=0; i<dimX; i++) {
         for(j=0; j<dimY; j++) {
-			index = j*dimX+i;
-            /* symmetric boundary conditions (Neuman) */
-            if (i == dimX-1)
-            { q1 = (V1[j*dimX+(i-1)] - V1[index]);
-              q11 = (V2[j*dimX+(i-1)] - V2[index]);
-            }
-            else {
-                q1 = (V1[j*dimX+(i+1)] - V1[index]);
-                q11 = (V2[j*dimX+(i+1)] - V2[index]);
+    	    index = j*dimX+i;
+    	    q1 = 0.0f; q11 = 0.0f; q2 = 0.0f; q22 = 0.0f;
+            /* boundary conditions (Neuman) */
+            if (i != dimX-1){
+                q1 = V1[j*dimX+(i+1)] - V1[index];
+                q11 = V2[j*dimX+(i+1)] - V2[index];
             }
-            if (j == dimY-1) {
-                q2 = (V2[(j-1)*dimX+i] - V2[index]);
-                q22 = (V1[(j-1)*dimX+i] - V1[index]);
-            }
-            else {
+            if (j != dimY-1) {
                 q2 = V2[(j+1)*dimX+i] - V2[index];
                 q22 = V1[(j+1)*dimX+i] - V1[index];
             }
@@ -238,10 +230,10 @@ float ProjQ_2D(float *Q1, float *Q2, float *Q3, long dimX, long dimY, float alph
 #pragma omp parallel for shared(Q1,Q2,Q3) private(i,j,index,grad_magn)
     for(i=0; i<dimX; i++) {
         for(j=0; j<dimY; j++) {
-			index = j*dimX+i;
-            grad_magn = sqrt(pow(Q1[index],2) + pow(Q2[index],2) + 2*pow(Q3[index],2));
+	   index = j*dimX+i;
+            grad_magn = sqrtf(pow(Q1[index],2) + pow(Q2[index],2) + 2*pow(Q3[index],2));
             grad_magn = grad_magn/alpha0;
-            if (grad_magn > 1.0) {
+            if (grad_magn > 1.0f) {
                 Q1[index] /= grad_magn;
                 Q2[index] /= grad_magn;
                 Q3[index] /= grad_magn;
@@ -279,37 +271,29 @@ float newU(float *U, float *U_old, long dimX, long dimY)
 float UpdV_2D(float *V1, float *V2, float *P1, float *P2, float *Q1, float *Q2, float *Q3, long dimX, long dimY, float tau)
 {
     long i, j, index;
-    float q1, q11, q2, q22, div1, div2;
-#pragma omp parallel for shared(V1,V2,P1,P2,Q1,Q2,Q3) private(i, j, index, q1, q11, q2, q22, div1, div2)
+    float q1, q3_x, q3_y, q2, div1, div2;
+#pragma omp parallel for shared(V1,V2,P1,P2,Q1,Q2,Q3) private(i, j, index, q1, q3_x, q3_y, q2, div1, div2)
     for(i=0; i<dimX; i++) {
         for(j=0; j<dimY; j++) {
-			index = j*dimX+i;
-            /* symmetric boundary conditions (Neuman) */
-            if (i == 0) {
-                q1 = Q1[index];
-                q11 = Q3[index];
-            }
-            else {
+	    index = j*dimX+i;
+              q2 = 0.0f;  q3_y = 0.0f; q1 = 0.0f; q3_x = 0.0;
+            /* boundary conditions (Neuman) */
+            if (i != 0) {
                 q1 = Q1[index] - Q1[j*dimX+(i-1)];
-                q11 = Q3[index] - Q3[j*dimX+(i-1)];
+                q3_x = Q3[index] - Q3[j*dimX+(i-1)];
             }
-            if (j == 0) {
-                q2 = Q2[index];
-                q22 = Q3[index];
-            }
-            else  {
+            if (j != 0) {
                 q2 = Q2[index] - Q2[(j-1)*dimX+i];
-                q22 = Q3[index] - Q3[(j-1)*dimX+i];
+                q3_y = Q3[index] - Q3[(j-1)*dimX+i];
             }
-            div1 = q1 + q22;
-            div2 = q2 + q11;
+            div1 = q1 + q3_y;
+            div2 = q3_x + q2;
             V1[index] += tau*(P1[index] + div1);
             V2[index] += tau*(P2[index] + div2);
         }}
     return 1;
 }
 
-
 /********************************************************************/
 /***************************3D Functions*****************************/
 /********************************************************************/
@@ -320,16 +304,14 @@ float DualP_3D(float *U, float *V1, float *V2, float *V3, float *P1, float *P2,
 #pragma omp parallel for shared(U,V1,V2,V3,P1,P2,P3) private(i,j,k,index)
     for(i=0; i<dimX; i++) {
         for(j=0; j<dimY; j++) {
-          for(k=0; k<dimZ; k++) {
-             	   
-    	   index = (dimX*dimY)*k + j*dimX+i;
-    	   
+          for(k=0; k<dimZ; k++) {             	   
+    	   index = (dimX*dimY)*k + j*dimX+i;    	   
             /* symmetric boundary conditions (Neuman) */
             if (i == dimX-1) P1[index] += sigma*((U[(dimX*dimY)*k + j*dimX+(i-1)] - U[index]) - V1[index]); 
             else P1[index] += sigma*((U[(dimX*dimY)*k + j*dimX+(i+1)] - U[index])  - V1[index]); 
             if (j == dimY-1) P2[index] += sigma*((U[(dimX*dimY)*k + (j-1)*dimX+i] - U[index])  - V2[index]);
             else  P2[index] += sigma*((U[(dimX*dimY)*k + (j+1)*dimX+i] - U[index])  - V2[index]);
-            if (j == dimZ-1) P3[index] += sigma*((U[(dimX*dimY)*(k-1) + j*dimX+i] - U[index])  - V3[index]);
+            if (k == dimZ-1) P3[index] += sigma*((U[(dimX*dimY)*(k-1) + j*dimX+i] - U[index])  - V3[index]);
             else  P3[index] += sigma*((U[(dimX*dimY)*(k+1) + j*dimX+i] - U[index])  - V3[index]);
         }}}
     return 1;
@@ -344,8 +326,7 @@ float ProjP_3D(float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ,
         for(j=0; j<dimY; j++) {
 	  for(k=0; k<dimZ; k++) {   	
    	    index = (dimX*dimY)*k + j*dimX+i;
-            grad_magn = sqrt(pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2));
-            grad_magn = grad_magn/alpha1;
+            grad_magn = (sqrtf(pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2)))/alpha1;
             if (grad_magn > 1.0f) {
                 P1[index] /= grad_magn;
                 P2[index] /= grad_magn;
@@ -357,39 +338,29 @@ float ProjP_3D(float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ,
 /*Calculating dual variable Q (using forward differences)*/
 float DualQ_3D(float *V1, float *V2, float *V3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float sigma)
 {
-    long i,j,k, index;
-    float q1, q2, q3, q4, q5, q6, q11, q22, q33, q44, q55, q66;
-#pragma omp parallel for shared(Q1,Q2,Q3,Q4,Q5,Q6,V1,V2,V3) private(i,j,k,index,q1,q2,q3,q4,q5,q6,q11,q22,q33,q44,q55,q66)
+    long i,j,k,index;
+    float q1, q2, q3, q11, q22, q33, q44, q55, q66;
+#pragma omp parallel for shared(Q1,Q2,Q3,Q4,Q5,Q6,V1,V2,V3) private(i,j,k,index,q1,q2,q3,q11,q22,q33,q44,q55,q66)
     for(i=0; i<dimX; i++) {
         for(j=0; j<dimY; j++) {
        	  for(k=0; k<dimZ; k++) {   	
 	    index = (dimX*dimY)*k + j*dimX+i;
+	    q1 = 0.0f; q11 = 0.0f; q33 = 0.0f; q2 = 0.0f; q22 = 0.0f; q55 = 0.0f; q3 = 0.0f; q44 = 0.0f; q66 = 0.0f;
             /* symmetric boundary conditions (Neuman) */
-            if (i == dimX-1){ 
-              q1 = V1[(dimX*dimY)*k + j*dimX+(i-1)] - V1[index];
-              q11 = V2[(dimX*dimY)*k + j*dimX+(i-1)] - V2[index];
-              q33 = V3[(dimX*dimY)*k + j*dimX+(i-1)] - V3[index];
-            }
-            else {
-                q1 = (V1[(dimX*dimY)*k + j*dimX+(i+1)] - V1[index]);
-                q11 = (V2[(dimX*dimY)*k + j*dimX+(i+1)] - V2[index]);
+            if (i != dimX-1){ 
+                q1 = V1[(dimX*dimY)*k + j*dimX+(i+1)] - V1[index];
+                q11 = V2[(dimX*dimY)*k + j*dimX+(i+1)] - V2[index];
                 q33 = V3[(dimX*dimY)*k + j*dimX+(i+1)] - V3[index];
             }
-            if (j == dimY-1) {
-                q2 = (V2[(dimX*dimY)*k + (j-1)*dimX+i] - V2[index]);
-                q22 = (V1[(dimX*dimY)*k + (j-1)*dimX+i] - V1[index]);
-            }
-            else {
+            if (j != dimY-1) {
                 q2 = V2[(dimX*dimY)*k + (j+1)*dimX+i] - V2[index];
                 q22 = V1[(dimX*dimY)*k + (j+1)*dimX+i] - V1[index];
+                q55 = V3[(dimX*dimY)*k + (j+1)*dimX+i] - V3[index];
             }
-            if (k == dimZ-1) {
-                q3 = V3[(dimX*dimY)*(k-1) + j*dimX+i] - V3[index];
-                q44 = V1[(dimX*dimY)*(k-1) + j*dimX+i] - V1[index];
-            }
-            else {
+            if (k != dimZ-1) {
                 q3 = V3[(dimX*dimY)*(k+1) + j*dimX+i] - V3[index];
                 q44 = V1[(dimX*dimY)*(k+1) + j*dimX+i] - V1[index];
+                q66 = V2[(dimX*dimY)*(k+1) + j*dimX+i] - V2[index];
             }
             
             Q1[index] += sigma*(q1);
@@ -397,8 +368,99 @@ float DualQ_3D(float *V1, float *V2, float *V3, float *Q1, float *Q2, float *Q3,
             Q3[index] += sigma*(q3);
             Q4[index] += sigma*(0.5f*(q11 + q22));
             Q5[index] += sigma*(0.5f*(q33 + q44));
+            Q6[index] += sigma*(0.5f*(q55 + q66));
         }}}
     return 1;
 }
-
+float ProjQ_3D(float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float alpha0)
+{
+    float grad_magn;
+    long i,j,k,index;
+#pragma omp parallel for shared(Q1,Q2,Q3,Q4,Q5,Q6) private(i,j,k,index,grad_magn)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+       	  for(k=0; k<dimZ; k++) {   	
+	    index = (dimX*dimY)*k + j*dimX+i;           
+            grad_magn = sqrtf(pow(Q1[index],2) + pow(Q2[index],2) + pow(Q3[index],2) + 2.0f*pow(Q4[index],2) + 2.0f*pow(Q5[index],2) + 2.0f*pow(Q6[index],2));
+            grad_magn = grad_magn/alpha0;
+            if (grad_magn > 1.0f) {
+                Q1[index] /= grad_magn;
+                Q2[index] /= grad_magn;
+                Q3[index] /= grad_magn;
+                Q4[index] /= grad_magn;
+                Q5[index] /= grad_magn;
+                Q6[index] /= grad_magn;
+            }
+        }}}
+    return 1;
+}
+/* Divergence and projection for P*/
+float DivProjP_3D(float *U, float *U0, float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float lambda, float tau)
+{
+    long i,j,k,index;
+    float P_v1, P_v2, P_v3, div;
+#pragma omp parallel for shared(U,U0,P1,P2,P3) private(i,j,k,index,P_v1,P_v2,P_v3,div)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+       	  for(k=0; k<dimZ; k++) {   	
+	    index = (dimX*dimY)*k + j*dimX+i; 	    
+            if (i == 0) P_v1 = P1[index];
+            else P_v1 = P1[index] - P1[(dimX*dimY)*k + j*dimX+(i-1)];
+            if (j == 0) P_v2 = P2[index];
+            else P_v2 = P2[index] - P2[(dimX*dimY)*k + (j-1)*dimX+i];
+            if (k == 0) P_v3 = P3[index];
+            else P_v3 = P3[index] - P3[(dimX*dimY)*(k-1) + (j)*dimX+i];              
+                      
+            div = P_v1 + P_v2 + P_v3;
+            U[index] = (lambda*(U[index] + tau*div) + tau*U0[index])/(lambda + tau); 
+        }}}
+    return *U;
+}
+/*get update for V*/
+float UpdV_3D(float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float tau)
+{
+    long i,j,k,index;
+    float q1, q4x, q6x, q2, q4y, q5y, q6z, q5z, q3, div1, div2, div3;
+#pragma omp parallel for shared(V1,V2,V3,P1,P2,P3,Q1,Q2,Q3,Q4,Q5,Q6) private(i,j,k,index,q1,q4x,q6x,q2,q4y,q5y,q6z,q5z,q3,div1,div2,div3)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+       	  for(k=0; k<dimZ; k++) {   	
+	    index = (dimX*dimY)*k + j*dimX+i; 	
+	    q1 = 0.0f; q4x= 0.0f; q6x= 0.0f; q2= 0.0f; q4y= 0.0f; q5y= 0.0f; q6z= 0.0f; q5z= 0.0f; q3= 0.0f;
+            /* symmetric boundary conditions (Neuman) */
+            if (i != 0) {
+                q1 = Q1[index] - Q1[(dimX*dimY)*k + j*dimX+(i-1)];
+                q4x = Q4[index] - Q4[(dimX*dimY)*k + j*dimX+(i-1)];
+                q6x = Q6[index] - Q6[(dimX*dimY)*k + j*dimX+(i-1)];
+            }
+            if (j != 0) {
+                q2 = Q2[index] - Q2[(dimX*dimY)*k + (j-1)*dimX+i];
+                q4y = Q4[index] - Q4[(dimX*dimY)*k + (j-1)*dimX+i];
+                q5y = Q5[index] - Q5[(dimX*dimY)*k + (j-1)*dimX+i];
+            }
+             if (k != 0) {
+                q6z = Q6[index] - Q6[(dimX*dimY)*(k-1) + (j)*dimX+i];
+                q5z = Q5[index] - Q5[(dimX*dimY)*(k-1) + (j)*dimX+i];
+                q3 = Q3[index] - Q3[(dimX*dimY)*(k-1) + (j)*dimX+i];
+            }
+            div1 = q1 + q4y + q6z;
+            div2 = q4x + q2 + q5z;
+            div3 = q6x + q5y + q3;
+            
+            V1[index] += tau*(P1[index] + div1);
+            V2[index] += tau*(P2[index] + div2);
+            V3[index] += tau*(P3[index] + div3);
+        }}}
+    return 1;
+}
+/*get updated solution U*/
+float newU3D(float *U, float *U_old, long dimX, long dimY, long dimZ)
+{
+    long i;
+#pragma omp parallel for shared(U,U_old) private(i)
+    for(i=0; i<dimX*dimY*dimZ; i++) {
+    U[i] = 2.0f*U[i] - U_old[i];
+    }
+    return *U;
+}
 
diff --git a/Core/regularisers_CPU/TGV_core.h b/Core/regularisers_CPU/TGV_core.h
index 8bd8cf6..8ff084e 100644
--- a/Core/regularisers_CPU/TGV_core.h
+++ b/Core/regularisers_CPU/TGV_core.h
@@ -65,7 +65,7 @@ CCPI_EXPORT float DualQ_3D(float *V1, float *V2, float *V3, float *Q1, float *Q2
 CCPI_EXPORT float ProjQ_3D(float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float alpha0);
 CCPI_EXPORT float DivProjP_3D(float *U, float *U0, float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float lambda, float tau);
 CCPI_EXPORT float UpdV_3D(float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float tau);
-CCPI_EXPORT float newU3D(float *U, float *U_old, long dimX, long dimY long dimZ);
+CCPI_EXPORT float newU3D(float *U, float *U_old, long dimX, long dimY, long dimZ);
 #ifdef __cplusplus
 }
 #endif
diff --git a/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m b/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m
index 5cc47b3..23cda32 100644
--- a/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m
+++ b/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m
@@ -2,11 +2,13 @@
 clear; close all
 Path1 = sprintf(['..' filesep 'mex_compile' filesep 'installed'], 1i);
 Path2 = sprintf(['..' filesep '..' filesep '..' filesep 'data' filesep], 1i);
+Path3 = sprintf(['..' filesep 'supp'], 1i);
 addpath(Path1);
 addpath(Path2);
+addpath(Path3);
 
 N = 512; 
-slices = 15;
+slices = 7;
 vol3D = zeros(N,N,slices, 'single');
 Ideal3D = zeros(N,N,slices, 'single');
 Im = double(imread('lena_gray_512.tif'))/255;  % loading image
@@ -131,7 +133,16 @@ figure; imshow(u_diff4(:,:,7), [0 1]); title('Diffusion 4thO denoised volume (CP
 % fprintf('%s %f \n', 'RMSE error for Anis.Diff of 4th order is:', rmse_diff4);
 % figure; imshow(u_diff4_g(:,:,7), [0 1]); title('Diffusion 4thO denoised volume (GPU)');
 %%
-
+fprintf('Denoise using the TGV model (CPU) \n');
+lambda_TGV = 0.05; % regularisation parameter
+alpha1 = 1.0; % parameter to control the first-order term
+alpha0 = 2.0; % parameter to control the second-order term
+iter_TGV = 40; % number of Primal-Dual iterations for TGV
+tic; u_tgv = TGV(single(vol3D), lambda_TGV, alpha1, alpha0, iter_TGV, 128); toc; 
+rmseTGV = RMSE(Ideal3D(:),u_tgv(:));
+fprintf('%s %f \n', 'RMSE error for TGV is:', rmseTGV);
+figure; imshow(u_tgv(:,:,3), [0 1]); title('TGV denoised volume (CPU)');
+%%
 %>>>>>>>>>>>>>> MULTI-CHANNEL priors <<<<<<<<<<<<<<< %
 fprintf('Denoise a volume using the FGP-dTV model (CPU) \n');
 
diff --git a/Wrappers/Matlab/demos/demoMatlab_denoise.m b/Wrappers/Matlab/demos/demoMatlab_denoise.m
index 3506cca..14d3096 100644
--- a/Wrappers/Matlab/demos/demoMatlab_denoise.m
+++ b/Wrappers/Matlab/demos/demoMatlab_denoise.m
@@ -60,20 +60,20 @@ figure; imshow(u_sb, [0 1]); title('SB-TV denoised image (CPU)');
 % figure; imshow(u_sbG, [0 1]); title('SB-TV denoised image (GPU)');
 %%
 fprintf('Denoise using the TGV model (CPU) \n');
-lambda_TGV = 0.04; % regularisation parameter
-alpha1 = 1; % parameter to control the first-order term
-alpha0 = 0.7; % parameter to control the second-order term
-iter_TGV = 500; % number of Primal-Dual iterations for TGV
+lambda_TGV = 0.045; % regularisation parameter
+alpha1 = 1.0; % parameter to control the first-order term
+alpha0 = 2.0; % parameter to control the second-order term
+iter_TGV = 2000; % number of Primal-Dual iterations for TGV
 tic; u_tgv = TGV(single(u0), lambda_TGV, alpha1, alpha0, iter_TGV); toc; 
 rmseTGV = (RMSE(u_tgv(:),Im(:)));
 fprintf('%s %f \n', 'RMSE error for TGV is:', rmseTGV);
 figure; imshow(u_tgv, [0 1]); title('TGV denoised image (CPU)');
 %%
 % fprintf('Denoise using the TGV model (GPU) \n');
-% lambda_TGV = 0.04; % regularisation parameter
-% alpha1 = 1; % parameter to control the first-order term
-% alpha0 = 0.7; % parameter to control the second-order term
-% iter_TGV = 500; % number of Primal-Dual iterations for TGV
+% lambda_TGV = 0.045; % regularisation parameter
+% alpha1 = 1.0; % parameter to control the first-order term
+% alpha0 = 2.0; % parameter to control the second-order term
+% iter_TGV = 2000; % number of Primal-Dual iterations for TGV
 % tic; u_tgv_gpu = TGV_GPU(single(u0), lambda_TGV, alpha1, alpha0, iter_TGV); toc; 
 % rmseTGV_gpu = (RMSE(u_tgv_gpu(:),Im(:)));
 % fprintf('%s %f \n', 'RMSE error for TGV is:', rmseTGV_gpu);
diff --git a/Wrappers/Matlab/mex_compile/regularisers_CPU/TGV.c b/Wrappers/Matlab/mex_compile/regularisers_CPU/TGV.c
index 5459bf5..aa4eed4 100644
--- a/Wrappers/Matlab/mex_compile/regularisers_CPU/TGV.c
+++ b/Wrappers/Matlab/mex_compile/regularisers_CPU/TGV.c
@@ -21,14 +21,14 @@ limitations under the License.
 #include "TGV_core.h"
 
 /* C-OMP implementation of Primal-Dual denoising method for 
- * Total Generilized Variation (TGV)-L2 model [1] (2D case only)
+ * Total Generilized Variation (TGV)-L2 model [1] (2D/3D)
  *
  * Input Parameters:
- * 1. Noisy image (2D) (required)
- * 2. lambda - regularisation parameter (required)
- * 3. parameter to control the first-order term (alpha1) (default - 1)
- * 4. parameter to control the second-order term (alpha0) (default - 0.5)
- * 5. Number of Chambolle-Pock (Primal-Dual) iterations (default is 300)
+ * 1. Noisy image/volume (2D/3D)
+ * 2. lambda - regularisation parameter
+ * 3. parameter to control the first-order term (alpha1)
+ * 4. parameter to control the second-order term (alpha0)
+ * 5. Number of Chambolle-Pock (Primal-Dual) iterations
  * 6. Lipshitz constant (default is 12)
  *
  * Output:
@@ -44,7 +44,7 @@ void mexFunction(
         
 {
     int number_of_dims, iter;
-    mwSize dimX, dimY;
+    mwSize dimX, dimY, dimZ;
     const mwSize *dim_array;
     
     float *Input, *Output=NULL, lambda, alpha0, alpha1, L2;
@@ -55,7 +55,7 @@ void mexFunction(
     /*Handling Matlab input data*/
     if ((nrhs < 2) || (nrhs > 6)) mexErrMsgTxt("At least 2 parameters is required, all parameters are: Image(2D), Regularisation parameter, alpha0, alpha1, iterations number, Lipshitz Constant");
     
-    Input  = (float *) mxGetData(prhs[0]); /*noisy image (2D) */
+    Input  = (float *) mxGetData(prhs[0]); /*noisy image/volume */
     lambda =  (float) mxGetScalar(prhs[1]); /* regularisation parameter */
     alpha1 =  1.0f; /* parameter to control the first-order term */ 
     alpha0 =  0.5f; /* parameter to control the second-order term */
@@ -69,12 +69,15 @@ void mexFunction(
     if (nrhs == 6)  L2 =  (float) mxGetScalar(prhs[5]); /* Lipshitz constant */
     
     /*Handling Matlab output data*/
-    dimX = dim_array[0]; dimY = dim_array[1];
+    dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];
     
     if (number_of_dims == 2) {
-        Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        /* running the function */
-        TGV_main(Input, Output, lambda, alpha1, alpha0, iter, L2, dimX, dimY);        
+        dimZ = 1; /*2D case*/
+        Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));        
     }
-    if (number_of_dims == 3) {mexErrMsgTxt("Only 2D images accepted");}       
+    if (number_of_dims == 3) {
+        Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+    }       
+    /* running the function */
+    TGV_main(Input, Output, lambda, alpha1, alpha0, iter, L2, dimX, dimY, dimZ);        
 }
-- 
cgit v1.2.3


From 69219dd3d69cc644bc858986ba71ee02e8e1952f Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Mon, 18 Feb 2019 10:56:21 +0000
Subject: working TGV3D_CPU version/Matlab

---
 Core/regularisers_CPU/TGV_core.c             | 37 ++++++++++++++--------------
 Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m |  6 ++---
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/Core/regularisers_CPU/TGV_core.c b/Core/regularisers_CPU/TGV_core.c
index 01306f3..ef9ef98 100644
--- a/Core/regularisers_CPU/TGV_core.c
+++ b/Core/regularisers_CPU/TGV_core.c
@@ -249,7 +249,7 @@ float DivProjP_2D(float *U, float *U0, float *P1, float *P2, long dimX, long dim
 #pragma omp parallel for shared(U,U0,P1,P2) private(i,j,index,P_v1,P_v2,div)
     for(i=0; i<dimX; i++) {
         for(j=0; j<dimY; j++) {
-			index = j*dimX+i;
+	    index = j*dimX+i;
             if (i == 0) P_v1 = P1[index];
             else P_v1 = P1[index] - P1[j*dimX+(i-1)];
             if (j == 0) P_v2 = P2[index];
@@ -348,12 +348,12 @@ float DualQ_3D(float *V1, float *V2, float *V3, float *Q1, float *Q2, float *Q3,
 	    q1 = 0.0f; q11 = 0.0f; q33 = 0.0f; q2 = 0.0f; q22 = 0.0f; q55 = 0.0f; q3 = 0.0f; q44 = 0.0f; q66 = 0.0f;
             /* symmetric boundary conditions (Neuman) */
             if (i != dimX-1){ 
-                q1 = V1[(dimX*dimY)*k + j*dimX+(i+1)] - V1[index];
+                q1 = V1[(dimX*dimY)*k + j*dimX+(i+1)] - V1[index];              
                 q11 = V2[(dimX*dimY)*k + j*dimX+(i+1)] - V2[index];
                 q33 = V3[(dimX*dimY)*k + j*dimX+(i+1)] - V3[index];
             }
             if (j != dimY-1) {
-                q2 = V2[(dimX*dimY)*k + (j+1)*dimX+i] - V2[index];
+                q2 = V2[(dimX*dimY)*k + (j+1)*dimX+i] - V2[index];                
                 q22 = V1[(dimX*dimY)*k + (j+1)*dimX+i] - V1[index];
                 q55 = V3[(dimX*dimY)*k + (j+1)*dimX+i] - V3[index];
             }
@@ -363,12 +363,12 @@ float DualQ_3D(float *V1, float *V2, float *V3, float *Q1, float *Q2, float *Q3,
                 q66 = V2[(dimX*dimY)*(k+1) + j*dimX+i] - V2[index];
             }
             
-            Q1[index] += sigma*(q1);
-            Q2[index] += sigma*(q2);
-            Q3[index] += sigma*(q3);
-            Q4[index] += sigma*(0.5f*(q11 + q22));
-            Q5[index] += sigma*(0.5f*(q33 + q44));
-            Q6[index] += sigma*(0.5f*(q55 + q66));
+            Q1[index] += sigma*(q1); /*Q11*/
+            Q2[index] += sigma*(q2); /*Q22*/            
+            Q3[index] += sigma*(q3); /*Q33*/
+            Q4[index] += sigma*(0.5f*(q11 + q22)); /* Q21 / Q12 */
+            Q5[index] += sigma*(0.5f*(q33 + q44)); /* Q31 / Q13 */
+            Q6[index] += sigma*(0.5f*(q55 + q66)); /* Q32 / Q23 */
         }}}
     return 1;
 }
@@ -420,32 +420,33 @@ float DivProjP_3D(float *U, float *U0, float *P1, float *P2, float *P3, long dim
 float UpdV_3D(float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float tau)
 {
     long i,j,k,index;
-    float q1, q4x, q6x, q2, q4y, q5y, q6z, q5z, q3, div1, div2, div3;
-#pragma omp parallel for shared(V1,V2,V3,P1,P2,P3,Q1,Q2,Q3,Q4,Q5,Q6) private(i,j,k,index,q1,q4x,q6x,q2,q4y,q5y,q6z,q5z,q3,div1,div2,div3)
+    float q1, q4x, q5x, q2, q4y, q6y, q6z, q5z, q3, div1, div2, div3;
+#pragma omp parallel for shared(V1,V2,V3,P1,P2,P3,Q1,Q2,Q3,Q4,Q5,Q6) private(i,j,k,index,q1,q4x,q5x,q2,q4y,q6y,q6z,q5z,q3,div1,div2,div3)
     for(i=0; i<dimX; i++) {
         for(j=0; j<dimY; j++) {
        	  for(k=0; k<dimZ; k++) {   	
 	    index = (dimX*dimY)*k + j*dimX+i; 	
-	    q1 = 0.0f; q4x= 0.0f; q6x= 0.0f; q2= 0.0f; q4y= 0.0f; q5y= 0.0f; q6z= 0.0f; q5z= 0.0f; q3= 0.0f;
+	    q1 = 0.0f; q4x= 0.0f; q5x= 0.0f; q2= 0.0f; q4y= 0.0f; q6y= 0.0f; q6z= 0.0f; q5z= 0.0f; q3= 0.0f;
+            /* Q1 - Q11, Q2 - Q22, Q3 -  Q33, Q4 - Q21/Q12, Q5 - Q31/Q13, Q6 - Q32/Q23*/            
             /* symmetric boundary conditions (Neuman) */
             if (i != 0) {
                 q1 = Q1[index] - Q1[(dimX*dimY)*k + j*dimX+(i-1)];
-                q4x = Q4[index] - Q4[(dimX*dimY)*k + j*dimX+(i-1)];
-                q6x = Q6[index] - Q6[(dimX*dimY)*k + j*dimX+(i-1)];
+                q4x = Q4[index] - Q4[(dimX*dimY)*k + j*dimX+(i-1)];                
+                q5x = Q5[index] - Q5[(dimX*dimY)*k + j*dimX+(i-1)];
             }
             if (j != 0) {
                 q2 = Q2[index] - Q2[(dimX*dimY)*k + (j-1)*dimX+i];
                 q4y = Q4[index] - Q4[(dimX*dimY)*k + (j-1)*dimX+i];
-                q5y = Q5[index] - Q5[(dimX*dimY)*k + (j-1)*dimX+i];
+                q6y = Q6[index] - Q6[(dimX*dimY)*k + (j-1)*dimX+i];
             }
              if (k != 0) {
                 q6z = Q6[index] - Q6[(dimX*dimY)*(k-1) + (j)*dimX+i];
                 q5z = Q5[index] - Q5[(dimX*dimY)*(k-1) + (j)*dimX+i];
                 q3 = Q3[index] - Q3[(dimX*dimY)*(k-1) + (j)*dimX+i];
             }
-            div1 = q1 + q4y + q6z;
-            div2 = q4x + q2 + q5z;
-            div3 = q6x + q5y + q3;
+            div1 = q1 + q4y + q5z;
+            div2 = q4x + q2 + q6z;            
+            div3 = q5x + q6y + q3;
             
             V1[index] += tau*(P1[index] + div1);
             V2[index] += tau*(P2[index] + div2);
diff --git a/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m b/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m
index 23cda32..0c331a4 100644
--- a/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m
+++ b/Wrappers/Matlab/demos/demoMatlab_3Ddenoise.m
@@ -134,11 +134,11 @@ figure; imshow(u_diff4(:,:,7), [0 1]); title('Diffusion 4thO denoised volume (CP
 % figure; imshow(u_diff4_g(:,:,7), [0 1]); title('Diffusion 4thO denoised volume (GPU)');
 %%
 fprintf('Denoise using the TGV model (CPU) \n');
-lambda_TGV = 0.05; % regularisation parameter
+lambda_TGV = 0.03; % regularisation parameter
 alpha1 = 1.0; % parameter to control the first-order term
 alpha0 = 2.0; % parameter to control the second-order term
-iter_TGV = 40; % number of Primal-Dual iterations for TGV
-tic; u_tgv = TGV(single(vol3D), lambda_TGV, alpha1, alpha0, iter_TGV, 128); toc; 
+iter_TGV = 500; % number of Primal-Dual iterations for TGV
+tic; u_tgv = TGV(single(vol3D), lambda_TGV, alpha1, alpha0, iter_TGV); toc; 
 rmseTGV = RMSE(Ideal3D(:),u_tgv(:));
 fprintf('%s %f \n', 'RMSE error for TGV is:', rmseTGV);
 figure; imshow(u_tgv(:,:,3), [0 1]); title('TGV denoised volume (CPU)');
-- 
cgit v1.2.3


From 787b534643d5b4cad4e6f8d9c4b524b52d804348 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Mon, 18 Feb 2019 14:48:29 +0000
Subject: TGV3D_GPU added, demos updated fpr Python/Matlab

---
 Core/regularisers_CPU/TGV_core.c                   |  40 +-
 Core/regularisers_CPU/TGV_core.h                   |   2 +
 Core/regularisers_GPU/TGV_GPU_core.cu              | 455 +++++++++++++++++----
 Core/regularisers_GPU/TGV_GPU_core.h               |   2 +-
 Readme.md                                          |   4 +-
 Wrappers/Python/conda-recipe/run_test.py           |   2 +-
 Wrappers/Python/demos/demo_cpu_regularisers.py     |   4 +-
 Wrappers/Python/demos/demo_cpu_regularisers3D.py   |  54 ++-
 .../Python/demos/demo_cpu_vs_gpu_regularisers.py   |   4 +-
 Wrappers/Python/demos/demo_gpu_regularisers.py     |   4 +-
 Wrappers/Python/demos/demo_gpu_regularisers3D.py   |  51 ++-
 Wrappers/Python/src/cpu_regularisers.pyx           |  35 +-
 Wrappers/Python/src/gpu_regularisers.pyx           |  39 +-
 13 files changed, 583 insertions(+), 113 deletions(-)

diff --git a/Core/regularisers_CPU/TGV_core.c b/Core/regularisers_CPU/TGV_core.c
index ef9ef98..805c3d4 100644
--- a/Core/regularisers_CPU/TGV_core.c
+++ b/Core/regularisers_CPU/TGV_core.c
@@ -139,18 +139,13 @@ float TGV_main(float *U0, float *U, float lambda, float alpha1, float alpha0, in
             newU3D(U, U_old, (long)(dimX), (long)(dimY), (long)(dimZ));
             
             /*saving V into V_old*/
-            copyIm(V1, V1_old, (long)(dimX), (long)(dimY), (long)(dimZ));
-            copyIm(V2, V2_old, (long)(dimX), (long)(dimY), (long)(dimZ));
-            copyIm(V3, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            copyIm_3Ar(V1, V2, V3, V1_old, V2_old, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ));
             
             /* upd V*/
             UpdV_3D(V1, V2, V3, P1, P2, P3, Q1, Q2, Q3, Q4, Q5, Q6, (long)(dimX), (long)(dimY), (long)(dimZ), tau);
             
             /*get new V*/
-            newU3D(V1, V1_old, (long)(dimX), (long)(dimY), (long)(dimZ));
-            newU3D(V2, V2_old, (long)(dimX), (long)(dimY), (long)(dimZ));
-            newU3D(V3, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ));
-            
+            newU3D_3Ar(V1, V2, V3, V1_old, V2_old, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ));           
 	        } /*end of iterations*/
         free(P3);free(Q4);free(Q5);free(Q6);free(V3);free(V3_old);
         }     
@@ -454,14 +449,39 @@ float UpdV_3D(float *V1, float *V2, float *V3, float *P1, float *P2, float *P3,
         }}}
     return 1;
 }
+
+float copyIm_3Ar(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, float *V3_old, long dimX, long dimY, long dimZ)
+{
+	long j;
+#pragma omp parallel for shared(V1, V2, V3, V1_old, V2_old, V3_old) private(j)
+	for (j = 0; j<dimX*dimY*dimZ; j++)  {	
+	V1_old[j] = V1[j];
+	V2_old[j] = V2[j];
+	V3_old[j] = V3[j];	
+	}
+	return 1;
+}
+
 /*get updated solution U*/
 float newU3D(float *U, float *U_old, long dimX, long dimY, long dimZ)
 {
     long i;
-#pragma omp parallel for shared(U,U_old) private(i)
+#pragma omp parallel for shared(U, U_old) private(i)
+    for(i=0; i<dimX*dimY*dimZ; i++) U[i] = 2.0f*U[i] - U_old[i];
+    return *U;
+}
+
+
+/*get updated solution U*/
+float newU3D_3Ar(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, float *V3_old, long dimX, long dimY, long dimZ)
+{
+    long i;
+#pragma omp parallel for shared(V1, V2, V3, V1_old, V2_old, V3_old) private(i)
     for(i=0; i<dimX*dimY*dimZ; i++) {
-    U[i] = 2.0f*U[i] - U_old[i];
+    V1[i] = 2.0f*V1[i] - V1_old[i];
+    V2[i] = 2.0f*V2[i] - V2_old[i];
+    V3[i] = 2.0f*V3[i] - V3_old[i];
     }
-    return *U;
+    return 1;
 }
 
diff --git a/Core/regularisers_CPU/TGV_core.h b/Core/regularisers_CPU/TGV_core.h
index 8ff084e..11b12c1 100644
--- a/Core/regularisers_CPU/TGV_core.h
+++ b/Core/regularisers_CPU/TGV_core.h
@@ -66,6 +66,8 @@ CCPI_EXPORT float ProjQ_3D(float *Q1, float *Q2, float *Q3, float *Q4, float *Q5
 CCPI_EXPORT float DivProjP_3D(float *U, float *U0, float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float lambda, float tau);
 CCPI_EXPORT float UpdV_3D(float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float tau);
 CCPI_EXPORT float newU3D(float *U, float *U_old, long dimX, long dimY, long dimZ);
+CCPI_EXPORT float copyIm_3Ar(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, float *V3_old, long dimX, long dimY, long dimZ);
+CCPI_EXPORT float newU3D_3Ar(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, float *V3_old, long dimX, long dimY, long dimZ);
 #ifdef __cplusplus
 }
 #endif
diff --git a/Core/regularisers_GPU/TGV_GPU_core.cu b/Core/regularisers_GPU/TGV_GPU_core.cu
index 73232a9..58b2c41 100644
--- a/Core/regularisers_GPU/TGV_GPU_core.cu
+++ b/Core/regularisers_GPU/TGV_GPU_core.cu
@@ -21,10 +21,10 @@ limitations under the License.
 #include "shared.h"
 
 /* CUDA implementation of Primal-Dual denoising method for 
- * Total Generilized Variation (TGV)-L2 model [1] (2D case only)
+ * Total Generilized Variation (TGV)-L2 model [1] (2D/3D case)
  *
  * Input Parameters:
- * 1. Noisy image (2D)
+ * 1. Noisy image/volume (2D/3D)
  * 2. lambda - regularisation parameter
  * 3. parameter to control the first-order term (alpha1)
  * 4. parameter to control the second-order term (alpha0)
@@ -38,6 +38,10 @@ limitations under the License.
  * [1] K. Bredies "Total Generalized Variation"
  */
     
+#define BLKXSIZE 8
+#define BLKYSIZE 8
+#define BLKZSIZE 8    
+    
 #define BLKXSIZE2D 16
 #define BLKYSIZE2D 16
 #define EPS 1.0e-7
@@ -48,9 +52,8 @@ limitations under the License.
 /***************************2D Functions*****************************/
 /********************************************************************/
 __global__ void DualP_2D_kernel(float *U, float *V1, float *V2, float *P1, float *P2, int dimX, int dimY, float sigma)
-{
-    
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
+{    
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
         int j = blockDim.y * blockIdx.y + threadIdx.y;
         
         int index = i + dimX*j;
@@ -67,9 +70,9 @@ __global__ void DualP_2D_kernel(float *U, float *V1, float *V2, float *P1, float
 
 __global__ void ProjP_2D_kernel(float *P1, float *P2, int dimX, int dimY, float alpha1)
 {
-    float grad_magn;
+   	float grad_magn;
 
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
         int j = blockDim.y * blockIdx.y + threadIdx.y;
         
         int index = i + dimX*j;
@@ -78,7 +81,7 @@ __global__ void ProjP_2D_kernel(float *P1, float *P2, int dimX, int dimY, float
             
             grad_magn = sqrt(pow(P1[index],2) + pow(P2[index],2));
             grad_magn = grad_magn/alpha1;
-            if (grad_magn > 1.0) {
+            if (grad_magn > 1.0f) {
                 P1[index] /= grad_magn;
                 P2[index] /= grad_magn;
             }
@@ -90,64 +93,56 @@ __global__ void DualQ_2D_kernel(float *V1, float *V2, float *Q1, float *Q2, floa
 {
         float q1, q2, q11, q22;
 
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
         int j = blockDim.y * blockIdx.y + threadIdx.y;
         
         int index = i + dimX*j;
         
-        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) {
-            
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) {            
             /* symmetric boundary conditions (Neuman) */
-            if (i == dimX-1)
-            { q1 = (V1[j*dimX+(i-1)] - V1[index]);
-              q11 = (V2[j*dimX+(i-1)] - V2[index]);
+    	    q1 = 0.0f; q11 = 0.0f; q2 = 0.0f; q22 = 0.0f;
+            /* boundary conditions (Neuman) */
+            if (i != dimX-1){
+                q1 = V1[j*dimX+(i+1)] - V1[index];
+                q11 = V2[j*dimX+(i+1)] - V2[index];
             }
-            else {
-                q1 = (V1[j*dimX+(i+1)] - V1[index]);
-                q11 = (V2[j*dimX+(i+1)] - V2[index]);
-            }
-            if (j == dimY-1) {
-                q2 = (V2[(j-1)*dimX+i] - V2[index]);
-                q22 = (V1[(j-1)*dimX+i] - V1[index]);
-            }
-            else {
+            if (j != dimY-1) {
                 q2 = V2[(j+1)*dimX+i] - V2[index];
                 q22 = V1[(j+1)*dimX+i] - V1[index];
             }
             Q1[index] += sigma*(q1);
             Q2[index] += sigma*(q2);
             Q3[index] += sigma*(0.5f*(q11 + q22));
-		}
+	}
 	return;
 } 
 
 __global__ void ProjQ_2D_kernel(float *Q1, float *Q2, float *Q3, int dimX, int dimY, float alpha0)
 {
-    float grad_magn;
+	float grad_magn;
 
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
         int j = blockDim.y * blockIdx.y + threadIdx.y;
         
         int index = i + dimX*j;
         
-        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) {
-            
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) {            
             grad_magn = sqrt(pow(Q1[index],2) + pow(Q2[index],2) + 2*pow(Q3[index],2));
             grad_magn = grad_magn/alpha0;
-            if (grad_magn > 1.0) {
+            if (grad_magn > 1.0f) {
                 Q1[index] /= grad_magn;
                 Q2[index] /= grad_magn;
                 Q3[index] /= grad_magn;
-            }
-		}
+        	    }
+	}
 	return;
 } 
 
 __global__ void DivProjP_2D_kernel(float *U, float *U0, float *P1, float *P2, int dimX, int dimY, float lambda, float tau)
 {
-		float P_v1, P_v2, div;
+	float P_v1, P_v2, div;
 
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
         int j = blockDim.y * blockIdx.y + threadIdx.y;
         
         int index = i + dimX*j;
@@ -166,49 +161,54 @@ __global__ void DivProjP_2D_kernel(float *U, float *U0, float *P1, float *P2, in
 
 __global__ void UpdV_2D_kernel(float *V1, float *V2, float *P1, float *P2, float *Q1, float *Q2, float *Q3, int dimX, int dimY, float tau)
 {
-		float q1, q11, q2, q22, div1, div2;
+	float q1, q3_x, q2, q3_y, div1, div2;
 
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
         int j = blockDim.y * blockIdx.y + threadIdx.y;
         
         int index = i + dimX*j;
         
-        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) {
-			
-   /* symmetric boundary conditions (Neuman) */
-            if (i == 0) {
-                q1 = Q1[index];
-                q11 = Q3[index];
-            }
-            else {
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) {			
+   	    q2 = 0.0f;  q3_y = 0.0f; q1 = 0.0f; q3_x = 0.0;
+            /* boundary conditions (Neuman) */
+            if (i != 0) {
                 q1 = Q1[index] - Q1[j*dimX+(i-1)];
-                q11 = Q3[index] - Q3[j*dimX+(i-1)];
-            }
-            if (j == 0) {
-                q2 = Q2[index];
-                q22 = Q3[index];
+                q3_x = Q3[index] - Q3[j*dimX+(i-1)];
             }
-            else  {
+            if (j != 0) {
                 q2 = Q2[index] - Q2[(j-1)*dimX+i];
-                q22 = Q3[index] - Q3[(j-1)*dimX+i];
+                q3_y = Q3[index] - Q3[(j-1)*dimX+i];
             }
-            div1 = q1 + q22;
-            div2 = q2 + q11;
+            div1 = q1 + q3_y;
+            div2 = q3_x + q2;
             V1[index] += tau*(P1[index] + div1);
             V2[index] += tau*(P2[index] + div2);
-		}
+	}
 	return;
 } 
 
-__global__ void copyIm_TGV_kernel(float *Input, float* Output, int N, int M, int num_total)
+__global__ void copyIm_TGV_kernel(float *U, float *U_old, int N, int M, int num_total)
 {
     int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
     int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
     
     int index = xIndex + N*yIndex;
     
-    if (index < num_total)	{
-        Output[index] = Input[index];
+    if (index < num_total)   {
+        U_old[index] = U[index];
+    }
+}
+
+__global__ void copyIm_TGV_kernel_ar2(float *V1, float *V2, float *V1_old, float *V2_old, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)   {
+        V1_old[index] = V1[index];
+        V2_old[index] = V2[index];
     }
 }
 
@@ -220,18 +220,276 @@ __global__ void newU_kernel(float *U, float *U_old, int N, int M, int num_total)
     int index = xIndex + N*yIndex;
     
     if (index < num_total)	{
-        U[index] = 2.0f*U[index] - U_old[index]; 
+        U[index] = 2.0f*U[index] - U_old[index];
+    }
+}
+
+
+__global__ void newU_kernel_ar2(float *V1, float *V2, float *V1_old, float *V2_old, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        V1[index] = 2.0f*V1[index] - V1_old[index];
+        V2[index] = 2.0f*V2[index] - V2_old[index];  
+    }
+}
+/********************************************************************/
+/***************************3D Functions*****************************/
+/********************************************************************/
+__global__ void DualP_3D_kernel(float *U, float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ, float sigma)
+{    
+	int index;
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
+	
+	    index = (dimX*dimY)*k + j*dimX+i;
+            /* symmetric boundary conditions (Neuman) */
+            if (i == dimX-1) P1[index] += sigma*((U[(dimX*dimY)*k + j*dimX+(i-1)] - U[index]) - V1[index]); 
+            else P1[index] += sigma*((U[(dimX*dimY)*k + j*dimX+(i+1)] - U[index])  - V1[index]); 
+            if (j == dimY-1) P2[index] += sigma*((U[(dimX*dimY)*k + (j-1)*dimX+i] - U[index])  - V2[index]);
+            else  P2[index] += sigma*((U[(dimX*dimY)*k + (j+1)*dimX+i] - U[index])  - V2[index]);
+            if (k == dimZ-1) P3[index] += sigma*((U[(dimX*dimY)*(k-1) + j*dimX+i] - U[index])  - V3[index]);
+            else  P3[index] += sigma*((U[(dimX*dimY)*(k+1) + j*dimX+i] - U[index])  - V3[index]);
+	}
+	return;
+} 
+
+__global__ void ProjP_3D_kernel(float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ, float alpha1)
+{
+   	float grad_magn;
+   	int index;
+   	
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {	
+	    index = (dimX*dimY)*k + j*dimX+i;
+            
+            grad_magn = (sqrtf(pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2)))/alpha1;
+            if (grad_magn > 1.0f) {
+                P1[index] /= grad_magn;
+                P2[index] /= grad_magn;
+                P3[index] /= grad_magn;
+            }
+	}
+	return;
+}
+
+__global__ void DualQ_3D_kernel(float *V1, float *V2, float *V3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, int dimX, int dimY, int dimZ, float sigma)
+{
+	int index; 
+        float q1, q2, q3, q11, q22, q33, q44, q55, q66;
+
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {	
+	    
+	    index = (dimX*dimY)*k + j*dimX+i;	    
+	    q1 = 0.0f; q11 = 0.0f; q33 = 0.0f; q2 = 0.0f; q22 = 0.0f; q55 = 0.0f; q3 = 0.0f; q44 = 0.0f; q66 = 0.0f;
+            /* symmetric boundary conditions (Neuman) */
+            if (i != dimX-1){ 
+                q1 = V1[(dimX*dimY)*k + j*dimX+(i+1)] - V1[index];              
+                q11 = V2[(dimX*dimY)*k + j*dimX+(i+1)] - V2[index];
+                q33 = V3[(dimX*dimY)*k + j*dimX+(i+1)] - V3[index];
+            }
+            if (j != dimY-1) {
+                q2 = V2[(dimX*dimY)*k + (j+1)*dimX+i] - V2[index];                
+                q22 = V1[(dimX*dimY)*k + (j+1)*dimX+i] - V1[index];
+                q55 = V3[(dimX*dimY)*k + (j+1)*dimX+i] - V3[index];
+            }
+            if (k != dimZ-1) {
+                q3 = V3[(dimX*dimY)*(k+1) + j*dimX+i] - V3[index];
+                q44 = V1[(dimX*dimY)*(k+1) + j*dimX+i] - V1[index];
+                q66 = V2[(dimX*dimY)*(k+1) + j*dimX+i] - V2[index];
+            }
+            
+            Q1[index] += sigma*(q1); /*Q11*/
+            Q2[index] += sigma*(q2); /*Q22*/            
+            Q3[index] += sigma*(q3); /*Q33*/
+            Q4[index] += sigma*(0.5f*(q11 + q22)); /* Q21 / Q12 */
+            Q5[index] += sigma*(0.5f*(q33 + q44)); /* Q31 / Q13 */
+            Q6[index] += sigma*(0.5f*(q55 + q66)); /* Q32 / Q23 */
+	}
+	return;
+}
+
+
+__global__ void ProjQ_3D_kernel(float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, int dimX, int dimY, int dimZ, float alpha0)
+{
+	float grad_magn;
+	int index;
+
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {	
+	    
+        index = (dimX*dimY)*k + j*dimX+i;	
+	
+	grad_magn = sqrtf(pow(Q1[index],2) + pow(Q2[index],2) + pow(Q3[index],2) + 2.0f*pow(Q4[index],2) + 2.0f*pow(Q5[index],2) + 2.0f*pow(Q6[index],2));
+            grad_magn = grad_magn/alpha0;
+            if (grad_magn > 1.0f) {
+                Q1[index] /= grad_magn;
+                Q2[index] /= grad_magn;
+                Q3[index] /= grad_magn;
+                Q4[index] /= grad_magn;
+                Q5[index] /= grad_magn;
+                Q6[index] /= grad_magn;
+            }
+	}
+	return;
+} 
+__global__ void DivProjP_3D_kernel(float *U, float *U0, float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ, float lambda, float tau)
+{
+	float P_v1, P_v2, P_v3, div;
+	int index;
+
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {	
+
+        index = (dimX*dimY)*k + j*dimX+i;	
+			
+        if (i == 0) P_v1 = P1[index];
+        else P_v1 = P1[index] - P1[(dimX*dimY)*k + j*dimX+(i-1)];
+        if (j == 0) P_v2 = P2[index];
+        else P_v2 = P2[index] - P2[(dimX*dimY)*k + (j-1)*dimX+i];
+        if (k == 0) P_v3 = P3[index];
+        else P_v3 = P3[index] - P3[(dimX*dimY)*(k-1) + (j)*dimX+i];              
+                      
+        div = P_v1 + P_v2 + P_v3;
+        U[index] = (lambda*(U[index] + tau*div) + tau*U0[index])/(lambda + tau);             
+	}
+	return;
+}
+__global__ void UpdV_3D_kernel(float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, int dimX, int dimY, int dimZ, float tau)
+{
+	float q1, q4x, q5x, q2, q4y, q6y, q6z, q5z, q3, div1, div2, div3;
+	int index;
+	
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {	
+
+        index = (dimX*dimY)*k + j*dimX+i;	
+        
+	q1 = 0.0f; q4x= 0.0f; q5x= 0.0f; q2= 0.0f; q4y= 0.0f; q6y= 0.0f; q6z= 0.0f; q5z= 0.0f; q3= 0.0f;
+        /* Q1 - Q11, Q2 - Q22, Q3 -  Q33, Q4 - Q21/Q12, Q5 - Q31/Q13, Q6 - Q32/Q23*/            
+        /* symmetric boundary conditions (Neuman) */
+        if (i != 0) {
+                q1 = Q1[index] - Q1[(dimX*dimY)*k + j*dimX+(i-1)];
+                q4x = Q4[index] - Q4[(dimX*dimY)*k + j*dimX+(i-1)];                
+                q5x = Q5[index] - Q5[(dimX*dimY)*k + j*dimX+(i-1)];
+        }
+       if (j != 0) {
+                q2 = Q2[index] - Q2[(dimX*dimY)*k + (j-1)*dimX+i];
+                q4y = Q4[index] - Q4[(dimX*dimY)*k + (j-1)*dimX+i];
+                q6y = Q6[index] - Q6[(dimX*dimY)*k + (j-1)*dimX+i];
+       }
+       if (k != 0) {
+                q6z = Q6[index] - Q6[(dimX*dimY)*(k-1) + (j)*dimX+i];
+                q5z = Q5[index] - Q5[(dimX*dimY)*(k-1) + (j)*dimX+i];
+                q3 = Q3[index] - Q3[(dimX*dimY)*(k-1) + (j)*dimX+i];
+       }
+       div1 = q1 + q4y + q5z;
+       div2 = q4x + q2 + q6z;            
+       div3 = q5x + q6y + q3;
+            
+        V1[index] += tau*(P1[index] + div1);
+        V2[index] += tau*(P2[index] + div2);
+        V3[index] += tau*(P3[index] + div3);
+	}
+	return;
+} 
+
+__global__ void copyIm_TGV_kernel3D(float *U, float *U_old, int dimX, int dimY, int dimZ, int num_total)
+{
+    int index;
+	
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;    
+    
+    index = (dimX*dimY)*k + j*dimX+i;
+    
+    if (index < num_total) {	
+      	U_old[index] = U[index];	
     }
 }
 
+__global__ void copyIm_TGV_kernel3D_ar3(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, float *V3_old, int dimX, int dimY, int dimZ, int num_total)
+{
+    int index;
+	
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;    
+    
+    index = (dimX*dimY)*k + j*dimX+i;
+    
+    if (index < num_total) {	
+      	V1_old[index] = V1[index];
+	V2_old[index] = V2[index];
+	V3_old[index] = V3[index];	
+    }
+}
+
+__global__ void newU_kernel3D(float *U, float *U_old, int dimX, int dimY, int dimZ, int num_total)
+{
+     int index;
+	
+     int i = blockDim.x * blockIdx.x + threadIdx.x;
+     int j = blockDim.y * blockIdx.y + threadIdx.y;
+     int k = blockDim.z * blockIdx.z + threadIdx.z;    
+         
+     index = (dimX*dimY)*k + j*dimX+i;
+    
+    if (index < num_total) {
+	   U[index] = 2.0f*U[index] - U_old[index];
+    }
+}  
+
+__global__ void newU_kernel3D_ar3(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, float *V3_old, int dimX, int dimY, int dimZ, int num_total)
+{
+     int index;
+	
+     int i = blockDim.x * blockIdx.x + threadIdx.x;
+     int j = blockDim.y * blockIdx.y + threadIdx.y;
+     int k = blockDim.z * blockIdx.z + threadIdx.z;    
+         
+     index = (dimX*dimY)*k + j*dimX+i;
+    
+    if (index < num_total) {
+	   V1[index] = 2.0f*V1[index] - V1_old[index];
+	   V2[index] = 2.0f*V2[index] - V2_old[index];
+	   V3[index] = 2.0f*V3[index] - V3_old[index];
+    }
+}  
+
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
-/********************* MAIN HOST FUNCTION ******************/
+/************************ MAIN HOST FUNCTION ***********************/
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
-extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY)
+extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ)
 {
-		int dimTotal, dev = 0;
-		CHECK(cudaSetDevice(dev));
-		dimTotal = dimX*dimY;
+	int dimTotal, dev = 0;
+	CHECK(cudaSetDevice(dev));
+	
+	dimTotal = dimX*dimY*dimZ;
        
         float *U_old, *d_U0, *d_U, *P1, *P2, *Q1, *Q2, *Q3, *V1, *V1_old, *V2, *V2_old, tau, sigma;
         tau = pow(L2,-0.5);
@@ -254,16 +512,17 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo
         CHECK(cudaMemcpy(d_U0,U0,dimTotal*sizeof(float),cudaMemcpyHostToDevice));
         CHECK(cudaMemcpy(d_U,U0,dimTotal*sizeof(float),cudaMemcpyHostToDevice));      
         
-	     /*2D case */
+        if (dimZ == 1) {
+	/*2D case */
         dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
         dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
              
         for(int n=0; n < iterationsNumb; n++) {
 			
-		    /* Calculate Dual Variable P */
+	    /* Calculate Dual Variable P */
             DualP_2D_kernel<<<dimGrid,dimBlock>>>(d_U, V1, V2, P1, P2, dimX, dimY, sigma);
-			CHECK(cudaDeviceSynchronize());
-			/*Projection onto convex set for P*/
+	    CHECK(cudaDeviceSynchronize());
+            /*Projection onto convex set for P*/
             ProjP_2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, alpha1);
             CHECK(cudaDeviceSynchronize());
             /* Calculate Dual Variable Q */
@@ -282,18 +541,72 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo
             newU_kernel<<<dimGrid,dimBlock>>>(d_U, U_old, dimX, dimY, dimTotal);
             CHECK(cudaDeviceSynchronize());
             /*saving V into V_old*/
-            copyIm_TGV_kernel<<<dimGrid,dimBlock>>>(V1, V1_old, dimX, dimY, dimTotal);
-            copyIm_TGV_kernel<<<dimGrid,dimBlock>>>(V2, V2_old, dimX, dimY, dimTotal);
+            copyIm_TGV_kernel_ar2<<<dimGrid,dimBlock>>>(V1, V2, V1_old, V2_old, dimX, dimY, dimTotal);
             CHECK(cudaDeviceSynchronize());
             /* upd V*/
             UpdV_2D_kernel<<<dimGrid,dimBlock>>>(V1, V2, P1, P2, Q1, Q2, Q3, dimX, dimY, tau);
             CHECK(cudaDeviceSynchronize());
             /*get new V*/
-            newU_kernel<<<dimGrid,dimBlock>>>(V1, V1_old, dimX, dimY, dimTotal);
-            newU_kernel<<<dimGrid,dimBlock>>>(V2, V2_old, dimX, dimY, dimTotal);
+            newU_kernel_ar2<<<dimGrid,dimBlock>>>(V1, V2, V1_old, V2_old, dimX, dimY, dimTotal);
             CHECK(cudaDeviceSynchronize());            
+	        }
         }
-
+        else {
+        /*3D case */
+        dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
+        dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKXSIZE));
+        
+        float *P3, *Q4, *Q5, *Q6, *V3, *V3_old;
+        
+	CHECK(cudaMalloc((void**)&P3,dimTotal*sizeof(float)));
+        CHECK(cudaMalloc((void**)&Q4,dimTotal*sizeof(float)));
+        CHECK(cudaMalloc((void**)&Q5,dimTotal*sizeof(float)));
+        CHECK(cudaMalloc((void**)&Q6,dimTotal*sizeof(float)));
+        CHECK(cudaMalloc((void**)&V3,dimTotal*sizeof(float)));
+        CHECK(cudaMalloc((void**)&V3_old,dimTotal*sizeof(float)));
+        
+        for(int n=0; n < iterationsNumb; n++) {
+			
+	    /* Calculate Dual Variable P */
+            DualP_3D_kernel<<<dimGrid,dimBlock>>>(d_U, V1, V2, V3, P1, P2, P3, dimX, dimY, dimZ, sigma);
+	    CHECK(cudaDeviceSynchronize());
+            /*Projection onto convex set for P*/
+            ProjP_3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, alpha1);
+            CHECK(cudaDeviceSynchronize());
+            /* Calculate Dual Variable Q */
+            DualQ_3D_kernel<<<dimGrid,dimBlock>>>(V1, V2, V3, Q1, Q2, Q3, Q4, Q5, Q6, dimX, dimY, dimZ, sigma);
+            CHECK(cudaDeviceSynchronize());
+             /*Projection onto convex set for Q*/
+            ProjQ_3D_kernel<<<dimGrid,dimBlock>>>(Q1, Q2, Q3, Q4, Q5, Q6, dimX, dimY, dimZ, alpha0);
+            CHECK(cudaDeviceSynchronize());
+            /*saving U into U_old*/
+            copyIm_TGV_kernel3D<<<dimGrid,dimBlock>>>(d_U, U_old, dimX, dimY, dimZ, dimTotal);
+            CHECK(cudaDeviceSynchronize());
+            /*adjoint operation  -> divergence and projection of P*/
+            DivProjP_3D_kernel<<<dimGrid,dimBlock>>>(d_U, d_U0, P1, P2, P3, dimX, dimY, dimZ, lambda, tau);
+            CHECK(cudaDeviceSynchronize());
+            /*get updated solution U*/
+            newU_kernel3D<<<dimGrid,dimBlock>>>(d_U, U_old, dimX, dimY, dimZ, dimTotal);
+            CHECK(cudaDeviceSynchronize());
+            /*saving V into V_old*/
+            copyIm_TGV_kernel3D_ar3<<<dimGrid,dimBlock>>>(V1, V2, V3, V1_old, V2_old, V3_old, dimX, dimY, dimZ, dimTotal);           
+            CHECK(cudaDeviceSynchronize());
+            /* upd V*/
+            UpdV_3D_kernel<<<dimGrid,dimBlock>>>(V1, V2, V3, P1, P2, P3, Q1, Q2, Q3, Q4, Q5, Q6, dimX, dimY, dimZ, tau);
+            CHECK(cudaDeviceSynchronize());
+            /*get new V*/
+            newU_kernel3D_ar3<<<dimGrid,dimBlock>>>(V1, V2, V3, V1_old, V2_old, V3_old, dimX, dimY, dimZ, dimTotal);
+            CHECK(cudaDeviceSynchronize());            
+	        }
+	        
+        CHECK(cudaFree(Q4));
+        CHECK(cudaFree(Q5));
+        CHECK(cudaFree(Q6));
+        CHECK(cudaFree(P3));
+        CHECK(cudaFree(V3));
+        CHECK(cudaFree(V3_old));	                
+        }
+        
         CHECK(cudaMemcpy(U,d_U,dimTotal*sizeof(float),cudaMemcpyDeviceToHost));
         CHECK(cudaFree(d_U0));
         CHECK(cudaFree(d_U));
diff --git a/Core/regularisers_GPU/TGV_GPU_core.h b/Core/regularisers_GPU/TGV_GPU_core.h
index 5a4eb76..9f73d1c 100644
--- a/Core/regularisers_GPU/TGV_GPU_core.h
+++ b/Core/regularisers_GPU/TGV_GPU_core.h
@@ -3,6 +3,6 @@
 #include "CCPiDefines.h"
 #include <stdio.h>
 
-extern "C" CCPI_EXPORT int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY);
+extern "C" CCPI_EXPORT int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ);
 
 #endif 
diff --git a/Readme.md b/Readme.md
index 1db50aa..ebd4d20 100644
--- a/Readme.md
+++ b/Readme.md
@@ -33,7 +33,7 @@
 1. Rudin-Osher-Fatemi (ROF) Total Variation (explicit PDE minimisation scheme) **2D/3D CPU/GPU** (Ref. *1*)
 2. Fast-Gradient-Projection (FGP) Total Variation **2D/3D CPU/GPU** (Ref. *2*)
 3. Split-Bregman (SB) Total Variation **2D/3D CPU/GPU** (Ref. *5*)
-4. Total Generalised Variation (TGV) model for higher-order regularisation **2D CPU/GPU** (Ref. *6*)
+4. Total Generalised Variation (TGV) model for higher-order regularisation **2D/3D CPU/GPU** (Ref. *6*)
 5. Linear and nonlinear diffusion (explicit PDE minimisation scheme) **2D/3D CPU/GPU** (Ref. *8*)
 6. Anisotropic Fourth-Order Diffusion (explicit PDE minimisation) **2D/3D CPU/GPU** (Ref. *9*)
 7. A joint ROF-LLT (Lysaker-Lundervold-Tai) model for higher-order regularisation **2D/3D CPU/GPU** (Ref. *10,11*)
@@ -93,7 +93,7 @@ conda install ccpi-regulariser -c ccpi -c conda-forge
 
 #### Python (conda-build)
 ```
-	export CIL_VERSION=0.10.3
+	export CIL_VERSION=0.10.4
 	conda build Wrappers/Python/conda-recipe --numpy 1.12 --python 3.5 
 	conda install ccpi-regulariser=${CIL_VERSION} --use-local --force
 	cd demos/
diff --git a/Wrappers/Python/conda-recipe/run_test.py b/Wrappers/Python/conda-recipe/run_test.py
index 37b9dcc..21f3216 100755
--- a/Wrappers/Python/conda-recipe/run_test.py
+++ b/Wrappers/Python/conda-recipe/run_test.py
@@ -303,7 +303,7 @@ class TestRegularisers(unittest.TestCase):
                 'input' : u0,\
                 'regularisation_parameter':0.04, \
                 'alpha1':1.0,\
-                'alpha0':0.7,\
+                'alpha0':2.0,\
                 'number_of_iterations' :250 ,\
                 'LipshitzConstant' :12 ,\
                 }
diff --git a/Wrappers/Python/demos/demo_cpu_regularisers.py b/Wrappers/Python/demos/demo_cpu_regularisers.py
index 859b633..e6befa9 100644
--- a/Wrappers/Python/demos/demo_cpu_regularisers.py
+++ b/Wrappers/Python/demos/demo_cpu_regularisers.py
@@ -225,8 +225,8 @@ pars = {'algorithm' : TGV, \
         'input' : u0,\
         'regularisation_parameter':0.04, \
         'alpha1':1.0,\
-        'alpha0':0.7,\
-        'number_of_iterations' :250 ,\
+        'alpha0':2.0,\
+        'number_of_iterations' :1350 ,\
         'LipshitzConstant' :12 ,\
         }
         
diff --git a/Wrappers/Python/demos/demo_cpu_regularisers3D.py b/Wrappers/Python/demos/demo_cpu_regularisers3D.py
index c42c37b..2d2fc22 100644
--- a/Wrappers/Python/demos/demo_cpu_regularisers3D.py
+++ b/Wrappers/Python/demos/demo_cpu_regularisers3D.py
@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import os
 import timeit
-from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, LLT_ROF, FGP_dTV, NDF, Diff4th
+from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, TGV, LLT_ROF, FGP_dTV, NDF, Diff4th
 from qualitymetrics import rmse
 ###############################################################################
 def printParametersToString(pars):
@@ -68,7 +68,7 @@ Im2[:,0:M] = Im[:,0:M]
 Im = Im2
 del Im2
 """
-slices = 20
+slices = 15
 
 noisyVol = np.zeros((slices,N,M),dtype='float32')
 noisyRef = np.zeros((slices,N,M),dtype='float32')
@@ -96,7 +96,7 @@ pars = {'algorithm': ROF_TV, \
         'input' : noisyVol,\
         'regularisation_parameter':0.04,\
         'number_of_iterations': 500,\
-        'time_marching_parameter': 0.0025        
+        'time_marching_parameter': 0.0025
         }
 print ("#############ROF TV CPU####################")
 start_time = timeit.default_timer()
@@ -262,6 +262,54 @@ a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
 imgplot = plt.imshow(lltrof_cpu3D[10,:,:], cmap="gray")
 plt.title('{}'.format('Recovered volume on the CPU using LLT-ROF'))
 
+#%%
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("_______________TGV (3D)_________________")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+
+## plot 
+fig = plt.figure()
+plt.suptitle('Performance of TGV regulariser using the CPU')
+a=fig.add_subplot(1,2,1)
+a.set_title('Noisy Image')
+imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray")
+
+# set parameters
+pars = {'algorithm' : TGV, \
+        'input' : noisyVol,\
+        'regularisation_parameter':0.04, \
+        'alpha1':1.0,\
+        'alpha0':2.0,\
+        'number_of_iterations' :250 ,\
+        'LipshitzConstant' :12 ,\
+        }
+
+print ("#############TGV CPU####################")
+start_time = timeit.default_timer()
+tgv_cpu3D = TGV(pars['input'], 
+              pars['regularisation_parameter'],
+              pars['alpha1'],
+              pars['alpha0'],
+              pars['number_of_iterations'],
+              pars['LipshitzConstant'],'cpu')
+             
+
+rms = rmse(idealVol, tgv_cpu3D)
+pars['rmse'] = rms
+
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,2,2)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(tgv_cpu3D[10,:,:], cmap="gray")
+plt.title('{}'.format('Recovered volume on the CPU using TGV'))
+
 #%%
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 print ("________________NDF (3D)___________________")
diff --git a/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py b/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
index 275e844..230a761 100644
--- a/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
+++ b/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
@@ -323,8 +323,8 @@ pars = {'algorithm' : TGV, \
         'input' : u0,\
         'regularisation_parameter':0.04, \
         'alpha1':1.0,\
-        'alpha0':0.7,\
-        'number_of_iterations' :250 ,\
+        'alpha0':2.0,\
+        'number_of_iterations' :400 ,\
         'LipshitzConstant' :12 ,\
         }
         
diff --git a/Wrappers/Python/demos/demo_gpu_regularisers.py b/Wrappers/Python/demos/demo_gpu_regularisers.py
index 9115494..e1c6575 100644
--- a/Wrappers/Python/demos/demo_gpu_regularisers.py
+++ b/Wrappers/Python/demos/demo_gpu_regularisers.py
@@ -223,8 +223,8 @@ pars = {'algorithm' : TGV, \
         'input' : u0,\
         'regularisation_parameter':0.04, \
         'alpha1':1.0,\
-        'alpha0':0.7,\
-        'number_of_iterations' :250 ,\
+        'alpha0':2.0,\
+        'number_of_iterations' :1250 ,\
         'LipshitzConstant' :12 ,\
         }
         
diff --git a/Wrappers/Python/demos/demo_gpu_regularisers3D.py b/Wrappers/Python/demos/demo_gpu_regularisers3D.py
index cda2847..b6058d2 100644
--- a/Wrappers/Python/demos/demo_gpu_regularisers3D.py
+++ b/Wrappers/Python/demos/demo_gpu_regularisers3D.py
@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import os
 import timeit
-from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, LLT_ROF, FGP_dTV, NDF, Diff4th
+from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, TGV, LLT_ROF, FGP_dTV, NDF, Diff4th
 from qualitymetrics import rmse
 ###############################################################################
 def printParametersToString(pars):
@@ -67,7 +67,7 @@ Im = Im2
 del Im2
 """
 
-#%%
+
 slices = 20
 
 filename = os.path.join(".." , ".." , ".." , "data" ,"lena_gray_512.tif")
@@ -266,6 +266,53 @@ a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
 imgplot = plt.imshow(lltrof_gpu3D[10,:,:], cmap="gray")
 plt.title('{}'.format('Recovered volume on the GPU using LLT-ROF'))
 
+#%%
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("_______________TGV (3D)_________________")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+
+## plot 
+fig = plt.figure()
+plt.suptitle('Performance of TGV regulariser using the GPU')
+a=fig.add_subplot(1,2,1)
+a.set_title('Noisy Image')
+imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray")
+
+# set parameters
+pars = {'algorithm' : TGV, \
+        'input' : noisyVol,\
+        'regularisation_parameter':0.04, \
+        'alpha1':1.0,\
+        'alpha0':2.0,\
+        'number_of_iterations' :600 ,\
+        'LipshitzConstant' :12 ,\
+        }
+
+print ("#############TGV GPU####################")
+start_time = timeit.default_timer()
+tgv_gpu3D = TGV(pars['input'], 
+              pars['regularisation_parameter'],
+              pars['alpha1'],
+              pars['alpha0'],
+              pars['number_of_iterations'],
+              pars['LipshitzConstant'],'gpu')
+             
+
+rms = rmse(idealVol, tgv_gpu3D)
+pars['rmse'] = rms
+
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,2,2)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(tgv_gpu3D[10,:,:], cmap="gray")
+plt.title('{}'.format('Recovered volume on the GPU using TGV'))
 #%%
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 print ("_______________NDF-TV (3D)_________________")
diff --git a/Wrappers/Python/src/cpu_regularisers.pyx b/Wrappers/Python/src/cpu_regularisers.pyx
index 7d57ed1..11a0617 100644
--- a/Wrappers/Python/src/cpu_regularisers.pyx
+++ b/Wrappers/Python/src/cpu_regularisers.pyx
@@ -22,7 +22,7 @@ cdef extern float TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar,
 cdef extern float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
 cdef extern float SB_TV_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ);
 cdef extern float LLT_ROF_CPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int dimX, int dimY, int dimZ);
-cdef extern float TGV_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY);
+cdef extern float TGV_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ);
 cdef extern float Diffusion_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int dimX, int dimY, int dimZ);
 cdef extern float Diffus4th_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ);
 cdef extern float TNV_CPU_main(float *Input, float *u, float lambdaPar, int maxIter, float tol, int dimX, int dimY, int dimZ);
@@ -202,12 +202,8 @@ def TGV_CPU(inputData, regularisation_parameter, alpha1, alpha0, iterations, Lip
         return TGV_2D(inputData, regularisation_parameter, alpha1, alpha0, 
                       iterations, LipshitzConst)
     elif inputData.ndim == 3:
-        shape = inputData.shape
-        out = inputData.copy()
-        for i in range(shape[0]):
-            out[i,:,:] = TGV_2D(inputData[i,:,:], regularisation_parameter, 
-               alpha1, alpha0, iterations, LipshitzConst)
-        return out
+        return TGV_3D(inputData, regularisation_parameter, alpha1, alpha0, 
+                      iterations, LipshitzConst)
 
 def TGV_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
                      float regularisation_parameter,
@@ -229,7 +225,30 @@ def TGV_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
                        alpha0,
                        iterationsNumb, 
                        LipshitzConst,
-                       dims[1],dims[0])                           
+                       dims[1],dims[0],1)
+    return outputData
+def TGV_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
+                     float regularisation_parameter,
+                     float alpha1,
+                     float alpha0,
+                     int iterationsNumb, 
+                     float LipshitzConst):
+                         
+    cdef long dims[3]
+    dims[0] = inputData.shape[0]
+    dims[1] = inputData.shape[1]
+    dims[2] = inputData.shape[2]
+    
+    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
+            np.zeros([dims[0], dims[1], dims[2]], dtype='float32')
+                   
+    #/* Run TGV iterations for 3D data */
+    TGV_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, 
+                       alpha1,
+                       alpha0,
+                       iterationsNumb, 
+                       LipshitzConst,
+                       dims[2], dims[1], dims[0])
     return outputData
 
 #***************************************************************#
diff --git a/Wrappers/Python/src/gpu_regularisers.pyx b/Wrappers/Python/src/gpu_regularisers.pyx
index 47a6149..b52f669 100644
--- a/Wrappers/Python/src/gpu_regularisers.pyx
+++ b/Wrappers/Python/src/gpu_regularisers.pyx
@@ -23,7 +23,7 @@ CUDAErrorMessage = 'CUDA error'
 cdef extern int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
 cdef extern int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
 cdef extern int TV_SB_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int printM, int N, int M, int Z);
-cdef extern int TGV_GPU_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY);
+cdef extern int TGV_GPU_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ);
 cdef extern int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z);
 cdef extern int NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z);
 cdef extern int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int printM, int N, int M, int Z);
@@ -102,12 +102,7 @@ def TGV_GPU(inputData, regularisation_parameter, alpha1, alpha0, iterations, Lip
     if inputData.ndim == 2:
         return TGV2D(inputData, regularisation_parameter, alpha1, alpha0, iterations, LipshitzConst)
     elif inputData.ndim == 3:
-        shape = inputData.shape
-        out = inputData.copy()
-        for i in range(shape[0]):
-            out[i,:,:] = TGV2D(inputData[i,:,:], regularisation_parameter, 
-               alpha1, alpha0, iterations, LipshitzConst)
-        return out
+        return TGV3D(inputData, regularisation_parameter, alpha1, alpha0, iterations, LipshitzConst)
 # Directional Total-variation Fast-Gradient-Projection (FGP)
 def dTV_FGP_GPU(inputData,
                      refdata,
@@ -393,7 +388,6 @@ def LLT_ROF_GPU3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
         raise ValueError(CUDAErrorMessage);
 
 
-
 #***************************************************************#
 #***************** Total Generalised Variation *****************#
 #***************************************************************#
@@ -417,11 +411,38 @@ def TGV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
                        alpha0,
                        iterationsNumb, 
                        LipshitzConst,
-                       dims[1],dims[0])==0):
+                       dims[1],dims[0], 1)==0):
         return outputData
     else:
         raise ValueError(CUDAErrorMessage);
 
+def TGV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
+                     float regularisation_parameter,
+                     float alpha1,
+                     float alpha0,
+                     int iterationsNumb, 
+                     float LipshitzConst):
+    
+    cdef long dims[3]
+    dims[0] = inputData.shape[0]
+    dims[1] = inputData.shape[1]
+    dims[2] = inputData.shape[2]
+
+    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
+		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
+          
+    # Running CUDA code here    
+    if (TGV_GPU_main(
+            &inputData[0,0,0], &outputData[0,0,0], regularisation_parameter,
+                       alpha1,
+                       alpha0,
+                       iterationsNumb, 
+                       LipshitzConst,
+                       dims[2], dims[1], dims[0])==0):
+        return outputData;
+    else:
+        raise ValueError(CUDAErrorMessage);
+
 
 #****************************************************************#
 #**************Directional Total-variation FGP ******************#
-- 
cgit v1.2.3