From 49423f47c7282672fea979cd122c063c0f5ee465 Mon Sep 17 00:00:00 2001
From: Daniil Kazantsev <dkazanc@hotmail.com>
Date: Wed, 21 Feb 2018 12:06:12 +0000
Subject: ROF_TV demo

---
 Wrappers/Python/test/test_gpu_regularizers.py | 44 +++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/Wrappers/Python/test/test_gpu_regularizers.py b/Wrappers/Python/test/test_gpu_regularizers.py
index 735a25d..3346fb2 100644
--- a/Wrappers/Python/test/test_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_gpu_regularizers.py
@@ -6,14 +6,12 @@ Created on Tue Jan 30 10:24:26 2018
 @author: ofn77899
 """
 
-
-
 import matplotlib.pyplot as plt
 import numpy as np
 import os    
 from enum import Enum
 import timeit
-from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML
+from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, ROF_TV_GPU
 ###############################################################################
 def printParametersToString(pars):
         txt = r''
@@ -146,3 +144,43 @@ imgplot = plt.imshow((nml - u0)**2, cmap="gray")
 
 plt.show()
         
+        
+## Rudin-Osher-Fatemi (ROF) TV regularization
+start_time = timeit.default_timer()
+
+pars = {
+        'input' : u0,
+        'regularization_parameter': 1,\
+        'time_marching_parameter': 0.003, \
+        'number_of_iterations':300        
+	}
+
+rof_tv = ROF_TV_GPU(pars['input'], 
+                     pars['number_of_iterations'], 
+                     pars['time_marching_parameter'], 
+                     pars['number_of_iterations'])
+                     
+rms = rmse(Im, rof_tv)
+pars['rmse'] = rms
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(2,4,1)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=12,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(nml, cmap="gray")
+
+a=fig.add_subplot(2,4,2)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
+# place a text box in upper left in axes coords
+a.text(0.05, 0.95, 'rof_tv - u0', transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow((rof_tv - u0)**2, cmap="gray")
+
+plt.show()
-- 
cgit v1.2.3


From 75917255b4f0b8aae2e6ce9492a75e0f749bfb3e Mon Sep 17 00:00:00 2001
From: Edoardo Pasca <edo.paskino@gmail.com>
Date: Wed, 21 Feb 2018 12:49:59 +0000
Subject: added TV_ROF

---
 Core/CMakeLists.txt                           | 3 ++-
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu    | 2 +-
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h     | 3 ++-
 Wrappers/Python/src/gpu_regularizers.pyx      | 8 ++++----
 Wrappers/Python/test/test_gpu_regularizers.py | 2 +-
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/Core/CMakeLists.txt b/Core/CMakeLists.txt
index 3202581..3e3f89e 100644
--- a/Core/CMakeLists.txt
+++ b/Core/CMakeLists.txt
@@ -88,7 +88,7 @@ add_library(cilreg SHARED
 	    ${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/PatchBased_Regul_core.c
 	    ${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/SplitBregman_TV_core.c
 	    ${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/TGV_PD_core.c
-		${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/ROF_TV_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/ROF_TV_core.c
 	    ${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/utils.c
 	    )
 target_link_libraries(cilreg ${EXTRA_LIBRARIES} )
@@ -129,6 +129,7 @@ if (CUDA_FOUND)
   CUDA_ADD_LIBRARY(cilregcuda SHARED
 	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/Diffus_HO/Diff4th_GPU_kernel.cu
 	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/NL_Regul/NLM_GPU_kernel.cu
+	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
   )
   if (UNIX)
     message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib")
diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
index 633bf63..73a52e1 100755
--- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
+++ b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
@@ -306,7 +306,7 @@ __host__ __device__ int sign (float x)
 
 /////////////////////////////////////////////////
 // HOST FUNCTION
-extern "C" void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda)
+extern "C" void TV_ROF_GPU_kernel(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda)
 {
 	    // set up device
 		int dev = 0;
diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h
index 3163482..2938d2f 100755
--- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h
+++ b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h
@@ -1,7 +1,8 @@
 #ifndef __TVGPU_H__
 #define __TVGPU_H__
 #include "CCPiDefines.h"
+#include <stdio.h>
 
-extern "C" CCPI_EXPORT void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda);
+extern "C" CCPI_EXPORT void TV_ROF_GPU_kernel(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda);
 
 #endif 
diff --git a/Wrappers/Python/src/gpu_regularizers.pyx b/Wrappers/Python/src/gpu_regularizers.pyx
index 5a5d274..fcb91cc 100644
--- a/Wrappers/Python/src/gpu_regularizers.pyx
+++ b/Wrappers/Python/src/gpu_regularizers.pyx
@@ -25,7 +25,7 @@ cdef extern void NLM_GPU_kernel(float *A, float* B, float *Eucl_Vec,
                                 int N, int M,  int Z, int dimension, 
                                 int SearchW, int SimilW, 
                                 int SearchW_real, float denh2, float lambdaf);
-cdef extern void TV_ROF_GPU(float* A, float* B, int N, int M, int Z, int iter, float tau, float lambdaf);
+cdef extern void TV_ROF_GPU_kernel(float* A, float* B, int N, int M, int Z, int iter, float tau, float lambdaf);
 cdef extern float pad_crop(float *A, float *Ap, 
                            int OldSizeX, int OldSizeY, int OldSizeZ, 
                            int NewSizeX, int NewSizeY, int NewSizeZ, 
@@ -67,7 +67,7 @@ def NML(inputData,
                      h,
                      lambdaf)
 
-def ROF_TV_GPU(inputData,
+def GPU_ROF_TV(inputData,
                      iterations, 
                      time_marching_parameter,
                      regularization_parameter):
@@ -343,7 +343,7 @@ def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1]], dtype='float32')
           
     # Running CUDA code here    
-    TV_ROF_GPU(            
+    TV_ROF_GPU_kernel(            
             &inputData[0,0], &B[0,0], 
                        dims[0], dims[1], 0, 
                        iterations , 
@@ -366,7 +366,7 @@ def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
           
     # Running CUDA code here    
-    TV_ROF_GPU(            
+    TV_ROF_GPU_kernel(            
             &inputData[0,0,0], &B[0,0,0], 
                        dims[0], dims[1], dims[2], 
                        iterations , 
diff --git a/Wrappers/Python/test/test_gpu_regularizers.py b/Wrappers/Python/test/test_gpu_regularizers.py
index 735a25d..75127a6 100644
--- a/Wrappers/Python/test/test_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_gpu_regularizers.py
@@ -13,7 +13,7 @@ import numpy as np
 import os    
 from enum import Enum
 import timeit
-from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML
+from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, GPU_ROF_TV
 ###############################################################################
 def printParametersToString(pars):
         txt = r''
-- 
cgit v1.2.3


From ce313c7dcf45edaf4c824690d8caa7df5df90120 Mon Sep 17 00:00:00 2001
From: Edoardo Pasca <edo.paskino@gmail.com>
Date: Wed, 21 Feb 2018 14:43:35 +0000
Subject: modified gpu regularizers build

---
 Wrappers/Python/setup-regularizers.py.in      |  6 ++++--
 Wrappers/Python/test/test_gpu_regularizers.py | 30 ++++++++++++++-------------
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/Wrappers/Python/setup-regularizers.py.in b/Wrappers/Python/setup-regularizers.py.in
index a125261..8655a2e 100644
--- a/Wrappers/Python/setup-regularizers.py.in
+++ b/Wrappers/Python/setup-regularizers.py.in
@@ -34,7 +34,9 @@ extra_libraries = ['cilreg']
 
 extra_include_dirs += [os.path.join(".." , ".." , "Core"),
                        os.path.join(".." , ".." , "Core",  "regularizers_CPU"),
-                       os.path.join(".." , ".." , "Core",  "regularizers_GPU") , 
+                       os.path.join(".." , ".." , "Core",  "regularizers_GPU" , "Diffus_HO" ) , 
+                       os.path.join(".." , ".." , "Core",  "regularizers_GPU" , "NL_Regul" ) , 
+                       os.path.join(".." , ".." , "Core",  "regularizers_GPU" , "TV_ROF" ) , 
 						   "."]
 
 if platform.system() == 'Windows':				   
@@ -81,4 +83,4 @@ setup(
 )
 
 
-@SETUP_GPU_WRAPPERS@
\ No newline at end of file
+@SETUP_GPU_WRAPPERS@
diff --git a/Wrappers/Python/test/test_gpu_regularizers.py b/Wrappers/Python/test/test_gpu_regularizers.py
index ff8af44..9e37627 100644
--- a/Wrappers/Python/test/test_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_gpu_regularizers.py
@@ -50,14 +50,15 @@ u0 = f(u0).astype('float32')
 ## plot 
 fig = plt.figure()
 
-a=fig.add_subplot(2,3,1)
+a=fig.add_subplot(2,4,1)
 a.set_title('noise')
 imgplot = plt.imshow(u0,cmap="gray")
 
 
 ## Diff4thHajiaboli
 start_time = timeit.default_timer()
-pars = {'algorithm' : Diff4thHajiaboli , \
+pars = {
+'algorithm' : Diff4thHajiaboli , \
         'input' : u0,
         'edge_preserv_parameter':0.1 , \
 'number_of_iterations' :250 ,\
@@ -76,7 +77,7 @@ pars['rmse'] = rms
 txtstr = printParametersToString(pars)
 txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
 print (txtstr)
-a=fig.add_subplot(2,3,2)
+a=fig.add_subplot(2,4,2)
 
 # these are matplotlib.patch.Patch properties
 props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
@@ -85,14 +86,14 @@ a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=12,
          verticalalignment='top', bbox=props)
 imgplot = plt.imshow(d4h, cmap="gray")
 
-a=fig.add_subplot(2,3,5)
+a=fig.add_subplot(2,4,6)
 
 # these are matplotlib.patch.Patch properties
 props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
 # place a text box in upper left in axes coords
 a.text(0.05, 0.95, 'd4h - u0', transform=a.transAxes, fontsize=12,
          verticalalignment='top', bbox=props)
-imgplot = plt.imshow((d4h - u0)**2, cmap="gray")
+imgplot = plt.imshow((d4h - u0), cmap="gray")
 
 
 ## Patch Based Regul NML
@@ -107,6 +108,7 @@ pars = {'algorithm' : NML , \
 }
 """
 pars = {
+'algorithm' : NML , \
         'input' : u0,
         'regularization_parameter': 0.01,\
         'searching_window_ratio':3, \
@@ -124,7 +126,7 @@ pars['rmse'] = rms
 txtstr = printParametersToString(pars)
 txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
 print (txtstr)
-a=fig.add_subplot(2,3,3)
+a=fig.add_subplot(2,4,3)
 
 # these are matplotlib.patch.Patch properties
 props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
@@ -133,22 +135,22 @@ a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=12,
          verticalalignment='top', bbox=props)
 imgplot = plt.imshow(nml, cmap="gray")
 
-a=fig.add_subplot(2,3,6)
+a=fig.add_subplot(2,4,7)
 
 # these are matplotlib.patch.Patch properties
 props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
 # place a text box in upper left in axes coords
 a.text(0.05, 0.95, 'nml - u0', transform=a.transAxes, fontsize=14,
          verticalalignment='top', bbox=props)
-imgplot = plt.imshow((nml - u0)**2, cmap="gray")
+imgplot = plt.imshow((nml - u0), cmap="gray")
 
-plt.show()
         
         
 ## Rudin-Osher-Fatemi (ROF) TV regularization
 start_time = timeit.default_timer()
 
 pars = {
+'algorithm' : GPU_ROF_TV , \
         'input' : u0,
         'regularization_parameter': 1,\
         'time_marching_parameter': 0.003, \
@@ -158,29 +160,29 @@ pars = {
 rof_tv = GPU_ROF_TV(pars['input'], 
                      pars['number_of_iterations'], 
                      pars['time_marching_parameter'], 
-                     pars['number_of_iterations'])
+                     pars['regularization_parameter'])
                      
 rms = rmse(Im, rof_tv)
 pars['rmse'] = rms
 txtstr = printParametersToString(pars)
 txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
 print (txtstr)
-a=fig.add_subplot(2,4,1)
+a=fig.add_subplot(2,4,4)
 
 # these are matplotlib.patch.Patch properties
 props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
 # place a text box in upper left in axes coords
 a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=12,
          verticalalignment='top', bbox=props)
-imgplot = plt.imshow(nml, cmap="gray")
+imgplot = plt.imshow(rof_tv, cmap="gray")
 
-a=fig.add_subplot(2,4,2)
+a=fig.add_subplot(2,4,8)
 
 # these are matplotlib.patch.Patch properties
 props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
 # place a text box in upper left in axes coords
 a.text(0.05, 0.95, 'rof_tv - u0', transform=a.transAxes, fontsize=14,
          verticalalignment='top', bbox=props)
-imgplot = plt.imshow((rof_tv - u0)**2, cmap="gray")
+imgplot = plt.imshow((rof_tv - u0), cmap="gray")
 
 plt.show()
-- 
cgit v1.2.3


From d4d12945f5396b10259fdeeb2b09f99b0e2c6afd Mon Sep 17 00:00:00 2001
From: algol <dkazanc@hotmail.com>
Date: Thu, 22 Feb 2018 12:34:39 +0000
Subject: GPU test fixed, cpu vs gpu test added for ROF-TV

---
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu         |   2 +-
 Wrappers/Python/demo/test_cpu_regularizers.py      |  10 +-
 Wrappers/Python/src/gpu_regularizers.pyx           |   2 +-
 .../Python/test/test_cpu_vs_gpu_regularizers.py    | 123 +++++++++++++++++++++
 Wrappers/Python/test/test_gpu_regularizers.py      |  16 +--
 5 files changed, 138 insertions(+), 15 deletions(-)
 create mode 100644 Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py

diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
index 73a52e1..897b5d0 100755
--- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
+++ b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
@@ -312,7 +312,7 @@ extern "C" void TV_ROF_GPU_kernel(float* Input, float* Output, int N, int M, int
 		int dev = 0;
 		CHECK(cudaSetDevice(dev));
 		
-        float *d_input, *d_update, *d_D1, *d_D2;
+        float *d_input, *d_update, *d_D1, *d_D2;        
         
         CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float)));
         CHECK(cudaMalloc((void**)&d_update,N*M*Z*sizeof(float)));
diff --git a/Wrappers/Python/demo/test_cpu_regularizers.py b/Wrappers/Python/demo/test_cpu_regularizers.py
index 5908c3c..d147b85 100644
--- a/Wrappers/Python/demo/test_cpu_regularizers.py
+++ b/Wrappers/Python/demo/test_cpu_regularizers.py
@@ -284,9 +284,9 @@ start_time = timeit.default_timer()
 
 pars = {'algorithm': ROF_TV , \
         'input' : u0,\
-        'regularization_parameter':1,\
-        'marching_step': 0.003,\
-        'number_of_iterations': 300
+        'regularization_parameter':25,\
+        'marching_step': 0.001,\
+        'number_of_iterations': 350
         }
 rof = ROF_TV(pars['input'],
              pars['number_of_iterations'],
@@ -307,9 +307,7 @@ props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
 # place a text box in upper left in axes coords
 a.text(0.05, 0.95, txtstr, transform=a.transAxes, fontsize=14,
          verticalalignment='top', bbox=props)
-imgplot = plt.imshow(tgv, cmap="gray")
-
-
+imgplot = plt.imshow(rof, cmap="gray")
 
 plt.show()
 
diff --git a/Wrappers/Python/src/gpu_regularizers.pyx b/Wrappers/Python/src/gpu_regularizers.pyx
index fcb91cc..c724471 100644
--- a/Wrappers/Python/src/gpu_regularizers.pyx
+++ b/Wrappers/Python/src/gpu_regularizers.pyx
@@ -345,7 +345,7 @@ def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
     # Running CUDA code here    
     TV_ROF_GPU_kernel(            
             &inputData[0,0], &B[0,0], 
-                       dims[0], dims[1], 0, 
+                       dims[0], dims[1], 1, 
                        iterations , 
                        time_marching_parameter, 
                        regularization_parameter);   
diff --git a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
new file mode 100644
index 0000000..8c91c73
--- /dev/null
+++ b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Feb 22 11:39:43 2018
+
+Testing CPU implementation against GPU one
+
+@author: algol
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+import os    
+import timeit
+from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, GPU_ROF_TV
+from ccpi.filters.cpu_regularizers_cython import ROF_TV
+###############################################################################
+def printParametersToString(pars):
+        txt = r''
+        for key, value in pars.items():
+            if key== 'algorithm' :
+                txt += "{0} = {1}".format(key, value.__name__)
+            elif key == 'input':
+                txt += "{0} = {1}".format(key, np.shape(value))
+            else:
+                txt += "{0} = {1}".format(key, value)
+            txt += '\n'
+        return txt
+###############################################################################
+def rmse(im1, im2):
+    a, b = im1.shape
+    rmse = np.sqrt(np.sum((im1 - im2) ** 2) / float(a * b))    
+    return rmse
+        
+filename = os.path.join(".." , ".." , ".." , "data" ,"lena_gray_512.tif")
+
+# read image
+Im = plt.imread(filename)                     
+Im = np.asarray(Im, dtype='float32')
+
+Im = Im/255
+perc = 0.075
+u0 = Im + np.random.normal(loc = Im ,
+                                  scale = perc * Im , 
+                                  size = np.shape(Im))
+# map the u0 u0->u0>0
+f = np.frompyfunc(lambda x: 0 if x < 0 else x, 1,1)
+u0 = f(u0).astype('float32')
+
+## plot 
+fig = plt.figure(1)
+plt.suptitle('Comparison of ROF-TV regularizer using CPU and GPU implementations')
+a=fig.add_subplot(1,4,1)
+a.set_title('Noisy Image')
+imgplot = plt.imshow(u0,cmap="gray")
+
+
+# set parameters
+pars = {'algorithm': ROF_TV , \
+        'input' : u0,\
+        'regularization_parameter':12,\
+        'time_marching_parameter': 0.001,\
+        'number_of_iterations': 600
+        }
+print ("#################ROF TV CPU#####################")
+start_time = timeit.default_timer()
+rof_cpu = ROF_TV(pars['input'],
+             pars['number_of_iterations'],
+             pars['regularization_parameter'],
+             pars['time_marching_parameter'] 
+             )
+#tgv = out
+rms = rmse(Im, rof_cpu)
+pars['rmse'] = rms
+
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,4,2)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(rof_cpu, cmap="gray")
+plt.title('{}'.format('CPU results'))
+
+
+print ("#################ROF TV GPU#####################")
+start_time = timeit.default_timer()
+rof_gpu = GPU_ROF_TV(pars['input'], 
+                     pars['number_of_iterations'], 
+                     pars['time_marching_parameter'], 
+                     pars['regularization_parameter'])
+                     
+rms = rmse(Im, rof_gpu)
+pars['rmse'] = rms
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,4,3)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(rof_gpu, cmap="gray")
+plt.title('{}'.format('GPU results'))
+
+
+print ("--------Compare the results--------")
+tolerance = 1e-06
+diff_im = abs(rof_cpu - rof_gpu)
+diff_im[diff_im > tolerance] = 1
+a=fig.add_subplot(1,4,4)
+imgplot = plt.imshow(diff_im, vmin=0, vmax=1, cmap="gray")
+plt.title('{}'.format('Pixels larger threshold difference'))
+if (diff_im.sum() > 1):
+    print ("Arrays do not match!")
+else:
+    print ("Arrays match")
\ No newline at end of file
diff --git a/Wrappers/Python/test/test_gpu_regularizers.py b/Wrappers/Python/test/test_gpu_regularizers.py
index 9e37627..29f5bad 100644
--- a/Wrappers/Python/test/test_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_gpu_regularizers.py
@@ -93,7 +93,8 @@ props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
 # place a text box in upper left in axes coords
 a.text(0.05, 0.95, 'd4h - u0', transform=a.transAxes, fontsize=12,
          verticalalignment='top', bbox=props)
-imgplot = plt.imshow((d4h - u0), cmap="gray")
+imgplot = plt.imshow((d4h - u0)**2,  vmin=0, vmax=0.03, cmap="gray")
+plt.colorbar(ticks=[0, 0.03], orientation='vertical')
 
 
 ## Patch Based Regul NML
@@ -142,7 +143,8 @@ props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
 # place a text box in upper left in axes coords
 a.text(0.05, 0.95, 'nml - u0', transform=a.transAxes, fontsize=14,
          verticalalignment='top', bbox=props)
-imgplot = plt.imshow((nml - u0), cmap="gray")
+imgplot = plt.imshow((nml - u0)**2,  vmin=0, vmax=0.03, cmap="gray")
+plt.colorbar(ticks=[0, 0.03], orientation='vertical')
 
         
@@ -152,9 +154,9 @@ start_time = timeit.default_timer()
 pars = {
 'algorithm' : GPU_ROF_TV , \
         'input' : u0,
-        'regularization_parameter': 1,\
-        'time_marching_parameter': 0.003, \
-        'number_of_iterations':300        
+        'regularization_parameter': 25,\
+        'time_marching_parameter': 0.001, \
+        'number_of_iterations':350        
 	}
 
 rof_tv = GPU_ROF_TV(pars['input'], 
@@ -183,6 +185,6 @@ props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
 # place a text box in upper left in axes coords
 a.text(0.05, 0.95, 'rof_tv - u0', transform=a.transAxes, fontsize=14,
          verticalalignment='top', bbox=props)
-imgplot = plt.imshow((rof_tv - u0), cmap="gray")
-
+imgplot = plt.imshow((rof_tv - u0)**2, vmin=0, vmax=0.03, cmap="gray")
+plt.colorbar(ticks=[0, 0.03], orientation='vertical')
 plt.show()
-- 
cgit v1.2.3


From 0ebf1f949b7150692e356627c455e905b642af97 Mon Sep 17 00:00:00 2001
From: Daniil Kazantsev <dkazanc@hotmail.com>
Date: Sun, 25 Feb 2018 17:54:03 +0000
Subject: updates to CPU/GPU core for ROF and demos

---
 Core/regularizers_CPU/ROF_TV_core.c                  |  9 ++++-----
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu           | 19 +++++++++----------
 Wrappers/Python/demo/test_cpu_regularizers.py        |  6 +++---
 Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py |  6 +++---
 Wrappers/Python/test/test_gpu_regularizers.py        |  6 +++---
 5 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/Core/regularizers_CPU/ROF_TV_core.c b/Core/regularizers_CPU/ROF_TV_core.c
index ba7fe48..fd47c3f 100644
--- a/Core/regularizers_CPU/ROF_TV_core.c
+++ b/Core/regularizers_CPU/ROF_TV_core.c
@@ -268,7 +268,7 @@ float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambd
                     dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2];
                     dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i];
                     
-                    B[index] = B[index] + tau*lambda*(dv1 + dv2 + dv3) + tau*(A[index] - B[index]);
+                    B[index] = B[index] + tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (B[index] - A[index]));   
                 }}}		
     }
     else {
@@ -284,10 +284,9 @@ float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambd
                 
                 /* divergence components  */
                 dv1 = D1[index] - D1[j2*dimX + i];
-                dv2 = D2[index] - D2[j*dimX + i2];
-                
-                //B[index] =  B[index] + tau*lambda*(dv1 + dv2) + tau*(A[index] - B[index]);
-                B[index] =  B[index] + tau*((dv1 + dv2) - lambda*(B[index] - A[index]));                
+                dv2 = D2[index] - D2[j*dimX + i2];                
+
+                B[index] =  B[index] + tau*(2.0f*lambda*(dv1 + dv2) - (B[index] - A[index]));                
             }}
     }
     return *B;
diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
index 897b5d0..b67b53b 100755
--- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
+++ b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
@@ -54,7 +54,7 @@ limitations under the License.
     
 #define BLKXSIZE2D 16
 #define BLKYSIZE2D 16
-#define EPS 1.0e-4
+#define EPS 1.0e-5
     
 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
 
@@ -71,7 +71,7 @@ __host__ __device__ int sign (float x)
     /* differences 1 */
     __global__ void D1_func2D(float* Input, float* D1, int N, int M)      
     {
-		int i1, j1, i2, j2;
+		int i1, j1, i2;
 		float NOMx_1,NOMy_1,NOMy_0,denom1,denom2,T1;
 		int i = blockDim.x * blockIdx.x + threadIdx.x;
         int j = blockDim.y * blockIdx.y + threadIdx.y;
@@ -84,7 +84,6 @@ __host__ __device__ int sign (float x)
                 i1 = i + 1; if (i1 >= N) i1 = i-1;
                 i2 = i - 1; if (i2 < 0) i2 = i+1;
                 j1 = j + 1; if (j1 >= M) j1 = j-1;
-                j2 = j - 1; if (j2 < 0) j2 = j+1; 
 		
 		     /* Forward-backward differences */
                 NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */
@@ -102,7 +101,7 @@ __host__ __device__ int sign (float x)
     /* differences 2 */
     __global__ void D2_func2D(float* Input, float* D2, int N, int M)      
     {
-		int i1, j1, i2, j2;
+		int i1, j1, j2;
 		float NOMx_1,NOMy_1,NOMx_0,denom1,denom2,T2;
 		int i = blockDim.x * blockIdx.x + threadIdx.x;
         int j = blockDim.y * blockIdx.y + threadIdx.y;
@@ -113,7 +112,6 @@ __host__ __device__ int sign (float x)
             
             /* boundary conditions (Neumann reflections) */
                 i1 = i + 1; if (i1 >= N) i1 = i-1;
-                i2 = i - 1; if (i2 < 0) i2 = i+1;
                 j1 = j + 1; if (j1 >= M) j1 = j-1;
                 j2 = j - 1; if (j2 < 0) j2 = j+1; 
 		
@@ -149,7 +147,7 @@ __host__ __device__ int sign (float x)
                 dv1 = D1[index] - D1[j2*N + i];
                 dv2 = D2[index] - D2[j*N + i2];                                
                 
-                Update[index] =  Update[index] + tau*((dv1 + dv2) - lambda*(Update[index] - Input[index]));      
+                Update[index] =  Update[index] + tau*(2.0f*lambda*(dv1 + dv2) - (Update[index] - Input[index]));      
 		
 		}  
 	}   
@@ -299,21 +297,22 @@ __host__ __device__ int sign (float x)
                     dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2];
                     dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i];
                     
-                    Update[index] = Update[index] + tau*lambda*(dv1 + dv2 + dv3) + tau*(Update[index] - Input[index]);   
+                    Update[index] = Update[index] + tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index]));
 		
 		}  
 	}
 
 /////////////////////////////////////////////////
 // HOST FUNCTION
-extern "C" void TV_ROF_GPU_kernel(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda)
+extern "C" void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda)
 {
 	    // set up device
 		int dev = 0;
 		CHECK(cudaSetDevice(dev));
 		
-        float *d_input, *d_update, *d_D1, *d_D2;        
+        float *d_input, *d_update, *d_D1, *d_D2;
         
+	if (Z == 0) Z = 1;
         CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float)));
         CHECK(cudaMalloc((void**)&d_update,N*M*Z*sizeof(float)));
         CHECK(cudaMalloc((void**)&d_D1,N*M*Z*sizeof(float)));
@@ -346,7 +345,7 @@ extern "C" void TV_ROF_GPU_kernel(float* Input, float* Output, int N, int M, int
             CHECK(cudaFree(d_D3));         
         }
         else {
-			// TV - 2D case
+	    // TV - 2D case
             dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
             dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D));
              
diff --git a/Wrappers/Python/demo/test_cpu_regularizers.py b/Wrappers/Python/demo/test_cpu_regularizers.py
index d147b85..b08c418 100644
--- a/Wrappers/Python/demo/test_cpu_regularizers.py
+++ b/Wrappers/Python/demo/test_cpu_regularizers.py
@@ -284,9 +284,9 @@ start_time = timeit.default_timer()
 
 pars = {'algorithm': ROF_TV , \
         'input' : u0,\
-        'regularization_parameter':25,\
-        'marching_step': 0.001,\
-        'number_of_iterations': 350
+        'regularization_parameter':0.04,\
+        'marching_step': 0.0025,\
+        'number_of_iterations': 300
         }
 rof = ROF_TV(pars['input'],
              pars['number_of_iterations'],
diff --git a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
index 8c91c73..d742a7f 100644
--- a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
@@ -58,8 +58,8 @@ imgplot = plt.imshow(u0,cmap="gray")
 # set parameters
 pars = {'algorithm': ROF_TV , \
         'input' : u0,\
-        'regularization_parameter':12,\
-        'time_marching_parameter': 0.001,\
+        'regularization_parameter':0.04,\
+        'time_marching_parameter': 0.0025,\
         'number_of_iterations': 600
         }
 print ("#################ROF TV CPU#####################")
@@ -120,4 +120,4 @@ plt.title('{}'.format('Pixels larger threshold difference'))
 if (diff_im.sum() > 1):
     print ("Arrays do not match!")
 else:
-    print ("Arrays match")
\ No newline at end of file
+    print ("Arrays match")
diff --git a/Wrappers/Python/test/test_gpu_regularizers.py b/Wrappers/Python/test/test_gpu_regularizers.py
index 29f5bad..c473270 100644
--- a/Wrappers/Python/test/test_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_gpu_regularizers.py
@@ -154,9 +154,9 @@ start_time = timeit.default_timer()
 pars = {
 'algorithm' : GPU_ROF_TV , \
         'input' : u0,
-        'regularization_parameter': 25,\
-        'time_marching_parameter': 0.001, \
-        'number_of_iterations':350        
+        'regularization_parameter': 0.04,\
+        'time_marching_parameter': 0.0025, \
+        'number_of_iterations':300        
 	}
 
 rof_tv = GPU_ROF_TV(pars['input'], 
-- 
cgit v1.2.3


From b9562bbfdf6ca2cc8775e242ae13cbfaeeb2479b Mon Sep 17 00:00:00 2001
From: Daniil Kazantsev <dkazanc@hotmail.com>
Date: Mon, 26 Feb 2018 11:16:25 +0000
Subject: FGP CPU parameters passing simplified, FGP GPU added

---
 Core/regularizers_CPU/FGP_TV_core.c             | 481 +++++++++++---------
 Core/regularizers_CPU/FGP_TV_core.h             |  51 +--
 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu | 561 ++++++++++++++++++++++++
 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h  |  10 +
 4 files changed, 867 insertions(+), 236 deletions(-)
 create mode 100755 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu
 create mode 100755 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h

diff --git a/Core/regularizers_CPU/FGP_TV_core.c b/Core/regularizers_CPU/FGP_TV_core.c
index 03cd445..304848d 100644
--- a/Core/regularizers_CPU/FGP_TV_core.c
+++ b/Core/regularizers_CPU/FGP_TV_core.c
@@ -22,245 +22,314 @@ limitations under the License.
 /* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
  *
  * Input Parameters:
- * 1. Noisy image/volume [REQUIRED]
- * 2. lambda - regularization parameter [REQUIRED]
- * 3. Number of iterations [OPTIONAL parameter]
- * 4. eplsilon: tolerance constant [OPTIONAL parameter]
- * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter]
+ * 1. Noisy image/volume 
+ * 2. lambda - regularization parameter 
+ * 3. Number of iterations
+ * 4. eplsilon: tolerance constant 
+ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
+ * 6. nonneg: 'nonnegativity (0 is OFF by default) 
+ * 7. print information: 0 (off) or 1 (on) 
  *
  * Output:
  * [1] Filtered/regularized image
- * [2] last function value 
- *
- * Example of image denoising:
- * figure;
- * Im = double(imread('lena_gray_256.tif'))/255;  % loading image
- * u0 = Im + .05*randn(size(Im)); % adding noise
- * u = FGP_TV(single(u0), 0.05, 100, 1e-04);
  *
  * This function is based on the Matlab's code and paper by
  * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
- *
- * D. Kazantsev, 2016-17
- *
  */
-
-/* 2D-case related Functions */
-/*****************************************************************/
-float Obj_func_CALC2D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY)
-{   
-    int i,j;
-    float f1, f2, val1, val2;
-    
-    /*data-related term */
-    f1 = 0.0f;
-    for(i=0; i<dimX*dimY; i++) f1 += pow(D[i] - A[i],2);    
-    
-    /*TV-related term */
-    f2 = 0.0f;
-    for(i=0; i<dimX; i++) {
-        for(j=0; j<dimY; j++) {
-            /* boundary conditions  */
-            if (i == dimX-1) {val1 = 0.0f;} else {val1 = A[(i+1)*dimY + (j)] - A[(i)*dimY + (j)];}
-            if (j == dimY-1) {val2 = 0.0f;} else {val2 = A[(i)*dimY + (j+1)] - A[(i)*dimY + (j)];}    
-            f2 += sqrt(pow(val1,2) + pow(val2,2));
-        }}  
-    
-    /* sum of two terms */
-    funcvalA[0] = 0.5f*f1 + lambda*f2;     
-    return *funcvalA;
+ 
+float FGP_TV_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+{
+	int ll, j, DimTotal;
+	float re, re1;
+	float tk = 1.0f;
+    float tkp1=1.0f;
+    int count = 0;
+	
+	if (dimZ <= 1) {
+		/*2D case */				
+		float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;		
+		DimTotal = dimX*dimY;
+		
+		Output_prev = (float *) malloc( DimTotal * sizeof(float) );
+		P1 = (float *) malloc( DimTotal * sizeof(float) );
+		P2 = (float *) malloc( DimTotal * sizeof(float) );
+		P1_prev = (float *) malloc( DimTotal * sizeof(float) );
+		P2_prev = (float *) malloc( DimTotal * sizeof(float) );
+		R1 = (float *) malloc( DimTotal * sizeof(float) );
+		R2 = (float *) malloc( DimTotal * sizeof(float) );
+		
+		/* begin iterations */
+        for(ll=0; ll<iter; ll++) {
+            
+            /* computing the gradient of the objective function */
+            Obj_func2D(Input, Output, R1, R2, lambda, dimX, dimY);
+            
+            /* apply nonnegativity */
+            if (nonneg == 1) for(j=0; j<dimX*dimY; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;}            
+            
+            /*Taking a step towards minus of the gradient*/
+            Grad_func2D(P1, P2, Output, R1, R2, lambda, dimX, dimY);
+            
+            /* projection step */
+            Proj_func2D(P1, P2, methodTV, dimX, dimY);
+            
+            /*updating R and t*/
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            Rupd_func2D(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, dimX, dimY);
+            
+            /* check early stopping criteria */
+            re = 0.0f; re1 = 0.0f;
+            for(j=0; j<dimX*dimY; j++)
+            {
+                re += pow(Output[j] - Output_prev[j],2);
+                re1 += pow(Output[j],2);
+            }
+            re = sqrt(re)/sqrt(re1);
+            if (re < epsil)  count++;
+				if (count > 4) break;				            
+            
+            /*storing old values*/
+            copyIm(Output, Output_prev, dimX, dimY, 1);
+            copyIm(P1, P1_prev, dimX, dimY, 1);
+            copyIm(P2, P2_prev, dimX, dimY, 1);
+            tk = tkp1;
+        }
+        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", ll);   
+		free(Output_prev); free(P1); free(P2); free(P1_prev); free(P2_prev); free(R1); free(R2);		
+	}
+	else {
+		/*3D case*/
+		float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL;		
+		DimTotal = dimX*dimY*dimZ;
+		
+		Output_prev = (float *) malloc( DimTotal * sizeof(float) );
+		P1 = (float *) malloc( DimTotal * sizeof(float) );
+		P2 = (float *) malloc( DimTotal * sizeof(float) );
+		P3 = (float *) malloc( DimTotal * sizeof(float) );
+		P1_prev = (float *) malloc( DimTotal * sizeof(float) );
+		P2_prev = (float *) malloc( DimTotal * sizeof(float) );
+		P3_prev = (float *) malloc( DimTotal * sizeof(float) );
+		R1 = (float *) malloc( DimTotal * sizeof(float) );
+		R2 = (float *) malloc( DimTotal * sizeof(float) );
+		R3 = (float *) malloc( DimTotal * sizeof(float) );
+		
+		    /* begin iterations */
+        for(ll=0; ll<iter; ll++) {
+            
+            /* computing the gradient of the objective function */
+            Obj_func3D(Input, Output, R1, R2, R3, lambda, dimX, dimY, dimZ);
+            
+            /* apply nonnegativity */
+            if (nonneg == 1) for(j=0; j<dimX*dimY*dimZ; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;}  
+            
+            /*Taking a step towards minus of the gradient*/
+            Grad_func3D(P1, P2, P3, Output, R1, R2, R3, lambda, dimX, dimY, dimZ);
+            
+            /* projection step */
+            Proj_func3D(P1, P2, P3, methodTV, dimX, dimY, dimZ);
+            
+            /*updating R and t*/
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            Rupd_func3D(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, dimX, dimY, dimZ);
+            
+            /* calculate norm - stopping rules*/
+            re = 0.0f; re1 = 0.0f;
+            for(j=0; j<dimX*dimY*dimZ; j++)
+            {
+                re += pow(Output[j] - Output_prev[j],2);
+                re1 += pow(Output[j],2);
+            }
+            re = sqrt(re)/sqrt(re1);
+            /* stop if the norm residual is less than the tolerance EPS */
+            if (re < epsil)  count++;
+            if (count > 4) break;            
+                        
+            /*storing old values*/
+            copyIm(Output, Output_prev, dimX, dimY, dimZ);
+            copyIm(P1, P1_prev, dimX, dimY, dimZ);
+            copyIm(P2, P2_prev, dimX, dimY, dimZ);
+            copyIm(P3, P3_prev, dimX, dimY, dimZ);
+            tk = tkp1;            
+        }	
+		if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", ll);   
+		free(Output_prev); free(P1); free(P2); free(P3); free(P1_prev); free(P2_prev); free(P3_prev); free(R1); free(R2); free(R3);
+	}
+	return *Output;
 }
 
 float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dimX, int dimY)
 {
-	float val1, val2;
-	int i, j;
-#pragma omp parallel for shared(A,D,R1,R2) private(i,j,val1,val2)
-	for (i = 0; i<dimX; i++) {
-		for (j = 0; j<dimY; j++) {
-			/* boundary conditions  */
-			if (i == 0) { val1 = 0.0f; }
-			else { val1 = R1[(i - 1)*dimY + (j)]; }
-			if (j == 0) { val2 = 0.0f; }
-			else { val2 = R2[(i)*dimY + (j - 1)]; }
-			D[(i)*dimY + (j)] = A[(i)*dimY + (j)] - lambda*(R1[(i)*dimY + (j)] + R2[(i)*dimY + (j)] - val1 - val2);
-		}
-	}
-	return *D;
+    float val1, val2;
+    int i,j,index;
+#pragma omp parallel for shared(A,D,R1,R2) private(index,i,j,val1,val2)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+			index = j*dimX+i;
+            /* boundary conditions  */
+            if (i == 0) {val1 = 0.0f;} else {val1 = R1[j*dimX + (i-1)];}
+            if (j == 0) {val2 = 0.0f;} else {val2 = R2[(j-1)*dimX + i];}
+            D[index] = A[index] - lambda*(R1[index] + R2[index] - val1 - val2);
+        }}
+    return *D;
 }
 float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, int dimX, int dimY)
 {
-	float val1, val2, multip;
-	int i, j;
-	multip = (1.0f / (8.0f*lambda));
-#pragma omp parallel for shared(P1,P2,D,R1,R2,multip) private(i,j,val1,val2)
-	for (i = 0; i<dimX; i++) {
-		for (j = 0; j<dimY; j++) {
-			/* boundary conditions */
-			if (i == dimX - 1) val1 = 0.0f; else val1 = D[(i)*dimY + (j)] - D[(i + 1)*dimY + (j)];
-			if (j == dimY - 1) val2 = 0.0f; else val2 = D[(i)*dimY + (j)] - D[(i)*dimY + (j + 1)];
-			P1[(i)*dimY + (j)] = R1[(i)*dimY + (j)] + multip*val1;
-			P2[(i)*dimY + (j)] = R2[(i)*dimY + (j)] + multip*val2;
-		}
-	}
-	return 1;
+    float val1, val2, multip;
+    int i,j,index;
+    multip = (1.0f/(8.0f*lambda));
+#pragma omp parallel for shared(P1,P2,D,R1,R2,multip) private(index,i,j,val1,val2)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+			index = j*dimX+i;
+            /* boundary conditions */
+            if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[j*dimX + (i+1)];
+            if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(j+1)*dimX + i];
+            P1[index] = R1[index] + multip*val1;
+            P2[index] = R2[index] + multip*val2;
+        }}
+    return 1;
 }
 float Proj_func2D(float *P1, float *P2, int methTV, int dimX, int dimY)
 {
-	float val1, val2, denom;
-	int i, j;
-	if (methTV == 0) {
-		/* isotropic TV*/
-#pragma omp parallel for shared(P1,P2) private(i,j,denom)
-		for (i = 0; i<dimX; i++) {
-			for (j = 0; j<dimY; j++) {
-				denom = pow(P1[(i)*dimY + (j)], 2) + pow(P2[(i)*dimY + (j)], 2);
-				if (denom > 1) {
-					P1[(i)*dimY + (j)] = P1[(i)*dimY + (j)] / sqrt(denom);
-					P2[(i)*dimY + (j)] = P2[(i)*dimY + (j)] / sqrt(denom);
-				}
-			}
-		}
-	}
-	else {
-		/* anisotropic TV*/
-#pragma omp parallel for shared(P1,P2) private(i,j,val1,val2)
-		for (i = 0; i<dimX; i++) {
-			for (j = 0; j<dimY; j++) {
-				val1 = fabs(P1[(i)*dimY + (j)]);
-				val2 = fabs(P2[(i)*dimY + (j)]);
-				if (val1 < 1.0f) { val1 = 1.0f; }
-				if (val2 < 1.0f) { val2 = 1.0f; }
-				P1[(i)*dimY + (j)] = P1[(i)*dimY + (j)] / val1;
-				P2[(i)*dimY + (j)] = P2[(i)*dimY + (j)] / val2;
-			}
-		}
-	}
-	return 1;
+    float val1, val2, denom, sq_denom;
+    int i,j,index;
+    if (methTV == 0) {
+        /* isotropic TV*/
+#pragma omp parallel for shared(P1,P2) private(index,i,j,denom,sq_denom)
+        for(i=0; i<dimX; i++) {
+            for(j=0; j<dimY; j++) {
+				index = j*dimX+i;
+                denom = pow(P1[index],2) +  pow(P2[index],2);
+                if (denom > 1.0f) {
+					sq_denom = 1.0f/sqrt(denom);
+                    P1[index] = P1[index]*sq_denom;
+                    P2[index] = P2[index]*sq_denom;
+                }
+            }}
+    }
+    else {
+        /* anisotropic TV*/
+#pragma omp parallel for shared(P1,P2) private(index,i,j,val1,val2)
+        for(i=0; i<dimX; i++) {
+            for(j=0; j<dimY; j++) {
+				index = j*dimX+i;
+                val1 = fabs(P1[index]);
+                val2 = fabs(P2[index]);
+                if (val1 < 1.0f) {val1 = 1.0f;}
+                if (val2 < 1.0f) {val2 = 1.0f;}
+                P1[index] = P1[index]/val1;
+                P2[index] = P2[index]/val2;
+            }}
+    }
+    return 1;
 }
 float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, int dimX, int dimY)
 {
-	int i, j;
-	float multip;
-	multip = ((tk - 1.0f) / tkp1);
-#pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(i,j)
-	for (i = 0; i<dimX; i++) {
-		for (j = 0; j<dimY; j++) {
-			R1[(i)*dimY + (j)] = P1[(i)*dimY + (j)] + multip*(P1[(i)*dimY + (j)] - P1_old[(i)*dimY + (j)]);
-			R2[(i)*dimY + (j)] = P2[(i)*dimY + (j)] + multip*(P2[(i)*dimY + (j)] - P2_old[(i)*dimY + (j)]);
-		}
-	}
-	return 1;
+    int i,j,index;
+    float multip;
+    multip = ((tk-1.0f)/tkp1);
+#pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(index,i,j)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+			index = j*dimX+i;
+            R1[index] = P1[index] + multip*(P1[index] - P1_old[index]);
+            R2[index] = P2[index] + multip*(P2[index] - P2_old[index]);
+        }}
+    return 1;
 }
 
 /* 3D-case related Functions */
 /*****************************************************************/
-float Obj_func_CALC3D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY, int dimZ)
-{   
-    int i,j,k;
-    float f1, f2, val1, val2, val3;
-    
-    /*data-related term */
-    f1 = 0.0f;
-    for(i=0; i<dimX*dimY*dimZ; i++) f1 += pow(D[i] - A[i],2);    
-    
-    /*TV-related term */
-    f2 = 0.0f;
+float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ)
+{
+    float val1, val2, val3;
+    int i,j,k,index;
+#pragma omp parallel for shared(A,D,R1,R2,R3) private(index,i,j,k,val1,val2,val3)
     for(i=0; i<dimX; i++) {
         for(j=0; j<dimY; j++) {
             for(k=0; k<dimZ; k++) {
-            /* boundary conditions  */
-            if (i == dimX-1) {val1 = 0.0f;} else {val1 = A[(dimX*dimY)*k + (i+1)*dimY + (j)] - A[(dimX*dimY)*k + (i)*dimY + (j)];}
-            if (j == dimY-1) {val2 = 0.0f;} else {val2 = A[(dimX*dimY)*k + (i)*dimY + (j+1)] - A[(dimX*dimY)*k + (i)*dimY + (j)];}    
-            if (k == dimZ-1) {val3 = 0.0f;} else {val3 = A[(dimX*dimY)*(k+1) + (i)*dimY + (j)] - A[(dimX*dimY)*k + (i)*dimY + (j)];}    
-            f2 += sqrt(pow(val1,2) + pow(val2,2)  + pow(val3,2));
-        }}}     
-    /* sum of two terms */
-    funcvalA[0] = 0.5f*f1 + lambda*f2;     
-    return *funcvalA;
-}
-
-float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ)
-{
-	float val1, val2, val3;
-	int i, j, k;
-#pragma omp parallel for shared(A,D,R1,R2,R3) private(i,j,k,val1,val2,val3)
-	for (i = 0; i<dimX; i++) {
-		for (j = 0; j<dimY; j++) {
-			for (k = 0; k<dimZ; k++) {
-				/* boundary conditions */
-				if (i == 0) { val1 = 0.0f; }
-				else { val1 = R1[(dimX*dimY)*k + (i - 1)*dimY + (j)]; }
-				if (j == 0) { val2 = 0.0f; }
-				else { val2 = R2[(dimX*dimY)*k + (i)*dimY + (j - 1)]; }
-				if (k == 0) { val3 = 0.0f; }
-				else { val3 = R3[(dimX*dimY)*(k - 1) + (i)*dimY + (j)]; }
-				D[(dimX*dimY)*k + (i)*dimY + (j)] = A[(dimX*dimY)*k + (i)*dimY + (j)] - lambda*(R1[(dimX*dimY)*k + (i)*dimY + (j)] + R2[(dimX*dimY)*k + (i)*dimY + (j)] + R3[(dimX*dimY)*k + (i)*dimY + (j)] - val1 - val2 - val3);
-			}
-		}
-	}
-	return *D;
+				index = (dimX*dimY)*k + j*dimX+i;	
+                /* boundary conditions */
+                if (i == 0) {val1 = 0.0f;} else {val1 = R1[(dimX*dimY)*k + j*dimX + (i-1)];}
+                if (j == 0) {val2 = 0.0f;} else {val2 = R2[(dimX*dimY)*k + (j-1)*dimX + i];}
+                if (k == 0) {val3 = 0.0f;} else {val3 = R3[(dimX*dimY)*(k-1) + j*dimX + i];}
+                D[index] = A[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3);
+            }}}
+    return *D;
 }
 float Grad_func3D(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ)
 {
-	float val1, val2, val3, multip;
-	int i, j, k;
-	multip = (1.0f / (8.0f*lambda));
-#pragma omp parallel for shared(P1,P2,P3,D,R1,R2,R3,multip) private(i,j,k,val1,val2,val3)
-	for (i = 0; i<dimX; i++) {
-		for (j = 0; j<dimY; j++) {
-			for (k = 0; k<dimZ; k++) {
-				/* boundary conditions */
-				if (i == dimX - 1) val1 = 0.0f; else val1 = D[(dimX*dimY)*k + (i)*dimY + (j)] - D[(dimX*dimY)*k + (i + 1)*dimY + (j)];
-				if (j == dimY - 1) val2 = 0.0f; else val2 = D[(dimX*dimY)*k + (i)*dimY + (j)] - D[(dimX*dimY)*k + (i)*dimY + (j + 1)];
-				if (k == dimZ - 1) val3 = 0.0f; else val3 = D[(dimX*dimY)*k + (i)*dimY + (j)] - D[(dimX*dimY)*(k + 1) + (i)*dimY + (j)];
-				P1[(dimX*dimY)*k + (i)*dimY + (j)] = R1[(dimX*dimY)*k + (i)*dimY + (j)] + multip*val1;
-				P2[(dimX*dimY)*k + (i)*dimY + (j)] = R2[(dimX*dimY)*k + (i)*dimY + (j)] + multip*val2;
-				P3[(dimX*dimY)*k + (i)*dimY + (j)] = R3[(dimX*dimY)*k + (i)*dimY + (j)] + multip*val3;
-			}
-		}
-	}
-	return 1;
+    float val1, val2, val3, multip;
+    int i,j,k, index;
+    multip = (1.0f/(8.0f*lambda));
+#pragma omp parallel for shared(P1,P2,P3,D,R1,R2,R3,multip) private(index,i,j,k,val1,val2,val3)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+            for(k=0; k<dimZ; k++) {
+				index = (dimX*dimY)*k + j*dimX+i;				
+                /* boundary conditions */
+                if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[(dimX*dimY)*k + j*dimX + (i+1)];
+                if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(dimX*dimY)*k + (j+1)*dimX + i];
+                if (k == dimZ-1) val3 = 0.0f; else val3 = D[index] - D[(dimX*dimY)*(k+1) + j*dimX + i];
+                P1[index] = R1[index] + multip*val1;
+                P2[index] = R2[index] + multip*val2;
+                P3[index] = R3[index] + multip*val3;
+            }}}
+    return 1;
 }
-float Proj_func3D(float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ)
-{
-	float val1, val2, val3;
-	int i, j, k;
-#pragma omp parallel for shared(P1,P2,P3) private(i,j,k,val1,val2,val3)
-	for (i = 0; i<dimX; i++) {
-		for (j = 0; j<dimY; j++) {
-			for (k = 0; k<dimZ; k++) {
-				val1 = fabs(P1[(dimX*dimY)*k + (i)*dimY + (j)]);
-				val2 = fabs(P2[(dimX*dimY)*k + (i)*dimY + (j)]);
-				val3 = fabs(P3[(dimX*dimY)*k + (i)*dimY + (j)]);
-				if (val1 < 1.0f) { val1 = 1.0f; }
-				if (val2 < 1.0f) { val2 = 1.0f; }
-				if (val3 < 1.0f) { val3 = 1.0f; }
-
-				P1[(dimX*dimY)*k + (i)*dimY + (j)] = P1[(dimX*dimY)*k + (i)*dimY + (j)] / val1;
-				P2[(dimX*dimY)*k + (i)*dimY + (j)] = P2[(dimX*dimY)*k + (i)*dimY + (j)] / val2;
-				P3[(dimX*dimY)*k + (i)*dimY + (j)] = P3[(dimX*dimY)*k + (i)*dimY + (j)] / val3;
-			}
+float Proj_func3D(float *P1, float *P2, float *P3, int methTV, int dimX, int dimY, int dimZ)
+{		
+    float val1, val2, val3, denom, sq_denom;
+    int i,j,k, index;
+    if (methTV == 0) {
+	/* isotropic TV*/
+	#pragma omp parallel for shared(P1,P2,P3) private(index,i,j,k,val1,val2,val3,sq_denom)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+            for(k=0; k<dimZ; k++) {
+				index = (dimX*dimY)*k + j*dimX+i;				
+				denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2);				
+                if (denom > 1.0f) {
+					sq_denom = 1.0f/sqrt(denom);
+                    P1[index] = P1[index]*sq_denom;
+                    P2[index] = P2[index]*sq_denom;
+                    P3[index] = P3[index]*sq_denom;
+                }				
+			}}}	
+	}    
+    else {
+    /* anisotropic TV*/
+#pragma omp parallel for shared(P1,P2,P3) private(index,i,j,k,val1,val2,val3)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+            for(k=0; k<dimZ; k++) {
+				index = (dimX*dimY)*k + j*dimX+i;		
+                val1 = fabs(P1[index]);
+                val2 = fabs(P2[index]);
+                val3 = fabs(P3[index]);
+                if (val1 < 1.0f) {val1 = 1.0f;}
+                if (val2 < 1.0f) {val2 = 1.0f;}
+                if (val3 < 1.0f) {val3 = 1.0f;}                
+                P1[index] = P1[index]/val1;
+                P2[index] = P2[index]/val2;
+                P3[index] = P3[index]/val3;
+            }}}
 		}
-	}
-	return 1;
+    return 1;
 }
 float Rupd_func3D(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, int dimX, int dimY, int dimZ)
 {
-	int i, j, k;
-	float multip;
-	multip = ((tk - 1.0f) / tkp1);
-#pragma omp parallel for shared(P1,P2,P3,P1_old,P2_old,P3_old,R1,R2,R3,multip) private(i,j,k)
-	for (i = 0; i<dimX; i++) {
-		for (j = 0; j<dimY; j++) {
-			for (k = 0; k<dimZ; k++) {
-				R1[(dimX*dimY)*k + (i)*dimY + (j)] = P1[(dimX*dimY)*k + (i)*dimY + (j)] + multip*(P1[(dimX*dimY)*k + (i)*dimY + (j)] - P1_old[(dimX*dimY)*k + (i)*dimY + (j)]);
-				R2[(dimX*dimY)*k + (i)*dimY + (j)] = P2[(dimX*dimY)*k + (i)*dimY + (j)] + multip*(P2[(dimX*dimY)*k + (i)*dimY + (j)] - P2_old[(dimX*dimY)*k + (i)*dimY + (j)]);
-				R3[(dimX*dimY)*k + (i)*dimY + (j)] = P3[(dimX*dimY)*k + (i)*dimY + (j)] + multip*(P3[(dimX*dimY)*k + (i)*dimY + (j)] - P3_old[(dimX*dimY)*k + (i)*dimY + (j)]);
-			}
-		}
-	}
-	return 1;
+    int i,j,k,index;
+    float multip;
+    multip = ((tk-1.0f)/tkp1);
+#pragma omp parallel for shared(P1,P2,P3,P1_old,P2_old,P3_old,R1,R2,R3,multip) private(index,i,j,k)
+    for(i=0; i<dimX; i++) {
+        for(j=0; j<dimY; j++) {
+            for(k=0; k<dimZ; k++) {
+				index = (dimX*dimY)*k + j*dimX+i;	
+                R1[index] = P1[index] + multip*(P1[index] - P1_old[index]);
+                R2[index] = P2[index] + multip*(P2[index] - P2_old[index]);
+                R3[index] = P3[index] + multip*(P3[index] - P3_old[index]);
+            }}}
+    return 1;
 }
-
-
diff --git a/Core/regularizers_CPU/FGP_TV_core.h b/Core/regularizers_CPU/FGP_TV_core.h
index 10ea224..b591819 100644
--- a/Core/regularizers_CPU/FGP_TV_core.h
+++ b/Core/regularizers_CPU/FGP_TV_core.h
@@ -27,46 +27,37 @@ limitations under the License.
 #include "CCPiDefines.h"
 
 /* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
-*
-* Input Parameters:
-* 1. Noisy image/volume [REQUIRED]
-* 2. lambda - regularization parameter [REQUIRED]
-* 3. Number of iterations [OPTIONAL parameter]
-* 4. eplsilon: tolerance constant [OPTIONAL parameter]
-* 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter]
-*
-* Output:
-* [1] Filtered/regularized image
-* [2] last function value
-*
-* Example of image denoising:
-* figure;
-* Im = double(imread('lena_gray_256.tif'))/255;  % loading image
-* u0 = Im + .05*randn(size(Im)); % adding noise
-* u = FGP_TV(single(u0), 0.05, 100, 1e-04);
-*
-* to compile with OMP support: mex FGP_TV.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
-* This function is based on the Matlab's code and paper by
-* [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
-*
-* D. Kazantsev, 2016-17
-*
-*/
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume 
+ * 2. lambda - regularization parameter 
+ * 3. Number of iterations
+ * 4. eplsilon: tolerance constant 
+ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
+ * 6. nonneg: 'nonnegativity (0 is OFF by default) 
+ * 7. print information: 0 (off) or 1 (on) 
+ *
+ * Output:
+ * [1] Filtered/regularized image
+ *
+ * This function is based on the Matlab's code and paper by
+ * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
+ */
+ 
 #ifdef __cplusplus
 extern "C" {
 #endif
-//float copyIm(float *A, float *B, int dimX, int dimY, int dimZ);
+CCPI_EXPORT float FGP_TV_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
+
 CCPI_EXPORT float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dimX, int dimY);
 CCPI_EXPORT float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, int dimX, int dimY);
 CCPI_EXPORT float Proj_func2D(float *P1, float *P2, int methTV, int dimX, int dimY);
 CCPI_EXPORT float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, int dimX, int dimY);
-CCPI_EXPORT float Obj_func_CALC2D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY);
 
 CCPI_EXPORT float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ);
 CCPI_EXPORT float Grad_func3D(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ);
-CCPI_EXPORT float Proj_func3D(float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ);
+CCPI_EXPORT float Proj_func3D(float *P1, float *P2, float *P3, int methTV, int dimX, int dimY, int dimZ);
 CCPI_EXPORT float Rupd_func3D(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, int dimX, int dimY, int dimZ);
-CCPI_EXPORT float Obj_func_CALC3D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY, int dimZ);
 #ifdef __cplusplus
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu
new file mode 100755
index 0000000..21a95c9
--- /dev/null
+++ b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu
@@ -0,0 +1,561 @@
+ /*
+This work is part of the Core Imaging Library developed by
+Visual Analytics and Imaging System Group of the Science Technology
+Facilities Council, STFC
+
+Copyright 2017 Daniil Kazantsev
+Copyright 2017 Srikanth Nagella, Edoardo Pasca
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ 
+
+#include "FGP_TV_GPU_core.h"
+#include <thrust/device_vector.h>
+#include <thrust/transform_reduce.h>
+
+/* CUDA implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume 
+ * 2. lambda - regularization parameter 
+ * 3. Number of iterations
+ * 4. eplsilon: tolerance constant 
+ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
+ * 6. nonneg: 'nonnegativity (0 is OFF by default) 
+ * 7. print information: 0 (off) or 1 (on) 
+ *
+ * Output:
+ * [1] Filtered/regularized image
+ *
+ * This function is based on the Matlab's code and paper by
+ * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
+ */
+
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)
+
+inline void __checkCudaErrors(cudaError err, const char *file, const int line)
+{
+    if (cudaSuccess != err)
+    {
+        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
+                file, line, (int)err, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define BLKXSIZE2D 16
+#define BLKYSIZE2D 16
+
+#define BLKXSIZE 8
+#define BLKYSIZE 8
+#define BLKZSIZE 8
+
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+struct square { __host__ __device__ float operator()(float x) { return x * x; } };
+
+/************************************************/
+/*****************2D modules*********************/
+/************************************************/
+__global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda)
+{
+    
+    float val1,val2;
+    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex; 
+    
+    if ((xIndex < N) && (yIndex < M)) {        
+        if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];}
+        if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];}
+        //Write final result to global memory
+        D[index] = Ad[index] - lambda*(R1[index] + R2[index] - val1 - val2);
+    }
+    return;
+}
+
+__global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, int N, int M, int ImSize, float multip)
+{
+    
+    float val1,val2;
+    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) {        
+        
+        /* boundary conditions */
+        if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex];
+        if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)];
+        
+        //Write final result to global memory
+        P1[index] = R1[index] + multip*val1;
+        P2[index] = R2[index] + multip*val2;
+    }
+    return;
+}
+
+__global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize)
+{
+    
+    float denom;    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) { 
+        denom = pow(P1[index],2) +  pow(P2[index],2);        
+        if (denom > 1.0f) {
+            P1[index] = P1[index]/sqrt(denom);
+            P2[index] = P2[index]/sqrt(denom);
+        }
+    }
+    return;
+}
+__global__ void Proj_func2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize)
+{
+    
+    float val1, val2;    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) { 
+                val1 = abs(P1[index]);
+                val2 = abs(P2[index]);
+                if (val1 < 1.0f) {val1 = 1.0f;}
+                if (val2 < 1.0f) {val2 = 1.0f;}
+                P1[index] = P1[index]/val1;
+                P2[index] = P2[index]/val2;
+    }
+    return;
+}
+__global__ void Rupd_func2D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, float multip2, int N, int M, int ImSize)
+{
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) { 
+        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
+        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
+    }
+    return;
+}
+__global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        if (Output[index] < 0.0f) Output[index] = 0.0f;
+    }
+}
+__global__ void copy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        Output[index] = Input[index];
+    }
+}
+__global__ void ResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        Output[index] = Input1[index] - Input2[index];
+    }
+}   
+/************************************************/
+/*****************3D modules*********************/
+/************************************************/
+__global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda)
+{
+    
+    float val1,val2,val3;
+    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k < Z)) {      
+        if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];}
+        if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];}
+        if (k <= 0) {val3 = 0.0f;} else {val3 = R3[(N*M)*(k-1) + i + N*j];}
+        //Write final result to global memory
+        D[index] = Ad[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3);
+    }
+    return;
+}
+
+__global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float multip)
+{
+    
+    float val1,val2,val3;
+    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) {       
+        /* boundary conditions */
+        if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j];
+        if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)];
+        if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j];
+        
+        //Write final result to global memory
+        P1[index] = R1[index] + multip*val1;
+        P2[index] = R2[index] + multip*val2;
+        P3[index] = R3[index] + multip*val3;
+    }
+    return;
+}
+
+__global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
+{
+    
+    float denom,sq_denom;    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) {
+        denom = pow(P1[index],2) +  pow(P2[index],2) + pow(P3[index],2);
+        
+        if (denom > 1.0f) {
+            sq_denom = 1.0f/sqrt(denom);
+            P1[index] = P1[index]*sq_denom;
+            P2[index] = P2[index]*sq_denom;
+            P3[index] = P3[index]*sq_denom;
+        }
+    }
+    return;
+}
+
+__global__ void Proj_func3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
+{
+    
+    float val1, val2, val3;    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) {
+                val1 = abs(P1[index]);
+                val2 = abs(P2[index]);
+                val3 = abs(P3[index]);
+                if (val1 < 1.0f) {val1 = 1.0f;}
+                if (val2 < 1.0f) {val2 = 1.0f;}
+                if (val3 < 1.0f) {val3 = 1.0f;}
+                P1[index] = P1[index]/val1;
+                P2[index] = P2[index]/val2;
+                P3[index] = P3[index]/val3;
+    }
+    return;
+}
+
+
+__global__ void Rupd_func3D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, float multip2, int N, int M, int Z, int ImSize)
+{
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) { 
+        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
+        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
+        R3[index] = P3[index] + multip2*(P3[index] - P3_old[index]);
+    }
+    return;
+}
+
+__global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_total)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if (index < num_total)	{
+        if (Output[index] < 0.0f) Output[index] = 0.0f;
+    }
+}
+
+__global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total)
+{
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if (index < num_total)	{
+        Output[index] = Input[index];
+    }
+}
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
+
+////////////MAIN HOST FUNCTION ///////////////
+extern "C" void FGP_TV_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+{
+    int deviceCount = -1; // number of devices
+    cudaGetDeviceCount(&deviceCount);
+    if (deviceCount == 0) {
+        fprintf(stderr, "No CUDA devices found\n");
+        return;
+    }
+    
+    int count = 0, i;
+    float re, multip,multip2;    
+	float tk = 1.0f;
+    float tkp1=1.0f;
+        
+    if (dimZ <= 1) {
+		/*2D verson*/
+		int ImSize = dimX*dimY;    
+		float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
+   
+		dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+		dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
+    
+		/*allocate space for images on device*/
+		checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
+		if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
+    
+        checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+        cudaMemset(P1, 0, ImSize*sizeof(float));
+        cudaMemset(P2, 0, ImSize*sizeof(float));
+        cudaMemset(P1_prev, 0, ImSize*sizeof(float));
+        cudaMemset(P2_prev, 0, ImSize*sizeof(float));
+        cudaMemset(R1, 0, ImSize*sizeof(float));
+        cudaMemset(R2, 0, ImSize*sizeof(float));
+
+        /********************** Run CUDA 2D kernel here ********************/    
+        multip = (1.0f/(8.0f*lambda));
+    
+        /* The main kernel */
+        for (i = 0; i < iter; i++) {
+        
+            /* computing the gradient of the objective function */
+            Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambda);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+            
+            if (nonneg != 0) {
+            nonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() ); }
+                    
+            /*Taking a step towards minus of the gradient*/
+            Grad_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, d_update, R1, R2, dimX, dimY, ImSize, multip);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            /* projection step */
+            if (methodTV == 0) Proj_func2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/
+            else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/            
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            multip2 = ((tk-1.0f)/tkp1);
+        
+            Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            if (epsil != 0.0f) {
+                /* calculate norm - stopping rules using the Thrust library */					
+                ResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors(cudaPeekAtLastError() );               
+                
+                thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize);  		
+                float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));		
+                thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);  		
+                float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+                    
+                re = (reduction/reduction2);      
+                if (re < epsil)  count++;
+                    if (count > 4) break;       
+             
+                copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors(cudaPeekAtLastError() );                                              
+            }                  
+        
+            copy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            copy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );       
+ 
+            tk = tkp1;
+        }
+        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
+            /***************************************************************/    
+            //copy result matrix from device to host memory
+            cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
+    
+            cudaFree(d_input);
+            cudaFree(d_update);
+            if (epsil != 0.0f) cudaFree(d_update_prev);
+            cudaFree(P1);
+            cudaFree(P2);
+            cudaFree(P1_prev);
+            cudaFree(P2_prev);
+            cudaFree(R1);
+            cudaFree(R2);
+    }
+    else {
+            /*3D verson*/
+            int ImSize = dimX*dimY*dimZ;    
+            float *d_input, *d_update=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL;
+   
+            dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
+            dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE));
+    
+            /*allocate space for images on device*/
+            checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );            
+            checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P3_prev,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&R3,ImSize*sizeof(float)) );
+    
+            checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+            cudaMemset(P1, 0, ImSize*sizeof(float));
+            cudaMemset(P2, 0, ImSize*sizeof(float));
+            cudaMemset(P3, 0, ImSize*sizeof(float));
+            cudaMemset(P1_prev, 0, ImSize*sizeof(float));
+            cudaMemset(P2_prev, 0, ImSize*sizeof(float));
+            cudaMemset(P3_prev, 0, ImSize*sizeof(float));
+            cudaMemset(R1, 0, ImSize*sizeof(float));
+            cudaMemset(R2, 0, ImSize*sizeof(float));
+            cudaMemset(R3, 0, ImSize*sizeof(float));
+            /********************** Run CUDA 3D kernel here ********************/    
+            multip = (1.0f/(8.0f*lambda));
+    
+            /* The main kernel */
+        for (i = 0; i < iter; i++) {
+        
+            /* computing the gradient of the objective function */
+            Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambda);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            if (nonneg != 0) {
+            nonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() ); }
+            
+            /*Taking a step towards minus of the gradient*/
+            Grad_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, multip);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            /* projection step */
+            if (methodTV == 0) Proj_func3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */
+            else Proj_func3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            multip2 = ((tk-1.0f)/tkp1);
+        
+            Rupd_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );           
+        
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );   
+            
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );      
+ 
+            tk = tkp1;
+        }
+        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
+            /***************************************************************/    
+            //copy result matrix from device to host memory
+            cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
+    
+            cudaFree(d_input);
+            cudaFree(d_update);            
+            cudaFree(P1);
+            cudaFree(P2);
+            cudaFree(P3);
+            cudaFree(P1_prev);
+            cudaFree(P2_prev);
+            cudaFree(P3_prev);
+            cudaFree(R1);
+            cudaFree(R2);        
+            cudaFree(R3);        
+    } 
+    cudaDeviceReset(); 
+}
diff --git a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h
new file mode 100755
index 0000000..a5d3f73
--- /dev/null
+++ b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h
@@ -0,0 +1,10 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+#ifndef _FGP_TV_GPU_
+#define _FGP_TV_GPU_
+
+extern "C" void FGP_TV_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);   
+
+#endif 
-- 
cgit v1.2.3


From bcdf186ccdca54a3df383512ad5a117004500a60 Mon Sep 17 00:00:00 2001
From: Daniil Kazantsev <dkazanc@hotmail.com>
Date: Mon, 26 Feb 2018 12:50:58 +0000
Subject: CPU-GPU naming consistency

---
 Core/regularizers_CPU/FGP_TV_core.c                |   2 +-
 Core/regularizers_CPU/FGP_TV_core.h                |   2 +-
 Core/regularizers_CPU/ROF_TV_core.c                |   2 +-
 Core/regularizers_CPU/ROF_TV_core.h                |   2 +-
 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu    | 561 ---------------------
 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h     |  10 -
 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu    | 561 +++++++++++++++++++++
 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h     |  10 +
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu         | 369 --------------
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h          |   8 -
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu    | 369 ++++++++++++++
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h     |   8 +
 Wrappers/Python/src/cpu_regularizers.cpp           | 546 ++++++++++----------
 Wrappers/Python/src/gpu_regularizers.pyx           |  69 ++-
 Wrappers/Python/test/test_cpu_vs_gpu.py            |  10 +
 .../Python/test/test_cpu_vs_gpu_regularizers.py    |   8 +-
 16 files changed, 1305 insertions(+), 1232 deletions(-)
 delete mode 100755 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu
 delete mode 100755 Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h
 create mode 100755 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
 create mode 100755 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
 delete mode 100755 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
 delete mode 100755 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h
 create mode 100755 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu
 create mode 100755 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h
 create mode 100644 Wrappers/Python/test/test_cpu_vs_gpu.py

diff --git a/Core/regularizers_CPU/FGP_TV_core.c b/Core/regularizers_CPU/FGP_TV_core.c
index 304848d..2f1439d 100644
--- a/Core/regularizers_CPU/FGP_TV_core.c
+++ b/Core/regularizers_CPU/FGP_TV_core.c
@@ -37,7 +37,7 @@ limitations under the License.
  * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
  */
  
-float FGP_TV_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+float TV_FGP_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
 {
 	int ll, j, DimTotal;
 	float re, re1;
diff --git a/Core/regularizers_CPU/FGP_TV_core.h b/Core/regularizers_CPU/FGP_TV_core.h
index b591819..98ceaec 100644
--- a/Core/regularizers_CPU/FGP_TV_core.h
+++ b/Core/regularizers_CPU/FGP_TV_core.h
@@ -47,7 +47,7 @@ limitations under the License.
 #ifdef __cplusplus
 extern "C" {
 #endif
-CCPI_EXPORT float FGP_TV_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
+CCPI_EXPORT float TV_FGP_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
 
 CCPI_EXPORT float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dimX, int dimY);
 CCPI_EXPORT float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, int dimX, int dimY);
diff --git a/Core/regularizers_CPU/ROF_TV_core.c b/Core/regularizers_CPU/ROF_TV_core.c
index fd47c3f..b2c6f00 100644
--- a/Core/regularizers_CPU/ROF_TV_core.c
+++ b/Core/regularizers_CPU/ROF_TV_core.c
@@ -46,7 +46,7 @@ int sign(float x) {
  */
 
 /* Running iterations of TV-ROF function */
-float TV_ROF(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda)
+float TV_ROF_CPU(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda)
 {
     float *D1, *D2, *D3;
     int i, DimTotal;
diff --git a/Core/regularizers_CPU/ROF_TV_core.h b/Core/regularizers_CPU/ROF_TV_core.h
index 5d69d27..b32d0d5 100644
--- a/Core/regularizers_CPU/ROF_TV_core.h
+++ b/Core/regularizers_CPU/ROF_TV_core.h
@@ -47,7 +47,7 @@ limitations under the License.
 extern "C" {
 #endif
 CCPI_EXPORT float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambda, float tau, int dimY, int dimX, int dimZ);
-CCPI_EXPORT float TV_ROF(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda);
+CCPI_EXPORT float TV_ROF_CPU(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda);
 CCPI_EXPORT float D1_func(float *A, float *D1, int dimY, int dimX, int dimZ);
 CCPI_EXPORT float D2_func(float *A, float *D2, int dimY, int dimX, int dimZ);
 CCPI_EXPORT float D3_func(float *A, float *D3, int dimY, int dimX, int dimZ);
diff --git a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu
deleted file mode 100755
index 21a95c9..0000000
--- a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu
+++ /dev/null
@@ -1,561 +0,0 @@
- /*
-This work is part of the Core Imaging Library developed by
-Visual Analytics and Imaging System Group of the Science Technology
-Facilities Council, STFC
-
-Copyright 2017 Daniil Kazantsev
-Copyright 2017 Srikanth Nagella, Edoardo Pasca
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ 
-
-#include "FGP_TV_GPU_core.h"
-#include <thrust/device_vector.h>
-#include <thrust/transform_reduce.h>
-
-/* CUDA implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
- *
- * Input Parameters:
- * 1. Noisy image/volume 
- * 2. lambda - regularization parameter 
- * 3. Number of iterations
- * 4. eplsilon: tolerance constant 
- * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
- * 6. nonneg: 'nonnegativity (0 is OFF by default) 
- * 7. print information: 0 (off) or 1 (on) 
- *
- * Output:
- * [1] Filtered/regularized image
- *
- * This function is based on the Matlab's code and paper by
- * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
- */
-
-// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
-#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)
-
-inline void __checkCudaErrors(cudaError err, const char *file, const int line)
-{
-    if (cudaSuccess != err)
-    {
-        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
-                file, line, (int)err, cudaGetErrorString(err));
-        exit(EXIT_FAILURE);
-    }
-}
-
-#define BLKXSIZE2D 16
-#define BLKYSIZE2D 16
-
-#define BLKXSIZE 8
-#define BLKYSIZE 8
-#define BLKZSIZE 8
-
-#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
-struct square { __host__ __device__ float operator()(float x) { return x * x; } };
-
-/************************************************/
-/*****************2D modules*********************/
-/************************************************/
-__global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda)
-{
-    
-    float val1,val2;
-    
-    //calculate each thread global index
-    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
-    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
-    
-    int index = xIndex + N*yIndex; 
-    
-    if ((xIndex < N) && (yIndex < M)) {        
-        if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];}
-        if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];}
-        //Write final result to global memory
-        D[index] = Ad[index] - lambda*(R1[index] + R2[index] - val1 - val2);
-    }
-    return;
-}
-
-__global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, int N, int M, int ImSize, float multip)
-{
-    
-    float val1,val2;
-    
-    //calculate each thread global index
-    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
-    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if ((xIndex < N) && (yIndex < M)) {        
-        
-        /* boundary conditions */
-        if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex];
-        if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)];
-        
-        //Write final result to global memory
-        P1[index] = R1[index] + multip*val1;
-        P2[index] = R2[index] + multip*val2;
-    }
-    return;
-}
-
-__global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize)
-{
-    
-    float denom;    
-    //calculate each thread global index
-    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
-    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if ((xIndex < N) && (yIndex < M)) { 
-        denom = pow(P1[index],2) +  pow(P2[index],2);        
-        if (denom > 1.0f) {
-            P1[index] = P1[index]/sqrt(denom);
-            P2[index] = P2[index]/sqrt(denom);
-        }
-    }
-    return;
-}
-__global__ void Proj_func2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize)
-{
-    
-    float val1, val2;    
-    //calculate each thread global index
-    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
-    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if ((xIndex < N) && (yIndex < M)) { 
-                val1 = abs(P1[index]);
-                val2 = abs(P2[index]);
-                if (val1 < 1.0f) {val1 = 1.0f;}
-                if (val2 < 1.0f) {val2 = 1.0f;}
-                P1[index] = P1[index]/val1;
-                P2[index] = P2[index]/val2;
-    }
-    return;
-}
-__global__ void Rupd_func2D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, float multip2, int N, int M, int ImSize)
-{
-    //calculate each thread global index
-    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
-    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if ((xIndex < N) && (yIndex < M)) { 
-        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
-        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
-    }
-    return;
-}
-__global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total)
-{
-    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
-    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if (index < num_total)	{
-        if (Output[index] < 0.0f) Output[index] = 0.0f;
-    }
-}
-__global__ void copy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
-{
-    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
-    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if (index < num_total)	{
-        Output[index] = Input[index];
-    }
-}
-__global__ void ResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
-{
-    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
-    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
-    
-    if (index < num_total)	{
-        Output[index] = Input1[index] - Input2[index];
-    }
-}   
-/************************************************/
-/*****************3D modules*********************/
-/************************************************/
-__global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda)
-{
-    
-    float val1,val2,val3;
-    
-    //calculate each thread global index
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if ((i < N) && (j < M) && (k < Z)) {      
-        if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];}
-        if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];}
-        if (k <= 0) {val3 = 0.0f;} else {val3 = R3[(N*M)*(k-1) + i + N*j];}
-        //Write final result to global memory
-        D[index] = Ad[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3);
-    }
-    return;
-}
-
-__global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float multip)
-{
-    
-    float val1,val2,val3;
-    
-    //calculate each thread global index
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if ((i < N) && (j < M) && (k <  Z)) {       
-        /* boundary conditions */
-        if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j];
-        if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)];
-        if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j];
-        
-        //Write final result to global memory
-        P1[index] = R1[index] + multip*val1;
-        P2[index] = R2[index] + multip*val2;
-        P3[index] = R3[index] + multip*val3;
-    }
-    return;
-}
-
-__global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
-{
-    
-    float denom,sq_denom;    
-    //calculate each thread global index
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if ((i < N) && (j < M) && (k <  Z)) {
-        denom = pow(P1[index],2) +  pow(P2[index],2) + pow(P3[index],2);
-        
-        if (denom > 1.0f) {
-            sq_denom = 1.0f/sqrt(denom);
-            P1[index] = P1[index]*sq_denom;
-            P2[index] = P2[index]*sq_denom;
-            P3[index] = P3[index]*sq_denom;
-        }
-    }
-    return;
-}
-
-__global__ void Proj_func3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
-{
-    
-    float val1, val2, val3;    
-    //calculate each thread global index
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if ((i < N) && (j < M) && (k <  Z)) {
-                val1 = abs(P1[index]);
-                val2 = abs(P2[index]);
-                val3 = abs(P3[index]);
-                if (val1 < 1.0f) {val1 = 1.0f;}
-                if (val2 < 1.0f) {val2 = 1.0f;}
-                if (val3 < 1.0f) {val3 = 1.0f;}
-                P1[index] = P1[index]/val1;
-                P2[index] = P2[index]/val2;
-                P3[index] = P3[index]/val3;
-    }
-    return;
-}
-
-
-__global__ void Rupd_func3D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, float multip2, int N, int M, int Z, int ImSize)
-{
-    //calculate each thread global index
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if ((i < N) && (j < M) && (k <  Z)) { 
-        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
-        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
-        R3[index] = P3[index] + multip2*(P3[index] - P3_old[index]);
-    }
-    return;
-}
-
-__global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_total)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if (index < num_total)	{
-        if (Output[index] < 0.0f) Output[index] = 0.0f;
-    }
-}
-
-__global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total)
-{
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;
-    
-    int index = (N*M)*k + i + N*j;
-    
-    if (index < num_total)	{
-        Output[index] = Input[index];
-    }
-}
-/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
-
-////////////MAIN HOST FUNCTION ///////////////
-extern "C" void FGP_TV_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
-{
-    int deviceCount = -1; // number of devices
-    cudaGetDeviceCount(&deviceCount);
-    if (deviceCount == 0) {
-        fprintf(stderr, "No CUDA devices found\n");
-        return;
-    }
-    
-    int count = 0, i;
-    float re, multip,multip2;    
-	float tk = 1.0f;
-    float tkp1=1.0f;
-        
-    if (dimZ <= 1) {
-		/*2D verson*/
-		int ImSize = dimX*dimY;    
-		float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
-   
-		dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
-		dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
-    
-		/*allocate space for images on device*/
-		checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
-		if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
-    
-        checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
-        cudaMemset(P1, 0, ImSize*sizeof(float));
-        cudaMemset(P2, 0, ImSize*sizeof(float));
-        cudaMemset(P1_prev, 0, ImSize*sizeof(float));
-        cudaMemset(P2_prev, 0, ImSize*sizeof(float));
-        cudaMemset(R1, 0, ImSize*sizeof(float));
-        cudaMemset(R2, 0, ImSize*sizeof(float));
-
-        /********************** Run CUDA 2D kernel here ********************/    
-        multip = (1.0f/(8.0f*lambda));
-    
-        /* The main kernel */
-        for (i = 0; i < iter; i++) {
-        
-            /* computing the gradient of the objective function */
-            Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambda);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-            
-            if (nonneg != 0) {
-            nonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() ); }
-                    
-            /*Taking a step towards minus of the gradient*/
-            Grad_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, d_update, R1, R2, dimX, dimY, ImSize, multip);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            /* projection step */
-            if (methodTV == 0) Proj_func2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/
-            else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/            
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-            multip2 = ((tk-1.0f)/tkp1);
-        
-            Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            if (epsil != 0.0f) {
-                /* calculate norm - stopping rules using the Thrust library */					
-                ResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
-                checkCudaErrors( cudaDeviceSynchronize() );
-                checkCudaErrors(cudaPeekAtLastError() );               
-                
-                thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize);  		
-                float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));		
-                thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);  		
-                float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
-                    
-                re = (reduction/reduction2);      
-                if (re < epsil)  count++;
-                    if (count > 4) break;       
-             
-                copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
-                checkCudaErrors( cudaDeviceSynchronize() );
-                checkCudaErrors(cudaPeekAtLastError() );                                              
-            }                  
-        
-            copy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            copy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );       
- 
-            tk = tkp1;
-        }
-        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
-            /***************************************************************/    
-            //copy result matrix from device to host memory
-            cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
-    
-            cudaFree(d_input);
-            cudaFree(d_update);
-            if (epsil != 0.0f) cudaFree(d_update_prev);
-            cudaFree(P1);
-            cudaFree(P2);
-            cudaFree(P1_prev);
-            cudaFree(P2_prev);
-            cudaFree(R1);
-            cudaFree(R2);
-    }
-    else {
-            /*3D verson*/
-            int ImSize = dimX*dimY*dimZ;    
-            float *d_input, *d_update=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL;
-   
-            dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
-            dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE));
-    
-            /*allocate space for images on device*/
-            checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );            
-            checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&P3_prev,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
-            checkCudaErrors( cudaMalloc((void**)&R3,ImSize*sizeof(float)) );
-    
-            checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
-            cudaMemset(P1, 0, ImSize*sizeof(float));
-            cudaMemset(P2, 0, ImSize*sizeof(float));
-            cudaMemset(P3, 0, ImSize*sizeof(float));
-            cudaMemset(P1_prev, 0, ImSize*sizeof(float));
-            cudaMemset(P2_prev, 0, ImSize*sizeof(float));
-            cudaMemset(P3_prev, 0, ImSize*sizeof(float));
-            cudaMemset(R1, 0, ImSize*sizeof(float));
-            cudaMemset(R2, 0, ImSize*sizeof(float));
-            cudaMemset(R3, 0, ImSize*sizeof(float));
-            /********************** Run CUDA 3D kernel here ********************/    
-            multip = (1.0f/(8.0f*lambda));
-    
-            /* The main kernel */
-        for (i = 0; i < iter; i++) {
-        
-            /* computing the gradient of the objective function */
-            Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambda);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            if (nonneg != 0) {
-            nonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() ); }
-            
-            /*Taking a step towards minus of the gradient*/
-            Grad_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, multip);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            /* projection step */
-            if (methodTV == 0) Proj_func3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */
-            else Proj_func3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-            multip2 = ((tk-1.0f)/tkp1);
-        
-            Rupd_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );           
-        
-            copy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            copy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );   
-            
-            copy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );      
- 
-            tk = tkp1;
-        }
-        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
-            /***************************************************************/    
-            //copy result matrix from device to host memory
-            cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
-    
-            cudaFree(d_input);
-            cudaFree(d_update);            
-            cudaFree(P1);
-            cudaFree(P2);
-            cudaFree(P3);
-            cudaFree(P1_prev);
-            cudaFree(P2_prev);
-            cudaFree(P3_prev);
-            cudaFree(R1);
-            cudaFree(R2);        
-            cudaFree(R3);        
-    } 
-    cudaDeviceReset(); 
-}
diff --git a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h
deleted file mode 100755
index a5d3f73..0000000
--- a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <memory.h>
-
-#ifndef _FGP_TV_GPU_
-#define _FGP_TV_GPU_
-
-extern "C" void FGP_TV_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);   
-
-#endif 
diff --git a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
new file mode 100755
index 0000000..0533a85
--- /dev/null
+++ b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
@@ -0,0 +1,561 @@
+ /*
+This work is part of the Core Imaging Library developed by
+Visual Analytics and Imaging System Group of the Science Technology
+Facilities Council, STFC
+
+Copyright 2017 Daniil Kazantsev
+Copyright 2017 Srikanth Nagella, Edoardo Pasca
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ 
+
+#include "TV_FGP_GPU_core.h"
+#include <thrust/device_vector.h>
+#include <thrust/transform_reduce.h>
+
+/* CUDA implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume 
+ * 2. lambda - regularization parameter 
+ * 3. Number of iterations
+ * 4. eplsilon: tolerance constant 
+ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
+ * 6. nonneg: 'nonnegativity (0 is OFF by default) 
+ * 7. print information: 0 (off) or 1 (on) 
+ *
+ * Output:
+ * [1] Filtered/regularized image
+ *
+ * This function is based on the Matlab's code and paper by
+ * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
+ */
+
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)
+
+inline void __checkCudaErrors(cudaError err, const char *file, const int line)
+{
+    if (cudaSuccess != err)
+    {
+        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
+                file, line, (int)err, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define BLKXSIZE2D 16
+#define BLKYSIZE2D 16
+
+#define BLKXSIZE 8
+#define BLKYSIZE 8
+#define BLKZSIZE 8
+
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+struct square { __host__ __device__ float operator()(float x) { return x * x; } };
+
+/************************************************/
+/*****************2D modules*********************/
+/************************************************/
+__global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda)
+{
+    
+    float val1,val2;
+    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex; 
+    
+    if ((xIndex < N) && (yIndex < M)) {        
+        if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];}
+        if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];}
+        //Write final result to global memory
+        D[index] = Ad[index] - lambda*(R1[index] + R2[index] - val1 - val2);
+    }
+    return;
+}
+
+__global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, int N, int M, int ImSize, float multip)
+{
+    
+    float val1,val2;
+    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) {        
+        
+        /* boundary conditions */
+        if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex];
+        if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)];
+        
+        //Write final result to global memory
+        P1[index] = R1[index] + multip*val1;
+        P2[index] = R2[index] + multip*val2;
+    }
+    return;
+}
+
+__global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize)
+{
+    
+    float denom;    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) { 
+        denom = pow(P1[index],2) +  pow(P2[index],2);        
+        if (denom > 1.0f) {
+            P1[index] = P1[index]/sqrt(denom);
+            P2[index] = P2[index]/sqrt(denom);
+        }
+    }
+    return;
+}
+__global__ void Proj_func2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize)
+{
+    
+    float val1, val2;    
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) { 
+                val1 = abs(P1[index]);
+                val2 = abs(P2[index]);
+                if (val1 < 1.0f) {val1 = 1.0f;}
+                if (val2 < 1.0f) {val2 = 1.0f;}
+                P1[index] = P1[index]/val1;
+                P2[index] = P2[index]/val2;
+    }
+    return;
+}
+__global__ void Rupd_func2D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, float multip2, int N, int M, int ImSize)
+{
+    //calculate each thread global index
+    const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+    const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if ((xIndex < N) && (yIndex < M)) { 
+        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
+        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
+    }
+    return;
+}
+__global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        if (Output[index] < 0.0f) Output[index] = 0.0f;
+    }
+}
+__global__ void copy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        Output[index] = Input[index];
+    }
+}
+__global__ void ResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        Output[index] = Input1[index] - Input2[index];
+    }
+}   
+/************************************************/
+/*****************3D modules*********************/
+/************************************************/
+__global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda)
+{
+    
+    float val1,val2,val3;
+    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k < Z)) {      
+        if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];}
+        if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];}
+        if (k <= 0) {val3 = 0.0f;} else {val3 = R3[(N*M)*(k-1) + i + N*j];}
+        //Write final result to global memory
+        D[index] = Ad[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3);
+    }
+    return;
+}
+
+__global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float multip)
+{
+    
+    float val1,val2,val3;
+    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) {       
+        /* boundary conditions */
+        if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j];
+        if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)];
+        if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j];
+        
+        //Write final result to global memory
+        P1[index] = R1[index] + multip*val1;
+        P2[index] = R2[index] + multip*val2;
+        P3[index] = R3[index] + multip*val3;
+    }
+    return;
+}
+
+__global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
+{
+    
+    float denom,sq_denom;    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) {
+        denom = pow(P1[index],2) +  pow(P2[index],2) + pow(P3[index],2);
+        
+        if (denom > 1.0f) {
+            sq_denom = 1.0f/sqrt(denom);
+            P1[index] = P1[index]*sq_denom;
+            P2[index] = P2[index]*sq_denom;
+            P3[index] = P3[index]*sq_denom;
+        }
+    }
+    return;
+}
+
+__global__ void Proj_func3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
+{
+    
+    float val1, val2, val3;    
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) {
+                val1 = abs(P1[index]);
+                val2 = abs(P2[index]);
+                val3 = abs(P3[index]);
+                if (val1 < 1.0f) {val1 = 1.0f;}
+                if (val2 < 1.0f) {val2 = 1.0f;}
+                if (val3 < 1.0f) {val3 = 1.0f;}
+                P1[index] = P1[index]/val1;
+                P2[index] = P2[index]/val2;
+                P3[index] = P3[index]/val3;
+    }
+    return;
+}
+
+
+__global__ void Rupd_func3D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, float multip2, int N, int M, int Z, int ImSize)
+{
+    //calculate each thread global index
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if ((i < N) && (j < M) && (k <  Z)) { 
+        R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]);
+        R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]);
+        R3[index] = P3[index] + multip2*(P3[index] - P3_old[index]);
+    }
+    return;
+}
+
+__global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_total)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if (index < num_total)	{
+        if (Output[index] < 0.0f) Output[index] = 0.0f;
+    }
+}
+
+__global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total)
+{
+	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    int k = blockDim.z * blockIdx.z + threadIdx.z;
+    
+    int index = (N*M)*k + i + N*j;
+    
+    if (index < num_total)	{
+        Output[index] = Input[index];
+    }
+}
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
+
+////////////MAIN HOST FUNCTION ///////////////
+extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+{
+    int deviceCount = -1; // number of devices
+    cudaGetDeviceCount(&deviceCount);
+    if (deviceCount == 0) {
+        fprintf(stderr, "No CUDA devices found\n");
+        return;
+    }
+    
+    int count = 0, i;
+    float re, multip,multip2;    
+	float tk = 1.0f;
+    float tkp1=1.0f;
+        
+    if (dimZ <= 1) {
+		/*2D verson*/
+		int ImSize = dimX*dimY;    
+		float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
+   
+		dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+		dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
+    
+		/*allocate space for images on device*/
+		checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
+		if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
+		checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
+    
+        checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+        cudaMemset(P1, 0, ImSize*sizeof(float));
+        cudaMemset(P2, 0, ImSize*sizeof(float));
+        cudaMemset(P1_prev, 0, ImSize*sizeof(float));
+        cudaMemset(P2_prev, 0, ImSize*sizeof(float));
+        cudaMemset(R1, 0, ImSize*sizeof(float));
+        cudaMemset(R2, 0, ImSize*sizeof(float));
+
+        /********************** Run CUDA 2D kernel here ********************/    
+        multip = (1.0f/(8.0f*lambda));
+    
+        /* The main kernel */
+        for (i = 0; i < iter; i++) {
+        
+            /* computing the gradient of the objective function */
+            Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambda);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+            
+            if (nonneg != 0) {
+            nonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() ); }
+                    
+            /*Taking a step towards minus of the gradient*/
+            Grad_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, d_update, R1, R2, dimX, dimY, ImSize, multip);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            /* projection step */
+            if (methodTV == 0) Proj_func2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/
+            else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/            
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            multip2 = ((tk-1.0f)/tkp1);
+        
+            Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            if (epsil != 0.0f) {
+                /* calculate norm - stopping rules using the Thrust library */					
+                ResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors(cudaPeekAtLastError() );               
+                
+                thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize);  		
+                float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));		
+                thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);  		
+                float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+                    
+                re = (reduction/reduction2);      
+                if (re < epsil)  count++;
+                    if (count > 4) break;       
+             
+                copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors(cudaPeekAtLastError() );                                              
+            }                  
+        
+            copy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            copy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );       
+ 
+            tk = tkp1;
+        }
+        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
+            /***************************************************************/    
+            //copy result matrix from device to host memory
+            cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
+    
+            cudaFree(d_input);
+            cudaFree(d_update);
+            if (epsil != 0.0f) cudaFree(d_update_prev);
+            cudaFree(P1);
+            cudaFree(P2);
+            cudaFree(P1_prev);
+            cudaFree(P2_prev);
+            cudaFree(R1);
+            cudaFree(R2);
+    }
+    else {
+            /*3D verson*/
+            int ImSize = dimX*dimY*dimZ;    
+            float *d_input, *d_update=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL;
+   
+            dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
+            dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE));
+    
+            /*allocate space for images on device*/
+            checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );            
+            checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&P3_prev,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) );
+            checkCudaErrors( cudaMalloc((void**)&R3,ImSize*sizeof(float)) );
+    
+            checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+            cudaMemset(P1, 0, ImSize*sizeof(float));
+            cudaMemset(P2, 0, ImSize*sizeof(float));
+            cudaMemset(P3, 0, ImSize*sizeof(float));
+            cudaMemset(P1_prev, 0, ImSize*sizeof(float));
+            cudaMemset(P2_prev, 0, ImSize*sizeof(float));
+            cudaMemset(P3_prev, 0, ImSize*sizeof(float));
+            cudaMemset(R1, 0, ImSize*sizeof(float));
+            cudaMemset(R2, 0, ImSize*sizeof(float));
+            cudaMemset(R3, 0, ImSize*sizeof(float));
+            /********************** Run CUDA 3D kernel here ********************/    
+            multip = (1.0f/(8.0f*lambda));
+    
+            /* The main kernel */
+        for (i = 0; i < iter; i++) {
+        
+            /* computing the gradient of the objective function */
+            Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambda);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            if (nonneg != 0) {
+            nonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() ); }
+            
+            /*Taking a step towards minus of the gradient*/
+            Grad_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, multip);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            /* projection step */
+            if (methodTV == 0) Proj_func3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */
+            else Proj_func3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            multip2 = ((tk-1.0f)/tkp1);
+        
+            Rupd_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );           
+        
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
+        
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );   
+            
+            copy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );      
+ 
+            tk = tkp1;
+        }
+        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
+            /***************************************************************/    
+            //copy result matrix from device to host memory
+            cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
+    
+            cudaFree(d_input);
+            cudaFree(d_update);            
+            cudaFree(P1);
+            cudaFree(P2);
+            cudaFree(P3);
+            cudaFree(P1_prev);
+            cudaFree(P2_prev);
+            cudaFree(P3_prev);
+            cudaFree(R1);
+            cudaFree(R2);        
+            cudaFree(R3);        
+    } 
+    cudaDeviceReset(); 
+}
diff --git a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
new file mode 100755
index 0000000..15c7120
--- /dev/null
+++ b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
@@ -0,0 +1,10 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+#ifndef _TV_FGP_GPU_
+#define _TV_FGP_GPU_
+
+extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);   
+
+#endif 
diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
deleted file mode 100755
index b67b53b..0000000
--- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
+++ /dev/null
@@ -1,369 +0,0 @@
- /*
-This work is part of the Core Imaging Library developed by
-Visual Analytics and Imaging System Group of the Science Technology
-Facilities Council, STFC
-
-Copyright 2017 Daniil Kazantsev
-Copyright 2017 Srikanth Nagella, Edoardo Pasca
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ 
-
-#include "TV_ROF_GPU.h"
-
-/* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case)
-*
-* Input Parameters:
-* 1. Noisy image/volume [REQUIRED]
-* 2. lambda - regularization parameter [REQUIRED]
-* 3. tau - marching step for explicit scheme, ~0.1 is recommended [REQUIRED]
-* 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED]
-*
-* Output:
-* [1] Regularized image/volume
-
- * This function is based on the paper by
-* [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms"
-*
-* D. Kazantsev, 2016-18
-*/
-
-#define CHECK(call)                                                            \
-{                                                                              \
-    const cudaError_t error = call;                                            \
-    if (error != cudaSuccess)                                                  \
-    {                                                                          \
-        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
-        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
-                cudaGetErrorString(error));                                    \
-        exit(1);                                                               \
-    }                                                                          \
-}
-    
-#define BLKXSIZE 8
-#define BLKYSIZE 8
-#define BLKZSIZE 8
-    
-#define BLKXSIZE2D 16
-#define BLKYSIZE2D 16
-#define EPS 1.0e-5
-    
-#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
-
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-
-__host__ __device__ int sign (float x)
-{
-        return (x > 0) - (x < 0);
-}        
-   
-/*********************2D case****************************/    
-    
-    /* differences 1 */
-    __global__ void D1_func2D(float* Input, float* D1, int N, int M)      
-    {
-		int i1, j1, i2;
-		float NOMx_1,NOMy_1,NOMy_0,denom1,denom2,T1;
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        
-        int index = i + N*j;        
-        
-        if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) {
-            
-            /* boundary conditions (Neumann reflections) */
-                i1 = i + 1; if (i1 >= N) i1 = i-1;
-                i2 = i - 1; if (i2 < 0) i2 = i+1;
-                j1 = j + 1; if (j1 >= M) j1 = j-1;
-		
-		     /* Forward-backward differences */
-                NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */
-                NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */                
-                NOMy_0 = Input[index] - Input[j*N + i2]; /* y- */
-                
-                denom1 = NOMx_1*NOMx_1;
-                denom2 = 0.5f*(sign((float)NOMy_1) + sign((float)NOMy_0))*(MIN(abs((float)NOMy_1),abs((float)NOMy_0)));
-                denom2 = denom2*denom2;
-                T1 = sqrt(denom1 + denom2 + EPS);
-                D1[index] = NOMx_1/T1;	
-		}		
-	}       
-    
-    /* differences 2 */
-    __global__ void D2_func2D(float* Input, float* D2, int N, int M)      
-    {
-		int i1, j1, j2;
-		float NOMx_1,NOMy_1,NOMx_0,denom1,denom2,T2;
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        
-        int index = i + N*j;        
-        
-        if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) {
-            
-            /* boundary conditions (Neumann reflections) */
-                i1 = i + 1; if (i1 >= N) i1 = i-1;
-                j1 = j + 1; if (j1 >= M) j1 = j-1;
-                j2 = j - 1; if (j2 < 0) j2 = j+1; 
-		
-                /* Forward-backward differences */
-                NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */
-                NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */
-                NOMx_0 = Input[index] - Input[j2*N + i]; /* x- */
-                
-                denom1 = NOMy_1*NOMy_1;
-                denom2 = 0.5f*(sign((float)NOMx_1) + sign((float)NOMx_0))*(MIN(abs((float)NOMx_1),abs((float)NOMx_0)));
-                denom2 = denom2*denom2;
-                T2 = sqrt(denom1 + denom2 + EPS);
-                D2[index] = NOMy_1/T2;	
-		}		
-	}
-    
-    __global__ void TV_kernel2D(float *D1, float *D2, float *Update, float *Input, float lambda, float tau, int N, int M)    
-    {
-		int i2, j2;
-		float dv1,dv2;
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        
-        int index = i + N*j;        
-        
-        if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) {
-            
-				/* boundary conditions (Neumann reflections) */                
-                i2 = i - 1; if (i2 < 0) i2 = i+1;                
-                j2 = j - 1; if (j2 < 0) j2 = j+1; 
-                
-				/* divergence components  */
-                dv1 = D1[index] - D1[j2*N + i];
-                dv2 = D2[index] - D2[j*N + i2];                                
-                
-                Update[index] =  Update[index] + tau*(2.0f*lambda*(dv1 + dv2) - (Update[index] - Input[index]));      
-		
-		}  
-	}   
-/*********************3D case****************************/    
- 
-    /* differences 1 */
-    __global__ void D1_func3D(float* Input, float* D1, int dimX, int dimY, int dimZ)      
-    {
-		float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1;
-		int i1,i2,k1,j1,j2,k2;
-		
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;
-        
-      	int index = (dimX*dimY)*k + j*dimX+i;     
-        
-        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
-            
-                    /* symmetric boundary conditions (Neuman) */
-                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
-                    i2 = i - 1; if (i2 < 0) i2 = i+1;
-                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
-                    j2 = j - 1; if (j2 < 0) j2 = j+1;
-                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
-                    k2 = k - 1; if (k2 < 0) k2 = k+1;                    
-                    
-                    /* Forward-backward differences */
-                    NOMx_1 = Input[(dimX*dimY)*k + j1*dimX + i] - Input[index]; /* x+ */
-                    NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */                    
-                    NOMy_0 = Input[index] - Input[(dimX*dimY)*k + j*dimX + i2]; /* y- */
-                    
-                    NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
-                    NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + j*dimX + i]; /* z- */
-                    
-                    
-                    denom1 = NOMx_1*NOMx_1;
-                    denom2 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0)));
-                    denom2 = denom2*denom2;
-                    denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0)));
-                    denom3 = denom3*denom3;
-                    T1 = sqrt(denom1 + denom2 + denom3 + EPS);
-                    D1[index] = NOMx_1/T1;	
-		}		
-	}      
-
-    /* differences 2 */
-    __global__ void D2_func3D(float* Input, float* D2, int dimX, int dimY, int dimZ)      
-    {
-		float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2;
-		int i1,i2,k1,j1,j2,k2;
-		
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;
-        
-      	int index = (dimX*dimY)*k + j*dimX+i;     
-        
-        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
-                    /* symmetric boundary conditions (Neuman) */
-                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
-                    i2 = i - 1; if (i2 < 0) i2 = i+1;
-                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
-                    j2 = j - 1; if (j2 < 0) j2 = j+1;
-                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
-                    k2 = k - 1; if (k2 < 0) k2 = k+1;
-                    
-                    
-                    /* Forward-backward differences */
-                    NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */
-                    NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */
-                    NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */
-                    NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
-                    NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */
-                    
-                    
-                    denom1 = NOMy_1*NOMy_1;
-                    denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0)));
-                    denom2 = denom2*denom2;
-                    denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0)));
-                    denom3 = denom3*denom3;
-                    T2 = sqrt(denom1 + denom2 + denom3 + EPS);
-                    D2[index] = NOMy_1/T2;
-		}
-	}
-	
-	  /* differences 3 */
-    __global__ void D3_func3D(float* Input, float* D3, int dimX, int dimY, int dimZ)      
-    {
-		float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3;
-		int i1,i2,k1,j1,j2,k2;
-		
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;
-        
-      	int index = (dimX*dimY)*k + j*dimX+i;     
-        
-        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
-
-				i1 = i + 1; if (i1 >= dimX) i1 = i-1;
-                i2 = i - 1; if (i2 < 0) i2 = i+1;
-                j1 = j + 1; if (j1 >= dimY) j1 = j-1;
-                j2 = j - 1; if (j2 < 0) j2 = j+1;
-                k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
-                k2 = k - 1; if (k2 < 0) k2 = k+1;
-                
-                /* Forward-backward differences */
-                NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */
-                NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */
-                NOMy_0 = Input[index] - Input[(dimX*dimY)*k + (j)*dimX + i2]; /* y- */
-                NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */
-                NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
-               
-                denom1 = NOMz_1*NOMz_1;
-                denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0)));
-                denom2 = denom2*denom2;
-                denom3 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0)));
-                denom3 = denom3*denom3;
-                T3 = sqrt(denom1 + denom2 + denom3 + EPS);
-                D3[index] = NOMz_1/T3;		
-		}
-	}
-
-    __global__ void TV_kernel3D(float *D1, float *D2, float *D3, float *Update, float *Input, float lambda, float tau, int dimX, int dimY, int dimZ)    
-    {
-		float dv1, dv2, dv3;
-		int i1,i2,k1,j1,j2,k2;
-		int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;
-        
-        int index = (dimX*dimY)*k + j*dimX+i;       
-        
-        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
-            
-					/* symmetric boundary conditions (Neuman) */
-                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
-                    i2 = i - 1; if (i2 < 0) i2 = i+1;
-                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
-                    j2 = j - 1; if (j2 < 0) j2 = j+1;
-                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
-                    k2 = k - 1; if (k2 < 0) k2 = k+1;
-                    
-                    /*divergence components */
-                    dv1 = D1[index] - D1[(dimX*dimY)*k + j2*dimX+i];
-                    dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2];
-                    dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i];
-                    
-                    Update[index] = Update[index] + tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index]));
-		
-		}  
-	}
-
-/////////////////////////////////////////////////
-// HOST FUNCTION
-extern "C" void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda)
-{
-	    // set up device
-		int dev = 0;
-		CHECK(cudaSetDevice(dev));
-		
-        float *d_input, *d_update, *d_D1, *d_D2;
-        
-	if (Z == 0) Z = 1;
-        CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float)));
-        CHECK(cudaMalloc((void**)&d_update,N*M*Z*sizeof(float)));
-        CHECK(cudaMalloc((void**)&d_D1,N*M*Z*sizeof(float)));
-        CHECK(cudaMalloc((void**)&d_D2,N*M*Z*sizeof(float)));
-        
-        CHECK(cudaMemcpy(d_input,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));
-        CHECK(cudaMemcpy(d_update,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));      
-        
-        if (Z > 1) {
-			// TV - 3D case                 
-            dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
-            dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE));            
-            
-            float *d_D3;
-            CHECK(cudaMalloc((void**)&d_D3,N*M*Z*sizeof(float)));
-            
-            for(int n=0; n < iter; n++) {
-                /* calculate differences */
-                D1_func3D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M, Z);				
-                CHECK(cudaDeviceSynchronize());
-				D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z);				                
-                CHECK(cudaDeviceSynchronize());        
-                D3_func3D<<<dimGrid,dimBlock>>>(d_update, d_D3, N, M, Z);				
-                CHECK(cudaDeviceSynchronize());        
-                /*running main kernel*/
-                TV_kernel3D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_D3, d_update, d_input, lambda, tau, N, M, Z);
-                CHECK(cudaDeviceSynchronize());
-            }
-            
-            CHECK(cudaFree(d_D3));         
-        }
-        else {
-	    // TV - 2D case
-            dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
-            dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D));
-             
-            for(int n=0; n < iter; n++) {
-                /* calculate differences */
-                D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M);				
-                CHECK(cudaDeviceSynchronize());
-				D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M);				
-                CHECK(cudaDeviceSynchronize());        
-                /*running main kernel*/
-                TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambda, tau, N, M);
-                CHECK(cudaDeviceSynchronize());
-            }
-        }        
-        CHECK(cudaMemcpy(Output,d_update,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost));
-        CHECK(cudaFree(d_input));
-        CHECK(cudaFree(d_update));
-        CHECK(cudaFree(d_D1));
-        CHECK(cudaFree(d_D2));        
-        cudaDeviceReset(); 
-}
diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h
deleted file mode 100755
index 2938d2f..0000000
--- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __TVGPU_H__
-#define __TVGPU_H__
-#include "CCPiDefines.h"
-#include <stdio.h>
-
-extern "C" CCPI_EXPORT void TV_ROF_GPU_kernel(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda);
-
-#endif 
diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu
new file mode 100755
index 0000000..480855f
--- /dev/null
+++ b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu
@@ -0,0 +1,369 @@
+ /*
+This work is part of the Core Imaging Library developed by
+Visual Analytics and Imaging System Group of the Science Technology
+Facilities Council, STFC
+
+Copyright 2017 Daniil Kazantsev
+Copyright 2017 Srikanth Nagella, Edoardo Pasca
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ 
+
+#include "TV_ROF_GPU_core.h"
+
+/* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case)
+*
+* Input Parameters:
+* 1. Noisy image/volume [REQUIRED]
+* 2. lambda - regularization parameter [REQUIRED]
+* 3. tau - marching step for explicit scheme, ~0.1 is recommended [REQUIRED]
+* 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED]
+*
+* Output:
+* [1] Regularized image/volume
+
+ * This function is based on the paper by
+* [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms"
+*
+* D. Kazantsev, 2016-18
+*/
+
+#define CHECK(call)                                                            \
+{                                                                              \
+    const cudaError_t error = call;                                            \
+    if (error != cudaSuccess)                                                  \
+    {                                                                          \
+        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
+        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
+                cudaGetErrorString(error));                                    \
+        exit(1);                                                               \
+    }                                                                          \
+}
+    
+#define BLKXSIZE 8
+#define BLKYSIZE 8
+#define BLKZSIZE 8
+    
+#define BLKXSIZE2D 16
+#define BLKYSIZE2D 16
+#define EPS 1.0e-5
+    
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+__host__ __device__ int sign (float x)
+{
+        return (x > 0) - (x < 0);
+}        
+   
+/*********************2D case****************************/    
+    
+    /* differences 1 */
+    __global__ void D1_func2D(float* Input, float* D1, int N, int M)      
+    {
+		int i1, j1, i2;
+		float NOMx_1,NOMy_1,NOMy_0,denom1,denom2,T1;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        
+        int index = i + N*j;        
+        
+        if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) {
+            
+            /* boundary conditions (Neumann reflections) */
+                i1 = i + 1; if (i1 >= N) i1 = i-1;
+                i2 = i - 1; if (i2 < 0) i2 = i+1;
+                j1 = j + 1; if (j1 >= M) j1 = j-1;
+		
+		     /* Forward-backward differences */
+                NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */
+                NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */                
+                NOMy_0 = Input[index] - Input[j*N + i2]; /* y- */
+                
+                denom1 = NOMx_1*NOMx_1;
+                denom2 = 0.5f*(sign((float)NOMy_1) + sign((float)NOMy_0))*(MIN(abs((float)NOMy_1),abs((float)NOMy_0)));
+                denom2 = denom2*denom2;
+                T1 = sqrt(denom1 + denom2 + EPS);
+                D1[index] = NOMx_1/T1;	
+		}		
+	}       
+    
+    /* differences 2 */
+    __global__ void D2_func2D(float* Input, float* D2, int N, int M)      
+    {
+		int i1, j1, j2;
+		float NOMx_1,NOMy_1,NOMx_0,denom1,denom2,T2;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        
+        int index = i + N*j;        
+        
+        if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) {
+            
+            /* boundary conditions (Neumann reflections) */
+                i1 = i + 1; if (i1 >= N) i1 = i-1;
+                j1 = j + 1; if (j1 >= M) j1 = j-1;
+                j2 = j - 1; if (j2 < 0) j2 = j+1; 
+		
+                /* Forward-backward differences */
+                NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */
+                NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */
+                NOMx_0 = Input[index] - Input[j2*N + i]; /* x- */
+                
+                denom1 = NOMy_1*NOMy_1;
+                denom2 = 0.5f*(sign((float)NOMx_1) + sign((float)NOMx_0))*(MIN(abs((float)NOMx_1),abs((float)NOMx_0)));
+                denom2 = denom2*denom2;
+                T2 = sqrt(denom1 + denom2 + EPS);
+                D2[index] = NOMy_1/T2;	
+		}		
+	}
+    
+    __global__ void TV_kernel2D(float *D1, float *D2, float *Update, float *Input, float lambda, float tau, int N, int M)    
+    {
+		int i2, j2;
+		float dv1,dv2;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        
+        int index = i + N*j;        
+        
+        if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) {
+            
+				/* boundary conditions (Neumann reflections) */                
+                i2 = i - 1; if (i2 < 0) i2 = i+1;                
+                j2 = j - 1; if (j2 < 0) j2 = j+1; 
+                
+				/* divergence components  */
+                dv1 = D1[index] - D1[j2*N + i];
+                dv2 = D2[index] - D2[j*N + i2];                                
+                
+                Update[index] =  Update[index] + tau*(2.0f*lambda*(dv1 + dv2) - (Update[index] - Input[index]));      
+		
+		}  
+	}   
+/*********************3D case****************************/    
+ 
+    /* differences 1 */
+    __global__ void D1_func3D(float* Input, float* D1, int dimX, int dimY, int dimZ)      
+    {
+		float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1;
+		int i1,i2,k1,j1,j2,k2;
+		
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+      	int index = (dimX*dimY)*k + j*dimX+i;     
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
+            
+                    /* symmetric boundary conditions (Neuman) */
+                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
+                    i2 = i - 1; if (i2 < 0) i2 = i+1;
+                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
+                    j2 = j - 1; if (j2 < 0) j2 = j+1;
+                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
+                    k2 = k - 1; if (k2 < 0) k2 = k+1;                    
+                    
+                    /* Forward-backward differences */
+                    NOMx_1 = Input[(dimX*dimY)*k + j1*dimX + i] - Input[index]; /* x+ */
+                    NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */                    
+                    NOMy_0 = Input[index] - Input[(dimX*dimY)*k + j*dimX + i2]; /* y- */
+                    
+                    NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
+                    NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + j*dimX + i]; /* z- */
+                    
+                    
+                    denom1 = NOMx_1*NOMx_1;
+                    denom2 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0)));
+                    denom2 = denom2*denom2;
+                    denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0)));
+                    denom3 = denom3*denom3;
+                    T1 = sqrt(denom1 + denom2 + denom3 + EPS);
+                    D1[index] = NOMx_1/T1;	
+		}		
+	}      
+
+    /* differences 2 */
+    __global__ void D2_func3D(float* Input, float* D2, int dimX, int dimY, int dimZ)      
+    {
+		float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2;
+		int i1,i2,k1,j1,j2,k2;
+		
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+      	int index = (dimX*dimY)*k + j*dimX+i;     
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
+                    /* symmetric boundary conditions (Neuman) */
+                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
+                    i2 = i - 1; if (i2 < 0) i2 = i+1;
+                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
+                    j2 = j - 1; if (j2 < 0) j2 = j+1;
+                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
+                    k2 = k - 1; if (k2 < 0) k2 = k+1;
+                    
+                    
+                    /* Forward-backward differences */
+                    NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */
+                    NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */
+                    NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */
+                    NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
+                    NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */
+                    
+                    
+                    denom1 = NOMy_1*NOMy_1;
+                    denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0)));
+                    denom2 = denom2*denom2;
+                    denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0)));
+                    denom3 = denom3*denom3;
+                    T2 = sqrt(denom1 + denom2 + denom3 + EPS);
+                    D2[index] = NOMy_1/T2;
+		}
+	}
+	
+	  /* differences 3 */
+    __global__ void D3_func3D(float* Input, float* D3, int dimX, int dimY, int dimZ)      
+    {
+		float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3;
+		int i1,i2,k1,j1,j2,k2;
+		
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+      	int index = (dimX*dimY)*k + j*dimX+i;     
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
+
+				i1 = i + 1; if (i1 >= dimX) i1 = i-1;
+                i2 = i - 1; if (i2 < 0) i2 = i+1;
+                j1 = j + 1; if (j1 >= dimY) j1 = j-1;
+                j2 = j - 1; if (j2 < 0) j2 = j+1;
+                k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
+                k2 = k - 1; if (k2 < 0) k2 = k+1;
+                
+                /* Forward-backward differences */
+                NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */
+                NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */
+                NOMy_0 = Input[index] - Input[(dimX*dimY)*k + (j)*dimX + i2]; /* y- */
+                NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */
+                NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */
+               
+                denom1 = NOMz_1*NOMz_1;
+                denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0)));
+                denom2 = denom2*denom2;
+                denom3 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0)));
+                denom3 = denom3*denom3;
+                T3 = sqrt(denom1 + denom2 + denom3 + EPS);
+                D3[index] = NOMz_1/T3;		
+		}
+	}
+
+    __global__ void TV_kernel3D(float *D1, float *D2, float *D3, float *Update, float *Input, float lambda, float tau, int dimX, int dimY, int dimZ)    
+    {
+		float dv1, dv2, dv3;
+		int i1,i2,k1,j1,j2,k2;
+		int i = blockDim.x * blockIdx.x + threadIdx.x;
+        int j = blockDim.y * blockIdx.y + threadIdx.y;
+        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        
+        int index = (dimX*dimY)*k + j*dimX+i;       
+        
+        if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) {
+            
+					/* symmetric boundary conditions (Neuman) */
+                    i1 = i + 1; if (i1 >= dimX) i1 = i-1;
+                    i2 = i - 1; if (i2 < 0) i2 = i+1;
+                    j1 = j + 1; if (j1 >= dimY) j1 = j-1;
+                    j2 = j - 1; if (j2 < 0) j2 = j+1;
+                    k1 = k + 1; if (k1 >= dimZ) k1 = k-1;
+                    k2 = k - 1; if (k2 < 0) k2 = k+1;
+                    
+                    /*divergence components */
+                    dv1 = D1[index] - D1[(dimX*dimY)*k + j2*dimX+i];
+                    dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2];
+                    dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i];
+                    
+                    Update[index] = Update[index] + tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index]));
+		
+		}  
+	}
+
+/////////////////////////////////////////////////
+// HOST FUNCTION
+extern "C" void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda)
+{
+	    // set up device
+		int dev = 0;
+		CHECK(cudaSetDevice(dev));
+		
+        float *d_input, *d_update, *d_D1, *d_D2;
+        
+	if (Z == 0) Z = 1;
+        CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float)));
+        CHECK(cudaMalloc((void**)&d_update,N*M*Z*sizeof(float)));
+        CHECK(cudaMalloc((void**)&d_D1,N*M*Z*sizeof(float)));
+        CHECK(cudaMalloc((void**)&d_D2,N*M*Z*sizeof(float)));
+        
+        CHECK(cudaMemcpy(d_input,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));
+        CHECK(cudaMemcpy(d_update,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));      
+        
+        if (Z > 1) {
+			// TV - 3D case                 
+            dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
+            dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE));            
+            
+            float *d_D3;
+            CHECK(cudaMalloc((void**)&d_D3,N*M*Z*sizeof(float)));
+            
+            for(int n=0; n < iter; n++) {
+                /* calculate differences */
+                D1_func3D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M, Z);				
+                CHECK(cudaDeviceSynchronize());
+				D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z);				                
+                CHECK(cudaDeviceSynchronize());        
+                D3_func3D<<<dimGrid,dimBlock>>>(d_update, d_D3, N, M, Z);				
+                CHECK(cudaDeviceSynchronize());        
+                /*running main kernel*/
+                TV_kernel3D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_D3, d_update, d_input, lambda, tau, N, M, Z);
+                CHECK(cudaDeviceSynchronize());
+            }
+            
+            CHECK(cudaFree(d_D3));         
+        }
+        else {
+	    // TV - 2D case
+            dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+            dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D));
+             
+            for(int n=0; n < iter; n++) {
+                /* calculate differences */
+                D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M);				
+                CHECK(cudaDeviceSynchronize());
+				D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M);				
+                CHECK(cudaDeviceSynchronize());        
+                /*running main kernel*/
+                TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambda, tau, N, M);
+                CHECK(cudaDeviceSynchronize());
+            }
+        }        
+        CHECK(cudaMemcpy(Output,d_update,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost));
+        CHECK(cudaFree(d_input));
+        CHECK(cudaFree(d_update));
+        CHECK(cudaFree(d_D1));
+        CHECK(cudaFree(d_D2));        
+        cudaDeviceReset(); 
+}
diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h
new file mode 100755
index 0000000..8b64d99
--- /dev/null
+++ b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h
@@ -0,0 +1,8 @@
+#ifndef __TVGPU_H__
+#define __TVGPU_H__
+#include "CCPiDefines.h"
+#include <stdio.h>
+
+extern "C" CCPI_EXPORT void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda);
+
+#endif 
diff --git a/Wrappers/Python/src/cpu_regularizers.cpp b/Wrappers/Python/src/cpu_regularizers.cpp
index e311570..43d5d11 100644
--- a/Wrappers/Python/src/cpu_regularizers.cpp
+++ b/Wrappers/Python/src/cpu_regularizers.cpp
@@ -27,7 +27,7 @@ limitations under the License.
 #include "boost/tuple/tuple.hpp"
 
 #include "SplitBregman_TV_core.h"
-#include "FGP_TV_core.h"
+//#include "FGP_TV_core.h"
 #include "LLT_model_core.h"
 #include "PatchBased_Regul_core.h"
 #include "TGV_PD_core.h"
@@ -305,289 +305,289 @@ bp::list SplitBregman_TV(np::ndarray input, double d_mu, int iter, double d_epsi
 
 
-bp::list FGP_TV(np::ndarray input, double d_mu, int iter, double d_epsil, int methTV) {
+//bp::list FGP_TV(np::ndarray input, double d_mu, int iter, double d_epsil, int methTV) {
 
-	// the result is in the following list
-	bp::list result;
+	//// the result is in the following list
+	//bp::list result;
 
-	int number_of_dims, dimX, dimY, dimZ, ll, j, count;
-	float *A, *D = NULL, *D_old = NULL, *P1 = NULL, *P2 = NULL, *P3 = NULL, *P1_old = NULL, *P2_old = NULL, *P3_old = NULL, *R1 = NULL, *R2 = NULL, *R3 = NULL;
-	float lambda, tk, tkp1, re, re1, re_old, epsil, funcval;
+	//int number_of_dims, dimX, dimY, dimZ, ll, j, count;
+	//float *A, *D = NULL, *D_old = NULL, *P1 = NULL, *P2 = NULL, *P3 = NULL, *P1_old = NULL, *P2_old = NULL, *P3_old = NULL, *R1 = NULL, *R2 = NULL, *R3 = NULL;
+	//float lambda, tk, tkp1, re, re1, re_old, epsil, funcval;
 
-	//number_of_dims = mxGetNumberOfDimensions(prhs[0]);
-	//dim_array = mxGetDimensions(prhs[0]);
-
-	number_of_dims = input.get_nd();
-	int dim_array[3];
+	////number_of_dims = mxGetNumberOfDimensions(prhs[0]);
+	////dim_array = mxGetDimensions(prhs[0]);
 
-	dim_array[0] = input.shape(0);
-	dim_array[1] = input.shape(1);
-	if (number_of_dims == 2) {
-		dim_array[2] = -1;
-	}
-	else {
-		dim_array[2] = input.shape(2);
-	}
-	// Parameter handling is be done in Python
-	///*Handling Matlab input data*/
-	//if ((nrhs < 2) || (nrhs > 5)) mexErrMsgTxt("At least 2 parameters is required: Image(2D/3D), Regularization parameter. The full list of parameters: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1')");
-
-	///*Handling Matlab input data*/
-	//A = (float *)mxGetData(prhs[0]); /*noisy image (2D/3D) */
-	A = reinterpret_cast<float *>(input.get_data());
-
-	//mu = (float)mxGetScalar(prhs[1]); /* regularization parameter */
-	lambda = (float)d_mu;
-
-	//iter = 35; /* default iterations number */
+	//number_of_dims = input.get_nd();
+	//int dim_array[3];
 
-	//epsil = 0.0001; /* default tolerance constant */
-	epsil = (float)d_epsil;
-	//methTV = 0;  /* default isotropic TV penalty */
-	//if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5))  iter = (int)mxGetScalar(prhs[2]); /* iterations number */
-	//if ((nrhs == 4) || (nrhs == 5))  epsil = (float)mxGetScalar(prhs[3]); /* tolerance constant */
-	//if (nrhs == 5) {
-	//	char *penalty_type;
-	//	penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */
-	//	if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',");
-	//	if (strcmp(penalty_type, "l1") == 0)  methTV = 1;  /* enable 'l1' penalty */
-	//	mxFree(penalty_type);
+	//dim_array[0] = input.shape(0);
+	//dim_array[1] = input.shape(1);
+	//if (number_of_dims == 2) {
+		//dim_array[2] = -1;
 	//}
-	//if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) { mexErrMsgTxt("The input image must be in a single precision"); }
-
-	//plhs[1] = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);
-	bp::tuple shape1 = bp::make_tuple(dim_array[0], dim_array[1]);
-	np::dtype dtype = np::dtype::get_builtin<float>();
-	np::ndarray out1 = np::zeros(shape1, dtype);
+	//else {
+		//dim_array[2] = input.shape(2);
+	//}
+	//// Parameter handling is be done in Python
+	/////*Handling Matlab input data*/
+	////if ((nrhs < 2) || (nrhs > 5)) mexErrMsgTxt("At least 2 parameters is required: Image(2D/3D), Regularization parameter. The full list of parameters: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1')");
+
+	/////*Handling Matlab input data*/
+	////A = (float *)mxGetData(prhs[0]); /*noisy image (2D/3D) */
+	//A = reinterpret_cast<float *>(input.get_data());
+
+	////mu = (float)mxGetScalar(prhs[1]); /* regularization parameter */
+	//lambda = (float)d_mu;
+
+	////iter = 35; /* default iterations number */
+
+	////epsil = 0.0001; /* default tolerance constant */
+	//epsil = (float)d_epsil;
+	////methTV = 0;  /* default isotropic TV penalty */
+	////if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5))  iter = (int)mxGetScalar(prhs[2]); /* iterations number */
+	////if ((nrhs == 4) || (nrhs == 5))  epsil = (float)mxGetScalar(prhs[3]); /* tolerance constant */
+	////if (nrhs == 5) {
+	////	char *penalty_type;
+	////	penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */
+	////	if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',");
+	////	if (strcmp(penalty_type, "l1") == 0)  methTV = 1;  /* enable 'l1' penalty */
+	////	mxFree(penalty_type);
+	////}
+	////if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) { mexErrMsgTxt("The input image must be in a single precision"); }
+
+	////plhs[1] = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);
+	//bp::tuple shape1 = bp::make_tuple(dim_array[0], dim_array[1]);
+	//np::dtype dtype = np::dtype::get_builtin<float>();
+	//np::ndarray out1 = np::zeros(shape1, dtype);
 	
-	//float *funcvalA = (float *)mxGetData(plhs[1]);
-	float * funcvalA = reinterpret_cast<float *>(out1.get_data());
-	//if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) { mexErrMsgTxt("The input image must be in a single precision"); }
-
-	/*Handling Matlab output data*/
-	dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];
-
-	tk = 1.0f;
-	tkp1 = 1.0f;
-	count = 1;
-	re_old = 0.0f;
-
-	if (number_of_dims == 2) {
-		dimZ = 1; /*2D case*/
-		/*D = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		D_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		P1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		P2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		P1_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		P2_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		R1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		R2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));*/
-
-		bp::tuple shape = bp::make_tuple(dim_array[0], dim_array[1]);
-		np::dtype dtype = np::dtype::get_builtin<float>();
-
-
-		np::ndarray npD      = np::zeros(shape, dtype);
-		np::ndarray npD_old  = np::zeros(shape, dtype);
-		np::ndarray npP1     = np::zeros(shape, dtype);
-		np::ndarray npP2     = np::zeros(shape, dtype);
-		np::ndarray npP1_old = np::zeros(shape, dtype);
-		np::ndarray npP2_old = np::zeros(shape, dtype);
-		np::ndarray npR1     = np::zeros(shape, dtype);
-		np::ndarray npR2     = np::zeros(shape, dtype);
-
-		D      = reinterpret_cast<float *>(npD.get_data());
-		D_old  = reinterpret_cast<float *>(npD_old.get_data());
-		P1     = reinterpret_cast<float *>(npP1.get_data());
-		P2     = reinterpret_cast<float *>(npP2.get_data());
-		P1_old = reinterpret_cast<float *>(npP1_old.get_data());
-		P2_old = reinterpret_cast<float *>(npP2_old.get_data());
-		R1     = reinterpret_cast<float *>(npR1.get_data());
-		R2     = reinterpret_cast<float *>(npR2.get_data());
-
-		/* begin iterations */
-		for (ll = 0; ll<iter; ll++) {
-			/* computing the gradient of the objective function */
-			Obj_func2D(A, D, R1, R2, lambda, dimX, dimY);
-
-			/*Taking a step towards minus of the gradient*/
-			Grad_func2D(P1, P2, D, R1, R2, lambda, dimX, dimY);
+	////float *funcvalA = (float *)mxGetData(plhs[1]);
+	//float * funcvalA = reinterpret_cast<float *>(out1.get_data());
+	////if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) { mexErrMsgTxt("The input image must be in a single precision"); }
+
+	///*Handling Matlab output data*/
+	//dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];
+
+	//tk = 1.0f;
+	//tkp1 = 1.0f;
+	//count = 1;
+	//re_old = 0.0f;
+
+	//if (number_of_dims == 2) {
+		//dimZ = 1; /*2D case*/
+		///*D = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
+		//D_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
+		//P1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
+		//P2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
+		//P1_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
+		//P2_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
+		//R1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
+		//R2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));*/
+
+		//bp::tuple shape = bp::make_tuple(dim_array[0], dim_array[1]);
+		//np::dtype dtype = np::dtype::get_builtin<float>();
+
+
+		//np::ndarray npD      = np::zeros(shape, dtype);
+		//np::ndarray npD_old  = np::zeros(shape, dtype);
+		//np::ndarray npP1     = np::zeros(shape, dtype);
+		//np::ndarray npP2     = np::zeros(shape, dtype);
+		//np::ndarray npP1_old = np::zeros(shape, dtype);
+		//np::ndarray npP2_old = np::zeros(shape, dtype);
+		//np::ndarray npR1     = np::zeros(shape, dtype);
+		//np::ndarray npR2     = np::zeros(shape, dtype);
+
+		//D      = reinterpret_cast<float *>(npD.get_data());
+		//D_old  = reinterpret_cast<float *>(npD_old.get_data());
+		//P1     = reinterpret_cast<float *>(npP1.get_data());
+		//P2     = reinterpret_cast<float *>(npP2.get_data());
+		//P1_old = reinterpret_cast<float *>(npP1_old.get_data());
+		//P2_old = reinterpret_cast<float *>(npP2_old.get_data());
+		//R1     = reinterpret_cast<float *>(npR1.get_data());
+		//R2     = reinterpret_cast<float *>(npR2.get_data());
+
+		///* begin iterations */
+		//for (ll = 0; ll<iter; ll++) {
+			///* computing the gradient of the objective function */
+			//Obj_func2D(A, D, R1, R2, lambda, dimX, dimY);
+
+			///*Taking a step towards minus of the gradient*/
+			//Grad_func2D(P1, P2, D, R1, R2, lambda, dimX, dimY);
   
-            /* projection step */
-            Proj_func2D(P1, P2, methTV, dimX, dimY);
+            ///* projection step */
+            //Proj_func2D(P1, P2, methTV, dimX, dimY);
             
-            /*updating R and t*/
-            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-            Rupd_func2D(P1, P1_old, P2, P2_old, R1, R2, tkp1, tk, dimX, dimY);
-
-			/* calculate norm */
-			re = 0.0f; re1 = 0.0f;
-			for (j = 0; j<dimX*dimY*dimZ; j++)
-			{
-				re += pow(D[j] - D_old[j], 2);
-				re1 += pow(D[j], 2);
-			}
-			re = sqrt(re) / sqrt(re1);
-			if (re < epsil)  count++;
-			if (count > 4) {
-				Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
-				funcval = 0.0f;
-				for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-				//funcvalA[0] = sqrt(funcval);
-				float fv = sqrt(funcval);
-				std::memcpy(funcvalA, &fv, sizeof(float));
-				break;
-			}
-
-			/* check that the residual norm is decreasing */
-			if (ll > 2) {
-				if (re > re_old) {
-					Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
-					funcval = 0.0f;
-					for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-					//funcvalA[0] = sqrt(funcval);
-					float fv = sqrt(funcval);
-					std::memcpy(funcvalA, &fv, sizeof(float));
-					break;
-				}
-			}
-			re_old = re;
-			/*printf("%f %i %i \n", re, ll, count); */
-
-			/*storing old values*/
-			copyIm(D, D_old, dimX, dimY, dimZ);
-			copyIm(P1, P1_old, dimX, dimY, dimZ);
-			copyIm(P2, P2_old, dimX, dimY, dimZ);
-			tk = tkp1;
-
-			/* calculating the objective function value */
-			if (ll == (iter - 1)) {
-				Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
-				funcval = 0.0f;
-				for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-				//funcvalA[0] = sqrt(funcval);
-				float fv = sqrt(funcval);
-				std::memcpy(funcvalA, &fv, sizeof(float));
-			}
-		}
-		//printf("FGP-TV iterations stopped at iteration %i with the function value %f \n", ll, funcvalA[0]);
-		result.append<np::ndarray>(npD);
-		result.append<np::ndarray>(out1);
-		result.append<int>(ll);
-	}
-	if (number_of_dims == 3) {
-		/*D = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		D_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		P1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		P2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		P3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		P1_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		P2_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		P3_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		R1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		R2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		R3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));*/
-		bp::tuple shape = bp::make_tuple(dim_array[0], dim_array[1], dim_array[2]);
-		np::dtype dtype = np::dtype::get_builtin<float>();
+            ///*updating R and t*/
+            //tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+            //Rupd_func2D(P1, P1_old, P2, P2_old, R1, R2, tkp1, tk, dimX, dimY);
+
+			///* calculate norm */
+			//re = 0.0f; re1 = 0.0f;
+			//for (j = 0; j<dimX*dimY*dimZ; j++)
+			//{
+				//re += pow(D[j] - D_old[j], 2);
+				//re1 += pow(D[j], 2);
+			//}
+			//re = sqrt(re) / sqrt(re1);
+			//if (re < epsil)  count++;
+			//if (count > 4) {
+				//Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
+				//funcval = 0.0f;
+				//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
+				////funcvalA[0] = sqrt(funcval);
+				//float fv = sqrt(funcval);
+				//std::memcpy(funcvalA, &fv, sizeof(float));
+				//break;
+			//}
+
+			///* check that the residual norm is decreasing */
+			//if (ll > 2) {
+				//if (re > re_old) {
+					//Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
+					//funcval = 0.0f;
+					//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
+					////funcvalA[0] = sqrt(funcval);
+					//float fv = sqrt(funcval);
+					//std::memcpy(funcvalA, &fv, sizeof(float));
+					//break;
+				//}
+			//}
+			//re_old = re;
+			///*printf("%f %i %i \n", re, ll, count); */
+
+			///*storing old values*/
+			//copyIm(D, D_old, dimX, dimY, dimZ);
+			//copyIm(P1, P1_old, dimX, dimY, dimZ);
+			//copyIm(P2, P2_old, dimX, dimY, dimZ);
+			//tk = tkp1;
+
+			///* calculating the objective function value */
+			//if (ll == (iter - 1)) {
+				//Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
+				//funcval = 0.0f;
+				//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
+				////funcvalA[0] = sqrt(funcval);
+				//float fv = sqrt(funcval);
+				//std::memcpy(funcvalA, &fv, sizeof(float));
+			//}
+		//}
+		////printf("FGP-TV iterations stopped at iteration %i with the function value %f \n", ll, funcvalA[0]);
+		//result.append<np::ndarray>(npD);
+		//result.append<np::ndarray>(out1);
+		//result.append<int>(ll);
+	//}
+	//if (number_of_dims == 3) {
+		///*D = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+		//D_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+		//P1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+		//P2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+		//P3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+		//P1_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+		//P2_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+		//P3_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+		//R1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+		//R2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
+		//R3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));*/
+		//bp::tuple shape = bp::make_tuple(dim_array[0], dim_array[1], dim_array[2]);
+		//np::dtype dtype = np::dtype::get_builtin<float>();
 		
-		np::ndarray npD      = np::zeros(shape, dtype);
-		np::ndarray npD_old  = np::zeros(shape, dtype);
-		np::ndarray npP1     = np::zeros(shape, dtype);
-		np::ndarray npP2     = np::zeros(shape, dtype);
-		np::ndarray npP3     = np::zeros(shape, dtype);
-		np::ndarray npP1_old = np::zeros(shape, dtype);
-		np::ndarray npP2_old = np::zeros(shape, dtype);
-		np::ndarray npP3_old = np::zeros(shape, dtype);
-		np::ndarray npR1     = np::zeros(shape, dtype);
-		np::ndarray npR2     = np::zeros(shape, dtype);
-		np::ndarray npR3     = np::zeros(shape, dtype);
-
-		D      = reinterpret_cast<float *>(npD.get_data());
-		D_old  = reinterpret_cast<float *>(npD_old.get_data());
-		P1     = reinterpret_cast<float *>(npP1.get_data());
-		P2     = reinterpret_cast<float *>(npP2.get_data());
-		P3     = reinterpret_cast<float *>(npP3.get_data());
-		P1_old = reinterpret_cast<float *>(npP1_old.get_data());
-		P2_old = reinterpret_cast<float *>(npP2_old.get_data());
-		P3_old = reinterpret_cast<float *>(npP3_old.get_data());
-		R1     = reinterpret_cast<float *>(npR1.get_data());
-		R2     = reinterpret_cast<float *>(npR2.get_data());
-		R3     = reinterpret_cast<float *>(npR3.get_data());
-		/* begin iterations */
-		for (ll = 0; ll<iter; ll++) {
-			/* computing the gradient of the objective function */
-			Obj_func3D(A, D, R1, R2, R3, lambda, dimX, dimY, dimZ);
-			/*Taking a step towards minus of the gradient*/
-			Grad_func3D(P1, P2, P3, D, R1, R2, R3, lambda, dimX, dimY, dimZ);
-
-			/* projection step */
-			Proj_func3D(P1, P2, P3, dimX, dimY, dimZ);
-
-			/*updating R and t*/
-			tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-			Rupd_func3D(P1, P1_old, P2, P2_old, P3, P3_old, R1, R2, R3, tkp1, tk, dimX, dimY, dimZ);
-
-			/* calculate norm - stopping rules*/
-			re = 0.0f; re1 = 0.0f;
-			for (j = 0; j<dimX*dimY*dimZ; j++)
-			{
-				re += pow(D[j] - D_old[j], 2);
-				re1 += pow(D[j], 2);
-			}
-			re = sqrt(re) / sqrt(re1);
-			/* stop if the norm residual is less than the tolerance EPS */
-			if (re < epsil)  count++;
-			if (count > 3) {
-				Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ);
-				funcval = 0.0f;
-				for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-				//funcvalA[0] = sqrt(funcval);
-				float fv = sqrt(funcval);
-				std::memcpy(funcvalA, &fv, sizeof(float));
-				break;
-			}
-
-			/* check that the residual norm is decreasing */
-			if (ll > 2) {
-				if (re > re_old) {
-					Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ);
-					funcval = 0.0f;
-					for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-					//funcvalA[0] = sqrt(funcval);
-					float fv = sqrt(funcval);
-					std::memcpy(funcvalA, &fv, sizeof(float));
-					break;
-				}
-			}
-
-			re_old = re;
-			/*printf("%f %i %i \n", re, ll, count); */
-
-			/*storing old values*/
-			copyIm(D, D_old, dimX, dimY, dimZ);
-			copyIm(P1, P1_old, dimX, dimY, dimZ);
-			copyIm(P2, P2_old, dimX, dimY, dimZ);
-			copyIm(P3, P3_old, dimX, dimY, dimZ);
-			tk = tkp1;
-
-			if (ll == (iter - 1)) {
-				Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ);
-				funcval = 0.0f;
-				for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-				//funcvalA[0] = sqrt(funcval);
-				float fv = sqrt(funcval);
-				std::memcpy(funcvalA, &fv, sizeof(float));
-			}
-
-		}
-		//printf("FGP-TV iterations stopped at iteration %i with the function value %f \n", ll, funcvalA[0]);
-		result.append<np::ndarray>(npD);
-		result.append<np::ndarray>(out1);
-		result.append<int>(ll);
-	}
+		//np::ndarray npD      = np::zeros(shape, dtype);
+		//np::ndarray npD_old  = np::zeros(shape, dtype);
+		//np::ndarray npP1     = np::zeros(shape, dtype);
+		//np::ndarray npP2     = np::zeros(shape, dtype);
+		//np::ndarray npP3     = np::zeros(shape, dtype);
+		//np::ndarray npP1_old = np::zeros(shape, dtype);
+		//np::ndarray npP2_old = np::zeros(shape, dtype);
+		//np::ndarray npP3_old = np::zeros(shape, dtype);
+		//np::ndarray npR1     = np::zeros(shape, dtype);
+		//np::ndarray npR2     = np::zeros(shape, dtype);
+		//np::ndarray npR3     = np::zeros(shape, dtype);
+
+		//D      = reinterpret_cast<float *>(npD.get_data());
+		//D_old  = reinterpret_cast<float *>(npD_old.get_data());
+		//P1     = reinterpret_cast<float *>(npP1.get_data());
+		//P2     = reinterpret_cast<float *>(npP2.get_data());
+		//P3     = reinterpret_cast<float *>(npP3.get_data());
+		//P1_old = reinterpret_cast<float *>(npP1_old.get_data());
+		//P2_old = reinterpret_cast<float *>(npP2_old.get_data());
+		//P3_old = reinterpret_cast<float *>(npP3_old.get_data());
+		//R1     = reinterpret_cast<float *>(npR1.get_data());
+		//R2     = reinterpret_cast<float *>(npR2.get_data());
+		//R3     = reinterpret_cast<float *>(npR3.get_data());
+		///* begin iterations */
+		//for (ll = 0; ll<iter; ll++) {
+			///* computing the gradient of the objective function */
+			//Obj_func3D(A, D, R1, R2, R3, lambda, dimX, dimY, dimZ);
+			///*Taking a step towards minus of the gradient*/
+			//Grad_func3D(P1, P2, P3, D, R1, R2, R3, lambda, dimX, dimY, dimZ);
+
+			///* projection step */
+			//Proj_func3D(P1, P2, P3, dimX, dimY, dimZ);
+
+			///*updating R and t*/
+			//tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
+			//Rupd_func3D(P1, P1_old, P2, P2_old, P3, P3_old, R1, R2, R3, tkp1, tk, dimX, dimY, dimZ);
+
+			///* calculate norm - stopping rules*/
+			//re = 0.0f; re1 = 0.0f;
+			//for (j = 0; j<dimX*dimY*dimZ; j++)
+			//{
+				//re += pow(D[j] - D_old[j], 2);
+				//re1 += pow(D[j], 2);
+			//}
+			//re = sqrt(re) / sqrt(re1);
+			///* stop if the norm residual is less than the tolerance EPS */
+			//if (re < epsil)  count++;
+			//if (count > 3) {
+				//Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ);
+				//funcval = 0.0f;
+				//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
+				////funcvalA[0] = sqrt(funcval);
+				//float fv = sqrt(funcval);
+				//std::memcpy(funcvalA, &fv, sizeof(float));
+				//break;
+			//}
+
+			///* check that the residual norm is decreasing */
+			//if (ll > 2) {
+				//if (re > re_old) {
+					//Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ);
+					//funcval = 0.0f;
+					//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
+					////funcvalA[0] = sqrt(funcval);
+					//float fv = sqrt(funcval);
+					//std::memcpy(funcvalA, &fv, sizeof(float));
+					//break;
+				//}
+			//}
+
+			//re_old = re;
+			///*printf("%f %i %i \n", re, ll, count); */
+
+			///*storing old values*/
+			//copyIm(D, D_old, dimX, dimY, dimZ);
+			//copyIm(P1, P1_old, dimX, dimY, dimZ);
+			//copyIm(P2, P2_old, dimX, dimY, dimZ);
+			//copyIm(P3, P3_old, dimX, dimY, dimZ);
+			//tk = tkp1;
+
+			//if (ll == (iter - 1)) {
+				//Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ);
+				//funcval = 0.0f;
+				//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
+				////funcvalA[0] = sqrt(funcval);
+				//float fv = sqrt(funcval);
+				//std::memcpy(funcvalA, &fv, sizeof(float));
+			//}
+
+		//}
+		////printf("FGP-TV iterations stopped at iteration %i with the function value %f \n", ll, funcvalA[0]);
+		//result.append<np::ndarray>(npD);
+		//result.append<np::ndarray>(out1);
+		//result.append<int>(ll);
+	//}
 
-	return result;
-}
+	//return result;
+//}
 
 bp::list LLT_model(np::ndarray input, double d_lambda, double d_tau, int iter, double d_epsil, int switcher) {
 	// the result is in the following list
diff --git a/Wrappers/Python/src/gpu_regularizers.pyx b/Wrappers/Python/src/gpu_regularizers.pyx
index c724471..263fa4a 100644
--- a/Wrappers/Python/src/gpu_regularizers.pyx
+++ b/Wrappers/Python/src/gpu_regularizers.pyx
@@ -25,7 +25,9 @@ cdef extern void NLM_GPU_kernel(float *A, float* B, float *Eucl_Vec,
                                 int N, int M,  int Z, int dimension, 
                                 int SearchW, int SimilW, 
                                 int SearchW_real, float denh2, float lambdaf);
-cdef extern void TV_ROF_GPU_kernel(float* A, float* B, int N, int M, int Z, int iter, float tau, float lambdaf);
+cdef extern void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambdaf);
+cdef extern void TV_FGP_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
+
 cdef extern float pad_crop(float *A, float *Ap, 
                            int OldSizeX, int OldSizeY, int OldSizeZ, 
                            int NewSizeX, int NewSizeY, int NewSizeZ, 
@@ -343,7 +345,7 @@ def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1]], dtype='float32')
           
     # Running CUDA code here    
-    TV_ROF_GPU_kernel(            
+    TV_ROF_GPU(            
             &inputData[0,0], &B[0,0], 
                        dims[0], dims[1], 1, 
                        iterations , 
@@ -366,7 +368,7 @@ def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
           
     # Running CUDA code here    
-    TV_ROF_GPU_kernel(            
+    TV_ROF_GPU(            
             &inputData[0,0,0], &B[0,0,0], 
                        dims[0], dims[1], dims[2], 
                        iterations , 
@@ -374,3 +376,64 @@ def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
                        regularization_parameter);   
      
     return B
+
+
+def TVFGP2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
+                     float regularization_parameter,
+                     int iterations, 
+                     float tolerance_param,
+                     int methodTV,
+                     int nonneg,
+                     int printM):
+    
+    cdef long dims[2]
+    dims[0] = inputData.shape[0]
+    dims[1] = inputData.shape[1]
+
+    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] B = \
+		    np.zeros([dims[0],dims[1]], dtype='float32')
+          
+    # Running CUDA code here    
+    TV_FGP_GPU(            
+            &inputData[0,0], &B[0,0],                        
+                       regularization_parameter , 
+                       iterations, 
+                       tolerance_param,
+                       methodTV,
+                       nonneg,
+                       printM,
+                       dims[0], dims[1], 1);   
+     
+    return B
+    
+def TVFGP3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
+                     float regularization_parameter,
+                     int iterations, 
+                     float tolerance_param,
+                     int methodTV,
+                     int nonneg,
+                     int printM):
+    
+    cdef long dims[3]
+    dims[0] = inputData.shape[0]
+    dims[1] = inputData.shape[1]
+    dims[2] = inputData.shape[2]
+
+    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] B = \
+		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
+          
+    # Running CUDA code here    
+    TV_FGP_GPU(            
+            &inputData[0,0,0], &B[0,0,0], 
+                       regularization_parameter , 
+                       iterations, 
+                       tolerance_param,
+                       methodTV,
+                       nonneg,
+                       printM,
+                       dims[0], dims[1], dims[2]);   
+     
+    return B    
+    
+    
+    
diff --git a/Wrappers/Python/test/test_cpu_vs_gpu.py b/Wrappers/Python/test/test_cpu_vs_gpu.py
new file mode 100644
index 0000000..74d65dd
--- /dev/null
+++ b/Wrappers/Python/test/test_cpu_vs_gpu.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Feb 21 12:12:22 2018
+
+# CPU vs GPU comparison tests
+
+@author: algol
+"""
+
diff --git a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
index d742a7f..6344021 100644
--- a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
@@ -12,8 +12,8 @@ import matplotlib.pyplot as plt
 import numpy as np
 import os    
 import timeit
-from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, GPU_ROF_TV
-from ccpi.filters.cpu_regularizers_cython import ROF_TV
+from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, TV_ROF_GPU
+from ccpi.filters.cpu_regularizers_cython import TV_ROF_CPU
 ###############################################################################
 def printParametersToString(pars):
         txt = r''
@@ -64,7 +64,7 @@ pars = {'algorithm': ROF_TV , \
         }
 print ("#################ROF TV CPU#####################")
 start_time = timeit.default_timer()
-rof_cpu = ROF_TV(pars['input'],
+rof_cpu = TV_ROF_CPU(pars['input'],
              pars['number_of_iterations'],
              pars['regularization_parameter'],
              pars['time_marching_parameter'] 
@@ -89,7 +89,7 @@ plt.title('{}'.format('CPU results'))
 
 print ("#################ROF TV GPU#####################")
 start_time = timeit.default_timer()
-rof_gpu = GPU_ROF_TV(pars['input'], 
+rof_gpu = TV_ROF_GPU(pars['input'], 
                      pars['number_of_iterations'], 
                      pars['time_marching_parameter'], 
                      pars['regularization_parameter'])
-- 
cgit v1.2.3


From 8082a76d4dfd9588590bab3fd26eae976b744a94 Mon Sep 17 00:00:00 2001
From: Daniil Kazantsev <dkazanc@hotmail.com>
Date: Mon, 26 Feb 2018 13:20:33 +0000
Subject: cpu-related cython file updated #33

---
 Wrappers/Python/src/cpu_regularizers.pyx | 77 +++++++++++++++++++++++++++-----
 Wrappers/Python/src/gpu_regularizers.pyx |  2 +-
 2 files changed, 68 insertions(+), 11 deletions(-)

diff --git a/Wrappers/Python/src/cpu_regularizers.pyx b/Wrappers/Python/src/cpu_regularizers.pyx
index e7ff78c..b8089a8 100644
--- a/Wrappers/Python/src/cpu_regularizers.pyx
+++ b/Wrappers/Python/src/cpu_regularizers.pyx
@@ -18,20 +18,20 @@ import cython
 import numpy as np
 cimport numpy as np
 
-cdef extern float TV_ROF(float *A, float *B, int dimX, int dimY, int dimZ, 
-                         int iterationsNumb, float tau, float flambda);
+cdef extern float TV_ROF_CPU(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float flambda);
+cdef extern float TV_FGP_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
 
-
-def ROF_TV(inputData, iterations, regularization_parameter, 
+# Can we use the same name here in "def" as the C function? 
+def TV_ROF_CPU(inputData, iterations, regularization_parameter, 
            marching_step_parameter):
     if inputData.ndim == 2:
-        return ROF_TV_2D(inputData, iterations, regularization_parameter, 
+        return TV_ROF_2D(inputData, iterations, regularization_parameter, 
                          marching_step_parameter)
     elif inputData.ndim == 3:
-        return ROF_TV_3D(inputData, iterations, regularization_parameter, 
+        return TV_ROF_3D(inputData, iterations, regularization_parameter, 
                          marching_step_parameter)
 
-def ROF_TV_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
+def TV_ROF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
                      int iterations,
                      float regularization_parameter,
                      float marching_step_parameter
@@ -45,13 +45,13 @@ def ROF_TV_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
             np.zeros([dims[0],dims[1]], dtype='float32')
                    
     #/* Run ROF iterations for 2D data */
-    TV_ROF(&inputData[0,0], &B[0,0], dims[0], dims[1], 1, iterations,
+    TV_ROF_CPU(&inputData[0,0], &B[0,0], dims[0], dims[1], 1, iterations,
                marching_step_parameter, regularization_parameter)
     
     return B
         
             
-def ROF_TV_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
+def TV_ROF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      int iterations,
                      float regularization_parameter,
                      float marching_step_parameter
@@ -65,8 +65,65 @@ def ROF_TV_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
             np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
            
     #/* Run ROF iterations for 3D data */
-    TV_ROF(&inputData[0,0, 0], &B[0,0, 0], dims[0], dims[1], dims[2], iterations,
+    TV_FGP_CPU(&inputData[0,0,0], &B[0,0,0], dims[0], dims[1], dims[2], iterations,
              marching_step_parameter, regularization_parameter)
 
     return B    
                      
+def TV_FGP_CPU(inputData, regularization_parameter, iterations, tolerance_param, methodTV, nonneg, printM):
+    if inputData.ndim == 2:
+        return TV_FGP_2D(inputData, regularization_parameter, iterations, tolerance_param, methodTV, nonneg, printM)
+    elif inputData.ndim == 3:
+        return TV_FGP_3D(inputData, regularization_parameter, iterations, tolerance_param, methodTV, nonneg, printM)
+
+def TV_FGP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
+                     float regularization_parameter,
+                     int iterations, 
+                     float tolerance_param,
+                     int methodTV,
+                     int nonneg,
+                     int printM):
+                         
+    cdef long dims[2]
+    dims[0] = inputData.shape[0]
+    dims[1] = inputData.shape[1]
+    
+    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] B = \
+            np.zeros([dims[0],dims[1]], dtype='float32')
+                   
+    #/* Run ROF iterations for 2D data */
+    TV_FGP_CPU(&inputData[0,0], &B[0,0], regularization_parameter, 
+                       iterations, 
+                       tolerance_param,
+                       methodTV,
+                       nonneg,
+                       printM,
+                       dims[0], dims[1], 1)
+    
+    return B
+        
+            
+def TV_FGP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
+ float regularization_parameter,
+                     int iterations, 
+                     float tolerance_param,
+                     int methodTV,
+                     int nonneg,
+                     int printM):
+    cdef long dims[2]
+    dims[0] = inputData.shape[0]
+    dims[1] = inputData.shape[1]
+    dims[2] = inputData.shape[2]
+    
+    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] B = \
+            np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
+           
+    #/* Run ROF iterations for 3D data */
+    TV_FGP_CPU(&inputData[0,0, 0], &B[0,0, 0], iterations, 
+                       tolerance_param,
+                       methodTV,
+                       nonneg,
+                       printM,
+                       dims[0], dims[1], dims[2])
+
+    return B  
diff --git a/Wrappers/Python/src/gpu_regularizers.pyx b/Wrappers/Python/src/gpu_regularizers.pyx
index 263fa4a..a14a20d 100644
--- a/Wrappers/Python/src/gpu_regularizers.pyx
+++ b/Wrappers/Python/src/gpu_regularizers.pyx
@@ -396,7 +396,7 @@ def TVFGP2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
     # Running CUDA code here    
     TV_FGP_GPU(            
             &inputData[0,0], &B[0,0],                        
-                       regularization_parameter , 
+                       regularization_parameter, 
                        iterations, 
                        tolerance_param,
                        methodTV,
-- 
cgit v1.2.3


From 74ff5b5f319b2f7ea3e078c62041ec8a1bb28335 Mon Sep 17 00:00:00 2001
From: algol <dkazanc@hotmail.com>
Date: Mon, 5 Mar 2018 18:12:01 +0000
Subject: Cmake/Cython fixes to compile ROF-FGP

---
 Core/CMakeLists.txt                                |  3 ++-
 Core/regularizers_CPU/FGP_TV_core.c                | 12 +++++------
 Core/regularizers_CPU/FGP_TV_core.h                |  2 +-
 Core/regularizers_CPU/ROF_TV_core.c                |  4 ++--
 Core/regularizers_CPU/ROF_TV_core.h                |  2 +-
 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu    | 12 +++++------
 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h     |  2 +-
 Wrappers/Python/demo/test_cpu_regularizers.py      | 25 ++++++++++++----------
 Wrappers/Python/src/cpu_regularizers.cpp           |  1 -
 Wrappers/Python/src/cpu_regularizers.pyx           | 25 +++++++++++-----------
 Wrappers/Python/src/gpu_regularizers.pyx           |  2 +-
 .../Python/test/test_cpu_vs_gpu_regularizers.py    |  9 ++++----
 12 files changed, 51 insertions(+), 48 deletions(-)

diff --git a/Core/CMakeLists.txt b/Core/CMakeLists.txt
index 3e3f89e..68b7111 100644
--- a/Core/CMakeLists.txt
+++ b/Core/CMakeLists.txt
@@ -129,7 +129,8 @@ if (CUDA_FOUND)
   CUDA_ADD_LIBRARY(cilregcuda SHARED
 	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/Diffus_HO/Diff4th_GPU_kernel.cu
 	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/NL_Regul/NLM_GPU_kernel.cu
-	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/TV_ROF/TV_ROF_GPU.cu
+	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu
+	${CMAKE_CURRENT_SOURCE_DIR}/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
   )
   if (UNIX)
     message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib")
diff --git a/Core/regularizers_CPU/FGP_TV_core.c b/Core/regularizers_CPU/FGP_TV_core.c
index 2f1439d..5365631 100644
--- a/Core/regularizers_CPU/FGP_TV_core.c
+++ b/Core/regularizers_CPU/FGP_TV_core.c
@@ -23,7 +23,7 @@ limitations under the License.
  *
  * Input Parameters:
  * 1. Noisy image/volume 
- * 2. lambda - regularization parameter 
+ * 2. lambdaPar - regularization parameter 
  * 3. Number of iterations
  * 4. eplsilon: tolerance constant 
  * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
@@ -37,7 +37,7 @@ limitations under the License.
  * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
  */
  
-float TV_FGP_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
 {
 	int ll, j, DimTotal;
 	float re, re1;
@@ -62,13 +62,13 @@ float TV_FGP_CPU(float *Input, float *Output, float lambda, int iter, float epsi
         for(ll=0; ll<iter; ll++) {
             
             /* computing the gradient of the objective function */
-            Obj_func2D(Input, Output, R1, R2, lambda, dimX, dimY);
+            Obj_func2D(Input, Output, R1, R2, lambdaPar, dimX, dimY);
             
             /* apply nonnegativity */
             if (nonneg == 1) for(j=0; j<dimX*dimY; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;}            
             
             /*Taking a step towards minus of the gradient*/
-            Grad_func2D(P1, P2, Output, R1, R2, lambda, dimX, dimY);
+            Grad_func2D(P1, P2, Output, R1, R2, lambdaPar, dimX, dimY);
             
             /* projection step */
             Proj_func2D(P1, P2, methodTV, dimX, dimY);
@@ -117,13 +117,13 @@ float TV_FGP_CPU(float *Input, float *Output, float lambda, int iter, float epsi
         for(ll=0; ll<iter; ll++) {
             
             /* computing the gradient of the objective function */
-            Obj_func3D(Input, Output, R1, R2, R3, lambda, dimX, dimY, dimZ);
+            Obj_func3D(Input, Output, R1, R2, R3, lambdaPar, dimX, dimY, dimZ);
             
             /* apply nonnegativity */
             if (nonneg == 1) for(j=0; j<dimX*dimY*dimZ; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;}  
             
             /*Taking a step towards minus of the gradient*/
-            Grad_func3D(P1, P2, P3, Output, R1, R2, R3, lambda, dimX, dimY, dimZ);
+            Grad_func3D(P1, P2, P3, Output, R1, R2, R3, lambdaPar, dimX, dimY, dimZ);
             
             /* projection step */
             Proj_func3D(P1, P2, P3, methodTV, dimX, dimY, dimZ);
diff --git a/Core/regularizers_CPU/FGP_TV_core.h b/Core/regularizers_CPU/FGP_TV_core.h
index 98ceaec..360251d 100644
--- a/Core/regularizers_CPU/FGP_TV_core.h
+++ b/Core/regularizers_CPU/FGP_TV_core.h
@@ -47,7 +47,7 @@ limitations under the License.
 #ifdef __cplusplus
 extern "C" {
 #endif
-CCPI_EXPORT float TV_FGP_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
+CCPI_EXPORT float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
 
 CCPI_EXPORT float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dimX, int dimY);
 CCPI_EXPORT float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, int dimX, int dimY);
diff --git a/Core/regularizers_CPU/ROF_TV_core.c b/Core/regularizers_CPU/ROF_TV_core.c
index b2c6f00..a058f54 100644
--- a/Core/regularizers_CPU/ROF_TV_core.c
+++ b/Core/regularizers_CPU/ROF_TV_core.c
@@ -19,7 +19,7 @@
 
 #include "ROF_TV_core.h"
 
-#define EPS 1.0e-4
+#define EPS 1.0e-5
 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 
@@ -46,7 +46,7 @@ int sign(float x) {
  */
 
 /* Running iterations of TV-ROF function */
-float TV_ROF_CPU(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda)
+float TV_ROF_CPU_main(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda)
 {
     float *D1, *D2, *D3;
     int i, DimTotal;
diff --git a/Core/regularizers_CPU/ROF_TV_core.h b/Core/regularizers_CPU/ROF_TV_core.h
index b32d0d5..63c0419 100644
--- a/Core/regularizers_CPU/ROF_TV_core.h
+++ b/Core/regularizers_CPU/ROF_TV_core.h
@@ -47,7 +47,7 @@ limitations under the License.
 extern "C" {
 #endif
 CCPI_EXPORT float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambda, float tau, int dimY, int dimX, int dimZ);
-CCPI_EXPORT float TV_ROF_CPU(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda);
+CCPI_EXPORT float TV_ROF_CPU_main(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda);
 CCPI_EXPORT float D1_func(float *A, float *D1, int dimY, int dimX, int dimZ);
 CCPI_EXPORT float D2_func(float *A, float *D2, int dimY, int dimX, int dimZ);
 CCPI_EXPORT float D3_func(float *A, float *D3, int dimY, int dimX, int dimZ);
diff --git a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
index 0533a85..cbe9b5e 100755
--- a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
+++ b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
@@ -25,7 +25,7 @@ limitations under the License.
  *
  * Input Parameters:
  * 1. Noisy image/volume 
- * 2. lambda - regularization parameter 
+ * 2. lambdaPar - regularization parameter 
  * 3. Number of iterations
  * 4. eplsilon: tolerance constant 
  * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
@@ -341,7 +341,7 @@ __global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z,
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
 
 ////////////MAIN HOST FUNCTION ///////////////
-extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
 {
     int deviceCount = -1; // number of devices
     cudaGetDeviceCount(&deviceCount);
@@ -383,13 +383,13 @@ extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambda, int iter,
         cudaMemset(R2, 0, ImSize*sizeof(float));
 
         /********************** Run CUDA 2D kernel here ********************/    
-        multip = (1.0f/(8.0f*lambda));
+        multip = (1.0f/(8.0f*lambdaPar));
     
         /* The main kernel */
         for (i = 0; i < iter; i++) {
         
             /* computing the gradient of the objective function */
-            Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambda);
+            Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambdaPar);
             checkCudaErrors( cudaDeviceSynchronize() );
             checkCudaErrors(cudaPeekAtLastError() );
             
@@ -493,13 +493,13 @@ extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambda, int iter,
             cudaMemset(R2, 0, ImSize*sizeof(float));
             cudaMemset(R3, 0, ImSize*sizeof(float));
             /********************** Run CUDA 3D kernel here ********************/    
-            multip = (1.0f/(8.0f*lambda));
+            multip = (1.0f/(8.0f*lambdaPar));
     
             /* The main kernel */
         for (i = 0; i < iter; i++) {
         
             /* computing the gradient of the objective function */
-            Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambda);
+            Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambdaPar);
             checkCudaErrors( cudaDeviceSynchronize() );
             checkCudaErrors(cudaPeekAtLastError() );
         
diff --git a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
index 15c7120..2b8fdcf 100755
--- a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
+++ b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
@@ -5,6 +5,6 @@
 #ifndef _TV_FGP_GPU_
 #define _TV_FGP_GPU_
 
-extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);   
+extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);   
 
 #endif 
diff --git a/Wrappers/Python/demo/test_cpu_regularizers.py b/Wrappers/Python/demo/test_cpu_regularizers.py
index b08c418..53b8538 100644
--- a/Wrappers/Python/demo/test_cpu_regularizers.py
+++ b/Wrappers/Python/demo/test_cpu_regularizers.py
@@ -11,10 +11,9 @@ import numpy as np
 import os    
 from enum import Enum
 import timeit
-from ccpi.filters.cpu_regularizers_boost import SplitBregman_TV , FGP_TV ,\
-                                                 LLT_model, PatchBased_Regul ,\
+from ccpi.filters.cpu_regularizers_boost import SplitBregman_TV, LLT_model, PatchBased_Regul ,\
                                                  TGV_PD
-from ccpi.filters.cpu_regularizers_cython import ROF_TV
+from ccpi.filters.cpu_regularizers_cython import TV_ROF_CPU, TV_FGP_CPU
 
 ###############################################################################
 #https://stackoverflow.com/questions/13875989/comparing-image-in-url-to-image-in-filesystem-in-python/13884956#13884956
@@ -128,21 +127,25 @@ imgplot = plt.imshow(splitbregman,\
                      )
 
 ###################### FGP_TV #########################################
-# u = FGP_TV(single(u0), 0.05, 100, 1e-04);
+
 start_time = timeit.default_timer()
-pars = {'algorithm' : FGP_TV , \
+pars = {'algorithm' : TV_FGP_CPU , \
         'input' : u0,
         'regularization_parameter':0.05, \
         'number_of_iterations' :200 ,\
-        'tolerance_constant':1e-4,\
-        'TV_penalty': 0
+        'tolerance_constant':1e-5,\
+        'methodTV': 0 ,\
+        'nonneg': 0 ,\
+        'printingOut': 0
 }
 
-out = FGP_TV (pars['input'], 
+out = TV_FGP_CPU (pars['input'], 
               pars['regularization_parameter'],
               pars['number_of_iterations'],
               pars['tolerance_constant'], 
-              pars['TV_penalty'])  
+              pars['methodTV'],
+              pars['nonneg'],
+              pars['printingOut'])  
 
 fgp = out[0]
 rms = rmse(Im, fgp)
@@ -282,13 +285,13 @@ imgplot = plt.imshow(tgv, cmap="gray")
 
 start_time = timeit.default_timer()
 
-pars = {'algorithm': ROF_TV , \
+pars = {'algorithm': TV_ROF_CPU , \
         'input' : u0,\
         'regularization_parameter':0.04,\
         'marching_step': 0.0025,\
         'number_of_iterations': 300
         }
-rof = ROF_TV(pars['input'],
+rof = TV_ROF_CPU(pars['input'],
              pars['number_of_iterations'],
              pars['regularization_parameter'],
              pars['marching_step'] 
diff --git a/Wrappers/Python/src/cpu_regularizers.cpp b/Wrappers/Python/src/cpu_regularizers.cpp
index 43d5d11..b8156d5 100644
--- a/Wrappers/Python/src/cpu_regularizers.cpp
+++ b/Wrappers/Python/src/cpu_regularizers.cpp
@@ -1040,7 +1040,6 @@ BOOST_PYTHON_MODULE(cpu_regularizers_boost)
 	np::dtype dt2 = np::dtype::get_builtin<uint16_t>();
 
 	def("SplitBregman_TV", SplitBregman_TV);
-	def("FGP_TV", FGP_TV);
 	def("LLT_model", LLT_model);
 	def("PatchBased_Regul", PatchBased_Regul);
 	def("TGV_PD", TGV_PD);
diff --git a/Wrappers/Python/src/cpu_regularizers.pyx b/Wrappers/Python/src/cpu_regularizers.pyx
index b8089a8..448da31 100644
--- a/Wrappers/Python/src/cpu_regularizers.pyx
+++ b/Wrappers/Python/src/cpu_regularizers.pyx
@@ -18,8 +18,8 @@ import cython
 import numpy as np
 cimport numpy as np
 
-cdef extern float TV_ROF_CPU(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float flambda);
-cdef extern float TV_FGP_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
+cdef extern float TV_ROF_CPU_main(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float flambda);
+cdef extern float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);								  
 
 # Can we use the same name here in "def" as the C function? 
 def TV_ROF_CPU(inputData, iterations, regularization_parameter, 
@@ -45,11 +45,10 @@ def TV_ROF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
             np.zeros([dims[0],dims[1]], dtype='float32')
                    
     #/* Run ROF iterations for 2D data */
-    TV_ROF_CPU(&inputData[0,0], &B[0,0], dims[0], dims[1], 1, iterations,
+    TV_ROF_CPU_main(&inputData[0,0], &B[0,0], dims[0], dims[1], 1, iterations,
                marching_step_parameter, regularization_parameter)
     
-    return B
-        
+    return B        
             
 def TV_ROF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      int iterations,
@@ -65,7 +64,7 @@ def TV_ROF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
             np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
            
     #/* Run ROF iterations for 3D data */
-    TV_FGP_CPU(&inputData[0,0,0], &B[0,0,0], dims[0], dims[1], dims[2], iterations,
+    TV_ROF_CPU_main(&inputData[0,0,0], &B[0,0,0], dims[0], dims[1], dims[2], iterations,
              marching_step_parameter, regularization_parameter)
 
     return B    
@@ -88,11 +87,11 @@ def TV_FGP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
     dims[0] = inputData.shape[0]
     dims[1] = inputData.shape[1]
     
-    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] B = \
+    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \
             np.zeros([dims[0],dims[1]], dtype='float32')
                    
     #/* Run ROF iterations for 2D data */
-    TV_FGP_CPU(&inputData[0,0], &B[0,0], regularization_parameter, 
+    TV_FGP_CPU_main(&inputData[0,0], &outputData[0,0], regularization_parameter, 
                        iterations, 
                        tolerance_param,
                        methodTV,
@@ -100,8 +99,7 @@ def TV_FGP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
                        printM,
                        dims[0], dims[1], 1)
     
-    return B
-        
+    return outputData        
             
 def TV_FGP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
  float regularization_parameter,
@@ -115,15 +113,16 @@ def TV_FGP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
     dims[1] = inputData.shape[1]
     dims[2] = inputData.shape[2]
     
-    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] B = \
+    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
             np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
            
     #/* Run ROF iterations for 3D data */
-    TV_FGP_CPU(&inputData[0,0, 0], &B[0,0, 0], iterations, 
+    TV_FGP_CPU_main(&inputData[0,0,0], &outputData[0,0, 0], regularization_parameter,
+					   iterations, 
                        tolerance_param,
                        methodTV,
                        nonneg,
                        printM,
                        dims[0], dims[1], dims[2])
 
-    return B  
+    return outputData 
diff --git a/Wrappers/Python/src/gpu_regularizers.pyx b/Wrappers/Python/src/gpu_regularizers.pyx
index a14a20d..e99bfa7 100644
--- a/Wrappers/Python/src/gpu_regularizers.pyx
+++ b/Wrappers/Python/src/gpu_regularizers.pyx
@@ -26,7 +26,7 @@ cdef extern void NLM_GPU_kernel(float *A, float* B, float *Eucl_Vec,
                                 int SearchW, int SimilW, 
                                 int SearchW_real, float denh2, float lambdaf);
 cdef extern void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambdaf);
-cdef extern void TV_FGP_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
+cdef extern void TV_FGP_GPU(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
 
 cdef extern float pad_crop(float *A, float *Ap, 
                            int OldSizeX, int OldSizeY, int OldSizeZ, 
diff --git a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
index 6344021..e162afa 100644
--- a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import os    
 import timeit
-from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, TV_ROF_GPU
+from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, GPU_ROF_TV
 from ccpi.filters.cpu_regularizers_cython import TV_ROF_CPU
 ###############################################################################
 def printParametersToString(pars):
@@ -56,11 +56,11 @@ imgplot = plt.imshow(u0,cmap="gray")
 
 
 # set parameters
-pars = {'algorithm': ROF_TV , \
+pars = {'algorithm': TV_ROF_CPU , \
         'input' : u0,\
         'regularization_parameter':0.04,\
         'time_marching_parameter': 0.0025,\
-        'number_of_iterations': 600
+        'number_of_iterations': 1200
         }
 print ("#################ROF TV CPU#####################")
 start_time = timeit.default_timer()
@@ -89,13 +89,14 @@ plt.title('{}'.format('CPU results'))
 
 print ("#################ROF TV GPU#####################")
 start_time = timeit.default_timer()
-rof_gpu = TV_ROF_GPU(pars['input'], 
+rof_gpu = GPU_ROF_TV(pars['input'], 
                      pars['number_of_iterations'], 
                      pars['time_marching_parameter'], 
                      pars['regularization_parameter'])
                      
 rms = rmse(Im, rof_gpu)
 pars['rmse'] = rms
+pars['algorithm'] = GPU_ROF_TV
 txtstr = printParametersToString(pars)
 txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
 print (txtstr)
-- 
cgit v1.2.3


From ccf9b61bba1004af783c6333d58ea9611c0f81f2 Mon Sep 17 00:00:00 2001
From: Daniil Kazantsev <dkazanc@hotmail.com>
Date: Tue, 6 Mar 2018 11:16:41 +0000
Subject: ROF_FGP_unified syntax

---
 Core/regularizers_CPU/FGP_TV_core.c      |  8 ++---
 Core/regularizers_CPU/FGP_TV_core.h      |  2 +-
 Core/regularizers_CPU/ROF_TV_core.c      |  4 +--
 Core/regularizers_CPU/ROF_TV_core.h      |  6 ++--
 Wrappers/Python/src/cpu_regularizers.pyx | 59 +++++++++++++++-----------------
 5 files changed, 37 insertions(+), 42 deletions(-)

diff --git a/Core/regularizers_CPU/FGP_TV_core.c b/Core/regularizers_CPU/FGP_TV_core.c
index 5365631..9c0fcfc 100644
--- a/Core/regularizers_CPU/FGP_TV_core.c
+++ b/Core/regularizers_CPU/FGP_TV_core.c
@@ -37,7 +37,7 @@ limitations under the License.
  * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
  */
  
-float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
 {
 	int ll, j, DimTotal;
 	float re, re1;
@@ -47,7 +47,7 @@ float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iter, fl
 	
 	if (dimZ <= 1) {
 		/*2D case */				
-		float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;		
+		float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
 		DimTotal = dimX*dimY;
 		
 		Output_prev = (float *) malloc( DimTotal * sizeof(float) );
@@ -59,7 +59,7 @@ float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iter, fl
 		R2 = (float *) malloc( DimTotal * sizeof(float) );
 		
 		/* begin iterations */
-        for(ll=0; ll<iter; ll++) {
+        for(ll=0; ll<iterationsNumb; ll++) {
             
             /* computing the gradient of the objective function */
             Obj_func2D(Input, Output, R1, R2, lambdaPar, dimX, dimY);
@@ -114,7 +114,7 @@ float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iter, fl
 		R3 = (float *) malloc( DimTotal * sizeof(float) );
 		
 		    /* begin iterations */
-        for(ll=0; ll<iter; ll++) {
+        for(ll=0; ll<iterationsNumb; ll++) {
             
             /* computing the gradient of the objective function */
             Obj_func3D(Input, Output, R1, R2, R3, lambdaPar, dimX, dimY, dimZ);
diff --git a/Core/regularizers_CPU/FGP_TV_core.h b/Core/regularizers_CPU/FGP_TV_core.h
index 360251d..43a8519 100644
--- a/Core/regularizers_CPU/FGP_TV_core.h
+++ b/Core/regularizers_CPU/FGP_TV_core.h
@@ -47,7 +47,7 @@ limitations under the License.
 #ifdef __cplusplus
 extern "C" {
 #endif
-CCPI_EXPORT float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
+CCPI_EXPORT float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
 
 CCPI_EXPORT float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dimX, int dimY);
 CCPI_EXPORT float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, int dimX, int dimY);
diff --git a/Core/regularizers_CPU/ROF_TV_core.c b/Core/regularizers_CPU/ROF_TV_core.c
index a058f54..a59a3c6 100644
--- a/Core/regularizers_CPU/ROF_TV_core.c
+++ b/Core/regularizers_CPU/ROF_TV_core.c
@@ -46,7 +46,7 @@ int sign(float x) {
  */
 
 /* Running iterations of TV-ROF function */
-float TV_ROF_CPU_main(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda)
+float TV_ROF_CPU_mainfloat TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ)
 {
     float *D1, *D2, *D3;
     int i, DimTotal;
@@ -68,7 +68,7 @@ float TV_ROF_CPU_main(float *Input, float *Output, int dimX, int dimY, int dimZ,
             D1_func(Output, D1, dimX, dimY, dimZ);
             D2_func(Output, D2, dimX, dimY, dimZ);           
             if (dimZ > 1) D3_func(Output, D3, dimX, dimY, dimZ); 			
-            TV_kernel(D1, D2, D3, Output, Input, lambda, tau, dimX, dimY, dimZ);  		          
+            TV_kernel(D1, D2, D3, Output, Input, lambdaPar, tau, dimX, dimY, dimZ);  		          
 		}           
     free(D1);free(D2); free(D3);
     return *Output;
diff --git a/Core/regularizers_CPU/ROF_TV_core.h b/Core/regularizers_CPU/ROF_TV_core.h
index 63c0419..14daf58 100644
--- a/Core/regularizers_CPU/ROF_TV_core.h
+++ b/Core/regularizers_CPU/ROF_TV_core.h
@@ -31,8 +31,8 @@ limitations under the License.
  * Input Parameters:
  * 1. Noisy image/volume [REQUIRED]
  * 2. lambda - regularization parameter [REQUIRED]
- * 3. tau - marching step for explicit scheme, ~1 is recommended [REQUIRED]
- * 4. Number of iterations, for explicit scheme >= 150 is recommended  [REQUIRED]
+ * 3. Number of iterations, for explicit scheme >= 150 is recommended  [REQUIRED]
+ * 4. tau - marching step for explicit scheme, ~1 is recommended [REQUIRED]
  *
  * Output:
  * [1] Regularized image/volume 
@@ -47,7 +47,7 @@ limitations under the License.
 extern "C" {
 #endif
 CCPI_EXPORT float TV_kernel(float *D1, float *D2, float *D3, float *B, float *A, float lambda, float tau, int dimY, int dimX, int dimZ);
-CCPI_EXPORT float TV_ROF_CPU_main(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float lambda);
+CCPI_EXPORT float TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ);
 CCPI_EXPORT float D1_func(float *A, float *D1, int dimY, int dimX, int dimZ);
 CCPI_EXPORT float D2_func(float *A, float *D2, int dimY, int dimX, int dimZ);
 CCPI_EXPORT float D3_func(float *A, float *D3, int dimY, int dimX, int dimZ);
diff --git a/Wrappers/Python/src/cpu_regularizers.pyx b/Wrappers/Python/src/cpu_regularizers.pyx
index 448da31..2654831 100644
--- a/Wrappers/Python/src/cpu_regularizers.pyx
+++ b/Wrappers/Python/src/cpu_regularizers.pyx
@@ -11,73 +11,68 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 
-Author: Edoardo Pasca
+Author: Edoardo Pasca, Daniil Kazantsev
 """
 
 import cython
 import numpy as np
 cimport numpy as np
 
-cdef extern float TV_ROF_CPU_main(float *Input, float *Output, int dimX, int dimY, int dimZ, int iterationsNumb, float tau, float flambda);
-cdef extern float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);								  
+cdef extern float TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ);
+cdef extern float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
 
-# Can we use the same name here in "def" as the C function? 
-def TV_ROF_CPU(inputData, iterations, regularization_parameter, 
+def TV_ROF_CPU(inputData, regularization_parameter, iterationsNumb
            marching_step_parameter):
     if inputData.ndim == 2:
-        return TV_ROF_2D(inputData, iterations, regularization_parameter, 
+        return TV_ROF_2D(inputData, regularization_parameter, iterationsNumb
                          marching_step_parameter)
     elif inputData.ndim == 3:
-        return TV_ROF_3D(inputData, iterations, regularization_parameter, 
+        return TV_ROF_3D(inputData, regularization_parameter, iterationsNumb
                          marching_step_parameter)
 
 def TV_ROF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
-                     int iterations,
                      float regularization_parameter,
-                     float marching_step_parameter
-                     ):
-                         
+                     int iterationsNumb,                     
+                     float marching_step_parameter):
     cdef long dims[2]
     dims[0] = inputData.shape[0]
     dims[1] = inputData.shape[1]
     
-    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] B = \
+    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \
             np.zeros([dims[0],dims[1]], dtype='float32')
                    
-    #/* Run ROF iterations for 2D data */
-    TV_ROF_CPU_main(&inputData[0,0], &B[0,0], dims[0], dims[1], 1, iterations,
-               marching_step_parameter, regularization_parameter)
+    # Run ROF iterations for 2D data 
+    TV_ROF_CPU_main(&inputData[0,0], &outputData[0,0], regularization_parameter, iterationsNumb, marching_step_parameter, dims[0], dims[1], 1)
     
-    return B        
+    return outputData
             
 def TV_ROF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      int iterations,
                      float regularization_parameter,
                      float marching_step_parameter
                      ):
-    cdef long dims[2]
+    cdef long dims[3]
     dims[0] = inputData.shape[0]
     dims[1] = inputData.shape[1]
     dims[2] = inputData.shape[2]
     
-    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] B = \
+    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
             np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
            
-    #/* Run ROF iterations for 3D data */
-    TV_ROF_CPU_main(&inputData[0,0,0], &B[0,0,0], dims[0], dims[1], dims[2], iterations,
-             marching_step_parameter, regularization_parameter)
+    # Run ROF iterations for 3D data 
+    TV_ROF_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularization_parameter, iterationsNumb, marching_step_parameter, dims[0], dims[1], dims[2])
 
-    return B    
+    return outputData    
                      
-def TV_FGP_CPU(inputData, regularization_parameter, iterations, tolerance_param, methodTV, nonneg, printM):
+def TV_FGP_CPU(inputData, regularization_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, printM):
     if inputData.ndim == 2:
-        return TV_FGP_2D(inputData, regularization_parameter, iterations, tolerance_param, methodTV, nonneg, printM)
+        return TV_FGP_2D(inputData, regularization_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, printM)
     elif inputData.ndim == 3:
-        return TV_FGP_3D(inputData, regularization_parameter, iterations, tolerance_param, methodTV, nonneg, printM)
+        return TV_FGP_3D(inputData, regularization_parameter, iterationsNumb, tolerance_param, methodTV, nonneg, printM)
 
 def TV_FGP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
                      float regularization_parameter,
-                     int iterations, 
+                     int iterationsNumb, 
                      float tolerance_param,
                      int methodTV,
                      int nonneg,
@@ -92,7 +87,7 @@ def TV_FGP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
                    
     #/* Run ROF iterations for 2D data */
     TV_FGP_CPU_main(&inputData[0,0], &outputData[0,0], regularization_parameter, 
-                       iterations, 
+                       iterationsNumb, 
                        tolerance_param,
                        methodTV,
                        nonneg,
@@ -103,22 +98,22 @@ def TV_FGP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
             
 def TV_FGP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
  float regularization_parameter,
-                     int iterations, 
+                     int iterationsNumb, 
                      float tolerance_param,
                      int methodTV,
                      int nonneg,
                      int printM):
-    cdef long dims[2]
+    cdef long dims[3]
     dims[0] = inputData.shape[0]
     dims[1] = inputData.shape[1]
     dims[2] = inputData.shape[2]
     
     cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
-            np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
+            np.zeros([dims[0], dims[1], dims[2]], dtype='float32')
            
     #/* Run ROF iterations for 3D data */
-    TV_FGP_CPU_main(&inputData[0,0,0], &outputData[0,0, 0], regularization_parameter,
-					   iterations, 
+    TV_FGP_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularization_parameter,
+					   iterationsNumb, 
                        tolerance_param,
                        methodTV,
                        nonneg,
-- 
cgit v1.2.3


From 8d310478254f3cda63f3663729b416f425ad70b6 Mon Sep 17 00:00:00 2001
From: algol <dkazanc@hotmail.com>
Date: Tue, 6 Mar 2018 11:45:53 +0000
Subject: work on FGP intergration

---
 Core/CMakeLists.txt                           |  2 +-
 Core/regularizers_CPU/ROF_TV_core.c           | 22 +++++++++++-----------
 Core/regularizers_CPU/utils.c                 |  8 ++++----
 Wrappers/Python/demo/test_cpu_regularizers.py | 17 +++++++++--------
 Wrappers/Python/src/cpu_regularizers.pyx      | 14 +++++---------
 5 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/Core/CMakeLists.txt b/Core/CMakeLists.txt
index 68b7111..4e85002 100644
--- a/Core/CMakeLists.txt
+++ b/Core/CMakeLists.txt
@@ -88,7 +88,7 @@ add_library(cilreg SHARED
 	    ${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/PatchBased_Regul_core.c
 	    ${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/SplitBregman_TV_core.c
 	    ${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/TGV_PD_core.c
-            ${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/ROF_TV_core.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/ROF_TV_core.c
 	    ${CMAKE_CURRENT_SOURCE_DIR}/regularizers_CPU/utils.c
 	    )
 target_link_libraries(cilreg ${EXTRA_LIBRARIES} )
diff --git a/Core/regularizers_CPU/ROF_TV_core.c b/Core/regularizers_CPU/ROF_TV_core.c
index a59a3c6..a4f82a6 100644
--- a/Core/regularizers_CPU/ROF_TV_core.c
+++ b/Core/regularizers_CPU/ROF_TV_core.c
@@ -46,7 +46,7 @@ int sign(float x) {
  */
 
 /* Running iterations of TV-ROF function */
-float TV_ROF_CPU_mainfloat TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ)
+float TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ)
 {
     float *D1, *D2, *D3;
     int i, DimTotal;
@@ -132,9 +132,9 @@ float D1_func(float *A, float *D1, int dimX, int dimY, int dimZ)
                 NOMy_0 = A[index] - A[(j)*dimX + i2]; /* y- */
                 
                 denom1 = NOMx_1*NOMx_1;
-                denom2 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0)));
+                denom2 = 0.5f*(sign(NOMy_1) + sign(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0)));
                 denom2 = denom2*denom2;
-                T1 = sqrt(denom1 + denom2 + EPS);
+                T1 = sqrtf(denom1 + denom2 + EPS);
                 D1[index] = NOMx_1/T1;
             }}
     }
@@ -170,11 +170,11 @@ float D2_func(float *A, float *D2, int dimX, int dimY, int dimZ)
                     
                     
                     denom1 = NOMy_1*NOMy_1;
-                    denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0)));
+                    denom2 = 0.5f*(sign(NOMx_1) + sign(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0)));
                     denom2 = denom2*denom2;
-                    denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(fabs(NOMz_1),fabs(NOMz_0)));
+                    denom3 = 0.5f*(sign(NOMz_1) + sign(NOMz_0))*(MIN(fabs(NOMz_1),fabs(NOMz_0)));
                     denom3 = denom3*denom3;
-                    T2 = sqrt(denom1 + denom2 + denom3 + EPS);
+                    T2 = sqrtf(denom1 + denom2 + denom3 + EPS);
                     D2[index] = NOMy_1/T2;
                 }}}
     }
@@ -196,9 +196,9 @@ float D2_func(float *A, float *D2, int dimX, int dimY, int dimZ)
                 /*NOMy_0 = A[(i)*dimY + j] - A[(i)*dimY + j2]; */  /* y- */
                 
                 denom1 = NOMy_1*NOMy_1;
-                denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0)));
+                denom2 = 0.5f*(sign(NOMx_1) + sign(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0)));
                 denom2 = denom2*denom2;
-                T2 = sqrt(denom1 + denom2 + EPS);
+                T2 = sqrtf(denom1 + denom2 + EPS);
                 D2[index] = NOMy_1/T2;
             }}
     }
@@ -233,11 +233,11 @@ float D3_func(float *A, float *D3, int dimY, int dimX, int dimZ)
                 /*NOMz_0 = A[(dimX*dimY)*k + (i)*dimY + j] - A[(dimX*dimY)*k2 + (i)*dimY + j]; */ /* z- */
                 
                 denom1 = NOMz_1*NOMz_1;
-                denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0)));
+                denom2 = 0.5f*(sign(NOMx_1) + sign(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0)));
                 denom2 = denom2*denom2;
-                denom3 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0)));
+                denom3 = 0.5f*(sign(NOMy_1) + sign(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0)));
                 denom3 = denom3*denom3;
-                T3 = sqrt(denom1 + denom2 + denom3 + EPS);
+                T3 = sqrtf(denom1 + denom2 + denom3 + EPS);
                 D3[index] = NOMz_1/T3;
             }}}
     return *D3;
diff --git a/Core/regularizers_CPU/utils.c b/Core/regularizers_CPU/utils.c
index 951fb91..cdf3d0e 100644
--- a/Core/regularizers_CPU/utils.c
+++ b/Core/regularizers_CPU/utils.c
@@ -45,10 +45,10 @@ float TV_energy2D(float *U, float *U0, float *E_val, float lambda, int dimX, int
                 j1 = j + 1; if (j == dimY-1) j1 = j;
                 
                 /* Forward differences */
-                NOMx_2 = pow(U[j1*dimX + i] - U[index],2); /* x+ */
-                NOMy_2 = pow(U[j*dimX + i1] - U[index],2); /* y+ */
-                E_Grad += sqrt(NOMx_2 + NOMy_2); /* gradient term energy */
-                E_Data += 0.5f * lambda*(pow((U[index]-U0[index]),2)); /* fidelity term energy */
+                NOMx_2 = powf(U[j1*dimX + i] - U[index],2); /* x+ */
+                NOMy_2 = powf(U[j*dimX + i1] - U[index],2); /* y+ */
+                E_Grad += sqrtf(NOMx_2 + NOMy_2); /* gradient term energy */
+                E_Data += 0.5f * lambda*(powf((U[index]-U0[index]),2)); /* fidelity term energy */
 			}
 		}
 		E_val[0] = E_Grad + E_Data;
diff --git a/Wrappers/Python/demo/test_cpu_regularizers.py b/Wrappers/Python/demo/test_cpu_regularizers.py
index 53b8538..f1eb3c3 100644
--- a/Wrappers/Python/demo/test_cpu_regularizers.py
+++ b/Wrappers/Python/demo/test_cpu_regularizers.py
@@ -131,9 +131,9 @@ imgplot = plt.imshow(splitbregman,\
 start_time = timeit.default_timer()
 pars = {'algorithm' : TV_FGP_CPU , \
         'input' : u0,
-        'regularization_parameter':0.05, \
-        'number_of_iterations' :200 ,\
-        'tolerance_constant':1e-5,\
+        'regularization_parameter':0.07, \
+        'number_of_iterations' :300 ,\
+        'tolerance_constant':0.00001,\
         'methodTV': 0 ,\
         'nonneg': 0 ,\
         'printingOut': 0
@@ -156,7 +156,7 @@ txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
 print (txtstr)       
 
 
-a=fig.add_subplot(2,4,3)
+a=fig.add_subplot(2,4,4)
 
 # these are matplotlib.patch.Patch properties
 props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
@@ -168,8 +168,9 @@ imgplot = plt.imshow(fgp, \
 a.text(0.05, 0.95, txtstr, transform=a.transAxes, fontsize=14,
         verticalalignment='top', bbox=props)
 
-###################### LLT_model #########################################
 
+###################### LLT_model #########################################
+"""
 start_time = timeit.default_timer()
 
 pars = {'algorithm': LLT_model , \
@@ -204,7 +205,7 @@ a.text(0.05, 0.95, txtstr, transform=a.transAxes, fontsize=14,
 imgplot = plt.imshow(llt,\
                      cmap="gray"
                      )
-
+"""
 
 # ###################### PatchBased_Regul #########################################
 # # Quick 2D denoising example in Matlab:   
@@ -292,8 +293,8 @@ pars = {'algorithm': TV_ROF_CPU , \
         'number_of_iterations': 300
         }
 rof = TV_ROF_CPU(pars['input'],
-             pars['number_of_iterations'],
-             pars['regularization_parameter'],
+			 pars['regularization_parameter'],
+             pars['number_of_iterations'],             
              pars['marching_step'] 
              )
 #tgv = out
diff --git a/Wrappers/Python/src/cpu_regularizers.pyx b/Wrappers/Python/src/cpu_regularizers.pyx
index 2654831..d62ca59 100644
--- a/Wrappers/Python/src/cpu_regularizers.pyx
+++ b/Wrappers/Python/src/cpu_regularizers.pyx
@@ -21,14 +21,11 @@ cimport numpy as np
 cdef extern float TV_ROF_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ);
 cdef extern float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
 
-def TV_ROF_CPU(inputData, regularization_parameter, iterationsNumb
-           marching_step_parameter):
+def TV_ROF_CPU(inputData, regularization_parameter, iterationsNumb, marching_step_parameter):
     if inputData.ndim == 2:
-        return TV_ROF_2D(inputData, regularization_parameter, iterationsNumb
-                         marching_step_parameter)
+        return TV_ROF_2D(inputData, regularization_parameter, iterationsNumb, marching_step_parameter)
     elif inputData.ndim == 3:
-        return TV_ROF_3D(inputData, regularization_parameter, iterationsNumb
-                         marching_step_parameter)
+        return TV_ROF_3D(inputData, regularization_parameter, iterationsNumb, marching_step_parameter)
 
 def TV_ROF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
                      float regularization_parameter,
@@ -47,10 +44,9 @@ def TV_ROF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
     return outputData
             
 def TV_ROF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
-                     int iterations,
+                     int iterationsNumb,
                      float regularization_parameter,
-                     float marching_step_parameter
-                     ):
+                     float marching_step_parameter):
     cdef long dims[3]
     dims[0] = inputData.shape[0]
     dims[1] = inputData.shape[1]
-- 
cgit v1.2.3


From 309d84445b5ec2980db7c79832958a6481343f17 Mon Sep 17 00:00:00 2001
From: Daniil Kazantsev <dkazanc@hotmail.com>
Date: Tue, 6 Mar 2018 12:46:09 +0000
Subject: FGP/ROF updates

---
 Core/regularizers_CPU/FGP_TV_core.c                | 139 +++++++++-----------
 Core/regularizers_CPU/FGP_TV_core.h                |   8 +-
 Core/regularizers_CPU/utils.c                      |  13 +-
 .../Matlab/mex_compile/regularizers_CPU/FGP_TV.c   | 142 ++-------------------
 Wrappers/Python/demo/test_cpu_regularizers.py      |  10 +-
 Wrappers/Python/src/cpu_regularizers.pyx           |   5 +-
 6 files changed, 89 insertions(+), 228 deletions(-)

diff --git a/Core/regularizers_CPU/FGP_TV_core.c b/Core/regularizers_CPU/FGP_TV_core.c
index 9c0fcfc..7a8ce48 100644
--- a/Core/regularizers_CPU/FGP_TV_core.c
+++ b/Core/regularizers_CPU/FGP_TV_core.c
@@ -46,7 +46,7 @@ float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iteratio
     int count = 0;
 	
 	if (dimZ <= 1) {
-		/*2D case */				
+		/*2D case */
 		float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
 		DimTotal = dimX*dimY;
 		
@@ -65,28 +65,28 @@ float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iteratio
             Obj_func2D(Input, Output, R1, R2, lambdaPar, dimX, dimY);
             
             /* apply nonnegativity */
-            if (nonneg == 1) for(j=0; j<dimX*dimY; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;}            
+            if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;}            
             
             /*Taking a step towards minus of the gradient*/
             Grad_func2D(P1, P2, Output, R1, R2, lambdaPar, dimX, dimY);
             
             /* projection step */
-            Proj_func2D(P1, P2, methodTV, dimX, dimY);
+            Proj_func2D(P1, P2, methodTV, DimTotal);
             
             /*updating R and t*/
             tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-            Rupd_func2D(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, dimX, dimY);
+            Rupd_func2D(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, DimTotal);
             
             /* check early stopping criteria */
             re = 0.0f; re1 = 0.0f;
-            for(j=0; j<dimX*dimY; j++)
+            for(j=0; j<DimTotal; j++)
             {
                 re += pow(Output[j] - Output_prev[j],2);
                 re1 += pow(Output[j],2);
             }
             re = sqrt(re)/sqrt(re1);
             if (re < epsil)  count++;
-				if (count > 4) break;				            
+				if (count > 4) break;
             
             /*storing old values*/
             copyIm(Output, Output_prev, dimX, dimY, 1);
@@ -120,21 +120,21 @@ float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar, int iteratio
             Obj_func3D(Input, Output, R1, R2, R3, lambdaPar, dimX, dimY, dimZ);
             
             /* apply nonnegativity */
-            if (nonneg == 1) for(j=0; j<dimX*dimY*dimZ; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;}  
+            if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;}  
             
             /*Taking a step towards minus of the gradient*/
             Grad_func3D(P1, P2, P3, Output, R1, R2, R3, lambdaPar, dimX, dimY, dimZ);
             
             /* projection step */
-            Proj_func3D(P1, P2, P3, methodTV, dimX, dimY, dimZ);
+            Proj_func3D(P1, P2, P3, methodTV, DimTotal);
             
             /*updating R and t*/
             tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-            Rupd_func3D(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, dimX, dimY, dimZ);
+            Rupd_func3D(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, DimTotal);
             
             /* calculate norm - stopping rules*/
             re = 0.0f; re1 = 0.0f;
-            for(j=0; j<dimX*dimY*dimZ; j++)
+            for(j=0; j<DimTotal; j++)
             {
                 re += pow(Output[j] - Output_prev[j],2);
                 re1 += pow(Output[j],2);
@@ -189,52 +189,46 @@ float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float la
         }}
     return 1;
 }
-float Proj_func2D(float *P1, float *P2, int methTV, int dimX, int dimY)
+float Proj_func2D(float *P1, float *P2, int methTV, int DimTotal)
 {
     float val1, val2, denom, sq_denom;
-    int i,j,index;
+    int i;
     if (methTV == 0) {
         /* isotropic TV*/
-#pragma omp parallel for shared(P1,P2) private(index,i,j,denom,sq_denom)
-        for(i=0; i<dimX; i++) {
-            for(j=0; j<dimY; j++) {
-				index = j*dimX+i;
-                denom = pow(P1[index],2) +  pow(P2[index],2);
+#pragma omp parallel for shared(P1,P2) private(i,denom,sq_denom)
+        for(i=0; i<DimTotal; i++) {
+                denom = powf(P1[i],2) +  powf(P2[i],2);
                 if (denom > 1.0f) {
-					sq_denom = 1.0f/sqrt(denom);
-                    P1[index] = P1[index]*sq_denom;
-                    P2[index] = P2[index]*sq_denom;
+					sq_denom = 1.0f/sqrtf(denom);
+                    P1[i] = P1[i]*sq_denom;
+                    P2[i] = P2[i]*sq_denom;
                 }
-            }}
+            }
     }
     else {
         /* anisotropic TV*/
-#pragma omp parallel for shared(P1,P2) private(index,i,j,val1,val2)
-        for(i=0; i<dimX; i++) {
-            for(j=0; j<dimY; j++) {
-				index = j*dimX+i;
-                val1 = fabs(P1[index]);
-                val2 = fabs(P2[index]);
+#pragma omp parallel for shared(P1,P2) private(i,val1,val2)
+        for(i=0; i<DimTotal; i++) {
+                val1 = fabs(P1[i]);
+                val2 = fabs(P2[i]);
                 if (val1 < 1.0f) {val1 = 1.0f;}
                 if (val2 < 1.0f) {val2 = 1.0f;}
-                P1[index] = P1[index]/val1;
-                P2[index] = P2[index]/val2;
-            }}
+                P1[i] = P1[i]/val1;
+                P2[i] = P2[i]/val2;
+            }
     }
     return 1;
 }
-float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, int dimX, int dimY)
+float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, int DimTotal)
 {
-    int i,j,index;
+    int i;
     float multip;
     multip = ((tk-1.0f)/tkp1);
-#pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(index,i,j)
-    for(i=0; i<dimX; i++) {
-        for(j=0; j<dimY; j++) {
-			index = j*dimX+i;
-            R1[index] = P1[index] + multip*(P1[index] - P1_old[index]);
-            R2[index] = P2[index] + multip*(P2[index] - P2_old[index]);
-        }}
+#pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(i)
+    for(i=0; i<DimTotal; i++) {       
+            R1[i] = P1[i] + multip*(P1[i] - P1_old[i]);
+            R2[i] = P2[i] + multip*(P2[i] - P2_old[i]);
+        }
     return 1;
 }
 
@@ -248,7 +242,7 @@ float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lamb
     for(i=0; i<dimX; i++) {
         for(j=0; j<dimY; j++) {
             for(k=0; k<dimZ; k++) {
-				index = (dimX*dimY)*k + j*dimX+i;	
+				index = (dimX*dimY)*k + j*dimX+i;
                 /* boundary conditions */
                 if (i == 0) {val1 = 0.0f;} else {val1 = R1[(dimX*dimY)*k + j*dimX + (i-1)];}
                 if (j == 0) {val2 = 0.0f;} else {val2 = R2[(dimX*dimY)*k + (j-1)*dimX + i];}
@@ -277,59 +271,50 @@ float Grad_func3D(float *P1, float *P2, float *P3, float *D, float *R1, float *R
             }}}
     return 1;
 }
-float Proj_func3D(float *P1, float *P2, float *P3, int methTV, int dimX, int dimY, int dimZ)
+float Proj_func3D(float *P1, float *P2, float *P3, int methTV, int DimTotal)
 {		
     float val1, val2, val3, denom, sq_denom;
-    int i,j,k, index;
+    int i;
     if (methTV == 0) {
 	/* isotropic TV*/
-	#pragma omp parallel for shared(P1,P2,P3) private(index,i,j,k,val1,val2,val3,sq_denom)
-    for(i=0; i<dimX; i++) {
-        for(j=0; j<dimY; j++) {
-            for(k=0; k<dimZ; k++) {
-				index = (dimX*dimY)*k + j*dimX+i;				
-				denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2);				
+	#pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3,sq_denom)
+    for(i=0; i<DimTotal; i++) {        
+				denom = powf(P1[i],2) + powf(P2[i],2) + powf(P3[i],2);
                 if (denom > 1.0f) {
-					sq_denom = 1.0f/sqrt(denom);
-                    P1[index] = P1[index]*sq_denom;
-                    P2[index] = P2[index]*sq_denom;
-                    P3[index] = P3[index]*sq_denom;
-                }				
-			}}}	
+					sq_denom = 1.0f/sqrtf(denom);
+                    P1[i] = P1[i]*sq_denom;
+                    P2[i] = P2[i]*sq_denom;
+                    P3[i] = P3[i]*sq_denom;
+                }
+			}
 	}    
     else {
     /* anisotropic TV*/
-#pragma omp parallel for shared(P1,P2,P3) private(index,i,j,k,val1,val2,val3)
-    for(i=0; i<dimX; i++) {
-        for(j=0; j<dimY; j++) {
-            for(k=0; k<dimZ; k++) {
-				index = (dimX*dimY)*k + j*dimX+i;		
-                val1 = fabs(P1[index]);
-                val2 = fabs(P2[index]);
-                val3 = fabs(P3[index]);
+#pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3)
+    for(i=0; i<DimTotal; i++) {
+                val1 = fabs(P1[i]);
+                val2 = fabs(P2[i]);
+                val3 = fabs(P3[i]);
                 if (val1 < 1.0f) {val1 = 1.0f;}
                 if (val2 < 1.0f) {val2 = 1.0f;}
                 if (val3 < 1.0f) {val3 = 1.0f;}                
-                P1[index] = P1[index]/val1;
-                P2[index] = P2[index]/val2;
-                P3[index] = P3[index]/val3;
-            }}}
+                P1[i] = P1[i]/val1;
+                P2[i] = P2[i]/val2;
+                P3[i] = P3[i]/val3;
+            }
 		}
     return 1;
 }
-float Rupd_func3D(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, int dimX, int dimY, int dimZ)
+float Rupd_func3D(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, int DimTotal)
 {
-    int i,j,k,index;
+    int i;
     float multip;
     multip = ((tk-1.0f)/tkp1);
-#pragma omp parallel for shared(P1,P2,P3,P1_old,P2_old,P3_old,R1,R2,R3,multip) private(index,i,j,k)
-    for(i=0; i<dimX; i++) {
-        for(j=0; j<dimY; j++) {
-            for(k=0; k<dimZ; k++) {
-				index = (dimX*dimY)*k + j*dimX+i;	
-                R1[index] = P1[index] + multip*(P1[index] - P1_old[index]);
-                R2[index] = P2[index] + multip*(P2[index] - P2_old[index]);
-                R3[index] = P3[index] + multip*(P3[index] - P3_old[index]);
-            }}}
+#pragma omp parallel for shared(P1,P2,P3,P1_old,P2_old,P3_old,R1,R2,R3,multip) private(i)
+    for(i=0; i<DimTotal; i++) {
+                R1[i] = P1[i] + multip*(P1[i] - P1_old[i]);
+                R2[i] = P2[i] + multip*(P2[i] - P2_old[i]);
+                R3[i] = P3[i] + multip*(P3[i] - P3_old[i]);
+            }
     return 1;
 }
diff --git a/Core/regularizers_CPU/FGP_TV_core.h b/Core/regularizers_CPU/FGP_TV_core.h
index 43a8519..37e32f7 100644
--- a/Core/regularizers_CPU/FGP_TV_core.h
+++ b/Core/regularizers_CPU/FGP_TV_core.h
@@ -51,13 +51,13 @@ CCPI_EXPORT float TV_FGP_CPU_main(float *Input, float *Output, float lambdaPar,
 
 CCPI_EXPORT float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dimX, int dimY);
 CCPI_EXPORT float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, int dimX, int dimY);
-CCPI_EXPORT float Proj_func2D(float *P1, float *P2, int methTV, int dimX, int dimY);
-CCPI_EXPORT float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, int dimX, int dimY);
+CCPI_EXPORT float Proj_func2D(float *P1, float *P2, int methTV, int DimTotal);
+CCPI_EXPORT float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, int DimTotal);
 
 CCPI_EXPORT float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ);
 CCPI_EXPORT float Grad_func3D(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ);
-CCPI_EXPORT float Proj_func3D(float *P1, float *P2, float *P3, int methTV, int dimX, int dimY, int dimZ);
-CCPI_EXPORT float Rupd_func3D(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, int dimX, int dimY, int dimZ);
+CCPI_EXPORT float Proj_func3D(float *P1, float *P2, float *P3, int methTV, int DimTotal);
+CCPI_EXPORT float Rupd_func3D(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, int DimTotal);
 #ifdef __cplusplus
 }
 #endif
diff --git a/Core/regularizers_CPU/utils.c b/Core/regularizers_CPU/utils.c
index cdf3d0e..0c02c45 100644
--- a/Core/regularizers_CPU/utils.c
+++ b/Core/regularizers_CPU/utils.c
@@ -18,6 +18,7 @@ limitations under the License.
 */
 
 #include "utils.h"
+#include <math.h>
 
 /* Copy Image */
 float copyIm(float *A, float *U, int dimX, int dimY, int dimZ)
@@ -34,7 +35,7 @@ float copyIm(float *A, float *U, int dimX, int dimY, int dimZ)
 float TV_energy2D(float *U, float *U0, float *E_val, float lambda, int dimX, int dimY)
 {
 	int i, j, i1, j1, index;
-	float NOMx_2, NOMy_2, E_Grad, E_Data;
+	float NOMx_2, NOMy_2, E_Grad=0.0f, E_Data=0.0f;
 	
 	/* first calculate \grad U_xy*/	
         for(j=0; j<dimY; j++) {
@@ -44,11 +45,11 @@ float TV_energy2D(float *U, float *U0, float *E_val, float lambda, int dimX, int
                 i1 = i + 1; if (i == dimX-1) i1 = i;
                 j1 = j + 1; if (j == dimY-1) j1 = j;
                 
-                /* Forward differences */
-                NOMx_2 = powf(U[j1*dimX + i] - U[index],2); /* x+ */
-                NOMy_2 = powf(U[j*dimX + i1] - U[index],2); /* y+ */
-                E_Grad += sqrtf(NOMx_2 + NOMy_2); /* gradient term energy */
-                E_Data += 0.5f * lambda*(powf((U[index]-U0[index]),2)); /* fidelity term energy */
+                /* Forward differences */                
+                NOMx_2 = powf((float)(U[j1*dimX + i] - U[index]),2); /* x+ */
+                NOMy_2 = powf((float)(U[j*dimX + i1] - U[index]),2); /* y+ */
+                E_Grad += sqrtf((float)(NOMx_2) + (float)(NOMy_2)); /* gradient term energy */
+                E_Data += 0.5f * lambda*(powf((float)(U[index]-U0[index]),2)); /* fidelity term energy */
 			}
 		}
 		E_val[0] = E_Grad + E_Data;
diff --git a/Wrappers/Matlab/mex_compile/regularizers_CPU/FGP_TV.c b/Wrappers/Matlab/mex_compile/regularizers_CPU/FGP_TV.c
index 30cea1a..7cc861a 100644
--- a/Wrappers/Matlab/mex_compile/regularizers_CPU/FGP_TV.c
+++ b/Wrappers/Matlab/mex_compile/regularizers_CPU/FGP_TV.c
@@ -53,9 +53,9 @@ void mexFunction(
         int nrhs, const mxArray *prhs[])
         
 {
-    int number_of_dims, iter, dimX, dimY, dimZ, ll, j, count, methTV;
+    int number_of_dims, iter, dimX, dimY, dimZ, methTV;
     const int  *dim_array;
-    float *A, *D=NULL, *D_old=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_old=NULL, *P2_old=NULL, *P3_old=NULL, *R1=NULL, *R2=NULL, *R3=NULL, lambda, tk, tkp1, re, re1, re_old, epsil;
+    float *Input, *Output, lambda, epsil;
     
     number_of_dims = mxGetNumberOfDimensions(prhs[0]);
     dim_array = mxGetDimensions(prhs[0]);
@@ -63,9 +63,9 @@ void mexFunction(
     /*Handling Matlab input data*/
     if ((nrhs < 2) || (nrhs > 5)) mexErrMsgTxt("At least 2 parameters is required: Image(2D/3D), Regularization parameter. The full list of parameters: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1')");
     
-    A  = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */
+    Input  = (float *) mxGetData(prhs[0]); /*noisy image (2D/3D) */
     lambda =  (float) mxGetScalar(prhs[1]); /* regularization parameter */
-    iter = 50; /* default iterations number */
+    iter = 300; /* default iterations number */
     epsil = 0.0001; /* default tolerance constant */
     methTV = 0;  /* default isotropic TV penalty */
     
@@ -78,139 +78,17 @@ void mexFunction(
         if (strcmp(penalty_type, "l1") == 0)  methTV = 1;  /* enable 'l1' penalty */
         mxFree(penalty_type);
     }
-    /*output function value (last iteration) */
-    plhs[1] = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);  
-    float *funcvalA = (float *) mxGetData(plhs[1]);
         
     if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input image must be in a single precision"); }
     
     /*Handling Matlab output data*/
-    dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];
-    
-    tk = 1.0f;
-    tkp1=1.0f;
-    count = 0;
-    re_old = 0.0f;
+    dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];       
     
     if (number_of_dims == 2) {
         dimZ = 1; /*2D case*/
-        D = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        D_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        P1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        P2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        P1_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        P2_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        R1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        R2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-        
-        /* begin iterations */
-        for(ll=0; ll<iter; ll++) {
-            
-            /* computing the gradient of the objective function */
-            Obj_func2D(A, D, R1, R2, lambda, dimX, dimY);
-            
-            /*Taking a step towards minus of the gradient*/
-            Grad_func2D(P1, P2, D, R1, R2, lambda, dimX, dimY);
-            
-            /* projection step */
-            Proj_func2D(P1, P2, methTV, dimX, dimY);
-            
-            /*updating R and t*/
-            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-            Rupd_func2D(P1, P1_old, P2, P2_old, R1, R2, tkp1, tk, dimX, dimY);                
-            
-            /* calculate norm */
-            re = 0.0f; re1 = 0.0f;
-            for(j=0; j<dimX*dimY*dimZ; j++)
-            {
-                re += pow(D[j] - D_old[j],2);
-                re1 += pow(D[j],2);
-            }
-            re = sqrt(re)/sqrt(re1);
-            if (re < epsil)  count++;
-            if (count > 4) {
-                Obj_func_CALC2D(A, D, funcvalA, lambda, dimX, dimY); 
-                break; }
-            
-            /* check that the residual norm is decreasing */
-            if (ll > 2) {
-                if (re > re_old) {
-                    Obj_func_CALC2D(A, D, funcvalA, lambda, dimX, dimY);                                   
-                    break; }}            
-            re_old = re;
-            /*printf("%f %i %i \n", re, ll, count); */                      
-          
-            /*storing old values*/
-            copyIm(D, D_old, dimX, dimY, dimZ);
-            copyIm(P1, P1_old, dimX, dimY, dimZ);
-            copyIm(P2, P2_old, dimX, dimY, dimZ);
-            tk = tkp1;
-            
-            /* calculating the objective function value */
-            if (ll == (iter-1)) Obj_func_CALC2D(A, D, funcvalA, lambda, dimX, dimY);            
-        }
-        printf("FGP-TV iterations stopped at iteration %i with the function value %f \n", ll, funcvalA[0]);
-    }
-    if (number_of_dims == 3) {
-        D = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        D_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        P1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        P2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        P3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        P1_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        P2_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        P3_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        R1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        R2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        R3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-        
-        /* begin iterations */
-        for(ll=0; ll<iter; ll++) {
-            
-            /* computing the gradient of the objective function */
-            Obj_func3D(A, D, R1, R2, R3,lambda, dimX, dimY, dimZ);
-            
-            /*Taking a step towards minus of the gradient*/
-            Grad_func3D(P1, P2, P3, D, R1, R2, R3, lambda, dimX, dimY, dimZ);
-            
-            /* projection step */
-            Proj_func3D(P1, P2, P3, dimX, dimY, dimZ);
-            
-            /*updating R and t*/
-            tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-            Rupd_func3D(P1, P1_old, P2, P2_old, P3, P3_old, R1, R2, R3, tkp1, tk, dimX, dimY, dimZ);
-            
-            /* calculate norm - stopping rules*/
-            re = 0.0f; re1 = 0.0f;
-            for(j=0; j<dimX*dimY*dimZ; j++)
-            {
-                re += pow(D[j] - D_old[j],2);
-                re1 += pow(D[j],2);
-            }
-            re = sqrt(re)/sqrt(re1);
-            /* stop if the norm residual is less than the tolerance EPS */
-            if (re < epsil)  count++;
-            if (count > 3) {
-                Obj_func_CALC3D(A, D, funcvalA, lambda, dimX, dimY, dimZ);                            
-                break;}
-            
-            /* check that the residual norm is decreasing */
-            if (ll > 2) {
-                if (re > re_old) {
-                Obj_func_CALC3D(A, D, funcvalA, lambda, dimX, dimY, dimZ);
-                }}            
-            re_old = re;
-            /*printf("%f %i %i \n", re, ll, count); */
-            
-            /*storing old values*/
-            copyIm(D, D_old, dimX, dimY, dimZ);
-            copyIm(P1, P1_old, dimX, dimY, dimZ);
-            copyIm(P2, P2_old, dimX, dimY, dimZ);
-            copyIm(P3, P3_old, dimX, dimY, dimZ);
-            tk = tkp1;
-            
-            if (ll == (iter-1)) Obj_func_CALC3D(A, D, funcvalA, lambda, dimX, dimY, dimZ);            
-        }
-        printf("FGP-TV iterations stopped at iteration %i with the function value %f \n", ll, funcvalA[0]);
-    }
+        Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
+        TV_FGP_CPU_main(Input, Output, lambda, iter, epsil, methTV, 0, 0, dimX, dimY, dimZ);
+    } 
+    if (number_of_dims == 3) {          
+    }   
 }
diff --git a/Wrappers/Python/demo/test_cpu_regularizers.py b/Wrappers/Python/demo/test_cpu_regularizers.py
index f1eb3c3..7f08605 100644
--- a/Wrappers/Python/demo/test_cpu_regularizers.py
+++ b/Wrappers/Python/demo/test_cpu_regularizers.py
@@ -111,12 +111,10 @@ rms = rmse(Im, splitbregman)
 pars['rmse'] = rms
 txtstr = printParametersToString(pars) 
 txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
-print (txtstr)
-    
+print (txtstr)   
 
 a=fig.add_subplot(2,4,2)
 
-
 # these are matplotlib.patch.Patch properties
 props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
 # place a text box in upper left in axes coords
@@ -130,14 +128,14 @@ imgplot = plt.imshow(splitbregman,\
 
 start_time = timeit.default_timer()
 pars = {'algorithm' : TV_FGP_CPU , \
-        'input' : u0,
+        'input' : u0,\
         'regularization_parameter':0.07, \
         'number_of_iterations' :300 ,\
         'tolerance_constant':0.00001,\
         'methodTV': 0 ,\
         'nonneg': 0 ,\
-        'printingOut': 0
-}
+        'printingOut': 0 
+        }
 
 out = TV_FGP_CPU (pars['input'], 
               pars['regularization_parameter'],
diff --git a/Wrappers/Python/src/cpu_regularizers.pyx b/Wrappers/Python/src/cpu_regularizers.pyx
index d62ca59..60e8627 100644
--- a/Wrappers/Python/src/cpu_regularizers.pyx
+++ b/Wrappers/Python/src/cpu_regularizers.pyx
@@ -93,7 +93,7 @@ def TV_FGP_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
     return outputData        
             
 def TV_FGP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
- float regularization_parameter,
+                     float regularization_parameter,
                      int iterationsNumb, 
                      float tolerance_param,
                      int methodTV,
@@ -109,11 +109,10 @@ def TV_FGP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
            
     #/* Run ROF iterations for 3D data */
     TV_FGP_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularization_parameter,
-					   iterationsNumb, 
+                       iterationsNumb, 
                        tolerance_param,
                        methodTV,
                        nonneg,
                        printM,
                        dims[0], dims[1], dims[2])
-
     return outputData 
-- 
cgit v1.2.3


From 69ecdd57434d591eb3fa4afefb72174d3e025fb9 Mon Sep 17 00:00:00 2001
From: algol <dkazanc@hotmail.com>
Date: Tue, 6 Mar 2018 12:48:21 +0000
Subject: FGP_CPU (Cythonized) now works in demo

---
 Wrappers/Python/demo/test_cpu_regularizers.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/Wrappers/Python/demo/test_cpu_regularizers.py b/Wrappers/Python/demo/test_cpu_regularizers.py
index 7f08605..1d97857 100644
--- a/Wrappers/Python/demo/test_cpu_regularizers.py
+++ b/Wrappers/Python/demo/test_cpu_regularizers.py
@@ -137,7 +137,7 @@ pars = {'algorithm' : TV_FGP_CPU , \
         'printingOut': 0 
         }
 
-out = TV_FGP_CPU (pars['input'], 
+fgp = TV_FGP_CPU(pars['input'], 
               pars['regularization_parameter'],
               pars['number_of_iterations'],
               pars['tolerance_constant'], 
@@ -145,7 +145,6 @@ out = TV_FGP_CPU (pars['input'],
               pars['nonneg'],
               pars['printingOut'])  
 
-fgp = out[0]
 rms = rmse(Im, fgp)
 pars['rmse'] = rms
 
@@ -154,7 +153,7 @@ txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
 print (txtstr)       
 
 
-a=fig.add_subplot(2,4,4)
+a=fig.add_subplot(2,4,3)
 
 # these are matplotlib.patch.Patch properties
 props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
@@ -168,7 +167,6 @@ a.text(0.05, 0.95, txtstr, transform=a.transAxes, fontsize=14,
 
 
 ###################### LLT_model #########################################
-"""
 start_time = timeit.default_timer()
 
 pars = {'algorithm': LLT_model , \
@@ -203,8 +201,6 @@ a.text(0.05, 0.95, txtstr, transform=a.transAxes, fontsize=14,
 imgplot = plt.imshow(llt,\
                      cmap="gray"
                      )
-"""
-
 # ###################### PatchBased_Regul #########################################
 # # Quick 2D denoising example in Matlab:   
 # #   Im = double(imread('lena_gray_256.tif'))/255;  % loading image
@@ -286,7 +282,7 @@ start_time = timeit.default_timer()
 
 pars = {'algorithm': TV_ROF_CPU , \
         'input' : u0,\
-        'regularization_parameter':0.04,\
+        'regularization_parameter':0.07,\
         'marching_step': 0.0025,\
         'number_of_iterations': 300
         }
-- 
cgit v1.2.3


From 5411ebbd4165c81b398b010d6ad9d11d2e973aad Mon Sep 17 00:00:00 2001
From: Daniil Kazantsev <dkazanc@hotmail.com>
Date: Tue, 6 Mar 2018 14:40:11 +0000
Subject: work on cythonization2

---
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu | 17 ++---
 Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h  |  2 +-
 Wrappers/Python/src/gpu_regularizers.pyx        | 97 ++++++++++++++-----------
 3 files changed, 65 insertions(+), 51 deletions(-)

diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu
index 480855f..a30a89b 100755
--- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu
+++ b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.cu
@@ -304,12 +304,11 @@ __host__ __device__ int sign (float x)
 
 /////////////////////////////////////////////////
 // HOST FUNCTION
-extern "C" void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda)
+extern "C" void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z)
 {
 	    // set up device
 		int dev = 0;
 		CHECK(cudaSetDevice(dev));
-		
         float *d_input, *d_update, *d_D1, *d_D2;
         
 	if (Z == 0) Z = 1;
@@ -331,14 +330,14 @@ extern "C" void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int
             
             for(int n=0; n < iter; n++) {
                 /* calculate differences */
-                D1_func3D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M, Z);				
+                D1_func3D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M, Z);
                 CHECK(cudaDeviceSynchronize());
-				D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z);				                
+				D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z);
                 CHECK(cudaDeviceSynchronize());        
-                D3_func3D<<<dimGrid,dimBlock>>>(d_update, d_D3, N, M, Z);				
+                D3_func3D<<<dimGrid,dimBlock>>>(d_update, d_D3, N, M, Z);
                 CHECK(cudaDeviceSynchronize());        
                 /*running main kernel*/
-                TV_kernel3D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_D3, d_update, d_input, lambda, tau, N, M, Z);
+                TV_kernel3D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_D3, d_update, d_input, lambdaPar, tau, N, M, Z);
                 CHECK(cudaDeviceSynchronize());
             }
             
@@ -351,12 +350,12 @@ extern "C" void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int
              
             for(int n=0; n < iter; n++) {
                 /* calculate differences */
-                D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M);				
+                D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M);
                 CHECK(cudaDeviceSynchronize());
-				D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M);				
+				D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M);
                 CHECK(cudaDeviceSynchronize());        
                 /*running main kernel*/
-                TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambda, tau, N, M);
+                TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambdaPar, tau, N, M);
                 CHECK(cudaDeviceSynchronize());
             }
         }        
diff --git a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h
index 8b64d99..d772aba 100755
--- a/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h
+++ b/Core/regularizers_GPU/TV_ROF/TV_ROF_GPU_core.h
@@ -3,6 +3,6 @@
 #include "CCPiDefines.h"
 #include <stdio.h>
 
-extern "C" CCPI_EXPORT void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambda);
+extern "C" CCPI_EXPORT void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
 
 #endif 
diff --git a/Wrappers/Python/src/gpu_regularizers.pyx b/Wrappers/Python/src/gpu_regularizers.pyx
index e99bfa7..cb94e86 100644
--- a/Wrappers/Python/src/gpu_regularizers.pyx
+++ b/Wrappers/Python/src/gpu_regularizers.pyx
@@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 
-Author: Edoardo Pasca
+Author: Edoardo Pasca, Daniil Kazantsev
 """
 
 import cython
@@ -25,14 +25,16 @@ cdef extern void NLM_GPU_kernel(float *A, float* B, float *Eucl_Vec,
                                 int N, int M,  int Z, int dimension, 
                                 int SearchW, int SimilW, 
                                 int SearchW_real, float denh2, float lambdaf);
-cdef extern void TV_ROF_GPU(float* Input, float* Output, int N, int M, int Z, int iter, float tau, float lambdaf);
-cdef extern void TV_FGP_GPU(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
+cdef extern void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
+#cdef extern void TV_FGP_GPU(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
+# correct the function
+cdef extern void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
 
 cdef extern float pad_crop(float *A, float *Ap, 
                            int OldSizeX, int OldSizeY, int OldSizeZ, 
                            int NewSizeX, int NewSizeY, int NewSizeZ, 
                            int padXY, int switchpad_crop);
-
+#Diffusion 4th order regularizer
 def Diff4thHajiaboli(inputData, 
                      edge_preserv_parameter, 
                      iterations, 
@@ -50,7 +52,7 @@ def Diff4thHajiaboli(inputData,
                      iterations, 
                      time_marching_parameter,
                      regularization_parameter)
-        
+# patch-based nonlocal regularization
 def NML(inputData, 
                      SearchW_real, 
                      SimilW, 
@@ -68,23 +70,37 @@ def NML(inputData,
                      SimilW, 
                      h,
                      lambdaf)
-
-def GPU_ROF_TV(inputData,
+# Total-variation Rudin-Osher-Fatemi (ROF)
+def TV_ROF_GPU(inputData,
+                     regularization_parameter,
                      iterations, 
-                     time_marching_parameter,
-                     regularization_parameter):
+                     time_marching_parameter):
     if inputData.ndim == 2:
         return ROFTV2D(inputData, 
-                     iterations, 
-                     time_marching_parameter,
-                     regularization_parameter)
+                     regularization_parameter,
+                     iterations,
+                     time_marching_parameter)
     elif inputData.ndim == 3:
         return ROFTV3D(inputData, 
+                     regularization_parameter
                      iterations, 
-                     time_marching_parameter,
-                     regularization_parameter)
-
-                    
+                     time_marching_parameter)
+# Total-variation Fast-Gradient-Projection (FGP)
+def TV_FGP_GPU(inputData,
+                     regularization_parameter,
+                     iterations, 
+                     time_marching_parameter):
+    if inputData.ndim == 2:
+        return FGPTV2D(inputData, 
+                     regularization_parameter,
+                     iterations,
+                     time_marching_parameter)
+    elif inputData.ndim == 3:
+        return FGPTV3D(inputData, 
+                     regularization_parameter
+                     iterations, 
+                     time_marching_parameter)
+#****************************************************************#
 def Diff4thHajiaboli2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
                      float edge_preserv_parameter, 
                      int iterations, 
@@ -333,52 +349,51 @@ def NML3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
     return B                   
                      
 def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
+                     float regularization_parameter,
                      int iterations, 
-                     float time_marching_parameter,
-                     float regularization_parameter):
+                     float time_marching_parameter):
     
     cdef long dims[2]
     dims[0] = inputData.shape[0]
     dims[1] = inputData.shape[1]
 
-    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] B = \
+    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \
 		    np.zeros([dims[0],dims[1]], dtype='float32')
           
     # Running CUDA code here    
-    TV_ROF_GPU(            
-            &inputData[0,0], &B[0,0], 
-                       dims[0], dims[1], 1, 
+    TV_ROF_GPU_main(            
+            &inputData[0,0], &outputData[0,0], 
+                       regularization_parameter,
                        iterations , 
                        time_marching_parameter, 
-                       regularization_parameter);   
+                       dims[0], dims[1], 1);   
      
-    return B
+    return outputData
     
 def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
+                     float regularization_parameter,
                      int iterations, 
-                     float time_marching_parameter,
-                     float regularization_parameter):
+                     float time_marching_parameter):
     
     cdef long dims[3]
     dims[0] = inputData.shape[0]
     dims[1] = inputData.shape[1]
     dims[2] = inputData.shape[2]
 
-    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] B = \
+    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
           
     # Running CUDA code here    
-    TV_ROF_GPU(            
-            &inputData[0,0,0], &B[0,0,0], 
-                       dims[0], dims[1], dims[2], 
+    TV_ROF_GPU_main(            
+            &inputData[0,0,0], &outputData[0,0,0], 
+                       regularization_parameter,
                        iterations , 
                        time_marching_parameter, 
-                       regularization_parameter);   
+                       dims[0], dims[1], dims[2]);   
      
-    return B
-
+    return outputData
 
-def TVFGP2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
+def FGPTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
                      float regularization_parameter,
                      int iterations, 
                      float tolerance_param,
@@ -390,12 +405,12 @@ def TVFGP2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
     dims[0] = inputData.shape[0]
     dims[1] = inputData.shape[1]
 
-    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] B = \
+    cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \
 		    np.zeros([dims[0],dims[1]], dtype='float32')
           
     # Running CUDA code here    
     TV_FGP_GPU(            
-            &inputData[0,0], &B[0,0],                        
+            &inputData[0,0], &outputData[0,0],                        
                        regularization_parameter, 
                        iterations, 
                        tolerance_param,
@@ -404,9 +419,9 @@ def TVFGP2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
                        printM,
                        dims[0], dims[1], 1);   
      
-    return B
+    return outputData
     
-def TVFGP3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
+def FGPTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      float regularization_parameter,
                      int iterations, 
                      float tolerance_param,
@@ -419,12 +434,12 @@ def TVFGP3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
     dims[1] = inputData.shape[1]
     dims[2] = inputData.shape[2]
 
-    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] B = \
+    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
           
     # Running CUDA code here    
     TV_FGP_GPU(            
-            &inputData[0,0,0], &B[0,0,0], 
+            &inputData[0,0,0], &outputData[0,0,0], 
                        regularization_parameter , 
                        iterations, 
                        tolerance_param,
@@ -433,7 +448,7 @@ def TVFGP3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
                        printM,
                        dims[0], dims[1], dims[2]);   
      
-    return B    
+    return outputData    
     
     
-- 
cgit v1.2.3


From 63ad8306c261a90c572d95084bf1dd8db2b3dce7 Mon Sep 17 00:00:00 2001
From: algol <dkazanc@hotmail.com>
Date: Tue, 6 Mar 2018 16:18:20 +0000
Subject: FGP/ROF-CPU/GPU fully working with new interfaces and regularizers.py
 script. This closes #39 and #38

---
 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu    |   2 +-
 Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h     |   2 +-
 Wrappers/Python/ccpi/filters/regularizers.py       |  44 +++++++
 Wrappers/Python/src/gpu_regularizers.pyx           |  43 ++++---
 Wrappers/Python/test/test_cpu_vs_gpu.py            |  10 --
 .../Python/test/test_cpu_vs_gpu_regularizers.py    | 137 ++++++++++++++++++---
 Wrappers/Python/test/test_gpu_regularizers.py      |  16 +--
 7 files changed, 198 insertions(+), 56 deletions(-)
 create mode 100644 Wrappers/Python/ccpi/filters/regularizers.py
 delete mode 100644 Wrappers/Python/test/test_cpu_vs_gpu.py

diff --git a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
index cbe9b5e..61097fb 100755
--- a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
+++ b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.cu
@@ -341,7 +341,7 @@ __global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z,
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
 
 ////////////MAIN HOST FUNCTION ///////////////
-extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
 {
     int deviceCount = -1; // number of devices
     cudaGetDeviceCount(&deviceCount);
diff --git a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
index 2b8fdcf..107d243 100755
--- a/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
+++ b/Core/regularizers_GPU/TV_FGP/TV_FGP_GPU_core.h
@@ -5,6 +5,6 @@
 #ifndef _TV_FGP_GPU_
 #define _TV_FGP_GPU_
 
-extern "C" void TV_FGP_GPU(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);   
+extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); 
 
 #endif 
diff --git a/Wrappers/Python/ccpi/filters/regularizers.py b/Wrappers/Python/ccpi/filters/regularizers.py
new file mode 100644
index 0000000..d6dfa8c
--- /dev/null
+++ b/Wrappers/Python/ccpi/filters/regularizers.py
@@ -0,0 +1,44 @@
+"""
+script which assigns a proper device core function based on a flag ('cpu' or 'gpu')
+"""
+
+from ccpi.filters.cpu_regularizers_cython import TV_ROF_CPU, TV_FGP_CPU
+from ccpi.filters.gpu_regularizers import TV_ROF_GPU, TV_FGP_GPU
+
+def ROF_TV(inputData, regularization_parameter, iterations,
+                     time_marching_parameter,device='cpu'):
+    if device == 'cpu':
+        return TV_ROF_CPU(inputData,
+                     regularization_parameter,
+                     iterations, 
+                     time_marching_parameter)
+    elif device == 'gpu':
+        return TV_ROF_GPU(inputData,
+                     regularization_parameter,
+                     iterations, 
+                     time_marching_parameter)
+    else:
+        raise ValueError('Unknown device {0}. Expecting gpu or cpu'\
+                         .format(device))
+
+def FGP_TV(inputData, regularization_parameter,iterations,
+                     tolerance_param, methodTV, nonneg, printM, device='cpu'):
+    if device == 'cpu':
+        return TV_FGP_CPU(inputData,
+                     regularization_parameter,
+                     iterations, 
+                     tolerance_param,
+                     methodTV,
+                     nonneg,
+                     printM)
+    elif device == 'gpu':
+        return TV_FGP_GPU(inputData,
+                     regularization_parameter,
+                     iterations, 
+                     tolerance_param,
+                     methodTV,
+                     nonneg,
+                     printM)
+    else:
+        raise ValueError('Unknown device {0}. Expecting gpu or cpu'\
+                         .format(device))
\ No newline at end of file
diff --git a/Wrappers/Python/src/gpu_regularizers.pyx b/Wrappers/Python/src/gpu_regularizers.pyx
index cb94e86..f96156a 100644
--- a/Wrappers/Python/src/gpu_regularizers.pyx
+++ b/Wrappers/Python/src/gpu_regularizers.pyx
@@ -26,14 +26,14 @@ cdef extern void NLM_GPU_kernel(float *A, float* B, float *Eucl_Vec,
                                 int SearchW, int SimilW, 
                                 int SearchW_real, float denh2, float lambdaf);
 cdef extern void TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
-#cdef extern void TV_FGP_GPU(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
-# correct the function
 cdef extern void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
 
+# padding function
 cdef extern float pad_crop(float *A, float *Ap, 
                            int OldSizeX, int OldSizeY, int OldSizeZ, 
                            int NewSizeX, int NewSizeY, int NewSizeZ, 
                            int padXY, int switchpad_crop);
+                           
 #Diffusion 4th order regularizer
 def Diff4thHajiaboli(inputData, 
                      edge_preserv_parameter, 
@@ -70,6 +70,7 @@ def NML(inputData,
                      SimilW, 
                      h,
                      lambdaf)
+                     
 # Total-variation Rudin-Osher-Fatemi (ROF)
 def TV_ROF_GPU(inputData,
                      regularization_parameter,
@@ -82,24 +83,34 @@ def TV_ROF_GPU(inputData,
                      time_marching_parameter)
     elif inputData.ndim == 3:
         return ROFTV3D(inputData, 
-                     regularization_parameter
+                     regularization_parameter,
                      iterations, 
                      time_marching_parameter)
+                     
 # Total-variation Fast-Gradient-Projection (FGP)
 def TV_FGP_GPU(inputData,
                      regularization_parameter,
                      iterations, 
-                     time_marching_parameter):
+                     tolerance_param,
+                     methodTV,
+                     nonneg,
+                     printM):
     if inputData.ndim == 2:
-        return FGPTV2D(inputData, 
+        return FGPTV2D(inputData,
                      regularization_parameter,
-                     iterations,
-                     time_marching_parameter)
+                     iterations, 
+                     tolerance_param,
+                     methodTV,
+                     nonneg,
+                     printM)
     elif inputData.ndim == 3:
-        return FGPTV3D(inputData, 
-                     regularization_parameter
+        return FGPTV3D(inputData,
+                     regularization_parameter,
                      iterations, 
-                     time_marching_parameter)
+                     tolerance_param,
+                     methodTV,
+                     nonneg,
+                     printM)
 #****************************************************************#
 def Diff4thHajiaboli2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
                      float edge_preserv_parameter, 
@@ -347,7 +358,8 @@ def NML3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
              switchpad_crop)
     
     return B                   
-                     
+  
+# Total-variation ROF 
 def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
                      float regularization_parameter,
                      int iterations, 
@@ -393,6 +405,7 @@ def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
      
     return outputData
 
+# Total-variation Fast-Gradient-Projection (FGP)
 def FGPTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
                      float regularization_parameter,
                      int iterations, 
@@ -409,8 +422,7 @@ def FGPTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1]], dtype='float32')
           
     # Running CUDA code here    
-    TV_FGP_GPU(            
-            &inputData[0,0], &outputData[0,0],                        
+    TV_FGP_GPU_main(&inputData[0,0], &outputData[0,0],                        
                        regularization_parameter, 
                        iterations, 
                        tolerance_param,
@@ -438,7 +450,7 @@ def FGPTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
           
     # Running CUDA code here    
-    TV_FGP_GPU(            
+    TV_FGP_GPU_main(            
             &inputData[0,0,0], &outputData[0,0,0], 
                        regularization_parameter , 
                        iterations, 
@@ -449,6 +461,3 @@ def FGPTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
                        dims[0], dims[1], dims[2]);   
      
     return outputData    
-    
-    
-    
diff --git a/Wrappers/Python/test/test_cpu_vs_gpu.py b/Wrappers/Python/test/test_cpu_vs_gpu.py
deleted file mode 100644
index 74d65dd..0000000
--- a/Wrappers/Python/test/test_cpu_vs_gpu.py
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Feb 21 12:12:22 2018
-
-# CPU vs GPU comparison tests
-
-@author: algol
-"""
-
diff --git a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
index e162afa..a9d0f31 100644
--- a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
@@ -5,15 +5,17 @@ Created on Thu Feb 22 11:39:43 2018
 
 Testing CPU implementation against GPU one
 
-@author: algol
+@author: Daniil Kazantsev
 """
 
 import matplotlib.pyplot as plt
 import numpy as np
-import os    
+import os
 import timeit
-from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, GPU_ROF_TV
-from ccpi.filters.cpu_regularizers_cython import TV_ROF_CPU
+#from ccpi.filters.cpu_regularizers_cython import TV_ROF_CPU, TV_FGP_CPU
+#from ccpi.filters.gpu_regularizers import TV_ROF_GPU, TV_FGP_GPU
+from ccpi.filters.regularizers import ROF_TV, FGP_TV
+
 ###############################################################################
 def printParametersToString(pars):
         txt = r''
@@ -47,6 +49,11 @@ u0 = Im + np.random.normal(loc = Im ,
 f = np.frompyfunc(lambda x: 0 if x < 0 else x, 1,1)
 u0 = f(u0).astype('float32')
 
+
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("____________ROF-TV bench___________________")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+
 ## plot 
 fig = plt.figure(1)
 plt.suptitle('Comparison of ROF-TV regularizer using CPU and GPU implementations')
@@ -54,22 +61,19 @@ a=fig.add_subplot(1,4,1)
 a.set_title('Noisy Image')
 imgplot = plt.imshow(u0,cmap="gray")
 
-
 # set parameters
-pars = {'algorithm': TV_ROF_CPU , \
+pars = {'algorithm': ROF_TV, \
         'input' : u0,\
         'regularization_parameter':0.04,\
-        'time_marching_parameter': 0.0025,\
-        'number_of_iterations': 1200
+        'number_of_iterations': 1200,\
+        'time_marching_parameter': 0.0025        
         }
-print ("#################ROF TV CPU#####################")
+print ("#############ROF TV CPU####################")
 start_time = timeit.default_timer()
-rof_cpu = TV_ROF_CPU(pars['input'],
-             pars['number_of_iterations'],
+rof_cpu = ROF_TV(pars['input'],
              pars['regularization_parameter'],
-             pars['time_marching_parameter'] 
-             )
-#tgv = out
+             pars['number_of_iterations'],
+             pars['time_marching_parameter'],'cpu')
 rms = rmse(Im, rof_cpu)
 pars['rmse'] = rms
 
@@ -87,16 +91,16 @@ imgplot = plt.imshow(rof_cpu, cmap="gray")
 plt.title('{}'.format('CPU results'))
 
 
-print ("#################ROF TV GPU#####################")
+print ("##############ROF TV GPU##################")
 start_time = timeit.default_timer()
-rof_gpu = GPU_ROF_TV(pars['input'], 
+rof_gpu = ROF_TV(pars['input'], 
+                     pars['regularization_parameter'],
                      pars['number_of_iterations'], 
-                     pars['time_marching_parameter'], 
-                     pars['regularization_parameter'])
+                     pars['time_marching_parameter'],'gpu')
                      
 rms = rmse(Im, rof_gpu)
 pars['rmse'] = rms
-pars['algorithm'] = GPU_ROF_TV
+pars['algorithm'] = ROF_TV
 txtstr = printParametersToString(pars)
 txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
 print (txtstr)
@@ -112,7 +116,8 @@ plt.title('{}'.format('GPU results'))
 
 
 print ("--------Compare the results--------")
-tolerance = 1e-06
+tolerance = 1e-05
+diff_im = np.zeros(np.shape(rof_cpu))
 diff_im = abs(rof_cpu - rof_gpu)
 diff_im[diff_im > tolerance] = 1
 a=fig.add_subplot(1,4,4)
@@ -122,3 +127,95 @@ if (diff_im.sum() > 1):
     print ("Arrays do not match!")
 else:
     print ("Arrays match")
+
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("____________FGP-TV bench___________________")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+
+## plot 
+fig = plt.figure(2)
+plt.suptitle('Comparison of FGP-TV regularizer using CPU and GPU implementations')
+a=fig.add_subplot(1,4,1)
+a.set_title('Noisy Image')
+imgplot = plt.imshow(u0,cmap="gray")
+
+# set parameters
+pars = {'algorithm' : FGP_TV, \
+        'input' : u0,\
+        'regularization_parameter':0.04, \
+        'number_of_iterations' :1200 ,\
+        'tolerance_constant':0.00001,\
+        'methodTV': 0 ,\
+        'nonneg': 0 ,\
+        'printingOut': 0 
+        }
+        
+print ("#############FGP TV CPU####################")
+start_time = timeit.default_timer()
+fgp_cpu = FGP_TV(pars['input'], 
+              pars['regularization_parameter'],
+              pars['number_of_iterations'],
+              pars['tolerance_constant'], 
+              pars['methodTV'],
+              pars['nonneg'],
+              pars['printingOut'],'cpu')  
+             
+             
+rms = rmse(Im, fgp_cpu)
+pars['rmse'] = rms
+
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,4,2)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(fgp_cpu, cmap="gray")
+plt.title('{}'.format('CPU results'))
+
+
+print ("##############FGP TV GPU##################")
+start_time = timeit.default_timer()
+fgp_gpu = FGP_TV(pars['input'], 
+              pars['regularization_parameter'],
+              pars['number_of_iterations'],
+              pars['tolerance_constant'], 
+              pars['methodTV'],
+              pars['nonneg'],
+              pars['printingOut'],'gpu')
+                                   
+rms = rmse(Im, fgp_gpu)
+pars['rmse'] = rms
+pars['algorithm'] = FGP_TV
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,4,3)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(fgp_gpu, cmap="gray")
+plt.title('{}'.format('GPU results'))
+
+
+print ("--------Compare the results--------")
+tolerance = 1e-05
+diff_im = np.zeros(np.shape(rof_cpu))
+diff_im = abs(fgp_cpu - fgp_gpu)
+diff_im[diff_im > tolerance] = 1
+a=fig.add_subplot(1,4,4)
+imgplot = plt.imshow(diff_im, vmin=0, vmax=1, cmap="gray")
+plt.title('{}'.format('Pixels larger threshold difference'))
+if (diff_im.sum() > 1):
+    print ("Arrays do not match!")
+else:
+    print ("Arrays match")
+
+
diff --git a/Wrappers/Python/test/test_gpu_regularizers.py b/Wrappers/Python/test/test_gpu_regularizers.py
index c473270..04aeeb4 100644
--- a/Wrappers/Python/test/test_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_gpu_regularizers.py
@@ -11,7 +11,8 @@ import numpy as np
 import os    
 from enum import Enum
 import timeit
-from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, GPU_ROF_TV
+from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, TV_ROF_GPU
+
 ###############################################################################
 def printParametersToString(pars):
         txt = r''
@@ -152,17 +153,18 @@ plt.colorbar(ticks=[0, 0.03], orientation='vertical')
 start_time = timeit.default_timer()
 
 pars = {
-'algorithm' : GPU_ROF_TV , \
+'algorithm' : TV_ROF_GPU , \
         'input' : u0,
         'regularization_parameter': 0.04,\
-        'time_marching_parameter': 0.0025, \
-        'number_of_iterations':300        
+        'number_of_iterations':300,\
+        'time_marching_parameter': 0.0025
+        
 	}
 
-rof_tv = GPU_ROF_TV(pars['input'], 
+rof_tv = TV_ROF_GPU(pars['input'], 
+                     pars['regularization_parameter'],
                      pars['number_of_iterations'], 
-                     pars['time_marching_parameter'], 
-                     pars['regularization_parameter'])
+                     pars['time_marching_parameter'])
                      
 rms = rmse(Im, rof_tv)
 pars['rmse'] = rms
-- 
cgit v1.2.3


From 4593040d228fec5ef1f1a301e511708d47f0d55e Mon Sep 17 00:00:00 2001
From: algol <dkazanc@hotmail.com>
Date: Tue, 6 Mar 2018 16:51:31 +0000
Subject: demos fixed #40

---
 Wrappers/Python/demo/test_cpu_regularizers.py      |  21 +-
 Wrappers/Python/src/cpu_regularizers.cpp           | 289 ---------------------
 .../Python/test/test_cpu_vs_gpu_regularizers.py    |   4 +-
 Wrappers/Python/test/test_gpu_regularizers.py      |  59 ++++-
 4 files changed, 63 insertions(+), 310 deletions(-)

diff --git a/Wrappers/Python/demo/test_cpu_regularizers.py b/Wrappers/Python/demo/test_cpu_regularizers.py
index 1d97857..76b9de7 100644
--- a/Wrappers/Python/demo/test_cpu_regularizers.py
+++ b/Wrappers/Python/demo/test_cpu_regularizers.py
@@ -11,10 +11,8 @@ import numpy as np
 import os    
 from enum import Enum
 import timeit
-from ccpi.filters.cpu_regularizers_boost import SplitBregman_TV, LLT_model, PatchBased_Regul ,\
-                                                 TGV_PD
-from ccpi.filters.cpu_regularizers_cython import TV_ROF_CPU, TV_FGP_CPU
-
+from ccpi.filters.cpu_regularizers_boost import SplitBregman_TV, LLT_model, PatchBased_Regul, TGV_PD
+from ccpi.filters.regularizers import ROF_TV, FGP_TV
 ###############################################################################
 #https://stackoverflow.com/questions/13875989/comparing-image-in-url-to-image-in-filesystem-in-python/13884956#13884956
 #NRMSE a normalization of the root of the mean squared error
@@ -127,7 +125,7 @@ imgplot = plt.imshow(splitbregman,\
 ###################### FGP_TV #########################################
 
 start_time = timeit.default_timer()
-pars = {'algorithm' : TV_FGP_CPU , \
+pars = {'algorithm' : FGP_TV , \
         'input' : u0,\
         'regularization_parameter':0.07, \
         'number_of_iterations' :300 ,\
@@ -137,13 +135,13 @@ pars = {'algorithm' : TV_FGP_CPU , \
         'printingOut': 0 
         }
 
-fgp = TV_FGP_CPU(pars['input'], 
+fgp = FGP_TV(pars['input'], 
               pars['regularization_parameter'],
               pars['number_of_iterations'],
               pars['tolerance_constant'], 
               pars['methodTV'],
               pars['nonneg'],
-              pars['printingOut'])  
+              pars['printingOut'], 'cpu')  
 
 rms = rmse(Im, fgp)
 pars['rmse'] = rms
@@ -280,18 +278,17 @@ imgplot = plt.imshow(tgv, cmap="gray")
 
 start_time = timeit.default_timer()
 
-pars = {'algorithm': TV_ROF_CPU , \
+pars = {'algorithm': ROF_TV , \
         'input' : u0,\
         'regularization_parameter':0.07,\
         'marching_step': 0.0025,\
         'number_of_iterations': 300
         }
-rof = TV_ROF_CPU(pars['input'],
+rof = ROF_TV(pars['input'],
 			 pars['regularization_parameter'],
              pars['number_of_iterations'],             
-             pars['marching_step'] 
-             )
-#tgv = out
+             pars['marching_step'], 'cpu')
+
 rms = rmse(Im, rof)
 pars['rmse'] = rms
 
diff --git a/Wrappers/Python/src/cpu_regularizers.cpp b/Wrappers/Python/src/cpu_regularizers.cpp
index b8156d5..88b7c3c 100644
--- a/Wrappers/Python/src/cpu_regularizers.cpp
+++ b/Wrappers/Python/src/cpu_regularizers.cpp
@@ -303,292 +303,6 @@ bp::list SplitBregman_TV(np::ndarray input, double d_mu, int iter, double d_epsi
 
 	}
 
-
-
-//bp::list FGP_TV(np::ndarray input, double d_mu, int iter, double d_epsil, int methTV) {
-
-	//// the result is in the following list
-	//bp::list result;
-
-	//int number_of_dims, dimX, dimY, dimZ, ll, j, count;
-	//float *A, *D = NULL, *D_old = NULL, *P1 = NULL, *P2 = NULL, *P3 = NULL, *P1_old = NULL, *P2_old = NULL, *P3_old = NULL, *R1 = NULL, *R2 = NULL, *R3 = NULL;
-	//float lambda, tk, tkp1, re, re1, re_old, epsil, funcval;
-
-	////number_of_dims = mxGetNumberOfDimensions(prhs[0]);
-	////dim_array = mxGetDimensions(prhs[0]);
-
-	//number_of_dims = input.get_nd();
-	//int dim_array[3];
-
-	//dim_array[0] = input.shape(0);
-	//dim_array[1] = input.shape(1);
-	//if (number_of_dims == 2) {
-		//dim_array[2] = -1;
-	//}
-	//else {
-		//dim_array[2] = input.shape(2);
-	//}
-	//// Parameter handling is be done in Python
-	/////*Handling Matlab input data*/
-	////if ((nrhs < 2) || (nrhs > 5)) mexErrMsgTxt("At least 2 parameters is required: Image(2D/3D), Regularization parameter. The full list of parameters: Image(2D/3D), Regularization parameter, iterations number, tolerance, penalty type ('iso' or 'l1')");
-
-	/////*Handling Matlab input data*/
-	////A = (float *)mxGetData(prhs[0]); /*noisy image (2D/3D) */
-	//A = reinterpret_cast<float *>(input.get_data());
-
-	////mu = (float)mxGetScalar(prhs[1]); /* regularization parameter */
-	//lambda = (float)d_mu;
-
-	////iter = 35; /* default iterations number */
-
-	////epsil = 0.0001; /* default tolerance constant */
-	//epsil = (float)d_epsil;
-	////methTV = 0;  /* default isotropic TV penalty */
-	////if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5))  iter = (int)mxGetScalar(prhs[2]); /* iterations number */
-	////if ((nrhs == 4) || (nrhs == 5))  epsil = (float)mxGetScalar(prhs[3]); /* tolerance constant */
-	////if (nrhs == 5) {
-	////	char *penalty_type;
-	////	penalty_type = mxArrayToString(prhs[4]); /* choosing TV penalty: 'iso' or 'l1', 'iso' is the default */
-	////	if ((strcmp(penalty_type, "l1") != 0) && (strcmp(penalty_type, "iso") != 0)) mexErrMsgTxt("Choose TV type: 'iso' or 'l1',");
-	////	if (strcmp(penalty_type, "l1") == 0)  methTV = 1;  /* enable 'l1' penalty */
-	////	mxFree(penalty_type);
-	////}
-	////if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) { mexErrMsgTxt("The input image must be in a single precision"); }
-
-	////plhs[1] = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);
-	//bp::tuple shape1 = bp::make_tuple(dim_array[0], dim_array[1]);
-	//np::dtype dtype = np::dtype::get_builtin<float>();
-	//np::ndarray out1 = np::zeros(shape1, dtype);
-	
-	////float *funcvalA = (float *)mxGetData(plhs[1]);
-	//float * funcvalA = reinterpret_cast<float *>(out1.get_data());
-	////if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) { mexErrMsgTxt("The input image must be in a single precision"); }
-
-	///*Handling Matlab output data*/
-	//dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];
-
-	//tk = 1.0f;
-	//tkp1 = 1.0f;
-	//count = 1;
-	//re_old = 0.0f;
-
-	//if (number_of_dims == 2) {
-		//dimZ = 1; /*2D case*/
-		///*D = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		//D_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		//P1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		//P2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		//P1_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		//P2_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		//R1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
-		//R2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));*/
-
-		//bp::tuple shape = bp::make_tuple(dim_array[0], dim_array[1]);
-		//np::dtype dtype = np::dtype::get_builtin<float>();
-
-
-		//np::ndarray npD      = np::zeros(shape, dtype);
-		//np::ndarray npD_old  = np::zeros(shape, dtype);
-		//np::ndarray npP1     = np::zeros(shape, dtype);
-		//np::ndarray npP2     = np::zeros(shape, dtype);
-		//np::ndarray npP1_old = np::zeros(shape, dtype);
-		//np::ndarray npP2_old = np::zeros(shape, dtype);
-		//np::ndarray npR1     = np::zeros(shape, dtype);
-		//np::ndarray npR2     = np::zeros(shape, dtype);
-
-		//D      = reinterpret_cast<float *>(npD.get_data());
-		//D_old  = reinterpret_cast<float *>(npD_old.get_data());
-		//P1     = reinterpret_cast<float *>(npP1.get_data());
-		//P2     = reinterpret_cast<float *>(npP2.get_data());
-		//P1_old = reinterpret_cast<float *>(npP1_old.get_data());
-		//P2_old = reinterpret_cast<float *>(npP2_old.get_data());
-		//R1     = reinterpret_cast<float *>(npR1.get_data());
-		//R2     = reinterpret_cast<float *>(npR2.get_data());
-
-		///* begin iterations */
-		//for (ll = 0; ll<iter; ll++) {
-			///* computing the gradient of the objective function */
-			//Obj_func2D(A, D, R1, R2, lambda, dimX, dimY);
-
-			///*Taking a step towards minus of the gradient*/
-			//Grad_func2D(P1, P2, D, R1, R2, lambda, dimX, dimY);
-  
-            ///* projection step */
-            //Proj_func2D(P1, P2, methTV, dimX, dimY);
-            
-            ///*updating R and t*/
-            //tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-            //Rupd_func2D(P1, P1_old, P2, P2_old, R1, R2, tkp1, tk, dimX, dimY);
-
-			///* calculate norm */
-			//re = 0.0f; re1 = 0.0f;
-			//for (j = 0; j<dimX*dimY*dimZ; j++)
-			//{
-				//re += pow(D[j] - D_old[j], 2);
-				//re1 += pow(D[j], 2);
-			//}
-			//re = sqrt(re) / sqrt(re1);
-			//if (re < epsil)  count++;
-			//if (count > 4) {
-				//Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
-				//funcval = 0.0f;
-				//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-				////funcvalA[0] = sqrt(funcval);
-				//float fv = sqrt(funcval);
-				//std::memcpy(funcvalA, &fv, sizeof(float));
-				//break;
-			//}
-
-			///* check that the residual norm is decreasing */
-			//if (ll > 2) {
-				//if (re > re_old) {
-					//Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
-					//funcval = 0.0f;
-					//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-					////funcvalA[0] = sqrt(funcval);
-					//float fv = sqrt(funcval);
-					//std::memcpy(funcvalA, &fv, sizeof(float));
-					//break;
-				//}
-			//}
-			//re_old = re;
-			///*printf("%f %i %i \n", re, ll, count); */
-
-			///*storing old values*/
-			//copyIm(D, D_old, dimX, dimY, dimZ);
-			//copyIm(P1, P1_old, dimX, dimY, dimZ);
-			//copyIm(P2, P2_old, dimX, dimY, dimZ);
-			//tk = tkp1;
-
-			///* calculating the objective function value */
-			//if (ll == (iter - 1)) {
-				//Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
-				//funcval = 0.0f;
-				//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-				////funcvalA[0] = sqrt(funcval);
-				//float fv = sqrt(funcval);
-				//std::memcpy(funcvalA, &fv, sizeof(float));
-			//}
-		//}
-		////printf("FGP-TV iterations stopped at iteration %i with the function value %f \n", ll, funcvalA[0]);
-		//result.append<np::ndarray>(npD);
-		//result.append<np::ndarray>(out1);
-		//result.append<int>(ll);
-	//}
-	//if (number_of_dims == 3) {
-		///*D = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		//D_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		//P1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		//P2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		//P3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		//P1_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		//P2_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		//P3_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		//R1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		//R2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-		//R3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));*/
-		//bp::tuple shape = bp::make_tuple(dim_array[0], dim_array[1], dim_array[2]);
-		//np::dtype dtype = np::dtype::get_builtin<float>();
-		
-		//np::ndarray npD      = np::zeros(shape, dtype);
-		//np::ndarray npD_old  = np::zeros(shape, dtype);
-		//np::ndarray npP1     = np::zeros(shape, dtype);
-		//np::ndarray npP2     = np::zeros(shape, dtype);
-		//np::ndarray npP3     = np::zeros(shape, dtype);
-		//np::ndarray npP1_old = np::zeros(shape, dtype);
-		//np::ndarray npP2_old = np::zeros(shape, dtype);
-		//np::ndarray npP3_old = np::zeros(shape, dtype);
-		//np::ndarray npR1     = np::zeros(shape, dtype);
-		//np::ndarray npR2     = np::zeros(shape, dtype);
-		//np::ndarray npR3     = np::zeros(shape, dtype);
-
-		//D      = reinterpret_cast<float *>(npD.get_data());
-		//D_old  = reinterpret_cast<float *>(npD_old.get_data());
-		//P1     = reinterpret_cast<float *>(npP1.get_data());
-		//P2     = reinterpret_cast<float *>(npP2.get_data());
-		//P3     = reinterpret_cast<float *>(npP3.get_data());
-		//P1_old = reinterpret_cast<float *>(npP1_old.get_data());
-		//P2_old = reinterpret_cast<float *>(npP2_old.get_data());
-		//P3_old = reinterpret_cast<float *>(npP3_old.get_data());
-		//R1     = reinterpret_cast<float *>(npR1.get_data());
-		//R2     = reinterpret_cast<float *>(npR2.get_data());
-		//R3     = reinterpret_cast<float *>(npR3.get_data());
-		///* begin iterations */
-		//for (ll = 0; ll<iter; ll++) {
-			///* computing the gradient of the objective function */
-			//Obj_func3D(A, D, R1, R2, R3, lambda, dimX, dimY, dimZ);
-			///*Taking a step towards minus of the gradient*/
-			//Grad_func3D(P1, P2, P3, D, R1, R2, R3, lambda, dimX, dimY, dimZ);
-
-			///* projection step */
-			//Proj_func3D(P1, P2, P3, dimX, dimY, dimZ);
-
-			///*updating R and t*/
-			//tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
-			//Rupd_func3D(P1, P1_old, P2, P2_old, P3, P3_old, R1, R2, R3, tkp1, tk, dimX, dimY, dimZ);
-
-			///* calculate norm - stopping rules*/
-			//re = 0.0f; re1 = 0.0f;
-			//for (j = 0; j<dimX*dimY*dimZ; j++)
-			//{
-				//re += pow(D[j] - D_old[j], 2);
-				//re1 += pow(D[j], 2);
-			//}
-			//re = sqrt(re) / sqrt(re1);
-			///* stop if the norm residual is less than the tolerance EPS */
-			//if (re < epsil)  count++;
-			//if (count > 3) {
-				//Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ);
-				//funcval = 0.0f;
-				//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-				////funcvalA[0] = sqrt(funcval);
-				//float fv = sqrt(funcval);
-				//std::memcpy(funcvalA, &fv, sizeof(float));
-				//break;
-			//}
-
-			///* check that the residual norm is decreasing */
-			//if (ll > 2) {
-				//if (re > re_old) {
-					//Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ);
-					//funcval = 0.0f;
-					//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-					////funcvalA[0] = sqrt(funcval);
-					//float fv = sqrt(funcval);
-					//std::memcpy(funcvalA, &fv, sizeof(float));
-					//break;
-				//}
-			//}
-
-			//re_old = re;
-			///*printf("%f %i %i \n", re, ll, count); */
-
-			///*storing old values*/
-			//copyIm(D, D_old, dimX, dimY, dimZ);
-			//copyIm(P1, P1_old, dimX, dimY, dimZ);
-			//copyIm(P2, P2_old, dimX, dimY, dimZ);
-			//copyIm(P3, P3_old, dimX, dimY, dimZ);
-			//tk = tkp1;
-
-			//if (ll == (iter - 1)) {
-				//Obj_func3D(A, D, P1, P2, P3, lambda, dimX, dimY, dimZ);
-				//funcval = 0.0f;
-				//for (j = 0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j], 2);
-				////funcvalA[0] = sqrt(funcval);
-				//float fv = sqrt(funcval);
-				//std::memcpy(funcvalA, &fv, sizeof(float));
-			//}
-
-		//}
-		////printf("FGP-TV iterations stopped at iteration %i with the function value %f \n", ll, funcvalA[0]);
-		//result.append<np::ndarray>(npD);
-		//result.append<np::ndarray>(out1);
-		//result.append<int>(ll);
-	//}
-
-	//return result;
-//}
-
 bp::list LLT_model(np::ndarray input, double d_lambda, double d_tau, int iter, double d_epsil, int switcher) {
 	// the result is in the following list
 	bp::list result;
@@ -1022,9 +736,6 @@ bp::list TGV_PD(np::ndarray input, double d_lambda, double d_alpha1, double d_al
 		result.append<np::ndarray>(npU);
 	}
 	
-
-	
-	
 	return result;
 }
 
diff --git a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
index a9d0f31..63be1a0 100644
--- a/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_cpu_vs_gpu_regularizers.py
@@ -3,7 +3,7 @@
 """
 Created on Thu Feb 22 11:39:43 2018
 
-Testing CPU implementation against GPU one
+Testing CPU implementation against the GPU one
 
 @author: Daniil Kazantsev
 """
@@ -12,8 +12,6 @@ import matplotlib.pyplot as plt
 import numpy as np
 import os
 import timeit
-#from ccpi.filters.cpu_regularizers_cython import TV_ROF_CPU, TV_FGP_CPU
-#from ccpi.filters.gpu_regularizers import TV_ROF_GPU, TV_FGP_GPU
 from ccpi.filters.regularizers import ROF_TV, FGP_TV
 
 ###############################################################################
diff --git a/Wrappers/Python/test/test_gpu_regularizers.py b/Wrappers/Python/test/test_gpu_regularizers.py
index 04aeeb4..640b3f9 100644
--- a/Wrappers/Python/test/test_gpu_regularizers.py
+++ b/Wrappers/Python/test/test_gpu_regularizers.py
@@ -11,8 +11,8 @@ import numpy as np
 import os    
 from enum import Enum
 import timeit
-from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML, TV_ROF_GPU
-
+from ccpi.filters.gpu_regularizers import Diff4thHajiaboli, NML
+from ccpi.filters.regularizers import ROF_TV, FGP_TV
 ###############################################################################
 def printParametersToString(pars):
         txt = r''
@@ -151,20 +151,19 @@ plt.colorbar(ticks=[0, 0.03], orientation='vertical')
         
 ## Rudin-Osher-Fatemi (ROF) TV regularization
 start_time = timeit.default_timer()
-
 pars = {
-'algorithm' : TV_ROF_GPU , \
+'algorithm' : ROF_TV , \
         'input' : u0,
         'regularization_parameter': 0.04,\
         'number_of_iterations':300,\
         'time_marching_parameter': 0.0025
         
 	}
-
+	
 rof_tv = TV_ROF_GPU(pars['input'], 
                      pars['regularization_parameter'],
                      pars['number_of_iterations'], 
-                     pars['time_marching_parameter'])
+                     pars['time_marching_parameter'],'gpu')
                      
 rms = rmse(Im, rof_tv)
 pars['rmse'] = rms
@@ -190,3 +189,51 @@ a.text(0.05, 0.95, 'rof_tv - u0', transform=a.transAxes, fontsize=14,
 imgplot = plt.imshow((rof_tv - u0)**2, vmin=0, vmax=0.03, cmap="gray")
 plt.colorbar(ticks=[0, 0.03], orientation='vertical')
 plt.show()
+
+## Fast-Gradient Projection TV regularization
+"""
+start_time = timeit.default_timer()
+
+pars = {'algorithm' : FGP_TV, \
+        'input' : u0,\
+        'regularization_parameter':0.04, \
+        'number_of_iterations' :1200 ,\
+        'tolerance_constant':0.00001,\
+        'methodTV': 0 ,\
+        'nonneg': 0 ,\
+        'printingOut': 0 
+        }
+
+fgp_gpu = FGP_TV(pars['input'], 
+              pars['regularization_parameter'],
+              pars['number_of_iterations'],
+              pars['tolerance_constant'], 
+              pars['methodTV'],
+              pars['nonneg'],
+              pars['printingOut'],'gpu')
+                                   
+rms = rmse(Im, fgp_gpu)
+pars['rmse'] = rms
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(2,4,4)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=12,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(fgp_gpu, cmap="gray")
+
+a=fig.add_subplot(2,4,8)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
+# place a text box in upper left in axes coords
+a.text(0.05, 0.95, 'fgp_gpu - u0', transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow((fgp_gpu - u0)**2, vmin=0, vmax=0.03, cmap="gray")
+plt.colorbar(ticks=[0, 0.03], orientation='vertical')
+plt.show()
+"""
-- 
cgit v1.2.3


From fbb7a12f7714978c251f02bf84ab9a66c762f428 Mon Sep 17 00:00:00 2001
From: algol <dkazanc@hotmail.com>
Date: Wed, 7 Mar 2018 10:15:13 +0000
Subject: small correct #40

---
 Wrappers/Python/src/cpu_regularizers.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Wrappers/Python/src/cpu_regularizers.cpp b/Wrappers/Python/src/cpu_regularizers.cpp
index 88b7c3c..3529ebd 100644
--- a/Wrappers/Python/src/cpu_regularizers.cpp
+++ b/Wrappers/Python/src/cpu_regularizers.cpp
@@ -27,7 +27,6 @@ limitations under the License.
 #include "boost/tuple/tuple.hpp"
 
 #include "SplitBregman_TV_core.h"
-//#include "FGP_TV_core.h"
 #include "LLT_model_core.h"
 #include "PatchBased_Regul_core.h"
 #include "TGV_PD_core.h"
-- 
cgit v1.2.3